Fill NA with a series of characters in R dplyr - r

I have a large data frame that looks like this. Each player is assigned to a group.
library(tidyverse)
df <- tibble(player=c(1,2,3,4,5),groups=c("group1","group2","group2",NA,NA))
df
#> # A tibble: 5 × 2
#> player groups
#> <dbl> <chr>
#> 1 1 group1
#> 2 2 group2
#> 3 3 group2
#> 4 4 <NA>
#> 5 5 <NA>
Created on 2022-04-12 by the reprex package (v2.0.1)
Some players are not assigned to a group, and I want to fill in the missing groups serially, i.e. like this:
#> # A tibble: 5 × 2
#> player groups
#> <dbl> <chr>
#> 1 1 group1
#> 2 2 group2
#> 3 3 group2
#> 4 4 group3
#> 5 5 group4

dplyr
library(dplyr)
# Give each unassigned player the next free group number: the highest number
# already in use plus a running count of the missing entries seen so far.
# coalesce() keeps existing labels and only fills the NA positions.
df %>%
  mutate(
    groups = coalesce(
      groups,
      paste0(
        "group",
        max(as.integer(gsub("[^0-9]", "", groups)), na.rm = TRUE) +
          cumsum(is.na(groups))
      )
    )
  )
# # A tibble: 5 x 2
# player groups
# <dbl> <chr>
# 1 1 group1
# 2 2 group2
# 3 3 group2
# 4 4 group3
# 5 5 group4
data.table
library(data.table)
# Work on a data.table copy of the tibble; `:=` then updates `groups` in place.
DT <- as.data.table(df)
# For every row: keep the existing label, or, where it is NA, build a new one
# from the largest existing group number plus the running count of NAs so far.
DT[, groups := fifelse(
is.na(groups),
# gsub() strips everything but digits so the label can be read as an integer.
paste0("group", cumsum(is.na(groups)) + max(as.integer(gsub("[^0-9]", "", groups)), na.rm = TRUE)),
groups) ]

This was tricky, finally I think we could do it this way:
library(dplyr)
# Fill missing group labels with consecutive new group numbers.
# The numbering continues from the highest group number already present
# (digits extracted from the existing labels) rather than from a hard-coded
# offset, so the result stays correct whatever groups the data contain.
df %>%
  mutate(
    # x = next free group number for each row with a missing label
    x = max(as.integer(gsub("[^0-9]", "", groups)), na.rm = TRUE) +
      cumsum(is.na(groups))
  ) %>%
  mutate(
    groups = ifelse(is.na(groups), paste0("group", x), groups),
    # .keep = "unused" drops the helper column x once it has been consumed
    .keep = "unused"
  )
player groups
<dbl> <chr>
1 1 group1
2 2 group2
3 3 group2
4 4 group3
5 5 group4

You could do:
# Fill NA group labels with new, consecutive group numbers.
# new_group = largest number found in any existing label + running NA count,
# so filled labels always start above the overall maximum, even when a
# higher-numbered group appears after the missing rows.
df |>
mutate(new_group = max(parse_number(groups), na.rm = TRUE) + cumsum(is.na(groups)),
groups = if_else(is.na(groups), paste0("group", new_group), groups)) |>
# new_group was only a helper; remove it from the result.
select(-new_group)
Using a slightly different data example where after the missings another group is mentioned, this would give you:
Input:
library(tidyverse)
df <- tibble(player=c(1,2,3,4,5,6),groups=c("group1","group2","group2",NA,NA, "group3"))
# A tibble: 6 x 2
player groups
<dbl> <chr>
1 1 group1
2 2 group2
3 3 group2
4 4 NA
5 5 NA
6 6 group3
Output:
# A tibble: 6 x 2
player groups
<dbl> <chr>
1 1 group1
2 2 group2
3 3 group2
4 4 group4
5 5 group5
6 6 group3

Related

Aggregate AND count data in R

I have a data frame with N participants. Each participant has 50 trials, half of them with condition A and half with condition B. In each trial, they either got 0 or 1 in a certain variable. I need to count the occurrences of the 0's or 1's for each participant, in each of the conditions.
So far, I tried something like this:
the_answer = aggregate(certain_variable==0 ~ participant, data = data[data$condition=="A" , ], FUN = sum, na.rm = TRUE).
The problem is that I always get a different number of participants in my results, instead of getting the same N participants each time with different counts of the variable.
I hope I was clear enough. I would really appreciate any help.
thanks!
Generate example data
###########################################################################
# Set-up
###########################################################################
# Packages
library(tibble)
library(dplyr)
# Simulation parameters
set.seed(123)                          # fixed seed so the example is reproducible
participant_n <- 3                     # number of simulated participants
trial_n <- 50                          # trials per participant
trials_per_arm <- trial_n * 0.5        # half of the trials go to each condition
outcome_prob_A <- 0.8                  # P(outcome == 1) under condition A
outcome_prob_B <- 0.2                  # P(outcome == 1) under condition B
###########################################################################
# Simulate data
###########################################################################
# Participant and trials structure
# One row per participant x trial combination (participant_n * trial_n rows).
data <- tibble(
participant = rep(1:participant_n, trial_n),
trial = rep(1:trial_n, each = participant_n),
)
# Randomly assign half of the trials to each condition, letting the trials
# assigned vary across participants
# NOTE: the result stays grouped by participant (there is no ungroup()).
data <- data %>%
group_by(participant) %>%
mutate(
# Shuffle a balanced vector of "A"/"B" labels separately within each participant.
condition = sample(rep(c("A", "B"), trials_per_arm),
trial_n,
replace = FALSE),
# Draw a 0/1 outcome whose success probability depends on the condition.
# Both rbinom() calls draw n() values; case_when() picks the matching one per row.
outcome = case_when(
condition == "A" ~ rbinom(n(), 1, outcome_prob_A),
condition == "B" ~ rbinom(n(), 1, outcome_prob_B)
)
)
#> # A tibble: 150 x 4
#> # Groups: participant [3]
#> participant trial condition outcome
#> <int> <int> <chr> <int>
#> 1 1 1 A 1
#> 2 2 1 A 1
#> 3 3 1 B 0
#> 4 1 2 A 1
#> 5 2 2 B 0
#> 6 3 2 B 1
#> 7 1 3 B 1
#> 8 2 3 A 1
#> 9 3 3 B 0
#> 10 1 4 A 1
#> # ... with 140 more rows
Count each outcome for each participant
# Clear the grouping left over from the simulation step, then count the rows
# for every participant x condition x outcome combination.
data %>%
  ungroup() %>%
  count(participant, condition, outcome)
#> # A tibble: 12 x 4
#> participant condition outcome n
#> <int> <chr> <int> <int>
#> 1 1 A 0 2
#> 2 1 A 1 23
#> 3 1 B 0 21
#> 4 1 B 1 4
#> 5 2 A 0 5
#> 6 2 A 1 20
#> 7 2 B 0 22
#> 8 2 B 1 3
#> 9 3 A 0 4
#> 10 3 A 1 21
#> 11 3 B 0 22
#> 12 3 B 1 3
# If you just want counts for each outcome for each condition:
# (ungroup first so the per-participant grouping does not leak into the counts)
data %>%
  ungroup() %>%
  count(condition, outcome)
#> # A tibble: 4 x 3
#> condition outcome n
#> <chr> <int> <int>
#> 1 A 0 11
#> 2 A 1 64
#> 3 B 0 65
#> 4 B 1 10

How to group interconnected elements in R dplyr

I have a data frame that looks like this.
Elements from the col1 are connected indirectly with elements in col2.
for example 1 is connected with 2 and 3.
and 2 is connected with 3. Therefore 1 should be connected with 3 as well.
library(tidyverse)
df1 <- tibble(col1=c(1,1,2,5,5,6),
col2=c(2,3,3,6,7,7))
df1
#> # A tibble: 6 × 2
#> col1 col2
#> <dbl> <dbl>
#> 1 1 2
#> 2 1 3
#> 3 2 3
#> 4 5 6
#> 5 5 7
#> 6 6 7
Created on 2022-03-15 by the reprex package (v2.0.1)
I want my data to look like this
#> col1 col2 col3
#> <dbl> <dbl>
#> 1 1 2 group1
#> 2 1 3 group1
#> 3 2 3 group1
#> 4 5 6 group2
#> 5 5 7 group2
#> 6 6 7 group2
I would appreciate any possible help to solve this riddle.
Thank you for your time
We may use igraph
library(igraph)
library(dplyr)
library(stringr)
# Treat every (col1, col2) row as an edge; vertices are named after the values.
g <- graph_from_data_frame(df1, directed = TRUE)
# Weakly connected components ignore edge direction, so indirectly linked
# elements end up in the same component. membership is a vector named by
# vertex, so it is looked up with col1 converted to character.
df1 %>%
  mutate(
    col3 = str_c(
      "group",
      components(g, mode = "weak")$membership[as.character(col1)]
    )
  )
-output
# A tibble: 6 × 3
col1 col2 col3
<dbl> <dbl> <chr>
1 1 2 group1
2 1 3 group1
3 2 3 group1
4 5 6 group2
5 5 7 group2
6 6 7 group2
Another igraph option
# Same idea in a single pipeline (relies on igraph and dplyr being attached):
# build the graph from the piped data frame (`.`), compute its connected
# components, and index the named membership vector by col1.
df1 %>%
mutate(
col3 =
paste0("group", {
graph_from_data_frame(.) %>%
components() %>%
membership()
}[as.character(col1)])
)
gives
# A tibble: 6 x 3
col1 col2 col3
<dbl> <dbl> <chr>
1 1 2 group1
2 1 3 group1
3 2 3 group1
4 5 6 group2
5 5 7 group2
6 6 7 group2

How to replace specific rows with their column sums in R?

I feel like there should be a simpler way of doing this. Here is my sample data.
df <-
tibble(
group1 = c(1,1,2,2,3,3,3,3),
group2 = c("A", "B", "A", "B", "A", "B", "A", "B"),
vals = c(13,56,15,50,5,22,9,59)
)
df
# A tibble: 8 x 3
group1 group2 vals
<dbl> <chr> <dbl>
1 1 A 13
2 1 B 56
3 2 A 15
4 2 B 50
5 3 A 5
6 3 B 22
7 3 A 9
8 3 B 59
I want to combine the vals where group1 is 3 and replace the old rows with the summed ones. Can anyone come up with a cleaner/tidier solution than this?
# Asker's original approach: append summed rows for group1 == 3, then delete
# the un-summed group1 == 3 rows.
df %>%
group_by(group1, group2) %>%
bind_rows(
# `.` is the data frame arriving from the pipe; keep only its group1 == 3 rows,
# sum vals (presumably per group1/group2, as the grouping is still attached --
# confirm) and tag the new rows with summed = "x".
summarize(
.[.$group1 == 3,],
across(vals, sum),
summed = "x"
)
) %>%
ungroup() %>%
# Original rows have summed == NA, so this removes the old group1 == 3 rows
# and keeps only their summed replacements.
filter(!(group1 == 3 & is.na(summed))) %>%
# Drop the helper marker column.
select(-summed)
Here is what the result should be:
# A tibble: 6 x 3
group1 group2 vals
<dbl> <chr> <dbl>
1 1 A 13
2 1 B 56
3 2 A 15
4 2 B 50
5 3 A 14
6 3 B 81
This isn't very efficient, but it gives you your intended output.
# Collapse only the group1 == 3 rows by giving them a shared helper key.
df %>%
# tmp is 0 for every group1 == 3 row (so those rows fall into the same groups)
# and the unique row number otherwise (so every other row stays on its own).
mutate(tmp = if_else(group1 == 3, 0L, row_number())) %>%
group_by(tmp, group1, group2) %>%
# Summing a single-row group leaves its value unchanged.
summarize(vals = sum(vals)) %>%
ungroup() %>%
# Remove the helper key; the tmp == 0 groups come first in the printed output.
select(-tmp)
# # A tibble: 6 x 3
# group1 group2 vals
# <dbl> <chr> <dbl>
# 1 3 A 14
# 2 3 B 81
# 3 1 A 13
# 4 1 B 56
# 5 2 A 15
# 6 2 B 50
Another technique would be to split your data into "3" and "not 3", process the "3" data, then recombine them.
# Split the data into the rows to be summed and the rows to leave untouched.
# NOTE(review): filter() drops rows where the condition is NA, so a row with
# group1 == NA would land in neither piece -- confirm that cannot occur.
df3 <- filter(df, group1 == 3)
dfnot3 <- filter(df, group1 != 3)
# Sum vals within each group1/group2 combination of the "3" rows only, then
# append the untouched rows (the summed rows therefore come first).
df3 %>%
group_by(group1, group2) %>%
summarize(vals = sum(vals)) %>%
ungroup() %>%
bind_rows(dfnot3)
# # A tibble: 6 x 3
# group1 group2 vals
# <dbl> <chr> <dbl>
# 1 3 A 14
# 2 3 B 81
# 3 1 A 13
# 4 1 B 56
# 5 2 A 15
# 6 2 B 50
(This second one is really only meaningful/efficient if you have lots of non-3 rows.)

Finding rowwise minimum and column index in a tibble

I have the following tibble:
> df <- tibble(
ID = LETTERS[1:4],
a = c(1,5,9,8),
b = c(5,9,8,2),
c = c(5,4,5,5)
)
> df
# A tibble: 4 x 4
ID a b c
<chr> <dbl> <dbl> <dbl>
1 A 1 5 5
2 B 5 9 4
3 C 9 8 5
4 D 8 2 5
>
What I want is to get the rowwise minimum of columns a:c and also the column index from this minimum.
The output table should look like this:
# A tibble: 4 x 6
ID a b c Min Col_Index
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 5 5 1 1
2 B 5 9 4 4 3
3 C 9 8 5 5 3
4 D 8 2 5 2 2
I don't want to use rowwise()!
Thank you!
You could use pmin with do.call to get rowwise minimum and negate the values to use with max.col to get the column index of minimum.
library(dplyr)
library(purrr)
# Row-wise minimum and its column position without rowwise().
df %>%
  mutate(
    # pmin() is vectorised across its arguments; do.call() passes each
    # selected column as a separate argument.
    Min = do.call(pmin, select(., a:c)),
    # Negating the values turns "position of the maximum" into "position of
    # the minimum". max.col() breaks ties at random by default, so request
    # the first match explicitly to get a deterministic index (the same
    # choice which.min() makes).
    Col_Index = max.col(-select(., a:c), ties.method = "first")
  )
# ID a b c Min Col_Index
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
#1 A 1 5 5 1 1
#2 B 5 9 4 4 3
#3 C 9 8 5 5 3
#4 D 8 2 5 2 2
Using purrr's pmap_dbl :
# pmap_dbl() walks the selected columns in parallel, calling the function once
# per row with that row's values; c(...) gathers them into a single vector.
df %>%
  mutate(
    Min = pmap_dbl(select(., a:c), function(...) min(c(...))),
    Col_Index = pmap_dbl(select(., a:c), function(...) which.min(c(...)))
  )
One option could be:
# Row-by-row evaluation: c_across() collects the values of a:c for the
# current row, so min()/which.min() operate on one row at a time.
df %>%
  rowwise() %>%
  mutate(
    min = min(c_across(a:c)),
    min_index = which.min(c_across(a:c))
  ) %>%
  # Drop the rowwise grouping so later verbs work on whole columns again.
  ungroup()
ID a b c min min_index
<chr> <dbl> <dbl> <dbl> <dbl> <int>
1 A 1 5 5 1 1
2 B 5 9 4 4 3
3 C 9 8 5 5 3
4 D 8 2 5 2 2
Base R solution:
# Base R only:
#  1. vapply(df, is.numeric, logical(1)) flags the numeric columns (ID is skipped);
#  2. apply(..., 1, ...) visits each row and returns its minimum together with
#     the position of that minimum (which.min() reports the first one on ties);
#  3. t() turns the apply() result so there is one row per input row;
#  4. cbind() appends the two new columns and setNames() labels them
#     "min" and "col_index".
setNames(cbind(df, t(apply(df[, vapply(df, is.numeric, logical(1))], 1, function(row) {
cbind(min(row), which.min(row))}))), c(names(df), "min", "col_index"))

Overwrite left_join dplyr to update data

My question is similar to this one however I have additional columns in the LHS that should be kept https://stackoverflow.com/a/35642948/9285732
y is a subset of x with updated values for val1. In x I want to overwrite the relevant values but keep the rest.
Sample data:
library(tidyverse)
x <- tibble(name = c("hans", "dieter", "bohlen", "hans", "dieter", "alf"),
location = c(1,1,1,2,2,3),
val1 = 1:6, val2 = 1:6, val3 = 1:6)
y <- tibble(name = c("hans", "dieter", "hans"),
location = c(2,2,1),
val1 = 10)
> x
# A tibble: 6 x 5
name location val1 val2 val3
<chr> <dbl> <int> <int> <int>
1 hans 1 1 1 1
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 4 4 4
5 dieter 2 5 5 5
6 alf 3 6 6 6
> y
# A tibble: 3 x 3
name location val1
<chr> <dbl> <dbl>
1 hans 2 10
2 dieter 2 10
3 hans 1 10
> # desired output
> out
# A tibble: 6 x 5
name location val1 val2 val3
<chr> <dbl> <dbl> <int> <int>
1 hans 1 10 1 1
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 10 4 4
5 dieter 2 10 5 5
6 alf 3 6 6 6
I wrote a function that is doing what I want, however it's quite cumbersome. I wonder if there's a more elegant way or even a dplyr function that I'm unaware of.
#' Overwrite values in `x` with updated values from `y`
#'
#' Rows of `x` whose key columns match a row of `y` take their values for the
#' shared (non-key) columns from `y`; all other rows and all columns that only
#' exist in `x` are kept unchanged. Row order follows `x`; columns are
#' returned as key columns, then the other shared columns, then the columns
#' found only in `x`.
#'
#' @param x Data frame holding the original values.
#' @param y Data frame holding updated values for a subset of the keys in `x`.
#' @param by Character vector naming the key columns present in both `x` and
#'   `y`. Must be supplied.
#' @return A data frame with the rows of `x` and the updated values from `y`.
overwrite_join <- function(x, y, by = NULL) {
  # Without explicit keys the joins below cannot work, so fail with a clear
  # message instead of an obscure join error.
  if (is.null(by)) {
    stop("`by` must name the key columns shared by `x` and `y`.", call. = FALSE)
  }

  x_cols <- colnames(x)
  by_cols <- x_cols[x_cols %in% by]
  common_cols <- x_cols[x_cols %in% colnames(y)]
  extra_cols <- x_cols[!(x_cols %in% colnames(y))]

  # Rows of x that y does not update, plus the updating rows of y, restricted
  # to the shared columns; the x-only columns are then re-attached by key.
  updated <- anti_join(x, y, by = by) %>%
    bind_rows(y) %>%
    select(all_of(common_cols)) %>%
    left_join(select(x, all_of(c(by_cols, extra_cols))), by = by)

  # Joining back onto the keys of x restores the original row order.
  x %>%
    select(all_of(by)) %>%
    left_join(updated, by = by)
}
overwrite_join(x, y, by = c("name", "location"))
You could do something along the lines of
> x %>%
left_join(y = y, by = c("name", "location")) %>%
within(., val1.x <- ifelse(!is.na(val1.y), val1.y, val1.x)) %>%
select(-val1.y)
# # A tibble: 6 x 5
# name location val1.x val2 val3
# <chr> <dbl> <dbl> <int> <int>
# 1 hans 1 10 1 1
# 2 dieter 1 2 2 2
# 3 bohlen 1 3 3 3
# 4 hans 2 10 4 4
# 5 dieter 2 10 5 5
# 6 alf 3 6 6 6
and then rename val1.x.
My package safejoin might help. Only available on github so far but has a feature designed just for that.
The conflict argument below must be fed a function or lambda to deal with conflicting columns when joining, here we want in priority a value from the y data frame so we can use dplyr::coalesce() there. Note that we must first coerce y$val1 as in your example it's double while x$val1 is integer. Your real case might not need this step.
# remotes::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)
y$val1 <- as.integer(y$val1)
safe_left_join(x, y, by = c("name", "location"), conflict = ~coalesce(.y, .x))
#> # A tibble: 6 x 5
#> name location val1 val2 val3
#> <chr> <dbl> <int> <int> <int>
#> 1 hans 1 10 1 1
#> 2 dieter 1 2 2 2
#> 3 bohlen 1 3 3 3
#> 4 hans 2 10 4 4
#> 5 dieter 2 10 5 5
#> 6 alf 3 6 6 6
Edit : inspired by your own solution here's a 100% dplyr option that you might like better, just like your option though it's not a proper join!
# Stack y on top of x so that, within each key, the updated values come first.
# For every remaining column, take the first non-missing value in the group:
# y's value where y supplies one, otherwise the original value from x.
bind_rows(y, x) %>%
  group_by(name, location) %>%
  summarize(across(everything(), ~ na.omit(.x)[[1]]), .groups = "drop")
#> # A tibble: 6 x 5
#> name location val1 val2 val3
#> <chr> <dbl> <dbl> <int> <int>
#> 1 alf 3 6 6 6
#> 2 bohlen 1 3 3 3
#> 3 dieter 1 2 2 2
#> 4 dieter 2 10 5 5
#> 5 hans 1 10 1 1
#> 6 hans 2 10 4 4
Try dplyr::coalesce
# Join the updates onto x; the clashing val1 columns become val1.x (from x)
# and val1.y (from y, NA where y has no matching row).
x %>%
left_join(y, by = c("name", "location")) %>%
# Prefer the updated value and fall back to the original one.
# NOTE(review): val1 is integer in x and double in y in the sample data;
# some dplyr versions may need the types aligned first -- confirm.
mutate(val1 = coalesce(val1.y, val1.x)) %>%
# Remove the two suffixed helpers; val1 ends up as the last column.
select(-val1.x, -val1.y)
# A tibble: 6 x 5
name location val2 val3 val1
<chr> <dbl> <int> <int> <int>
1 hans 1 1 1 10
2 dieter 1 2 2 2
3 bohlen 1 3 3 3
4 hans 2 4 4 10
5 dieter 2 5 5 10
6 alf 3 6 6 6
This is the idiom I now use. It does not preserve the row or column order in x, if that is important.
I like it because I can evaluate the values to just before the bind_rows(), do a visual inspection, and if I like it, put the fixed rows back onto the base dataframe.
library(dplyr)
x <- tibble(name = c("hans", "dieter", "bohlen", "hans", "dieter", "alf"),
location = c(1,1,1,2,2,3),
val1 = 1:6, val2 = 1:6, val3 = 1:6)
y <- tibble(name = c("hans", "dieter", "hans"),
location = c(2,2,1),
val1 = 10)
keys <- c("name", "location")
# 1. semi_join(): the rows of x that y has updates for.
# 2. Drop the columns y will supply. any_of() matches the names exactly,
#    whereas matches() treats them as regular expressions and would also
#    remove columns that merely contain one of the names (e.g. "val10").
# 3. left_join(): bring in the updated values from y.
# 4. bind_rows(): put the untouched rows of x back underneath.
out <- x %>%
  semi_join(y, by = keys) %>%
  select(-any_of(setdiff(names(y), keys))) %>%
  left_join(y, by = keys) %>%
  bind_rows(x %>% anti_join(y, by = keys))
out %>%
  print()
#> # A tibble: 6 x 5
#> name location val2 val3 val1
#> <chr> <dbl> <int> <int> <dbl>
#> 1 hans 1 1 1 10
#> 2 hans 2 4 4 10
#> 3 dieter 2 5 5 10
#> 4 dieter 1 2 2 2
#> 5 bohlen 1 3 3 3
#> 6 alf 3 6 6 6
Created on 2019-12-12 by the reprex package (v0.3.0)

Resources