Reshape dataframe so that matching family members have their own column - r

I have a dataframe...
df <- tibble(
id = 1:5,
family = c("a","a","b","b","c"),
twin = c(1,2,1,2,1),
datacol1 = 11:15,
datacol2 = 21:25
)
For every twin pair (members of the same family) I need to have a second 'datacol' with the other twins' data. This should only happen for matching twins, so the 5th row (from family "c") should have duplicate columns that are empty.
Ideally, by the end the data would look like the following...
df <- tibble(
id = 1:5,
family = c("a","a","b","b","c"),
twin = c(1,2,1,2,1),
datacol1 = 11:15,
datacol1.b = c(12,11,14,13,NA),
datacol2 = 21:25,
datacol2.b = c(22,21,24,23,NA)
)
I have added an image to help illustrate what I am trying to get to.
I would like to be able to do this for all columns or for selected columns and preferably using tidyverse.

We can also use mutate_at
library(dplyr)
df %>%
group_by(family) %>%
mutate_at(vars(starts_with('datacol')), list(`2` =
~if(n() == 1) NA_integer_ else rev(.)))
# A tibble: 5 x 7
# Groups: family [3]
# id family twin datacol1 datacol2 datacol1_2 datacol2_2
# <int> <chr> <dbl> <int> <int> <int> <int>
#1 1 a 1 11 21 12 22
#2 2 a 2 12 22 11 21
#3 3 b 1 13 23 14 24
#4 4 b 2 14 24 13 23
#5 5 c 1 15 25 NA NA

cols = c("datacol1", "datacol2")
df %>%
group_by(family) %>%
mutate_at(vars(cols), function(x){
if (n() == 2){
rev(x)
} else {
NA
}
}) %>%
ungroup() %>%
select(cols) %>%
rename_all(funs(paste0(., ".b"))) %>%
cbind(df, .)
Base R
cols = c("datacol1", "datacol2")
do.call(rbind, lapply(split(df, df$family), function(x){
cbind(x, setNames(lapply(x[cols], function(y) {
if (length(y) == 2) {
rev(y)
} else {
NA
}}),
paste0(cols, ".b")))
}))

Related

cleaning time series based on previous timepoints

In my clincal dataset, I have a unique identifors by patient ID and time, and then the variable of interest that look like so:
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,NA,2,1,NA,1,3,NA,NA,1,NA)
Data <- data.frame(patientid=patientid, time=time, V1=V1)
Timepoint 3 is blank for each patient. I want to fill in timepoint three for each patient based on the following criteria. If at either time point 1 or 2 the variable is coded as a 2 or 3 then time point 3 should be coded as a 2. If at both time point 1 and 2, variable is coded as a 1 then time point point 3 should be coded as a one. If there is missing data at time point 1 or 2 then time point three should be missing. So for the toy expample it should look like this:
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,1,2,1,2,1,3,2,NA,1,NA)
Data <- data.frame(patientid=patientid, time=time, V1=V1)
You can use pivot_wider from tidyr to convert your data to wide format and you can mutate the 3 column with your logic using a function with the help of map from purrr package. You can return back to the original shape of the data frame using pivot-longer
library(tidyverse)
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,NA,2,1,NA,1,3,NA,NA,1,NA)
df <- data.frame(patientid=patientid, time=time, V1=V1)
flag <- function(t1,t2){
if(is.na(t1)|is.na(t2)){
NA
} else if(t1 %in% c(2,3)|t2 %in% c(2,3)){
2
} else if(t1 == 1|t2 == 1){
1
}else {
NA
}
}
df %>%
as_tibble() %>%
pivot_wider(names_from = time, values_from = V1) %>%
mutate(`3` = pmap_dbl(list(`1`,`2`),flag )) %>%
pivot_longer(-1, names_to = "time", values_to = "V1")
#> # A tibble: 12 x 3
#> patientid time V1
#> <dbl> <chr> <dbl>
#> 1 100 1 1
#> 2 100 2 1
#> 3 100 3 1
#> 4 101 1 2
#> 5 101 2 1
#> 6 101 3 2
#> 7 102 1 1
#> 8 102 2 3
#> 9 102 3 2
#> 10 104 1 NA
#> 11 104 2 1
#> 12 104 3 NA
Created on 2021-01-29 by the reprex package (v0.3.0)
This should do it!
library(tidyverse)
patientid <- c(100,100,100,101,101,101,102,102,102,104,104,104)
time <- c(1,2,3,1,2,3,1,2,3,1,2,3)
V1 <- c(1,1,NA,2,1,NA,1,3,NA,NA,1,NA)
Data <- data.frame(patientid=patientid, time=time, V1=V1)
Data <- Data %>% pivot_wider(names_from = "time", values_from = "V1",
names_prefix = "timepoint_")
timepoint_impute <- function(x,y) {
if(is.na(x) | is.na(y)) {
return(NA)
} else if(2 %in% c(x,y) | 3 %in% c(x,y)) {
return(2)
} else if(x==1 & y==1) {
return(1)
}
}
Data$timepoint_3 <- map2(.x = Data$timepoint_1, .y = Data$timepoint_2,
.f = timepoint_impute)
You end up with wide data format but if you need long data format you can just use tidyr::pivot_longer. This approach writes a custom function to handle the logic you need.

Mutate Paste0 Min Max to create new id variable

I need to create an id that defines the relationship between contact_id and relationship_id into a common household_id if and where the combination of contact_id and relationship_id are the same.
Sample Data
account_id <- c(1,1,1,1)
contact_id <- c(1234,2345,3456,4567)
relationship_id <- c(2345,1234,NA,"")
ownership_percent <- c(26,22,40,12)
score <- c(500,300,700,600)
testdata <- data.frame(account_id,contact_id,relationship_id,ownership_percent,score)
Have been using combinations of mutate, paste0, min, max, group_indices - have not found the right combination, getting tripped up by NA and order output of new household_id
Approach 1
library(dplyr)
testdata %>%
mutate(col1 = pmin(contact_id, relationship_id),
col2 = pmax(contact_id, relationship_id),
household_id = paste0(col1,col2)) %>%
Approach 2
testdata %>%
mutate(household_id = sort(paste0(c(contact_id, relationship_id))), collapse = "")
Error: Column household_id must be length 4 (the number of rows) or one, not 8
Expected Outcome
library(dplyr)
# replace missing or NA values with 1
testdata$relationship_id <- type.convert(testdata$relationship_id)
testdata$relationship_id[relationship_id == ""] <- 1
testdata$relationship_id[is.na(testdata$relationship_id)] <- 1
# Create household_id
testdata %>%
mutate(group = paste0(pmin(contact_id, relationship_id), pmax(contact_id, relationship_id)),
household_id = match(group, unique(group)))
You can use -
library(dplyr)
testdata %>%
mutate(col1 = pmin(contact_id, relationship_id, na.rm = TRUE),
col2 = pmax(contact_id, relationship_id, na.rm = TRUE)) %>%
rowwise() %>%
mutate(household_id = paste0(unique(c(col1, col2)), collapse = '')) %>%
ungroup %>%
select(-col1, -col2)
# account_id contact_id relationship_id ownership_percent score household_id
# <dbl> <dbl> <chr> <dbl> <dbl> <chr>
#1 1 1234 "2345" 26 500 12342345
#2 1 2345 "1234" 22 300 12342345
#3 1 3456 NA 40 700 3456
#4 1 4567 "" 12 600 4567

Tidyeval: apply function to data frames extracted from list

This is a simplified version of a problem involving a large list containing complex tables. I want to extract the tables from the list and apply a function to each one. Here we can create a simple list containing small named data frames:
library(tidyverse)
table_names <- c('dfA', 'dfB', 'dfC')
dfA <- tibble(a = 1:3, b = 4:6, c = 7:9)
dfB <- tibble(a = 10:12, b = 13:15, c = 16:18)
dfC <- tibble(a = 19:21, b = 22:24, c = 25:27)
df_list <- list(dfA, dfB, dfC) %>% setNames(table_names)
Here is a simplified example of the kind of operation I would like to apply:
dfA_mod <- df_list$dfA %>%
mutate(name = 'dfA') %>%
select(name, everything())
In this example, I extract one of three tables in the list df_list$dfA, create a new column with the same value in each row mutate(name = 'dfA'), and re-order the columns so that the new column appears in the left-most position select(name, everything()). The resulting object is assigned to dfA_mod.
To solve the larger problem, I want to use one of the purrr::map() variants to apply the function over the character vector table_names, which was initiated in the first block of code above. The elements of table_names serve two purposes: 1) naming the tables held in the list; and 2) supplying values for the name column in the modified table.
I could write a function such as:
fun <- function(x) {
df_list$x %>%
mutate(name = x) %>%
select(name, everything()) %>%
assign(paste0(x, '_mod'), ., envir = .GlobalEnv)
}
And then use map() to create a new list of modified tables:
new_list <- df_list %>% map(table_name, fun(x))
But of course this code does not work, with the main obstacle being (for me at least) figuring out how to quote and unquote the right terms within the function. I'm a beginner at tidy evaluation, and I could use some help in specifying the function and using map properly.
Here is the desired output (for one modified table):
# A tibble: 3 x 4
name a b c
<chr> <int> <int> <int>
1 dfA 1 4 7
2 dfA 2 5 8
3 dfA 3 6 9
Thanks in advance for any help!
We can use purrr::imap which passes data in the list as well as name of the list
library(dplyr)
library(purrr)
df_out <- imap(df_list, ~.x %>% mutate(name = .y) %>% select(name, everything()))
df_out
#$dfA
# A tibble: 3 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfA 1 4 7
#2 dfA 2 5 8
#3 dfA 3 6 9
#$dfB
# A tibble: 3 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfB 10 13 16
#....
#....
This gives a list of desired dataframes, if you want them as separate dataframes, you can do
names(df_out) <- paste0(names(df_out), "_mod")
list2env(df_out, .GlobalEnv)
We can also do it using base R Map
df_out <- Map(function(x, y) transform(x, name = y)[c('name', names(x))],
df_list, names(df_list))
and give list names same as above.
We can convert it to a single data.frame with map while passing the .id
library(purrr)
map_dfr(df_list, I, .id = 'name')
Or with bind_rows
library(dplyr)
bind_rows(df_list, .id = 'name')
# A tibble: 9 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfA 1 4 7
#2 dfA 2 5 8
#3 dfA 3 6 9
#4 dfB 10 13 16
#5 dfB 11 14 17
#6 dfB 12 15 18
#7 dfC 19 22 25
#8 dfC 20 23 26
#9 dfC 21 24 27

Filter group only when both levels are present

This feels like it should be more straightforward and I'm just missing something. The goal is to filter the data into a new df where both var values 1 & 2 are represented in the group
here's some toy data:
grp <- c(rep("A", 3), rep("B", 2), rep("C", 2), rep("D", 1), rep("E",2))
var <- c(1,1,2,1,1,2,1,2,2,2)
id <- c(1:10)
df <- as.data.frame(cbind(id, grp, var))
only grp A and C should be present in the new data because they are the only ones where var 1 & 2 are present.
I tried dplyr, but obviously '&' won't work since it's not row based and '|' just returns the same df:
df.new <- df %>% group_by(grp) %>% filter(var==1 & var==2) #returns no rows
Here is another dplyr method. This can work for more than two factor levels in var.
library(dplyr)
df2 <- df %>%
group_by(grp) %>%
filter(all(levels(var) %in% var)) %>%
ungroup()
df2
# # A tibble: 5 x 3
# id grp var
# <fct> <fct> <fct>
# 1 1 A 1
# 2 2 A 1
# 3 3 A 2
# 4 6 C 2
# 5 7 C 1
We can condition on there being at least one instance of var == 1 and at least one instance of var == 2 by doing the following:
library(tidyverse)
df1 <- data_frame(grp, var, id) # avoids coercion to character/factor
df1 %>%
group_by(grp) %>%
filter(sum(var == 1) > 0 & sum(var == 2) > 0)
grp var id
<chr> <dbl> <int>
1 A 1 1
2 A 1 2
3 A 2 3
4 C 2 6
5 C 1 7

If/else condition in dplyr 0.7 function

I'd like to make a simple if/else condition in a dplyr function. I've looked at some helpful posts (e.g., How to parametrize function calls in dplyr 0.7?), but am still running into trouble.
Below is a toy example that works when I call the function without the grouping variable. The function then fails with the grouping variable.
# example dataset
test <- tibble(
A = c(1:5,1:5),
B = c(1,2,1,2,3,3,3,3,3,3),
C = c(1,1,1,1,2,3,4,5,4,3)
)
# begin function, set default for group var to NULL.
prop_tab <- function(df, column, group = NULL) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is NOT null, then...
if(!is.null(group)) {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
test %>% prop_tab(column = C) # works
test %>% prop_tab(column = A, group = B) # fails
# Error in prop_tab(., column = A, group = B) : object 'B' not found
The problem here is that when you supply unquoted arguments, is.null doesn't know what to do with it. So this code tries to check whether object B is null and errors because B does not exist in that scope. Instead, you can use missing() to check whether an argument was supplied to the function, like so. There may be a cleaner way but this at least works, as you can see at the bottom.
library(tidyverse)
test <- tibble(
A = c(1:5,1:5),
B = c(1,2,1,2,3,3,3,3,3,3),
C = c(1,1,1,1,2,3,4,5,4,3)
)
# begin function, set default for group var to NULL.
prop_tab <- function(df, column, group) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is not supplied, then:
if(!missing(group)) {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
test %>% prop_tab(column = C) # works
#> # A tibble: 5 x 2
#> col_name Percentages
#> <dbl> <dbl>
#> 1 1 40
#> 2 2 10
#> 3 3 20
#> 4 4 20
#> 5 5 10
test %>% prop_tab(column = A, group = B)
#> # A tibble: 3 x 2
#> B Percentages
#> <dbl> <dbl>
#> 1 1 20
#> 2 2 20
#> 3 3 60
Created on 2018-06-29 by the reprex package (v0.2.0).
You can use missing instead of is.null, so your argument won't be evaluated (that's what cause the error):
prop_tab <- function(df, column, group = NULL) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is NOT null, then...
if(!missing(group)) {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
test %>% prop_tab(column = C)
# example dataset
# # A tibble: 5 x 2
# col_name Percentages
# <dbl> <dbl>
# 1 1 40
# 2 2 10
# 3 3 20
# 4 4 20
# 5 5 10
test %>% prop_tab(column = A, group = B)
# # A tibble: 3 x 2
# B Percentages
# <dbl> <dbl>
# 1 1 20
# 2 2 20
# 3 3 60
You can also use length(substitute(group)) instead of !missing(group), it'll be more robust as it won't fail in the unlikely case where someone fills explicitely the group argument with NULL (the former option will crash in this case).
One option would be to check on the "group_name" instead of the 'group'
prop_tab <- function(df, column, group = NULL) {
col_name <- enquo(column)
group_name <- enquo(group)
# if group_by var is NOT null, then...
if(as.character(group_name)[2] != "NULL") {
temp <- df %>%
select(!!col_name, !!group_name) %>%
group_by(!!group_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
} else {
# if group_by var is null, then...
temp <- df %>%
select(!!col_name) %>%
group_by(col_name = !!col_name) %>%
summarise(Percentages = 100 * length(!!col_name) / nrow(df))
}
temp
}
-checking
prop_tab(test, column = C, group = B)
# A tibble: 3 x 2
#< B Percentages
# <dbl> <dbl>
#1 1 20
#2 2 20
#3 3 60
prop_tab(test, column = C)
# A tibble: 5 x 2
# col_name Percentages
# <dbl> <dbl>
#1 1 40
#2 2 10
#3 3 20
#4 4 20
#5 5 10

Resources