Calculate quantiles for each group in a data frame and assign NA? - r

I made up this example to explain my question:
df= structure(list(group = structure(c(1L, 1L, 2L, 2L, 10L, 10L
), .Label = c("Eve", "ba", "De", "De","Mi", "C", "O", "W",
"as", "ras", "Cro", "ics"), class = "factor"), ds = c(8, 8,
1, 4, 4, 6), em = c(1, 3, 8,2, 7, 3)), row.names = c(74567L,
74568L, 74570L, 74576L, 74577L, 74578L), class = "data.frame")
For each group, I need to set to NA every value of em and ds that meets either of these conditions:
> quantile 90 = NA
< quantile 10 = NA

Here's a way to do it for each group and each numeric variable using dplyr and ifelse.
Having only a couple of samples per group makes it difficult to interpret the whole concept of quantiles, so the result you get very much depends on how you define a quantile. The type parameter allows you to specify the definition you are using. R defaults to type = 7:
library(dplyr)
df %>%
group_by(group) %>%
mutate(ds = ifelse(ds > quantile(ds, .9) | ds < quantile(ds, .1), NA, ds),
em = ifelse(em > quantile(em, .9) | em < quantile(em, .1), NA, em))
#> # A tibble: 6 x 3
#> # Groups: group [3]
#> group ds em
#> <fct> <dbl> <lgl>
#> 1 Eve 8 NA
#> 2 Eve 8 NA
#> 3 ba NA NA
#> 4 ba NA NA
#> 5 ras NA NA
#> 6 ras NA NA
However, you can change this depending on your definition:
df %>%
group_by(group) %>%
mutate(ds = ifelse(ds > quantile(ds, .9, type = 1) |
ds < quantile(ds, .1, type = 1), NA, ds),
em = ifelse(em > quantile(em, .9, type = 1) |
em < quantile(em, .1, type = 1), NA, em))
#> # A tibble: 6 x 3
#> # Groups: group [3]
#> group ds em
#> <fct> <dbl> <dbl>
#> 1 Eve 8 1
#> 2 Eve 8 3
#> 3 ba 1 8
#> 4 ba 4 2
#> 5 ras 4 7
#> 6 ras 6 3
Created on 2020-05-17 by the reprex package (v0.3.0)

Related

Calculate difference between rows in R based on a specifc row for each group

Hi everyone,
I have a dataframe where each ID has multiple visits, numbered 1-5. I am trying to calculate the difference in score between each visit and visit 1, e.g. Score(Visit 5) - Score(Visit 1), and so on. How do I achieve that in R? Below are a sample dataset and the result dataset.
structure(list(ID = c("A", "A", "A", "A", "A", "B", "B", "B"),
Visit = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L), Score = c(16,
15, 13, 12, 12, 20, 19, 18)), class = "data.frame", row.names = c(NA,
-8L))
#> ID Visit Score
#> 1 A 1 16
#> 2 A 2 15
#> 3 A 3 13
#> 4 A 4 12
#> 5 A 5 12
#> 6 B 1 20
#> 7 B 2 19
#> 8 B 3 18
Created on 2021-05-20 by the reprex package (v2.0.0)
Here is the expected output
Here's a solution using dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate(Difference = ifelse(Visit == 1, NA, Score[Visit == 1] - Score))
# A tibble: 8 x 4
# Groups: ID [2]
ID Visit Score Difference
<chr> <int> <dbl> <dbl>
1 A 1 16 NA
2 A 2 15 1
3 A 3 13 3
4 A 4 12 4
5 A 5 12 4
6 B 1 20 NA
7 B 2 19 1
8 B 3 18 2
Sample data
df <- data.frame(
ID = c('A', 'A', 'A', 'A', 'A', 'B', 'B', 'B'),
Visit = c(1:5, 1:3),
Score = c(16,15,13,12,12,20,19,18)
)
Sidenote: next time, I suggest you post sample data produced by calling dput() on your dataframe rather than images.
Solution with dplyr using first
data <- data.frame(
ID = c(rep("A", 5), rep("B", 3)),
Visit = c(1:5, 1:3),
Score = c(16, 15, 13, 12, 12, 20, 19, 18))
library(dplyr)
data %>%
group_by(ID) %>%
arrange(Visit) %>%
mutate(Difference = first(Score) - Score)
#> # A tibble: 8 x 4
#> # Groups: ID [2]
#> ID Visit Score Difference
#> <chr> <int> <dbl> <dbl>
#> 1 A 1 16 0
#> 2 A 2 15 1
#> 3 A 3 13 3
#> 4 A 4 12 4
#> 5 A 5 12 4
#> 6 B 1 20 0
#> 7 B 2 19 1
#> 8 B 3 18 2
Created on 2021-05-20 by the reprex package (v2.0.0)

How to transform row to column based on a single row in R?

I have a data set that looks something like this
A B 1960 1970 1980
x a 1 2 3
x b 1.1 2.1 NA
y a 2 3 4
y b 1 NA 1
I want to transform the columns based on row B so that it looks something like this
A year a b
x 1960 1 1.1
x 1970 2 2.1
x 1980 3 NA
y 1960 2 1
y 1970 3 NA
y 1980 4 1
I am not sure how to do this. I know that I can do a full transformation using t() or using row_to_columns() from tidyverse, but the result is not what I want.
The initial data has about 60 columns and 165 distinct values in column B.
You can use pivot_longer() and then pivot_wider(), although it might be a bad idea to rename the resulting column to "B" again:
library(dplyr)
library(tidyr)
df %>% pivot_longer(-c(A,B)) %>%
pivot_wider(names_from=B) %>% rename(B=name)
# A tibble: 6 x 4
A B a b
<fct> <chr> <dbl> <dbl>
1 x 1960 1 1.1
2 x 1970 2 2.1
3 x 1980 3 NA
4 y 1960 2 1
5 y 1970 3 NA
6 y 1980 4 1
df = structure(list(A = structure(c(1L, 1L, 2L, 2L), .Label = c("x",
"y"), class = "factor"), B = structure(c(1L, 2L, 1L, 2L), .Label = c("a",
"b"), class = "factor"), `1960` = c(1, 1.1, 2, 1), `1970` = c(2,
2.1, 3, NA), `1980` = c(3L, NA, 4L, 1L)), class = "data.frame", row.names = c(NA,
-4L))
library(data.table)
dt <- fread('A B 1960 1970 1980
x a 1 2 3
x b 1.1 2.1 NA
y a 2 3 4
y b 1 NA 1')
names(dt) <- as.character(dt[1,])
dt <- dt[-1,]
dt[,(3:5):=lapply(.SD,as.numeric),.SDcols=3:5]
dcast(melt(dt,measure.vars = 3:5),...~B,value.var = "value")
#> A variable a b
#> 1: x 1960 1 1.1
#> 2: x 1970 2 2.1
#> 3: x 1980 3 NA
#> 4: y 1960 2 1.0
#> 5: y 1970 3 NA
#> 6: y 1980 4 1.0
Created on 2020-05-05 by the reprex package (v0.3.0)
Base R solution:
long_df <- reshape(df, direction = "long",
varying = which(!names(df) %in% c("A", "B")),
v.names = "value",
timevar = "year",
times = names(df)[!(names(df) %in% c("A", "B"))],
ids = NULL,
new.row.names = 1:(length(which(!names(df) %in% c("A", "B"))) * nrow(df)))
wide_df <- setNames(reshape(long_df, direction = "wide",
idvar = c("A", "year"),
timevar = "B"), c("A", "B", unique(df$B)))
Data:
df <- structure(list(A = c("x", "x", "y", "y"), B = c("a", "b", "a",
"b"), `1960` = c(1, 1.1, 2, 1), `1970` = c(2, 2.1, 3, NA), `1980` = c(3L,
NA, 4L, 1L)), row.names = 2:5, class = "data.frame")

Join data frames without creating duplicate rows while concatenating unique entries under one column

I'm trying to merge two data frames together which are related to each other via a specific variable named patient. The second data frame has multiple entries for the same patient column. I don't want to create duplicate patient entries upon merging, but I want to retain unique information in the second data frame by concatenating the values under one column.
I tried manually concatenating certain variables using group_by which works. I have several variables, however, and manually specifying all of them is not feasible
I can also concatenate every variable in the data frame by using dplyr as seen below. The problem in the second case is that duplicate values are also concatenated making the data frame unnecessarily big and difficult to deal with. Please see the reprex below.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df1 <- data.frame(patient=c("a", "b", "c"),
var1 = 1:3,
var2=11:13)
df1
#> patient var1 var2
#> 1 a 1 11
#> 2 b 2 12
#> 3 c 3 13
df2 <- data.frame(patient=c("a","a", "b", "b", "c", "c" ),
treatment= rep(c("drug1", "drug2"), 3),
time= rep(c("time1", "time2"), 3),
var3= "constant")
df2
#> patient treatment time var3
#> 1 a drug1 time1 constant
#> 2 a drug2 time2 constant
#> 3 b drug1 time1 constant
#> 4 b drug2 time2 constant
#> 5 c drug1 time1 constant
#> 6 c drug2 time2 constant
df_merged <- left_join(df1, df2)
#> Joining, by = "patient"
# Don't want duplicates like this
df_merged
#> patient var1 var2 treatment time var3
#> 1 a 1 11 drug1 time1 constant
#> 2 a 1 11 drug2 time2 constant
#> 3 b 2 12 drug1 time1 constant
#> 4 b 2 12 drug2 time2 constant
#> 5 c 3 13 drug1 time1 constant
#> 6 c 3 13 drug2 time2 constant
df_merged2 <- df_merged %>%
group_by(patient) %>%
mutate(treatment = paste(treatment, collapse = "_"),
time=paste(time, collapse = "_")) %>%
filter(!duplicated(patient))
# I can manually edit a few variables like this
df_merged2
#> # A tibble: 3 x 6
#> # Groups: patient [3]
#> patient var1 var2 treatment time var3
#> <fct> <int> <int> <chr> <chr> <fct>
#> 1 a 1 11 drug1_drug2 time1_time2 constant
#> 2 b 2 12 drug1_drug2 time1_time2 constant
#> 3 c 3 13 drug1_drug2 time1_time2 constant
df_merged3 <- df_merged %>%
group_by(patient) %>%
mutate_at(vars(-group_cols()), .funs = ~paste(., collapse ="_")) %>%
filter(!duplicated(patient))
# I have many variables I can't specify manually
# I can create this merged data frame, but I don't want to
# concatenate duplicated values such as var1, var2, and var3
df_merged3
#> # A tibble: 3 x 6
#> # Groups: patient [3]
#> patient var1 var2 treatment time var3
#> <fct> <chr> <chr> <chr> <chr> <chr>
#> 1 a 1_1 11_11 drug1_drug2 time1_time2 constant_constant
#> 2 b 2_2 12_12 drug1_drug2 time1_time2 constant_constant
#> 3 c 3_3 13_13 drug1_drug2 time1_time2 constant_constant
Created on 2019-10-23 by the reprex package (v0.3.0)
I'd like to see if there is a way of concatenating variables containing only unique values to retain information from the second data frame without duplicating the rows in the df_merged.
I would be happy to hear if you have recommendations other than dplyr. A data.table solution may also be suitable for me as well, since my real data frames are quite large.
Thanks!
We can use summarise_at and unique
library(dplyr)
df_merged %>%
group_by(patient) %>%
summarise_at(vars(-group_cols()), .funs = ~paste(unique(.), collapse ="_"))
Or we can do the merge/join directly instead of adding an intermediate data frame to the global environment.
left_join(df1,
df2 %>% group_by(patient) %>%
summarise_at(vars(-group_cols()), .funs = ~paste(unique(.), collapse ="_")) %>%
ungroup()
)
Joining, by = "patient"
patient var1 var2 treatment time var3
1 a 1 11 drug1_drug2 time1_time2 constant
2 b 2 12 drug1_drug2 time1_time2 constant
3 c 3 13 drug1_drug2 time1_time2 constant
Update
#Here a toy example to experiment with, uncomment browser to see how it works inside Reduce,
#also see ?Reduce for more info
# Collapse the elements of `x` into one "_"-separated string, left to right.
#
# Missing values are written as "." and are always appended, so every NA in
# the input stays visible in the result. A non-missing value is appended only
# when it does not already occur in the string built so far. This is a
# substring test, so "drug1" counts as already present in "drug10".
#
# x: character vector (may contain NA).
# Returns a length-one character string; an input of length zero or one is
# returned exactly as Reduce() hands it back.
paste_mod <- function(x) {
  Reduce(function(u, v) {
    # Only the very first element can still be NA here; after the first step
    # the accumulator is always a non-missing string.
    if (is.na(u)) u <- "."
    if (is.na(v)) v <- "."
    # `||` because this is a scalar test; fixed = TRUE so that values holding
    # regex metacharacters (".", "(", "+", ...) are compared literally.
    if (v == "." || !grepl(v, u, fixed = TRUE)) paste0(u, "_", v) else u
  }, x)
}
paste_mod(c("drug1",NA,NA,"drug2","drug1","drug2"))
[1] "drug1_._._drug2"
paste_mod(c(NA,NA,"drug2","drug1","drug2"))
[1] "._._drug2_drug1"
# Replace NA with "." and then, per patient, collapse every other column into
# one "_"-separated string: "." is always appended, any other value only when
# it is not already part of the string built so far.
# `||` is used because the test is scalar; fixed = TRUE makes grepl() compare
# the value literally instead of interpreting it as a regular expression.
df2 %>%
mutate_if(is.factor,as.character) %>% mutate_all(~replace(.,is.na(.),'.')) %>%
group_by(patient) %>%
summarise_at(vars(-group_cols()), .funs = ~Reduce(function(u, v) if(v=='.' || !grepl(v,u,fixed=TRUE)) paste0(u,'_',v) else u, .)) %>%
ungroup()
# A tibble: 2 x 4
patient treatment time var3
<chr> <chr> <chr> <chr>
1 a drug1_._._drug2 time1_time2 constant
2 c drug1_drug2 time1_time2 constant
New df2 for testing the updated solution
df2 <- structure(list(patient = structure(c(1L, 1L, 1L, 1L, 2L, 2L), .Label = c("a",
"c"), class = "factor"), treatment = structure(c(1L, NA, NA,
2L, 1L, 2L), .Label = c("drug1", "drug2"), class = "factor"),
time = structure(c(1L, 2L, 1L, 2L, 1L, 2L), .Label = c("time1",
"time2"), class = "factor"), var3 = structure(c(1L, 1L, 1L,
1L, 1L, 1L), class = "factor", .Label = "constant")), class = "data.frame", row.names = c(NA,
-6L))

How to add a column with progressive number based on condition

I am trying to add a column to my existing data set.
The data set has three columns:
Student (which is the column with the participant ID),
Week (the number of the week of the year during which the data were collected),
and
Day (the number of the weekday during which the data were
collected).
Now, a new column Obs that I am trying to create would contain a progressive number (from 1 to n) referring to the week during which every student was tested.
I have tried to use group_by in combination with rep but it does not seem to produce the result I want:
Week <- c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4)
Day <- c(1, 2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)
Student <- c("A", "A", "A", "B", "B", "B", "B", "B", "C", "C", "C", "C")
fake.db <- data.frame(Student, Week, Day)
library(dplyr)
fake.db %>%
group_by(Student) %>%
mutate(Obs = rep(1:length(Student), each = Week))
# Student Week Day Obs
# <fct> <dbl> <dbl> <int>
# 1 A 1 1 1
# 2 A 1 2 2
# 3 A 1 3 3
# 4 B 2 2 1
# 5 B 2 3 2
# 6 B 2 5 3
# 7 B 3 1 4
# 8 B 3 3 5
# 9 C 4 2 1
#10 C 4 3 2
#11 C 4 4 3
#12 C 4 5 4
What I would like to obtain is different. For the first week of data collection, 1 should be reported, and for the students for whom data were collected during a second week, 2 should be reported, etc.:
# Student Week Day Obs
#1 A 1 1 1
#2 A 1 2 1
#3 A 1 3 1
#4 B 2 2 1
#5 B 2 3 1
#6 B 2 5 1
#7 B 3 1 2
#8 B 3 3 2
#9 C 4 2 1
#10 C 4 3 1
#11 C 4 4 1
#12 C 4 5 1
One dplyr possibility could be:
fake.db %>%
group_by(Student) %>%
mutate(Obs = cumsum(!duplicated(Week)))
Student Week Day Obs
<fct> <dbl> <dbl> <int>
1 A 1 1 1
2 A 1 2 1
3 A 1 3 1
4 B 2 2 1
5 B 2 3 1
6 B 2 5 1
7 B 3 1 2
8 B 3 3 2
9 C 4 2 1
10 C 4 3 1
11 C 4 4 1
12 C 4 5 1
It groups by "Student" column and calculates the cumulative sum of non-duplicate "Week" values.
Or:
fake.db %>%
group_by(Student) %>%
mutate(Obs = with(rle(Week), rep(seq_along(lengths), lengths)))
It groups by the "Student" column and creates a run-length group ID from the "Week" column.
Or:
fake.db %>%
group_by(Student) %>%
mutate(Obs = dense_rank(Week))
It groups by "Student" column and ranks the values in "Week" column.
What I understand the issue to be is that you want to count the weeks since the first test week for each student. I.e. Week 2 is student B's first week of testing, so it gets Obs = 1. That means you can do a grouped mutate:
library(dplyr)
fake.db <- structure(list(Student = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), Week = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4), Day = c(1, 2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)), class = "data.frame", row.names = c(NA, -12L))
fake.db %>%
group_by(Student) %>%
mutate(Obs = Week - min(Week) + 1)
#> # A tibble: 12 x 4
#> # Groups: Student [3]
#> Student Week Day Obs
#> <fct> <dbl> <dbl> <dbl>
#> 1 A 1 1 1
#> 2 A 1 2 1
#> 3 A 1 3 1
#> 4 B 2 2 1
#> 5 B 2 3 1
#> 6 B 2 5 1
#> 7 B 3 1 2
#> 8 B 3 3 2
#> 9 C 4 2 1
#> 10 C 4 3 1
#> 11 C 4 4 1
#> 12 C 4 5 1
Created on 2019-05-10 by the reprex package (v0.2.1)
A brief method with by
unlist(by(fake.db, fake.db[, 1], function(x) as.numeric(factor(x[, 2]))))
# A1 A2 A3 B1 B2 B3 B4 B5 C1 C2 C3 C4
# 1 1 1 1 1 1 2 2 1 1 1 1
Data
fake.db <- structure(list(Student = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
Week = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4), Day = c(1,
2, 3, 2, 3, 5, 1, 3, 2, 3, 4, 5)), class = "data.frame", row.names = c(NA,
-12L))
You can see if there is a non-zero difference
fake.db %>%
group_by(Student) %>%
arrange(Week) %>%
mutate(Obs = cumsum(c(1, diff(Week)!=0)))
or, if the values aren't numeric, you can compare each value to its lagged value
fake.db %>%
group_by(Student) %>%
arrange(Week) %>%
mutate(Obs = cumsum(Week != lag(Week, default=first(Week))) + 1)

How to concatenate data.frame inside lists by using names?

I have to import over 1,000 excel files, and each excel contains multiple sheets (some have the same sheet name and some have different sheet names).
Let's say with a small example as follows
games <- data.frame(index = c(1,2,3), player = c('John', 'Sam', 'Mary'))
weather <- data.frame(index = c(1,2,3), temperature = c('hot', 'cold', 'rainy'))
cars <- data.frame(index = c(1,2,3), car = c('honda', 'toyota','bmw'))
list1 <- list(games, weather, cars)
names(list1) <- c('games', 'weather', 'cars')
games <- data.frame(index = c(1,2,3), player = c('AA', 'BB', 'CC'))
weather <- data.frame(index = c(1,2,3), temperature = c('cold', 'rainy', 'hot'))
sport <- data.frame(index = c(1,2,3), interest = c('swim', 'soccer', 'rugby'))
list2 <- list(games, weather, sport)
names(list2) <- c('games', 'weather', 'sport')
list3 <- list(games, weather)
names(list3) <- c('games', 'weather')
rm(games, sport, weather, cars) # clean envir from unneeded stuff
I am looking for the way to combine lists by using lists' name. I have tried to use merge() and mapply(), but they did not return what I wanted
The return that I want is as follows:
$`games`
# A tibble: 6 x 2
index player
<dbl> <chr>
1 1 John
2 2 Sam
3 3 Mary
4 1 AA
5 2 BB
6 3 CC
$weather
# A tibble: 6 x 2
index temperature
<dbl> <chr>
1 1 hot
2 2 cold
3 3 rainy
4 1 cold
5 2 rainy
6 3 hot
$cars
# A tibble: 3 x 2
index car
<dbl> <chr>
1 1 honda
2 2 toyota
3 3 bmw
$sport
index interest
1 1 swim
2 2 soccer
3 3 rugby
EDIT: I have encountered a case where there is a data.frame sport in list2 (but not in list1).
You can use purrr to help manipulate the list. I set stringsAsFactors = FALSE only so that I could bind the data.frames. If you already use tibbles, you won't have this issue.
I create a list of the lists.
transpose() turns the list inside out, regrouping the elements by name. Basically, x[[1]][[2]] is equivalent to transpose(x)[[2]][[1]].
I use map to iterate through the list, and dplyr::bind_rows to get the resulting tibble.
options(stringsAsFactors = FALSE)
games <- data.frame(index = c(1,2,3), player = c('John', 'Sam', 'Mary'))
weather <- data.frame(index = c(1,2,3), temperature = c('hot', 'cold', 'rainy'))
cars <- data.frame(index = c(1,2,3), car = c('honda', 'toyota','bmw'))
list1 <- list(games, weather, cars)
names(list1) <- c('games', 'weather', 'cars')
games <- data.frame(index = c(1,2,3), player = c('AA', 'BB', 'CC'))
weather <- data.frame(index = c(1,2,3), temperature = c('cold', 'rainy', 'hot'))
list2 <- list(games, weather)
names(list2) <- c('games', 'weather')
library(purrr)
list(list1, list2) %>%
# regroup named element together
transpose() %>%
# bind the df together
map(dplyr::bind_rows)
#> $games
#> index player
#> 1 1 John
#> 2 2 Sam
#> 3 3 Mary
#> 4 1 AA
#> 5 2 BB
#> 6 3 CC
#>
#> $weather
#> index temperature
#> 1 1 hot
#> 2 2 cold
#> 3 3 rainy
#> 4 1 cold
#> 5 2 rainy
#> 6 3 hot
#>
#> $cars
#> index car
#> 1 1 honda
#> 2 2 toyota
#> 3 3 bmw
Created on 2018-11-04 by the reprex package (v0.2.1)
If the first list does not contain all the elements you want, you need to provide the .names argument in transpose. See help("transpose", package = "purrr").
I build an example for that.
options(stringsAsFactors = FALSE)
games <- data.frame(index = c(1,2,3), player = c('John', 'Sam', 'Mary'))
weather <- data.frame(index = c(1,2,3), temperature = c('hot', 'cold', 'rainy'))
list1 <- list(games = games, weather = weather)
games <- data.frame(index = c(1,2,3), player = c('AA', 'BB', 'CC'))
weather <- data.frame(index = c(1,2,3), temperature = c('cold', 'rainy', 'hot'))
cars <- data.frame(index = c(1,2,3), car = c('honda', 'toyota','bmw'))
list2 <- list(games = games, weather = weather, cars = cars)
library(purrr)
all_list <- list(list1, list2)
all_names <- all_list %>% map(names) %>% reduce(union)
list(list1, list2) %>%
# regroup named element together
transpose(.names = all_names) %>%
# bind the df together
map(dplyr::bind_rows)
#> $games
#> index player
#> 1 1 John
#> 2 2 Sam
#> 3 3 Mary
#> 4 1 AA
#> 5 2 BB
#> 6 3 CC
#>
#> $weather
#> index temperature
#> 1 1 hot
#> 2 2 cold
#> 3 3 rainy
#> 4 1 cold
#> 5 2 rainy
#> 6 3 hot
#>
#> $cars
#> index car
#> 1 1 honda
#> 2 2 toyota
#> 3 3 bmw
Created on 2018-11-04 by the reprex package (v0.2.1)
There's an easy way with lapply().
# For every element name that occurs in any of the three lists, stack the
# data frames stored under that name and drop exact duplicate rows.
# The lists are named explicitly: ls(pattern = "list") would also return any
# other global object whose name merely contains "list".
lapply(unique(unlist(lapply(list(list1, list2, list3), names))),
       function(x) unique(rbind(list1[[x]], list2[[x]], list3[[x]])))
Use setNames() and dplyr::as_tibble to get list names and tibbles.
Like so:
# Element names found in any of the lists. They are collected from the lists
# directly so the snippet does not depend on an object `Lol` that has not
# been created at this point.
nms <- unique(unlist(lapply(list(list1, list2, list3), names)))
# Stack the same-named data frames, drop duplicate rows, convert each result
# to a tibble and name the output list after the elements.
setNames(lapply(lapply(nms, function(x) unique(rbind(list1[[x]], list2[[x]], list3[[x]]))),
dplyr::as_tibble), nms)
Yields
$`games`
# A tibble: 6 x 2
index player
* <dbl> <fct>
1 1 John
2 2 Sam
3 3 Mary
4 1 AA
5 2 BB
6 3 CC
$weather
# A tibble: 6 x 2
index temperature
* <dbl> <fct>
1 1 hot
2 2 cold
3 3 rainy
4 1 cold
5 2 rainy
6 3 hot
$cars
# A tibble: 3 x 2
index car
* <dbl> <fct>
1 1 honda
2 2 toyota
3 3 bmw
$sport
# A tibble: 3 x 2
index interest
* <dbl> <fct>
1 1 swim
2 2 soccer
3 3 rugby
However, if the number of lists is unknown, and supposing all your lists live in the global environment with names starting with "list", you could use the following approach.
Lol <- mget(ls(pattern="^list+")) # list of lists
# Merge any number of named lists of data frames by element name.
#
# z: a list of named lists; every inner element is a data frame.
# Returns a named list with one data frame per distinct element name,
# holding the merged rows of all data frames stored under that name.
mergeFun <- function(z) {
  # Tag every data frame with the name it is stored under (helper column
  # "list") so the rows can be separated again after merging.
  l1 <- lapply(z, function(y) {
    # seq_along() rather than 1:length(y): an empty inner list contributes
    # nothing instead of being indexed with 1 and 0.
    lapply(seq_along(y), function(x) cbind(y[[x]], list = names(y)[x]))
  })
  l2 <- unlist(l1, recursive = FALSE) # flatten to one list of data frames
  # Full outer merge on all shared columns (the helper column included).
  l3 <- Reduce(function(...) merge(..., all = TRUE), l2)
  l4 <- split(l3, l3$list) # one data frame per element name
  l5 <- lapply(l4, function(w) {
    # Remove the helper column by name; it sits in position 2 only after a
    # merge has happened, so a positional drop is wrong for a single input.
    w <- w[, setdiff(names(w), "list"), drop = FALSE]
    # Remove columns that are entirely NA within this group.
    Filter(function(v) !all(is.na(v)), w)
  })
  lapply(l5, function(u) `rownames<-`(u, NULL)) # reset row names
}
Do lapply(mergeFun(Lol), dplyr::as_tibble) to obtain tibbles if desired, otherwise just mergeFun(Lol).
Yields
> lapply(mergeFun(Lol), dplyr::as_tibble)
$`games`
# A tibble: 6 x 2
index player
<dbl> <fct>
1 1 John
2 1 AA
3 2 Sam
4 2 BB
5 3 Mary
6 3 CC
$weather
# A tibble: 6 x 2
index temperature
<dbl> <fct>
1 1 cold
2 1 hot
3 2 cold
4 2 rainy
5 3 hot
6 3 rainy
$cars
# A tibble: 3 x 2
index car
<dbl> <fct>
1 1 honda
2 2 toyota
3 3 bmw
$sport
# A tibble: 3 x 2
index interest
<dbl> <fct>
1 1 swim
2 2 soccer
3 3 rugby
Data
list1 <- list(games = structure(list(index = c(1, 2, 3), player = structure(c(1L,
3L, 2L), .Label = c("John", "Mary", "Sam"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)), weather = structure(list(index = c(1, 2, 3), temperature = structure(c(2L,
1L, 3L), .Label = c("cold", "hot", "rainy"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)), cars = structure(list(index = c(1, 2, 3), car = structure(c(2L,
3L, 1L), .Label = c("bmw", "honda", "toyota"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)))
list2 <- list(games = structure(list(index = c(1, 2, 3), player = structure(1:3, .Label = c("AA",
"BB", "CC"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)), weather = structure(list(index = c(1, 2, 3), temperature = structure(c(1L,
3L, 2L), .Label = c("cold", "hot", "rainy"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)), sport = structure(list(index = c(1, 2, 3), interest = structure(3:1, .Label = c("rugby",
"soccer", "swim"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)))
list3 <- list(games = structure(list(index = c(1, 2, 3), player = structure(1:3, .Label = c("AA",
"BB", "CC"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)), weather = structure(list(index = c(1, 2, 3), temperature = structure(c(1L,
3L, 2L), .Label = c("cold", "hot", "rainy"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L)))

Resources