Subsetting nested lists within R - r

I am trying to create a dataframe from nested lists from within R. Here is an example:
# Example input: two files, each holding an unnamed "a" entry plus `sample1`,
# which in turn nests `sample2`.
mylist <- list(
  file1 = list(
    "a",
    sample1 = list(
      x = 2,
      y = list(c(1, 2)),
      sample2 = list(x = 4, y = list(c(3, 8)))
    )
  ),
  file2 = list(
    "a",
    sample1 = list(
      x = 6,
      y = list(c(6, 4)),
      sample2 = list(x = 6, y = list(c(7, 4)))
    )
  )
)
I would like to know how I could extract all the features 'x' and the features 'y' from the nested lists, with 'y' split into two columns; one for each value?
Thank you for your time, everyone!

I'm not exactly sure what your expected output is supposed to look like, but perhaps something like this?
library(tidyverse)
# Flatten the nested list into one named vector; the nesting path becomes the
# name (e.g. "file1.sample1.x", "file1.sample1.y1", "file1.sample1.y2").
unlist(mylist) %>%
data.frame(val = .) %>%
# Move those path names out of the row names into an `id` column.
rownames_to_column("id") %>%
# Keep only the x / y1 / y2 entries.
filter(str_detect(id, "(x|y1|y2)")) %>%
# Split at the last dot: everything before it stays in `id`, the final
# component (x, y1 or y2) goes to `col`.
separate(id, into = c("id", "col"), sep = "\\.(?=\\w+$)") %>%
# One row per id, one column per feature.
spread(col, val)
# id x y1 y2
#1 file1.sample1 2 1 2
#2 file1.sample1.sample2 4 3 8
#3 file2.sample1 6 6 4
#4 file2.sample1.sample2 6 7 4

Related

Selecting elements from a list with non compatible length

Given the following structure of the list:
# Example input: three records. The third has no `two`/`three` entries and
# carries two values in each field of its `list_a` item.
x <- list(
  list(Main = list(
    one = list(tlv = 1, beta = 2),
    two = "three",
    three = 4,
    list_a = list(list(value_1 = "a1", value_2 = "b", c = "c"))
  )),
  list(Main = list(
    one = list(tlv = 2, beta = 6),
    two = "seven",
    three = 8,
    list_a = list(list(value_1 = "aa2", value_2 = "bb", c = "cc"))
  )),
  list(Main = list(
    one = list(tlv = 3),
    list_a = list(list(
      value_1 = c("aaa3", "aaaa4"),
      value_2 = c("bbb", "bbbb"),
      c = c("ccc", "ccc")
    ))
  ))
)
I'm trying to create a dataframe with a structure like this:
tlv | value_1
1 | a1
2 | aa2
3 | aaa3
3 | aaaa4
So far I have the following:
library(tidyverse)
# This attempt still fails: `tlv` yields 3 values but `value_1` yields 4, so
# tibble() cannot line the two columns up.
tibble::tibble(
  tlv = map(x, list(1, 1, "tlv"), .default = NA) %>% unlist(),
  # The element is named "value_1"; plucking "value" would find nothing and
  # fall back to the default NA for every record.
  value = map(x, list(1, "list_a", 1, "value_1"), .default = NA) %>% unlist()
)
Which leads to the following error:
Error: Tibble columns must have compatible sizes.
* Size 3: Existing data.
* Size 4: Column `value`.
i Only values of size one are recycled.
This makes sense given the structure of the list (3 values for one of the variables and 4 values for the other). But I don't see how to link the values to the parent element of the list, so that every 'value' also gets the corresponding 'tlv' value. Any guidance on how to solve this problem?
Found a solution, this does the trick:
# Build one small tibble per top-level element of x and row-bind them.
x %>%
map_df(~tibble(
# A single tlv per element; tibble() recycles it across that element's values.
tlv = .$Main$one$tlv,
value = .$Main$list_a[[1]]$value_1))
An alternative :
library(tidyverse)
# Collect the `value_1` entries found four levels down, then flatten to one
# unnamed vector per top-level element of x.
# NOTE(review): .ragged = TRUE is presumably needed because some branches are
# shallower than depth 4 — confirm against the purrr docs.
value_1 <-
map_depth(x, 4, pluck, "value_1", .ragged = TRUE) %>%
map(unlist, use.names = FALSE)
# Same idea for `tlv`, three levels down, reduced to one number per element.
tlv <-
map_depth(x, 3, pluck, "tlv") %>%
map_dbl(unlist, use.names = FALSE)
# `value_1` is a list column here; unnesting it presumably repeats tlv once
# per contained value — verify the result matches the desired table.
df <-
tibble(tlv = tlv, value_1 = value_1) %>%
unnest_auto(col = value_1)

Writing for loop in r to combine columns that has matching names (with little variance)

I have a data frame where column names are duplicated once. Now I need to combine them to get a proper data set. I can use dplyr select command to extract matching columns and combine them later. However, I wish to achieve it using for loop.
# Example data frame: x.1 and y.1 are the duplicated counterparts of x and y.
x <- c(1, NA, 3)
y <- c(1, NA, 4)
x.1 <- c(NA, 3, NA)
y.1 <- c(NA, 5, NA)
# Pass the vectors that were actually defined (x.1, y.1); x1 and y1 do not
# exist and would make data.frame() fail with "object not found".
data <- data.frame(x, y, x.1, y.1)
## With `dplyr`, each group of duplicated columns can be collapsed by hand:
# Columns whose name contains "x" -> row-wise sum, ignoring NA.
t1 <- data%>%select(contains("x"))%>%
mutate(x = rowSums(., na.rm = TRUE))%>%
select(x)
# Same for the columns whose name contains "y".
t2 <- data%>%select(contains("y"))%>%
mutate(y = rowSums(., na.rm = TRUE))%>%
select(y)
# Put the two summed columns side by side.
data <- cbind(t1,t2)
This is cumbersome, as I have more than 25 similar columns.
How can I achieve the same result using a for loop that matches column names and performs rowSums? A simpler approach using dplyr would also help.
We can use split.default to split based on the substring of the column names into a list and then apply the rowSums
library(dplyr)
library(stringr)
library(purrr)
data %>%
# Group the columns by their name with the ".<digits>" suffix removed, giving
# a list with one data frame per base name.
split.default(str_remove(names(.), "\\.\\d+")) %>%
# Row-wise sum within each group, ignoring NA.
# NOTE(review): how map_dfr assembles these vectors may depend on the
# installed purrr/dplyr version — confirm it still gives one column per group.
map_dfr(rowSums, na.rm = TRUE)
# A tibble: 3 x 2
# x y
# <dbl> <dbl>
#1 1 1
#2 3 5
#3 3 4
If we want to use a for loop
# Base name of every column: drop everything from the first "." onwards
# (e.g. "x.1" -> "x").
base_names <- sub("\\..*", "", names(data))
un1 <- unique(base_names)
# Preallocate one slot per base name.
out <- setNames(vector("list", length(un1)), un1)
for (un in un1) {
  # Compare base names exactly. grep(un, names(data)) would treat `un` as a
  # regular expression and also pick up longer names that merely contain it
  # (e.g. "x" matching "max" or "xy").
  out[[un]] <- rowSums(data[base_names == un], na.rm = TRUE)
}
as.data.frame(out)
data
# Reproducible input: `x.1` / `y.1` are the duplicated counterparts of x / y.
data <- data.frame(
  x = c(1, NA, 3),
  y = c(1, NA, 4),
  x.1 = c(NA, 3, NA),
  y.1 = c(NA, 5, NA)
)
Using purrr::map_dfc and transmute instead of mutate
library(dplyr)
# For each base name, keep the columns whose name contains it and replace them
# with a single column (named after the base name) holding their row-wise sum;
# the per-name results are then bound side by side.
purrr::map_dfc(c('x','y'), ~data %>% select(contains(.x)) %>%
transmute(!!.x := rowSums(., na.rm = TRUE)))
x y
1 1 1
2 3 5
3 3 4

bind rows on list of elements to list of data.frame

I have a list of data frames and want to row-bind the elements within the list.
Each row should be bound into a data.frame according to its value in the Class column.
The actual data is quite large and each class has different columns. Here is sample
# Sample input: five one-row data frames, each tagged with a Class.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
df_list <- list()
df_list[[1]] <- data.frame(Class = "x", y = 1, stringsAsFactors = FALSE)
df_list[[2]] <- data.frame(Class = "x", y = 2, stringsAsFactors = FALSE)
df_list[[3]] <- data.frame(Class = "a", y = 3, stringsAsFactors = FALSE)
df_list[[4]] <- data.frame(Class = "x", y = 4, stringsAsFactors = FALSE)
df_list[[5]] <- data.frame(Class = "a", y = 5, stringsAsFactors = FALSE)
Desired output; I am looking for a way to do this programmatically:
# Desired result: one data frame per Class, rows kept in input order.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
df_list_out <- list()
df_list_out[[1]] <- bind_rows(
  data.frame(Class = "x", y = 1, stringsAsFactors = FALSE),
  data.frame(Class = "x", y = 2, stringsAsFactors = FALSE),
  data.frame(Class = "x", y = 4, stringsAsFactors = FALSE)
)
df_list_out[[2]] <- bind_rows(
  data.frame(Class = "a", y = 3, stringsAsFactors = FALSE),
  data.frame(Class = "a", y = 5, stringsAsFactors = FALSE)
)
One way would be to rbind the list of dataframes together and then split
# Stack every data frame in the list into one data frame ...
temp <- do.call(rbind, df_list)
# ... then split its rows into a named list keyed by Class.
split(temp, temp$Class)
#$a
# Class y
#3 a 3
#5 a 5
#$x
# Class y
#1 x 1
#2 x 2
#4 x 4
In dplyr, we can do
library(dplyr)
# Same idea with dplyr: stack the data frames, then split the rows by Class.
df_list %>% bind_rows() %>% group_split(Class)
You could lapply() over a vector of "Class"es and thus achieve that only one "Class" is processed at a time.
# Class label of each data frame (every element holds a single Class value).
classes <- vapply(
  df_list,
  function(d) as.character(d[["Class"]][1]),
  character(1)
)
# Derive the classes from the data, in order of first appearance, instead of
# hard-coding c("x", "a"); then row-bind the data frames of one class at a time.
lapply(unique(classes), function(x) do.call(rbind, df_list[classes == x]))
# [[1]]
# Class y
# 1 x 1
# 2 x 2
# 3 x 4
#
# [[2]]
# Class y
# 1 a 3
# 2 a 5

R add column for sparkline with value from each row vector

Start with a dataframe
library(dplyr)
library(sparkline)
df <- data.frame(matrix(1:9, nrow = 3, ncol = 3))
X1 X2 X3
1 1 4 7
2 2 5 8
3 3 6 9
Would like to add a column 'spark' for use with sparkline:
df <- df %>% mutate(spark = spk_chr(values = ?, type = "bar", elementId = X1))
So the question mark (?) would be replaced by a vector made up of each row of df.
For the first row, ? = c(1, 4, 7), the values from the first row, spark = spk(values = c(1, 4, 7)...)
I know how to extract a vector from any row, first row vector is unlist(df[1,]), but do not understand if this can be used in mutate.
Used Ronak's suggestion to create intermediate column:
# Column names are captured before `y` is added, so only the existing columns
# are pasted together below.
cols = names(df)
# Join each row's values into a single "-"-separated string.
df$y <- apply(df[,cols], 1, paste, collapse = "-")
Then created vectorized spk_chr:
sparky <- Vectorize(sparkline::spk_chr)
To use in making the spark column:
df <- df %>% mutate(spark = sparky(strsplit(y, split="-"), type = "bar", elementId = X1))

How do I cast a data frame with more than 3 columns in R?

Importing from an Access database, I have data that look similar to this:
# Sample of the imported data: one row per survey date and area, with the
# count (pCount) and the observation time kept as text.
p <- data.frame(SurvDate = as.Date(c('2018-11-1','2018-11-1','2018-11-1',
'2018-11-3', '2018-11-3')),
Area = c('AF','BB','CT', 'DF', 'BB'),
pCount = c(6, 3, 0, 12, 32),
ObsTime = c('8:51','8:59','9:13', '9:24', '9:30'),
stringsAsFactors = FALSE)
I want to cast my data with Rows as SurvDate and columns to be Areas (values as pCount) and ObsTime columns next to each Area with value ObsTime.
Example:
# Desired wide layout: one row per SurvDate and, for each Area, a count column
# followed by a Time<Area> column.
n <- data.frame(SurvDate = as.Date(c('2018-11-1','2018-11-3')),
AF = c(6, NA),
TimeAF = c('8:51', NA),
BB = c(3, 32),
TimeBB = c('8:59', '9:30'),
CT = c(0, NA),
# NOTE(review): CT's count sits on 2018-11-01 but its time on 2018-11-03; the
# input data has both on 2018-11-01 — presumably c('9:13', NA) was meant.
TimeCT = c(NA, '9:13'),
DF = c(NA,12),
TimeDF = c(NA, '9:24'))
I've tried variations on this theme, but can't get time to work.
library(reshape2)
dcast(p, SurvDate+ObsTime ~ Area)
Here is one way using tidyverse tools. Note that the output is not the same as your expected output, because it seems like you didn't put the values for CT in the right place (values spread across two dates). Approach is to unite the values so we have a single key-value pair to spread, and then separate out the columns again with mutate_at. We could also have used separate multiple times, though this would become unwieldy with too many Areas.
# Rebuild the question's data (the count column is called People here).
SurvDate <- as.Date(c('2018-11-1','2018-11-1','2018-11-1', '2018-11-3', '2018-11-3'))
Area <- c('AF','BB','CT', 'DF', 'BB')
People <- c(6, 3, 0, 12, 32)
ObsTime <- (c('8:51','8:59','9:13', '9:24', '9:30'))
p <- data.frame(SurvDate, Area, People, ObsTime, stringsAsFactors = FALSE)
library(tidyverse)
p %>%
# Fuse count and time into one "count_time" string so that there is a single
# value column to spread.
unite(vals, People, ObsTime) %>%
# One column per Area, one row per SurvDate.
spread(Area, vals) %>%
# Split every Area column back apart: <Area>_Time takes the text after the
# underscore, <Area>_Area the text before it (i.e. the count, still as text).
mutate_at(
.vars = vars(-SurvDate),
.funs = funs(
Time = str_extract(., "(?<=_).*$"),
Area = str_extract(., "^.*(?=_)")
)
) %>%
filter(!is.na(SurvDate)) %>%
# Keep only the derived columns (their names contain "_") ...
select(SurvDate, matches("_")) %>%
# ... and order them alphabetically so each Area's pair sits together.
select(SurvDate, order(colnames(.)))
#> SurvDate AF_Area AF_Time BB_Area BB_Time CT_Area CT_Time DF_Area
#> 1 2018-11-01 6 8:51 3 8:59 0 9:13 <NA>
#> 2 2018-11-03 <NA> <NA> 32 9:30 <NA> <NA> 12
#> DF_Time
#> 1 <NA>
#> 2 9:24
Created on 2018-04-30 by the reprex package (v0.2.0).

Resources