Re-organizing nested list elements in R - r

I have a long list that has elements including charactor and numeric vectors. I want to re-organize them to the data frame format.
Let's say this is the structure of two first elements of the list_a:
list_a <- list(ENSG00000040608 = list(var = c("chr22_20230714_G_A_b38",
"chr22_20230737_G_A_b38", "chr22_20231229_T_A_b38", "chr22_20231474_G_A_b38",
"chr22_20231667_C_G_b38", "chr22_20231957_G_C_b38", "chr22_20231969_G_T_b38",
"chr22_20232125_G_A_b38", "chr22_20232392_G_A_b38", "chr22_20232643_A_C_b38"
), coeff = c(0.0000953181301665087, -0.00124036704551427, 0.000808061558738542,
0.000387528601423933, -0.000120624028990859, -0.00119982510044018,
-0.000120623899338185, -0.000435011907222715, 0.000715410285903684,
-0.000347899088267475)), ENSG00000040608 = list(var = c("chr22_20230714_G_A_b38",
"chr22_20230737_G_A_b38", "chr22_20231229_T_A_b38", "chr22_20231474_G_A_b38",
"chr22_20231667_C_G_b38", "chr22_20231957_G_C_b38", "chr22_20231969_G_T_b38",
"chr22_20232125_G_A_b38", "chr22_20232392_G_A_b38", "chr22_20232643_A_C_b38"
), coeff = c(0.0000953181301665087, -0.00124036704551427, 0.000808061558738542,
0.000387528601423933, -0.000120624028990859, -0.00119982510044018,
-0.000120623899338185, -0.000435011907222715, 0.000715410285903684,
-0.000347899088267475)))
The desire output as list_b:
list_b <- list(ENSG00000040608 = structure(list(chr22_20230714_G_A_b38 = -0.000347899088267475,
chr22_20230737_G_A_b38 = 0.0000953181301665087, chr22_20231229_T_A_b38 = -0.00124036704551427,
chr22_20231474_G_A_b38 = 0.000808061558738542, chr22_20231667_C_G_b38 = 0.000387528601423933,
chr22_20231957_G_C_b38 = -0.000120624028990859, chr22_20231969_G_T_b38 = -0.00119982510044018,
chr22_20232125_G_A_b38 = -0.000120623899338185, chr22_20232392_G_A_b38 = -0.000435011907222715,
chr22_20232643_A_C_b38 = -0.000347899088267475), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")), ENSG00000040608 = structure(list(
chr22_20230714_G_A_b38 = -0.000347899088267475, chr22_20230737_G_A_b38 = 0.0000953181301665087,
chr22_20231229_T_A_b38 = -0.00124036704551427, chr22_20231474_G_A_b38 = 0.000808061558738542,
chr22_20231667_C_G_b38 = 0.000387528601423933, chr22_20231957_G_C_b38 = -0.000120624028990859,
chr22_20231969_G_T_b38 = -0.00119982510044018, chr22_20232125_G_A_b38 = -0.000120623899338185,
chr22_20232392_G_A_b38 = -0.000435011907222715, chr22_20232643_A_C_b38 = -0.000347899088267475), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")))
Also I need to have those numbers as integer.
I would appreciate your guidance.

base R
lapply(list_a, function(x) {
df <- as.data.frame(t(x[[2]]))
colnames(df) <- x[[1]]
df
})
tidyverse
library(tidyverse)
map(list_a, ~ .x %>%
as_tibble() %>%
pivot_wider(names_from = "var", values_from = "coeff"))

Related

How to cbind a list of tables by one column, and suffix headings with the list item name

I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
merged_tables <- plyr::join_all(stats_by_measure, by = "sample_id", type = "left")
we could do it this way:
The trick is to use .id = 'id' in bind_rows which adds the name as a column. Then we could pivot:
library(dplyr)
library(tidyr)
bind_rows(list_of_tables, .id = 'id') %>%
pivot_wider(names_from = id,
values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Probably, we may use reduce2 here with suffix option from left_join
library(dplyr)
library(purrr)
nm <- names(list_of_tables)[1]
reduce2(list_of_tables, names(list_of_tables)[-1],
function(x, y, z) left_join(x, y, by = 'sample_id', suffix = c(nm, z)))
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
imap(list_of_tables, ~ {
nm <- .y
.x %>% rename_with(~str_c(.x, nm), -1)
}) %>%
plyr::join_all( by = "sample_id", type = "left")
Or use a for loop
tmp <- list_of_tables[[1]]
names(tmp)[-1] <- paste0(names(tmp)[-1], names(list_of_tables)[1])
for(nm in names(list_of_tables)[-1]) {
tmp2 <- list_of_tables[[nm]]
names(tmp2)[-1] <- paste0(names(tmp2)[-1], nm)
tmp <- left_join(tmp, tmp2, by = "sample_id")
}
tmp

Merging data frames to plot multiple time series using plyr

I want to plot multiple time series on one plot. Currently I can plot them all individually but not together. How can I join the data because the years are split by decimals.
What I basically want to end up with is this Plotting multiple time-series in ggplot (See plot in the first answer)
library(tidyverse)
library(plyr)
theme_set(theme_bw(10))
Sydney1<-read.csv("Sydney1.csv",header=TRUE)
Sydney2<-read.csv("Sydney2.csv",header=TRUE)
Eden<-read.csv("Eden.csv",header=TRUE)
StonyBay<-read.csv("Stonybay.csv",header=TRUE)
LowHead<-read.csv("Lowhead.csv",header=TRUE)
Hobart<-read.csv("Hobart.csv",header=TRUE)
Devonport<-read.csv("Devonport.csv",header=TRUE)
Freemantle<-read.csv("Freemantle.csv",header=TRUE)
ggplot(Sydney1,aes(x=Year,y=SLR))+geom_line(aes(color="Sydney1"))
ggplot(Sydney2,aes(x=Year,y=SLR))+geom_line(aes(color="Sydney2"))
ggplot(Eden,aes(x=Year,y=SLR))+geom_line(aes(color="Eden"))
ggplot(StonyBay,aes(x=Year,y=SLR))+geom_line(aes(color="StonyBay"))
ggplot(LowHead,aes(x=Year,y=SLR))+geom_line(aes(color="Lowhead"))
ggplot(Hobart,aes(x=Year,y=SLR))+geom_line(aes(color="Hobart"))
ggplot(Devonport,aes(x=Year,y=SLR))+geom_line(aes(color="Devonport"))
ggplot(Freemantle,aes(x=Year,y=SLR))+geom_line(aes(color="Freemantle"))
#Sydney 1
structure(list(Year = c(1886.0417, 1886.125, 1886.2083, 1886.2917,
1886.375, 1886.4583), SLR = c(6819L, 6942L, 6980L, 6958L, 7015L,
6892L)), row.names = c(NA, 6L), class = "data.frame")
#Sydney 2
structure(list(Year = c(1914.4583, 1914.5417, 1914.625, 1914.7083,
1914.7917, 1914.875), SLR = c(7022L, 6963L, 6915L, 6924L, 6866L,
6956L)), row.names = c(NA, 6L), class = "data.frame")
#Eden
structure(list(Year = c(1986.7917, 1986.875, 1986.9583, 1987.0417,
1987.125, 1987.2083), SLR = c(7003L, 6942L, 6969L, 7067L, NA,
7015L)), row.names = c(NA, 6L), class = "data.frame")
#Stony Bay
structure(list(Year = c(1993.0417, 1993.125, 1993.2083, 1993.2917,
1993.375, 1993.4583), SLR = c(6826L, 6868L, 6796L, 6862L, 6893L,
6951L)), row.names = c(NA, 6L), class = "data.frame")
#Low head
structure(list(Year = c(2010.125, 2010.2083, 2010.2917, 2010.375,
2010.4583, 2010.5417), SLR = c(6971L, 6968L, 7030L, 7088L, 7063L,
7035L)), row.names = c(NA, 6L), class = "data.frame")
#Hobart
structure(list(Year = c(1987.875, 1987.9583, 1988.0417, 1988.125,
1988.2083, 1988.2917), SLR = c(6916L, 6870L, 6930L, 6870L, 6820L,
6817L)), row.names = c(NA, 6L), class = "data.frame")
#Devonport
structure(list(Year = c(1989.875, 1989.9583, 1990.0417, 1990.125,
1990.2083, 1990.2917), SLR = c(6976L, 7025L, 7030L, 7046L, 6999L,
7055L)), row.names = c(NA, 6L), class = "data.frame")
#Freemantle
structure(list(Year = c(1897.0417, 1897.125, 1897.2083, 1897.2917,
1897.375, 1897.4583), SLR = c(6542L, 6524L, 6557L, 6655L, 6648L,
6729L)), row.names = c(NA, 6L), class = "data.frame")
Using the data in the Note at the end first accumulate the series in a list L -- we assume any data frame having column names Year and SLR is to be added -- and then convert that to a single zoo object and plot it using autoplot.zoo which uses ggplot2. Remove the facet = NULL argument if you want them plotted in separate facets.
library(ggplot2)
library(zoo)
is_city_df <- function(x) is.data.frame(x) && identical(names(x), c("Year", "SLR"))
L <- Filter(is_city_df, mget(ls()))
z <- do.call("merge", lapply(L, read.zoo))
autoplot(z, facet = NULL)
Note
We assume the following inputs:
Sydney1 <-
structure(list(Year = c(1886.0417, 1886.125, 1886.2083, 1886.2917,
1886.375, 1886.4583), SLR = c(6819L, 6942L, 6980L, 6958L, 7015L,
6892L)), row.names = c(NA, 6L), class = "data.frame")
Sydney2 <-
structure(list(Year = c(1914.4583, 1914.5417, 1914.625, 1914.7083,
1914.7917, 1914.875), SLR = c(7022L, 6963L, 6915L, 6924L, 6866L,
6956L)), row.names = c(NA, 6L), class = "data.frame")
Eden <-
structure(list(Year = c(1986.7917, 1986.875, 1986.9583, 1987.0417,
1987.125, 1987.2083), SLR = c(7003L, 6942L, 6969L, 7067L, NA,
7015L)), row.names = c(NA, 6L), class = "data.frame")
Freemantle <-
structure(list(Year = c(1897.0417, 1897.125, 1897.2083, 1897.2917,
1897.375, 1897.4583), SLR = c(6542L, 6524L, 6557L, 6655L, 6648L,
6729L)), row.names = c(NA, 6L), class = "data.frame")
Assuming that your global environment only contains the dataframes you want to plot:
bind_rows(mget(ls()), .id = "City") %>%
ggplot(aes(x = Year, y = SLR, color = City)) +
geom_line()
Instead of creating multiple objects in the global env, it can be read all at once in a loop
library(dplyr)
library(purrr)
library(ggplot2)
library(readr)
files <- c("Sydney1.csv", "Sydney2.csv", "Eden.csv", "Stonybay.csv",
"Lowhead.csv", "Hobart.csv", "Devonport.csv", "Freemantle.csv")
map_dfr(files, ~ read_csv(.x) %>%
mutate(City = .x)) %>%
ggplot(aes(x = Year, y = SLR, color = City)) +
geom_line())
Try this:
Sydney1 %>% mutate(city = 'sydney1') %>%
bind_rows(Sydney2 %>% mutate(city = 'sydney2')) %>%
bind_rows(Eden %>% mutate(city = 'eden')) %>%
bind_rows(Freemantle %>% mutate(city = 'freemantle')) %>%
ggplot(aes(x=Year, y=SLR, color=city)) + geom_line() + facet_wrap(~city, scale='free')

Match strings from main df with those in reference df - if found, add all cols from that row of the reference df into main df

I have a "main_df" along the lines of this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture")), row.names = c(NA,
-6L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), reshapeWide = list(
v.names = NULL, timevar = "index", idvar = c("study_id",
"question"), times = c("rt", "response"), varying = structure(c("response.rt",
"response.response"), .Dim = 1:2)), groups = structure(list(
study_id = "02ipnnqgeovkrxz", .rows = list(1:6)), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
and a reference df along the lines of this:
structure(list(stim = c("ashtray_word", "bell_word", "blouse_word",
"boot_word", "bottle_word", "bread_word"), url = c("eW1BRoUDV4BKQMl",
"5zKTGwHlwlzpssB", "55SVfoQudZJNCFT", "bOORR1zuKYSnAe9", "6RrOQfDZim81pHv",
"1F97ouH0HrwQOgZ"), study_list = c("A", "A", "A", "A", "A", "A"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
Each value in the 'question' column of the main df can be found in the 'url' column of the reference df. I want to match these values, and add all columns from that row of the reference df to my main df. The output will look like this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture"), stim = c("chisel_picture",
"raccoon_picture", "apple_picture", "belt_picture", "bicycle_picture",
"cake_picture"), url = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ",
"40iyFKjeMEFGI2V", "6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"
), study_list = c("B FILLER", "B FILLER", "B", "B", "B", "B")), row.names = c(NA,
-6L), groups = structure(list(study_id = "02ipnnqgeovkrxz", .rows = list(
1:6)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", "tbl", "data.frame"))
This will allow me to see the 'sensible' item names (e.g. "chisel_picture") that subjects were responding to, as opposed to the nonsensical code names I have now (e.g. "3eEVJgaAP6c9FPL"). The same items appear over and over again in the 'question' column (as different subjects saw the same items), and I need to preserve these repeats.
I have successfully managed this using a for loop...but it's super slow! A tidyverse solution would be amazing!
My awful for loop (study_data = main df / image_urls = reference df):
all_study_stim_items <- study_data$question # List all values in 'question' column.
matched_items <- tibble() # Create empty tibble to store results of for loop.
for (i in all_study_stim_items) {
temp <- image_urls %>%
filter(url == i) %>%
select(stim, url, study_list)
matched_items <- bind_rows(matched_items, temp) } # Continuously overwrite tibble with each match.
# I then join this with the main df.

values become NA after use left_join() function in r

The value in another data frame becomes NA after I used left_join() function. And I check the answer at here[dplyr::left_join produce NA values for new joined columns.
I also specify the by arguement but failed.
I don't know why.
qx_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020-01-21", "2020-01-22", "2020-01-23", "2020-01-24",
"2020-01-25"), PRS = c(1013.9035387141, 1011.48779584751, 1014.28302402211,
1019.16970261716, 1018.92203467498), PRS_Sea = c(1024.73084750567,
1022.22210612717, 1025.02632842026, 1029.97905104403, 1029.77650132275
), PRS_Max = c(1014.26828869048, 1011.80445613662, 1014.51628117914,
1019.43671957672, 1019.31935504063), PRS_Min = c(1013.7138513322,
1011.13447054516, 1013.86811271731, 1018.75406934996, 1018.62469257842
), WIN_S_Max = c(2.30187606292517, 2.08586132369615, 2.76893908257748,
4.22074853552532, 3.63427225056689), WIN_S_Inst_Max = c(3.44360343442933,
3.09963836923658, 4.28499952758881, 6.68930898053666, 5.80619165721844
), WIN_D_INST_Max = c(116.878029336735, 218.745851048753, 120.88310303288,
72.1640447845805, 72.0331526360544), WIN_D_Avg_2mi = c(116.23329724764,
210.524530689871, 113.104009452075, 68.7694017991261, 70.322008604388
), WIN_S_Avg_2mi = c(1.77558118386243, 1.49959490740741, 2.20936874055178,
3.47942613851096, 2.99431642101285), WIN_D_S_Max = c(116.68018866665,
218.180671371681, 120.40502999811, 71.0831467309146, 68.3670670351474
), TEM = c(3.81968088624339, 5.16464226662887, 6.82721856103553,
5.98099596088435, 4.8940626181028), TEM_Max = c(4.04776301492819,
5.35075514928193, 6.97597470238095, 6.15192401266062, 5.07960293839758
), TEM_Min = c(3.49020455404384, 4.95346053004535, 6.65049142573696,
5.85618067365835, 4.76455794123205), RHU = c(85.9359859221466,
96.1710766250945, 91.749678760393, 88.3347741874528, 80.693040202192
), VAP = c(6.98015376984127, 8.55406509826153, 9.08114866780046,
8.27843124055178, 6.98599714191232), RHU_Min = c(83.965092356387,
95.6411387471655, 90.9997401738473, 87.3134436413454, 79.2219635770975
), PRE_1h = c(0.102133763227513, 0.422205333522298, 1488.33246492347,
0.0715384070294785, 372.116791028911)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L))
covid_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020/1/21", "2020/1/22", "2020/1/23", "2020/1/24",
"2020/1/25"), 新增确诊 = c(0L, 1L, 14L, 24L, 21L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
dat2 <- covid_p2 %>% left_join(qx_p2, by = c('province' = 'province', 'date' = 'date'))
dat2
Your date columns are character columns and do not have the same format:
qx_p2$date
# "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" "2020-01-25"
covid_p2$date
# "2020/1/21" "2020/1/22" "2020/1/23" "2020/1/24" "2020/1/25"
You can get them in the same format by, for example, applying as.Date(...):
covid_p2$date <- as.Date(covid_p2$date)
qx_p2$date <- as.Date(qx_p2$date)
After that, your join works.

How to plot layers of tupples on same plot in R?

I am trying to plot the time and NDVI for each region on the same plot. I think to do this I have to convert the date column from characters to time and then plot each layer. However I cannot figure out how to do this. Any thoughts?
list(structure(list(observation = 1L, HRpcode = NA_character_,
timeseries = NA_character_), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3793765496776215", "0.21686891782421552", "0.3785652933528299",
"0.41027240624704164", "0.4035578030242673", "0.341299793064468"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.4071076986818826", "0.09090719657570319", "0.35214166081795284",
"0.4444311032927228", "0.5220702877666005", "0.5732370503295022"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3412131556625801", "0.18815996897460135", "0.5218904976415136",
"0.6970128777711452", "0.7229657162729096", "0.535967435470161"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)))
111
First we need to clean your data. The first element in this list is empty
df = df[-1]
Now we need to make a data.frame
df = do.call(rbind, df)
I am going to add a region variable, change the name of NDVI to remove the space,
change ndvi into a numeric vector, and change time into a Date object
library(dplyr)
df = df %>%
mutate(region = factor(rep(1:3, rep(6, 3)))) %>%
rename(ndvi = ' NDVI') %>%
mutate(ndvi = as.numeric(ndvi)) %>%
mutate(time = as.Date(time))
Now we can use ggplot2 to plot the data by region
library(ggplot2)
g = df %>%
ggplot(aes(x = time, y = ndvi, col = region)) +
geom_line()
g
Which gives this plot:
Here's an approach with lubridate to handle dates and dplyr to make the binding of the data.frames easier to understand.
Note that the group names are taken from the names of the list, and since those don't exist in the data you provided, we have to set them in advance.
library(lubridate)
library(ggplot2)
library(dplyr)
names(data) <- 1:3
data <- bind_rows(data, .id = "group")
data$time <- ymd(data$time)
setnames(data," NDVI","NDVI")
data$NDVI <- as.numeric(data$NDVI)
ggplot(data, aes(x=time,y=NDVI,color=Group)) + geom_line()

Resources