values become NA after use left_join() function in r - r

The value in another data frame becomes NA after I used left_join() function. And I check the answer at here[dplyr::left_join produce NA values for new joined columns.
I also specify the by arguement but failed.
I don't know why.
qx_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020-01-21", "2020-01-22", "2020-01-23", "2020-01-24",
"2020-01-25"), PRS = c(1013.9035387141, 1011.48779584751, 1014.28302402211,
1019.16970261716, 1018.92203467498), PRS_Sea = c(1024.73084750567,
1022.22210612717, 1025.02632842026, 1029.97905104403, 1029.77650132275
), PRS_Max = c(1014.26828869048, 1011.80445613662, 1014.51628117914,
1019.43671957672, 1019.31935504063), PRS_Min = c(1013.7138513322,
1011.13447054516, 1013.86811271731, 1018.75406934996, 1018.62469257842
), WIN_S_Max = c(2.30187606292517, 2.08586132369615, 2.76893908257748,
4.22074853552532, 3.63427225056689), WIN_S_Inst_Max = c(3.44360343442933,
3.09963836923658, 4.28499952758881, 6.68930898053666, 5.80619165721844
), WIN_D_INST_Max = c(116.878029336735, 218.745851048753, 120.88310303288,
72.1640447845805, 72.0331526360544), WIN_D_Avg_2mi = c(116.23329724764,
210.524530689871, 113.104009452075, 68.7694017991261, 70.322008604388
), WIN_S_Avg_2mi = c(1.77558118386243, 1.49959490740741, 2.20936874055178,
3.47942613851096, 2.99431642101285), WIN_D_S_Max = c(116.68018866665,
218.180671371681, 120.40502999811, 71.0831467309146, 68.3670670351474
), TEM = c(3.81968088624339, 5.16464226662887, 6.82721856103553,
5.98099596088435, 4.8940626181028), TEM_Max = c(4.04776301492819,
5.35075514928193, 6.97597470238095, 6.15192401266062, 5.07960293839758
), TEM_Min = c(3.49020455404384, 4.95346053004535, 6.65049142573696,
5.85618067365835, 4.76455794123205), RHU = c(85.9359859221466,
96.1710766250945, 91.749678760393, 88.3347741874528, 80.693040202192
), VAP = c(6.98015376984127, 8.55406509826153, 9.08114866780046,
8.27843124055178, 6.98599714191232), RHU_Min = c(83.965092356387,
95.6411387471655, 90.9997401738473, 87.3134436413454, 79.2219635770975
), PRE_1h = c(0.102133763227513, 0.422205333522298, 1488.33246492347,
0.0715384070294785, 372.116791028911)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L))
covid_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020/1/21", "2020/1/22", "2020/1/23", "2020/1/24",
"2020/1/25"), 新增确诊 = c(0L, 1L, 14L, 24L, 21L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
dat2 <- covid_p2 %>% left_join(qx_p2, by = c('province' = 'province', 'date' = 'date'))
dat2

Your date columns are character columns and do not have the same format:
qx_p2$date
# "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" "2020-01-25"
covid_p2$date
# "2020/1/21" "2020/1/22" "2020/1/23" "2020/1/24" "2020/1/25"
You can get them in the same format by, for example, applying as.Date(...):
covid_p2$date <- as.Date(covid_p2$date)
qx_p2$date <- as.Date(qx_p2$date)
After that, your join works.

Related

How to cbind a list of tables by one column, and suffix headings with the list item name

I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
merged_tables <- plyr::join_all(stats_by_measure, by = "sample_id", type = "left")
we could do it this way:
The trick is to use .id = 'id' in bind_rows which adds the name as a column. Then we could pivot:
library(dplyr)
library(tidyr)
bind_rows(list_of_tables, .id = 'id') %>%
pivot_wider(names_from = id,
values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Probably, we may use reduce2 here with suffix option from left_join
library(dplyr)
library(purrr)
nm <- names(list_of_tables)[1]
reduce2(list_of_tables, names(list_of_tables)[-1],
function(x, y, z) left_join(x, y, by = 'sample_id', suffix = c(nm, z)))
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
imap(list_of_tables, ~ {
nm <- .y
.x %>% rename_with(~str_c(.x, nm), -1)
}) %>%
plyr::join_all( by = "sample_id", type = "left")
Or use a for loop
tmp <- list_of_tables[[1]]
names(tmp)[-1] <- paste0(names(tmp)[-1], names(list_of_tables)[1])
for(nm in names(list_of_tables)[-1]) {
tmp2 <- list_of_tables[[nm]]
names(tmp2)[-1] <- paste0(names(tmp2)[-1], nm)
tmp <- left_join(tmp, tmp2, by = "sample_id")
}
tmp

Re-organizing nested list elements in R

I have a long list that has elements including charactor and numeric vectors. I want to re-organize them to the data frame format.
Let's say this is the structure of two first elements of the list_a:
list_a <- list(ENSG00000040608 = list(var = c("chr22_20230714_G_A_b38",
"chr22_20230737_G_A_b38", "chr22_20231229_T_A_b38", "chr22_20231474_G_A_b38",
"chr22_20231667_C_G_b38", "chr22_20231957_G_C_b38", "chr22_20231969_G_T_b38",
"chr22_20232125_G_A_b38", "chr22_20232392_G_A_b38", "chr22_20232643_A_C_b38"
), coeff = c(0.0000953181301665087, -0.00124036704551427, 0.000808061558738542,
0.000387528601423933, -0.000120624028990859, -0.00119982510044018,
-0.000120623899338185, -0.000435011907222715, 0.000715410285903684,
-0.000347899088267475)), ENSG00000040608 = list(var = c("chr22_20230714_G_A_b38",
"chr22_20230737_G_A_b38", "chr22_20231229_T_A_b38", "chr22_20231474_G_A_b38",
"chr22_20231667_C_G_b38", "chr22_20231957_G_C_b38", "chr22_20231969_G_T_b38",
"chr22_20232125_G_A_b38", "chr22_20232392_G_A_b38", "chr22_20232643_A_C_b38"
), coeff = c(0.0000953181301665087, -0.00124036704551427, 0.000808061558738542,
0.000387528601423933, -0.000120624028990859, -0.00119982510044018,
-0.000120623899338185, -0.000435011907222715, 0.000715410285903684,
-0.000347899088267475)))
The desire output as list_b:
list_b <- list(ENSG00000040608 = structure(list(chr22_20230714_G_A_b38 = -0.000347899088267475,
chr22_20230737_G_A_b38 = 0.0000953181301665087, chr22_20231229_T_A_b38 = -0.00124036704551427,
chr22_20231474_G_A_b38 = 0.000808061558738542, chr22_20231667_C_G_b38 = 0.000387528601423933,
chr22_20231957_G_C_b38 = -0.000120624028990859, chr22_20231969_G_T_b38 = -0.00119982510044018,
chr22_20232125_G_A_b38 = -0.000120623899338185, chr22_20232392_G_A_b38 = -0.000435011907222715,
chr22_20232643_A_C_b38 = -0.000347899088267475), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")), ENSG00000040608 = structure(list(
chr22_20230714_G_A_b38 = -0.000347899088267475, chr22_20230737_G_A_b38 = 0.0000953181301665087,
chr22_20231229_T_A_b38 = -0.00124036704551427, chr22_20231474_G_A_b38 = 0.000808061558738542,
chr22_20231667_C_G_b38 = 0.000387528601423933, chr22_20231957_G_C_b38 = -0.000120624028990859,
chr22_20231969_G_T_b38 = -0.00119982510044018, chr22_20232125_G_A_b38 = -0.000120623899338185,
chr22_20232392_G_A_b38 = -0.000435011907222715, chr22_20232643_A_C_b38 = -0.000347899088267475), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")))
Also I need to have those numbers as integer.
I would appreciate your guidance.
base R
lapply(list_a, function(x) {
df <- as.data.frame(t(x[[2]]))
colnames(df) <- x[[1]]
df
})
tidyverse
library(tidyverse)
map(list_a, ~ .x %>%
as_tibble() %>%
pivot_wider(names_from = "var", values_from = "coeff"))

How to assign the row names of a dataframe as the column names of another dataframe?

I have two dataframes:
meth_df
> dput(meth_df[0:5,0:5])
structure(list(`TCGA-2K-A9WE-01A` = c(0.611033076810465, 0.786837244239289,
0.531054614303851, 0.711916183761331, 0.758443223998425), `TCGA-2Z-A9J1-01A` = c(0.468013052647261,
0.386177267500376, 0.508623627469028, 0.403601275088479, 0.754642399207848
), `TCGA-2Z-A9J2-01A` = c(0.593559707995411, 0.54983504208745,
0.535207192925841, 0.613971903755576, 0.717278085189431), `TCGA-2Z-A9J3-01A` = c(0.638211007873003,
0.319561448644096, 0.526699541432941, 0.450002172806716, 0.736440001203422
), `TCGA-2Z-A9J5-01A` = c(0.603998109440889, 0.638039512259872,
0.584328151056768, 0.594021097192165, 0.818583455926719)), row.names = c("A1BG",
"A1CF", "A2BP1", "A2LD1", "A2M"), class = "data.frame")
class_df
> dput(class_df[0:5,])
structure(list(X = c("TCGA-2K-A9WE-01A", "TCGA-2Z-A9J1-01A",
"TCGA-2Z-A9J2-01A", "TCGA-2Z-A9J3-01A", "TCGA-2Z-A9J5-01A"),
survival = c("lts", "lts", "lts", "lts", "lts"), type = c("tumor",
"tumor", "tumor", "tumor", "tumor"), subtype = c("kirp",
"kirp", "kirp", "kirp", "kirp")), row.names = c(NA, 5L), class = "data.frame")
The columns of meth_df corresponds with the X column of class_df. The actual meth_df has 19143 obs. of 865 variables. The actual class_df has 1146 obs. of 4 variables.
First, I append the survival info to corresponding X columns in class_df
cols <- c("X", "survival")
rownames(class_df) <- apply(class_df[,cols], 1, paste, collapse="_")
Now, I want to assign the row names of class_df as the column names of meth_df if the existing column names of meth_df matches the X column of class_df. I'm not sure how to do it.
Desired output:
> dput(meth_df[0:5,0:5])
structure(list(`TCGA-2K-A9WE-01A_lts` = c(0.611033076810465, 0.786837244239289,
0.531054614303851, 0.711916183761331, 0.758443223998425), `TCGA-2Z-A9J1-01A_lts` = c(0.468013052647261,
0.386177267500376, 0.508623627469028, 0.403601275088479, 0.754642399207848
), `TCGA-2Z-A9J2-01A_lts` = c(0.593559707995411, 0.54983504208745,
0.535207192925841, 0.613971903755576, 0.717278085189431), `TCGA-2Z-A9J3-01A_lts` = c(0.638211007873003,
0.319561448644096, 0.526699541432941, 0.450002172806716, 0.736440001203422
), `TCGA-2Z-A9J5-01A_lts` = c(0.603998109440889, 0.638039512259872,
0.584328151056768, 0.594021097192165, 0.818583455926719)), row.names = c("A1BG",
"A1CF", "A2BP1", "A2LD1", "A2M"), class = "data.frame")

Match strings from main df with those in reference df - if found, add all cols from that row of the reference df into main df

I have a "main_df" along the lines of this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture")), row.names = c(NA,
-6L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), reshapeWide = list(
v.names = NULL, timevar = "index", idvar = c("study_id",
"question"), times = c("rt", "response"), varying = structure(c("response.rt",
"response.response"), .Dim = 1:2)), groups = structure(list(
study_id = "02ipnnqgeovkrxz", .rows = list(1:6)), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
and a reference df along the lines of this:
structure(list(stim = c("ashtray_word", "bell_word", "blouse_word",
"boot_word", "bottle_word", "bread_word"), url = c("eW1BRoUDV4BKQMl",
"5zKTGwHlwlzpssB", "55SVfoQudZJNCFT", "bOORR1zuKYSnAe9", "6RrOQfDZim81pHv",
"1F97ouH0HrwQOgZ"), study_list = c("A", "A", "A", "A", "A", "A"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
Each value in the 'question' column of the main df can be found in the 'url' column of the reference df. I want to match these values, and add all columns from that row of the reference df to my main df. The output will look like this:
structure(list(study_id = c("02ipnnqgeovkrxz", "02ipnnqgeovkrxz",
"02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz", "02ipnnqgeovkrxz"
), question = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ", "40iyFKjeMEFGI2V",
"6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"), study_rt = c("1.353",
"0.714", "0.68", "0.695", "0.696", "0.656"), study_response = c("picture",
"picture", "picture", "picture", "picture", "picture"), stim = c("chisel_picture",
"raccoon_picture", "apple_picture", "belt_picture", "bicycle_picture",
"cake_picture"), url = c("3eEVJgaAP6c9FPL", "b8GLxGjZKtstCQZ",
"40iyFKjeMEFGI2V", "6eZGejSZ1oTZYLb", "3pXAUvZH8GGuryd", "0kYkUAHe4iODUl7"
), study_list = c("B FILLER", "B FILLER", "B", "B", "B", "B")), row.names = c(NA,
-6L), groups = structure(list(study_id = "02ipnnqgeovkrxz", .rows = list(
1:6)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", "tbl", "data.frame"))
This will allow me to see the 'sensible' item names (e.g. "chisel_picture") that subjects were responding to, as opposed to the nonsensical code names I have now (e.g. "3eEVJgaAP6c9FPL"). The same items appear over and over again in the 'question' column (as different subjects saw the same items), and I need to preserve these repeats.
I have successfully managed this using a for loop...but it's super slow! A tidyverse solution would be amazing!
My awful for loop (study_data = main df / image_urls = reference df):
all_study_stim_items <- study_data$question # List all values in 'question' column.
matched_items <- tibble() # Create empty tibble to store results of for loop.
for (i in all_study_stim_items) {
temp <- image_urls %>%
filter(url == i) %>%
select(stim, url, study_list)
matched_items <- bind_rows(matched_items, temp) } # Continuously overwrite tibble with each match.
# I then join this with the main df.

How to plot layers of tupples on same plot in R?

I am trying to plot the time and NDVI for each region on the same plot. I think to do this I have to convert the date column from characters to time and then plot each layer. However I cannot figure out how to do this. Any thoughts?
list(structure(list(observation = 1L, HRpcode = NA_character_,
timeseries = NA_character_), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3793765496776215", "0.21686891782421552", "0.3785652933528299",
"0.41027240624704164", "0.4035578030242673", "0.341299793064468"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.4071076986818826", "0.09090719657570319", "0.35214166081795284",
"0.4444311032927228", "0.5220702877666005", "0.5732370503295022"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3412131556625801", "0.18815996897460135", "0.5218904976415136",
"0.6970128777711452", "0.7229657162729096", "0.535967435470161"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)))
111
First we need to clean your data. The first element in this list is empty
df = df[-1]
Now we need to make a data.frame
df = do.call(rbind, df)
I am going to add a region variable, change the name of NDVI to remove the space,
change ndvi into a numeric vector, and change time into a Date object
library(dplyr)
df = df %>%
mutate(region = factor(rep(1:3, rep(6, 3)))) %>%
rename(ndvi = ' NDVI') %>%
mutate(ndvi = as.numeric(ndvi)) %>%
mutate(time = as.Date(time))
Now we can use ggplot2 to plot the data by region
library(ggplot2)
g = df %>%
ggplot(aes(x = time, y = ndvi, col = region)) +
geom_line()
g
Which gives this plot:
Here's an approach with lubridate to handle dates and dplyr to make the binding of the data.frames easier to understand.
Note that the group names are taken from the names of the list, and since those don't exist in the data you provided, we have to set them in advance.
library(lubridate)
library(ggplot2)
library(dplyr)
names(data) <- 1:3
data <- bind_rows(data, .id = "group")
data$time <- ymd(data$time)
setnames(data," NDVI","NDVI")
data$NDVI <- as.numeric(data$NDVI)
ggplot(data, aes(x=time,y=NDVI,color=Group)) + geom_line()

Resources