Aggregate columns based on categories given by another dataframe - r

I have a dataframe where each column has some vector of data. I want to apply the mean columnwise, but filtered by groups which are given by a second dataframe. That is, each column belongs to a group and this information is in the second dataframe.
Here is some example dataset: df is the dataframe with the data vectors, df_category contains the category for each column.
df=structure(list(x1 = c(0.461302090047301, -1.19974381763812, -0.888258056235799,
0.300889698419314, 0.836911163114131, 0.0540388337324712), x2 = c(1.33736696170763,
-0.687026295689823, 1.12205295626651, -0.848925266014684, 1.16092168555067,
0.591202293337843), x3 = c(-0.279052669225263, -0.780435476613128,
-0.852870619718068, -0.708611614262357, -0.761659405740852, 0.487033696695474
), x4 = c(-0.222767493777229, 1.50328295132467, 0.934670132217215,
1.37678188537077, 0.343280062984192, 1.23279081824003), x5 = c(-1.08074586121729,
0.208120194894818, -0.52245832008453, 0.944618465137011, 0.749834485631317,
-0.81118414509141)), class = "data.frame", row.names = c(NA,
-6L))
df_category=structure(list(Col_name = structure(1:5, .Label = c("x1", "x2",
"x3", "x4", "x5"), class = "factor"), Category = structure(c(1L,
1L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
The result I want is this one:
df_result=structure(list(mean_A = c(0.899334525877468, -0.943385056663974,
0.116897450015357, -0.274017783797685, 0.998916424332403, 0.322620563535157
), mean_B = c(-0.527522008073261, 0.310322556535454, -0.146886269195128,
0.537596245415141, 0.110485047624885, 0.302880123281364)), class = "data.frame", row.names = c(NA,
-6L))

in Base R:
# Look up each column's category by column name, repeated once per cell of df
# (col(df) holds the column index of every cell).
a <- setNames(df_category$Category, df_category$Col_name)[names(df)[col(df)]]
# Average the flattened values within every (row, category) combination.
tapply(unlist(df), list(row(df), a), mean)
A B
1 0.8993345 -0.5275220
2 -0.9433851 0.3103226
3 0.1168975 -0.1468863
4 -0.2740178 0.5375962
5 0.9989164 0.1104850
6 0.3226206 0.3028801
Another option:
# Split df's columns into one data frame per category, then take the row means
# of each piece.
cols_by_category <- split.default(df[df_category$Col_name], df_category$Category)
sapply(cols_by_category, rowMeans)
A B
[1,] 0.8993345 -0.5275220
[2,] -0.9433851 0.3103226
[3,] 0.1168975 -0.1468863
[4,] -0.2740178 0.5375962
[5,] 0.9989164 0.1104850
[6,] 0.3226206 0.3028801

We can use tidyverse to reshape the data values, merge the category data, and compute means for groups "A" and "B":
library(tidyverse)
# Row-wise mean of df's columns within each category from df_category.
df_result <- df %>%
  mutate(idx = row_number()) %>% # remember which row each value came from
  pivot_longer(-idx) %>% # one row per (row, column) cell
  inner_join(df_category, by = c(name = "Col_name")) %>%
  group_by(Category, idx) %>%
  # Drop the grouping explicitly so the result is a plain, ungrouped tibble.
  summarize(mean = mean(value), .groups = "drop") %>%
  pivot_wider(
    names_from = Category, values_from = mean, names_prefix = "mean_"
  ) %>%
  select(-idx)
mean_A mean_B
<dbl> <dbl>
1 0.899 -0.528
2 -0.943 0.310
3 0.117 -0.147
4 -0.274 0.538
5 0.999 0.110
6 0.323 0.303

Related

Dplyr function to match column value to row values and replace it

I have 2 data frames. Data Frame A is and Data Frame B is
I want to take the values of column active ident in Data Frame B and input them as a row on the top of Data Frame A in a way where they match up according to column name from Data Frame B
I have tried using dplyr but I can't seem to figure out how to do this in R. I would appreciate any help.
Attaching dput(head) for both my files
Cell Labels
structure(list(`hnscc.vp.fibroblasts#active.ident` = structure(c(3L,
2L, 3L, 3L, 3L, 3L), .Label = c("Cluster_0", "Cluster_4", "Cluster_3",
"Cluster_2", "Cluster_1"), class = "factor")), row.names = c("pat01.pre_AAACCTGAGGAGCGAG",
"pat01.pre_AAACCTGCACTACAGT", "pat01.pre_AAACCTGTCACCGTAA", "pat01.pre_AAATGCCCACTATCTT",
"pat01.pre_AACCATGAGCATCATC", "pat01.pre_AACCGCGCAGATGGCA"), class = "data.frame")
Gene Count Per Cell :
dput(head(Gene_Counts_per_Cell[, c(1:5)]))
structure(list(pat01.pre_AAACCTGAGGAGCGAG = c(1.99399322071276,
1.5433201979508, 2.4725719042226, -2.59159111384049, 1.56977481481343,
0.192853860719877), pat01.pre_AAACCTGCACTACAGT = c(2.90248911455912,
2.27707326162242, 2.12992680712843, -1.44512552229319, 0.541062218328074,
1.8626908687607), pat01.pre_AAACCTGTCACCGTAA = c(3.99090573935858,
3.00560247848693, 2.9656947677965, -3.23693215603618, 4.72557633990864,
0.0247894431208639), pat01.pre_AAATGCCCACTATCTT = c(1.08405270702075,
-0.884466121620786, 0.500175551980942, -2.28142505510742, 3.97105313918843,
-1.01130712883293), pat01.pre_AACCATGAGCATCATC = c(4.55944063063621,
2.43937477176712, 3.93016796802459, -1.92695887361317, 3.16070890309665,
1.65917938530014)), row.names = c("ACTB", "ACTG1", "ACTN1", "ADAP2",
"ADM", "ADRB2"), class = "data.frame")
This may be what you are looking for. Note that I had to convert the clusters to numeric to ensure type consistency within the columns. Use the row names to distinguish between the cluster row and the other numeric data in the columns.
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
# Turn the per-cell cluster labels in dfb into a single "cluster" row and stack
# it on top of dfa.
dfb %>%
  rownames_to_column("rowname") %>%
  pivot_wider(
    names_from = rowname,
    values_from = `hnscc.vp.fibroblasts#active.ident`
  ) %>%
  # Keep the whole trailing number so multi-digit ids ("Cluster_10") survive.
  mutate(across(everything(), ~ as.numeric(str_extract(.x, "\\d+$")))) %>%
  mutate(cluster = "cluster") %>%
  column_to_rownames(var = "cluster") %>%
  bind_rows(dfa)
#> pat01.pre_AAACCTGAGGAGCGAG pat01.pre_AAACCTGCACTACAGT
#> cluster 3.0000000 4.0000000
#> ACTB 1.9939932 2.9024891
#> ACTG1 1.5433202 2.2770733
#> ACTN1 2.4725719 2.1299268
#> ADAP2 -2.5915911 -1.4451255
#> ADM 1.5697748 0.5410622
#> ADRB2 0.1928539 1.8626909
#> pat01.pre_AAACCTGTCACCGTAA pat01.pre_AAATGCCCACTATCTT
#> cluster 3.00000000 3.0000000
#> ACTB 3.99090574 1.0840527
#> ACTG1 3.00560248 -0.8844661
#> ACTN1 2.96569477 0.5001756
#> ADAP2 -3.23693216 -2.2814251
#> ADM 4.72557634 3.9710531
#> ADRB2 0.02478944 -1.0113071
#> pat01.pre_AACCATGAGCATCATC pat01.pre_AACCGCGCAGATGGCA
#> cluster 3.000000 3
#> ACTB 4.559441 NA
#> ACTG1 2.439375 NA
#> ACTN1 3.930168 NA
#> ADAP2 -1.926959 NA
#> ADM 3.160709 NA
#> ADRB2 1.659179 NA
data
dfb <- structure(list(`hnscc.vp.fibroblasts#active.ident` = structure(c(3L,
2L, 3L, 3L, 3L, 3L), .Label = c("Cluster_0", "Cluster_4", "Cluster_3",
"Cluster_2", "Cluster_1"), class = "factor")), row.names = c("pat01.pre_AAACCTGAGGAGCGAG",
"pat01.pre_AAACCTGCACTACAGT", "pat01.pre_AAACCTGTCACCGTAA", "pat01.pre_AAATGCCCACTATCTT",
"pat01.pre_AACCATGAGCATCATC", "pat01.pre_AACCGCGCAGATGGCA"), class = "data.frame")
dfa <- structure(list(pat01.pre_AAACCTGAGGAGCGAG = c(1.99399322071276,
1.5433201979508, 2.4725719042226, -2.59159111384049, 1.56977481481343,
0.192853860719877), pat01.pre_AAACCTGCACTACAGT = c(2.90248911455912,
2.27707326162242, 2.12992680712843, -1.44512552229319, 0.541062218328074,
1.8626908687607), pat01.pre_AAACCTGTCACCGTAA = c(3.99090573935858,
3.00560247848693, 2.9656947677965, -3.23693215603618, 4.72557633990864,
0.0247894431208639), pat01.pre_AAATGCCCACTATCTT = c(1.08405270702075,
-0.884466121620786, 0.500175551980942, -2.28142505510742, 3.97105313918843,
-1.01130712883293), pat01.pre_AACCATGAGCATCATC = c(4.55944063063621,
2.43937477176712, 3.93016796802459, -1.92695887361317, 3.16070890309665,
1.65917938530014)), row.names = c("ACTB", "ACTG1", "ACTN1", "ADAP2",
"ADM", "ADRB2"), class = "data.frame")
Created on 2022-03-21 by the reprex package (v2.0.1)

r transfer values from one dataset to another by ID

I have two datasets , the first dataset is like this
ID Weight State
1 12.34 NA
2 11.23 IA
2 13.12 IN
3 12.67 MA
4 10.89 NA
5 14.12 NA
The second dataset is a lookup table for state values by ID
ID State
1 WY
2 IA
3 MA
4 OR
4 CA
5 FL
As you can see there are two different state values for ID 4, which is normal.
What I want to do is replace the NAs in dataset1 State column with State values from dataset 2. Expected dataset
ID Weight State
1 12.34 WY
2 11.23 IA
2 13.12 IN
3 12.67 MA
4 10.89 OR,CA
5 14.12 FL
Since ID 4 has two state values in dataset2 , these two values are collapsed and separated by , and used to replace the NA in dataset1. Any suggestion on accomplishing this is much appreciated. Thanks in advance.
Collapse df2 value and join it with df1 by 'ID'. Use coalesce to use non-NA value from the two state columns.
library(dplyr)
# Collapse df2 to one comma-separated State string per ID.
state_lookup <- df2 %>%
  group_by(ID) %>%
  summarise(State = toString(State))

# Attach the lookup and keep df1's own State wherever it is not NA.
df1 %>%
  left_join(state_lookup, by = 'ID') %>%
  mutate(State = coalesce(State.x, State.y)) %>%
  select(-State.x, -State.y)
# ID Weight State
#1 1 12.3 WY
#2 2 11.2 IA
#3 2 13.1 IN
#4 3 12.7 MA
#5 4 10.9 OR, CA
#6 5 14.1 FL
In base R with merge and transform.
# Left-merge the collapsed states so df1 rows without a df2 entry are kept,
# fill only the missing State values, then drop the two helper columns.
merge(df1, aggregate(State ~ ID, df2, toString), by = "ID", all.x = TRUE) |>
  transform(State = ifelse(is.na(State.x), State.y, State.x)) |>
  subset(select = -c(State.x, State.y))
Tidyverse way:
library(tidyverse)
# One row per ID holding all of its states joined by ", ".
states_by_id <- df2 %>%
  group_by(ID) %>%
  summarise(State = toString(State)) %>%
  ungroup()

# Keep df1's State where present, otherwise fall back to the collapsed value.
df1 %>%
  left_join(states_by_id, by = 'ID') %>%
  transmute(ID, Weight, State = coalesce(State.x, State.y))
Base R alternative:
# Rows of df1 whose State is missing.
na_idx <- which(is.na(df1$State))
# Collapsed states per ID, aligned to df1's rows via match(); only the missing
# positions are overwritten.
collapsed <- aggregate(State ~ ID, df2, toString)
df1$State[na_idx] <- collapsed$State[match(df1$ID, collapsed$ID)][na_idx]
Data:
# Input data as shown in the question: State is NA wherever it has to be
# filled in from df2 (IDs 1, 4 and 5).
df1 <- structure(list(ID = c(1L, 2L, 2L, 3L, 4L, 5L), Weight = c(12.34,
11.23, 13.12, 12.67, 10.89, 14.12), State = c(NA, "IA", "IN",
"MA", NA, NA)), row.names = c(NA, -6L), class = "data.frame")
df2 <- structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 5L), State = c("WY",
"IA", "MA", "OR", "CA", "FL")), class = "data.frame", row.names = c(NA,
-6L))

how to convert the information from row to column in R

I have a dataset named "df":
df <- structure(list(outcome = c("cg00000029", "cg00000029", "cg00000029",
"cg00000108", "cg00000108", "cg00000108"),
pval = c("0.63", "0.91", "0.01","0.09", "0.55", "0.23")),
.Names = c("outcome", "pval"), class = "data.frame",row.names = c(NA, -6L))
How could I convert it into dataset named "df1"?
df1 <- structure(list(outcome = c("cg00000029", "cg00000108"),
pval_1 = c("0.63", "0.91"),
pval_2 = c("0.01","0.09"),
pval_3 = c("0.55", "0.23")),
.Names = c("outcome", "pval_1", "pval_2", "pval_3"), class = "data.frame",row.names = c(NA, -2L))
Thank you!
A data.table option using dcast
> dcast(setDT(df), outcome ~ paste0("pval_", rowid(outcome)))
Using 'pval' as value column. Use 'value.var' to override
outcome pval_1 pval_2 pval_3
1: cg00000029 0.63 0.91 0.01
2: cg00000108 0.09 0.55 0.23
Here is a tidyverse approach:
library(dplyr)
library(tidyr)
# Number the rows within each outcome, then spread pval into one column per
# within-outcome position (pval_1, pval_2, ...).
df %>%
group_by(outcome) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = id, values_from = pval,
names_glue = "{.value}_{id}")
# A tibble: 2 x 4
# Groups: outcome [2]
outcome pval_1 pval_2 pval_3
<chr> <chr> <chr> <chr>
1 cg00000029 0.63 0.91 0.01
2 cg00000108 0.09 0.55 0.23

Merge two data frames based on multiple columns in R

I have two data frames looking like that
data frame 1:
P.X value
OOPA 5
POKA 4
JKIO 3
KOPP 1
data frame 2:
P.X.1 P.X.2 P.X.3 P.X.4 mass
JKIO UIX HOP 56
CX OOPA 44
EDD POKA 13
KOPP FOSI 11
and I want to merge the two data frames by matching df1's P.X against df2's P.X.1, P.X.2, P.X.3 and P.X.4. For example, JKIO in P.X.2 also appears in P.X, so the two rows should be merged into a single row of the new data frame (JKIO, 3, 56), as below:
data frame new:
P.X value mass
OOPA 5 44
POKA 4 13
JKIO 3 56
KOPP 1 11
Do you know how can I do it maybe with
merge(df1,df2 by(P.X == P.X.1 | P.X.2 | P.X.3 | P.X.4)
?
The following is one way to achieve your goal. You want to convert df2 to long format and keep only the rows whose key is non-empty (at least one character). Once you have this data, you merge df1 with the updated df2.
library(dplyr)
library(tidyr)
# Reshape df2 so every non-empty P.X.* entry becomes its own row next to its
# mass.
df2_long <- df2 %>%
  pivot_longer(cols = P.X.1:P.X.4, names_to = "foo", values_to = "P.X") %>%
  filter(nchar(P.X) > 0)

# Look up each df1 key in the long table; the helper column "foo" is dropped.
left_join(df1, df2_long, by = "P.X") %>%
  select(-foo)
P.X value mass
1 OOPA 5 44
2 POKA 4 13
3 JKIO 3 56
4 KOPP 1 11
DATA
df1 <- structure(list(P.X = c("OOPA", "POKA", "JKIO", "KOPP"), value = c(5L,
4L, 3L, 1L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(P.X.1 = c("", "", "EDD", "KOPP"), P.X.2 = c("JKIO",
"", "", "FOSI"), P.X.3 = c("UIX", "CX", "POKA", ""), P.X.4 = c("HOP",
"OOPA", "", ""), mass = c(56, 44, 13, 11)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
You could also just do:
# Pair each df1 key with the mass of the df2 row that contains it. Binding the
# columns purely by position would mix up rows, because df1 and df2 are not in
# the same order.
keys <- as.matrix(df2[paste0("P.X.", 1:4)])
df_new <- cbind(df1, mass = df2$mass[row(keys)[match(df1$P.X, keys)]])

How to look up values from a table and insert name of the lookup-list?

I have a (sample)table like this:
df <- read.table(header = TRUE,
stringsAsFactors = FALSE,
text="Gene SYMBOL Values
TP53 2 3.55
XBP1 5 4.06
TP27 1 2.53
REDD1 4 3.99
ERO1L 6 5.02
STK11 9 3.64
HIF2A 8 2.96")
I want to look up the symbols from two different genelists, given here as genelist1 and genelist2:
genelist1 <- read.table(header = TRUE,
stringsAsFactors = FALSE,
text="Gene SYMBOL
P4H 10
PLK 7
TP27 1
KTD 11
ERO1L 6")
genelist2 <- read.table(header = TRUE,
stringsAsFactors = FALSE,
text="Gene SYMBOL
TP53 2
XBP1 5
BHLHB 12
STK11 9
TP27 1
UPK 18")
What I want is to get a new column where I can see in which genelist(s) I can find each of the genes in my dataframe, but when I run the following code, only the symbols are repeated in the new columns.
df_geneinfo <- df %>%
join(genelist1,by="SYMBOL") %>%
join(genelist2, by="SYMBOL")
Any suggestions of how to solve this, either to make one new column with the name of the genelists, or to make one column for each of the genelists?
Thanks in advance! :)
For the sake of completeness (and performance with large tables, perhaps), here is a data.table approach:
library(data.table)
# Stack both genelists with an id column "glid" marking the source list, drop
# Gene, join onto df by SYMBOL, then collapse the matching list ids within each
# (Gene, SYMBOL, Values) group.
rbindlist(list(genelist1, genelist2), idcol = "glid")[, -"Gene"][
setDT(df), on = "SYMBOL"][, .(glid = toString(glid)), by = .(Gene, SYMBOL, Values)][]
Gene SYMBOL Values glid
1: TP53 2 3.55 2
2: XBP1 5 4.06 2
3: TP27 1 2.53 1, 2
4: REDD1 4 3.99 1
5: ERO1L 6 5.02 NA
6: STK11 9 3.64 2
7: HIF2A 8 2.96 NA
rbindlist() creates a data.table from all genelists and adds a column glid to identify the origin of each row. The Gene column is ignored as the subsequent join is only on SYMBOL. Before joining, df is coerced to class data.table using setDT(). The joined result is then aggregated by SYMBOL to exhibit cases where a symbol appears in both genelists which is the case for SYMBOL == 1.
Edit
In case there are many genelists or the full name of the genelist is required instead of just a number, we can try this:
rbindlist(mget(ls(pattern = "^genelist")), idcol = "glid")[, -"Gene"][
setDT(df), on = "SYMBOL"][, .(glid = toString(glid)), by = .(Gene, SYMBOL, Values)][]
Gene SYMBOL Values glid
1: TP53 2 3.55 genelist2
2: XBP1 5 4.06 genelist2
3: TP27 1 2.53 genelist1, genelist2
4: REDD1 4 3.99 NA
5: ERO1L 6 5.02 genelist1
6: STK11 9 3.64 genelist2
7: HIF2A 8 2.96 NA
ls() looks for objects in the environment whose names start with genelist. mget() returns a named list of those objects, which is passed to rbindlist().
Data
As provided by the OP
df <- structure(list(Gene = c("TP53", "XBP1", "TP27", "REDD1", "ERO1L",
"STK11", "HIF2A"), SYMBOL = c(2L, 5L, 1L, 4L, 6L, 9L, 8L), Values = c(3.55,
4.06, 2.53, 3.99, 5.02, 3.64, 2.96)), .Names = c("Gene", "SYMBOL",
"Values"), class = "data.frame", row.names = c(NA, -7L))
genelist1 <- structure(list(Gene = c("P4H", "PLK", "TP27", "KTD", "ERO1L"),
SYMBOL = c(10L, 7L, 1L, 11L, 4L)), .Names = c("Gene", "SYMBOL"
), class = "data.frame", row.names = c(NA, -5L))
genelist2 <- structure(list(Gene = c("TP53", "XBP1", "BHLHB", "STK11", "TP27",
"UPK"), SYMBOL = c(2L, 5L, 12L, 9L, 1L, 18L)), .Names = c("Gene",
"SYMBOL"), class = "data.frame", row.names = c(NA, -6L))
I just wrote my own function, which replaces the column values:
#' Replace column values using a lookup table
#'
#' @param df A data.frame.
#' @param col Character vector of column names in `df` whose values are
#'   replaced.
#' @param lookup A data.frame with the columns "old" (current values) and
#'   "new" (replacement values).
#' @return `df` with every value in the `col` columns replaced by the entry of
#'   `lookup$new` whose `lookup$old` matches it. Fails if a column or a value
#'   cannot be found.
replace_by_lookuptable <- function(df, col, lookup) {
  assertthat::assert_that(all(col %in% names(df))) # all cols exist in df
  assertthat::assert_that(all(c("new", "old") %in% colnames(lookup)))
  # Position of every value in lookup$old, computed once per column with base
  # match(); df[col] keeps a data.frame even when a single column is given.
  positions <- lapply(df[col], function(x) match(x, lookup$old))
  # Refuse to continue if any value has no entry in the lookup table.
  assertthat::assert_that(!any(is.na(unlist(positions))))
  # Assign column by column so each column keeps its own replacement values.
  df[col] <- lapply(positions, function(idx) lookup$new[idx])
  df
}
df is the data.frame, col is a vector of column names which should be replaced using lookup, a data.frame with column "old" and "new".
If you add a listid column to your genelists
# Tag every genelist with an id so its origin is still known after stacking.
genelist1$listid <- 1
genelist2$listid <- 2
you can then merge your df with the genelists:
# Left-merge df with the stacked genelists on SYMBOL; TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
merge(df, rbind(genelist1, genelist2), all.x = TRUE, by = "SYMBOL")
Note that ERO1L is SYMBOL 6 in your df and 4 in genelist1, and HIF2A and REDD1 are missing from the genelists, but REDD1 is SYMBOL 4 in your df (which is ERO1L in genelist1), so I'm not sure what output you're expecting in that case.
You could also merge only on Gene names:
# Left-merge df with the stacked genelists on the gene name instead; TRUE is
# spelled out because T is an ordinary variable that can be reassigned.
merge(df, rbind(genelist1, genelist2), all.x = TRUE, by.x = "Gene", by.y = "Gene")
You could put all of your genelists in a list:
gen_list <- list(genelist1 = genelist1,genelist2 = genelist2)
and compare them to your target data.frame:
cbind(df,do.call(cbind,lapply(seq_along(gen_list),function(x) ifelse( df$Gene %in% gen_list[[x]]$Gene,names(gen_list[x]),NA))))

Resources