Apply a function to each group - r

I have this dataset:
A<- c(10,20,10,31,51,1,60,1,02,0,12,0,20,1,0,0,0,0,1,0,1,1,1)
B<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0)
C<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1)
SUB <- c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2)
dat <- as.data.frame(cbind(SUB,B,A,C))
I wrote a function calculating the cor among A/B, B/C, C/A.
Z <- function(a, b, c) {
  # Pairwise correlations in a fixed order: a/b, then b/c, then c/a.
  # `c(...)` below still resolves to the base combine function: when R looks
  # up the target of a call it skips non-function bindings such as the
  # argument named `c`.
  c(
    cor(a, b),
    cor(b, c),
    cor(c, a)
  )
}
if I type
Z(dat$A, dat$B,dat$C)
I get the vector of results:
> [1] 0.11294312 0.91417410 0.06457059
I need to apply my function separately for each value of the SUB variable and get a matrix whose rows are the correlations among A/B, B/C, C/A for each SUB.
For instance:
A/B B/C C/A
SUB1 0.11294312 0.91417410 0.06457059
SUB2 0.10335312 0.96744677 0.16356059
Thank you,
Best regards

base R
You can split with by and then recombine.
do.call(rbind, by(dat, dat$SUB, function(x) Z(x$A, x$B, x$C)))
# [,1] [,2] [,3]
# 1 -0.1534126 1.0000000 -0.15341258
# 2 0.1081781 0.8215838 0.04608456
The row names 1 and 2 are the SUB values themselves; if SUB is more "interesting" than counting numbers, it will be more apparent. Column names can be applied trivially.
dplyr
library(dplyr)
dat %>%
group_by(SUB) %>%
summarize(as.data.frame(matrix(Z(A, B, C), nr = 1)))
# # A tibble: 2 x 4
# SUB V1 V2 V3
# <dbl> <dbl> <dbl> <dbl>
# 1 1 -0.153 1.00 -0.153
# 2 2 0.108 0.822 0.0461

Try split in combination with sapply
sapply( split(dat,dat$SUB), function(x) Z(x["A"],x["B"],x["C"]) )
1 2
[1,] -0.1534126 0.10817808
[2,] 1.0000000 0.82158384
[3,] -0.1534126 0.04608456

Actually there's no need for your function if you use the upper.tri of the correlation matrix. Since R 4.1 you can do this very easily with the native pipe (|>) and the \(x) lambda shorthand:
sapply(unique(dat$SUB), \(i) cor(dat[dat$SUB == i, -1]) |> {\(x) x[upper.tri(x)]}())
# [,1] [,2]
# [1,] -0.1534126 0.10817808
# [2,] 1.0000000 0.82158384
# [3,] -0.1534126 0.04608456
R.version.string
# [1] "R version 4.1.2 (2021-11-01)"
Data
dat <- structure(list(SUB = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2), B = c(1, 0, 0, 1, 1, 1, 0, 1,
1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), A = c(10, 20, 10,
31, 51, 1, 60, 1, 2, 0, 12, 0, 20, 1, 0, 0, 0, 0, 1, 0, 1, 1,
1), C = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 1)), class = "data.frame", row.names = c(NA, -23L
))

This is a lengthy answer, but it should be pretty flexible.
library(tidyverse)
# Correlation between every ordered pair of selected columns, computed
# separately within each group.
#
# .data  : a data frame.
# groups : grouping columns; the example call passes them as vars(SUB).
# vars   : columns to correlate; the example call passes vars(A, B, C).
#
# Returns a wide table with one row per group: a `grp` label column plus one
# column per ordered pair, named like "A.vs.B".
cor.by.group.combos <- function(.data, groups, vars){
# Capture the expression supplied for `groups` and turn it into strings.
# `[-1]` drops the first element; assuming the caller wrapped the columns in
# vars(...), that element is the function name and the rest are column names.
by <- gsub(x = rlang::quo_get_expr(enquo(groups)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
# Same extraction for the columns that will be pivoted and correlated.
piv <- gsub(x = rlang::quo_get_expr(enquo(vars)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
.data %>%
group_by(!!!groups) %>%
# One data frame per group; each is processed independently by map().
group_split() %>%
map(.,
# Stack the selected columns: one row per (observation, variable name).
~pivot_longer(., cols = all_of(piv), names_to = "name", values_to = "val") %>%
# Collapse each variable's values into a list-column named `data`.
nest(data = val) %>%
# Join the piece to itself on the grouping column(s) to pair up variables;
# the joined columns get .x / .y suffixes.
full_join(.,.,by = by) %>%
# Drop self-pairs (A with A); both A-B and B-A remain.
filter(name.x != name.y) %>%
mutate(test = paste(name.x, "vs",name.y, sep = "."),
# Group label: grouping column name pasted to its value, e.g. "SUB1".
grp = paste0(by,!!!groups),
# Unnest each side to a plain vector and correlate the pair.
cor = map2_dbl(data.x,data.y, ~cor(unlist(.x), unlist(.y)))) %>%
select(test,grp, cor)
) %>%
# Recombine the per-group results, then spread the pair labels into columns.
bind_rows() %>%
pivot_wider(names_from = test, values_from = cor)
}
cor.by.group.combos(dat, vars(SUB), vars(A, B, C))
#> # A tibble: 2 x 7
#> grp A.vs.B A.vs.C B.vs.A B.vs.C C.vs.A C.vs.B
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 SUB1 -0.153 -0.153 -0.153 1 -0.153 1
#> 2 SUB2 0.108 0.0461 0.108 0.822 0.0461 0.822
In essence, what we are doing is splitting the data by group and then computing the correlation for every combination of the selected variables. The way I set this up gives some duplicate results (e.g., A.vs.B and B.vs.A). You could fix this by using combn instead of full_join, but I didn't take the time to work out the details. This function should still work if you change the input variables, the grouping variables, etc. You can also supply multiple grouping variables with this method.

Related

How to automatically fill in a blank column

I am trying to get the list of sums of two columns from my original data set, from left to right
I have made a loop:
for (i in 1:ncol(df)) {
m = i
n = i + 1
if (i %% 2 != 0) {
df_cum$V1 <- sum(df[,m] + df[,n])
}
}
But the way I add each value to the new list is wrong:
df_cum$V1 <- sum(df[,m] + df[,n])
I would really appreciate it if anyone knows how to do this in R.
You can try split.default(), i.e.
sapply(split.default(df, gsub('\\d+', '', names(df))), sum)
A B
17 12
A base R option using tapply -
tapply(unlist(df),
rep(1:ncol(df), each = nrow(df) * 2, length.out = nrow(df) * ncol(df)),
sum)
# 1 2 3
#17 12 13
The logic here is to create a group for every 2 columns and sum each group.
data
It is easier to help if you provide data in a reproducible format
df <- data.frame(A1 = c(0, 3, 2), A2 = c(2, 6, 4),
B1 = c(3, 0, 1), B2 = c(2, 3, 3),
C1 = c(7, 3, 2), C2 = c(1, 0, 0))
We can do this in tidyverse
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(everything(), names_to = c(".value", "grp"),
names_sep ="(?<=[A-Z])(?=[0-9])") %>%
select(-grp) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 1 x 3
A B C
<dbl> <dbl> <dbl>
1 17 12 13
Or using base R
aggregate(values ~ ., transform(stack(df1),
ind = sub("\\d+", "", ind)), FUN = sum)
ind values
1 A 17
2 B 12
3 C 13
Or another option with rowsum from base R
with(stack(df1), rowsum(values, group = trimws(ind, whitespace = "\\d+")))
[,1]
A 17
B 12
C 13
Or another option is with colSums and rowsum
{tmp <- colSums(df1); rowsum(tmp, group = substr(names(tmp), 1, 1))}
[,1]
A 17
B 12
C 13
data
df1 <- structure(list(A1 = c(0, 3, 2), A2 = c(2, 6, 4), B1 = c(3, 0,
1), B2 = c(2, 3, 3), C1 = c(7, 3, 2), C2 = c(1, 0, 0)),
class = "data.frame", row.names = c(NA,
-3L))

generate a weighted matrix from r dataframe

I have a toy example of a dataframe:
df <- data.frame(matrix(, nrow = 5, ncol = 0))
df["A|A"] <- c(0.3, 0, 0, 100, 23)
df["A|B"]= c(0, 0, 0.3, 10, 0.23)
df["A|C"]= c(0.3, 0.1, 0, 100, 2)
df["B|B"]= c(0, 0, 0, 12, 2)
df["B|B"]= c(0, 0, 0.3, 0, 0.23)
df["B|C"]= c(0.3, 0, 0, 21, 3)
df["C|A"]= c(0.3, 0, 1, 100, 0)
df["C|B"]= c(0, 0, 0.3, 10, 0.2)
df["C|C"]= c(0.3, 0, 1, 1, 0.3)
I need to get a matrix with counts of non-zero values between A and A, A and B, ..., C and C.
I started splitting the colnames and assigning them to variables. But I don't know how to create a matrix with certain rows and columns in a loop
counts <- colSums(df != 0)
df <- rbind(df, counts)
for(i in colnames(df)) {
cluster1 <- (strsplit(i, "\\|")[[1]])[1]
cluster2 <- (strsplit(i, "\\|")[[1]])[2]
}
A base R option
> table(read.table(text = rep(names(df), colSums(df > 0)), sep = "|"))
V2
V1 A B C
A 3 3 4
B 0 2 3
C 3 3 4
or a longer version
table(
data.frame(
do.call(
rbind,
strsplit(
as.character(subset(stack(df), values > 0)$ind),
"\\|"
)
)
)
)
gives
X2
X1 A B C
A 3 3 4
B 0 2 3
C 3 3 4
Reshape the data into 'long' format with pivot_longer, then separate the 'name' column into two, and reshape back to 'wide' with pivot_wider, specifying the values_fn as a lambda function to get the count of non-zero values
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything()) %>%
separate(name, into = c('name1', 'name2')) %>%
pivot_wider(names_from = name2, values_from = value,
values_fn = list(value = ~ sum(. > 0)), values_fill = 0)
-output
# A tibble: 3 x 4
name1 A B C
<chr> <int> <int> <int>
1 A 3 3 4
2 B 0 2 3
3 C 3 3 4

How to build a string variable that captures info from multiple columns

I have a df that looks like this:
It can be built using this code:
structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
Here, Pass indicates whether a student passed any test. Now I want to build a new variable, Result, that captures each student's test results as follows. What should I do?
Try the base R code below
q <- with(data.frame(which(df[-(1:2)] == 1, arr.ind = TRUE)),
tapply(names(df[-(1:2)])[col], factor(row, levels = 1:nrow(df)), toString))
df$Result <- ifelse(is.na(q), "Not Pass", paste0("Pass: ", q))
which gives
> df
# A tibble: 5 x 6
ID Pass Math ELA PE Result
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 0 0 0 Not Pass
2 2 1 0 1 0 Pass: ELA
3 3 1 1 0 1 Pass: Math, PE
4 4 1 1 1 1 Pass: Math, ELA, PE
5 5 1 1 0 1 Pass: Math, PE
Using dplyr with rowwise
library(dplyr)
library(stringr)
df1 %>%
rowwise %>%
mutate(Result = if(as.logical(Pass))
str_c('Pass: ', toString(names(select(., Math:PE))[as.logical(c_across(Math:PE))])) else 'Not pass' ) %>%
ungroup
# A tibble: 5 x 6
# ID Pass Math ELA PE Result
# <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 0 0 0 Not pass
#2 2 1 0 1 0 Pass: ELA
#3 3 1 1 0 1 Pass: Math, PE
#4 4 1 1 1 1 Pass: Math, ELA, PE
#5 5 1 1 0 1 Pass: Math, PE
data
df1 <- structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
Here's one solution:
library(dplyr)
library(magrittr)
library(stringr)
library(tidyr) # pivot_longer() / pivot_wider() live here, not in dplyr
df <- structure(list(ID = c(1, 2, 3, 4, 5), Pass = c(0, 1, 1, 1, 1),
  Math = c(0, 0, 1, 1, 1), ELA = c(0, 1, 0, 1, 0), PE = c(0,
  0, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df",
  "tbl", "data.frame"))
# Long format: one row per (ID, subject), with `done` holding the 0/1 flag.
df %<>% pivot_longer(cols = -c(ID, Pass), names_to = "sub", values_to = "done")
# Per student, paste the subject name where done == 1 and NA otherwise,
# giving strings such as "NA, ELA, NA".
df %<>% group_by(ID) %>% mutate(Result = paste0(ifelse(done == 1, sub, NA), collapse = ", ")) %>% ungroup()
# Back to one row per student; Result is constant within ID so it is kept.
df %<>% pivot_wider(names_from = sub, values_from = done)
# Strip the "NA" placeholders (and the separator that follows them).
df %<>% mutate(Result = paste0("Pass: ", str_replace_all(Result, "NA[, ]*", "")))
# Nothing left after the prefix means no subject was passed; otherwise trim
# the trailing ", " left behind when the last subject was an NA placeholder.
df %<>% mutate(Result = ifelse(str_detect(Result, "Pass: $"), "Not pass", str_replace_all(Result, ",[\\s]*$", "")))
df
# # A tibble: 5 x 6
# ID Pass Result Math ELA PE
# <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
# 1 1 0 Not pass 0 0 0
# 2 2 1 Pass: ELA 0 1 0
# 3 3 1 Pass: Math, PE 1 0 1
# 4 4 1 Pass: Math, ELA, PE 1 1 1
# 5 5 1 Pass: Math, PE 1 0 1
I can provide an explanation of what the code is doing if necessary.

How to select max numeric value out of numeric characters?

I have a dataset where I have grouped by a Gene column. Some values grouped into each row are just ., so I remove them, leaving only several numeric characters per row and column.
To do this I am using the following code:
#Group by Gene:
data <- setDT(df2)[, lapply(.SD, paste, collapse = ", "), by = Genes]
#Remove ., from anywhere in the dataframe
dat <- data.frame(lapply(data, function(x) {
gsub("\\.,|\\.$|\\,$|(, .$)", "", x)
}))
My data before removing ., and after grouping by Gene looks like:
Gene col1 col2 col3 col4
ACE 0.3, 0.4, 0.5, 0.5 . ., ., . 1, 1, 1, 1, 1
NOS2 ., . . ., ., ., . 0, 0, 0, 0, 0
BRCA1 . ., . 1, 1, 1, 1, 1
HER2 . 0.1, ., ., 0.2, 0.1 . 1, 1, 1, 1, 1
After removing ., my data looks like:
Gene col1 col2 col3 col4
ACE 0.3, 0.4, 0.5, 0.5 1, 1, 1, 1, 1
NOS2 0, 0, 0, 0, 0
BRCA1 1, 1, 1, 1, 1
HER2 0.1, 0.2, 0.1 1, 1, 1, 1, 1
I am now trying to select the minimum or maximum value per row and column.
Expecting example output:
Gene col1 col2 col3 col4
ACE 0.5 1
NOS2 0
BRCA1 1
HER2 0.1 1
#For col1 I need the max value per row (so for ACE 0.5 is selected)
#For col2 I need the min value per row
Note that my actual data has 100 columns and 20,000 rows; for each gene, different columns need either the max or the min value selected.
However with the code I use I am only getting the expected output for col4 and my other columns repeat the selected value twice (I am getting 0.5, 0.5 and 0.1, 0.1 and I can't figure out why).
The code I am using to select min/max values is:
#Max value per feature and row
max2 = function(x) if(all(is.na(x))) NA else max(x,na.rm = T)
getmax = function(col) str_extract_all(col,"[0-9\\.-]+") %>%
lapply(.,function(x)max2(as.numeric(x)) ) %>%
unlist()
#Min value per feature and row
min2 = function(x) if(all(is.na(x))) NA else min(x,na.rm = T)
getmin = function(col) str_extract_all(col,"[0-9\\.-]+") %>%
lapply(.,function(x)min2(as.numeric(x)) ) %>%
unlist()
data <- dt %>%
mutate_at(names(dt)[2],getmax)
data <- dt %>%
mutate_at(names(dt)[3],getmin)
data <- dt %>%
mutate_at(names(dt)[4],getmax)
Why aren't these selection functions working for all my columns? All columns are character class. I'm also wondering if I even need to remove ., at all and can just jump straight to selecting the max/min value per row and column?
Example input data:
structure(list(Gene = c("ACE", "NOS2", "BRCA1", "HER2"), col1 = c("0.3, 0.4, 0.5, 0.5",
"", "", ""), col2 = c("", "", "", " 0.1, 0.2 0.,1"), col3 = c(NA,
NA, NA, NA), col4 = c(" 1, 1, 1, 1, 1",
" 0, 0, 0, 0, 0", " 1, 1, 1, 1, 1",
" 1, 1, 1, 1, 1")), row.names = c(NA, -4L), class = c("data.table",
"data.frame"))
You can use type.convert and set its argument na.strings to ".". You may also want to use the range function to get both min and max in one shot.
Assume that your data.table looks like this
> dt
Gene col1 col2 col3 col4
1: ACE 0.3, 0.4, 0.5, 0.5 . ., ., . 1, 1, 1, 1, 1
2: NOS2 ., . . ., ., ., . 0, 0, 0, 0, 0
3: BRCA1 . ., . 1, 1, 1, 1, 1
4: HER2 . 0.1, ., ., 0.2, 0.1 . 1, 1, 1, 1, 1
Consider a function like this
library(data.table)
library(stringr)
# Per-element minimum and maximum of comma-separated values.
#
# x : character vector; each element holds values separated by a comma
#     followed by whitespace, with "." marking a missing value.
#
# Returns a matrix with one row per element of `x` and two columns, "min" and
# "max". A row with no non-missing values yields NA in both columns.
get_range <- function(x) {
  # Split every element into a padded character matrix, then convert it.
  # "." becomes NA via `na.strings`. `as.is` is given explicitly because
  # type.convert() warns when the caller leaves it unspecified.
  x <- type.convert(
    str_split(x, ",\\s+", simplify = TRUE),
    na.strings = ".",
    as.is = TRUE
  )
  # Row-wise range over the non-missing entries; apply() returns the results
  # column-wise, hence the transpose.
  x <- t(apply(x, 1L, function(i) {
    i <- i[!is.na(i)]
    if (length(i) < 1L) c(NA_real_, NA_real_) else range(i)
  }))
  dimnames(x)[[2L]] <- c("min", "max")
  x
}
Then you can just
dt[, c(Gene = .(Gene), lapply(.SD, get_range)), .SDcols = -"Gene"]
Output
Gene col1.min col1.max col2.min col2.max col3.min col3.max col4.min col4.max
1: ACE 0.3 0.5 NA NA NA NA 1 1
2: NOS2 NA NA NA NA NA NA 0 0
3: BRCA1 NA NA NA NA NA NA 1 1
4: HER2 NA NA 0.1 0.2 NA NA 1 1
Note that there is no need to do it by Gene as the function get_range is already vectorised.

Using data from one column to assign NAs to others

Goal: Add NAs to columns based on the value of another column. For example, if I have a data set with five columns (one ID column and four binary variables: caseID, var1, var2, var3, nocasedata), how can I evaluate the data from "nocasedata" to determine TRUE (no data) or FALSE (data) and then remove that column and assign NA (if TRUE) or do nothing (if FALSE) to the entire row of that case for the three other variables. (tidyverse tools preferred, but not necessary.)
Reproducible example:
df <- data.frame(caseID = c(1:5),
var1 = c(1, 0, 0, 1, 1),
var2 = c(0, 0, 1, 1, 0),
var3 = c(0, 0, 0, 1, 1),
nocasedata = c(0, 1, 0, 0, 0))
df
desired_df <- data.frame(caseID = c(1:5),
var1 = c(1, NA, 0, 1, 1),
var2 = c(0, NA, 1, 1, 0),
var3 = c(0, NA, 0, 1, 1))
desired_df
Here is a reprex of the solution using tidyverse tools, as requested.
library(tidyverse)
#> -- Attaching packages ---------------------------------------------------- tidyverse 1.2.1 --
#> v ggplot2 2.2.1 v purrr 0.2.4
#> v tibble 1.3.4 v dplyr 0.7.4
#> v tidyr 0.7.2 v stringr 1.2.0
#> v readr 1.1.1 v forcats 0.2.0
#> -- Conflicts ------------------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
df <- data.frame(caseID = c(1:5),
var1 = c(1, 0, 0, 1, 1),
var2 = c(0, 0, 1, 1, 0),
var3 = c(0, 0, 0, 1, 1),
nocasedata = c(0, 1, 0, 0, 0))
df
#> caseID var1 var2 var3 nocasedata
#> 1 1 1 0 0 0
#> 2 2 0 0 0 1
#> 3 3 0 1 0 0
#> 4 4 1 1 1 0
#> 5 5 1 0 1 0
desired_df = df %>%
mutate_at(.vars = vars(var1:var3),
.funs = funs(ifelse(nocasedata == 1, NA, .))) %>%
select(-nocasedata)
desired_df
#> caseID var1 var2 var3
#> 1 1 1 0 0
#> 2 2 NA NA NA
#> 3 3 0 1 0
#> 4 4 1 1 1
#> 5 5 1 0 1

Resources