I have a dataset where I have grouped by a Gene column. Some values grouped into each row are just ., so I remove them, leaving only several numeric characters per row and column.
To do this I am coding:
#Group by Gene:
data <- setDT(df2)[, lapply(.SD, paste, collapse = ", "), by = Genes]
# Remove placeholder "." entries (and the separators they leave behind) from
# every column of the grouped table.
# Alternatives, in order: a "." directly followed by a comma, a trailing ".",
# a trailing ",", and a trailing ", ." pair. The dot in the last alternative
# is escaped so it matches a literal "." only; unescaped it matches any
# character and would also strip a genuine trailing value such as ", 1".
dat <- data.frame(lapply(data, function(x) {
  gsub("\\.,|\\.$|\\,$|(, \\.$)", "", x)
}))
My data before removing ., and after grouping by Gene looks like:
Gene col1 col2 col3 col4
ACE 0.3, 0.4, 0.5, 0.5 . ., ., . 1, 1, 1, 1, 1
NOS2 ., . . ., ., ., . 0, 0, 0, 0, 0
BRCA1 . ., . 1, 1, 1, 1, 1
HER2 . 0.1, ., ., 0.2, 0.1 . 1, 1, 1, 1, 1
After removing ., my data looks like:
Gene col1 col2 col3 col4
ACE 0.3, 0.4, 0.5, 0.5 1, 1, 1, 1, 1
NOS2 0, 0, 0, 0, 0
BRCA1 1, 1, 1, 1, 1
HER2 0.1, 0.2, 0.1 1, 1, 1, 1, 1
I am now trying to select the minimum or maximum value per row and column.
Expecting example output:
Gene col1 col2 col3 col4
ACE 0.5 1
NOS2 0
BRCA1 1
HER2 0.1 1
#For col1 I need the max value per row (so for ACE 0.5 is selected)
#For col2 I need the min value per row
For note, my actual data is 100 columns and 20,000 rows - different columns need either max or min values per gene selected.
However with the code I use I am only getting the expected output for col4 and my other columns repeat the selected value twice (I am getting 0.5, 0.5 and 0.1, 0.1 and I can't figure out why).
The code I am using to select min/max values is:
#Max value per feature and row
# Maximum of x ignoring NAs. Returns NA when x is empty or holds only NAs,
# instead of the -Inf (plus warning) that max() gives once nothing is left
# after NA removal. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
max2 <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  max(x, na.rm = TRUE)
}
# Row-wise maximum of a character column: pull every run of digits, dots and
# minus signs out of each string, convert the pieces to numeric and reduce
# each element with max2().
getmax <- function(col) {
  pieces <- str_extract_all(col, "[0-9\\.-]+")
  per_row <- lapply(pieces, function(tokens) max2(as.numeric(tokens)))
  unlist(per_row)
}
#Min value per feature and row
# Minimum of x ignoring NAs. Returns NA when x is empty or holds only NAs,
# instead of the Inf (plus warning) that min() gives once nothing is left
# after NA removal. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
min2 <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  min(x, na.rm = TRUE)
}
# Row-wise minimum of a character column: pull every run of digits, dots and
# minus signs out of each string, convert the pieces to numeric and reduce
# each element with min2().
getmin <- function(col) {
  pieces <- str_extract_all(col, "[0-9\\.-]+")
  per_row <- lapply(pieces, function(tokens) min2(as.numeric(tokens)))
  unlist(per_row)
}
# Apply the per-column reducers in a single chain so each step builds on the
# previous result. Restarting every step from `dt` and re-assigning `data`
# would leave `data` holding only the last transformation.
data <- dt %>%
  mutate(across(all_of(names(dt)[2]), getmax)) %>%
  mutate(across(all_of(names(dt)[3]), getmin)) %>%
  mutate(across(all_of(names(dt)[4]), getmax))
Why aren't these selection functions working for all my columns? All columns are character class. I'm also wondering if I even need to remove ., at all and can just jump straight to selecting the max/min value per row and column?
Example input data:
structure(list(Gene = c("ACE", "NOS2", "BRCA1", "HER2"), col1 = c("0.3, 0.4, 0.5, 0.5",
"", "", ""), col2 = c("", "", "", " 0.1, 0.2 0.,1"), col3 = c(NA,
NA, NA, NA), col4 = c(" 1, 1, 1, 1, 1",
" 0, 0, 0, 0, 0", " 1, 1, 1, 1, 1",
" 1, 1, 1, 1, 1")), row.names = c(NA, -4L), class = c("data.table",
"data.frame"))
You can use type.convert and set its argument na.strings to ".". You may also want to use the range function to get both min and max in one shot.
Assume that your data.table looks like this
> dt
Gene col1 col2 col3 col4
1: ACE 0.3, 0.4, 0.5, 0.5 . ., ., . 1, 1, 1, 1, 1
2: NOS2 ., . . ., ., ., . 0, 0, 0, 0, 0
3: BRCA1 . ., . 1, 1, 1, 1, 1
4: HER2 . 0.1, ., ., 0.2, 0.1 . 1, 1, 1, 1, 1
Consider a function like this
library(data.table)
library(stringr)
# Per-element minimum and maximum of comma-separated numeric strings.
#
# x: character vector; each element holds values separated by a comma plus
#    whitespace, with "." standing for a missing value.
# Returns a matrix with one row per element of x and columns "min" and "max";
# a row is NA/NA when the element contains no usable value.
get_range <- function(x) {
  # Split every string into a character matrix (one row per element) and
  # convert it to a typed matrix, reading "." as NA. as.is is passed
  # explicitly because type.convert() warns in R >= 4.0 when it is left unset.
  x <- type.convert(
    str_split(x, ",\\s+", simplify = TRUE),
    na.strings = ".",
    as.is = TRUE
  )
  # apply() returns one column per input row, hence the transpose.
  x <- t(apply(x, 1L, function(i) {
    i <- i[!is.na(i)]
    if (length(i) < 1L) c(NA_real_, NA_real_) else range(i)
  }))
  dimnames(x)[[2L]] <- c("min", "max")
  x
}
Then you can just
dt[, c(Gene = .(Gene), lapply(.SD, get_range)), .SDcols = -"Gene"]
Output
Gene col1.min col1.max col2.min col2.max col3.min col3.max col4.min col4.max
1: ACE 0.3 0.5 NA NA NA NA 1 1
2: NOS2 NA NA NA NA NA NA 0 0
3: BRCA1 NA NA NA NA NA NA 1 1
4: HER2 NA NA 0.1 0.2 NA NA 1 1
Note that there is no need to do it by Gene as the function get_range is already vectorised.
Related
I have this dataset:
A<- c(10,20,10,31,51,1,60,1,02,0,12,0,20,1,0,0,0,0,1,0,1,1,1)
B<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0)
C<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1)
SUB <- c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2)
dat <- as.data.frame(cbind(SUB,B,A,C))
I wrote a function calculating the cor among A/B, B/C, C/A.
# Pairwise correlations of three inputs, returned as one vector in the fixed
# order a/b, b/c, c/a.
Z <- function(a, b, c) {
  c(cor(a, b), cor(b, c), cor(c, a))
}
if I type
Z(dat$A, dat$B,dat$C)
I get the vector of results:
> [1] 0.11294312 0.91417410 0.06457059
I need to condition my function to the SUB variable and get a matrix whose rows are the cor among A/B, B/C, C/A for each SUB.
For instance:
A/B B/C C/A
SUB1 0.11294312 0.91417410 0.06457059
SUB2 0.10335312 0.96744677 0.16356059
Thank you,
Best regards
base R
You can split with by and then recombine.
do.call(rbind, by(dat, dat$SUB, function(x) Z(x$A, x$B, x$C)))
# [,1] [,2] [,3]
# 1 -0.1534126 1.0000000 -0.15341258
# 2 0.1081781 0.8215838 0.04608456
The row names 1 and 2 are the SUB values themselves; if SUB is more "interesting" than counting numbers, it will be more apparent. Column names can be applied trivially.
dplyr
library(dplyr)
dat %>%
group_by(SUB) %>%
summarize(as.data.frame(matrix(Z(A, B, C), nr = 1)))
# # A tibble: 2 x 4
# SUB V1 V2 V3
# <dbl> <dbl> <dbl> <dbl>
# 1 1 -0.153 1.00 -0.153
# 2 2 0.108 0.822 0.0461
Try split in combination with sapply
sapply( split(dat,dat$SUB), function(x) Z(x["A"],x["B"],x["C"]) )
1 2
[1,] -0.1534126 0.10817808
[2,] 1.0000000 0.82158384
[3,] -0.1534126 0.04608456
Actually there's no need for your function if you use the upper.tri of the correlation matrix. Recently you can do this very easily by piping:
sapply(unique(dat$SUB), \(i) cor(dat[dat$SUB == i, -1]) |> {\(x) x[upper.tri(x)]}())
# [,1] [,2]
# [1,] -0.1534126 0.10817808
# [2,] 1.0000000 0.82158384
# [3,] -0.1534126 0.04608456
R.version.string
# [1] "R version 4.1.2 (2021-11-01)"
Data
dat <- structure(list(SUB = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2), B = c(1, 0, 0, 1, 1, 1, 0, 1,
1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), A = c(10, 20, 10,
31, 51, 1, 60, 1, 2, 0, 12, 0, 20, 1, 0, 0, 0, 0, 1, 0, 1, 1,
1), C = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 1)), class = "data.frame", row.names = c(NA, -23L
))
This is a lengthy answer, but it should be pretty flexible.
library(tidyverse)
# Correlation between every ordered pair of selected columns, computed
# separately within each group and returned in wide form (one row per group,
# one column per "<x>.vs.<y>" pair).
#
# .data:  data frame to analyse.
# groups: grouping column(s); the example call passes vars(SUB).
# vars:   columns to correlate; the example call passes vars(A, B, C).
cor.by.group.combos <- function(.data, groups, vars){
# Recover the column names as strings from the captured `groups` / `vars`
# expressions: gsub() strips a surrounding "(...)" from each piece and [-1]
# drops the first element.
# NOTE(review): presumably the dropped element is the `vars` call name, so
# vars(SUB) becomes "SUB" - confirm.
by <- gsub(x = rlang::quo_get_expr(enquo(groups)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
piv <- gsub(x = rlang::quo_get_expr(enquo(vars)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
.data %>%
# One data frame per group.
group_by(!!!groups) %>%
group_split() %>%
map(.,
# Stack the selected columns into name/val pairs, then collapse each
# variable's values into a list-column called `data`.
~pivot_longer(., cols = all_of(piv), names_to = "name", values_to = "val") %>%
nest(data = val) %>%
# Self-join on the grouping column(s) to pair every variable with every
# variable (name.x / name.y, data.x / data.y), then drop the self-pairs.
full_join(.,.,by = by) %>%
filter(name.x != name.y) %>%
# `test` labels the pair, `grp` labels the group, `cor` is the correlation
# of the two unnested value vectors.
mutate(test = paste(name.x, "vs",name.y, sep = "."),
grp = paste0(by,!!!groups),
cor = map2_dbl(data.x,data.y, ~cor(unlist(.x), unlist(.y)))) %>%
select(test,grp, cor)
) %>%
# Recombine the per-group results and spread the pairs into columns.
bind_rows() %>%
pivot_wider(names_from = test, values_from = cor)
}
cor.by.group.combos(dat, vars(SUB), vars(A, B, C))
#> # A tibble: 2 x 7
#> grp A.vs.B A.vs.C B.vs.A B.vs.C C.vs.A C.vs.B
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 SUB1 -0.153 -0.153 -0.153 1 -0.153 1
#> 2 SUB2 0.108 0.0461 0.108 0.822 0.0461 0.822
In essence, what we are doing is splitting the data by group, and then applying a cor test to every combination of the selected variables. The way I set this up will give some duplicate tests (e.g., A.vs.B and B.vs.A). You could fix this by using combn instead of full_join, but I didn't take the time to work out the details. This function should work if you change the input variables, the grouping variables, etc. You can also apply multiple groups with this method.
I have a toy example of a dataframe:
df <- data.frame(matrix(, nrow = 5, ncol = 0))
df["A|A"] <- c(0.3, 0, 0, 100, 23)
df["A|B"]= c(0, 0, 0.3, 10, 0.23)
df["A|C"]= c(0.3, 0.1, 0, 100, 2)
df["B|B"]= c(0, 0, 0, 12, 2)
df["B|B"]= c(0, 0, 0.3, 0, 0.23)
df["B|C"]= c(0.3, 0, 0, 21, 3)
df["C|A"]= c(0.3, 0, 1, 100, 0)
df["C|B"]= c(0, 0, 0.3, 10, 0.2)
df["C|C"]= c(0.3, 0, 1, 1, 0.3)
I need to get a matrix with counts of non-zero values between A and A, A and B, ..., C and C.
I started splitting the colnames and assigning them to variables. But I don't know how to create a matrix with certain rows and columns in a loop
counts <- colSums(df != 0)
df <- rbind(df, counts)
for(i in colnames(df)) {
cluster1 <- (strsplit(i, "\\|")[[1]])[1]
cluster2 <- (strsplit(i, "\\|")[[1]])[2]
}
A base R option
> table(read.table(text = rep(names(df), colSums(df > 0)), sep = "|"))
V2
V1 A B C
A 3 3 4
B 0 2 3
C 3 3 4
or a longer version
table(
data.frame(
do.call(
rbind,
strsplit(
as.character(subset(stack(df), values > 0)$ind),
"\\|"
)
)
)
)
gives
X2
X1 A B C
A 3 3 4
B 0 2 3
C 3 3 4
Reshape the data into 'long' format with pivot_longer, then separate the 'name' column into two, and reshape back to 'wide' with pivot_wider, specifying the values_fn as a lambda function to get the count of non-zero values
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything()) %>%
separate(name, into = c('name1', 'name2')) %>%
pivot_wider(names_from = name2, values_from = value,
values_fn = list(value = ~ sum(. > 0)), values_fill = 0)
-output
# A tibble: 3 x 4
name1 A B C
<chr> <int> <int> <int>
1 A 3 3 4
2 B 0 2 3
3 C 3 3 4
I'm running a linear regression, but many of my observations cannot be used because some of the values have an NA in the row. I know that if one of a set of variables is entered, then an NA is actually 0. However, if all the values are NA, then the columns do not change. I will include an example because I know this might be confusing.
What I have is something that looks like this:
df <- data.frame(outcome = c(1, 0, 1, 1, 0),
Var1 = c(1, 0, 1, NA, NA),
Var2 = c(NA, 1, 0, 0, NA),
Var3 = c(0, 1, NA, 1, NA))
For Vars 1-3, the first 4 rows have an NA, but have other entries in other vars. In the last row, however, all values are NA. I know that everything in the last row is NA, but I want the NAs in those first 4 rows to be filled with 0. The desired outcome would look like this:
# Expected result: NAs become 0 in rows that have at least one observed Var,
# while the all-NA last row is left untouched.
desired <- data.frame(outcome = c(1, 0, 1, 1, 0),
                      Var1 = c(1, 0, 1, 0, NA),
                      Var2 = c(0, 1, 0, 0, NA),
                      Var3 = c(0, 1, 0, 1, NA))
I know there are messy ways I could go about this, but I was wondering what would be the most streamlined process for this?
I hope this makes sense, I know the question is confusing. I can clarify anything if needed.
We can create a logical vector with rowSums, use that to subset the rows before changing the NA to 0
i1 <- rowSums(!is.na(df[-1])) > 0
df[i1, -1][is.na(df[i1, -1])] <- 0
-checking with desired
identical(df, desired)
#[1] TRUE
You can use apply to conditionally replace NA in certain rows:
data.frame(t(apply(df, 1, function(x) if (all(is.na(x[-1]))) x else replace(x, is.na(x), 0))))
Output
outcome Var1 Var2 Var3
1 1 1 0 0
2 0 0 1 1
3 1 1 0 0
4 1 0 0 1
5 0 NA NA NA
I am working with a very large dataset. Consider the following example for illustration:
df1<-{data.frame(MyID=c(1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5),v1=c(0.1, 0.2, NA, 0.4, 0.2, 0.1, 0.8, 0.3, 0.1, 0.4, 0.3), v2=c(NA, 0.4, 0.2, 0.1, 0.8, 0.3, 0.1, 0.4, 0.3, 0.1, 0.2))}
df2<-{data.frame(MyID=c(1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5),v1=c(10, 8, 0, 6, 10, 5, 3, 1, 10, 8, 3), v2=c(0, 10, 5, 1, 8, 5,10, 3, 3, 1, 5))}
I would like to extract information from df1 but based on maximum values per MyID in df2. The final result should be a dataframe with:
one row per unique MyID
each column would have the value in df1 corresponding to the maximum of the MyID group in df2.
The result should be
ExpectedResult<-{data.frame(MyID=c(1, 2, 3, 4, 5),v1=c(0.1,0.2,0.1,0.4,0.3), v2=c(0.1,0.4,0.2,0.1,0.2))}
What I have tried already but solved only a part of the problem:
using groups and finding max per group, e.g. df2Max<- df2 %>% group_by(MyID) %>% slice_max(1,)
splitting the data using e.g. df2.split <- split(df2, list(df2$MyID))
But, I am still not sure how to link the two dataframes to extract what I need.
We can group_by MyID and get the index of maximum value in each column and store it in df3.
library(dplyr)
df2 %>%
group_by(MyID) %>%
summarise(across(.fns = which.max)) -> df3
We split df3 by row and split df1 by MyID and extract the relevant value using indexing.
df3[-1] <- t(mapply(function(x, y) x[cbind(y, 1:ncol(x))],
split(df1[-1], df1$MyID), asplit(df3[-1], 1)))
# MyID v1 v2
# <dbl> <dbl> <dbl>
#1 1 0.1 0.1
#2 2 0.2 0.4
#3 3 0.1 0.2
#4 4 0.4 0.1
#5 5 0.3 0.2
We get the row index for 'v1', 'v2', column where the values are highest in 'df2' grouped by 'MyID', then do a join with the first dataset by 'MyID' and summarise the 'v1', 'v2' columns based on the index after grouping by 'MyID'
library(dplyr)
df2 %>%
group_by(MyID) %>%
summarise(rnv1 = row_number()[which.max(v1)],
rnv2 = row_number()[which.max(v2)], .groups = 'drop' ) %>%
right_join(df1, by = 'MyID') %>%
group_by(MyID) %>%
summarise(v1 = v1[first(rnv1)], v2 = v2[first(rnv2)], .groups = 'drop')
-output
# A tibble: 5 x 3
# MyID v1 v2
# <dbl> <dbl> <dbl>
#1 1 0.1 0.1
#2 2 0.2 0.4
#3 3 0.1 0.2
#4 4 0.4 0.1
#5 5 0.3 0.2
Or another option is a join with data.table
library(data.table)
nm1 <- names(df2)[-1]
setDT(df1)[setDT(df2)[, lapply(.SD, which.max), MyID],
Map(function(x, y) x[first(y)], .SD, mget(paste0("i.", nm1))),
on = .(MyID), by = .EACHI]
# MyID v1 v2
#1: 1 0.1 0.1
#2: 2 0.2 0.4
#3: 3 0.1 0.2
#4: 4 0.4 0.1
#5: 5 0.3 0.2
I need to replace a value of one variable conditional on another variable as shown below. I'm getting tripped up on how R is handling factors.
# setup example
df1 <- data.frame(v1=c(1, 0, 1, 1),
v2=c(1, 1, 1, 1))
df1$v1 <- factor(df1$v1,
levels=c(0, 1),
labels=c("0", "1"))
df2 <- data.frame(v3=c(1, NA, NA, 0),
v4=c(1, 1, 1, 1))
df2$v3 <- factor(df2$v3,
levels=c(0, 1),
labels=c("0", "1"))
# df2$v3
#[1] 1 <NA> <NA> 1
#Levels: 0 1
# recode NA in df2$v3 to 0 if df1$v1==0
# df2$v3 should end up as 1, 0, NA, 1 and remain a factor
df2$v3 <- ifelse(df1$v1=="0" & is.na(df2$v3), "0", df2$v3)
# df2$v3
#[1] "2" "0" NA "2"
# setup example
df1 <- data.frame(v1 = c(1, 0, 1, 0),
                  v2 = c(1, 1, 1, 1))
df1$v1 <- factor(df1$v1,
                 levels = c(0, 1),
                 labels = c("0", "1"))
df2 <- data.frame(v3 = c(1, NA, NA, 0),
                  v4 = c(1, 1, 1, 1))
df2$v3 <- factor(df2$v3,
                 levels = c(0, 1),
                 labels = c("0", "1"))
# Recode v3 to "0" only where it is missing AND the matching v1 is "0".
# Non-missing v3 values are never overwritten, and assigning an existing
# level label through `[<-` keeps v3 a factor.
fill_rows <- which(is.na(df2$v3) & df1$v1 == "0")
df2$v3[fill_rows] <- "0"
sapply(df2, class)
# v3 v4
# "factor" "numeric"
# v3 v4
# "factor" "numeric"