Related
I have a dataset like the following simplified one:
x_1 <- c(1, NA, 2, 3, NA, 4, 5)
x_2 <- c(2, 1, NA, NA, NA, 4, 6)
y_1 <- c(2, 4, 6, 8, NA, 10, NA)
y_2 <- c(NA, 4, NA, 8, 10, 11, 13)
df <- data.frame(x_1, x_2, y_1, y_2)
x_1 x_2 y_1 y_2
1 1 2 2 NA
2 NA 1 4 4
3 2 NA 6 NA
4 3 NA 8 8
5 NA NA NA 10
6 4 4 10 11
7 5 6 NA 13
The goal is to coalesce each of the two corresponding variables (x and y) and to replace the values that are not the same (e.g. first row of x_1 and x_2) with NA. I did this with the following:
# Merge x_1/x_2 into a single column x, then drop the two source columns.
df <- df %>%
# x = first non-missing of x_1, x_2.
mutate(x = coalesce(x_1, x_2)) %>%
# If both sources were present but disagreed, the merged value is unreliable:
# reset it to NA. (x came from x_1 whenever x_1 was non-missing, so comparing
# x against x_2 is enough to detect a conflict.)
mutate(x = ifelse(!is.na(x) &
!is.na(x_2) &
x != x_2,
NA,
x)) %>%
select(!c(x_1, x_2))
Now, I have to do this with 21 variables so I thought I put the variables in a list and feed them through the pipeline with a for loop like this:
cols <- c("x", "y")
for(i in cols){
# var_1/var_2 are plain character strings, e.g. "x_1" and "x_2" on pass one.
var_1 <- paste(i, "1", sep = "_")
var_2 <- paste(i, "2", sep = "_")
df <- df %>%
# BUG: mutate() quotes its argument names (non-standard evaluation), so
# `i = ...` creates a column literally named "i" — the loop variable is NOT
# substituted. Likewise coalesce(var_1, var_2) coalesces the two strings
# "x_1"/"x_2" themselves, not the columns they name; the conflict check
# below then compares that constant against var_2 and turns it into NA,
# which is why the resulting "i" column comes out empty.
mutate(i = coalesce(var_1, var_2)) %>%
mutate(i = ifelse(!is.na(i) &
!is.na(var_2) &
i != var_2,
NA,
i)) %>%
# select() DOES accept character vectors from the environment, which is why
# the source columns are removed even though the mutate() calls misfire.
select(!c(var_1, var_2))
}
What happens is that the code is executed, but instead of the new variables there is only the variable "i" with empty values. It seems as if R does not recognise the "i" in the pipeline as the iterator, however it does recognize "var_1" and "var_2" (because they are being removed from the dataset).
Does anyone know why that is and how I can fix it?
Thanks a lot in advance.
fun <- function(x, var) {
  # Build symbols for the two source columns, "<var>_1" and "<var>_2".
  lhs <- sym(paste0(var, "_1"))
  rhs <- sym(paste0(var, "_2"))
  # New column <var>: coalesce the pair, but blank out genuine conflicts.
  # `(a != b) %in% TRUE` maps the NA produced by a missing side to FALSE,
  # so only rows where both sides are present AND differ become NA.
  x %>%
    mutate(
      !!var := ifelse((!!lhs != !!rhs) %in% TRUE,
                      NA,
                      coalesce(!!lhs, !!rhs))
    ) %>%
    select(!c(lhs, rhs))
}
cols <- c("x", "y")
Reduce(fun, cols, init = df)
# x y
# 1 NA 2
# 2 1 4
# 3 2 6
# 4 3 8
# 5 NA 10
# 6 4 NA
# 7 NA 13
If you want to avoid rlang:
library(tidyverse)
library(stringr)
x_1 <- c(1, NA, 2, 3, NA, 4, 5)
x_2 <- c(2, 1, NA, NA, NA, 4, 6)
y_1 <- c(2, 4, 6, 8, NA, 10, NA)
y_2 <- c(NA, 4, NA, 8, 10, 11, 13)
df <- data.frame(x_1, x_2, y_1, y_2)
my_coalesce <- function(d) {
  # Pull the first two columns of `d` out as plain vectors.
  a <- pull(select(d, 1))
  b <- pull(select(d, 2))
  # Prefer `a`, fall back to `b` where `a` is missing.
  merged <- coalesce(a, b)
  # Conflicting (both present, unequal) rows are unreliable -> NA.
  # The comparison is NA wherever either side is missing; NA positions in a
  # logical subassignment index are simply skipped, so those rows keep the
  # coalesced value.
  conflict <- a != b
  merged[conflict] <- NA
  merged
}
cols <- c("x", "y")
map(cols, ~df %>%
select(starts_with(.x)) %>% # or:
#select(str_c(.x, "_", 1:2)) %>%
my_coalesce()) %>%
set_names(cols) %>%
as_tibble()
I have this dataset:
A<- c(10,20,10,31,51,1,60,1,02,0,12,0,20,1,0,0,0,0,1,0,1,1,1)
B<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0)
C<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1)
SUB <- c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2)
dat <- as.data.frame(cbind(SUB,B,A,C))
I wrote a function calculating the cor among A/B, B/C, C/A.
Z <- function(a, b, c) {
  # Pairwise Pearson correlations, returned in the fixed order A/B, B/C, C/A.
  c(cor(a, b), cor(b, c), cor(c, a))
}
if I type
Z(dat$A, dat$B,dat$C)
I get the vector of results:
> [1] 0.11294312 0.91417410 0.06457059
I need to condition my function to the SUB variable and get a matrix whose rows are the cor among A/B, B/C, C/A for each SUB.
For instance:
A/B B/C C/A
SUB1 0.11294312 0.91417410 0.06457059
SUB2 0.10335312 0.96744677 0.16356059
Thank you,
Best regards
base R
You can split with by and then recombine.
do.call(rbind, by(dat, dat$SUB, function(x) Z(x$A, x$B, x$C)))
# [,1] [,2] [,3]
# 1 -0.1534126 1.0000000 -0.15341258
# 2 0.1081781 0.8215838 0.04608456
The row names 1 and 2 are the SUB values themselves; if SUB is more "interesting" than counting numbers, it will be more apparent. Column names can be applied trivially.
dplyr
library(dplyr)
dat %>%
group_by(SUB) %>%
summarize(as.data.frame(matrix(Z(A, B, C), nr = 1)))
# # A tibble: 2 x 4
# SUB V1 V2 V3
# <dbl> <dbl> <dbl> <dbl>
# 1 1 -0.153 1.00 -0.153
# 2 2 0.108 0.822 0.0461
Try split in combination with sapply
sapply( split(dat,dat$SUB), function(x) Z(x["A"],x["B"],x["C"]) )
1 2
[1,] -0.1534126 0.10817808
[2,] 1.0000000 0.82158384
[3,] -0.1534126 0.04608456
Actually there's no need for your function if you use the upper.tri of the correlation matrix. Recently you can do this very easily by piping:
sapply(unique(dat$SUB), \(i) cor(dat[dat$SUB == i, -1]) |> {\(x) x[upper.tri(x)]}())
# [,1] [,2]
# [1,] -0.1534126 0.10817808
# [2,] 1.0000000 0.82158384
# [3,] -0.1534126 0.04608456
R.version.string
# [1] "R version 4.1.2 (2021-11-01)"
Data
dat <- structure(list(SUB = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2), B = c(1, 0, 0, 1, 1, 1, 0, 1,
1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), A = c(10, 20, 10,
31, 51, 1, 60, 1, 2, 0, 12, 0, 20, 1, 0, 0, 0, 0, 1, 0, 1, 1,
1), C = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 1)), class = "data.frame", row.names = c(NA, -23L
))
This is a lengthy answer, but it should be pretty flexible.
library(tidyverse)
# Correlate every ordered pair of `vars` columns within each `groups` group.
# `groups` and `vars` are quoting helpers (vars(...)); the result is one row
# per group with one column per variable pair (both A.vs.B and B.vs.A appear).
cor.by.group.combos <- function(.data, groups, vars){
# Recover the bare column names as strings by deparsing the vars(...) call and
# stripping the "vars(" wrapper with a regex; [-1] drops the function name.
# NOTE(review): this relies on the deparsed text of the call — fragile; a
# tidyselect-based approach would be safer. TODO confirm before reuse.
by <- gsub(x = rlang::quo_get_expr(enquo(groups)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
piv <- gsub(x = rlang::quo_get_expr(enquo(vars)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
.data %>%
group_by(!!!groups) %>%
# One data frame per group; each is processed independently below.
group_split() %>%
map(.,
# Long format: one row per (group, variable, value) …
~pivot_longer(., cols = all_of(piv), names_to = "name", values_to = "val") %>%
# … then nest the values so each (group, variable) row carries its vector.
nest(data = val) %>%
# Self-join on the group key to form every ordered pair of variables.
full_join(.,.,by = by) %>%
# Drop self-pairs (A vs A); duplicated mirror pairs (A.vs.B / B.vs.A) remain.
filter(name.x != name.y) %>%
mutate(test = paste(name.x, "vs",name.y, sep = "."),
# Row label, e.g. "SUB1" — group column name pasted to the group value.
grp = paste0(by,!!!groups),
# Correlate the two nested value vectors of each pair.
cor = map2_dbl(data.x,data.y, ~cor(unlist(.x), unlist(.y)))) %>%
select(test,grp, cor)
) %>%
bind_rows() %>%
# One row per group, one column per variable pair.
pivot_wider(names_from = test, values_from = cor)
}
cor.by.group.combos(dat, vars(SUB), vars(A, B, C))
#> # A tibble: 2 x 7
#> grp A.vs.B A.vs.C B.vs.A B.vs.C C.vs.A C.vs.B
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 SUB1 -0.153 -0.153 -0.153 1 -0.153 1
#> 2 SUB2 0.108 0.0461 0.108 0.822 0.0461 0.822
In essence, what we are doing is splitting the data by group, and then applying a cor test to every combination of the selected variables. The way I set this up will give some duplicate tests (e.g., A.vs.B and B.vs.A). You could fix this by using combn instead of full_join, but I didn't take the time to work out the details. This function should work if you change the input variables, the grouping variables, ect. You can also apply multiple groups with this method.
I want to filter a data frame so that I only keep columns that have the values 1, 2, 3, 4, 5, or NA in them.
x = data.frame(col1 = c("a" , "b", "d", "e", "f", "g"),
col2 = c(12, 45, 235, 2134, NA, 1),
col3 = c(1, 2, 3, 1, 2, NA),
col4 = c(1, 2, 3, 4, 5, NA),
col5 = c(1, 2, 3, 4, 5, 6))
With this example data, I would like to return x with only col 3 and 4.
You can use the following solution:
library(dplyr)
x %>%
select(where(function(x) all(x %in% c(1:5, NA))))
col3 col4
1 1 1
2 2 2
3 3 3
4 1 4
5 2 5
6 NA NA
Or using a formula:
x %>%
select(where(~ all(.x %in% c(1:5, NA))))
Since the discussion just heated up on this, in case you would like to know how R interprets formulas created by ~ pronounced twiddle, just wrap it inside purrr::as_mapper. This is a function R calls behind the scene when you use this syntax for an anonymous function:
as_mapper(~ all(.x %in% c(1:5, NA)))
<lambda>
function (..., .x = ..1, .y = ..2, . = ..1)
all(.x %in% c(1:5, NA))
attr(,"class")
[1] "rlang_lambda_function" "function"
Here .x argument is equivalent to the first argument of our anonymous function.
I'm running a linear regression, but many of my observations can't be used because some of the values have an NA in the row. I know that if one of a set of variables is entered, then an NA is actually 0. However, if all the values are NA, then the columns do not change. I will include an example because I know this might be confusing.
What I have is something that looks likes this:
df <- data.frame(outcome = c(1, 0, 1, 1, 0),
Var1 = c(1, 0, 1, NA, NA),
Var2 = c(NA, 1, 0, 0, NA),
Var3 = c(0, 1, NA, 1, NA))
For Vars 1-3, the first 4 rows have an NA, but have other entries in other vars. In the last row, however, all values are NA. I know that everything in the last row is NA, but I want the NAs in those first 4 rows to be filled with 0. The desired outcome would look like this:
# Expected result: per-row NAs filled with 0, except the all-NA last row,
# which is left untouched. (Fixed: `-` was a typo for the assignment
# operator `<-`; without it `desired` is never created, and the later
# `identical(df, desired)` check cannot run.)
desired <- data.frame(outcome = c(1, 0, 1, 1, 0),
                      Var1 = c(1, 0, 1, 0, NA),
                      Var2 = c(0, 1, 0, 0, NA),
                      Var3 = c(0, 1, 0, 1, NA))
I know there are messy ways I could go about this, but I was wondering what would be the most streamlined process for this?
I hope this makes sense, I know the question is confusing. I can clarify anything if needed.
We can create a logical vector with rowSums, use that to subset the rows before changing the NA to 0
# TRUE for rows having at least one non-NA among the Var columns (df[-1]).
i1 <- rowSums(!is.na(df[-1])) > 0
# Replace NAs with 0 only on those rows; all-NA rows are left as-is.
df[i1, -1][is.na(df[i1, -1])] <- 0
-checking with desired
identical(df, desired)
#[1] TRUE
You can use apply to conditionally replace NA in certain rows:
data.frame(t(apply(df, 1, function(x) if (all(is.na(x[-1]))) x else replace(x, is.na(x), 0))))
Output
outcome Var1 Var2 Var3
1 1 1 0 0
2 0 0 1 1
3 1 1 0 0
4 1 0 0 1
5 0 NA NA NA
I have a dataset where I have grouped by a Gene column. Some values grouped into each row are just ., so I remove them, leaving only several numeric characters per row and column.
To do this am coding:
#Group by Gene:
data <- setDT(df2)[, lapply(.SD, paste, collapse = ", "), by = Genes]
#Remove ., from anywhere in the dataframe
dat <- data.frame(lapply(data, function(x) {
gsub("\\.,|\\.$|\\,$|(, .$)", "", x)
}))
My data before removing ., and after grouping by Gene looks like:
Gene col1 col2 col3 col4
ACE 0.3, 0.4, 0.5, 0.5 . ., ., . 1, 1, 1, 1, 1
NOS2 ., . . ., ., ., . 0, 0, 0, 0, 0
BRCA1 . ., . 1, 1, 1, 1, 1
HER2 . 0.1, ., ., 0.2, 0.1 . 1, 1, 1, 1, 1
After removing ., my data looks like:
Gene col1 col2 col3 col4
ACE 0.3, 0.4, 0.5, 0.5 1, 1, 1, 1, 1
NOS2 0, 0, 0, 0, 0
BRCA1 1, 1, 1, 1, 1
HER2 0.1, 0.2, 0.1 1, 1, 1, 1, 1
I am now trying to select the minimum or maximum value per row and column.
Expecting example output:
Gene col1 col2 col3 col4
ACE 0.5 1
NOS2 0
BRCA1 1
HER2 0.1 1
#For col1 I need the max value per row (so for ACE 0.5 is selected)
#For col2 I need the min value per row
For note, my actual data is 100 columns and 20,000 rows - different columns need either max or min values per gene selected.
However with the code I use I am only getting the expected output for col4 and my other columns repeat the selected value twice (I am getting 0.5, 0.5 and 0.1, 0.1 and I can't figure out why).
The code I am using to select min/max values is:
#Max value per feature and row
# max2/min2: like max/min with na.rm, but return NA (instead of -Inf/Inf plus
# a warning) when every value is missing.
max2 = function(x) if(all(is.na(x))) NA else max(x,na.rm = T)
# getmax: per element of `col`, pull out every numeric-looking token
# (digits, dots, minus signs) and keep the largest as a number.
getmax = function(col) str_extract_all(col,"[0-9\\.-]+") %>%
lapply(.,function(x)max2(as.numeric(x)) ) %>%
unlist()
#Min value per feature and row
min2 = function(x) if(all(is.na(x))) NA else min(x,na.rm = T)
getmin = function(col) str_extract_all(col,"[0-9\\.-]+") %>%
lapply(.,function(x)min2(as.numeric(x)) ) %>%
unlist()
# NOTE(review): each assignment below restarts the pipe from the ORIGINAL
# `dt`, so the first two results are thrown away and only the column-4
# transform survives in `data`. To apply all three, pipe from the running
# result (data <- data %>% ...) or chain the mutate_at calls in one pipe.
data <- dt %>%
mutate_at(names(dt)[2],getmax)
data <- dt %>%
mutate_at(names(dt)[3],getmin)
data <- dt %>%
mutate_at(names(dt)[4],getmax)
Why aren't these selection functions working for all my columns? All columns are character class. I'm also wondering if I even need to remove ., at all and can just jump straight to selecting the max/min value per row and column?
Example input data:
structure(list(Gene = c("ACE", "NOS2", "BRCA1", "HER2"), col1 = c("0.3, 0.4, 0.5, 0.5",
"", "", ""), col2 = c("", "", "", " 0.1, 0.2 0.,1"), col3 = c(NA,
NA, NA, NA), col4 = c(" 1, 1, 1, 1, 1",
" 0, 0, 0, 0, 0", " 1, 1, 1, 1, 1",
" 1, 1, 1, 1, 1")), row.names = c(NA, -4L), class = c("data.table",
"data.frame"))
You can use type.convert and set its argument na.strings to ".". You may also want to use the range function to get both min and max in one shot.
Assume that your data.table looks like this
> dt
Gene col1 col2 col3 col4
1: ACE 0.3, 0.4, 0.5, 0.5 . ., ., . 1, 1, 1, 1, 1
2: NOS2 ., . . ., ., ., . 0, 0, 0, 0, 0
3: BRCA1 . ., . 1, 1, 1, 1, 1
4: HER2 . 0.1, ., ., 0.2, 0.1 . 1, 1, 1, 1, 1
Consider a function like this
library(data.table)
library(stringr)
# Per-element min/max for a character vector of comma-separated numbers.
# Returns an n x 2 numeric matrix with columns "min" and "max"; rows whose
# entries are all "." (or empty) give c(NA, NA).
get_range <- function(x) {
# Split each string on ", " into a character matrix (one row per element;
# short rows are padded), then convert to numeric with "." mapped to NA.
# NOTE(review): the "" padding also appears to convert to NA here — confirm
# type.convert's handling of empty strings before relying on it.
x <- type.convert(str_split(x, ",\\s+", simplify = TRUE), na.strings = ".")
# Row-wise range over the non-missing values; apply() returns 2 x n, so
# transpose back to n x 2.
x <- t(apply(x, 1L, function(i) {
i <- i[!is.na(i)]
if (length(i) < 1L) c(NA_real_, NA_real_) else range(i)
}))
dimnames(x)[[2L]] <- c("min", "max")
x
}
Then you can just
dt[, c(Gene = .(Gene), lapply(.SD, get_range)), .SDcols = -"Gene"]
Output
Gene col1.min col1.max col2.min col2.max col3.min col3.max col4.min col4.max
1: ACE 0.3 0.5 NA NA NA NA 1 1
2: NOS2 NA NA NA NA NA NA 0 0
3: BRCA1 NA NA NA NA NA NA 1 1
4: HER2 NA NA 0.1 0.2 NA NA 1 1
Note that there is no need to do it by Gene as the function get_range is already vectorised.