R loop to generate multiple variables based on a condition - r

The data is given as below.
x1<- c(1,0,0,0,1,1,1,0)
x2<- c(1,0,0,0,1,1,1,0)
x3<- c(1,0,0,0,1,1,1,0)
x4<- c(1,0,0,0,1,1,1,0)
x5<- c(1,0,0,0,1,1,1,0)
x6<- c(1,0,0,0,1,1,1,0)
my_data <- as.data.frame(cbind(x1, x2, x3, x4, x5, x6))
I want to use a loop to automate the following process:
my_data$a1 = ifelse(my_data$x1> 0 & is.na(my_data$x1) != T, 1, 0)
my_data$a2 = ifelse(my_data$x2> 0 & is.na(my_data$x2) != T, 1, 0)
my_data$a3 = ifelse(my_data$x3> 0 & is.na(my_data$x3) != T, 1, 0)
my_data$a4 = ifelse(my_data$x4> 0 & is.na(my_data$x4) != T, 1, 0)
my_data$a5 = ifelse(my_data$x5> 0 & is.na(my_data$x5) != T, 1, 0)
my_data$a6 = ifelse(my_data$x6> 0 & is.na(my_data$x6) != T, 1, 0)
Any help would be appreciated, thanks!

You can use the following code -
my_data[paste0('a', seq_along(my_data))] <- +(my_data > 0 & !is.na(my_data))
my_data
# x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
#1 1 1 1 1 1 1 1 1 1 1 1 1
#2 0 0 0 0 0 0 0 0 0 0 0 0
#3 0 0 0 0 0 0 0 0 0 0 0 0
#4 0 0 0 0 0 0 0 0 0 0 0 0
#5 1 1 1 1 1 1 1 1 1 1 1 1
#6 1 1 1 1 1 1 1 1 1 1 1 1
#7 1 1 1 1 1 1 1 1 1 1 1 1
#8 0 0 0 0 0 0 0 0 0 0 0 0
This will assign 1 where the value is greater than 0 and is not NA. my_data > 0 & !is.na(my_data) returns a logical value (TRUE/FALSE) adding + ahead of it turns them to integers (1/0).

You can use following for loop
for (i in 1:ncol(my_data)) {
my_data[,paste0("a",i)] <- ifelse(my_data[,i] > 0 & !is.na(my_data[,i]),1,0)
}
Output
x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
1 1 1 1 1 1 1 1 1 1 1 1 1
2 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0
5 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 1 1 1
7 1 1 1 1 1 1 1 1 1 1 1 1
8 0 0 0 0 0 0 0 0 0 0 0 0

We can use tidyverse
library(dplyr)
library(stringr)
df <- my_data %>%
mutate(across(everything(), ~ +(. > 0 & !is.na(.)),
.names = "a{.col}")) %>%
rename_with(~ str_remove(., 'x'), starts_with('a'))
df
x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
1 1 1 1 1 1 1 1 1 1 1 1 1
2 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0
5 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 1 1 1
7 1 1 1 1 1 1 1 1 1 1 1 1
8 0 0 0 0 0 0 0 0 0 0 0 0

data.table
x1<- c(1,0,0,0,1,1,1,0)
x2<- c(1,0,0,0,1,1,1,0)
x3<- c(1,0,0,0,1,1,1,0)
x4<- c(1,0,0,0,1,1,1,0)
x5<- c(1,0,0,0,1,1,1,0)
x6<- c(1,0,0,0,1,1,1,0)
my_data <- as.data.frame(cbind(x1, x2, x3, x4, x5, x6))
library(data.table)
setDT(my_data)[, (paste0("a", seq_len(length(names(my_data))))) := lapply(.SD, function(x) ifelse(x > 0 & !is.na(x), 1, 0))][]
#> x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
#> 1: 1 1 1 1 1 1 1 1 1 1 1 1
#> 2: 0 0 0 0 0 0 0 0 0 0 0 0
#> 3: 0 0 0 0 0 0 0 0 0 0 0 0
#> 4: 0 0 0 0 0 0 0 0 0 0 0 0
#> 5: 1 1 1 1 1 1 1 1 1 1 1 1
#> 6: 1 1 1 1 1 1 1 1 1 1 1 1
#> 7: 1 1 1 1 1 1 1 1 1 1 1 1
#> 8: 0 0 0 0 0 0 0 0 0 0 0 0
Created on 2021-06-06 by the reprex package (v2.0.0)

Related

R check equality of one column to rowSums of other columns

I have a dataframe like this:
x
y
x1
y1
x2
y2
x3
y3
1
0
1
0
0
0
0
0
0
0
3
0
0
0
0
0
2
0
0
0
0
0
2
0
1
0
0
0
1
0
0
0
I want to find rows that x=x1+x2+x3 and rows that y=y1+y2+y3.
Here is my code to check x=x1+x2+x3:
col_x = c(3,5,7)
df[df$x == rowSums(df[col_x])]
Suppose return row 1,3,4, but it returned
x x1 y1 x2 x3 y3
1 1 1 0 0 0 0
2 0 3 0 0 0 0
3 2 0 0 0 2 0
4 1 0 0 1 0 0
I also tried
col_x = c(3,5,7)
df[df$x == apply(df[col_x],1,sum)]
Which also give me:
x x1 y1 x2 x3 y3
1 1 1 0 0 0 0
2 0 3 0 0 0 0
3 2 0 0 0 2 0
4 1 0 0 1 0 0
I can't figure out why it returned all rows and it had skip column y2.
You are just missing a comma.
col_x = c(3,5,7)
df[df$x == rowSums(df[col_x]),]
x y x1 y1 x2 y2 x3 y3
1 1 0 1 0 0 0 0 0
3 2 0 0 0 0 0 2 0
4 1 0 0 0 1 0 0 0
A possible solution:
library(dplyr)
df %>%
filter(x == rowSums(across(matches("x\\d$"))) &
y == rowSums(across(matches("y\\d$"))))
#> x y x1 y1 x2 y2 x3 y3
#> 1 1 0 1 0 0 0 0 0
#> 2 2 0 0 0 0 0 2 0
#> 3 1 0 0 0 1 0 0 0

data.table versus tidyr::expand_grid

I have
XIa <- diag(1, 3)
colnames(XIa) <- rownames(XIa) <- c("a0", "a1", "a2")
XIb <- diag(1, 2)
colnames(XIb) <- rownames(XIb) <- c("b0", "b1")
XIc <- diag(1, 2)
colnames(XIc) <- rownames(XIc) <- c("c0", "c1")
tidyr::expand_grid gives me:
tidyr::expand_grid(as.data.frame(XIa), as.data.frame(XIb), as.data.frame(XIc))
# A tibble: 12 x 7
a0 a1 a2 b0 b1 c0 c1
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 1 0 1 0
2 1 0 0 1 0 0 1
3 1 0 0 0 1 1 0
4 1 0 0 0 1 0 1
5 0 1 0 1 0 1 0
6 0 1 0 1 0 0 1
7 0 1 0 0 1 1 0
8 0 1 0 0 1 0 1
9 0 0 1 1 0 1 0
10 0 0 1 1 0 0 1
11 0 0 1 0 1 1 0
12 0 0 1 0 1 0 1
How do I achieve the same result using data.table?
Clearly, there is this way:
dXIa <- data.table(XIa)
dXIb <- data.table(XIb)
dXIc <- data.table(XIc)
cbind(
dXIa[c(rep(1:3, each = 4))],
dXIb[c(rep(1:2, each = 2))],
dXIc[c(rep(1:2, len = 12))]
)
a0 a1 a2 b0 b1 c0 c1
1: 1 0 0 1 0 1 0
2: 1 0 0 1 0 0 1
3: 1 0 0 0 1 1 0
4: 1 0 0 0 1 0 1
5: 0 1 0 1 0 1 0
6: 0 1 0 1 0 0 1
7: 0 1 0 0 1 1 0
8: 0 1 0 0 1 0 1
9: 0 0 1 1 0 1 0
10: 0 0 1 1 0 0 1
11: 0 0 1 0 1 1 0
12: 0 0 1 0 1 0 1
but that is probably not optimal/ideal.
You can use CJ but it does not work with data.table directly. Using the function cjdt from this answer you can do -
library(data.table)
dXIa <- data.table(XIa)
dXIb <- data.table(XIb)
dXIc <- data.table(XIc)
cjdt <- function(a,b){
cj = CJ(1:nrow(a),1:nrow(b))
cbind(a[cj[[1]],],b[cj[[2]],])
}
Reduce(cjdt, list(dXIa, dXIb, dXIc))
# a0 a1 a2 b0 b1 c0 c1
# 1: 1 0 0 1 0 1 0
# 2: 1 0 0 1 0 0 1
# 3: 1 0 0 0 1 1 0
# 4: 1 0 0 0 1 0 1
# 5: 0 1 0 1 0 1 0
# 6: 0 1 0 1 0 0 1
# 7: 0 1 0 0 1 1 0
# 8: 0 1 0 0 1 0 1
# 9: 0 0 1 1 0 1 0
#10: 0 0 1 1 0 0 1
#11: 0 0 1 0 1 1 0
#12: 0 0 1 0 1 0 1
As an alternative to RonakShah's use of cjdt, here's a modified version that has two more features:
Guards against 0-row frames, which should really be a no-op for the 0-row frame;
Uses a single call to cbind instead of Reduce; while reduce isn't evil here, there may be benefits with a much longer list of frames/tables; and
While not a stated constraint here, it works with data.frame and data.table alike.
cjdt2 <- function(...) {
dots <- Filter(nrow, list(...))
eg <- do.call(expand.grid, lapply(sapply(dots, nrow), seq_len))
do.call(cbind, Map(function(x, i) x[i,], dots, eg))
}
cjdt2(XIa, XIb, XIc)
# a0 a1 a2 b0 b1 c0 c1
# a0 1 0 0 1 0 1 0
# a1 0 1 0 1 0 1 0
# a2 0 0 1 1 0 1 0
# a0 1 0 0 0 1 1 0
# a1 0 1 0 0 1 1 0
# a2 0 0 1 0 1 1 0
# a0 1 0 0 1 0 0 1
# a1 0 1 0 1 0 0 1
# a2 0 0 1 1 0 0 1
# a0 1 0 0 0 1 0 1
# a1 0 1 0 0 1 0 1
# a2 0 0 1 0 1 0 1
Which you can easily wrap with setDT (either externally or mod the function).
Here's another approach that uses data.table merge
expgridDT<-function(...) {
DTs<-list(...)
for(jj in 1:(length(DTs)-1)) {
DTs[[jj+1]]<-merge(DTs[[1]][,c(kfjekflj=1,.SD)], DTs[[2]][,c(kfjekflj=1,.SD)],by=.EACHI, allow.cartesian=TRUE)[,!"kfjekflj",with=FALSE]
}
return(DTs[[length(DTs)]][])
}
Essentially what this does is create's a dummy column on each data.table with a non-sense name (kfjekflj) to make a collision with a real column name unlikely. It sets that dummy column as the join by column. Then it merges two tables at a time with allow.cartesian turned on. It does that for every data.table that is passed to the function.
Here's a benchmark:
XIa <- diag(1, 50)
colnames(XIa) <- rownames(XIa) <- paste0("a",1:ncol(XIa))
XIb <- diag(1, 72)
colnames(XIb) <- rownames(XIb) <- paste0("b",1:ncol(XIb))
XIc <- diag(1, 80)
colnames(XIc) <- rownames(XIc) <- paste0("c",1:ncol(XIc))
XIa <- as.data.table(XIa)
XIb <- as.data.table(XIb)
XIc <- as.data.table(XIc)
microbenchmark(expgridDT(XIa, XIb, XIc), Reduce(cjdt, list(XIa, XIb, XIc)), cjdt2(XIa, XIb, XIc))
Unit: milliseconds
expr min lq mean median uq max neval
expgridDT(XIa, XIb, XIc) 167.5827 191.6542 264.8172 203.8769 231.6937 852.2033 100
Reduce(cjdt, list(XIa, XIb, XIc)) 164.4640 217.2215 252.2262 230.7276 255.6974 689.1763 100
cjdt2(XIa, XIb, XIc) 65611.1425 67829.0407 77024.1458 77151.0220 84385.0727 95048.6625 100

R: Merge rows by names in one column adding 1 whenever is present [duplicate]

This question already has answers here:
How to get the maximum value by group
(5 answers)
Closed 2 years ago.
I have a large dataset with one column with genes names, four columns with the detection methods (X1-X4) and three columns with type of mutation (Y5-Y7). I would like to merge the rows by the name of the gene and that the gene contain 1 whenever there is a 1 in one of the columns. Example of the table:
GENE X1 X2 X3 X4 Y5 Y6 Y7
AKT1 1 0 0 0 0 1 0
AKT1 0 0 1 0 0 0 1
AKT1 0 0 1 0 0 1 0
CENPF 0 1 0 0 0 1 0
CENPF 0 0 1 0 0 1 0
FOXA1 1 0 0 0 0 1 0
FOXA1 0 1 0 0 0 1 0
KMT2C 0 1 0 0 1 0 0
KMT2C 0 0 1 0 1 0 0
Example of the table results using the information of the above table.
GENE X1 X2 X3 X4 Y5 Y6 Y7
AKT1 1 0 1 0 0 1 1
CENPF 0 1 1 0 0 1 0
FOXA1 1 1 0 0 0 1 0
KMT2C 0 1 1 0 1 0 0
Thanks for your help
You can use rowsum to merge by GENE. rowsum sums up the values and with > 0 you get FALSE / TRUE in case it is larger than 0 and with + you get back values 0 or 1.
+(rowsum(x[-1], x$GENE) > 0)
# X1 X2 X3 X4 Y5 Y6 Y7
#AKT1 1 0 1 0 0 1 1
#CENPF 0 1 1 0 0 1 0
#FOXA1 1 1 0 0 0 1 0
#KMT2C 0 1 1 0 1 0 0
Data:
x <- read.table(header=TRUE, text="
GENE X1 X2 X3 X4 Y5 Y6 Y7
AKT1 1 0 0 0 0 1 0
AKT1 0 0 1 0 0 0 1
AKT1 0 0 1 0 0 1 0
CENPF 0 1 0 0 0 1 0
CENPF 0 0 1 0 0 1 0
FOXA1 1 0 0 0 0 1 0
FOXA1 0 1 0 0 0 1 0
KMT2C 0 1 0 0 1 0 0
KMT2C 0 0 1 0 1 0 0")
One way would be to take max for all the columns for each GENE.
This can be done in base R :
result <- aggregate(.~GENE, df, max, na.rm = TRUE)
result
# GENE X1 X2 X3 X4 Y5 Y6 Y7
#1 AKT1 1 0 1 0 0 1 1
#2 CENPF 0 1 1 0 0 1 0
#3 FOXA1 1 1 0 0 0 1 0
#4 KMT2C 0 1 1 0 1 0 0
dplyr :
library(dplyr)
df %>% group_by(GENE) %>% summarise(across(X1:Y7, max, na.rm = TRUE))
and data.table :
library(data.table)
setDT(df)[, lapply(.SD, max), GENE, .SDcols = X1:Y7]
Does this work:
library(dplyr)
dat %>% group_by(GENE) %>% summarise(across(X1:Y7, ~ case_when(1 %in% . ~ 1, TRUE ~ 0)))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 4 x 8
GENE X1 X2 X3 X4 Y5 Y6 Y7
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AKT1 1 0 1 0 0 1 1
2 CENPF 0 1 1 0 0 1 0
3 FOXA1 1 1 0 0 0 1 0
4 KMT2C 0 1 1 0 1 0 0
Data used:
dat
# A tibble: 9 x 8
GENE X1 X2 X3 X4 Y5 Y6 Y7
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AKT1 1 0 0 0 0 1 0
2 AKT1 0 0 1 0 0 0 1
3 AKT1 0 0 1 0 0 1 0
4 CENPF 0 1 0 0 0 1 0
5 CENPF 0 0 1 0 0 1 0
6 FOXA1 1 0 0 0 0 1 0
7 FOXA1 0 1 0 0 0 1 0
8 KMT2C 0 1 0 0 1 0 0
9 KMT2C 0 0 1 0 1 0 0

get value of column when column name is based on the value of another column

I have a data table which contains one million records and I try to create a new column based on month.idx:
dt[, new_col := get(paset0("month_",month.idx)]
and it only works for the first line.
Can anyone help me with this problem? thanks!
Data
id month_1 month_2 month_3 month_4 month_5 month.idx
1: x1 1 1 1 0 1 3
2: x2 0 0 0 1 0 4
3: x3 1 0 0 0 0 1
4: x4 0 0 0 0 0 5
5: x5 1 1 0 0 1 2
6: x6 0 1 0 1 1 3
7: x7 0 0 1 1 1 4
8: x8 0 0 0 0 0 1
9: x9 0 0 0 0 1 5
results:
id month_1 month_2 month_3 month_4 month_5 month.idx new_col
1: x1 1 1 1 0 1 3 1
2: x2 0 0 0 1 0 4 0
3: x3 1 0 0 0 0 1 0
4: x4 0 0 0 0 0 5 0
5: x5 1 1 0 0 1 2 0
6: x6 0 1 0 1 1 3 0
7: x7 0 0 1 1 1 4 1
8: x8 0 0 0 0 0 1 0
9: x9 0 0 0 0 1 5 0
expected:
id month_1 month_2 month_3 month_4 month_5 month.idx new_col
1: x1 1 1 1 0 1 3 1
2: x2 0 0 0 1 0 4 1
3: x3 1 0 0 0 0 1 1
4: x4 0 0 0 0 0 5 0
5: x5 1 1 0 0 1 2 1
6: x6 0 1 0 1 1 3 0
7: x7 0 0 1 1 1 4 0
8: x8 0 0 0 0 0 1 0
9: x9 0 0 0 0 1 5 1
Here are 2 options:
1) Using get row by row taking in Frank’s comment:
DT[, new_col := get(paste0("month_", month.idx)), by= month.idx]
2) Melt and then join to do a lookup
DT[, variable := paste0("month_", month.idx)]
DT[melt(DT, id.vars="id", measure.vars=patterns("^month_")),
on=.(id, variable), new_col := value]
Speed is dependent on the number of rows & month columns that you have.
data:
DT <- fread("id month_1 month_2 month_3 month_4 month_5 month.idx
x1 1 1 1 0 1 3
x2 0 0 0 1 0 4
x3 1 0 0 0 0 1
x4 0 0 0 0 0 5
x5 1 1 0 0 1 2
x6 0 1 0 1 1 3
x7 0 0 1 1 1 4
x8 0 0 0 0 0 1")

undefined columns error when trying to subset

subset_car_data <- car_data[car_data, car_data$Car_Type == "N" & car_data$Term == 60 & car_data$FICO>=675 & car_data$FICO<=725 & car_data$Amount>=30000 & car_data$Amount<=40000]
this is my code. I am attempting to create a subset subset_car_data from car_data with specific conditions. However, I keep getting the error:
df <- data.frame(replicate(10,sample(0:1,10,rep=TRUE)))
df
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 0 0 0 1 1 1 0 1 0 1
2 0 1 1 1 0 0 1 1 0 0
3 0 1 1 0 0 0 1 0 0 0
4 0 0 1 0 1 1 1 1 1 0
5 0 0 1 0 0 1 0 1 0 0
6 1 0 0 1 1 0 1 1 1 0
7 1 0 1 0 1 0 1 1 1 0
8 0 0 0 1 0 0 1 0 0 1
9 0 0 0 0 1 0 1 0 1 1
10 0 0 1 0 0 0 1 1 1 1
You should do something like:
subset_df <- df[df$X1 == 1 & df$X2 == 1 & df$X3 == 1,]
subset_df
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
8 1 1 1 0 0 1 0 0 0 0
10 1 1 1 1 0 1 0 0 0 1
Instead of:
subset_df <- df[df,df$X1 == 1 & df$X2 == 1 & df$X3 == 1]

Resources