R check equality of one column to rowSums of other columns - r

I have a dataframe like this:
x y x1 y1 x2 y2 x3 y3
1 0 1 0 0 0 0 0
0 0 3 0 0 0 0 0
2 0 0 0 0 0 2 0
1 0 0 0 1 0 0 0
I want to find the rows where x = x1 + x2 + x3 and the rows where y = y1 + y2 + y3.
Here is my code to check x=x1+x2+x3:
# Column positions of x1, x2 and x3 in df.
col_x = c(3,5,7)
# NOTE(review): with no comma inside `[ ]`, the logical vector is used to select
# columns (recycled across them) rather than rows.
df[df$x == rowSums(df[col_x])]
I expected it to return rows 1, 3 and 4, but it returned:
x x1 y1 x2 x3 y3
1 1 1 0 0 0 0
2 0 3 0 0 0 0
3 2 0 0 0 2 0
4 1 0 0 1 0 0
I also tried
# Column positions of x1, x2 and x3 in df.
col_x = c(3,5,7)
# Same comparison, with the row totals computed by apply() over rows (MARGIN = 1).
# NOTE(review): there is no comma inside `[ ]`, so the logical vector indexes
# columns rather than rows.
df[df$x == apply(df[col_x],1,sum)]
which also gives me:
x x1 y1 x2 x3 y3
1 1 1 0 0 0 0
2 0 3 0 0 0 0
3 2 0 0 0 2 0
4 1 0 0 1 0 0
I can't figure out why it returned all the rows and why it skipped column y2.

You are just missing a comma.
# Positions of the x1, x2 and x3 columns.
col_x <- c(3, 5, 7)
# The trailing comma makes the logical vector a row index; all columns are kept.
df[df$x == rowSums(df[col_x]), ]
x y x1 y1 x2 y2 x3 y3
1 1 0 1 0 0 0 0 0
3 2 0 0 0 0 0 2 0
4 1 0 0 0 1 0 0 0

A possible solution:
library(dplyr)

# Keep the rows where x equals the sum of the x<digit> columns and
# y equals the sum of the y<digit> columns.
df %>%
  filter(
    x == rowSums(across(matches("x\\d$"))) &
      y == rowSums(across(matches("y\\d$")))
  )
#> x y x1 y1 x2 y2 x3 y3
#> 1 1 0 1 0 0 0 0 0
#> 2 2 0 0 0 0 0 2 0
#> 3 1 0 0 0 1 0 0 0

Related

R loop to generate multiple variables based on a condition

The data is given as below.
# Six identical 0/1 vectors used as example input.
x1 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x2 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x3 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x4 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x5 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x6 <- c(1, 0, 0, 0, 1, 1, 1, 0)
# Assemble the vectors into a data frame with columns x1..x6.
my_data <- data.frame(x1, x2, x3, x4, x5, x6)
I want to use a loop to automate the following process:
# Flag each x<i> column: a<i> is 1 where the value is positive and not missing,
# and 0 otherwise. `!is.na()` replaces `is.na() != T`, which relied on the
# reassignable shorthand `T`.
my_data$a1 <- ifelse(my_data$x1 > 0 & !is.na(my_data$x1), 1, 0)
my_data$a2 <- ifelse(my_data$x2 > 0 & !is.na(my_data$x2), 1, 0)
my_data$a3 <- ifelse(my_data$x3 > 0 & !is.na(my_data$x3), 1, 0)
my_data$a4 <- ifelse(my_data$x4 > 0 & !is.na(my_data$x4), 1, 0)
my_data$a5 <- ifelse(my_data$x5 > 0 & !is.na(my_data$x5), 1, 0)
my_data$a6 <- ifelse(my_data$x6 > 0 & !is.na(my_data$x6), 1, 0)
Any help would be appreciated, thanks!
You can use the following code -
# Build a logical matrix marking the positive, non-missing cells; unary `+`
# coerces it to 0/1 before it is stored under the new names a1, a2, ...
my_data[paste0("a", seq_along(my_data))] <- +(!is.na(my_data) & my_data > 0)
my_data
# x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
#1 1 1 1 1 1 1 1 1 1 1 1 1
#2 0 0 0 0 0 0 0 0 0 0 0 0
#3 0 0 0 0 0 0 0 0 0 0 0 0
#4 0 0 0 0 0 0 0 0 0 0 0 0
#5 1 1 1 1 1 1 1 1 1 1 1 1
#6 1 1 1 1 1 1 1 1 1 1 1 1
#7 1 1 1 1 1 1 1 1 1 1 1 1
#8 0 0 0 0 0 0 0 0 0 0 0 0
This assigns 1 where the value is greater than 0 and is not NA. my_data > 0 & !is.na(my_data) returns logical values (TRUE/FALSE); putting + in front of it turns them into integers (1/0).
You can use the following for loop:
# Add one indicator column a<i> per existing column: 1 where the value is
# positive and not missing, 0 otherwise.
# seq_len() is evaluated once, before any a<i> column is added, and yields an
# empty sequence for a zero-column data frame (1:ncol() would give c(1, 0)).
for (i in seq_len(ncol(my_data))) {
  my_data[, paste0("a", i)] <-
    ifelse(my_data[, i] > 0 & !is.na(my_data[, i]), 1, 0)
}
Output
x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
1 1 1 1 1 1 1 1 1 1 1 1 1
2 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0
5 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 1 1 1
7 1 1 1 1 1 1 1 1 1 1 1 1
8 0 0 0 0 0 0 0 0 0 0 0 0
We can use tidyverse
library(dplyr)
library(stringr)

# For every column, add a 0/1 indicator (positive and not missing) named
# "a<column>", then strip the "x" so that e.g. "ax1" becomes "a1".
df <- my_data %>%
  mutate(
    across(
      everything(),
      ~ +(. > 0 & !is.na(.)),
      .names = "a{.col}"
    )
  ) %>%
  rename_with(~ str_remove(., "x"), starts_with("a"))
df
x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
1 1 1 1 1 1 1 1 1 1 1 1 1
2 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0
5 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 1 1 1
7 1 1 1 1 1 1 1 1 1 1 1 1
8 0 0 0 0 0 0 0 0 0 0 0 0
data.table
library(data.table)

# Six identical 0/1 vectors used as example input.
x1 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x2 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x3 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x4 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x5 <- c(1, 0, 0, 0, 1, 1, 1, 0)
x6 <- c(1, 0, 0, 0, 1, 1, 1, 0)
my_data <- data.frame(x1, x2, x3, x4, x5, x6)

# Convert to a data.table by reference, then add a1, a2, ... in place:
# 1 where the source column is positive and not missing, 0 otherwise.
# The trailing `[]` prints the updated table.
setDT(my_data)[, (paste0("a", seq_along(my_data))) := lapply(
  .SD,
  function(col) ifelse(col > 0 & !is.na(col), 1, 0)
)][]
#> x1 x2 x3 x4 x5 x6 a1 a2 a3 a4 a5 a6
#> 1: 1 1 1 1 1 1 1 1 1 1 1 1
#> 2: 0 0 0 0 0 0 0 0 0 0 0 0
#> 3: 0 0 0 0 0 0 0 0 0 0 0 0
#> 4: 0 0 0 0 0 0 0 0 0 0 0 0
#> 5: 1 1 1 1 1 1 1 1 1 1 1 1
#> 6: 1 1 1 1 1 1 1 1 1 1 1 1
#> 7: 1 1 1 1 1 1 1 1 1 1 1 1
#> 8: 0 0 0 0 0 0 0 0 0 0 0 0
Created on 2021-06-06 by the reprex package (v2.0.0)

R: Merge rows by names in one column adding 1 whenever is present [duplicate]

This question already has answers here:
How to get the maximum value by group
(5 answers)
Closed 2 years ago.
I have a large dataset with one column of gene names, four columns for the detection methods (X1-X4) and three columns for the type of mutation (Y5-Y7). I would like to merge the rows by gene name so that each gene has a 1 in a column whenever any of its rows has a 1 in that column. Example of the table:
GENE X1 X2 X3 X4 Y5 Y6 Y7
AKT1 1 0 0 0 0 1 0
AKT1 0 0 1 0 0 0 1
AKT1 0 0 1 0 0 1 0
CENPF 0 1 0 0 0 1 0
CENPF 0 0 1 0 0 1 0
FOXA1 1 0 0 0 0 1 0
FOXA1 0 1 0 0 0 1 0
KMT2C 0 1 0 0 1 0 0
KMT2C 0 0 1 0 1 0 0
Example of the table results using the information of the above table.
GENE X1 X2 X3 X4 Y5 Y6 Y7
AKT1 1 0 1 0 0 1 1
CENPF 0 1 1 0 0 1 0
FOXA1 1 1 0 0 0 1 0
KMT2C 0 1 1 0 1 0 0
Thanks for your help
You can use rowsum to merge by GENE. rowsum sums up the values and with > 0 you get FALSE / TRUE in case it is larger than 0 and with + you get back values 0 or 1.
# Sum every indicator column within each GENE, test the totals against 0,
# and let unary `+` turn the resulting TRUE/FALSE values into 1/0.
+(rowsum(x[-1], group = x$GENE) > 0)
# X1 X2 X3 X4 Y5 Y6 Y7
#AKT1 1 0 1 0 0 1 1
#CENPF 0 1 1 0 0 1 0
#FOXA1 1 1 0 0 0 1 0
#KMT2C 0 1 1 0 1 0 0
Data:
# Example input parsed from inline text: a GENE column followed by the 0/1
# indicator columns X1-X4 and Y5-Y7; the first line of the text is the header.
x <- read.table(header=TRUE, text="
GENE X1 X2 X3 X4 Y5 Y6 Y7
AKT1 1 0 0 0 0 1 0
AKT1 0 0 1 0 0 0 1
AKT1 0 0 1 0 0 1 0
CENPF 0 1 0 0 0 1 0
CENPF 0 0 1 0 0 1 0
FOXA1 1 0 0 0 0 1 0
FOXA1 0 1 0 0 0 1 0
KMT2C 0 1 0 0 1 0 0
KMT2C 0 0 1 0 1 0 0")
One way would be to take max for all the columns for each GENE.
This can be done in base R :
# Collapse to one row per GENE, taking the maximum of every other column;
# na.rm = TRUE is passed on to max().
result <- aggregate(. ~ GENE, data = df, FUN = max, na.rm = TRUE)
result
# GENE X1 X2 X3 X4 Y5 Y6 Y7
#1 AKT1 1 0 1 0 0 1 1
#2 CENPF 0 1 1 0 0 1 0
#3 FOXA1 1 1 0 0 0 1 0
#4 KMT2C 0 1 1 0 1 0 0
dplyr :
library(dplyr)

# Per-GENE maximum of every column from X1 through Y7, ignoring NAs.
# The lambda replaces `across(X1:Y7, max, na.rm = TRUE)`: forwarding extra
# arguments through across()'s `...` is deprecated as of dplyr 1.1.0.
df %>%
  group_by(GENE) %>%
  summarise(across(X1:Y7, ~ max(.x, na.rm = TRUE)))
and data.table :
library(data.table)

# Per-GENE maximum of columns X1 through Y7. na.rm = TRUE matches the base R
# and dplyr variants, so a missing value does not turn a group's result into NA.
setDT(df)[, lapply(.SD, max, na.rm = TRUE), by = GENE, .SDcols = X1:Y7]
Does this work:
library(dplyr)

# For each GENE, a column becomes 1 if any of its rows holds a 1, else 0.
dat %>%
  group_by(GENE) %>%
  summarise(across(X1:Y7, ~ if (1 %in% .x) 1 else 0))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 4 x 8
GENE X1 X2 X3 X4 Y5 Y6 Y7
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AKT1 1 0 1 0 0 1 1
2 CENPF 0 1 1 0 0 1 0
3 FOXA1 1 1 0 0 0 1 0
4 KMT2C 0 1 1 0 1 0 0
Data used:
dat
# A tibble: 9 x 8
GENE X1 X2 X3 X4 Y5 Y6 Y7
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AKT1 1 0 0 0 0 1 0
2 AKT1 0 0 1 0 0 0 1
3 AKT1 0 0 1 0 0 1 0
4 CENPF 0 1 0 0 0 1 0
5 CENPF 0 0 1 0 0 1 0
6 FOXA1 1 0 0 0 0 1 0
7 FOXA1 0 1 0 0 0 1 0
8 KMT2C 0 1 0 0 1 0 0
9 KMT2C 0 0 1 0 1 0 0

How might I generalize this code to fit larger sets of data?

I want to extend the usability of the code I've written. Even better, I would like to generalize it for future use.
I am using RStudio. I have recoded a 100-dimensional vector: each value from 1 to 10 has been converted to the corresponding row of an identity matrix. For instance, all values of 1 are now vectors that read 1 0 0 0 0 0 0 0 0 0, all values of 2 now read 0 1 0 0 0 0 0 0 0 0, and so on. Here is the code:
# Candidate values 1..10.
tens <- seq(from=1, to=10, by=1)
# Draw 100 values from `tens` with replacement (no seed is set here).
y <- sample(tens, size=100, replace=TRUE)
y
num.its <- 100
# Allocate a num.its x 10 matrix of zeros: one output row per element of y.
Y <- rep(0,num.its*10)
dim(Y) <- c(num.its,10)
# 10 x 10 identity matrix; row k is the indicator vector for value k.
I <- diag(10)
# Copy the identity row matching y[i] into row i of Y.
for(i in 1:100){
if(y[i]==1){
Y[i,] <- I[1,]
} else if (y[i]==2){
Y[i,] <- I[2,]
} else if (y[i]==3){
Y[i,] <- I[3,]
} else if (y[i]==4){
Y[i,] <- I[4,]
} else if (y[i]==5){
Y[i,] <- I[5,]
} else if (y[i]==6){
Y[i,] <- I[6,]
} else if (y[i]==7){
Y[i,] <- I[7,]
} else if (y[i]==8){
Y[i,] <- I[8,]
} else if (y[i]==9){
Y[i,] <- I[9,]
} else {
# Any value not matched above (here only 10) falls through to row 10.
Y[i,] <- I[10,]
}
}
The code works as planned. However, if I had to recode values from 1 to 2000, I would rather not write 2000 else-if statements. Any help would be appreciated. Thank you!
A pretty decent one-liner is the following:
# Example input: 1..5 followed by six values drawn from 1..10, with a fixed seed.
set.seed(1234)
x <- c(seq_len(5L), sample.int(10L, size = 6))
Our vector is
x
[1] 1 2 3 4 5 10 6 5 4 1 8
Then, convert x to a factor variable, specifying the desired levels, and use model.matrix to get a matrix of your desired vectors.
# x becomes a factor with levels 1..10, and `+ 0` removes the intercept from
# the formula before the design matrix is built.
model.matrix(~ . + 0, data = data.frame(x = factor(x, levels = 1:10)))
This returns
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
1 1 0 0 0 0 0 0 0 0 0
2 0 1 0 0 0 0 0 0 0 0
3 0 0 1 0 0 0 0 0 0 0
4 0 0 0 1 0 0 0 0 0 0
5 0 0 0 0 1 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 1
7 0 0 0 0 0 1 0 0 0 0
8 0 0 0 0 1 0 0 0 0 0
9 0 0 0 1 0 0 0 0 0 0
10 1 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 1 0 0
attr(,"assign")
[1] 1 1 1 1 1 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$x
[1] "contr.treatment"
Here, the rows represent what you want. You can use t to convert this to columns if desired. Note also that even though 7 is missing in x, that column is present in the matrix.
You could use the function dummy from the package dummy
# NOTE(review): judging by the printed output, this yields one indicator column
# per level present in x — confirm against the dummy package documentation.
dummy::dummy(data.frame(x=factor(x)))
x_1 x_2 x_3 x_4 x_5 x_6 x_8 x_9
1 1 0 0 0 0 0 0 0
2 0 1 0 0 0 0 0 0
3 0 0 1 0 0 0 0 0
4 0 0 0 1 0 0 0 0
5 0 0 0 0 1 0 0 0
6 0 1 0 0 0 0 0 0
7 0 0 0 0 0 1 0 0
8 0 0 0 0 1 0 0 0
9 0 0 0 0 0 0 1 0
10 0 0 0 0 0 0 0 1
11 0 0 0 1 0 0 0 0

undefined columns error when trying to subset

# NOTE(review): the first index passed to `[` is the data frame itself and the
# filter condition sits in the column position (after the comma); a row filter
# would normally go before the comma.
subset_car_data <- car_data[car_data, car_data$Car_Type == "N" & car_data$Term == 60 & car_data$FICO>=675 & car_data$FICO<=725 & car_data$Amount>=30000 & car_data$Amount<=40000]
This is my code. I am attempting to create a subset, subset_car_data, from car_data using specific conditions. However, I keep getting the "undefined columns" error.
# Example data: 10 columns, each holding 10 random draws from 0:1.
# `replace` is spelled out; `rep = TRUE` only worked through partial
# argument matching.
df <- data.frame(replicate(10, sample(0:1, 10, replace = TRUE)))
df
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 0 0 0 1 1 1 0 1 0 1
2 0 1 1 1 0 0 1 1 0 0
3 0 1 1 0 0 0 1 0 0 0
4 0 0 1 0 1 1 1 1 1 0
5 0 0 1 0 0 1 0 1 0 0
6 1 0 0 1 1 0 1 1 1 0
7 1 0 1 0 1 0 1 1 1 0
8 0 0 0 1 0 0 1 0 0 1
9 0 0 0 0 1 0 1 0 1 1
10 0 0 1 0 0 0 1 1 1 1
You should do something like:
# The row filter goes before the comma; the empty column slot keeps every column.
subset_df <- df[with(df, X1 == 1 & X2 == 1 & X3 == 1), ]
subset_df
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
8 1 1 1 0 0 1 0 0 0 0
10 1 1 1 1 0 1 0 0 0 1
Instead of:
# NOTE(review): here `df` itself is the row index and the condition sits in the
# column position.
subset_df <- df[df,df$X1 == 1 & df$X2 == 1 & df$X3 == 1]

Looking to multiply all rows of all columns by a constant

I am trying to find a simple way (one line of code or so) to multiply all rows of all columns of a data frame by a constant, for example 100.
# Example data: 10 columns, each holding 1000 random draws from 0:1.
# `replace` is spelled out; `rep = TRUE` only worked through partial
# argument matching.
df <- data.frame(replicate(10, sample(0:1, 1000, replace = TRUE)))
head(df)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 0 1 1 0 0 0 0 1 0 1
2 0 0 1 0 0 1 0 1 0 0
3 0 0 1 1 0 0 1 1 0 0
4 0 1 0 1 1 1 0 0 0 1
5 1 0 1 1 0 0 0 1 0 0
6 0 0 0 0 1 0 0 1 1 1
The way I am currently doing it;
# Scales a single column (X1) by 100 and wraps the result in its own data frame.
dfX1 <- as.data.frame(df$X1 * 100)
But this way I would have to do this 10 times... and then use the cbind function to bind them all back together again.
dfFULL <- cbind(dfX1, dfX2, dfX3...)
Anybody know of a cleaner way?

Resources