Creating a new column based on other columns in a dataframe R - r

I have a dataframe that looks like this:
df <- data.frame('col1'=c(1,2,2,4,5), 'col2'=c(4,9,3,5,13), 'col3'=c(3,5,8,7,10))
> df
col1 col2 col3
1 1 4 3
2 2 9 5
3 2 3 8
4 4 5 7
5 5 13 10
I want to create a new column that has a value of 1 if at least one of the values in the row is greater than or equal to 8, and a value of 0 if all of the values in the row are less than 8. So the final result would look something like this:
> df
col1 col2 col3 new
1 1 4 3 0
2 2 9 5 1
3 2 3 8 1
4 4 5 7 0
5 5 13 10 1
Thanks!

This works:
df$new <- apply(df, 1, function(x) max(x >= 8))
df
# col1 col2 col3 new
# 1 1 4 3 0
# 2 2 9 5 1
# 3 2 3 8 1
# 4 4 5 7 0
# 5 5 13 10 1

Using rowSums.
df$new <- +(rowSums(df>=8, na.rm=TRUE) > 0); df
col1 col2 col3 new
1 1 4 3 0
2 2 9 5 1
3 2 3 8 1
4 4 5 7 0
5 5 13 10 1

Alternatively using matrix multiplication
df$new <- as.numeric(((df >= 8) %*% rep(1, ncol(df))) > 0)
df
col1 col2 col3 new
1 1 4 3 0
2 2 9 5 1
3 2 3 8 1
4 4 5 7 0
5 5 13 10 1
# Or logical column
df$new <- ((df >= 8) %*% rep(1, ncol(df))) > 0
df
col1 col2 col3 new
1 1 4 3 FALSE
2 2 9 5 TRUE
3 2 3 8 TRUE
4 4 5 7 FALSE
5 5 13 10 TRUE

Related

R: Create a dummy variable if a particular value exists in any of the previous columns

I have a data frame that has 238 rows and 10 columns. I want to create a new column at the end that contains a dummy variable that says "yes" if the number "1" exists in any of the 10 columns and "no" if none of the columns have "1" in them.
I tried
df$dummy = (ifelse(any(x == 1) %in% df[], 'yes', 'no'))
view(df)
but it didn't work.
Any input would be greatly appreciated!
If you want to use any you have to apply it to the rows (MARGIN=1).
The solution given by @dash2 is of course a lot shorter and most likely also faster (see rowSums(df==1) in the comments to the question).
# Create a dummy data set: ten columns (c1..c10), each holding ten random
# draws from 1..10, generated one column after another.
n_cols <- 10
draws <- lapply(seq_len(n_cols), function(k) {
  sample.int(10, size = 10, replace = TRUE)
})
names(draws) <- paste0("c", seq_len(n_cols))
df <- as.data.frame(draws)
# Flag every row that contains at least one 1 (any() applied row-wise).
df$new <- apply(df, MARGIN = 1, function(row_vals) any(row_vals == 1))
df
#> c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 new
#> 1 5 6 1 2 5 8 10 6 4 7 TRUE
#> 2 9 3 7 3 2 4 2 8 3 4 FALSE
#> 3 6 2 2 9 8 1 6 6 10 8 TRUE
#> 4 10 5 3 6 6 7 3 6 2 8 FALSE
#> 5 4 8 2 10 10 5 5 2 10 10 FALSE
#> 6 4 2 8 8 2 9 7 7 2 2 FALSE
#> 7 9 4 3 3 7 5 10 6 3 8 FALSE
#> 8 7 7 6 9 3 2 2 7 3 2 FALSE
#> 9 7 7 9 9 1 1 3 2 5 5 TRUE
#> 10 3 2 10 2 3 5 2 1 4 3 TRUE
Created on 2022-06-26 by the reprex package (v2.0.1)
> #Creating Random data frame with 2 variables ranges from 0 to 9
>
> set.seed(200)
>
> df <- data.frame(val1 = sample(0:9,100,replace = TRUE),
+ val2 = sample(0:9,100,replace = TRUE))
> df %>% filter(val1 ==1 | val2 ==1)
val1 val2
1 1 7
2 1 8
3 1 6
4 5 1
5 7 1
6 1 4
7 2 1
8 7 1
9 1 8
10 6 1
11 9 1
12 2 1
13 7 1
14 1 0
15 0 1
16 3 1
17 1 5
18 9 1
19 7 1
20 0 1
21 1 1
> # we notice 21 rows with "1" in at least one of the two vars.
>
> #mutate a new "dummy" column in a new dataframe
> df1 <- df %>%
+ mutate(dummy = ifelse(rowSums(df==1) > 0, "yes", "no"))
>
>
> df1 %>%filter(val1 == 1 | val2 == 1)
val1 val2 dummy
1 1 7 yes
2 1 8 yes
3 1 6 yes
4 5 1 yes
5 7 1 yes
6 1 4 yes
7 2 1 yes
8 7 1 yes
9 1 8 yes
10 6 1 yes
11 9 1 yes
12 2 1 yes
13 7 1 yes
14 1 0 yes
15 0 1 yes
16 3 1 yes
17 1 5 yes
18 9 1 yes
19 7 1 yes
20 0 1 yes
21 1 1 yes
> #we see the same 21 occurrences, each labeled "yes"
>
> #a random sample of the dataframe
> sample_n(df1,10)
val1 val2 dummy
1 2 7 no
2 7 6 no
3 5 8 no
4 7 9 no
5 5 4 no
6 9 1 yes
7 5 6 no
8 6 9 no
9 6 6 no
10 3 7 no

How to recode multiple variables for a subset of a dataframe?

I'm lost, so any directions would be helpful. Let's say I have a dataframe:
df <- data.frame(
id = 1:12,
v1 = rep(c(1:4), 3),
v2 = rep(c(1:3), 4),
v3 = rep(c(1:6), 2),
v4 = rep(c(1:2), 6))
My goal would be to recode 2=4 and 4=2 for variables v3 and v4 but only for the first 4 cases (id < 5). I'm looking for a solution that works for up to twenty variables. I know how to do basic recoding but I don't see a simple way to implement the subset condition while manipulating multiple variables.
Here is a base R solution,
# Recode 2 <-> 4 in v3 and v4, restricted to the cases with id < 5.
# Rows are chosen by the id condition itself (which() also drops NA ids)
# instead of a hard-coded row range, so exactly the requested cases are touched.
recode_rows <- which(df$id < 5)
recode_cols <- c("v3", "v4")
# Integer replacements (4L / 2L) keep integer columns integer.
df[recode_rows, recode_cols] <- lapply(df[recode_rows, recode_cols], function(x)
  ifelse(x == 2, 4L, ifelse(x == 4, 2L, x)))
which gives,
id v1 v2 v3 v4
1 1 1 1 1 1
2 2 2 2 4 4
3 3 3 3 3 1
4 4 4 1 2 4
5 5 1 2 5 1
6 6 2 3 6 2
7 7 3 1 1 1
8 8 4 2 2 2
9 9 1 3 3 1
10 10 2 1 4 2
11 11 3 2 5 1
12 12 4 3 6 2
You can try mutate_at with case_when in dplyr
library(dplyr)
df %>%
mutate_at(vars(v3:v4), ~case_when(id < 5 & . == 4 ~ 2L,
id < 5 & . == 2 ~ 4L,
TRUE ~.))
# id v1 v2 v3 v4
#1 1 1 1 1 1
#2 2 2 2 4 4
#3 3 3 3 3 1
#4 4 4 1 2 4
#5 5 1 2 5 1
#6 6 2 3 6 2
#7 7 3 1 1 1
#8 8 4 2 2 2
#9 9 1 3 3 1
#10 10 2 1 4 2
#11 11 3 2 5 1
#12 12 4 3 6 2
With mutate_at you can specify a range of columns to apply the function to.
Another, more direct, option is to get the indices of the numbers to replace, and to replace them by 6 minus the number (6-4=2, 6-2=4):
# Swap 2 <-> 4 in v3/v4 for the first four cases only (id < 5).
# which(..., arr.ind = TRUE) returns (row, col) positions relative to the
# subset; because the subset is the leading rows, the same positions index
# the full v3/v4 columns.
recode_cols <- c("v3", "v4")
first_cases <- df[1:4, recode_cols]
whToChange <- which(first_cases == 2 | first_cases == 4, arr.ind = TRUE)
# 6 - x maps 2 -> 4 and 4 -> 2.
df[, recode_cols][whToChange] <- 6 - df[, recode_cols][whToChange]
head(df, 5)
# id v1 v2 v3 v4
#1 1 1 1 1 1
#2 2 2 2 4 4
#3 3 3 3 3 1
#4 4 4 1 2 4
#5 5 1 2 5 1
You can use match and a lookup table - just in case you have to recode more than two values.
rosetta <- matrix(c(2,4,4,2), 2)
df[1:4, c("v3", "v4")] <- lapply(df[1:4, c("v3", "v4")], function(x) {
i <- match(x, rosetta[1,]); j <- !is.na(i); "[<-"(x, j, rosetta[2, i[j]])})
df
# id v1 v2 v3 v4
#1 1 1 1 1 1
#2 2 2 2 4 4
#3 3 3 3 3 1
#4 4 4 1 2 4
#5 5 1 2 5 1
#6 6 2 3 6 2
#7 7 3 1 1 1
#8 8 4 2 2 2
#9 9 1 3 3 1
#10 10 2 1 4 2
#11 11 3 2 5 1
#12 12 4 3 6 2
Have also a look at R: How to recode multiple variables at once or Recoding multiple variables in R

remove i+1th term if reoccuring

Say we have the following data
A <- c(1,2,2,2,3,4,8,6,6,1,2,3,4)
B <- c(1,2,3,4,5,1,2,3,4,5,1,2,3)
data <- data.frame(A,B)
How would one write a function so that, for A, if the same value appears again in the (i+1)th position, the recurring row is removed?
Therefore the output should look like
data.frame(c(1,2,3,4,8,6,1,2,3,4), c(1,2,5,1,2,3,5,1,2,3))
My best guess would be using a for statement, however I have no experience in these
You can try
data[c(TRUE, data[-1,1]!= data[-nrow(data), 1]),]
Another option, dplyr-esque:
library(dplyr)
dat1 <- data.frame(A=c(1,2,2,2,3,4,8,6,6,1,2,3,4),
B=c(1,2,3,4,5,1,2,3,4,5,1,2,3))
# Keep a row when it is the first row or when A differs from the previous A.
# The first row is kept explicitly: with lag(A, default = FALSE) the first
# comparison is against FALSE (i.e. 0), so a leading A of 0 would be dropped.
dat1 %>% filter(row_number() == 1 | A != lag(A))
## A B
## 1 1 1
## 2 2 2
## 3 3 5
## 4 4 1
## 5 8 2
## 6 6 3
## 7 1 5
## 8 2 1
## 9 3 2
## 10 4 3
using diff, which calculates the pairwise differences with a lag of 1:
data[c( TRUE, diff(data[,1]) != 0), ]
output:
A B
1 1 1
2 2 2
5 3 5
6 4 1
7 8 2
8 6 3
10 1 5
11 2 1
12 3 2
13 4 3
Using rle
A <- c(1,2,2,2,3,4,8,6,6,1,2,3,4)
B <- c(1,2,3,4,5,1,2,3,4,5,1,2,3)
data <- data.frame(A,B)
X <- rle(data$A)
Y <- cumsum(c(1, X$lengths[-length(X$lengths)]))
View(data[Y, ])
row.names A B
1 1 1 1
2 2 2 2
3 5 3 5
4 6 4 1
5 7 8 2
6 8 6 3
7 10 1 5
8 11 2 1
9 12 3 2
10 13 4 3

Perform function over groups in columns in R

I am completely new to R and have a question about performing a function over a column.
data <- read.table(text ="group; val
a; 4
a; 24
a; 12
b; 1
a; 2
c; 4
c; 5
b; 6 ", sep=";", header=T,stringsAsFactors = FALSE)
How could I add data in the following way?
I would like to create two new columns which I am doing like this:
data$col1 <- 0
data$col2 <- 1
What I now want to do is to add +2 for each group value into the new columns and reach the following pattern:
group val col1 col2
a 4 0 1
a 24 0 1
a 12 0 1
b 1 2 3
a 2 0 1
c 4 4 5
c 5 4 5
b 6 2 3
How could I do this? I hope I made my example more or less clear.
Try this:
Creating an index to cumulatively add +2 depending on the number of groups
indx <- c(0, 2 * seq_len(length(unique(data[, 1])) - 1))
Splitting the data set by groups, adding (cumulatively) +2 and unsplitting back so everything comes back in place
data[, 3:4] <- unsplit(Map(`+`, split(data[, 3:4], data[, 1]), indx), data[, 1])
data
# group val col1 col2
# 1 a 4 0 1
# 2 a 24 0 1
# 3 a 12 0 1
# 4 b 1 2 3
# 5 a 2 0 1
# 6 c 4 4 5
# 7 c 5 4 5
# 8 b 6 2 3
Or you could do
within(data, {col1 <- 2*(as.numeric(factor(group))-1)
col2 <- col1+1})[,c(1:2,4:3)]
# group val col1 col2
#1 a 4 0 1
#2 a 24 0 1
#3 a 12 0 1
#4 b 1 2 3
#5 a 2 0 1
#6 c 4 4 5
#7 c 5 4 5
#8 b 6 2 3
Using data.table
library(data.table)
# Add col1/col2 by reference: col1 is 2 * (group's order of first appearance - 1)
# and col2 is col1 + 1.
# indx must be assigned before it is reused; inside list(), `indx = ...` only
# names a list element, so a following `indx + 1` would not see that value.
setDT(data)[, c("col1", "col2") := {
  indx <- 2 * (match(group, unique(group)) - 1)
  list(indx, indx + 1)
}]
data
# group val col1 col2
#1: a 4 0 1
#2: a 24 0 1
#3: a 12 0 1
#4: b 1 2 3
#5: a 2 0 1
#6: c 4 4 5
#7: c 5 4 5
#8: b 6 2 3

Delete specific row in a data frame?

I have a data frame that looks like this:
col1 col2 col3
1 0 1 5
2 0 3 0
3 5 4 5
4 5 5 0
5 5 3 7
I want to delete every row that contains the string '0' in column 'col1' and '5' in column 'col3'. How can I accomplish this in R?
col1 col2 col3
2 0 3 0
3 5 4 5
4 5 5 0
5 5 3 7
Thank U.
Assuming your data set called df
# Drop the rows where col1 is 0 and col3 is 5.
# %in% never returns NA, so rows with a missing col1/col3 are kept as they
# are instead of being turned into all-NA rows by an NA in the row mask.
df[!(df$col1 %in% 0 & df$col3 %in% 5), ]
## col1 col2 col3
## 2 0 3 0
## 3 5 4 5
## 4 5 5 0
## 5 5 3 7

Resources