Use if else statement in R dataframe - r

d = data.frame(A=c(1,1),B=c(0,20),C=c(0,10))
d$B[d$B>0] = TRUE
d$C[d$C>0] = TRUE
d$B[d$B==0] = FALSE
d$C[d$C==0] = FALSE
Is there a way to do the last four operations in one line, e.g. with something like an if-else statement for a data frame?

We can compare the entire dataframe (except the first column) and check if the value is greater than 0 or not and convert that value into numeric
d[-1] <- as.numeric(d[-1] > 0)
d
# A B C
#1 1 0 0
#2 1 1 1
Or if we want to keep the values as logical we can remove the as.numeric call
d[-1] <- d[-1] > 0
# A B C
#1 1 FALSE FALSE
#2 1 TRUE TRUE

We can use tidyverse
library(dplyr)
# Turn columns 2 and 3 (B and C) into logicals: TRUE where the value is > 0.
# across() replaces the deprecated mutate_at()/funs() combination.
d %>%
  mutate(across(2:3, ~ .x > 0))
# A B C
#1 1 FALSE FALSE
#2 1 TRUE TRUE

You can use dplyr's if_else as follows:
library(dplyr)
d <- data.frame(A = c(1, 1), B = c(0, 20), C = c(0, 10))
# Use logical TRUE/FALSE (not the strings "TRUE"/"FALSE") so that B and C
# become logical columns rather than character columns.
d$B <- if_else(d$B > 0, TRUE, FALSE)
d$C <- if_else(d$C > 0, TRUE, FALSE)
A B C
1 1 FALSE FALSE
2 1 TRUE TRUE

Related

recoding based on two conditions in r

I have an example dataset looks like:
data <- as.data.frame(c("A","B","C","X1_theta","X2_theta","AB_theta","BC_theta","CD_theta"))
colnames(data) <- "category"
> data
category
1 A
2 B
3 C
4 X1_theta
5 X2_theta
6 AB_theta
7 BC_theta
8 CD_theta
I am trying to generate a logical variable when the category (variable) contains "theta" in it. However, I would like to assign the logical value as "FALSE" when cell values contain "X1" and "X2".
Here is what I did:
data$logic <- str_detect(data$category, "theta")
> data
category logic
1 A FALSE
2 B FALSE
3 C FALSE
4 X1_theta TRUE
5 X2_theta TRUE
6 AB_theta TRUE
7 BC_theta TRUE
8 CD_theta TRUE
here, all cells value that have "theta" have the logical value of "TRUE".
Then, I wrote this below to just assign "FALSE" when the cell value has "X" in it.
data$logic <- ifelse(grepl("X", data$category), "FALSE", "TRUE")
> data
category logic
1 A TRUE
2 B TRUE
3 C TRUE
4 X1_theta FALSE
5 X2_theta FALSE
6 AB_theta TRUE
7 BC_theta TRUE
8 CD_theta TRUE
But this, of course, overwrote the previous application
What I would like to get is to combine two conditions:
> data
category logic
1 A FALSE
2 B FALSE
3 C FALSE
4 X1_theta FALSE
5 X2_theta FALSE
6 AB_theta TRUE
7 BC_theta TRUE
8 CD_theta TRUE
Any thoughts?
Thanks
We can create the 'logic', by detecting substring 'theta' at the end and not having 'X' ([^X]) as the starting (^) character
library(dplyr)
library(stringr)
library(tidyr)
# logic is TRUE only when `category` ends in "theta" and does not start
# with "X".
data %>%
  mutate(logic = str_detect(category, "^[^X].*theta$"))
If we need to split the column into separate columns based on the conditions
data %>%
mutate(logic = str_detect(category, "^[^X].*theta$"),
category = case_when(logic ~ str_replace(category, "_", ","),
TRUE ~ as.character(category))) %>%
separate(category, into = c("split1", "split2"), sep= ",", remove = FALSE)
# category split1 split2 logic
#1 A A <NA> FALSE
#2 B B <NA> FALSE
#3 C C <NA> FALSE
#4 X1_theta X1_theta <NA> FALSE
#5 X2_theta X2_theta <NA> FALSE
#6 AB,theta AB theta TRUE
#7 BC,theta BC theta TRUE
#8 CD,theta CD theta TRUE
Or in base R
data$logic <- with(data, grepl("^[^X].*theta$", category))
Another option is to have two grepl condition statements
data$logic <- with(data, grepl("theta$", category) & !grepl("^X\\d+", category))
data$logic
#[1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
Not the cleanest in the world (since it adds 2 unnecessary cols) but it gets the job done:
# Build the example data: a single column of category labels.
data <- data.frame(
  category = c("A", "B", "C", "X1_theta", "X2_theta",
               "AB_theta", "BC_theta", "CD_theta")
)
# logic1: TRUE when the label does not contain "X".
data$logic1 <- !grepl("X", data$category)
# logic2: TRUE when the label contains "theta".
data$logic2 <- grepl("theta", data$category)
# logic: both conditions hold. grepl() already returns logicals, so no
# ifelse() wrapper or `== TRUE` comparison is needed.
data$logic <- data$logic1 & data$logic2
print(data)
I think you can also remove the logic1 and logic2 cols if you want but I usually don't bother (I'm a messy coder haha).
Hope this helped!
EDIT: akrun's grepl solution does what I'm doing way more cleanly (as in, it doesn't require the extra cols). I definitely recommend that approach!

R: How to make extra rows from a column?

I have a data-set of human hands, where currently a single person is defined as a single observation. I want to reshape dataframe to have hands as individual observations. I tried something with "dplyr" package and "gather" function but had no success at all.
So from this, where each person is on one row :
id Gender Age Present_R Present_L Dominant
1 F 2 TRUE TRUE R
2 F 5 TRUE FALSE L
3 M 8 FALSE FALSE R
to this, where each hand is on one row:
id Gender Age Hand Present Dominant
1 F 2 R TRUE TRUE
2 F 2 L TRUE FALSE
3 F 5 R TRUE FALSE
4 F 5 L FALSE TRUE
5 M 8 R FALSE TRUE
6 M 8 L FALSE FALSE
Note that hand dominance becomes logical.
We can gather into 'long' format, arrange by 'id', then create the 'Dominant' by unlisting the 'Present' columns, 'Hand' by removing the substring of the 'Hand' column
library(tidyverse)
gather(df1, Hand, Present, Present_R:Present_L) %>%
arrange(id) %>%
mutate(Dominant = unlist(df1[c("Present_L", "Present_R")]),
id = row_number(),
Hand = str_remove(Hand, ".*_"))
# id Gender Age Dominant Hand Present
#1 1 F 2 TRUE R TRUE
#2 2 F 2 FALSE L TRUE
#3 3 F 5 FALSE R TRUE
#4 4 F 5 TRUE L FALSE
#5 5 M 8 TRUE R FALSE
#6 6 M 8 FALSE L FALSE
Based on the OP's comments, it seems like we need to compare the 'Dominant' with the 'Hand'
gather(df1, Hand, Present, Present_R:Present_L) %>%
arrange(id) %>%
mutate(id = row_number(),
Hand = str_remove(Hand, ".*_"),
Dominant = Dominant == Hand)
# id Gender Age Dominant Hand Present
#1 1 F 2 TRUE R TRUE
#2 2 F 2 FALSE L TRUE
#3 3 F 5 FALSE R TRUE
#4 4 F 5 TRUE L FALSE
#5 5 M 8 TRUE R FALSE
#6 6 M 8 FALSE L FALSE
With a small data frame (i.e., few variables, regardless of the number of cases), "hand-coding" may be the easiest approach:
with(df, data.frame(id = c(id,id), Gender=c(Gender,Gender), Age=c(Age, Age),
Hand = c(rep("R", nrow(df)), rep("L", nrow(df))),
Present = c(Present_R, Present_L),
Dominant = c(Dominant=="R", Dominant=="L")
))

R - use if statement to regroup variable

I want to regroup a variable into a new one.
If value is 0, new one should be 0 too.
If value is 999, then make it missing, NA.
Everything else 1
This is my try:
id <- 1:10
variable <- c(0,0,0,1,2,3,4,5,999,999)
df <- data.frame(id,variable)
df$variable2 <-
if (df$variable == 0) {
df$variable2 = 0
} else if (df$variable == 999){
df$variable2 = NA
} else {
df$variable2 = 1
}
And this the error message:
In if (df$variable == 0) { : the condition has length > 1 and only
the first element will be used
A pretty basic question but I'm a basic user. Thanks in advance!
Try ifelse
df$variable2 <- ifelse(df$variable == 999, NA, ifelse(df$variable > 0, 1, 0))
df
# id variable variable2
#1 1 0 0
#2 2 0 0
#3 3 0 0
#4 4 1 1
#5 5 2 1
#6 6 3 1
#7 7 4 1
#8 8 5 1
#9 9 999 NA
#10 10 999 NA
When you do df$variable == 0 the output / condition is
#[1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
where it should be a length-one logical vector that is not NA in if(condition), see ?"if".
You can avoid ifelse, for example, like so
df$variable2 <- df$variable
df$variable2[df$variable2 == 999] <- NA
df$variable2[df$variable2 > 0] <- 1
It might be easier to avoid the if/else statement altogether by using conditional statements within subset notation:
when df$variable is equal to zero, change it to zero
df$variable[df$variable==0] <- 0
when df$variable is equal to 999, change it to NA
df$variable[df$variable==999] <- NA
when df$variable is greater than 0 and is not equal to NA, change it to 1
# !is.na() is the idiomatic NA guard (no comparison against the string 'FALSE').
df$variable[!is.na(df$variable) & df$variable > 0] <- 1
Looks like you want to recode your variable. You can do this (and other data/variable transformations) with the sjmisc-package, in your case with the rec()-command:
id <- 1:10
variable <- c(0,0,0,1,2,3,4,5,999,999)
df <- data.frame(id,variable)
library(sjmisc)
rec(df, variable, rec = c("0=0;999=NA;else=1"))
#> id variable variable_r
#> 1 1 0 0
#> 2 2 0 0
#> 3 3 0 0
#> 4 4 1 1
#> 5 5 2 1
#> 6 6 3 1
#> 7 7 4 1
#> 8 8 5 1
#> 9 9 999 NA
#> 10 10 999 NA
# or a single vector as input
rec(df$variable, rec = c("0=0;999=NA;else=1"))
#> [1] 0 0 0 1 1 1 1 1 NA NA
There are many examples, also in the help-file, and you can find a sjmisc-cheatsheet at the RStudio-Cheatsheet collection (or direct PDF-download here).
# vapply() pins the result to one numeric per element (sapply()'s return type
# depends on its input); NA_real_ keeps the missing value numeric as well.
df$variable2 <- vapply(df$variable,
  function(el) if (el == 0) 0 else if (el == 999) NA_real_ else 1, numeric(1))
This one-liner reflects your:
If value is 0, new one should be 0 too. If value ist 999, then make it
missing, NA. Everything else 1
Well, it is slightly slower than @markus's second or @SPJ's solutions, which are the most R-ish solutions.
Why one should put away the hands from ifelse
tt <- c(TRUE, FALSE, TRUE, FALSE)
a <- c("a", "b", "c", "d")
b <- 1:4
ifelse(tt, a, b) ## [1] "a" "2" "c" "4"
# totally perfect and as expected!
df <- data.frame(a=a, b=b, c=tt)
df$d <- ifelse(df$c, df$a, df$b)
## > df
## a b c d
## 1 a 1 TRUE 1
## 2 b 2 FALSE 2
## 3 c 3 TRUE 3
## 4 d 4 FALSE 4
######### This is wrong!! ##########################
## NOTE(review): the explanation below is a misdiagnosis. ifelse() is
## applied element-wise here too; df$d differs because data.frame()
## stored `a` as a factor (the default before R 4.0), so ifelse()
## returned the factor's integer codes instead of the letters.
## df$d is not [1] "a" "2" "c" "4"
## the problem is that
## ifelse(df$c, df$a, df$b)
## returns for each TRUE or FALSE the entire
## df$a or df$b instead of treating it like a vector.
## Since the last df$c is FALSE, df$b is returned
## Thus we get df$b for df$d.
## Quite an unintuitive behaviour.
##
## If one uses purely vectors, ifelse is fine.
## But actually df$c, df$a, df$b should be treated each like a vector.
## However, `ifelse` does not.
## No warnings that using `ifelse` with them will lead to a
## totally different behaviour.
## In my view, this is a design mistake of `ifelse`.
## Thus I decided myself to abandon `ifelse` from my set of R commands.
## To avoid that such kind of mistakes can ever happen.
#####################################################
As @Parfait pointed out correctly, it was a misinterpretation.
The problem was that df$a was treated in the data frame as a factor.
# Keep `a` as character (not factor) so that ifelse() returns the letters.
# The argument must be spelled `stringsAsFactors`; a misspelled name is not
# matched as an argument and is instead added to the data frame as a column.
df <- data.frame(a = a, b = b, c = tt, stringsAsFactors = FALSE)
df$d <- ifelse(df$c, df$a, df$b)
df
Gives the correct result.
a b c d
1 a 1 TRUE a
2 b 2 FALSE 2
3 c 3 TRUE c
4 d 4 FALSE 4
Thank you @Parfait for pointing that out!
Strange that I didn't recognize that in my initial trials.
But yeah, you are absolutely right!

R: Nested for-loop to compare rows and build new data frame

I created two nested for loops to complete the following:
Iterating through each column that is not the first column:
Iterate through each row i that is NOT the last row (the last row is denoted j)
Compare the value in i to the value in j.
If i is NA, i = NA.
If i >= j, i = 0.
If i < j, i = 1.
Store the results of all iterations across all columns and rows in a df.
The code below creates some test data but produces a Value "out" that is NULL (empty). Any recommendations?
# Create df
a <- rnorm(5)
b <- c(rnorm(3),NA,rnorm(1))
c <- rnorm(5)
df <- data.frame(a,b,c)
rows <- nrow(df) # rows
cols <- ncol(df) # cols
out <- for (c in 2:cols){
for (r in 1:(rows - 1)){
ifelse(
is.na(df[r,c]),
NA,
df[r, c] <- df[r, c] < df[rows, c])
}
}
There's no need for looping at all. Use a vectorised function like sweep to compare via > your last row - df[nrow(df),] vs all the other rows df[-nrow(df),]:
df
# a b c
#1 -0.2739735 0.5095727 0.30664838
#2 0.7613023 -0.1509454 -0.08818313
#3 -0.4781940 1.5760307 0.46769601
#4 1.1754130 NA 0.33394212
#5 0.5448537 1.0493805 -0.10528847
sweep(df[-nrow(df),], 2, unlist(df[nrow(df),]), FUN=`>`)
# a b c
#1 FALSE FALSE TRUE
#2 TRUE FALSE TRUE
#3 FALSE TRUE TRUE
#4 TRUE NA TRUE
sweep(df[-nrow(df),], 2, unlist(df[nrow(df),]), FUN=`>`) + 0
# a b c
#1 0 0 1
#2 1 0 1
#3 0 1 1
#4 1 NA 1
Here is another option. We can replicate the last row to make the dimensions of both datasets equal and then do the > to get a logical index, which can be coerced to binary by wrapping with +.
+(df[-nrow(df),] > df[nrow(df),][col(df[-nrow(df),])])
# a b c
#1 0 0 1
#2 1 0 1
#3 0 1 1
#4 1 NA 1

Collapse consecutive rows in a data frame

I have this example data.frame:
df <- data.frame(a = c(1,2,3,5,7,8),b=c(2,3,4,6,8,9))
And I'd like to collapse all rows i whose b column value is equal to the a column value of the subsequent row (i+1), such that in the collapsed row the a column will be that of row i and the b column will be that of row i+1. This has to be repeated until there are no consecutive rows left that meet this condition.
For the example df rows 1-3 are to be collapsed, row 4 left as is, and then rows 5-6 collapsed, giving:
res.df <- data.frame(a = c(1,5,7), b = c(4,6,9))
This isn't overly pretty, but it is vectorised comparing a cutdown version of df$a to df$b.
grps <- rev(cumsum(rev(c(tail(df$a,-1) != head(df$b,-1),TRUE))))
#[1] 3 3 3 2 1 1
cbind(df["a"], b=ave(df$b,grps,FUN=max) )[!duplicated(grps),]
# a b
#1 1 4
#4 5 6
#5 7 9
Breaking it down probably helps explain the first part:
tail(df$a,-1) != head(df$b,-1)
#[1] FALSE FALSE TRUE TRUE FALSE
c(tail(df$a,-1) != head(df$b,-1),TRUE)
#[1] FALSE FALSE TRUE TRUE FALSE TRUE
rev(c(tail(df$a,-1) != head(df$b,-1),TRUE))
#[1] TRUE FALSE TRUE TRUE FALSE FALSE
cumsum(rev(c(tail(df$a,-1) != head(df$b,-1),TRUE)))
#[1] 1 1 2 3 3 3

Resources