for loop combined with if statement in R - r

I am working in R and want to iterate over every unique/distinct Name in this table: if A == "yes" | B == "yes" on any row for that Name, a new column C should be TRUE for all entries with the same Name; otherwise C should be FALSE. I don't know how to combine a for loop with this if statement and I keep getting error messages, although it should be a simple task...
Name
A
B
Jordan
yes
no
Pascal
yes
no
Nando
no
yes
Nando
no
no
Nico
no
no
Nico
no
no
This should be the result:
Name
A
B
C
Jordan
yes
no
TRUE
Pascal
yes
no
TRUE
Nando
no
yes
TRUE
Nando
no
no
TRUE
Nico
no
no
FALSE
Nico
no
no
FALSE

For-loops are often not needed in R.
library(dplyr)
# For each Name group, set C to TRUE on every row when any A or B value in
# the group is "yes". `%in%` already yields a single logical per group, which
# mutate() recycles across the group's rows, so no if_else() wrapper is needed.
# The result stays grouped by Name; add ungroup() if later steps should not
# operate group-wise.
dat |>
  group_by(Name) |>
  mutate(C = "yes" %in% c(A, B))
#> # A tibble: 6 x 4
#> # Groups: Name [4]
#> Name A B C
#> <chr> <chr> <chr> <lgl>
#> 1 Jordan yes no TRUE
#> 2 Pascal yes no TRUE
#> 3 Nando no yes TRUE
#> 4 Nando no no TRUE
#> 5 Nico no no FALSE
#> 6 Nico no no FALSE
Created on 2022-07-05 by the reprex package (v2.0.1)

R is a language that prefers vector calculations over loops, so the more common way in R would be
# Example data: one row per observation, with a yes/no answer in A and in B.
df <- data.frame(
  Name = c("Jordan", "Pascal", "Nando", "Nando", "Nico", "Nico"),
  A = c("yes", "yes", "no", "no", "no", "no"),
  B = c("no", "no", "yes", "no", "no", "no")
)
# Collect the names that have a "yes" in A or B on at least one row, then
# flag every row carrying one of those names.
df$C <- is.element(df$Name, with(df, Name[A == "yes" | B == "yes"]))
This solution does not rely on any additional library.
If you feel strongly about looping, you could loop over unique(df$Name) or you could aggregate by df$Name, but all of those are much more involved and inefficient techniques.

Related

Check whether a string appears in another in R

I've got a tibble containing sentences like that :
df <- tibble(sentences = c("Bob is looking for something", "Adriana has an umbrella", "Michael is looking at..."))
And another containing a long list of names :
names <- tibble(names = c("Bob", "Mary", "Michael", "John", "Etc."))
I would like to see if the sentences contain a name from the list and add a column to indicate if this is the case and get the following tibble :
wanted_df <- tibble(sentences = c("Bob is looking for something", "Adriana has an umbrella", "Michael is looking at..."), check = c(TRUE, FALSE, TRUE))
So far I've tried that, with no success :
df <- df %>%
mutate(check = grepl(pattern = names$names, x = df$sentences, fixed = TRUE))
And also :
check <- str_detect(names$names %in% df$sentences)
Thanks a lot for any help ;)
You should combine the names into a single regular expression for grepl:
df %>%
mutate(check = grepl(paste(names$names, collapse = "|"), sentences))
# A tibble: 3 × 2
sentences check
<chr> <lgl>
1 Bob is looking for something TRUE
2 Adriana has an umbrella FALSE
3 Michael is looking at... TRUE
Here is a base R solution.
inx <- sapply(names$names, \(pat) grepl(pat, df$sentences))
inx
#> Bob Mary Michael John Etc.
#> [1,] TRUE FALSE FALSE FALSE FALSE
#> [2,] FALSE FALSE FALSE FALSE FALSE
#> [3,] FALSE FALSE TRUE FALSE FALSE
inx <- rowSums(inx) > 0L
df$check <- inx
df
#> # A tibble: 3 × 2
#> sentences check
#> <chr> <lgl>
#> 1 Bob is looking for something TRUE
#> 2 Adriana has an umbrella FALSE
#> 3 Michael is looking at... TRUE
Created on 2023-01-11 with reprex v2.0.2
grep and family expect pattern= to be length 1. Similarly, str_detect needs strings, not a logical vector, and of the same length, so that won't work as-is.
We have a couple of options:
sapply on the names (into a matrix) and see if each row has one or more matches:
df %>%
mutate(check = rowSums(sapply(names$names, grepl, sentences)) > 0)
# # A tibble: 3 × 2
# sentences check
# <chr> <lgl>
# 1 Bob is looking for something TRUE
# 2 Adriana has an umbrella FALSE
# 3 Michael is looking at... TRUE
(I now see this is in RuiBarradas's answer.)
Do a fuzzy-join on the data using fuzzyjoin:
df %>%
fuzzyjoin::regex_left_join(names, by = c(sentences = "names")) %>%
mutate(check = !is.na(names))
# # A tibble: 3 × 3
# sentences names check
# <chr> <chr> <lgl>
# 1 Bob is looking for something Bob TRUE
# 2 Adriana has an umbrella NA FALSE
# 3 Michael is looking at... Michael TRUE
This method has the advantage that it tells you which pattern (in names) made the match.
Maybe we can try adist + colSums like below
df %>%
mutate(check = colSums(adist(names$names, sentences, fixed = FALSE) == 0) > 0)
which gives
# A tibble: 3 × 2
sentences check
<chr> <lgl>
1 Bob is looking for something TRUE
2 Adriana has an umbrella FALSE
3 Michael is looking at... TRUE

Count word matches between two variables

Assume two datasets A and B:
X1<- c('a', 'b','c')
place<-c('andes','brooklyn', 'comorin')
A<-data.frame(X1,place)
X2<-c('a','a','a','b','c','c','d')
place2<-c('andes','alamo','andes','brooklyn','comorin','camden','dover')
B<-data.frame(X2,place2)
I want to count how many times each word in A$place occurs in B$place2.
A possible solution:
library(tidyverse)
A %>%
rowwise %>%
mutate(n = sum(place == B$place2)) %>%
ungroup
#> # A tibble: 3 × 3
#> X1 place n
#> <chr> <chr> <int>
#> 1 a andes 2
#> 2 b brooklyn 1
#> 3 c comorin 1
Use str_detect from the stringr package.
library(stringr)
# Count, for each place in A, how many entries of B$place2 contain it.
# str_detect() takes the strings to search first and the pattern second, so
# B$place2 is searched and the place from A is the pattern.
# vapply() pins the result to an integer vector, even when A$place is empty.
vapply(A$place, function(x) sum(str_detect(B$place2, x)), integer(1))
andes brooklyn comorin
2 1 1
table(B$place2[B$place2 %in% A$place])
# andes brooklyn comorin
# 2 1 1
Here's a base R version of user438383's answer.
# For each place in A, count the elements of B$place2 that match it:
# grepl() uses the place as the pattern and returns one logical per element
# of B$place2, and sum() counts the TRUEs.
sapply(A$place, function(y) sum(grepl(y, B$place2)))
andes brooklyn comorin
2 1 1
The key functionality is sapply() which repeats an operation over all elements of a vector, grepl() which checks the matches and generates TRUE or FALSE, and sum(). When you sum a logical vector, you get the count of TRUE.

ifelse statement based on vector of conditions in r

Let's say I have a simple data-frame with a list of IDs like this:
df<-tibble(id=1:5)
Then I have a vector of IDs like this:
ids<-3:7
I am trying to write an if or ifelse statement that will check to see if each id is contained in any ids in the ids vector. The result would be something like:
df1<-tibble(id=1:5,included=c("no","no","yes","yes","yes"))
# A tibble: 5 x 2
id included
<int> <chr>
1 no
2 no
3 yes
4 yes
5 yes
The ifelse function works on vectors. You were probably looking for the %in% operator? Did you try Google?
df$included <- ifelse(df$id %in% ids, "yes", "no")
R base:
df$included <- df$id %in% ids
df
# A tibble: 5 x 2
id included
<int> <lgl>
1 1 FALSE
2 2 FALSE
3 3 TRUE
4 4 TRUE
5 5 TRUE
We can use %in% to create a logical vector and replace the values based on either ifelse or case_when or just by indexing
library(dplyr)
df <- df %>%
mutate(included = case_when(id %in% ids ~ "yes", TRUE ~ "no"))
df
You can use %in% to subset a vector containing no and yes:
df$included <- c("no", "yes")[1 + df$id %in% ids]
## A tibble: 5 x 2
# id included
# <int> <chr>
#1 1 no
#2 2 no
#3 3 yes
#4 4 yes
#5 5 yes

How to identify and merge columns in R data frame with adjacent column?

I have reaction time data from an experiment in wide format where every row shows the data from a different participant. Every column gives information about a question (q01, q02 etc.) that was asked, e.g.:
q01_response q01_RT q01_complete q01_button q02_response q02_RT ...
1 yes 231 yes m yes 459
2 no 242 yes c yes 693
I would like to merge every question's response column (e.g., q01_response) with that same question's reaction time (e.g., q01_RT), while leaving the remaining columns as they are. The two columns I want to merge are always adjacent. Thus, I'd want the data frame to look like this:
q01_responsexRT q01_complete q01_button q02_responsexRT ...
1 yesx231 yes m yesx459
2 nox242 yes c yesx693
I know how to do that for two specific columns:
df_new <- unite(df, "q01_responsexRT", c("q01_response","q01_RT"), sep="x")
But I'd like to avoid doing that for every single pair individually. Thanks!
Here's a way using tidyverse :
library(tidyverse)
# Example data: two participants, with question columns q01_* and q02_*.
# The header has one field fewer than the data rows, so read.table() uses the
# first field of each row (1, 2) as row names.
# Argument names are spelled out in full and TRUE/FALSE are used instead of
# the reassignable T/F.
df1 <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
q01_response q01_RT q01_complete q01_button q02_response q02_RT
1 yes 231 yes m yes 459
2 no 242 yes c yes 693")
mutate(df1, !!!map2_dfc(
select(df1, ends_with("_response")),
select(df1, ends_with("_RT")),
paste, sep ="x")) %>%
select(-ends_with("_RT")) %>%
rename_at(vars(ends_with("_response")),paste0,"xRT")
# q01_responsexRT q01_complete q01_button q02_responsexRT
# 1 yesx231 yes m yesx459
# 2 nox242 yes c yesx693
It assumes that you have the same number of response and RT columns and in the right order (i.e. it doesn't check the column prefixes).
Our first select calls build 2 subsets of the data frames with the relevant suffixes
map2_dfc iterates on these columns to paste them, keeping the names from the first data frame.
!!! splices the resulting data frame into a list of q01_response =... etc
Then we remove the extra columns and rename the newly updated ones
And here's a somewhat more "idiomatic" version, more robust as it looks at both prefixes and suffixes, containing a lot of gather / spread jiu-jitsu :
df1 %>%
rowid_to_column() %>%
gather(key,value, -rowid) %>%
separate(key,c("q","key2")) %>%
spread(key2,value) %>%
unite(responsexRT, response, RT, sep = "x") %>%
gather(key2,value, -rowid, -q) %>%
unite(key, q, key2, sep ="x") %>%
spread(key,value) %>%
select(-rowid)
# q01xbutton q01xcomplete q01xresponsexRT q02xbutton q02xcomplete q02xresponsexRT
# 1 m yes yesx231 <NA> <NA> yesx459
# 2 c yes nox242 <NA> <NA> yesx693
You can also loop over the questions explicitly, if you know how many questions there are to process.
library("tidyverse")
library("glue")
example <- "
q01_response q01_RT q01_complete q01_button q02_response q02_RT
yes 231 yes m yes 459
no 242 yes c yes 693
"
# Merge the `<q>_response` and `<q>_RT` columns of `df` into a single
# `<q>_responsexRT` column, joining the two values with "x".
#   df: data frame holding both input columns for question `q`
#   q:  question prefix as a string, e.g. "q01"
# Returns `df` with the two input columns replaced by the merged column.
unite_response_time <- function(df, q) {
  in_cols <- c(glue("{q}_response"), glue("{q}_RT"))
  out_col <- glue("{q}_responsexRT")
  # all_of() marks in_cols as a character vector of column names that lives
  # outside the data; passing the bare vector to a tidyselect argument is
  # ambiguous and deprecated.
  df %>% unite(!!out_col, all_of(in_cols), sep = "x")
}
df <- read_table(example)
for (q in c("q01", "q02")) {
df <- unite_response_time(df, q)
}
df
#> # A tibble: 2 x 4
#> q01_responsexRT q01_complete q01_button q02_responsexRT
#> <chr> <chr> <chr> <chr>
#> 1 yesx231 yes m yesx459
#> 2 nox242 yes c yesx693
# or
df <- read_table(example)
for (i in seq(2)) {
q <- sprintf("q%02d", i)
df <- unite_response_time(df, q)
}
df
#> # A tibble: 2 x 4
#> q01_responsexRT q01_complete q01_button q02_responsexRT
#> <chr> <chr> <chr> <chr>
#> 1 yesx231 yes m yesx459
#> 2 nox242 yes c yesx693
Created on 2019-03-25 by the reprex package (v0.2.1)

Passing column names to user defined function inside mutate_at

I am struggling to pass column names inside my custom function while using dplyr - mutate_at.
I have a dataset "dt" with thousands of columns and I would like to perform mutate for some of these columns, but in a way which is dependent on the column name
I have this piece of code
Option 1:
relevantcols = c("A", "B", "C")
myfunc <- function(colname, x) {
#write different logic per column name
}
dt%>%
mutate_at(relevantcols, funs(myfunc(<what should i give?>,.)))
I tried approaching the problem in another way, i.e by iterating over relevantcols and applying mutate_at for each of the elements of the vector as follows
Option 2:
for (i in 1:length(relevantcols)){
dt%>%
mutate_at(relevantcols[i], funs(myfunc(relevantcols[i], .))
}
I get the colnames in Option 2, but it is 10 times slower than Option 1. Can I get somehow the column names in Option 1?
Adding an example for more clarity
df = data.frame(employee=seq(1:5), Mon_channelA=runif(5,1,10), Mon_channelB=runif(5,1,10), Tue_channelA=runif(5,1,10),Tue_channelB=runif(5,1,10))
df
employee Mon_channelA Mon_channelB Tue_channelA Tue_channelB
1 1 5.234383 6.857227 4.480943 7.233947
2 2 7.441399 3.777524 2.134075 6.310293
3 3 7.686558 8.598688 9.814882 9.192952
4 4 6.033345 5.658716 5.167388 3.018563
5 5 5.595006 7.582548 9.302917 6.071108
relevantcols = c("Mon_channelA", "Mon_channelB")
myfunc <- function(colname, x) {
#based on the channel and weekday, compare the data from corresponding column with the same channel but different weekday and return T if higher else F
}
# required output
employee Mon_channelA Mon_channelB Tue_channelA Tue_channelB
1 1 T F 4.480943 7.233947
2 2 T F 2.134075 6.310293
3 3 F F 9.814882 9.192952
4 4 T T 5.167388 3.018563
5 5 F T 9.302917 6.071108
I left a comment about data types, but assuming that that is what you're looking for, here's the approach I take to these sorts of problems. I do this in a seemingly convoluted process of reshaping a few times, but it lets you set up the variables that you're trying to compare without hard-coding much. I'll break it into pieces.
library(tidyverse)
set.seed(928)
df <- data.frame(employee=seq(1:5), Mon_channelA=runif(5,1,10), Mon_channelB=runif(5,1,10), Tue_channelA=runif(5,1,10),Tue_channelB=runif(5,1,10))
First, I'd reshape it into a long shape and break the "Mon_channelA", etc apart into a day and a channel. This lets you use the channel designation to match values for comparison.
df %>%
gather(key, value, -employee) %>%
separate(key, into = c("day", "channel"), sep = "_") %>%
head()
#> employee day channel value
#> 1 1 Mon channelA 2.039619
#> 2 2 Mon channelA 8.153684
#> 3 3 Mon channelA 9.027932
#> 4 4 Mon channelA 1.161967
#> 5 5 Mon channelA 3.583353
#> 6 1 Mon channelB 7.102797
Then, bring it back into a wide format based on the days. Now you have a column for each day for each combination of employee and channel.
df %>%
gather(key, value, -employee) %>%
separate(key, into = c("day", "channel"), sep = "_") %>%
spread(key = day, value = value) %>%
head()
#> employee channel Mon Tue
#> 1 1 channelA 2.039619 9.826677
#> 2 1 channelB 7.102797 7.388568
#> 3 2 channelA 8.153684 5.848375
#> 4 2 channelB 6.299178 9.452274
#> 5 3 channelA 9.027932 5.458906
#> 6 3 channelB 7.029408 7.087011
Then do your comparison, and take the data long again. Note that because the value column has numeric values, everything becomes numeric and the logical values are converted to 1 or 0.
df %>%
gather(key, value, -employee) %>%
separate(key, into = c("day", "channel"), sep = "_") %>%
spread(key = day, value = value) %>%
mutate(Mon = Mon > Tue) %>%
gather(key = day, value = value, Mon, Tue) %>%
head()
#> employee channel day value
#> 1 1 channelA Mon 0
#> 2 1 channelB Mon 0
#> 3 2 channelA Mon 1
#> 4 2 channelB Mon 0
#> 5 3 channelA Mon 1
#> 6 3 channelB Mon 0
Last few steps are to stick the day and channel back together to make the labels as you had them, spread back to a wide format, and turn all the columns starting with "Mon" back into logicals.
df %>%
gather(key, value, -employee) %>%
separate(key, into = c("day", "channel"), sep = "_") %>%
spread(key = day, value = value) %>%
mutate(Mon = Mon > Tue) %>%
gather(key = day, value = value, Mon, Tue) %>%
unite("variable", day, channel) %>%
spread(key = variable, value = value) %>%
mutate_at(vars(starts_with("Mon")), as.logical)
#> employee Mon_channelA Mon_channelB Tue_channelA Tue_channelB
#> 1 1 FALSE FALSE 9.826677 7.388568
#> 2 2 TRUE FALSE 5.848375 9.452274
#> 3 3 TRUE FALSE 5.458906 7.087011
#> 4 4 FALSE FALSE 8.854263 8.946458
#> 5 5 FALSE FALSE 6.933054 8.450741
Created on 2018-09-28 by the reprex package (v0.2.1)
You can do things like :
L <- c("A","B")
df <- data.frame(A=rep(1:3,2),B=1:6,C=7:12)
df
# A B C
#1 1 1 7
#2 2 2 8
#3 3 3 9
#4 1 4 10
#5 2 5 11
#6 3 6 12
# Raise x to the power y.
f <- function(x, y) x^y
# Apply f with y = 2 to the columns named in L, leaving other columns as is.
# A formula lambda replaces funs(), which is deprecated in dplyr.
df %>% mutate_at(L, ~ f(.x, 2))
# A B C
#1 1 1 7
#2 4 4 8
#3 9 9 9
#4 1 16 10
#5 4 25 11
#6 9 36 12
This is an old question, but I just stumbled over one possible way to solve it using a custom mutate/case_when function in combination with purrr::reduce.
It's important to use non-standard evaluation (NSE) inside the mutate/case_when statement to match the variable names you need for your custom function.
I do not know a way to do something similar with mutate_at.
Below I provide two examples: the most basic form (using your original data), and a more advanced version (with three weekdays and two channels) which creates more than two variables. The latter requires an initial set-up using, for example, switch.
Basic example
library(tidyverse)
# your data
df <- data.frame(employee=seq(1:5),
Mon_channelA=runif(5,1,10),
Mon_channelB=runif(5,1,10),
Tue_channelA=runif(5,1,10),
Tue_channelB=runif(5,1,10)
)
# Overwrite the "Mon_<x>" column of `df` with a logical flag.
#   df: data frame containing the columns "Mon_<x>" and "Tue_<x>"
#   x:  channel suffix as a string, e.g. "channelA"
# Returns `df` where "Mon_<x>" is TRUE when the Monday value is smaller than
# the Tuesday value, and FALSE otherwise (including when the comparison is NA).
myfunc <- function(df, x) {
  mon_col <- paste0("Mon_", x)
  tue_col <- paste0("Tue_", x)
  mutate(df,
    # `!! name :=` takes the output column name from a string; sym() turns
    # the strings into column references for the comparison.
    !! mon_col := case_when(
      !! sym(mon_col) < !! sym(tue_col) ~ TRUE,
      # TRUE/FALSE are spelled out: T and F are ordinary, reassignable variables
      TRUE ~ FALSE
    )
  )
}
# define the variables you want to loop over
var_ls <- c("channelA", "channelB")
# use var_ls and myfunc with reduce on your data
df %>%
reduce(var_ls, myfunc, .init = .)
#> employee Mon_channelA Mon_channelB Tue_channelA Tue_channelB
#> 1 1 FALSE FALSE 3.437975 2.458389
#> 2 2 FALSE TRUE 3.686903 4.772390
#> 3 3 TRUE TRUE 5.158234 5.378021
#> 4 4 TRUE TRUE 5.338950 3.109760
#> 5 5 TRUE FALSE 6.365173 3.450495
Created on 2020-02-03 by the reprex package (v0.3.0)
More advanced example
library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 3.5.2
#> Warning: package 'purrr' was built under R version 3.5.2
#> Warning: package 'forcats' was built under R version 3.5.2
# your data plus one weekday with two channels
df <- data.frame(employee=seq(1:5),
Mon_channelA=runif(5,1,10),
Mon_channelB=runif(5,1,10),
Tue_channelA=runif(5,1,10),
Tue_channelB=runif(5,1,10),
Wed_channelA=runif(5,1,10),
Wed_channelB=runif(5,1,10)
)
# Overwrite column `x` of `df` with a logical flag that compares it to the
# same channel on the mapped comparison day.
#   df: data frame containing column `x` and its comparison-day counterpart
#   x:  column name as a string of the form "<Day>_channel<Letter>",
#       e.g. "Mon_channelA"; only "Mon" and "Tue" have a comparison day
# Returns `df` where `x` is TRUE when its value is smaller than the
# comparison day's value, and FALSE otherwise. Stops if the day is not mapped.
myfunc <- function(df, x) {
  # the first three word characters of the column name are its weekday
  id <- str_extract(x, "^\\w{3}")
  # Map the weekday to the day it is compared against. Without the default
  # branch, switch() returns NULL for an unmapped day and the function fails
  # later with a much less helpful error.
  y <- switch(id,
    "Mon" = "Tue",
    "Tue" = "Wed",
    stop("no comparison day defined for column '", x, "'", call. = FALSE)
  )
  # channel name including the leading underscore, e.g. "_channelA"
  j <- str_extract(x, "_channel[A-Z]{1}")
  mutate(df,
    !! x := case_when(
      !! sym(x) < !! sym(paste0(y, j)) ~ TRUE,
      # TRUE/FALSE are spelled out: T and F are ordinary, reassignable variables
      TRUE ~ FALSE
    )
  )
}
# define the variables you want to loop over
var_ls <- c("Mon_channelA",
"Mon_channelB",
"Tue_channelA",
"Tue_channelB")
# use var_ls and myfunc with reduce on your data
df %>%
reduce(var_ls, myfunc, .init = .)
#> employee Mon_channelA Mon_channelB Tue_channelA Tue_channelB
#> 1 1 TRUE TRUE TRUE FALSE
#> 2 2 FALSE TRUE TRUE FALSE
#> 3 3 FALSE TRUE FALSE TRUE
#> 4 4 FALSE TRUE TRUE FALSE
#> 5 5 TRUE FALSE FALSE FALSE
#> Wed_channelA Wed_channelB
#> 1 9.952454 5.634686
#> 2 9.356577 4.514683
#> 3 2.721330 7.107316
#> 4 4.410240 2.740289
#> 5 5.394057 4.772162
Created on 2020-02-03 by the reprex package (v0.3.0)

Resources