Mutate with str_detect only if condition is true - r

Im trying to use str_detect to mutate only if the column "RedColor" is "1".
I have a dataset test which looks like this:
# id RedColor Color_Number
#1 1 1 1
#2 2 0 1
#3 3 1 3
#4 4 1 2
#5 6 0 2
#6 8 1 6
I tried the filter function but it returns me only a filtered dataset with all other cases with RedColor = "0" removed.
test <- test %>%
filter(RedColor==TRUE) %>%
mutate(DarkRed = str_detect(Color_Number, "1|2"))
Im expecting an output with the new column DarkRed = "1" in all cases with RedColor = 1 and 1 or 2 in column Color_Number.
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
Thank you!

Using base R
transform(df, Dark_Red = +(RedColor == 1& Color_Number %in% 1:2))
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 0
6 8 1 6 0
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)), row.names = c(NA,
-6L), class = "data.frame")

Update on OP's request (see comments):
With this dataframe:
id RedColor Color_Number
1 1 1 one
2 2 0 one
3 3 1 three
4 4 1 two
5 6 0 two
6 8 1 six
you could use this code:
library(dplyr)
df %>%
mutate(Dark_Red = ifelse(
RedColor == 1 & Color_Number == "one" | Color_Number == "two", 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 one 1
2 2 0 one 0
3 3 1 three 0
4 4 1 two 1
5 6 0 two 1
6 8 1 six 0
First answer:
We could use ifelse
str_detect is not appropriate as Ronak already explained:
library(dplyr)
df %>%
mutate(Dark_Red = ifelse(
RedColor == 1 & Color_Number == 1 | Color_Number == 2, 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 1
6 8 1 6 0

For exact matches don't perform regex match. str_detect is used for pattern matching. Use %in% to match multiple values.
library(dplyr)
df <- df %>% mutate(Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
df
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
If you want to write this in base R use transform -
df <- transform(df, Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)),
row.names = c(NA, -6L), class = "data.frame")

you can use ifelse inside the mutate call instead of filtering:
test <- test %>%
mutate(Darkred=ifelse((RedColor==TRUE & Color_Number %in% 1:2), 1,0))
> test
# A tibble: 10 × 4
id RedColor Color_Number Darkred
<int> <int> <int> <dbl>
1 1 1 2 1
2 2 1 2 1
3 3 1 3 0
4 4 1 3 0
5 5 0 4 0
6 6 0 2 0
7 7 1 3 0
8 8 1 4 0
9 9 0 5 0
10 10 0 3 0
Data:
test<-data_frame(id=1:10,
RedColor=rbinom(10,1,0.5),
Color_Number=sample(1:5,10,TRUE,rep(.2,5)))

Related

How to custom arrange such that no group dimension contains the same index twice?

I have the following tibble containing all the permutations of some indexes:
bb <- as_tibble(expand.grid(v1=0:2, v2=0:2)) %>%
arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criteria ...
Sorry that I am not very familiar with tibble, so I provide a solution with data.frame in base R:
shifter <- function(x, n) ifelse(n == 0, return(x), return(c(tail(x, -n), head(x, n))))
res <- `rownames<-`(Reduce(rbind,lapply(seq(length(dfs<-split(df,rep(0:2,3)))),
function(k) {
dfs[[k]][,2] <- shifter(dfs[[k]][,1],k-1)
dfs[[k]]})),seq(nrow(df)))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations with desired format is given as below:
genAllCombn <- function(n) {
v1 <- rep(0:(n-1),n)
v2 <- (v1 + rep(0:(n-1),1,each = n)) %% n
return(data.frame(v1,v2))
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2

applying sorting function in r for every four rows returns dataframe sorted but without extended selection, other columns are not sorted accordingly

I need every four rows to be sorted by the 4th column, separately from the next four rows, made a function :
for (i in seq(1,nrow(data_frame), by=4)) {
data_frame[i:(i+3),4] <- sort(data_frame[i:(i+3),4], decreasing=TRUE) }
problem is only the 4th column gets sorted but the corresponding rows are maintained.
from
x y z userID
-1 1 2 5 1
-2 1 1 2 2
-3 0 0 5 5
-6 1 2 5 3
-4 1 1 2 6
-5 0 0 5 4
-4 1 1 2 1
-5 0 0 5 5
to -
x y z userID
-1 1 2 5 5
-2 1 1 2 3
-3 0 0 5 2
-6 1 2 5 1
-4 1 1 2 6
-5 0 0 5 5
-4 1 1 2 4
-5 0 0 5 1
With tidyverse, we can use %/% to create a grouping column with %/% and use that to sort the 'userID'
library(tidyverse)
df1 %>%
group_by(grp = (row_number()-1) %/% 4 + 1) %>%
#or use
#group_by(grp = cumsum(rep(c(TRUE, FALSE, FALSE, FALSE), length.out = n()))) %>%
mutate(userID = sort(userID, decreasing = TRUE))
# A tibble: 8 x 5
# Groups: grp [2]
# x y z userID grp
# <int> <int> <int> <int> <dbl>
#1 1 2 5 5 1
#2 1 1 2 3 1
#3 0 0 5 2 1
#4 1 2 5 1 1
#5 1 1 2 6 2
#6 0 0 5 5 2
#7 1 1 2 4 2
#8 0 0 5 1 2
Or using base R with ave
with(df1, ave(userID, (seq_along(userID)-1) %/% 4 + 1,
FUN = function(x) sort(x, decreasing = TRUE)))
#[1] 5 3 2 1 6 5 4 1
data
df1 <- structure(list(x = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L), y = c(2L,
1L, 0L, 2L, 1L, 0L, 1L, 0L), z = c(5L, 2L, 5L, 5L, 2L, 5L, 2L,
5L), userID = c(1L, 2L, 5L, 3L, 6L, 4L, 1L, 5L)), row.names = c(NA,
-8L), class = "data.frame")
In base R, we can split every 4 rows, order the fourth column and return the updated dataframe back.
df[] <- do.call(rbind, lapply(split(df, gl(nrow(df)/4, 4)),
function(p) p[order(p[[4]], decreasing = TRUE), ]))
df
# x y z userID
#1 0 0 5 5
#2 1 2 5 3
#3 1 1 2 2
#4 1 2 5 1
#5 1 1 2 6
#6 0 0 5 5
#7 0 0 5 4
#8 1 1 2 1
tidyverse approach using the same logic would be
library(tidyverse)
df %>%
group_split(gl(n()/4, 4), keep = FALSE) %>%
map_dfr(. %>% arrange(desc(userID)))

Longest consecutive count of the same value per group

I have a data.frame as below and I want to add a variable describing the longest consecutive count of 1 in the VALUE variable observed in the group (i.e. longest consecutive rows with 1 in VALUE per group).
GROUP_ID VALUE
1 0
1 1
1 1
1 1
1 1
1 0
2 1
2 1
2 0
2 1
2 1
2 1
3 1
3 0
3 1
3 0
So the output would look like this:
GROUP_ID VALUE CONSECUTIVE
1 0 4
1 1 4
1 1 4
1 1 4
1 1 4
1 0 4
2 1 3
2 1 3
2 0 3
2 1 3
2 1 3
2 1 3
3 1 1
3 0 1
3 1 1
3 0 1
Any help would be greatly appreciated!
Using dplyr:
library(dplyr)
dat %>%
group_by(GROUP_ID) %>%
mutate(CONSECUTIVE = {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])})
which gives:
# A tibble: 16 x 3
# Groups: GROUP_ID [3]
GROUP_ID VALUE CONSECUTIVE
<int> <int> <int>
1 1 0 4
2 1 1 4
3 1 1 4
4 1 1 4
5 1 1 4
6 1 0 4
7 2 1 3
8 2 1 3
9 2 0 3
10 2 1 3
11 2 1 3
12 2 1 3
13 3 1 1
14 3 0 1
15 3 1 1
16 3 0 1
Or with data.table:
library(data.table)
setDT(dat) # convert to a 'data.table'
dat[, CONSECUTIVE := {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])}
, by = GROUP_ID][]
We can use ave with rle and get maximum occurrence of consecutive 1's for each group. (GROUP_ID)
df$Consecutive <- ave(df$VALUE, df$GROUP_ID, FUN = function(x) {
y <- rle(x == 1)
max(y$lengths[y$values])
})
df
# GROUP_ID VALUE Consecutive
#1 1 0 4
#2 1 1 4
#3 1 1 4
#4 1 1 4
#5 1 1 4
#6 1 0 4
#7 2 1 3
#8 2 1 3
#9 2 0 3
#10 2 1 3
#11 2 1 3
#12 2 1 3
#13 3 1 1
#14 3 0 1
#15 3 1 1
#16 3 0 1
Here is another option with data.table
library(data.table)
library(dplyr)
setDT(df1)[, CONSECUTIVE := max(table(na_if(rleid(VALUE)*VALUE, 0))), .(GROUP_ID)]
df1
# GROUP_ID VALUE CONSECUTIVE
# 1: 1 0 4
# 2: 1 1 4
# 3: 1 1 4
# 4: 1 1 4
# 5: 1 1 4
# 6: 1 0 4
# 7: 2 1 3
# 8: 2 1 3
# 9: 2 0 3
#10: 2 1 3
#11: 2 1 3
#12: 2 1 3
#13: 3 1 1
#14: 3 0 1
#15: 3 1 1
#16: 3 0 1
data
df1 <- structure(list(GROUP_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), VALUE = c(0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-16L))

Panel data sequence adding for a particular value

I am really new in r and stackoverflow. Apologies in advance for this novice question.
I have a panel data set like the following table.
ID Choice
1 1
1 1
1 2
1 5
1 1
2 1
2 1
2 5
2 1
2 1
3 3
3 1
3 1
3 2
3 4
I want to add another column like the following table when choice is 1. This is basically, sequencing the choice 1 within ID.
ID Choice BUS
1 1 0 (The first 1 will be considered as 0)
1 1 1
1 2 1
1 5 1
1 1 2
2 1 0
2 1 1
2 5 1
2 1 2
2 1 3
3 3 0
3 1 0
3 1 1
3 2 1
3 4 1
with(df, ave(Choice == 1, ID, FUN = cumsum))
Almost gives you what you want but as you want to consider first 1 as 0 it needs some modification.
df$BUS <- with(df, ave(Choice == 1, ID, FUN = function(x) {
inds = cumsum(x)
ifelse(inds > 0, inds - 1, inds)
}))
df
# ID Choice BUS
#1 1 1 0
#2 1 1 1
#3 1 2 1
#4 1 5 1
#5 1 1 2
#6 2 1 0
#7 2 1 1
#8 2 5 1
#9 2 1 2
#10 2 1 3
#11 3 3 0
#12 3 1 0
#13 3 1 1
#14 3 2 1
#15 3 4 1
Here we subtract 1 from cumulative sum from the first 1.
Using the same logic in dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate(inds = cumsum(Choice == 1),
BUS = ifelse(inds > 0, inds - 1, inds)) %>%
select(-inds)
We can also use data.table
library(data.table)
setDT(df1)[, BUS := pmax(0, cumsum(Choice == 1)-1), ID]
df1
# ID Choice BUS
# 1: 1 1 0
# 2: 1 1 1
# 3: 1 2 1
# 4: 1 5 1
# 5: 1 1 2
# 6: 2 1 0
# 7: 2 1 1
# 8: 2 5 1
# 9: 2 1 2
#10: 2 1 3
#11: 3 3 0
#12: 3 1 0
#13: 3 1 1
#14: 3 2 1
#15: 3 4 1
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L), Choice = c(1L, 1L, 2L, 5L, 1L, 1L, 1L, 5L,
1L, 1L, 3L, 1L, 1L, 2L, 4L)), class = "data.frame", row.names = c(NA,
-15L))

R — Assign value to vector based on first episode

So I have a sequence dataset that looks like this
id epnum clockst
1 1 1 0
2 1 2 1
3 1 3 2
4 2 1 4
5 2 2 5
6 2 3 6
7 3 1 4
8 3 2 5
9 3 3 6
What I want is to create a vector of clockst based on epnum == 1.
So, I want basically this
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
However, I struggle to do so.
I came up with this, but it doesn't fully work.
dt$ep_start = ifelse(dt$epnum == 1 & dt$clockst == 0, 0,
ifelse(dt$epnum == 1 & dt$clockst == 4, 4, -9))
Any idea?
Data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), epnum = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", "3"), class = "factor"),
clockst = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 4L, 5L, 6L), .Label = c("0",
"1", "2", "4", "5", "6"), class = "factor")), .Names = c("id",
"epnum", "clockst"), row.names = c(NA, -9L), class = "data.frame")
Here is a solution using tidyverse:
First check the condition epnum == 1 and if TRUE, use clockst value if not NA. Then just fill NA with previous values.
Since clockst is a factor one needs to convert it to numeric while keeping the same values so as.numeric(as.character( needs to be used.
library(tidyverse)
dt %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down")
#output:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
Here is a quick comparison of the available answers. I chose to use a 90 k row data set:
df <- df[rep(1:nrow(df), times = 10000),] #where df = dt
dt <- data.table(df)
library(microbenchmark)
bench <- microbenchmark(SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
missuse = df %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down"),
d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
www = df %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup())
plot(bench)
with a 900 k row data set:
oh man I really need to learn DT.
Another tidyverse solution. arrange is not required if you are certain that the rows are in the right order.
library(dplyr)
dt2 <- dt %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup()
dt2
# # A tibble: 9 x 4
# id epnum clockst ep_start
# <fctr> <fctr> <fctr> <fctr>
# 1 1 1 0 0
# 2 1 2 1 0
# 3 1 3 2 0
# 4 2 1 4 4
# 5 2 2 5 4
# 6 2 3 6 4
# 7 3 1 4 4
# 8 3 2 5 4
# 9 3 3 6 4
You can do this with library(data.table) as follows
T <- data.table(T)
T[, ep_start := .SD[1]$clockst, by = "id"]
This gives:
id epnum clockst ep_start
1: 1 1 0 0
2: 1 2 1 0
3: 1 3 2 0
4: 2 1 4 4
5: 2 2 5 4
6: 2 3 6 4
7: 3 1 4 4
8: 3 2 5 4
9: 3 3 6 4
dt$ep_start = dt$clockst[rep(which(dt$epnum == 1), rle(cumsum(dt$epnum == 1))$lengths)]
dt
# id epnum clockst ep_start
#1 1 1 0 0
#2 1 2 1 0
#3 1 3 2 0
#4 2 1 4 4
#5 2 2 5 4
#6 2 3 6 4
#7 3 1 4 4
#8 3 2 5 4
#9 3 3 6 4
Using match
clock = dt[dt$epnum == 1, ]
dt$ep_start = clock$clockst[match(dt$id, clock$id)]

Resources