Mutate with str_detect only if condition is true

Mutate with str_detect only if condition is true - r

I'm trying to use str_detect inside mutate so that the new column is only set when the column "RedColor" is "1".
I have a dataset test which looks like this:
# id RedColor Color_Number
#1 1 1 1
#2 2 0 1
#3 3 1 3
#4 4 1 2
#5 6 0 2
#6 8 1 6
I tried the filter function but it returns me only a filtered dataset with all other cases with RedColor = "0" removed.
test <- test %>%
filter(RedColor==TRUE) %>%
mutate(DarkRed = str_detect(Color_Number, "1|2"))
I'm expecting an output with the new column Dark_Red = "1" in all cases with RedColor = 1 and 1 or 2 in column Color_Number, and "0" in all other cases.
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
Thank you!

Using base R
# Flag rows that are red (RedColor == 1) and whose Color_Number is 1 or 2;
# as.integer() turns the logical mask into 0/1.
transform(
  df,
  Dark_Red = as.integer(Color_Number %in% 1:2 & RedColor == 1)
)
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 0
6 8 1 6 0
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)), row.names = c(NA,
-6L), class = "data.frame")

Update on OP's request (see comments):
With this dataframe:
id RedColor Color_Number
1 1 1 one
2 2 0 one
3 3 1 three
4 4 1 two
5 6 0 two
6 8 1 six
you could use this code:
library(dplyr)

# `&` binds tighter than `|` in R, so
#   RedColor == 1 & Color_Number == "one" | Color_Number == "two"
# is read as (RedColor == 1 & Color_Number == "one") | Color_Number == "two"
# and flags every "two" row even when RedColor is 0. Testing membership with
# %in% keeps the RedColor condition applied to both colour values.
df %>%
  mutate(
    Dark_Red = as.integer(RedColor == 1 & Color_Number %in% c("one", "two"))
  )
# Output:
# id RedColor Color_Number Dark_Red
# 1 1 1 one 1
# 2 2 0 one 0
# 3 3 1 three 0
# 4 4 1 two 1
# 5 6 0 two 0
# 6 8 1 six 0
First answer:
We could use ifelse
str_detect is not appropriate as Ronak already explained:
library(dplyr)

# `&` binds tighter than `|` in R, so
#   RedColor == 1 & Color_Number == 1 | Color_Number == 2
# is read as (RedColor == 1 & Color_Number == 1) | Color_Number == 2
# and flags every row with Color_Number 2 even when RedColor is 0. Testing
# membership with %in% keeps the RedColor condition applied to both values.
df %>%
  mutate(
    Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2)
  )
# Output:
# id RedColor Color_Number Dark_Red
# 1 1 1 1 1
# 2 2 0 1 0
# 3 3 1 3 0
# 4 4 1 2 1
# 5 6 0 2 0
# 6 8 1 6 0

For exact matches don't perform regex match. str_detect is used for pattern matching. Use %in% to match multiple values.
library(dplyr)
df <- df %>% mutate(Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
df
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
If you want to write this in base R use transform -
df <- transform(df, Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)),
row.names = c(NA, -6L), class = "data.frame")

you can use ifelse inside the mutate call instead of filtering:
test <- test %>%
mutate(Darkred=ifelse((RedColor==TRUE & Color_Number %in% 1:2), 1,0))
> test
# A tibble: 10 × 4
id RedColor Color_Number Darkred
<int> <int> <int> <dbl>
1 1 1 2 1
2 2 1 2 1
3 3 1 3 0
4 4 1 3 0
5 5 0 4 0
6 6 0 2 0
7 7 1 3 0
8 8 1 4 0
9 9 0 5 0
10 10 0 3 0
Data:
# Example data: 10 rows with a random 0/1 RedColor flag and a random
# Color_Number drawn from 1..5 with equal probabilities. The values differ
# between runs unless set.seed() is called first.
# tibble() is used because data_frame() is deprecated.
test <- tibble(
  id = 1:10,
  RedColor = rbinom(10, 1, 0.5),
  Color_Number = sample(1:5, 10, TRUE, rep(.2, 5))
)

Related

How to custom arrange such that no group dimension contains the same index twice?

I have the following tibble containing all the permutations of some indexes:
bb <- as_tibble(expand.grid(v1=0:2, v2=0:2)) %>%
arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criteria ...

Sorry that I am not very familiar with tibble, so I provide a solution with data.frame in base R:
# Rotate a vector to the left by `n` positions.
#
# x: a vector.
# n: number of positions to rotate by. Negative values rotate to the right,
#    and values beyond length(x) wrap around.
# Returns `x` with its first `n` elements moved to the end.
shifter <- function(x, n) {
  len <- length(x)
  if (len == 0L) {
    return(x)
  }
  # Reduce the shift modulo the length so that n == 0, n >= length(x) and
  # negative n are all handled by the same head/tail split below.
  k <- n %% len
  if (k == 0) {
    return(x)
  }
  c(tail(x, -k), head(x, k))
}
# Split the nine rows into three interleaved groups (rows 1,4,7 / 2,5,8 /
# 3,6,9). Group k then gets a second column equal to its first column rotated
# by k - 1 positions, and the groups are stacked back together.
dfs <- split(df, rep(0:2, 3))
shifted <- lapply(seq_along(dfs), function(k) {
  grp <- dfs[[k]]
  grp[, 2] <- shifter(grp[, 1], k - 1)
  grp
})
res <- do.call(rbind, shifted)
# Renumber the rows 1..nrow(df) in their new order.
rownames(res) <- seq_len(nrow(df))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations with desired format is given as below:
# Build all n * n (v1, v2) index pairs, ordered as n consecutive sets of n
# rows in which neither column repeats a value within a set.
#
# n: number of distinct index values (0 .. n - 1).
# Returns a data.frame with columns v1 and v2.
genAllCombn <- function(n) {
  idx <- seq_len(n) - 1L
  v1 <- rep(idx, times = n)
  # Set number (0 .. n - 1) of each row; adding it to v1 modulo n shifts the
  # second column by one more position in every successive set.
  offset <- rep(idx, each = n)
  data.frame(v1 = v1, v2 = (v1 + offset) %% n)
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2

applying sorting function in r for every four rows returns dataframe sorted but without extended selection, other columns are not sorted accordingly

I need every four rows to be sorted by the 4th column, separately from the next four rows. I wrote this loop:
for (i in seq(1,nrow(data_frame), by=4)) {
data_frame[i:(i+3),4] <- sort(data_frame[i:(i+3),4], decreasing=TRUE) }
The problem is that only the 4th column gets sorted, while the other columns of each row stay where they were.
from
x y z userID
-1 1 2 5 1
-2 1 1 2 2
-3 0 0 5 5
-6 1 2 5 3
-4 1 1 2 6
-5 0 0 5 4
-4 1 1 2 1
-5 0 0 5 5
to -
x y z userID
-1 1 2 5 5
-2 1 1 2 3
-3 0 0 5 2
-6 1 2 5 1
-4 1 1 2 6
-5 0 0 5 5
-4 1 1 2 4
-5 0 0 5 1

With tidyverse, we can use %/% to create a grouping column and use that to sort the 'userID' within each group of four rows
library(tidyverse)
df1 %>%
group_by(grp = (row_number()-1) %/% 4 + 1) %>%
#or use
#group_by(grp = cumsum(rep(c(TRUE, FALSE, FALSE, FALSE), length.out = n()))) %>%
mutate(userID = sort(userID, decreasing = TRUE))
# A tibble: 8 x 5
# Groups: grp [2]
# x y z userID grp
# <int> <int> <int> <int> <dbl>
#1 1 2 5 5 1
#2 1 1 2 3 1
#3 0 0 5 2 1
#4 1 2 5 1 1
#5 1 1 2 6 2
#6 0 0 5 5 2
#7 1 1 2 4 2
#8 0 0 5 1 2
Or using base R with ave
with(df1, ave(userID, (seq_along(userID)-1) %/% 4 + 1,
FUN = function(x) sort(x, decreasing = TRUE)))
#[1] 5 3 2 1 6 5 4 1
data
df1 <- structure(list(x = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L), y = c(2L,
1L, 0L, 2L, 1L, 0L, 1L, 0L), z = c(5L, 2L, 5L, 5L, 2L, 5L, 2L,
5L), userID = c(1L, 2L, 5L, 3L, 6L, 4L, 1L, 5L)), row.names = c(NA,
-8L), class = "data.frame")

In base R, we can split every 4 rows, order the fourth column and return the updated dataframe back.
# Sort the rows within each consecutive block of four by the 4th column
# (descending) and write the values back into df.
# The block index is derived from the row number, so a trailing block with
# fewer than four rows still gets its own group; gl(nrow(df)/4, 4) only yields
# one label per row when nrow(df) is a multiple of 4.
df[] <- do.call(rbind, lapply(
  split(df, (seq_len(nrow(df)) - 1L) %/% 4L),
  function(p) p[order(p[[4]], decreasing = TRUE), ]
))
df
# x y z userID
#1 0 0 5 5
#2 1 2 5 3
#3 1 1 2 2
#4 1 2 5 1
#5 1 1 2 6
#6 0 0 5 5
#7 0 0 5 4
#8 1 1 2 1
tidyverse approach using the same logic would be
library(tidyverse)
# Same idea with dplyr/purrr: split into blocks of four rows, sort each block
# by userID (descending) and row-bind the pieces.
# `.keep = FALSE` drops the helper grouping column; the older `keep` spelling
# of this argument is deprecated in dplyr.
df %>%
  group_split(gl(n() / 4, 4), .keep = FALSE) %>%
  map_dfr(. %>% arrange(desc(userID)))

Longest consecutive count of the same value per group

I have a data.frame as below and I want to add a variable describing the longest consecutive count of 1 in the VALUE variable observed in the group (i.e. longest consecutive rows with 1 in VALUE per group).
GROUP_ID VALUE
1 0
1 1
1 1
1 1
1 1
1 0
2 1
2 1
2 0
2 1
2 1
2 1
3 1
3 0
3 1
3 0
So the output would look like this:
GROUP_ID VALUE CONSECUTIVE
1 0 4
1 1 4
1 1 4
1 1 4
1 1 4
1 0 4
2 1 3
2 1 3
2 0 3
2 1 3
2 1 3
2 1 3
3 1 1
3 0 1
3 1 1
3 0 1
Any help would be greatly appreciated!

Using dplyr:
library(dplyr)
# For every GROUP_ID, run-length encode VALUE and keep the longest run of 1s.
# Starting max() at 0L gives 0 for a group that contains no 1 at all, instead
# of -Inf with a warning from max() on an empty vector.
dat %>%
  group_by(GROUP_ID) %>%
  mutate(CONSECUTIVE = {
    rl <- rle(VALUE)
    max(0L, rl$lengths[rl$values == 1])
  })
which gives:
# A tibble: 16 x 3
# Groups: GROUP_ID [3]
GROUP_ID VALUE CONSECUTIVE
<int> <int> <int>
1 1 0 4
2 1 1 4
3 1 1 4
4 1 1 4
5 1 1 4
6 1 0 4
7 2 1 3
8 2 1 3
9 2 0 3
10 2 1 3
11 2 1 3
12 2 1 3
13 3 1 1
14 3 0 1
15 3 1 1
16 3 0 1
Or with data.table:
library(data.table)
setDT(dat) # convert to a 'data.table'
# Longest run of 1s in VALUE per GROUP_ID, added by reference; the trailing []
# prints the result. max(0L, ...) yields 0 for a group without any 1 instead
# of -Inf with a warning from max() on an empty vector.
dat[, CONSECUTIVE := {
  rl <- rle(VALUE)
  max(0L, rl$lengths[rl$values == 1])
}, by = GROUP_ID][]

We can use ave with rle and get maximum occurrence of consecutive 1's for each group. (GROUP_ID)
# Longest run of consecutive 1s in VALUE within each GROUP_ID, repeated on
# every row of the group.
df$Consecutive <- ave(df$VALUE, df$GROUP_ID, FUN = function(x) {
  y <- rle(x == 1)
  # y$values is TRUE for runs of 1s; max(0L, ...) returns 0 when the group
  # has no such run rather than -Inf with a warning.
  max(0L, y$lengths[y$values])
})
df
# GROUP_ID VALUE Consecutive
#1 1 0 4
#2 1 1 4
#3 1 1 4
#4 1 1 4
#5 1 1 4
#6 1 0 4
#7 2 1 3
#8 2 1 3
#9 2 0 3
#10 2 1 3
#11 2 1 3
#12 2 1 3
#13 3 1 1
#14 3 0 1
#15 3 1 1
#16 3 0 1

Here is another option with data.table
library(data.table)
library(dplyr)
setDT(df1)[, CONSECUTIVE := max(table(na_if(rleid(VALUE)*VALUE, 0))), .(GROUP_ID)]
df1
# GROUP_ID VALUE CONSECUTIVE
# 1: 1 0 4
# 2: 1 1 4
# 3: 1 1 4
# 4: 1 1 4
# 5: 1 1 4
# 6: 1 0 4
# 7: 2 1 3
# 8: 2 1 3
# 9: 2 0 3
#10: 2 1 3
#11: 2 1 3
#12: 2 1 3
#13: 3 1 1
#14: 3 0 1
#15: 3 1 1
#16: 3 0 1
data
df1 <- structure(list(GROUP_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), VALUE = c(0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-16L))

Panel data sequence adding for a particular value

I am really new to R and Stack Overflow. Apologies in advance for this novice question.
I have a panel data set like the following table.
ID Choice
1 1
1 1
1 2
1 5
1 1
2 1
2 1
2 5
2 1
2 1
3 3
3 1
3 1
3 2
3 4
I want to add another column like the following table when choice is 1. This is basically, sequencing the choice 1 within ID.
ID Choice BUS
1 1 0 (The first 1 will be considered as 0)
1 1 1
1 2 1
1 5 1
1 1 2
2 1 0
2 1 1
2 5 1
2 1 2
2 1 3
3 3 0
3 1 0
3 1 1
3 2 1
3 4 1

with(df, ave(Choice == 1, ID, FUN = cumsum))
Almost gives you what you want but as you want to consider first 1 as 0 it needs some modification.
# Per-ID running count of rows with Choice == 1, shifted so that the first
# such row counts as 0.
df$BUS <- with(df, ave(Choice == 1, ID, FUN = function(x) {
# x is the logical vector (Choice == 1) for one ID; cumsum counts the 1s seen
# so far.
inds = cumsum(x)
# Subtract 1 once at least one 1 has been seen; rows before the first 1 stay 0.
ifelse(inds > 0, inds - 1, inds)
}))
df
# ID Choice BUS
#1 1 1 0
#2 1 1 1
#3 1 2 1
#4 1 5 1
#5 1 1 2
#6 2 1 0
#7 2 1 1
#8 2 5 1
#9 2 1 2
#10 2 1 3
#11 3 3 0
#12 3 1 0
#13 3 1 1
#14 3 2 1
#15 3 4 1
Here we subtract 1 from cumulative sum from the first 1.
Using the same logic in dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate(inds = cumsum(Choice == 1),
BUS = ifelse(inds > 0, inds - 1, inds)) %>%
select(-inds)

We can also use data.table
library(data.table)
setDT(df1)[, BUS := pmax(0, cumsum(Choice == 1)-1), ID]
df1
# ID Choice BUS
# 1: 1 1 0
# 2: 1 1 1
# 3: 1 2 1
# 4: 1 5 1
# 5: 1 1 2
# 6: 2 1 0
# 7: 2 1 1
# 8: 2 5 1
# 9: 2 1 2
#10: 2 1 3
#11: 3 3 0
#12: 3 1 0
#13: 3 1 1
#14: 3 2 1
#15: 3 4 1
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L), Choice = c(1L, 1L, 2L, 5L, 1L, 1L, 1L, 5L,
1L, 1L, 3L, 1L, 1L, 2L, 4L)), class = "data.frame", row.names = c(NA,
-15L))

R — Assign value to vector based on first episode

So I have a sequence dataset that looks like this
id epnum clockst
1 1 1 0
2 1 2 1
3 1 3 2
4 2 1 4
5 2 2 5
6 2 3 6
7 3 1 4
8 3 2 5
9 3 3 6
What I want is to create a vector of clockst based on epnum == 1.
So, I want basically this
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
However, I struggle to do so.
I came up with this, but it doesn't fully work.
dt$ep_start = ifelse(dt$epnum == 1 & dt$clockst == 0, 0,
ifelse(dt$epnum == 1 & dt$clockst == 4, 4, -9))
Any idea?
Data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), epnum = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", "3"), class = "factor"),
clockst = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 4L, 5L, 6L), .Label = c("0",
"1", "2", "4", "5", "6"), class = "factor")), .Names = c("id",
"epnum", "clockst"), row.names = c(NA, -9L), class = "data.frame")

Here is a solution using tidyverse:
First check the condition epnum == 1 and if TRUE, use clockst value if not NA. Then just fill NA with previous values.
Since clockst is a factor one needs to convert it to numeric while keeping the same values so as.numeric(as.character( needs to be used.
library(tidyverse)
dt %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down")
#output:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
Here is a quick comparison of the available answers. I chose to use a 90 k row data set:
df <- df[rep(1:nrow(df), times = 10000),] #where df = dt
dt <- data.table(df)
library(microbenchmark)
bench <- microbenchmark(SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
missuse = df %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down"),
d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
www = df %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup())
plot(bench)
with a 900 k row data set:
oh man I really need to learn DT.

Another tidyverse solution. arrange is not required if you are certain that the rows are in the right order.
library(dplyr)
dt2 <- dt %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup()
dt2
# # A tibble: 9 x 4
# id epnum clockst ep_start
# <fctr> <fctr> <fctr> <fctr>
# 1 1 1 0 0
# 2 1 2 1 0
# 3 1 3 2 0
# 4 2 1 4 4
# 5 2 2 5 4
# 6 2 3 6 4
# 7 3 1 4 4
# 8 3 2 5 4
# 9 3 3 6 4

You can do this with library(data.table) as follows
library(data.table)
# The question's data is called `dt`. The name `T` is avoided because it is
# R's built-in shorthand for TRUE and assigning to it masks that shorthand.
dt <- as.data.table(dt)
# Per id, repeat the first clockst value of the group on every row of that
# group (the column is added by reference with :=).
dt[, ep_start := clockst[1L], by = "id"]
This gives:
id epnum clockst ep_start
1: 1 1 0 0
2: 1 2 1 0
3: 1 3 2 0
4: 2 1 4 4
5: 2 2 5 4
6: 2 3 6 4
7: 3 1 4 4
8: 3 2 5 4
9: 3 3 6 4

# Rows where an episode sequence starts.
is_start <- dt$epnum == 1
# The running count of starts is constant within a sequence, so its run
# lengths give the number of rows that follow each start row.
run_sizes <- rle(cumsum(is_start))$lengths
# Repeat each start row's index over its sequence and pick clockst there.
dt$ep_start <- dt$clockst[rep(which(is_start), run_sizes)]
dt
# id epnum clockst ep_start
#1 1 1 0 0
#2 1 2 1 0
#3 1 3 2 0
#4 2 1 4 4
#5 2 2 5 4
#6 2 3 6 4
#7 3 1 4 4
#8 3 2 5 4
#9 3 3 6 4

Using match
# Keep only the rows with epnum == 1; they hold the clockst to copy per id.
clock <- dt[dt$epnum == 1, ]
# Look up every row's id in that table and take the matching clockst.
id_pos <- match(dt$id, clock$id)
dt$ep_start <- clock[["clockst"]][id_pos]

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Mutate with str_detect only if condition is true - r

Related

How to custom arrange such that no group dimension contains the same index twice?

applying sorting function in r for every four rows returns dataframe sorted but without extended selection, other columns are not sorted accordingly

Longest consecutive count of the same value per group

Panel data sequence adding for a particular value

R — Assign value to vector based on first episode

Categories

Resources