Longest consecutive count of the same value per group - r

I have a data.frame as below and I want to add a variable describing the longest consecutive count of 1 in the VALUE variable observed in the group (i.e. longest consecutive rows with 1 in VALUE per group).
GROUP_ID VALUE
1 0
1 1
1 1
1 1
1 1
1 0
2 1
2 1
2 0
2 1
2 1
2 1
3 1
3 0
3 1
3 0
So the output would look like this:
GROUP_ID VALUE CONSECUTIVE
1 0 4
1 1 4
1 1 4
1 1 4
1 1 4
1 0 4
2 1 3
2 1 3
2 0 3
2 1 3
2 1 3
2 1 3
3 1 1
3 0 1
3 1 1
3 0 1
Any help would be greatly appreciated!

Using dplyr:
library(dplyr)
# Add CONSECUTIVE: the longest run of consecutive 1s in VALUE within each
# GROUP_ID, repeated on every row of that group.
dat %>%
  group_by(GROUP_ID) %>%
  mutate(CONSECUTIVE = {
    # rle() collapses VALUE into runs: $values holds each run's value and
    # $lengths the number of rows in that run.
    rl <- rle(VALUE)
    # Seeding max() with 0L yields 0 for a group that contains no 1s instead of
    # -Inf plus a warning; %in% keeps NA runs out of the subset.
    max(0L, rl$lengths[rl$values %in% 1])
  })
which gives:
# A tibble: 16 x 3
# Groups: GROUP_ID [3]
GROUP_ID VALUE CONSECUTIVE
<int> <int> <int>
1 1 0 4
2 1 1 4
3 1 1 4
4 1 1 4
5 1 1 4
6 1 0 4
7 2 1 3
8 2 1 3
9 2 0 3
10 2 1 3
11 2 1 3
12 2 1 3
13 3 1 1
14 3 0 1
15 3 1 1
16 3 0 1
Or with data.table:
library(data.table)
setDT(dat) # convert to a 'data.table'
# Add CONSECUTIVE by reference: the longest run of consecutive 1s in VALUE
# within each GROUP_ID. The trailing [] prints the updated table.
dat[, CONSECUTIVE := {
  # rle() collapses VALUE into runs ($values = run value, $lengths = run size).
  rl <- rle(VALUE)
  # max(0L, ...) returns 0 for a group without any 1s rather than -Inf with a
  # warning; %in% keeps NA runs out of the subset.
  max(0L, rl$lengths[rl$values %in% 1])
}, by = GROUP_ID][]

We can use ave with rle and get maximum occurrence of consecutive 1's for each group. (GROUP_ID)
# For each GROUP_ID, compute the longest run of consecutive 1s in VALUE and
# write it to every row of the group (ave() recycles the single value).
df$Consecutive <- ave(df$VALUE, df$GROUP_ID, FUN = function(x) {
  # Runs of TRUE in `x == 1` are runs of consecutive 1s.
  y <- rle(x == 1)
  # which() drops NA runs; the 0L seed makes a group with no 1s return 0
  # instead of -Inf with a warning.
  max(0L, y$lengths[which(y$values)])
})
df
# GROUP_ID VALUE Consecutive
#1 1 0 4
#2 1 1 4
#3 1 1 4
#4 1 1 4
#5 1 1 4
#6 1 0 4
#7 2 1 3
#8 2 1 3
#9 2 0 3
#10 2 1 3
#11 2 1 3
#12 2 1 3
#13 3 1 1
#14 3 0 1
#15 3 1 1
#16 3 0 1

Here is another option with data.table
library(data.table)
library(dplyr)
# Per GROUP_ID, store the length of the longest run of 1s in CONSECUTIVE:
# rleid(VALUE) numbers the consecutive runs, multiplying by VALUE zeroes the ids
# of runs where VALUE is 0, na_if(..., 0) turns those zeros into NA, table()
# counts the rows per remaining run id, and max() keeps the largest count.
# NOTE(review): this relies on VALUE containing only 0 and 1 — confirm for real data.
setDT(df1)[, CONSECUTIVE := max(table(na_if(rleid(VALUE)*VALUE, 0))), .(GROUP_ID)]
df1
# GROUP_ID VALUE CONSECUTIVE
# 1: 1 0 4
# 2: 1 1 4
# 3: 1 1 4
# 4: 1 1 4
# 5: 1 1 4
# 6: 1 0 4
# 7: 2 1 3
# 8: 2 1 3
# 9: 2 0 3
#10: 2 1 3
#11: 2 1 3
#12: 2 1 3
#13: 3 1 1
#14: 3 0 1
#15: 3 1 1
#16: 3 0 1
data
df1 <- structure(list(GROUP_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), VALUE = c(0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-16L))

Related

Mutate with str_detect only if condition is true

I'm trying to use str_detect to mutate only if the column "RedColor" is "1".
I have a dataset test which looks like this:
# id RedColor Color_Number
#1 1 1 1
#2 2 0 1
#3 3 1 3
#4 4 1 2
#5 6 0 2
#6 8 1 6
I tried the filter function but it returns me only a filtered dataset with all other cases with RedColor = "0" removed.
test <- test %>%
filter(RedColor==TRUE) %>%
mutate(DarkRed = str_detect(Color_Number, "1|2"))
I'm expecting an output with a new column Dark_Red that is 1 for every row where RedColor is 1 and Color_Number is 1 or 2, and 0 otherwise.
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
Thank you!
Using base R
# Dark_Red is 1 where RedColor is 1 and Color_Number is 1 or 2, else 0
# (the logical mask is converted to integer 0/1).
transform(df, Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 0
6 8 1 6 0
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)), row.names = c(NA,
-6L), class = "data.frame")
Update on OP's request (see comments):
With this dataframe:
id RedColor Color_Number
1 1 1 one
2 2 0 one
3 3 1 three
4 4 1 two
5 6 0 two
6 8 1 six
you could use this code:
library(dplyr)
# Dark_Red is 1 only for rows where RedColor is 1 AND Color_Number is "one" or
# "two"; every other row gets 0.
df %>%
  mutate(Dark_Red = ifelse(
    # `&` binds tighter than `|`, so the unparenthesised form
    # `RedColor == 1 & Color_Number == "one" | Color_Number == "two"` flags every
    # "two" row regardless of RedColor. Using %in% applies the RedColor test to
    # both colour names: the row with RedColor 0 and "two" (id 6) now yields 0,
    # as the question requires, where the earlier expression produced 1.
    RedColor == 1 & Color_Number %in% c("one", "two"), 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 one 1
2 2 0 one 0
3 3 1 three 0
4 4 1 two 1
5 6 0 two 1
6 8 1 six 0
First answer:
We could use ifelse
str_detect is not appropriate as Ronak already explained:
library(dplyr)
# Dark_Red is 1 only for rows where RedColor is 1 AND Color_Number is 1 or 2;
# every other row gets 0.
df %>%
  mutate(Dark_Red = ifelse(
    # `&` binds tighter than `|`, so the unparenthesised form
    # `RedColor == 1 & Color_Number == 1 | Color_Number == 2` flags every row with
    # Color_Number 2 regardless of RedColor. Using %in% applies the RedColor test
    # to both values: the row with RedColor 0 and Color_Number 2 (id 6) now
    # yields 0, as the question requires, where the earlier expression produced 1.
    RedColor == 1 & Color_Number %in% 1:2, 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 1
6 8 1 6 0
For exact matches don't perform regex match. str_detect is used for pattern matching. Use %in% to match multiple values.
library(dplyr)
df <- df %>% mutate(Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
df
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
If you want to write this in base R use transform -
df <- transform(df, Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)),
row.names = c(NA, -6L), class = "data.frame")
you can use ifelse inside the mutate call instead of filtering:
test <- test %>%
mutate(Darkred=ifelse((RedColor==TRUE & Color_Number %in% 1:2), 1,0))
> test
# A tibble: 10 × 4
id RedColor Color_Number Darkred
<int> <int> <int> <dbl>
1 1 1 2 1
2 2 1 2 1
3 3 1 3 0
4 4 1 3 0
5 5 0 4 0
6 6 0 2 0
7 7 1 3 0
8 8 1 4 0
9 9 0 5 0
10 10 0 3 0
Data:
test<-data_frame(id=1:10,
RedColor=rbinom(10,1,0.5),
Color_Number=sample(1:5,10,TRUE,rep(.2,5)))

By group relative order

I have a data set that looks like this
ID
Week
1
3
1
5
1
5
1
8
1
11
1
16
2
2
2
2
2
3
2
3
2
9
Now, what I would like to do is add another column to the data frame so that, for every ID, each week's relative position is marked. More precisely, I would like to mark each ID's earliest week (smallest number) as 1, the next week for that ID as 2, and so forth; if there are two observations of the same week, they get the same number.
So, in the above example I should get:
ID
Week
Order
1
3
1
1
5
2
1
5
2
1
8
3
1
11
4
1
16
5
2
2
1
2
2
1
2
3
2
2
3
2
2
9
3
How could I achieve this?
Thank you very much!
A base R option using ave + match
transform(
df,
Order = ave(Week,
ID,
FUN = function(x) match(x, sort(unique(x)))
)
)
or ave + order (thanks to @IRTFM for the comments)
# Add Order: the dense rank of Week within each ID (ties share a rank and no
# rank is skipped).
transform(
  df,
  Order = ave(Week,
    ID,
    FUN = function(x) {
      # order() on its own returns the sorting permutation, not a rank, and
      # gives tied weeks different numbers (3, 5, 5 -> 1, 2, 3). Sort with
      # order(), count the distinct values seen so far, then put the counts
      # back in the original row positions.
      o <- order(x)
      r <- integer(length(x))
      r[o] <- cumsum(!duplicated(x[o]))
      r
    }
  )
)
gives
ID Week Order
1 1 3 1
2 1 5 2
3 1 5 2
4 1 8 3
5 1 11 4
6 1 16 5
7 2 2 1
8 2 2 1
9 2 3 2
10 2 3 2
11 2 9 3
A data.table option with frank
> setDT(df)[, Order := frank(Week, ties.method = "dense"), ID][]
ID Week Order
1: 1 3 1
2: 1 5 2
3: 1 5 2
4: 1 8 3
5: 1 11 4
6: 1 16 5
7: 2 2 1
8: 2 2 1
9: 2 3 2
10: 2 3 2
11: 2 9 3
Data
> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), Week = c(3L, 5L, 5L, 8L, 11L, 16L, 2L, 2L, 3L, 3L, 9L)), class = "data.frame", row.names =
c(NA,
-11L))
You can use dense_rank in dplyr :
library(dplyr)
df %>% group_by(ID) %>% mutate(Order = dense_rank(Week)) %>% ungroup
# ID Week Order
# <int> <int> <int>
# 1 1 3 1
# 2 1 5 2
# 3 1 5 2
# 4 1 8 3
# 5 1 11 4
# 6 1 16 5
# 7 2 2 1
# 8 2 2 1
# 9 2 3 2
#10 2 3 2
#11 2 9 3

How to custom arrange such that no group dimension contains the same index twice?

I have the following tibble containing all the permutations of some indexes:
bb <- as_tibble(expand.grid(v1=0:2, v2=0:2)) %>%
arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criteria ...
Sorry that I am not very familiar with tibble, so I provide a solution with data.frame in base R:
# Cyclically rotate `x` to the left by `n` positions.
#   x: a vector (returned unchanged when empty).
#   n: shift amount; negative values rotate to the right, and values beyond
#      length(x) wrap around instead of silently returning `x` unshifted.
# Returns a vector with the same elements as `x` in rotated order.
shifter <- function(x, n) {
  len <- length(x)
  if (len == 0L) {
    return(x)
  }
  # Reduce the shift to 0 .. len - 1 so that every n maps to a real rotation.
  k <- n %% len
  # tail(x, -0) and head(x, 0) are both empty, so a zero shift is special-cased.
  if (k == 0) {
    return(x)
  }
  c(tail(x, -k), head(x, k))
}
# Split df into three interleaved row sets (rows 1,4,7 / 2,5,8 / 3,6,9).
dfs <- split(df, rep(0:2, 3))
# In set k, overwrite column 2 with column 1 rotated by k - 1 positions, so no
# index repeats within either column of a set.
rotated <- lapply(seq_along(dfs), function(k) {
  part <- dfs[[k]]
  part[, 2] <- shifter(part[, 1], k - 1)
  part
})
# Stack the sets in order and renumber the rows 1..nrow(df).
res <- do.call(rbind, rotated)
rownames(res) <- seq(nrow(df))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations with desired format is given as below:
# Generate all n * n pairs (v1, v2) of indices 0..n-1, ordered in n consecutive
# sets of n rows such that no index repeats within v1 or within v2 of a set.
#   n: number of distinct indices.
# Returns a data.frame with columns v1 and v2.
genAllCombn <- function(n) {
  idx <- 0:(n - 1)
  # v1 cycles through the indices once per set.
  v1 <- rep(idx, times = n)
  # Set j (0-based) pairs each v1 with v1 + j, wrapped back into 0..n-1.
  offset <- rep(idx, each = n)
  v2 <- (v1 + offset) %% n
  data.frame(v1, v2)
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2

Panel data sequence adding for a particular value

I am really new to R and Stack Overflow. Apologies in advance for this novice question.
I have a panel data set like the following table.
ID Choice
1 1
1 1
1 2
1 5
1 1
2 1
2 1
2 5
2 1
2 1
3 3
3 1
3 1
3 2
3 4
I want to add another column like the following table when choice is 1. This is basically, sequencing the choice 1 within ID.
ID Choice BUS
1 1 0 (The first 1 will be considered as 0)
1 1 1
1 2 1
1 5 1
1 1 2
2 1 0
2 1 1
2 5 1
2 1 2
2 1 3
3 3 0
3 1 0
3 1 1
3 2 1
3 4 1
with(df, ave(Choice == 1, ID, FUN = cumsum))
Almost gives you what you want but as you want to consider first 1 as 0 it needs some modification.
# BUS: within each ID, a running count of rows where Choice is 1, shifted down
# by one so that the first 1 in the ID counts as 0.
df$BUS <- with(df, ave(Choice == 1, ID, FUN = function(x) {
# x is the logical vector `Choice == 1` for one ID; cumsum() counts the 1s seen so far.
inds = cumsum(x)
# Rows before the first 1 keep their 0; from the first 1 onwards subtract one.
ifelse(inds > 0, inds - 1, inds)
}))
df
# ID Choice BUS
#1 1 1 0
#2 1 1 1
#3 1 2 1
#4 1 5 1
#5 1 1 2
#6 2 1 0
#7 2 1 1
#8 2 5 1
#9 2 1 2
#10 2 1 3
#11 3 3 0
#12 3 1 0
#13 3 1 1
#14 3 2 1
#15 3 4 1
Here we subtract 1 from the cumulative sum from the first 1 onwards, so that the first 1 in each ID is counted as 0.
Using the same logic in dplyr
library(dplyr)
# BUS: per ID, the running count of Choice == 1 rows minus one from the first 1
# onwards (rows before the first 1 stay at 0). The running count is kept local
# to the mutate() expression instead of being stored as a helper column.
df %>%
  group_by(ID) %>%
  mutate(BUS = {
    running <- cumsum(Choice == 1)
    ifelse(running > 0, running - 1, running)
  })
We can also use data.table
library(data.table)
setDT(df1)[, BUS := pmax(0, cumsum(Choice == 1)-1), ID]
df1
# ID Choice BUS
# 1: 1 1 0
# 2: 1 1 1
# 3: 1 2 1
# 4: 1 5 1
# 5: 1 1 2
# 6: 2 1 0
# 7: 2 1 1
# 8: 2 5 1
# 9: 2 1 2
#10: 2 1 3
#11: 3 3 0
#12: 3 1 0
#13: 3 1 1
#14: 3 2 1
#15: 3 4 1
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L), Choice = c(1L, 1L, 2L, 5L, 1L, 1L, 1L, 5L,
1L, 1L, 3L, 1L, 1L, 2L, 4L)), class = "data.frame", row.names = c(NA,
-15L))

R: adding in rows of zero based on the values in multiple columns

I am trying to append rows to an R data.frame. Here is an example of a data.frame "foo":
A B C D
1 1 1 200
1 1 2 50
1 1 3 15
1 2 1 150
1 2 4 50
1 3 1 300
2 1 2 40
2 1 4 90
2 3 2 80
For every A, there are 3 possible values of B, and for every B, there are 4 possible values of C. However, the initial df only contains non-zero values of D. I'd like to manipulate the df so that zeros are included for both B and C. Thus, the df would show 0's in D for any value of B/C that was 0. I have seen questions that address this with one column, but couldn't find a question addressing it with multiple columns. The final df would look like this:
A B C D
1 1 1 200
1 1 2 50
1 1 3 15
1 1 4 0
1 2 1 150
1 2 2 0
1 2 3 0
1 2 4 50
1 3 1 300
1 3 2 0
1 3 3 0
1 3 4 0
2 1 1 0
2 1 2 40
2 1 3 0
2 1 4 90
2 2 1 0
2 2 2 0
2 2 3 0
2 2 4 0
2 3 1 0
2 3 2 80
2 3 3 0
2 3 4 0
I first tried creating a dummy data frame that then merged with the initial df, but something isn't working right. Here's the current code, which I know is wrong because this code only generates rows based on A. I think I want to make the dummy frame based on A and B but I don't know how - could an if/else function work here?:
# create dummy df
dummy <- as.data.frame(
cbind(
sort(rep(unique(foo$A), 12)),
rep(1:3,length(unique(foo$A)))))
colnames(dummy) <- c("A","B")
foo$A <- as.numeric(foo$A)
foo$B <- as.numeric(foo$C)
# merge with foo
mergedummy <- merge(dummy,foo,all.x=T)
Any insight is greatly appreciated - thanks!
A one liner:
merge(dat, data.frame(table(dat[1:3]))[-4],all.y=TRUE)
# A B C D
#1 1 1 1 200
#2 1 1 2 50
#3 1 1 3 15
#4 1 1 4 NA
#...
Or maybe less complicated:
out <- data.frame(xtabs(D ~ ., data=dat))
out[do.call(order,out[1:3]),]
# A B C Freq
#1 1 1 1 200
#7 1 1 2 50
#13 1 1 3 15
#19 1 1 4 0
#...
Where dat is:
dat <- structure(list(A = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), B = c(1L,
1L, 1L, 2L, 2L, 3L, 1L, 1L, 3L), C = c(1L, 2L, 3L, 1L, 4L, 1L,
2L, 4L, 2L), D = c(200L, 50L, 15L, 150L, 50L, 300L, 40L, 90L,
80L)), .Names = c("A", "B", "C", "D"), class = "data.frame", row.names = c(NA,
-9L))
I created a master data frame which includes all combinations of A, B, and C as you describe in the expected outcome. Then, I merge the master data frame and your data frame. Finally, I replaced NA with 0.
master <- data.frame(A = rep(1:2, each = 12),
B = rep(1:3, each = 4),
C = rep(1:4, times = 6))
library(dplyr)
# Attach the observed D values to every A/B/C combination in `master`, then
# fill combinations that were not observed with 0.
master %>%
  # Name the key columns explicitly so the join cannot silently pick up any
  # other column the two tables happen to share.
  left_join(mydf, by = c("A", "B", "C")) %>%
  # Combinations without a match come back as NA after the join.
  mutate(D = ifelse(is.na(D), 0, D))
# A B C D
#1 1 1 1 200
#2 1 1 2 50
#3 1 1 3 15
#4 1 1 4 0
#5 1 2 1 150
#6 1 2 2 0
#7 1 2 3 0
#8 1 2 4 50
#9 1 3 1 300
#10 1 3 2 0
#11 1 3 3 0
#12 1 3 4 0
#13 2 1 1 0
#14 2 1 2 40
#15 2 1 3 0
#16 2 1 4 90
#17 2 2 1 0
#18 2 2 2 0
#19 2 2 3 0
#20 2 2 4 0
#21 2 3 1 0
#22 2 3 2 80
#23 2 3 3 0
#24 2 3 4 0
Here is one solution:
# Build every combination of the values observed in the first three columns of
# foo, full-join it with foo so missing combinations appear with NA, then
# replace every NA with 0.
foo <- merge(
  x = expand.grid(lapply(foo[, 1:3], unique)),
  y = foo,
  all = TRUE,
  sort = TRUE
)
foo <- replace(foo, is.na(foo), 0)

Resources