Rows sequence by group using two columns - r

Suppose I have the following df
data <- data.frame(ID = c(1,1,1,1,1,1,1,2,2,2,2,3,3,3),
Value = c(1,1,0,1,0,1,1,1,0,0,1,0,0,0),
Result = c(1,1,2,3,4,5,5,1,2,2,3,1,1,1))
How can I obtain column Result from the first two columns?
I have tried different approaches using rle, seq, cumsum and cur_group_id but can't get the Result column easily

library(data.table)
library(dplyr)
data %>%
group_by(ID) %>%
mutate(Result2 = rleid(Value))
This gives us:
ID Value Result Result2
<dbl> <dbl> <dbl> <int>
1 1 1 1 1
2 1 1 1 1
3 1 0 2 2
4 1 1 3 3
5 1 0 4 4
6 1 1 5 5
7 1 1 5 5
8 2 1 1 1
9 2 0 2 2
10 2 0 2 2
11 2 1 3 3
12 3 0 1 1
13 3 0 1 1
14 3 0 1 1

Does this work:
library(dplyr)
data %>% group_by(ID) %>% mutate(r = rep(seq_along(rle(ID*Value)$values), rle(ID*Value)$lengths))
# A tibble: 14 x 4
# Groups: ID [3]
ID Value Result r
<dbl> <dbl> <dbl> <int>
1 1 1 1 1
2 1 1 1 1
3 1 0 2 2
4 1 1 3 3
5 1 0 4 4
6 1 1 5 5
7 1 1 5 5
8 2 1 1 1
9 2 0 2 2
10 2 0 2 2
11 2 1 3 3
12 3 0 1 1
13 3 0 1 1
14 3 0 1 1

We could use rle with ave in base R
data$Result2 <- with(data, ave(Value, ID, FUN =
function(x) inverse.rle(within.list(rle(x), values <- seq_along(values)))))
data$Result2
#[1] 1 1 2 3 4 5 5 1 2 2 3 1 1 1

Related

R Count Unique By Group in DPLYR

HAVE = data.frame("TRIMESTER" = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,4,4,4),
"STUDENT" = c(1,2,3,3,4,2,5,6,7,1,2,2,2,2,2,1,2,3,4,5))
HAVE$WANT1 = c(4,4,4,4,4,5,5,5,5,5,1,1,1,1,5,5,5,5,5,5)
HAVE$WANT2 = c(0,0,0,0,0,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1)
I have HAVE and wish to APPEND a column to count the UNIQUE value of STUDENT for every TRIMESTER shown WANT1 and I wish to create WANT2 which is the SUM of times for every TRIMESTER that STUDENT==5 appears so STUDENT==5 appear ZERO times in TRIMESTER == 1, so the value for all TRIMESTER == 1 is ZERO but student 5 appear ONCE in TRIMESTER==4 so the value is 1
After grouping by 'TRIMESTER', get the count of distinct elements of 'STUDENT' with n_distinct and the count of STUDENT 5 with sum on a logical expression
library(dplyr)
HAVE %>%
group_by(TRIMESTER) %>%
mutate(WANT1new = n_distinct(STUDENT),
WANT2NEW = sum(STUDENT == 5)) %>%
ungroup
-output
# A tibble: 20 × 6
TRIMESTER STUDENT WANT1 WANT2 WANT1new WANT2NEW
<dbl> <dbl> <dbl> <dbl> <int> <int>
1 1 1 4 0 4 0
2 1 2 4 0 4 0
3 1 3 4 0 4 0
4 1 3 4 0 4 0
5 1 4 4 0 4 0
6 2 2 5 1 5 1
7 2 5 5 1 5 1
8 2 6 5 1 5 1
9 2 7 5 1 5 1
10 2 1 5 1 5 1
11 3 2 1 0 1 0
12 3 2 1 0 1 0
13 3 2 1 0 1 0
14 3 2 1 0 1 0
15 4 2 5 1 5 1
16 4 1 5 1 5 1
17 4 2 5 1 5 1
18 4 3 5 1 5 1
19 4 4 5 1 5 1
20 4 5 5 1 5 1
The code below should produce the desired result.
library(dplyr)
HAVE %>%
group_by(TRIMESTER) %>%
mutate(WANT1 = length(unique(STUDENT)),
WANT2 = as.numeric(any(5 == STUDENT)))

R update values within a grouped df with information from updated previous value

I would like conditionally mutate variables (var1, var2) within groups (id) at different timepoints (timepoint) using previously updated/muated values according to this function:
change_function <- function(value,pastvalue,timepoint){
if(timepoint==1){valuenew=value} else
if(value==0){valuenew=pastvalue-1}
if(value==1){valuenew=pastvalue}
if(value==2){valuenew=pastvalue+1}
return(valuenew)
}
pastvalue is the MUTATED/UPDATED value at timepoint -1 for timepoint 2:4
Here is an example and output file:
``` r
#example data
df <- data.frame(id=c(1,1,1,1,2,2,2,2),timepoint=c(1,2,3,4,1,2,3,4),var1=c(1,0,1,2,2,2,1,0),var2=c(2,0,1,2,3,2,1,0))
df
#> id timepoint var1 var2
#> 1 1 1 1 2
#> 2 1 2 0 0
#> 3 1 3 1 1
#> 4 1 4 2 2
#> 5 2 1 2 3
#> 6 2 2 2 2
#> 7 2 3 1 1
#> 8 2 4 0 0
#desired output
output <- data.frame(id=c(1,1,1,1,2,2,2,2),timepoint=c(1,2,3,4,1,2,3,4),var1=c(1,0,0,1,2,3,3,2),var2=c(2,1,1,2,3,4,4,3))
output
#> id timepoint var1 var2
#> 1 1 1 1 2
#> 2 1 2 0 1
#> 3 1 3 0 1
#> 4 1 4 1 2
#> 5 2 1 2 3
#> 6 2 2 3 4
#> 7 2 3 3 4
#> 8 2 4 2 3
```
<sup>Created on 2020-11-23 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
My Approach: use my function using dplyr::mutate_at
library(dplyr)
df %>%
group_by(id) %>%
mutate_at(.vars=vars(var1,var2),
.funs=funs(.=change_function(.,dplyr::lag(.),timepoint)))
However, this does not work because if/else is not vectorized
Update 1:
Using a nested ifelse function does not give the desired output, because it does not use updated pastvalue's:
change_function <- function(value,pastvalue,timepoint){
ifelse((timepoint==1),value,
ifelse((value==0),pastvalue-1,
ifelse((value==1),pastvalue,
ifelse((value==2),pastvalue+1,NA))))
}
library(dplyr)
df %>%
group_by(id) %>%
mutate_at(.vars=vars(var1,var2),
.funs=funs(.=change_function(.,dplyr::lag(.),timepoint)))
id TimePoint var1 var2 var1_. var2_.
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 2 1 2
2 1 2 0 0 0 1
3 1 3 1 1 0 0
4 1 4 2 2 2 2
5 2 1 2 3 2 3
6 2 2 2 2 3 4
7 2 3 1 1 2 2
8 2 4 0 0 0 0
Update 2:
According to the comments, purrr:accumulate could be used
Thanks to akrun I could get the correct function:
# write a vectorized function
change_function <- function(prev, new) {
change=if_else(new==0,-1,
if_else(new==1,0,1))
if_else(is.na(new), new, prev + change)
}
# use purrr:accumulate
df %>%
group_by(id) %>%
mutate_at(.vars=vars(var1,var2),
.funs=funs(accumulate(.,change_function)))
# A tibble: 8 x 4
# Groups: id [2]
id timepoint var1 var2
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 2
2 1 2 0 1
3 1 3 0 1
4 1 4 1 2
5 2 1 2 3
6 2 2 3 4
7 2 3 3 4
8 2 4 2 3

is there a way in R to fill missing groups absent of observations?

Say I have something like:
df<-data.frame(group=c(1, 1,1, 2,2,2,3,3,3,4,4, 1, 1,1),
group2=c(1,2,3,1,2,3,1,2,3,1,3, 1,2,3))
group group2
1 1 1
2 1 2
3 1 3
4 2 1
5 2 2
6 2 3
7 3 1
8 3 2
9 3 3
10 4 1
11 4 3
12 1 1
13 1 2
14 1 3
My goal is to count the number of unique instances for group= something and group2= something. Like so:
df1<-df%>%group_by(group, group2)%>% mutate(want=n())%>%distinct(group, group2, .keep_all=TRUE)
group group2 want
<dbl> <dbl> <int>
1 1 1 2
2 1 2 2
3 1 3 2
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 1
8 3 2 1
9 3 3 1
10 4 1 1
11 4 3 1
however, notice that group=4, group2=2 was not in my dataset to begin with. Is there some sort of autofill function where I can fill these non-observations with a zero to get below easily?:
group group2 want
<dbl> <dbl> <int>
1 1 1 2
2 1 2 2
3 1 3 2
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 1
8 3 2 1
9 3 3 1
10 4 1 1
11 4 2 0
12 4 3 1
After getting the count, we can expand with complete to fill the missing combinations with 0
library(dplyr)
library(tidyr)
df %>%
count(group, group2) %>%
complete(group, group2, fill = list(n = 0))
# A tibble: 12 x 3
# group group2 n
# <dbl> <dbl> <dbl>
# 1 1 1 2
# 2 1 2 2
# 3 1 3 2
# 4 2 1 1
# 5 2 2 1
# 6 2 3 1
# 7 3 1 1
# 8 3 2 1
# 9 3 3 1
#10 4 1 1
#11 4 2 0
#12 4 3 1
Or if we do the group_by, instead of mutate and then do the distinct, directly use the summarise
df %>%
group_by(group, group2) %>%
summarise(n = n()) %>%
ungroup %>%
complete(group, group2, fill = list(n = 0))
Here is a data.table approach solution to this problem:
library(data.table)
setDT(df)[CJ(group, group2, unique = TRUE),
c(.SD, .(want = .N)), .EACHI,
on = c("group", "group2")]
# group group2 want
# 1 1 2
# 1 2 2
# 1 3 2
# 2 1 1
# 2 2 1
# 2 3 1
# 3 1 1
# 3 2 1
# 3 3 1
# 4 1 1
# 4 2 0
# 4 3 1

Building sum of dynamic number of rows in dplyr

My df looks something like the first three columns of the following:
ID VAL LENGTH SUM
1 1 1 1
1 1 1 1
1 1 2 2
1 1 2 2
2 0 1 0
2 3 1 0
2 4 2 3
I want to add a fourth column, which is defined as the sum of the group's first to LENGTH-st values in VAL.
How do I do that?
You could do:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(SUM = sapply(LENGTH, function(x) sum(VAL[1:x])))
Output:
# A tibble: 7 x 4
# Groups: ID [2]
ID VAL LENGTH SUM
<int> <int> <int> <dbl>
1 1 1 1 1
2 1 1 1 1
3 1 1 2 2
4 1 1 2 2
5 2 0 1 0
6 2 3 1 0
7 2 4 2 3

If a value appears in the row, all subsequent rows should take this value (with dplyr)

I'm just starting to learn R and I'm already facing the first bigger problem.
Let's take the following panel dataset as an example:
N=5
T=3
time<-rep(1:T, times=N)
id<- rep(1:N,each=T)
dummy<- c(0,0,1,1,0,0,0,1,0,0,0,1,0,1,0)
df<-as.data.frame(cbind(id, time,dummy))
id time dummy
1 1 1 0
2 1 2 0
3 1 3 1
4 2 1 1
5 2 2 0
6 2 3 0
7 3 1 0
8 3 2 1
9 3 3 0
10 4 1 0
11 4 2 0
12 4 3 1
13 5 1 0
14 5 2 1
15 5 3 0
I now want the dummy variable for all rows of a cross section to take the value 1 after the 1 for this cross section appears for the first time. So, what I want is:
id time dummy
1 1 1 0
2 1 2 0
3 1 3 1
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 0
8 3 2 1
9 3 3 1
10 4 1 0
11 4 2 0
12 4 3 1
13 5 1 0
14 5 2 1
15 5 3 1
So I guess I need something like:
df_new<-df %>%
group_by(id) %>%
???
I already tried to set all zeros to NA and use the na.locf function, but it didn't really work.
Anybody got an idea?
Thanks!
Use cummax
df %>%
group_by(id) %>%
mutate(dummy = cummax(dummy))
# A tibble: 15 x 3
# Groups: id [5]
# id time dummy
# <dbl> <dbl> <dbl>
# 1 1 1 0
# 2 1 2 0
# 3 1 3 1
# 4 2 1 1
# 5 2 2 1
# 6 2 3 1
# 7 3 1 0
# 8 3 2 1
# 9 3 3 1
#10 4 1 0
#11 4 2 0
#12 4 3 1
#13 5 1 0
#14 5 2 1
#15 5 3 1
Without additional packages you could do
transform(df, dummy = ave(dummy, id, FUN = cummax))

Resources