df_test <- data.frame(MONTH_NUM = c(7,7,8,8,8,10,11,12,1,2,3,4,4,5,5,5,5,NA)
, YEAR = c(2018,2018,2018,2018,2019,2019,2019,2019,2019,2018,2018,2019,2018,2018,2018,2018,2018,NA)
, Sys_Indicator = c(1,0,0,1,0,0,0,0,1,1,0,1,0,1,1,1,1,1)
, lbl_Indicator = c(1,1,1,1,0,1,0,0,1,1,0,1,1,1,1,1,1,0)
, Pk_Indicator=c(1,0,1,1,0,1,0,0,1,1,0,1,0,0,0,0,1,1))
I want to find the cumulative sum of each indicator for each month+year combination. I'm currently using dplyr to achieve this but I was wondering if there was an easier way to do this and to do it for all variables that have and Indicator in their names? I want all my variable with Indicator in them to have cumulative sum.
df_test %>%
group_by(YEAR,MONTH_NUM) %>%
summarize(Sys_sum=sum(Sys_Indicator),lbl_Sum=sum(lbl_Indicator),Pk_Sum=sum(Pk_Indicator)) %>%
arrange(MONTH_NUM,YEAR) %>%
ungroup() %>%
mutate(Sys_cum=cumsum(Sys_sum),Cum_lbl=cumsum(lbl_Sum),Pk_sum=cumsum(Pk_Sum))
You could use the _at variants in dplyr to apply this for multiple columns :
library(dplyr)
df_test %>%
arrange(MONTH_NUM,YEAR) %>%
group_by(YEAR,MONTH_NUM) %>%
summarize_at(vars(ends_with('Indicator')), sum) %>%
ungroup() %>%
mutate_at(vars(ends_with('Indicator')), list(cs = ~cumsum(.)))
# YEAR MONTH_NUM Sys_Indicator lbl_Indicator Pk_Indicator Sys_Indicator_cs lbl_Indicator_cs Pk_Indicator_cs
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2018 2 1 1 1 1 1 1
# 2 2018 3 0 0 0 1 1 1
# 3 2018 4 0 1 0 1 2 1
# 4 2018 5 4 4 1 5 6 2
# 5 2018 7 1 2 1 6 8 3
# 6 2018 8 1 2 2 7 10 5
# 7 2019 1 1 1 1 8 11 6
# 8 2019 4 1 1 1 9 12 7
# 9 2019 8 0 0 0 9 12 7
#10 2019 10 0 1 1 9 13 8
#11 2019 11 0 0 0 9 13 8
#12 2019 12 0 0 0 9 13 8
#13 NA NA 1 0 1 10 13 9
I think I understand what you want. Here is a data.table approach.
library(data.table)
setDT(df_test)[ ,sapply(names(df_test)[grep("Indicator",names(df_test))],paste0,"_cumsum") := lapply(.SD[,grep("Indicator",names(df_test))],cumsum)]
df_test
MONTH_NUM YEAR Sys_Indicator lbl_Indicator Pk_Indicator Sys_Indicator_cumsum lbl_Indicator_cumsum Pk_Indicator_cumsum
1: 7 2018 1 1 1 1 1 1
2: 7 2018 0 1 0 1 2 1
3: 8 2018 0 1 1 1 3 2
4: 8 2018 1 1 1 2 4 3
5: 8 2019 0 0 0 2 4 3
6: 10 2019 0 1 1 2 5 4
7: 11 2019 0 0 0 2 5 4
8: 12 2019 0 0 0 2 5 4
9: 1 2019 1 1 1 3 6 5
10: 2 2018 1 1 1 4 7 6
11: 3 2018 0 0 0 4 7 6
12: 4 2019 1 1 1 5 8 7
13: 4 2018 0 1 0 5 9 7
14: 5 2018 1 1 0 6 10 7
15: 5 2018 1 1 0 7 11 7
16: 5 2018 1 1 0 8 12 7
17: 5 2018 1 1 1 9 13 8
18: NA NA 1 0 1 10 13 9
Related
Here is a part of the sample data :
dat<-read.table (text=" ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1 NA NA NA
2 10 12 0 6 7 8
2 14 0 1 NA NA NA
1 16 16A 0 1 2 4
1 14 0 1 NA NA NA
2 14 16A 0 5 6 7
2 7 0 1 NA NA NA
1 7 20 0 5 8 0
1 7 0 1 NA NA NA
2 9 20 0 7 8 1
2 9 0 1 NA NA NA
", header=TRUE)
I want to update value 1 In column T1 for repeated IDs. For the first repeated IDs, should be a value of 1, and for the second repeated IDs, a value of 2; and for the third repeated IDs, a value of 3 and so on. I also want to replace NA with blank cells. here is the expected outcome:
ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1
2 10 12 0 6 7 8
2 14 0 1
1 16 16A 0 1 2 4
1 14 0 2
2 14 16A 0 5 6 7
2 7 0 2
1 7 20 0 5 8 0
1 7 0 3
2 9 20 0 7 8 1
2 9 0 3
You could use an ifelse across with cumsum per group like this:
library(dplyr)
dat %>%
group_by(ID, B1) %>%
mutate(across(T1, ~ ifelse(.x == 1, cumsum(.x), T1)))
#> # A tibble: 12 × 7
#> # Groups: ID, B1 [8]
#> ID Time B1 T1 Q1 W1 M1
#> <int> <int> <chr> <int> <int> <int> <int>
#> 1 1 12 12 0 12 11 9
#> 2 1 13 0 1 NA NA NA
#> 3 2 10 12 0 6 7 8
#> 4 2 14 0 1 NA NA NA
#> 5 1 16 16A 0 1 2 4
#> 6 1 14 0 2 NA NA NA
#> 7 2 14 16A 0 5 6 7
#> 8 2 7 0 2 NA NA NA
#> 9 1 7 20 0 5 8 0
#> 10 1 7 0 3 NA NA NA
#> 11 2 9 20 0 7 8 1
#> 12 2 9 0 3 NA NA NA
Created on 2023-01-14 with reprex v2.0.2
With data.table
library(data.table)
setDT(dat)[T1 ==1, T1 := cumsum(T1), .(ID, B1)]
-output
> dat
ID Time B1 T1 Q1 W1 M1
1: 1 12 12 0 12 11 9
2: 1 13 0 1 NA NA NA
3: 2 10 12 0 6 7 8
4: 2 14 0 1 NA NA NA
5: 1 16 16A 0 1 2 4
6: 1 14 0 2 NA NA NA
7: 2 14 16A 0 5 6 7
8: 2 7 0 2 NA NA NA
9: 1 7 20 0 5 8 0
10: 1 7 0 3 NA NA NA
11: 2 9 20 0 7 8 1
12: 2 9 0 3 NA NA NA
I have a dataframe like this:
df <- data.frame(
id = 1:19,
Area_l = c(1,2,0,0,0,2,3,1,2,0,0,0,0,3,4,0,0,0,0),
Area_r = c(3,2,2,0,0,2,3,1,0,0,0,1,3,3,4,3,0,0,0)
)
I need to filter the dataframe in such a way that all rows are omitted that fulfill two conditions:
(i): Area_l and Area_r are 0
(ii): the paired 0values in Area_l and Area_r are the last values in the columns.
I really have no clue how to implement these two conditions using dplyr. The desired result is this:
df
id Area_l Area_r
1 1 1 3
2 2 2 2
3 3 0 2
4 4 0 0
5 5 0 0
6 6 2 2
7 7 3 3
8 8 1 1
9 9 2 0
10 10 0 0
11 11 0 0
12 12 0 1
13 13 0 3
14 14 3 3
15 15 4 4
16 16 0 3
Any help?
Reverse the order of the dataframe, filter with a cumany condition, then reverse it back.
library(dplyr)
df %>%
map_df(rev) %>%
filter(cumany(Area_l + Area_r != 0)) %>%
map_df(rev)
output
# A tibble: 16 x 3
id Area_l Area_r
<int> <dbl> <dbl>
1 1 1 3
2 2 2 2
3 3 0 2
4 4 0 0
5 5 0 0
6 6 2 2
7 7 3 3
8 8 1 1
9 9 2 0
10 10 0 0
11 11 0 0
12 12 0 1
13 13 0 3
14 14 3 3
15 15 4 4
16 16 0 3
We may use rle
library(dplyr)
df %>%
filter(!if_all(starts_with("Area"), ~ .x == 0 &
inverse.rle(within.list(rle(.x == 0), values[-length(values)] <- FALSE))))
-output
id Area_l Area_r
1 1 1 3
2 2 2 2
3 3 0 2
4 4 0 0
5 5 0 0
6 6 2 2
7 7 3 3
8 8 1 1
9 9 2 0
10 10 0 0
11 11 0 0
12 12 0 1
13 13 0 3
14 14 3 3
15 15 4 4
16 16 0 3
Or another option is
df %>%
filter(if_any(starts_with("Area"),
~ row_number() <= max(row_number() * (.x != 0))))
Or another option is revcumsum from spatstat.utils
library(spatstat.utils)
df %>%
filter(!if_all(starts_with("Area"), ~ revcumsum(.x != 0) <1))
This question already has answers here:
Numbering rows within groups in a data frame
(10 answers)
Closed 1 year ago.
I am trying to create an additional variable (new variable-> flag) that will number the repetition of observation in my variable starting from 0.
dataset <- data.frame(id = c(1,1,1,2,2,4,6,6,6,7,7,7,7,8))
intended results will look like:
id flag
1 0
1 1
1 2
2 0
2 1
4 0
6 0
6 1
6 2
7 0
7 1
7 2
7 3
8 0
Thank You!
You may try
dataset$flag <- unlist(sapply(rle(dataset$id)$length, function(x) seq(1,x)-1))
id flag
1 1 0
2 1 1
3 1 2
4 2 0
5 2 1
6 4 0
7 6 0
8 6 1
9 6 2
10 7 0
11 7 1
12 7 2
13 7 3
14 8 0
data.table:
library(data.table)
setDT(dataset)[, flag := rowid(id) - 1]
dataset
id flag
1: 1 0
2: 1 1
3: 1 2
4: 2 0
5: 2 1
6: 4 0
7: 6 0
8: 6 1
9: 6 2
10: 7 0
11: 7 1
12: 7 2
13: 7 3
14: 8 0
Base R:
dataset$flag = sequence(rle(dataset$id)$lengths) - 1
dataset
id flag
1 1 0
2 1 1
3 1 2
4 2 0
5 2 1
6 4 0
7 6 0
8 6 1
9 6 2
10 7 0
11 7 1
12 7 2
13 7 3
14 8 0
Another base option:
transform(dataset,
flag = Reduce(function(x, y) y * x + y, duplicated(id), accumulate = TRUE))
id flag
1 1 0
2 1 1
3 1 2
4 2 0
5 2 1
6 4 0
7 6 0
8 6 1
9 6 2
10 7 0
11 7 1
12 7 2
13 7 3
14 8 0
dplyr -
library(dplyr)
dataset %>% group_by(id) %>% mutate(flag = row_number() - 1)
# id flag
# <dbl> <dbl>
# 1 1 0
# 2 1 1
# 3 1 2
# 4 2 0
# 5 2 1
# 6 4 0
# 7 6 0
# 8 6 1
# 9 6 2
#10 7 0
#11 7 1
#12 7 2
#13 7 3
#14 8 0
Base R with similar logic
transform(dataset, flag = ave(id, id, FUN = seq_along) - 1)
another way to reach what you expect but writing a little more
x <- dataset %>%
group_by(id) %>%
summarise(nreg=n())
df <- data.frame()
for(i in 1:nrow(x)){
flag <- data.frame(id = rep( x$id[i], x$nreg[i] ),
flag = seq(0, x$nreg [i] -1 )
)
df <- rbind(df, flag)
}
I have the following data set:
time <- c(0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5)
value <- c(10,8,6,5,3,2,12,10,6,5,4,2,20,15,16,9,2,2)
group <- c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3)
data <- data.frame(time, value, group)
I want to create a new column called data$diff that is equal to data$value minus the value of data$value when data$time == 0 within each group.
I am beginning with the following code
for(i in 1:nrow(data)){
for(n in 1:max(data$group)){
if(data$group[i] == n) {
data$diff[i] <- ???????
}
}
}
But cannot figure out what to put in place of the question marks. The desired output would be this table: https://i.stack.imgur.com/1bAKj.png
Any thoughts are appreciated.
Since in your example data$time == 0 is always the first element of the group, you can use this data.table approach.
library(data.table)
setDT(data)
data[, diff := value[1] - value, by = group]
In case that data$time == 0 is not the first element in each group you can use this:
data[, diff := value[time==0] - value, by = group]
Output:
> data
time value group diff
1: 0 10 1 0
2: 1 8 1 2
3: 2 6 1 4
4: 3 5 1 5
5: 4 3 1 7
6: 5 2 1 8
7: 0 12 2 0
8: 1 10 2 2
9: 2 6 2 6
10: 3 5 2 7
11: 4 4 2 8
12: 5 2 2 10
13: 0 20 3 0
14: 1 15 3 5
15: 2 16 3 4
16: 3 9 3 11
17: 4 2 3 18
18: 5 2 3 18
Here is a base R approach.
within(data, diff <- ave(
seq_along(value), group,
FUN = \(i) value[i][time[i] == 0] - value[i]
))
Output
time value group diff
1 0 10 1 0
2 1 8 1 2
3 2 6 1 4
4 3 5 1 5
5 4 3 1 7
6 5 2 1 8
7 0 12 2 0
8 1 10 2 2
9 2 6 2 6
10 3 5 2 7
11 4 4 2 8
12 5 2 2 10
13 0 20 3 0
14 1 15 3 5
15 2 16 3 4
16 3 9 3 11
17 4 2 3 18
18 5 2 3 18
Here is a short way to do it with dplyr.
library(dplyr)
data %>%
group_by(group) %>%
mutate(diff = value[which(time == 0)] - value)
Which gives
# Groups: group [3]
time value group diff
<dbl> <dbl> <dbl> <dbl>
1 0 10 1 0
2 1 8 1 2
3 2 6 1 4
4 3 5 1 5
5 4 3 1 7
6 5 2 1 8
7 0 12 2 0
8 1 10 2 2
9 2 6 2 6
10 3 5 2 7
11 4 4 2 8
12 5 2 2 10
13 0 20 3 0
14 1 15 3 5
15 2 16 3 4
16 3 9 3 11
17 4 2 3 18
18 5 2 3 18
library(dplyr)
vals2use <- data %>%
group_by(group) %>%
filter(time==0) %>%
select(c(2,3)) %>%
rename(value4diff=value)
dataNew <- merge(data, vals2use, all=T)
dataNew$diff <- dataNew$value4diff-dataNew$value
dataNew <- dataNew[,c(1,2,3,5)]
dataNew
group time value diff
1 1 0 10 0
2 1 1 8 2
3 1 2 6 4
4 1 3 5 5
5 1 4 3 7
6 1 5 2 8
7 2 0 12 0
8 2 1 10 2
9 2 2 6 6
10 2 3 5 7
11 2 4 4 8
12 2 5 2 10
13 3 0 20 0
14 3 1 15 5
15 3 2 16 4
16 3 3 9 11
17 3 4 2 18
18 3 5 2 18
I would like to write code to compute within each group, sum of lagged differences as shown in the table below:
ID x rank U R Required Output Value
1 1 1 U1 R1 -
1 1 2 U2 R2 R2-U1
1 1 3 U3 R3 (R3-U2) + (R3-U1)
1 1 4 U4 R4 (R4-U3) + (R4-U2) + (R4-U1)
1 0 5 U5 R5 R5
1 0 6 U6 R6 R6
1 0 7 U7 R7 R7
2 1 1 U8 R8 -
2 1 2 U9 R9 R9-U8
2 1 3 U10 R10 (R10-U9) + (R10 - U8)
2 1 4 U11 R11 (R11-U10) + (R11 - U9) + (R11 - U8)
3 1 1 U12 R12 -
3 0 2 U13 R13 R13
3 0 3 U14 R14 R14
ID is the unique group identifier. x is a bool and depending on its value the required output is either sum of difference with previous values or same period value. "rank" is a rank ordering column and the maximum rank can vary within each group. "U" and "R" are the main columns of interest.
To give a numerical example, I need the following:
ID x rank U R Required Output Value
1 1 1 10 7 -
1 1 2 9 11 1
1 1 3 10 10 1 + 0 = 1
1 1 4 11 13 3+4+3 = 10
1 0 5 7 8 8
1 0 6 8 8 8
1 0 7 5 7 7
2 1 1 3 2 -
2 1 2 9 15 12
2 1 3 13 14 16
2 1 4 1 14 17
3 1 1 12 1 -
3 0 2 14 9 9
3 0 3 1 11 11
R code to generate this table:
ID = c(rep(1,7),rep(2,4),rep(3,3))
x = c(rep(1,4),rep(0,3),rep(1,5),rep(0,2))
rank = c(1:7,1:4,1:3)
U = c(10,9,10,11,7,8,5,3,9,13,1,12,14,1)
R = c(7,11,10,13,8,8,7,2,15,14,14,1,9,11)
dat = cbind(ID,x,rank,U,R)
colnames(dat)=c("ID","x","rank","U","R")
Here's a tidyverse solution:
library(dplyr)
library(tidyr)
dat %>%
as_tibble() %>%
group_by(ID) %>%
mutate(output = ifelse(x, lag(rank) * R - lag(cumsum(U)), R))
Result:
# A tibble: 14 x 6
# Groups: ID [3]
ID x rank U R output
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 10 7 NA
2 1 1 2 9 11 1
3 1 1 3 10 10 1
4 1 1 4 11 13 10
5 1 0 5 7 8 8
6 1 0 6 8 8 8
7 1 0 7 5 7 7
8 2 1 1 3 2 NA
9 2 1 2 9 15 12
10 2 1 3 13 14 16
11 2 1 4 1 14 17
12 3 1 1 12 1 NA
13 3 0 2 14 9 9
14 3 0 3 1 11 11
Here is a base R solution using ave
dat <- within(dat,output <- ave(R,ID,x, FUN = function(v) v*(seq(v)-1))-ave(U,ID,x, FUN = function(v) c(NA,cumsum(v)[-length(v)])))
dat <- within(dat, output <- ifelse(x==0,R,output))
such that
> dat
ID x rank U R output
1 1 1 1 10 7 NA
2 1 1 2 9 11 1
3 1 1 3 10 10 1
4 1 1 4 11 13 10
5 1 0 5 7 8 8
6 1 0 6 8 8 8
7 1 0 7 5 7 7
8 2 1 1 3 2 NA
9 2 1 2 9 15 12
10 2 1 3 13 14 16
11 2 1 4 1 14 17
12 3 1 1 12 1 NA
13 3 0 2 14 9 9
14 3 0 3 1 11 11