freq table for multiple variables in r - r

I would like to crosstab the items variable vs cat as a frequency table.
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
> table(df1$cat, df1$item1)
0 1
1 3 1
2 3 2
3 3 2
4 2 2
Is there a way to print all the items variables freq table by cat together?
Thanks

Here is a quick solution in base-R
aggregate(.~ cat, df1, table)
cat item1.0 item1.1 item2.0 item2.1 item3.0 item3.1
1 1 3 1 1 3 3 1
2 2 3 2 3 2 3 2
3 3 3 2 2 3 2 3
4 4 2 2 3 1 2 2

You can use tally() to get the frequency for every combination of groups.
library(tidyverse)
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
# Turn every numeric column into a factor so that group_by(.drop = FALSE)
# keeps the combinations that never occur (they are reported with n = 0).
df1 %>%
  mutate(across(where(is.numeric), as.factor)) %>%
  group_by(cat, item1, item2, item3, .drop = FALSE) %>%
  tally()
First convert your variables to factors then you can then use group_by(, .drop=F) %>% tally() to tally all of your variables, including all groupings with zero frequencies. Remove .drop=F to remove all zero frequencies.
cat item1 item2 item3 n
1 1 0 0 0 0
2 1 0 0 1 0
3 1 0 1 0 3
4 1 0 1 1 0
5 1 1 0 0 0
6 1 1 0 1 1
7 1 1 1 0 0
8 1 1 1 1 0
9 2 0 0 0 1
10 2 0 0 1 1
11 2 0 1 0 1
12 2 0 1 1 0
13 2 1 0 0 0
14 2 1 0 1 1
15 2 1 1 0 1
16 2 1 1 1 0
17 3 0 0 0 0
18 3 0 0 1 0
19 3 0 1 0 1
20 3 0 1 1 2
21 3 1 0 0 1
22 3 1 0 1 1
23 3 1 1 0 0
24 3 1 1 1 0
25 4 0 0 0 0
26 4 0 0 1 1
27 4 0 1 0 1
28 4 0 1 1 0
29 4 1 0 0 1
30 4 1 0 1 1
31 4 1 1 0 0
32 4 1 1 1 0
Alternatively, if that is too unwieldy, you can also try table1() from library(table1).
library(tidyverse)
library(table1)
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
df1 <- df1 %>% mutate_if(is.numeric, as.factor)
table1(~ item1 + item2 + item3 | cat, data=df1)
To get a table of the frequencies and percentages. The top row is your cat variable.
table1() is really great for generating HTML frequency tables. Highly recommend. You can do lots of formatting and labels to make tables presentable. Here is a tutorial

Here's another approach using ftable and stack from base R:
x <- ftable(cbind(cat = df1[, 1], stack(df1[-1])), row.vars = 1, col.vars = c(3, 2))
x
# ind item1 item2 item3
# values 0 1 0 1 0 1
# cat
# 1 3 1 1 3 3 1
# 2 3 2 3 2 3 2
# 3 3 2 2 3 2 3
# 4 2 2 3 1 2 2
One (debatable) downside of this approach is that the default data.table or data.frame methods for converting ftables to more usable objects will convert the output to a long format. But, you can grab SOfun and use ftable2dt if you want to keep the wide format.
library(SOfun)
ftable2dt(x)
# cat item1_0 item1_1 item2_0 item2_1 item3_0 item3_1
# 1: 1 3 1 1 3 3 1
# 2: 2 3 2 3 2 3 2
# 3: 3 3 2 2 3 2 3
# 4: 4 2 2 3 1 2 2

You can try this:
# One cat-by-item contingency table per item column (every column after the
# first). seq_along(df1)[-1] is empty when there are no item columns, whereas
# 2:dim(df1)[2] would count backwards (2, 1) in that case.
List <- lapply(seq_along(df1)[-1], function(i) table(df1$cat, df1[, i]))
[[1]]
0 1
1 3 1
2 3 2
3 3 2
4 2 2
[[2]]
0 1
1 1 3
2 3 2
3 2 3
4 3 1
[[3]]
0 1
1 3 1
2 3 2
3 2 3
4 2 2

Related

Recoding by an order in r

I have a data recoding puzzle. Here is how my sample data looks like:
df <- data.frame(
id = c(1,1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3,3),
scores = c(0,1,1,0,0,-1,-1, 0,0,1,-1,-1,-1, 0,1,0,1,1,0,1),
position = c(1,2,3,4,5,6,7, 1,2,3,4,5,6, 1,2,3,4,5,6,7),
cat = c(1,1,1,1,1,0,0, 1,1,1,0,0,0, 1,1,1,1,1,1,1))
id scores position cat
1 1 0 1 1
2 1 1 2 1
3 1 1 3 1
4 1 0 4 1
5 1 0 5 1
6 1 -1 6 0
7 1 -1 7 0
8 2 0 1 1
9 2 0 2 1
10 2 1 3 1
11 2 -1 4 0
12 2 -1 5 0
13 2 -1 6 0
14 3 0 1 1
15 3 1 2 1
16 3 0 3 1
17 3 1 4 1
18 3 1 5 1
19 3 0 6 1
20 3 1 7 1
There are three ids in the dataset and the rows are ordered by a position variable. For each id, in the first row where scores is -1, the score needs to be 0 and the cat variable needs to be 1. For example, for id=1 that row is the 6th position; in that row, the score should be 0 and the cat variable needs to be 1. For those ids that do not have scores=-1, I keep them as they are.
The desired output should look like below:
id scores position cat
1 1 0 1 1
2 1 1 2 1
3 1 1 3 1
4 1 0 4 1
5 1 0 5 1
6 1 0 6 1
7 1 -1 7 0
8 2 0 1 1
9 2 0 2 1
10 2 1 3 1
11 2 0 4 1
12 2 -1 5 0
13 2 -1 6 0
14 3 0 1 1
15 3 1 2 1
16 3 0 3 1
17 3 1 4 1
18 3 1 5 1
19 3 0 6 1
20 3 1 7 1
Any recommendations??
Thanks
This may be what you are after
df %>%
  group_by(id) %>%
  mutate(
    # TRUE only on the first row of each id whose score is -1; this does not
    # rely on position being equal to the row number within the id
    first_neg = scores == -1 & cumsum(scores == -1) == 1,
    scores = if_else(first_neg, 0, scores), # reset the score on that row only
    cat = if_else(first_neg, 1, cat)        # every other row keeps its cat
  ) %>%
  ungroup() %>%
  select(-first_neg) # drop the helper column
After trying a few things and getting ideas from @Ricky and @e.matt, I came up with a solution.
df %>%
  filter(scores == -1) %>%           # keep only the rows whose score is -1
  distinct(id, .keep_all = TRUE) %>% # keep the first such row of each id
  mutate(first = 1) %>%              # mark those rows
  right_join(df, by = c("id", "scores", "position", "cat")) %>% # join back original dataset
  mutate(
    first = coalesce(first, 0),      # rows that were not marked get 0
    scores = case_when(first == 1 ~ 0, TRUE ~ scores),
    cat = case_when(first == 1 ~ 1, TRUE ~ cat)
  )
This provides my desired output.
id scores position cat first
1 1 0 1 1 0
2 1 1 2 1 0
3 1 1 3 1 0
4 1 0 4 1 0
5 1 0 5 1 0
6 1 0 6 1 1
7 1 -1 7 0 0
8 2 0 1 1 0
9 2 0 2 1 0
10 2 1 3 1 0
11 2 0 4 1 1
12 2 -1 5 0 0
13 2 -1 6 0 0
14 3 0 1 1 0
15 3 1 2 1 0
16 3 0 3 1 0
17 3 1 4 1 0
18 3 1 5 1 0
19 3 0 6 1 0
20 3 1 7 1 0
here is a data.table oneliner
library( data.table )
setDT(df)
df[ df[, .(cumsum( scores == -1 ) == 1), by = .(id)]$V1, `:=`( scores = 0, cat = 1) ]
# id scores position cat
# 1: 1 0 1 1
# 2: 1 1 2 1
# 3: 1 1 3 1
# 4: 1 0 4 1
# 5: 1 0 5 1
# 6: 1 0 6 1
# 7: 1 -1 7 0
# 8: 2 0 1 1
# 9: 2 0 2 1
# 10: 2 1 3 1
# 11: 2 0 4 1
# 12: 2 -1 5 0
# 13: 2 -1 6 0
# 14: 3 0 1 1
# 15: 3 1 2 1
# 16: 3 0 3 1
# 17: 3 1 4 1
# 18: 3 1 5 1
# 19: 3 0 6 1
# 20: 3 1 7 1
You could do something along these lines using the dplyr package:
library(dplyr)
df = mutate(df, cat = ifelse(scores == -1, 1, cat),
scores = ifelse(scores == -1, 0, scores))
Using the mutate() function, I am re-assigning the values for the scores and cat fields according to ifelse() conditional statements. For scores, if the score is -1, the value is replaced by 0, otherwise it keeps the score as is. For cat, it also checks if scores is equal to -1, but would assign a value of 1 when the condition is met, or the already existing value of cat when the condition is not met.
EDIT
After our discussion in the comments, I think something along these lines should be helpful (you may have to modify the logic since I don't exactly follow what the desired output is here):
# Flag the first row of each id whose score is -1 (cumulative count of -1
# scores within the id equals 1 on a row that is itself -1).
is_neg <- df$scores == -1
first_neg <- is_neg & ave(as.integer(is_neg), df$id, FUN = cumsum) == 1
# Update that row itself: writing to row i + 1 would change the wrong row and
# append a new row whenever the last score is -1.
df$scores[first_neg] <- 0
df$cat[first_neg] <- 1
Sorry that I don't really follow the desired output, hopefully this is helpful in getting you to your answer!

Fill entire vector by a certain appearance of another vector

I have the following data:
players<-rep(1:3,each=3)
trial<-rep(1:3)
choice<-c(1,0,0,0,0,0,0,1,0)
gamematrix<-data.frame(cbind(players,trial,choice))
players trial choice
1 1 1 1
2 1 2 0
3 1 3 0
4 2 1 0
5 2 2 0
6 2 3 0
7 3 1 0
8 3 2 1
9 3 3 0
Now I want to create a new vector:
for each participant who has at least one choice of "1", it should get the value "3", and "0" otherwise:
players trial choice win
1 1 1 1 3
2 1 2 0 3
3 1 3 0 3
4 2 1 0 0
5 2 2 0 0
6 2 3 0 0
7 3 1 0 3
8 3 2 1 3
9 3 3 0 3
In the simple example above, player "1", had "1" in the first trial, while player 3 in the second trial, thus for all their choices the value is "3" in the new vector.
Any ideas how to do it? thanks!
A base R option using ave + ifelse
within(
gamematrix,
win <- ave(choice,players,FUN = function(x) ifelse(any(x==1),3,0))
)
giving
players trial choice win
1 1 1 1 3
2 1 2 0 3
3 1 3 0 3
4 2 1 0 0
5 2 2 0 0
6 2 3 0 0
7 3 1 0 3
8 3 2 1 3
9 3 3 0 3
Update
If your criterion depends on the first two values of choice, you can try
within(
gamematrix,
win <- ave(choice,players,FUN = function(x) ifelse(all(head(x,2)==1),3,0))
)
which gives
players trial choice win
1 1 1 1 0
2 1 2 0 0
3 1 3 0 0
4 2 1 0 0
5 2 2 0 0
6 2 3 0 0
7 3 1 0 0
8 3 2 1 0
9 3 3 0 0
Try this dplyr approach:
library(dplyr)
#Code
gamematrix <- gamematrix %>% group_by(players) %>%
mutate(win=ifelse(length(choice[choice==1])>=1,3,0))
Output:
# A tibble: 9 x 4
# Groups: players [3]
players trial choice win
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 3
2 1 2 0 3
3 1 3 0 3
4 2 1 0 0
5 2 2 0 0
6 2 3 0 0
7 3 1 0 3
8 3 2 1 3
9 3 3 0 3
There is no reason for this data to be a data.frame. Keep it as a numeric matrix. If you do so you can do in one line using only vectorized functions.
# Test only the choice column, then flag every row of a player who chose 1 at
# least once (comparing the whole matrix to 1 would also match players/trial).
cbind(gamematrix, win = 3 * (gamematrix[, "players"] %in%
  gamematrix[gamematrix[, "choice"] == 1, "players"]))
for your second case:
I would like it to be only for those players who had "choice=1" in the first N (e.g., first 2 trials)
# Players with choice == 1 within the first two trials; every row of such a
# player gets win = 3.
cbind(gamematrix, win = 3 * (gamematrix[, "players"] %in%
  gamematrix[gamematrix[, "choice"] == 1 & gamematrix[, "trial"] <= 2, "players"]))
Vectorized solutions are usually more performant than solutions incorporating a buried loop (e.g. ave).
An option with rowsum from base R
gamematrix$win <- with(gamematrix, 3 * players %in%
names(which(rowsum(choice, players)[,1] > 0)))
gamematrix$win
#[1] 3 3 3 0 0 0 3 3 3

Create new duplicate table from another, adding new values from the originals, using R

I need to create a table, from the original table (both below).
In the original table, we have the families A and B, and the members of each family is indicated by the column PESS. The members of each family who are beneficiaries are marked by the number 1 in the BEN column.
From this table, I need to generate a new table, in which you should have 2 more columns. Taking the family A as an example, members 1 and 4 are beneficiaries. Then, the family A should be doubled into two groups, with only one beneficiary at a time (column I_BPC_FAM2). The FAM2 column indicates the groups.
With the code below, I am generating the new table, however, the I_BPC_FAM2 column is missing. The problem must be solved in the R.
Is it possible to complete this code to get to the final table?
library(tidyverse)
tabela<-data.frame(FAM=c("A","A","A","A","B","B","B"), PESS=c(1,2,3,4,1,2,3),BEN=c(1,0,0,1,0,0,1))
tabela1<- summarise(group_by(tabela,FAM),contador=sum(BEN),cont=n()) #faz a tabela com contadores
# Build one copy of each family per beneficiary: FAM2 numbers the copies and
# PESS runs over the family members inside each copy.
pieces <- vector("list", nrow(tabela1))
for (i in seq_len(nrow(tabela1))) {
  x <- as.numeric(tabela1[i, "contador"]) # beneficiaries in family i
  j <- as.numeric(tabela1[i, "cont"])     # members in family i
  # seq_len() yields no rows for a family without beneficiaries, where 1:x
  # would have iterated over c(1, 0).
  pieces[[i]] <- data.frame(
    FAM = rep(tabela1$FAM[i], x * j),
    PESS = as.numeric(rep(seq_len(j), times = x)),
    FAM2 = as.numeric(rep(seq_len(x), each = j))
  )
}
tab2 <- do.call(rbind, pieces)
# Merge and sort once, after all rows exist, instead of on every iteration.
final <- merge(tab2, tabela, by = c("FAM", "PESS"))
final <- final[order(final$FAM, final$FAM2), ]
Original table:
> tabela
FAM PESS BEN
1 A 1 1
2 A 2 0
3 A 3 0
4 A 4 1
5 B 1 0
6 B 2 0
7 B 3 1
Table generated by my code
> final
FAM PESS FAM2 BEN
1 A 1 1 1
3 A 2 1 0
5 A 3 1 0
7 A 4 1 1
2 A 1 2 1
4 A 2 2 0
6 A 3 2 0
8 A 4 2 1
9 B 1 1 0
10 B 2 1 0
11 B 3 1 1
Table I need to generate
FAM PESS FAM2 BEN I_BPC_FAM2
1 A 1 1 1 1
3 A 2 1 0 0
5 A 3 1 0 0
7 A 4 1 1 0
2 A 1 2 1 0
4 A 2 2 0 0
6 A 3 2 0 0
8 A 4 2 1 1
9 B 1 1 0 0
10 B 2 1 0 0
11 B 3 1 1 1
Here is an alternative way:
table <-data.frame(FAM=c("A","A","A","A","B","B","B"),
PESS=c(1,2,3,4,1,2,3),
BEN=c(1,0,0,1,0,0,1))
Create an unique id for each observation:
# Plain assignment: %<>% is only available after library(magrittr), which
# library(tidyverse) does not attach.
table <- table %>% mutate(unique_id = row_number())
Subset the group of unique families you want to get:
ben <-
table %>%
filter(BEN == 1) %>%
mutate(FAM2 = unique_id) %>%
select(FAM2, FAM)
> ben
FAM2 FAM
1 1 A
2 4 A
3 7 B
Merge and compare the ids:
new_table<- merge(ben, table, by = "FAM") %>%
mutate(I_BPC_FAM2 = as.integer(unique_id == FAM2)) %>%
select(-unique_id)
The result is:
new_table
> new_table
FAM FAM2 PESS BEN I_BPC_FAM2
1 A 1 1 1 1
2 A 1 2 0 0
3 A 1 3 0 0
4 A 1 4 1 0
5 A 4 1 1 0
6 A 4 2 0 0
7 A 4 3 0 0
8 A 4 4 1 1
9 B 7 1 0 0
10 B 7 2 0 0
11 B 7 3 1 1
You can transform the new family IDs then, if needed, with:
> new_table %>% mutate(FAM2 = as.integer(as.factor(FAM2)))
FAM FAM2 PESS BEN I_BPC_FAM2
1 A 1 1 1 1
2 A 1 2 0 0
3 A 1 3 0 0
4 A 1 4 1 0
5 A 2 1 1 0
6 A 2 2 0 0
7 A 2 3 0 0
8 A 2 4 1 1
9 B 3 1 0 0
10 B 3 2 0 0
11 B 3 3 1 1
In Nicolas's code, I would change this part:
# One row per beneficiary: ID is the row id of the beneficiary and FAM2 numbers
# the beneficiaries within each family (1, 2, ...).
# Written as a single pipeline because %<>% needs library(magrittr), which
# library(tidyverse) does not attach.
ben <- table %>%
  filter(BEN == 1) %>%
  mutate(ID = unique_id) %>%
  group_by(FAM) %>%
  mutate(FAM2 = cumsum(BEN)) %>%
  ungroup() %>%
  select(ID, FAM2, FAM)
# Repeat each family once per beneficiary and flag the row that belongs to the
# beneficiary of that copy.
new_table <- merge(ben, table, by = "FAM") %>%
  mutate(I_BPC_FAM2 = as.integer(unique_id == ID)) %>%
  select(-unique_id, -ID)
what results in this:
> new_table
FAM FAM2 PESS BEN I_BPC_FAM2
1 A 1 1 1 1
2 A 1 2 0 0
3 A 1 3 0 0
4 A 1 4 1 0
5 A 2 1 1 0
6 A 2 2 0 0
7 A 2 3 0 0
8 A 2 4 1 1
9 B 1 1 0 0
10 B 1 2 0 0
11 B 1 3 1 1
Now FAM2 has the correct family numbers,
which can be practical with the function max().
That should help. Might be possible to combine some commands, but I tried to be as analytical as possible and also avoid using for loops. Run the chained commands step by step to see how it works.
library(dplyr)
# original dataset
dt <- data.frame(FAM=c("A","A","A","A","B","B","B"), PESS=c(1,2,3,4,1,2,3),BEN=c(1,0,0,1,0,0,1))
# create multiple rows of FAM based on how many 1s they have in column BEN
dt %>%
group_by(FAM) %>%
mutate(sum_BEN = sum(BEN)) %>%
group_by(FAM, PESS) %>%
do(data.frame(., FAM2=seq(1,.$sum_BEN))) %>%
select(-sum_BEN) %>%
ungroup() %>%
arrange(FAM, FAM2) %>%
print() -> tbl1
# # A tibble: 11 × 4
# FAM PESS BEN FAM2
# <fctr> <dbl> <dbl> <int>
# 1 A 1 1 1
# 2 A 2 0 1
# 3 A 3 0 1
# 4 A 4 1 1
# 5 A 1 1 2
# 6 A 2 0 2
# 7 A 3 0 2
# 8 A 4 1 2
# 9 B 1 0 1
# 10 B 2 0 1
# 11 B 3 1 1
# keep the relevant rows of FAM to put 1 for I_BPC_FAM2
dt %>%
arrange(FAM, PESS) %>%
group_by(FAM) %>%
mutate(cumsum_BEN = cumsum(BEN)) %>%
ungroup() %>%
distinct(FAM, BEN, cumsum_BEN, .keep_all = T) %>%
filter(BEN != 0) %>%
mutate(I_BPC_FAM2 = 1) %>%
rename(FAM2 = cumsum_BEN) %>%
print() -> tbl2
# # A tibble: 3 × 5
# FAM PESS BEN FAM2 I_BPC_FAM2
# <fctr> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 1 1 1
# 2 A 4 1 2 1
# 3 B 3 1 1 1
# join tables
tbl1 %>%
left_join(tbl2, by=c("FAM","PESS","BEN","FAM2")) %>%
mutate(I_BPC_FAM2 = coalesce(I_BPC_FAM2, 0)) %>%
arrange(FAM, FAM2)
# # A tibble: 11 × 5
# FAM PESS BEN FAM2 I_BPC_FAM2
# <fctr> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 1 1 1
# 2 A 2 0 1 0
# 3 A 3 0 1 0
# 4 A 4 1 1 0
# 5 A 1 1 2 0
# 6 A 2 0 2 0
# 7 A 3 0 2 0
# 8 A 4 1 2 1
# 9 B 1 0 1 0
# 10 B 2 0 1 0
# 11 B 3 1 1 1
Here is a base R solution using the split-apply combine methodology with split, lapply, and do.call/rbind.
# construct of data.frames, one for each family
myList <- lapply(split(df, df$FAM), function(i) {
bens <- which(i$BEN == 1) # get the benefit indices
rows <- nrow(i) # store the number of rows
i <- i[rep(seq_len(rows), length(bens)),] # grow data.frame for each benefit
i$I_BPC_FAM2 <- 0 # initialize variable
i$I_BPC_FAM2[bens + (rows * (seq_along(bens)-1))] <- 1 fill in indicator
i # return new data.frame
})
Now, you can put the list together with
do.call(rbind, myList)
FAM PESS BEN I_BPC_FAM2
A.1 A 1 1 1
A.2 A 2 0 0
A.3 A 3 0 0
A.4 A 4 1 0
A.1.1 A 1 1 0
A.2.1 A 2 0 0
A.3.1 A 3 0 0
A.4.1 A 4 1 1
B.5 B 1 0 0
B.6 B 2 0 0
B.7 B 3 1 1

Sum rows in a group, starting when a specific value occurs

I want to accumulate the values of a column till the end of the group, though starting the addition when a specific value occurs in another column. I am only interested in the first instance of the specific value within a group. So if that value occurs again within the group, the addition column should continue to add the values. I know this sounds like a rather strange problem, so hopefully the example table makes sense.
The following data frame is what I have now:
> df = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0))
> df
group numToAdd occurs
1 1 1 0
2 1 1 0
3 1 3 1
4 1 2 0
5 2 4 0
6 2 2 1
7 2 1 0
8 2 3 0
9 2 2 0
10 3 1 0
11 3 2 1
12 3 1 1
13 4 2 0
14 4 3 0
15 4 2 0
Thus, whenever a 1 occurs within a group, I want a cumulative sum of the values from the column numToAdd, until a new group starts. This would look like the following:
> finalDF = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0),added = c(0,0,3,5,0,2,3,6,8,0,2,3,0,0,0))
> finalDF
group numToAdd occurs added
1 1 1 0 0
2 1 1 0 0
3 1 3 1 3
4 1 2 0 5
5 2 4 0 0
6 2 2 1 2
7 2 1 0 3
8 2 3 0 6
9 2 2 0 8
10 3 1 0 0
11 3 2 1 2
12 3 1 1 3
13 4 2 0 0
14 4 3 0 0
15 4 2 0 0
Thus, the added column is 0 until a 1 occurs within the group, then accumulates the values from numToAdd until it moves to a new group, turning the added column back to 0. In group three, a value of 1 is found a second time, yet the cumulated sum continues. Additionally, in group 4, a value of 1 is never found, thus the value within the added column remains 0.
I've played around with dplyr, but can't get it to work. The following solution only outputs the total sum, and not the increasing cumulated number at each row.
library(dplyr)
df =
df %>%
mutate(added=ifelse(occurs == 1,cumsum(numToAdd),0)) %>%
group_by(group)
Try
df %>%
group_by(group) %>%
mutate(added= cumsum(numToAdd*cummax(occurs)))
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
Or using data.table
library(data.table)#v1.9.5+
i1 <-setDT(df)[, .I[(rleid(occurs) + (occurs>0))>1], group]$V1
df[, added:=0][i1, added:=cumsum(numToAdd), by = group]
Or a similar option as in dplyr
setDT(df)[,added := cumsum(numToAdd * cummax(occurs)) , by = group]
You can use split-apply-combine in base R with something like:
# unsplit() puts each group's result back on the rows it came from, so the
# assignment stays correct when df is not sorted by group (unlist() would
# simply concatenate the groups in sorted order).
df$added <- unsplit(lapply(split(df, df$group), function(x) {
  y <- rep(0, nrow(x))
  pos <- cumsum(x$occurs) > 0       # TRUE from the first occurs == 1 onwards
  y[pos] <- cumsum(x$numToAdd[pos]) # running total over those rows only
  y
}), df$group)
df
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
To add another base R approach:
# Per group: the leading rows before the first occurs == 1 (all zeros),
# followed by the running total of numToAdd for the remaining rows.
# unsplit() maps the results back by group, so df need not be sorted by group.
df$added <- unsplit(lapply(split(df, df$group), function(x) {
  c(x[, 'occurs'][cumsum(x[, 'occurs']) == 0L],
    cumsum(x[, 'numToAdd'][cumsum(x[, 'occurs']) != 0L]))
}), df$group)
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
Another base R:
# Zero out numToAdd before the first occurs == 1 of each group, then take the
# running total. unsplit() maps the results back by group, so df need not be
# sorted by group.
df$added <- unsplit(lapply(split(df, df$group), function(x) {
  cumsum((cumsum(x$occurs) > 0) * x$numToAdd)
}), df$group)

Changing the ID value based on another column

I have a large data set that looks something like this:
Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 1 3 2
1 100 1 4 2
0 0 2 1 1
0 0 2 2 1
1 150 2 3 1
1 100 2 4 2
I want to make a new ID column based on when there is a new path_no, then the ID will change. So I am hoping it will look something like this:
Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 2 3 2
1 100 2 4 2
0 0 3 1 1
0 0 3 2 1
1 150 3 3 1
1 100 4 4 2
I think rleid from data.table should do the trick. Here's one solution that uses data.table and dplyr:
dplyr::mutate(df, ID = data.table::rleid(path_no))
Conv. Rev. ID Order path_no
1 0 0 1 1 1
2 1 50 1 2 1
3 0 0 2 3 2
4 1 100 2 4 2
5 0 0 3 1 1
6 0 0 3 2 1
7 1 150 3 3 1
8 1 100 4 4 2
Or with data.table only:
dt <- setDT(df)
dt[, ID := rleid(path_no)][]
Conv. Rev. ID Order path_no
1: 0 0 1 1 1
2: 1 50 1 2 1
3: 0 0 2 3 2
4: 1 100 2 4 2
5: 0 0 3 1 1
6: 0 0 3 2 1
7: 1 150 3 3 1
8: 1 100 4 4 2
Data:
text <- "Conv. Rev. ID Order path_no
0 0 1 1 1
1 50 1 2 1
0 0 1 3 2
1 100 1 4 2
0 0 2 1 1
0 0 2 2 1
1 150 2 3 1
1 100 2 4 2"
df <- read.table(text = text, stringsAsFactors = FALSE, header = TRUE)
Can go for a simple for loop:
vals <- c(1, 1, 1, 2, 2, 2, 1, 1, 2)
nobs <- length(vals)
# Run id: starts at 1 and goes up by one each time the value differs from the
# previous one.
idx <- rep(1, nobs)
# seq_len(nobs)[-1] is empty for 0 or 1 observations; 2:nobs would run
# backwards (2, 1) and index past the end of vals.
for (i in seq_len(nobs)[-1]) {
  idx[i] <- idx[i - 1] + (vals[i] != vals[i - 1])
}

Resources