Rowwise sum is not working in dplyr for R - r

Dataset
I have simulated this dataset for my question:
#### Set Seed ####
set.seed(123)
#### Create Data Frame ####
df <- data.frame(x1 = rbinom(n=100,
size=1,
prob = .5),
x2 = rbinom(n=100,
size=1,
prob = .5),
x3 = rbinom(n=100,
size=1,
prob = .5))
#### Convert to Tibble ####
tibble <- df %>%
as_tibble()
Problem
When I run this rowwise summary of the X values:
#### Summarize Rowwise Values ####
tibble %>%
rowwise() %>%
mutate(Sum.X = sum(.,
na.rm = T))
I get this summary, which is not what I'm looking for. This appears to be a summary of something else:
# A tibble: 100 × 4
# Rowwise:
x1 x2 x3 Sum.X
<int> <int> <int> <int>
1 0 1 0 145
2 1 0 1 145
3 0 0 1 145
4 1 1 1 145
5 1 0 0 145
6 0 1 1 145
7 1 1 0 145
8 1 1 0 145
9 1 0 0 145
10 0 0 0 145
# … with 90 more rows
However, I'm looking for a summary by row that looks something like this:
# A tibble: 100 × 4
# Rowwise:
x1 x2 x3 Sum.X
<int> <int> <int> <int>
1 0 1 0 1
2 1 0 1 2
3 0 0 1 1
4 1 1 1 3
5 1 0 0 1
6 0 1 1 2
7 1 1 0 2
8 1 1 0 2
9 1 0 0 1
10 0 0 0 0
# … with 90 more rows

You don't need to use rowwise, just use rowSums.
# Add the per-row total of all columns. rowSums() already works row by
# row, so rowwise() is not needed. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
tibble %>%
  mutate(Sum.X = rowSums(., na.rm = TRUE))
# # A tibble: 100 × 4
# x1 x2 x3 Sum.X
# <int> <int> <int> <dbl>
# 1 0 1 0 1
# 2 1 0 1 2
# 3 0 0 1 1
# 4 1 1 1 3
# 5 1 0 0 1
# 6 0 1 1 2
# 7 1 1 0 2
# 8 1 1 0 2
# 9 1 0 0 1
# 10 0 0 0 0
# # … with 90 more rows
# # ℹ Use `print(n = ...)` to see more rows

Using c_across:
# Row-by-row total: c_across() gathers the current row's values from
# every column into one vector that sum() then adds up.
df %>%
  rowwise() %>%
  mutate(Sum.X = sum(c_across(everything()), na.rm = TRUE))
# A tibble: 100 × 4
# Rowwise:
x1 x2 x3 Sum.X
<int> <int> <int> <int>
1 1 0 1 2
2 0 1 0 1
3 0 0 0 0
4 1 1 0 2
5 1 0 1 2
6 0 1 1 2
7 0 0 0 0
8 1 0 1 2
9 1 1 0 2
10 0 0 0 0
# … with 90 more rows
# ℹ Use `print(n = ...)` to see more rows
Can use base R as well:
# Apply sum() over each row (MARGIN = 1); TRUE is spelled out because T
# is an ordinary variable that can be reassigned.
apply(df, 1, sum, na.rm = TRUE)
[1] 2 1 0 2 2 2 0 2 2 0 2 2 1 2 1 1 2 2 2 1 2 1 0 3 2 3 0 2 1 2 2 3 2 2 2 1 2 3 2 0 2 2 2 1 2 1 2 1 2
[50] 1 3 1 1 2 1 2 2 2 2 1 3 1 1 1 1 3 1 2 2 2 1 3 1 1 1 0 3 3 1 1 2 1 2 0 3 2 2 3 2 2 2 2 3 2 2 3 2 2
[99] 1 1

Related

How to cope with multi-index data in R?

I have a multi-index data set with 100 cases, and each case has 5 questions. Each question was scored by 2 raters.
case question rater1 rater2
1 1 1 1
1 2 1 0
1 3 1 1
1 4 1 1
1 5 0 0
2 1 0 1
2 2 1 1
2 3 1 1
2 4 1 0
2 5 0 0
3 1 0 0
3 2 1 0
3 3 1 1
3 4 1 1
3 5 0 1
...
I want to sum question 1, 2, 3 in each case as A, and question 4, 5 in each case as B. Then insert the value at the end of each case, such as
case question rater1 rater2
1 1 1 1
1 2 1 0
1 3 1 1
1 4 1 1
1 5 0 0
1 A 3 2
1 B 1 1
2 1 0 1
2 2 1 1
2 3 1 1
2 4 1 0
2 5 0 0
2 A 2 3
2 B 1 0
3 1 0 0
3 2 1 0
3 3 1 1
3 4 1 1
3 5 0 1
3 A 2 1
3 B 1 2
...
I am unsure how to achieve it.
You could summarize the data, and then bind it back to the original data and resort it. For example
library(dplyr)
dd %>%
group_by(case, grp = case_when(question %in% 1:3~"A", question %in% 4:5 ~ "B")) %>%
summarize(across(-question, sum)) %>%
ungroup() %>%
rename(question = grp) %>%
bind_rows(mutate(dd, question = as.character(question))) %>%
arrange(case, question)
With data.table
library(data.table)
dt[
,.(
question = c(question, "A", "B"),
rater1 = c(rater1, sum(rater1[1:3]), sum(rater1[4:5])),
rater2 = c(rater2, sum(rater2[1:3]), sum(rater2[4:5]))
), case
][1:15]
#> case question rater1 rater2
#> 1: 1 1 1 0
#> 2: 1 2 1 1
#> 3: 1 3 0 0
#> 4: 1 4 0 0
#> 5: 1 5 0 1
#> 6: 1 A 2 1
#> 7: 1 B 0 1
#> 8: 2 1 0 0
#> 9: 2 2 0 1
#> 10: 2 3 0 1
#> 11: 2 4 1 1
#> 12: 2 5 0 0
#> 13: 2 A 0 2
#> 14: 2 B 1 1
#> 15: 3 1 0 0
Data
dt <- data.table(
case = rep(1:100, each = 5),
question = rep(1:5, 100),
rater1 = sample(0:1, 500, 1),
rater2 = sample(0:1, 500, 1)
)

How to get the number of consecutive zeroes from a column in a dataframe

I'm trying to work out how to get the number of consecutive zeroes for a given column for a dataframe.
Here is a dataframe:
data <- data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2), value = c(1,0,0,1,0,0,0,0,0,0,4,3))
This would be the desired output:
id value consec
1 1 0
1 0 2
1 0 2
1 1 0
1 0 2
1 0 2
2 0 4
2 0 4
2 0 4
2 0 4
2 4 0
2 3 0
Any ideas on how to achieve this output?
Many thanks
You can do:
data$consec <- with(data, ave(value, value, cumsum(value != 0), id, FUN = length) - (value != 0))
data
id value consec
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0
Here's a base R solution using interaction and rle (run-length encoding):
rlid <- rle(as.numeric(interaction(data$id, data$value)))$lengths
data$consec <- replace(rep(rlid, rlid), data$value != 0, 0)
data
#> id value consec
#> 1 1 1 0
#> 2 1 0 2
#> 3 1 0 2
#> 4 1 1 0
#> 5 1 0 2
#> 6 1 0 2
#> 7 2 0 4
#> 8 2 0 4
#> 9 2 0 4
#> 10 2 0 4
#> 11 2 4 0
#> 12 2 3 0
This dplyr solution will work. Using cumulative sum we keep track of every time a new non-zero entry occurs, and for each of these groups we count the number of zeros:
data %>%
  group_by(id) %>%
  # Start a new run after every non-zero value (not only after a 1), so
  # zeros separated by e.g. a 4 are never counted as one run. Note that
  # flag_0 therefore also increments on the trailing 4 and 3 of id 2.
  mutate(flag_0 = cumsum(value != 0)) %>%
  group_by(id, flag_0) %>%
  # Zeros get the number of zeros in their run; non-zero rows get 0.
  mutate(conseq = ifelse(value == 0, sum(value == 0), 0)) %>%
  ungroup()
# A tibble: 12 x 4
id value flag_0 conseq
<dbl> <dbl> <int> <dbl>
1 1 1 1 0
2 1 0 1 2
3 1 0 1 2
4 1 1 2 0
5 1 0 2 2
6 1 0 2 2
7 2 0 0 4
8 2 0 0 4
9 2 0 0 4
10 2 0 0 4
11 2 4 0 0
12 2 3 0 0
This tidyverse approach can also do the job
library(tidyverse)
data %>% group_by(id) %>%
mutate(value2 =cumsum(value)) %>% group_by(id, value, value2) %>%
mutate(consec = ifelse(value == 0, n(), 0)) %>%
ungroup() %>% select(-value2)
# A tibble: 12 x 3
id value consec
<dbl> <dbl> <dbl>
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0

fill values between interval grouped by ID

I have a data set where subjects have a value of 1 or 0 at different times. I need a function or a piece of code that fills in with 1 the values of 0 between the first and last 1.
I have tried complete() and fill() but not doing what I want
I have the following data:
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3),
TIME = c(1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10),
DV = c(0,0,1,1,0,0,1,0,0,0,
0,1,0,0,0,0,0,0,0,1,
0,1,0,1,0,1,0,1,0,0))
# A tibble: 30 x 3
ID TIME DV
<dbl> <dbl> <dbl>
1 1 1 0
2 1 2 0
3 1 3 1
4 1 4 1
5 1 5 0
6 1 6 0
7 1 7 1
8 1 8 0
9 1 9 0
10 1 10 0
# ... with 20 more rows
I need the following output as shown in DV2:
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3),
TIME = c(1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10),
DV = c(0,0,1,1,0,0,1,0,0,0,
0,1,0,0,0,0,0,0,0,1,
0,1,0,1,0,1,0,1,0,0),
DV2 = c(0,0,1,1,1,1,1,0,0,0,
0,1,1,1,1,1,1,1,1,1,
0,1,1,1,1,1,1,1,0,0))
# A tibble: 30 x 4
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
# ... with 20 more rows
With dplyr, you can do:
dat %>%
rowid_to_column() %>%
group_by(ID) %>%
mutate(DV2 = if_else(rowid %in% min(rowid[DV == 1]):max(rowid[DV == 1]),
1, 0)) %>%
ungroup() %>%
select(-rowid)
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
We can create a helper function, and apply it on every group, i.e.
# Fill every position between the first and the last 1 of `x` with 1.
#
# x: numeric vector of 0/1 flags (one group's values).
# Returns `x` with the span from its first 1 to its last 1 set to 1;
# a vector that contains no 1 is returned unchanged.
f1 <- function(x) {
  ones <- which(x == 1)
  # Without this guard a group with no 1 evaluates NA:NA and errors.
  if (length(ones) == 0L) {
    return(x)
  }
  x[ones[1]:ones[length(ones)]] <- 1
  x
}
with(dat, ave(DV, ID, FUN = f1))
#[1] 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0

Creating a new variable while using subsequent values in r

I have the following data frame:
df1 <- data.frame(id = rep(1:3, each = 5),
time = rep(1:5),
y = c(rep(1, 4), 0, 1, 0, 1, 1, 0, 0, 1, rep(0,3)))
df1
## id time y
## 1 1 1 1
## 2 1 2 1
## 3 1 3 1
## 4 1 4 1
## 5 1 5 0
## 6 2 1 1
## 7 2 2 0
## 8 2 3 1
## 9 2 4 1
## 10 2 5 0
## 11 3 1 0
## 12 3 2 1
## 13 3 3 0
## 14 3 4 0
## 15 3 5 0
I'd like to create a new indicator variable that tells me, for each of the three ids, at what point y = 0 for all subsequent responses. In the example above, for ids 1 and 2 this occurs at the 5th time point, and for id 3 this occurs at the 3rd time point.
I'm getting tripped up on id 2, where y = 0 at time point 2, but then goes back to 1 -- I'd like the indicator variable to take subsequent time points into account.
Essentially, I'm looking for the following output:
df1
## id time y new_col
## 1 1 1 1 0
## 2 1 2 1 0
## 3 1 3 1 0
## 4 1 4 1 0
## 5 1 5 0 1
## 6 2 1 1 0
## 7 2 2 0 0
## 8 2 3 1 0
## 9 2 4 1 0
## 10 2 5 0 1
## 11 3 1 0 0
## 12 3 2 1 0
## 13 3 3 0 1
## 14 3 4 0 1
## 15 3 5 0 1
The new_col variable is indicating whether or not y = 0 at that time point and for all subsequent time points.
I would use a little helper function for that.
# Flag the trailing run of `val` in `x`.
#
# x:   vector of responses (one group's values).
# val: the value that must hold from some position to the end.
# Returns an integer vector the length of `x`: 1 from the first position
# after the last element that differs from `val`, 0 before it.
foo <- function(x, val) {
  other <- which(x != val)
  # If nothing differs from `val`, the run starts at position 1. Handling
  # this explicitly avoids the max(integer(0)) warning and its -Inf result.
  pos <- if (length(other) > 0L) max(other) + 1L else 1L
  as.integer(seq_along(x) >= pos)
}
df1 %>%
group_by(id) %>%
mutate(indicator = foo(y, 0))
# # A tibble: 15 x 4
# # Groups: id [3]
# id time y indicator
# <int> <int> <dbl> <int>
# 1 1 1 1 0
# 2 1 2 1 0
# 3 1 3 1 0
# 4 1 4 1 0
# 5 1 5 0 1
# 6 2 1 1 0
# 7 2 2 0 0
# 8 2 3 1 0
# 9 2 4 1 0
# 10 2 5 0 1
# 11 3 1 0 0
# 12 3 2 1 0
# 13 3 3 0 1
# 14 3 4 0 1
# 15 3 5 0 1
In case you want to consider NA-values in y, you can adjust foo to:
# Flag the trailing run of `val` in `x`, treating NA as "not `val`".
#
# x:   vector of responses (one group's values); may contain NA.
# val: the value that must hold from some position to the end.
# Returns an integer vector the length of `x`: 1 from the first position
# after the last element that is NA or differs from `val`, 0 before it.
foo <- function(x, val) {
  other <- which(x != val | is.na(x))
  # If every element equals `val`, the run starts at position 1. Handling
  # this explicitly avoids the max(integer(0)) warning and its -Inf result.
  pos <- if (length(other) > 0L) max(other) + 1L else 1L
  as.integer(seq_along(x) >= pos)
}
That way, if there's a NA after the last y=0, the indicator will remain 0.
Here is an option using data.table
library(data.table)
setDT(df1)[, indicator := cumsum(.I %in% .I[which.max(rleid(y)*!y)]), id]
df1
# id time y indicator
# 1: 1 1 1 0
# 2: 1 2 1 0
# 3: 1 3 1 0
# 4: 1 4 1 0
# 5: 1 5 0 1
# 6: 2 1 1 0
# 7: 2 2 0 0
# 8: 2 3 1 0
# 9: 2 4 1 0
#10: 2 5 0 1
#11: 3 1 0 0
#12: 3 2 1 0
#13: 3 3 0 1
#14: 3 4 0 1
#15: 3 5 0 1
Based on the comments from @docendodiscimus, if the values are not 0 for 'y' at the end of each 'id', then we can do
setDT(df1)[, indicator := {
  # Run id of y, zeroed wherever y is non-zero.
  i1 <- rleid(y) * !y
  # `&&` because the condition is a single TRUE/FALSE per group.
  # If the group's last element does not belong to its highest-numbered
  # zero run, the group does not end in zeros and the indicator stays 0.
  if (i1[.N] != max(i1) && !is.na(i1[.N])) {
    0L
  } else {
    cumsum(.I %in% .I[which.max(i1)])
  }
}, id]

Create new duplicate table from another, adding new values from the originals, using R

I need to create a table, from the original table (both below).
In the original table, we have the families A and B, and the members of each family is indicated by the column PESS. The members of each family who are beneficiaries are marked by the number 1 in the BEN column.
From this table, I need to generate a new table, in which you should have 2 more columns. Taking the family A as an example, members 1 and 4 are beneficiaries. Then, the family A should be doubled into two groups, with only one beneficiary at a time (column I_BPC_FAM2). The FAM2 column indicates the groups.
With the code below, I am generating the new table; however, the I_BPC_FAM2 column is missing. The problem must be solved in R.
Is it possible to complete this code to get to the final table?
library(tidyverse)
tabela<-data.frame(FAM=c("A","A","A","A","B","B","B"), PESS=c(1,2,3,4,1,2,3),BEN=c(1,0,0,1,0,0,1))
# Per-family counters: number of beneficiaries and number of members.
tabela1 <- summarise(group_by(tabela, FAM), contador = sum(BEN), cont = n())

# One copy of each family's member list per beneficiary, numbered by FAM2.
# seq_len() yields no rows for a family without beneficiaries, where
# 1:x would have iterated over c(1, 0).
blocos <- lapply(seq_len(nrow(tabela1)), function(i) {
  x <- as.numeric(tabela1[["contador"]][i])
  j <- as.numeric(tabela1[["cont"]][i])
  data.frame(
    FAM = rep(tabela1[["FAM"]][i], x * j),
    PESS = rep(as.numeric(seq_len(j)), times = x),
    FAM2 = rep(as.numeric(seq_len(x)), each = j)
  )
})
tab2 <- do.call(rbind, blocos)

# Merge and sort once, after all rows exist, instead of on every iteration.
final <- merge(tab2, tabela, by = c("FAM", "PESS"))
final <- final[order(final$FAM, final$FAM2), ]
Original table:
> tabela
FAM PESS BEN
1 A 1 1
2 A 2 0
3 A 3 0
4 A 4 1
5 B 1 0
6 B 2 0
7 B 3 1
Table generated by my code
> final
FAM PESS FAM2 BEN
1 A 1 1 1
3 A 2 1 0
5 A 3 1 0
7 A 4 1 1
2 A 1 2 1
4 A 2 2 0
6 A 3 2 0
8 A 4 2 1
9 B 1 1 0
10 B 2 1 0
11 B 3 1 1
Table I need to generate
FAM PESS FAM2 BEN I_BPC_FAM2
1 A 1 1 1 1
3 A 2 1 0 0
5 A 3 1 0 0
7 A 4 1 1 0
2 A 1 2 1 0
4 A 2 2 0 0
6 A 3 2 0 0
8 A 4 2 1 1
9 B 1 1 0 0
10 B 2 1 0 0
11 B 3 1 1 1
Here is an alternative way:
table <-data.frame(FAM=c("A","A","A","A","B","B","B"),
PESS=c(1,2,3,4,1,2,3),
BEN=c(1,0,0,1,0,0,1))
Create a unique id for each observation:
table %<>% mutate( unique_id = row_number())
Subset the group of unique families you want to get:
ben <-
table %>%
filter(BEN == 1) %>%
mutate(FAM2 = unique_id) %>%
select(FAM2, FAM)
> ben
FAM2 FAM
1 1 A
2 4 A
3 7 B
Merge and compare the ids:
new_table<- merge(ben, table, by = "FAM") %>%
mutate(I_BPC_FAM2 = as.integer(unique_id == FAM2)) %>%
select(-unique_id)
The result is:
new_table
> new_table
FAM FAM2 PESS BEN I_BPC_FAM2
1 A 1 1 1 1
2 A 1 2 0 0
3 A 1 3 0 0
4 A 1 4 1 0
5 A 4 1 1 0
6 A 4 2 0 0
7 A 4 3 0 0
8 A 4 4 1 1
9 B 7 1 0 0
10 B 7 2 0 0
11 B 7 3 1 1
You can transform the new family IDs then, if needed, with:
> new_table %>% mutate(FAM2 = as.integer(as.factor(FAM2)))
FAM FAM2 PESS BEN I_BPC_FAM2
1 A 1 1 1 1
2 A 1 2 0 0
3 A 1 3 0 0
4 A 1 4 1 0
5 A 2 1 1 0
6 A 2 2 0 0
7 A 2 3 0 0
8 A 2 4 1 1
9 B 3 1 0 0
10 B 3 2 0 0
11 B 3 3 1 1
In Nicolas's code, I would change this part:
ben <- table %>%
filter(BEN == 1) %>%
mutate(ID = unique_id)
ben %<>%
group_by(FAM) %>%
mutate(FAM2=cumsum(BEN)) %>%
select(ID,FAM2,FAM)
new_table<- merge(ben, table, by = "FAM") %>%
mutate(I_BPC_FAM2 = as.integer(unique_id == ID)) %>%
select(-unique_id,-ID)
what results in this:
> new_table
FAM FAM2 PESS BEN I_BPC_FAM2
1 A 1 1 1 1
2 A 1 2 0 0
3 A 1 3 0 0
4 A 1 4 1 0
5 A 2 1 1 0
6 A 2 2 0 0
7 A 2 3 0 0
8 A 2 4 1 1
9 B 1 1 0 0
10 B 1 2 0 0
11 B 1 3 1 1
Now FAM2 numbers the groups correctly within each family,
which is convenient when used with the function max().
That should help. Might be possible to combine some commands, but I tried to be as analytical as possible and also avoid using for loops. Run the chained commands step by step to see how it works.
library(dplyr)
# original dataset
dt <- data.frame(FAM=c("A","A","A","A","B","B","B"), PESS=c(1,2,3,4,1,2,3),BEN=c(1,0,0,1,0,0,1))
# create multiple rows of FAM based on how many 1s they have in column BEN
dt %>%
group_by(FAM) %>%
mutate(sum_BEN = sum(BEN)) %>%
group_by(FAM, PESS) %>%
do(data.frame(., FAM2=seq(1,.$sum_BEN))) %>%
select(-sum_BEN) %>%
ungroup() %>%
arrange(FAM, FAM2) %>%
print() -> tbl1
# # A tibble: 11 × 4
# FAM PESS BEN FAM2
# <fctr> <dbl> <dbl> <int>
# 1 A 1 1 1
# 2 A 2 0 1
# 3 A 3 0 1
# 4 A 4 1 1
# 5 A 1 1 2
# 6 A 2 0 2
# 7 A 3 0 2
# 8 A 4 1 2
# 9 B 1 0 1
# 10 B 2 0 1
# 11 B 3 1 1
# keep the relevant rows of FAM to put 1 for I_BPC_FAM2
dt %>%
arrange(FAM, PESS) %>%
group_by(FAM) %>%
mutate(cumsum_BEN = cumsum(BEN)) %>%
ungroup() %>%
distinct(FAM, BEN, cumsum_BEN, .keep_all = T) %>%
filter(BEN != 0) %>%
mutate(I_BPC_FAM2 = 1) %>%
rename(FAM2 = cumsum_BEN) %>%
print() -> tbl2
# # A tibble: 3 × 5
# FAM PESS BEN FAM2 I_BPC_FAM2
# <fctr> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 1 1 1
# 2 A 4 1 2 1
# 3 B 3 1 1 1
# join tables
tbl1 %>%
left_join(tbl2, by=c("FAM","PESS","BEN","FAM2")) %>%
mutate(I_BPC_FAM2 = coalesce(I_BPC_FAM2, 0)) %>%
arrange(FAM, FAM2)
# # A tibble: 11 × 5
# FAM PESS BEN FAM2 I_BPC_FAM2
# <fctr> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 1 1 1
# 2 A 2 0 1 0
# 3 A 3 0 1 0
# 4 A 4 1 1 0
# 5 A 1 1 2 0
# 6 A 2 0 2 0
# 7 A 3 0 2 0
# 8 A 4 1 2 1
# 9 B 1 0 1 0
# 10 B 2 0 1 0
# 11 B 3 1 1 1
Here is a base R solution using the split-apply combine methodology with split, lapply, and do.call/rbind.
# Build a list of data.frames, one for each family.
# NOTE(review): `df` must be the family table with FAM and BEN columns;
# confirm it is defined before running this.
myList <- lapply(split(df, df$FAM), function(i) {
  bens <- which(i$BEN == 1)                   # row indices of the beneficiaries
  rows <- nrow(i)                             # number of family members
  i <- i[rep(seq_len(rows), length(bens)), ]  # one copy of the family per beneficiary
  i$I_BPC_FAM2 <- rep(0, nrow(i))             # initialize; also valid for zero rows
  # In copy k of the family, flag only the k-th beneficiary.
  i$I_BPC_FAM2[bens + (rows * (seq_along(bens) - 1))] <- 1
  i                                           # return the expanded data.frame
})
Now, you can put the list together with
do.call(rbind, myList)
FAM PESS BEN I_BPC_FAM2
A.1 A 1 1 1
A.2 A 2 0 0
A.3 A 3 0 0
A.4 A 4 1 0
A.1.1 A 1 1 0
A.2.1 A 2 0 0
A.3.1 A 3 0 0
A.4.1 A 4 1 1
B.5 B 1 0 0
B.6 B 2 0 0
B.7 B 3 1 1

Resources