Condense dataset by unique id - r

I have rows with recurring IDs that I would like to merge. The other columns are binary (0/1), so I would like to sum them within each ID.
Example before:
id nam1 nam2
1 1 1
1 0 0
2 1 0
2 0 1
3 1 1
3 1 0
Example after:
id nam1 nam2
1 1 1
2 1 1
3 2 1
Any ideas on how to do this?

#d.b's answer in comment:
aggregate(.~id, df, sum)
or using dplyr:
library(dplyr)
# Sum every non-grouping column within each id.
# across(everything(), sum) replaces the superseded summarize_all("sum").
df %>%
  group_by(id) %>%
  summarise(across(everything(), sum))
Result:
# A tibble: 3 x 3
id nam1 nam2
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 2 1
Data
df = structure(list(id = c(1L, 1L, 2L, 2L, 3L, 3L), nam1 = c(1L, 0L,
1L, 0L, 1L, 1L), nam2 = c(1L, 0L, 0L, 1L, 1L, 0L)), .Names = c("id",
"nam1", "nam2"), row.names = c(NA, -6L), class = "data.frame")

#Sample data:
df <- data.frame(id=c(1,1,2,2,3,3),
nam1=c(1,0,1,0,1,1),
nam2=c(1,0,0,1,1,0))
library(data.table)
setDT(df)[, lapply(.SD, sum), by=.(id)]
id nam1 nam2
1 1 1
2 1 1
3 2 1

Related

Create new variable based on numeric pattern in R

I am trying to create a new variable (v2) based on a pattern of numerical responses to another variable (v1). The dataset I am working with is in long format and ordered by visit. I have tried grouping by the 'id' variable and using various combinations of 'summarise' in dplyr, but cannot seem to figure this out. Below is an example of what I would like to achieve.
id visit v1 v2
<dbl> <int> <dbl> <int>
1 10001 1 0 1
2 10001 2 0 1
3 10002 1 0 2
4 10002 2 1 2
5 10003 1 1 3
6 10003 2 0 3
The value of 1 for v2 should reflect a response pattern of 0 across two visits for id 10001, 2 reflects a response pattern of 0/1, and so on.
Thank you in advance for the help!
Another way is:
dat %>%
group_by(id) %>%
mutate(v2 = c("00" = 1, "01" = 2, "10" = 3, "11" = 4)[paste(v1, collapse = "")])
# A tibble: 6 x 4
# Groups: id [3]
id visit v1 v2
<int> <int> <int> <dbl>
1 10001 1 0 1
2 10001 2 0 1
3 10002 1 0 2
4 10002 2 1 2
5 10003 1 1 3
6 10003 2 0 3
Assumption:
within an id, we always have exactly 2 rows
base R
# Encode the two-visit response pattern of each id as an integer code.
# Groups that do not have exactly two rows, or whose pattern is not one of
# the four known ones, get NA.
ave(dat$v1, dat$id, FUN = function(responses) {
  pattern_codes <- c("00" = 1L, "01" = 2L, "10" = 3L, "11" = 4L)
  if (length(responses) == 2) {
    # Unknown keys index to NA_integer_; unname() drops the key label.
    unname(pattern_codes[paste(responses, collapse = "")])
  } else {
    NA_integer_
  }
})
# [1] 1 1 2 2 3 3
dplyr
library(dplyr)
# Assign each id an integer code describing its v1 response pattern across
# its two rows (0/0 -> 1, 0/1 -> 2, 1/0 -> 3, 1/1 -> 4).
dat %>%
group_by(id) %>%
# Groups without exactly two rows get NA; otherwise v1 is compared
# element-wise against each candidate pattern and the first full match wins.
mutate(v2 = if (n() != 2) NA_integer_ else case_when(
all(v1 == c(0L, 0L)) ~ 1L,
all(v1 == c(0L, 1L)) ~ 2L,
all(v1 == c(1L, 0L)) ~ 3L,
all(v1 == c(1L, 1L)) ~ 4L,
# Fallback when no pattern matches.
TRUE ~ NA_integer_)
) %>%
ungroup()
# # A tibble: 6 x 4
# id visit v1 v2
# <int> <int> <int> <int>
# 1 10001 1 0 1
# 2 10001 2 0 1
# 3 10002 1 0 2
# 4 10002 2 1 2
# 5 10003 1 1 3
# 6 10003 2 0 3
Data
dat <- structure(list(id = c(10001L, 10001L, 10002L, 10002L, 10003L, 10003L), visit = c(1L, 2L, 1L, 2L, 1L, 2L), v1 = c(0L, 0L, 0L, 1L, 1L, 0L), v2 = c(1L, 1L, 2L, 2L, 3L, 3L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))

R: How to identify first date of positive observation by ID for multiple columns?

I would like to identify first date of positive observation by ID for multiple columns.
Example dataframe:
ID date Observ1 Observ2 Observ3
1 1 1 0 0
1 2 0 1 0
1 3 1 0 1
2 1 1 1 0
Desired result:
ID FirstObserv1 FirstObserv2 FirstObserv3
1 1 2 3
2 1 1 NA
For single column of observation, I can solve it with dplyr:
df %>% group_by(ID) %>% filter( Observ1 > 0) %>% summarize( FirstObserv1 = min(date) ) %>% as.data.frame()
However, I have no idea how to do it for multiple columns at once.
Try reshaping your data with tidyverse functions. The key step is to keep only the rows where the observation equals 1, and then use filter() to retain the minimum date for each ID and observation column. After that, reshape back to wide format to get the expected output. Here is the code:
library(tidyverse)
#Code
# Reshape to long format: every column except ID and date becomes a
# name/value pair.
dfnew <- df %>% pivot_longer(-c(ID,date)) %>%
group_by(ID) %>%
# Keep only the rows where the observation equals 1, then drop the value
# column since it is constant from here on.
filter(value==1) %>% select(-value) %>% ungroup() %>%
group_by(ID,name) %>%
# Within each ID/observation pair keep the row(s) with the earliest date.
filter(date==min(date)) %>%
# Back to wide format: one column per observation name, holding the date.
pivot_wider(names_from = name,values_from=date)
Output:
# A tibble: 2 x 4
# Groups: ID [2]
ID Observ1 Observ2 Observ3
<int> <int> <int> <int>
1 1 1 2 3
2 2 1 1 NA
Some data used:
#Data
df <- structure(list(ID = c(1L, 1L, 1L, 2L), date = c(1L, 2L, 3L, 1L
), Observ1 = c(1L, 0L, 1L, 1L), Observ2 = c(0L, 1L, 0L, 1L),
Observ3 = c(0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-4L))
Here's a method which just replaces the observation with date if the observation is positive and NA otherwise. Getting the min of each observation yields the desired results.
# Replace each positive observation with its date (NA otherwise), then take
# the earliest date per ID. min(..., na.rm = TRUE) on an all-NA group returns
# Inf with a warning, so groups without any positive observation are mapped
# to NA explicitly.
df %>%
  mutate(across(starts_with("Observ"), ~ ifelse(. > 0, date, NA))) %>%
  group_by(ID) %>%
  summarise(across(
    starts_with("Observ"),
    ~ if (all(is.na(.))) NA else min(., na.rm = TRUE)
  ))
#> # A tibble: 2 x 4
#>      ID Observ1 Observ2 Observ3
#>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1     1       1       2       3
#> 2     2       1       1      NA
Another alternative:
# For each ID, find the date of the first positive value in every selected
# column.
df %>%
group_by(ID) %>%
summarise(across(
# Select every column except date.
-date,
# x holds the positions of positive values; the date at the first such
# position is returned, or NA when the column has no positive value.
list(First = ~{x <- which(. > 0); if (length(x) > 0L) date[[x[[1L]]]] else NA_real_}),
# Output names are the function name followed by the column name,
# e.g. FirstObserv1.
.names = "{.fn}{.col}"
))
Output
ID FirstObserv1 FirstObserv2 FirstObserv3
<dbl> <dbl> <dbl> <dbl>
1 1 1 2 3
2 2 1 1 NA
We can use data.table
library(data.table)
# which(x > 0)[1] is the row position of the first positive value within the
# ID group; indexing 'date' with it returns the actual date (NA when there is
# no positive value) rather than assuming dates are numbered 1, 2, 3, ...
setDT(df)[, lapply(.SD, function(x) date[which(x > 0)[1]]),
          by = ID, .SDcols = patterns("^Observ")]
# ID Observ1 Observ2 Observ3
#1: 1 1 2 3
#2: 2 1 1 NA
Or using tidyverse
library(dplyr)
# For each ID, look up the date at the position of the first positive value
# in every Observ* column. which(. > 0)[1] alone is only a row position
# within the group, so it is used to index 'date'; a column with no positive
# value yields NA.
df %>%
  group_by(ID) %>%
  summarise(across(starts_with("Obser"), ~ date[which(. > 0)[1]],
                   .names = "First{.col}"), .groups = "drop")
# A tibble: 2 x 4
# ID FirstObserv1 FirstObserv2 FirstObserv3
# <int> <int> <int> <int>
#1 1 1 2 3
#2 2 1 1 NA
data
df <- structure(list(ID = c(1L, 1L, 1L, 2L), date = c(1L, 2L, 3L, 1L
), Observ1 = c(1L, 0L, 1L, 1L), Observ2 = c(0L, 1L, 0L, 1L),
Observ3 = c(0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-4L))

R - delete rows according to the value of another row

I am quite a beginner in R but thanks to the community of Stackoverflow I am improving!
However, I am stuck with a problem:
I have a dataset with 5 variables:
id_house represents the id for each household
id_ind is an id which takes the value 1 for the first individual in the household, 2 for the next, 3 for the third...
indicator_tb_men indicates whether the first person has answered the survey (1 = yes, 0 = no). All the other members of the household take the value 0.
id_house id_ind indicator_tb_men
1 1 1
1 2 0
2 1 1
3 1 0
3 2 0
3 3 0
4 1 1
5 1 0
I would like to delete all members of households where the first individual has not answered the survey.
So it would give:
id_house id_ind indicator_tb_men
1 1 1
1 2 0
2 1 1
4 1 1
Using dplyr here is one way :
library(dplyr)
df %>%
arrange(id_house, id_ind) %>%
group_by(id_house) %>%
filter(first(indicator_tb_men) != 0)
# id_house id_ind indicator_tb_men
# <int> <int> <int>
#1 1 1 1
#2 1 2 NA
#3 2 1 1
#4 4 1 1
data
df <- structure(list(id_house = c(1L, 1L, 2L, 3L, 3L, 3L, 4L, 5L),
id_ind = c(1L, 2L, 1L, 1L, 2L, 3L, 1L, 1L), indicator_tb_men = c(1L,
NA, 1L, 0L, NA, NA, 1L, 0L)), class = "data.frame", row.names = c(NA, -8L))
In base R, we can use nested subsetting:
# Households whose first member (id_ind == 1) answered the survey.
# which() drops the NA results produced by missing indicator values, so no
# NA ends up in the set of household ids used for the lookup.
answered <- df$id_house[which(df$id_ind == 1 & df$indicator_tb_men == 1)]
# Keep every member of those households.
df[df$id_house %in% answered, ]
id_house id_ind indicator_tb_men
1 1 1 1
2 1 2 NA
3 2 1 1
7 4 1 1
Data: Using Ronak Shah's data

Grouping by a column and counting number of positive and negative values corresponding to each value in R

I want to count the positive, negative, and zero values for each group after grouping by a column. My data looks like this:
dataset <- read.table(text =
"id value
1 4
1 -2
1 0
2 6
2 -4
2 -5
2 -1
3 0
3 0
3 -4
3 -5",
header = TRUE, stringsAsFactors = FALSE)
I want my result to look like this:
id num_pos_value num_neg_value num_zero_value
1 1 1 1
2 1 3 0
3 0 2 2
I want to extend the columns of the above result by adding sum of the positive and negative values.
id num_pos num_neg num_zero sum_pos sum_neg
1 1 1 1 4 -2
2 1 3 0 6 -10
3 0 2 2 0 -9
We create a group by 'id' and calculate the sum of logical vector
library(dplyr)
# Per id: count the positive, negative and zero values (the sum of a logical
# vector counts its TRUEs), and also total the positive and the negative
# values as requested in the question. The sum of an empty selection is 0.
df1 %>%
  group_by(id) %>%
  summarise(num_pos = sum(value > 0),
            num_neg = sum(value < 0),
            num_zero = sum(value == 0),
            sum_pos = sum(value[value > 0]),
            sum_neg = sum(value[value < 0]))
# A tibble: 3 x 6
#     id num_pos num_neg num_zero sum_pos sum_neg
#  <int>   <int>   <int>    <int>   <int>   <int>
#1     1       1       1        1       4      -2
#2     2       1       3        0       6     -10
#3     3       0       2        2       0      -9
Or get the table of sign of 'value' and spread it to 'wide'
library(tidyr)
# Per id, tabulate the sign of 'value' and spread the counts to wide format.
# table() on factor(..., levels = -1:1) returns the counts in level order
# (negative, zero, positive), so the labels are listed in that same order.
# Pairing labels and counts inside each group also avoids hard-coding the
# number of ids.
sign_labels <- c("num_neg", "num_zero", "num_pos")
df1 %>%
  group_by(id) %>%
  summarise(
    grp = list(sign_labels),
    num = list(as.vector(table(factor(sign(value), levels = -1:1))))
  ) %>%
  unnest(c(grp, num)) %>%
  pivot_wider(names_from = grp, values_from = num)
Or using count
# Count rows per id and sign of 'value', labelling the signs so the wide
# columns are num_pos / num_neg / num_zero instead of `1` / `-1` / `0`.
# .drop = FALSE keeps sign levels that never occur, so all three columns are
# always present; values_fill covers any combination still missing.
df1 %>%
  count(
    id,
    grp = factor(sign(value), levels = c(1, -1, 0),
                 labels = c("num_pos", "num_neg", "num_zero")),
    .drop = FALSE
  ) %>%
  pivot_wider(names_from = grp, values_from = n, values_fill = 0L)
data
df1 <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L), value = c(4L, -2L, 0L, 6L, -4L, -5L, -1L, 0L, 0L, -4L, -5L
)), class = "data.frame", row.names = c(NA, -11L))

How do I create occasion variable (time) for each ID?

I would like to create variable "Time" which basically indicates the number of times variable ID showed up within each day minus 1. In other words, the count is lagged by 1 and the first time ID showed up in a day should be left blank. Second time the same ID shows up on a given day should be 1.
Basically, I want to create the "Time" variable in the example below.
ID Day Time Value
1 1 0
1 1 1 0
1 1 2 0
1 2 0
1 2 1 0
1 2 2 0
1 2 3 1
2 1 0
2 1 1 0
2 1 2 0
Below is the code I am working on. Have not been successful with it.
data$time<-data.frame(data$ID,count=ave(data$ID==data$ID, data$Day, FUN=cumsum))
We can do this with data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'ID', 'Day', we get the lag of sequence of rows (shift(seq_len(.N))) and assign (:=) it as "Time" column.
library(data.table)
setDT(df1)[, Time := shift(seq_len(.N)), .(ID, Day)]
df1
# ID Day Value Time
# 1: 1 1 0 NA
# 2: 1 1 0 1
# 3: 1 1 0 2
# 4: 1 2 0 NA
# 5: 1 2 0 1
# 6: 1 2 0 2
# 7: 1 2 1 3
# 8: 2 1 0 NA
# 9: 2 1 0 1
#10: 2 1 0 2
Or with base R
with(df1, ave(Day, Day, ID, FUN= function(x)
ifelse(seq_along(x)!=1, seq_along(x)-1, NA)))
#[1] NA 1 2 NA 1 2 3 NA 1 2
Or without the ifelse
# Lagged within-group counter without ifelse(): NA^TRUE is NA and NA^FALSE
# is 1, so the first factor is NA at the first position of each Day/ID group
# and 1 elsewhere; multiplying by (seq_along(x) - 1) gives NA, 1, 2, ...
with(df1, ave(Day, Day, ID, FUN= function(x)
NA^(seq_along(x)==1)*(seq_along(x)-1)))
#[1] NA 1 2 NA 1 2 3 NA 1 2
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L),
Day = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L), Value = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L)), .Names = c("ID", "Day",
"Value"), row.names = c(NA, -10L), class = "data.frame")

Resources