Filtering a panel dataset in r - r

I have a dataframe of this form
familyid memberid occupation panelid year
1 1 1 1 2000
1 2 1 1 2000
2 1 1 1 2000
2 2 2 1 2000
3 1 1 1 2000
3 2 1 1 2000
3 3 1 1 2000
1 1 2 2 2001
1 2 1 2 2001
2 1 2 2 2001
2 2 2 2 2001
3 1 1 2 2001
3 2 2 2 2001
3 3 2 2 2001
I want to filter this dataframe in order to get the following.
familyid memberid occupation panelid year
1 1 1 1 2000
2 1 1 1 2000
3 2 1 1 2000
3 3 1 1 2000
1 1 2 2 2001
2 1 2 2 2001
3 2 2 2 2001
3 3 2 2 2001
In words, I want to keep only the panel observations (family members) that have occupation==1 in year 2000 (panelid==1) and occupation==2 in year 2001 (panelid==2). Does anybody know how to do this? Many thanks to everyone,
Marco

Here, we can group by 'familyid', 'memberid', filter based on any 'occupation' 1 and 'year' 2000 as well as any 'occupation' 2 and 'year' 2001
library(tidyverse)
df1 %>%
  group_by(familyid, memberid) %>%
  # Keep a member only when both conditions are met somewhere in their rows;
  # comma-separated filter() conditions are combined with &.
  filter(
    any(year == 2000 & occupation == 1),
    any(year == 2001 & occupation == 2)
  )
# A tibble: 8 x 5
# Groups: familyid, memberid [4]
# familyid memberid occupation panelid year
# <int> <int> <int> <int> <int>
#1 1 1 1 1 2000
#2 2 1 1 1 2000
#3 3 2 1 1 2000
#4 3 3 1 1 2000
#5 1 1 2 2 2001
#6 2 1 2 2 2001
#7 3 2 2 2 2001
#8 3 3 2 2 2001
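Or, if 'occupation' and 'year' each take only two values, we can also count with n_distinct to create a logical vector for filtering. Note that this keeps every member whose occupation differs between the two years, including a change from 2 to 1, so it matches the first approach only when no member moves from occupation 2 to 1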
df1 %>%
  group_by(familyid, memberid) %>%
  # A member is kept when both occupation and year take more than one value
  # within the member's rows.
  filter(n_distinct(occupation) > 1, n_distinct(year) > 1)
data
df1 <- structure(list(familyid = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 1L, 1L,
2L, 2L, 3L, 3L, 3L), memberid = c(1L, 2L, 1L, 2L, 1L, 2L, 3L,
1L, 2L, 1L, 2L, 1L, 2L, 3L), occupation = c(1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), panelid = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), year = c(2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2001L, 2001L, 2001L,
2001L, 2001L, 2001L, 2001L)), class = "data.frame", row.names = c(NA,
-14L))

Related

Assigning 1 or 0 in new column based on similar ID BUT sum not to exceed value in another column in R

See table below: I want to assign 1 or 0 to a new_col but the sum of 1s per unique hhid column should not exceed the value of any element in the column "nets" as seen in the table below, assuming new_col doesn't exist
hhid nets new_col
1 1 3 1
1 1 3 1
1 1 3 1
1 1 3 0
1 2 2 1
1 2 2 1
1 2 2 0
1 3 2 1
1 3 2 1
1 3 2 0
1 3 2 0
I tried code below
df %>% group_by(hhid) %>% mutate(new_col = ifelse(summarise(across(new_col), sum)<= df$nets),1,0)
Try this:
Data:
df <- structure(list(hhid = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
3L), nets = c(3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-11L))
hhid nets
1 1 3
2 1 3
3 1 3
4 1 3
5 2 2
6 2 2
7 2 2
8 3 2
9 3 2
10 3 2
11 3 2
Code:
df %>%
  group_by(hhid) %>%
  # Flag the first `nets` rows of each household with 1, the rest with 0
  # (as.numeric keeps the column a double, as in the printed output).
  mutate(new_col = as.numeric(row_number() <= nets))
Output:
# A tibble: 11 x 3
# Groups: hhid [3]
hhid nets new_col
<int> <int> <dbl>
1 1 3 1
2 1 3 1
3 1 3 1
4 1 3 0
5 2 2 1
6 2 2 1
7 2 2 0
8 3 2 1
9 3 2 1
10 3 2 0
11 3 2 0
Same solution but using data.table instead of dplyr
dt <- structure(list(hhid = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
3L), nets = c(3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), row.names = c(NA,
-11L), class = c("data.frame"))
library(data.table)
# Convert to a data.table by reference, then flag the first `nets` rows
# within each hhid with 1L and the remaining rows with 0L.
setDT(dt)
dt[, new_col := as.integer(seq_len(.N) <= nets), by = "hhid"]
dt
hhid nets new_col
1: 1 3 1
2: 1 3 1
3: 1 3 1
4: 1 3 0
5: 2 2 1
6: 2 2 1
7: 2 2 0
8: 3 2 1
9: 3 2 1
10: 3 2 0
11: 3 2 0

R dplyr: Add column in group_by to count number of males/females

I have this dataframe:
treatment hh_id hh_size sex yob g2000 g2002 g2004 p2000
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Civic Duty 1 2 1 1941 1 1 1 0
2 Civic Duty 1 2 1 1947 1 1 1 0
3 Hawthorne 2 3 1 1951 1 1 1 0
4 Hawthorne 2 3 1 1950 1 1 1 0
5 Hawthorne 2 3 1 1982 1 1 1 0
6 Control 3 3 1 1981 0 0 1 0
7 Control 3 3 1 1959 1 1 1 0
8 Control 3 3 1 1956 1 1 1 0
9 Control 4 2 1 1968 0 0 1 0
10 Control 4 2 1 1967 1 1 1 0
I want to group it by hh_id & treatment and summarize the rest of the columns by their mean.
Except, I also want two other columns to count the number of males and females in each household, where in the "sex" column female == 1 and male == 0.
Here's what I have so far:
households <- df %>%
mutate_if(is.character, factor) %>%
group_by(hh_id, treatment) %>%
summarise_if(is.numeric, mean)
View(households)
which gives me this dataframe:
hh_id treatment hh_size sex yob g2000 g2002 g2004 p2000
<dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 Civic Duty 2 1 1944 1 1 1 0
2 2 Hawthorne 3 1 1961 1 1 1 0
3 3 Control 3 1 1965. 0.667 0.667 1 0
4 4 Control 2 1 1968. 0.5 0.5 1 0
5 5 Control 1 1 1941 1 1 1 0
6 6 Hawthorne 2 1 1947 1 1 1 0
7 7 Control 1 1 1969 1 0 1 0
8 8 Control 2 1 1964 1 1 1 0.5
9 9 Self 2 1 1956 0.5 0.5 1 0
10 10 Control 1 1 1943 1 1 1 0
Instead of summarise_if, use summarise with across (which is much more flexible). Also, the _if/_at/_all variants have been superseded by across
library(dplyr)
# One row per household/treatment: member counts by sex plus the mean of every
# numeric column.
df1 %>%
  group_by(hh_id, treatment) %>%
  summarise(
    # Count members first. Expressions inside summarise() see columns created
    # earlier in the same call, so after across() has run `sex` is the group
    # mean (a single value) and sum(sex == 1) would return 0 or 1 instead of
    # the number of members.
    n_female = sum(sex == 1),
    n_male = sum(sex == 0),
    # across() also picks up the two counts above; the mean of a single value
    # is that value, so they are left unchanged (stored as doubles).
    across(where(is.numeric), mean)
  )
The flexibility is that we can pass multiple sets of columns with different functions to across, and also do a computation on a single column without across
data
df1 <- structure(list(treatment = c("Civic Duty", "Civic Duty", "Hawthorne",
"Hawthorne", "Hawthorne", "Control", "Control", "Control", "Control",
"Control"), hh_id = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L),
hh_size = c(2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L), sex = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), yob = c(1941L, 1947L,
1951L, 1950L, 1982L, 1981L, 1959L, 1956L, 1968L, 1967L),
g2000 = c(1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L), g2002 = c(1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L), g2004 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), p2000 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))

I need to subtract from different baseline value in R [duplicate]

This question already has answers here:
Subtraction within Groups using R
(3 answers)
Closed 2 years ago.
I want to create a new column similar to newvar. I need to subtract the values of group 1 from group 1 at the respective times and then the values of group 2 from group 1 at the respective times. The base values are of group 1 at the respective time.
id group time var newvar
1 1 1 0 0 0
2 1 1 1 1 0
3 1 1 2 5 0
4 1 2 0 1 1
5 1 2 1 2 1
6 1 2 2 3 -2
7 2 1 0 0 0
8 2 1 1 2 0
9 2 1 2 4 0
10 2 2 0 1 1
11 2 2 1 2 0
12 2 2 2 5 1
A dplyr solution:
library(dplyr)
# Within each id/time pair, subtract the first value of `var` in that group
# from every value in the group.
df %>%
  group_by(id, time) %>%
  mutate(result = var - first(var))
# # A tibble: 12 x 6
# # Groups: id, time [6]
# id group time var newvar result
# <int> <int> <int> <int> <int> <int>
# 1 1 1 0 0 0 0
# 2 1 1 1 1 0 0
# 3 1 1 2 5 0 0
# 4 1 2 0 1 1 1
# 5 1 2 1 2 1 1
# 6 1 2 2 3 -2 -2
# 7 2 1 0 0 0 0
# 8 2 1 1 2 0 0
# 9 2 1 2 4 0 0
# 10 2 2 0 1 1 1
# 11 2 2 1 2 0 0
# 12 2 2 2 5 1 1
The corresponding solution with ave() in stats:
# ave() applies the function within each id/time group and returns the values
# in the original row order; transform() appends them as column `result`.
transform(df, result = ave(var, id, time, FUN = function(v) v - v[1]))
Data
df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L),
group = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L),
time = c(0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L),
var = c(0L, 1L, 5L, 1L, 2L, 3L, 0L, 2L, 4L, 1L, 2L, 5L),
newvar = c(0L, 0L, 0L, 1L, 1L, -2L, 0L, 0L, 0L, 1L, 0L, 1L)),
class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
Base R one-liner using higher-order functions:
# Split on list(id, time) rather than paste0(id, time): pasted keys are ambiguous (id = 1, time = 11 and id = 11, time = 1 both give "111"). unsplit() restores the original row order, so the result can be assigned directly as a new column.
with(df, unsplit(Map(function(x) x - x[1], split(var, list(id, time))), list(id, time)))

selecting the last row of a group [duplicate]

This question already has answers here:
Numbering rows within groups in a data frame
(10 answers)
Closed 3 years ago.
I have a data frame.
household person trip loop
1 1 1 1
1 1 2 1
1 1 3 1
1 1 4 2
1 1 5 2
1 2 1 1
1 2 2 1
1 2 3 2
2 1 1 1
2 1 2 1
2 1 3 2
2 1 4 2
for each person in each household I want to change some of index in column trip as below:
whenever loop changes, I want the trip index to start from 1 again.
output
household person trip loop
1 1 1 1
1 1 2 1
1 1 3 1
1 1 1 2
1 1 2 2
1 2 1 1
1 2 2 1
1 2 1 2
2 1 1 1
2 1 2 1
2 1 1 2
2 1 2 2
We can use
library(dplyr)
# Restart the trip counter at 1 for every household/person/loop combination.
df1 %>%
  group_by(household, person, loop) %>%
  mutate(trip = seq_len(n()))
# A tibble: 12 x 4
# Groups: household, person, loop [6]
# household person trip loop
# <int> <int> <int> <int>
# 1 1 1 1 1
# 2 1 1 2 1
# 3 1 1 3 1
# 4 1 1 1 2
# 5 1 1 2 2
# 6 1 2 1 1
# 7 1 2 2 1
# 8 1 2 1 2
# 9 2 1 1 1
#10 2 1 2 1
#11 2 1 1 2
#12 2 1 2 2
data
df1 <- structure(list(household = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L), person = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 1L), trip = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 1L, 2L,
3L, 4L), loop = c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
2L)), class = "data.frame", row.names = c(NA, -12L))
Using data.table :
library(data.table)
# setDT() converts the data frame to a data.table by reference.
setDT(df)
# Number the rows 1..N inside each household/person/loop group.
df[, trip := seq_len(.N), by = c("household", "person", "loop")]

Conditional and grouped summaries by week dplyr

Complicating a previous question, lets say I have the following sock data.
>socks
year drawer week sock_total
1990 1 1 3
1990 1 2 4
1990 1 3 3
1990 1 4 2
1990 1 5 4
1990 2 1 1
1990 2 2 1
1990 2 3 1
1990 2 4 1
1990 2 5 2
1990 3 1 3
1990 3 2 4
1990 3 3 4
1990 3 4 4
1990 3 5 4
1991 1 1 4
1991 1 2 3
1991 1 3 2
1991 1 4 2
1991 1 5 3
1991 2 1 1
1991 2 2 3
1991 2 3 4
1991 2 4 4
1991 2 5 3
1991 3 1 2
1991 3 2 3
1991 3 3 3
1991 3 4 2
1991 3 5 3
How can I use summarise in dplyr to create a new variable
growth which equals 1 if there was an increase in each week between the first year and the second year, else 0. The data should look like this
>socks
drawer week growth
1 1 1
1 2 0
1 3 0
1 4 0
1 5 0
2 1 0
2 2 1
2 3 1
2 4 1
2 5 1
3 1 0
3 2 0
3 3 0
3 4 0
3 5 0
Also, how would you handle data where a drawer did not have a corresponding week in one of the years. aka add NA if a week was missing.
The answer is very similar to the previous one, but grouped by drawer and week (the comment by @eipi10 is also a great option). You can handle a missing year for a specific drawer and week by indexing with [1] after the subset, which turns a zero-length result into NA:
For instance:
df %>%
  group_by(drawer, week) %>%
  # `[1]` after each subset yields NA when that year has no row for the group,
  # so growth becomes NA for a drawer/week that is missing in either year.
  summarise(growth = as.integer(
    sock_total[year == 1991][1] - sock_total[year == 1990][1] > 0
  ))
# A tibble: 15 x 3
# Groups: drawer [?]
# drawer week growth
# <int> <int> <int>
# 1 1 1 1
# 2 1 2 0
# 3 1 3 0
# 4 1 4 0
# 5 1 5 0
# 6 2 1 0
# 7 2 2 1
# 8 2 3 1
# 9 2 4 1
#10 2 5 1
#11 3 1 0
#12 3 2 0
#13 3 3 0
#14 3 4 0
#15 3 5 NA
The data has left out the year 1991 for drawer 3 and week 5:
structure(list(year = c(1990L, 1990L, 1990L, 1990L, 1990L, 1990L,
1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L, 1990L,
1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L, 1991L,
1991L, 1991L, 1991L, 1991L, 1991L), drawer = c(1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), week = c(1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L), sock_total = c(3L, 4L, 3L,
2L, 4L, 1L, 1L, 1L, 1L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 3L, 2L, 2L,
3L, 1L, 3L, 4L, 4L, 3L, 2L, 3L, 3L, 2L)), .Names = c("year",
"drawer", "week", "sock_total"), class = "data.frame", row.names = c(NA,
-29L))
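Or you can try this without complete(). Note that it relies on the rows within each drawer/week group being ordered by year, and that it returns 0 (not NA) when a week is missing in one of the years.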
df %>%
  group_by(drawer, week) %>%
  # growth is 1 only when the group has more than one row and the second
  # sock_total is larger than the first; single-row groups give 0.
  summarise(growth = as.numeric(n() > 1 && sock_total[1] - sock_total[2] < 0))
# A tibble: 15 x 3
# Groups: drawer [?]
drawer week growth
<int> <int> <dbl>
1 1 1 1
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 2 1 0
7 2 2 1
8 2 3 1
9 2 4 1
10 2 5 1
11 3 1 0
12 3 2 0
13 3 3 0
14 3 4 0
15 3 5 0

Resources