Related
I am trying to generate a new variable to identify 'single parents' in a household, based on a group identifier. If there is a 'Child' in a group without both a 'Head' and "Spouse', I would like the variable to take the value of 1. I have tried using dplyr but am unable to arrive at the solution.
relation<-c("Head","Spouse","Child","Head","Spouse","Head","Child")
group<-c(1,1,1,2,2,3,3)
my_data<-as.data.frame(cbind(group,relation))
my_data %>%
group_by(group) %>%
mutate(single_parent = case_when(relation %in% "Child" & !(relation %in% "Head" & relation %in% "Spouse")~1))
# desired output
my_data$single_parent<-c(0,0,0,0,0,1,1)
Thank you for your help.
We could do
library(dplyr)
my_data <- my_data %>%
group_by(group) %>%
mutate(single_parent = +((!all(c("Head", "Spouse") %in% relation &
'Child' %in% relation)) & 'Child' %in% relation)) %>%
ungroup
-output
my_data
# A tibble: 7 × 3
group relation single_parent
<dbl> <chr> <int>
1 1 Head 0
2 1 Spouse 0
3 1 Child 0
4 2 Head 0
5 2 Spouse 0
6 3 Head 1
7 3 Child 1
data
my_data <- data.frame(group, relation)
Here is another tidyverse option:
library(tidyverse)
my_data %>%
group_by(group) %>%
mutate(single_parent = ifelse(relation == "Child" & sum(n()) == 2, 1, NA)) %>%
fill(single_parent, .direction = "downup", 0) %>%
mutate(single_parent = replace_na(single_parent, 0))
Or another option using a combination of base R and tidyverse using table:
data.frame(group = unique(my_data$group), single_parent = +(table(my_data)[,1] == 1 & rowSums(table(my_data)[,-1]) == 1)) %>%
left_join(my_data, ., by = "group")
Output
group relation single_parent
<chr> <chr> <dbl>
1 1 Head 0
2 1 Spouse 0
3 1 Child 0
4 2 Head 0
5 2 Spouse 0
6 3 Head 1
7 3 Child 1
Given a larger data frame with around 300k+ rows and 14 columns in the following form:
df <- data.frame(team_id = c(rep(1,10),rep(2,10),rep(3,10),rep(4,10),rep(5,10)),
year = rep(c(1954:1963), 5), members= c(0,0,0,1,1,1,2,0,0,0,0,0,2,1,1,1,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0),
size = c(rep(60,8),50,50,rep(40,7),50,50,70,rep(30,10),rep(99,6),110,101,101,101,rep(80,9),66) )
The aim is to create a new vector containing the difference in size, for each team, once all members left (members change from 2 or 1 to 0) subtracting the size of the year of the last departure of players from the next different size.
The direction of change should be shown so absolute values are not necessary.
What I achieved so far is:
df2 <- df %>% arrange(team_id,year) %>%
group_by(team_id) %>%
mutate(sizediff = if_else(members == 1 & lead(members) == 0 | members == 2 & lead(members) == 0,1,0, missing = 0) )
However, instead of the values 1 in the sizediff vector I want to have the difference to future size. Maybe changes from long to wide format or a conditional re-arrangement the year vector could help but I am stuck. What I want to achieve looks like:
aim <- data.frame(team_id = c(rep(1,10),rep(2,10),rep(3,10),rep(4,10),rep(5,10)),
year = rep(c(1954:1963), 5), members= c(0,0,0,1,1,1,2,0,0,0, 0,0,2,1,1,1,0,0,0,0, 1,1,1,1,1,1,1,1,1,1, 0,1,1,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,0,0 ) ,
size = c(57,rep(60,7),50,50,rep(40,7),50,50,70,rep(30,10),rep(99,6),110,101,101,101,88,rep(80,8),66),
sizediff = c(rep(0,6),-10,rep(0,3),rep(0,5),10,rep(0,4),rep(0,10),rep(0,3),11,rep(0,6),rep(0,7),-14,rep(0,2)) )
is this something your are looking for?
df %>%
arrange(team_id, year) %>%
mutate(diff = if_else((members> 0 & dplyr::lead(members, n=1)==0), size, 0)) %>%
group_by(team_id) %>%
mutate(diff = ifelse(diff>0, dplyr::last(size)-size, NA))
Try this custom approach :
library(dplyr)
df %>%
group_by(team_id) %>%
mutate(sizediff = {
sizediff = rep(0, n())
inds <- which(members %in% c(1, 2) & lead(members) == 0)[1]
sizediff[inds] <- size[which(row_number() > inds & size != size[inds])[1]] - size[inds]
sizediff
}) -> result
result
# team_id year members size sizediff
# <dbl> <int> <dbl> <dbl> <dbl>
# 1 1 1954 0 60 0
# 2 1 1955 0 60 0
# 3 1 1956 0 60 0
# 4 1 1957 1 60 0
# 5 1 1958 1 60 0
# 6 1 1959 1 60 0
# 7 1 1960 2 60 -10
# 8 1 1961 0 60 0
# 9 1 1962 0 50 0
#10 1 1963 0 50 0
# … with 40 more rows
We first initialise sizediff to 0, inds is used to find where members left. We calculate the difference in size from the next value which changes and update inds position.
shift = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3)
count =c(1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7)
test <- cbind(shift,count)
So I am trying to mark every last row for every shift (so rows with count = c(8,10,7)with a binary 1 and every other row with 0. Right now I am thinking maybe that is possible with a left join but I am not quite sure. I would prefer not working with loops but rather use some techniques from dplyr. Thanks guys!
Assuming that you want to add a new 0/1 column last that contains a 1 in the last row of each shift and that the shifts are contiguous, here are two base R approaches:
transform(test, last = ave(count, shift, FUN = function(x) x == max(x)))
transform(test, last = +!duplicated(shift, fromLast = TRUE))
or with dplyr use mutate:
test %>%
as.data.frame %>%
group_by(shift) %>%
mutate(last = +(1:n() == n())) %>%
ungroup
test %>%
as.data.frame %>%
mutate(last = +!duplicated(shift, fromLast = TRUE))
Try this one
library(dplyr)
test %>%
as_tibble() %>%
group_by(shift) %>%
mutate(is_last = ifelse( row_number() == max(row_number()), 1, 0)) %>%
ungroup()
# A tibble: 25 x 3
shift count is_last
<dbl> <dbl> <dbl>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 0
7 1 7 0
8 1 8 1
9 2 1 0
10 2 2 0
# … with 15 more rows
I have data that looks like this:
library(dplyr)
d<-data.frame(ID=c(1,1,2,3,3,4), Quality=c("Good", "Bad", "Ugly", "Good", "Good", "Ugly"), Area=c("East", "North", "North", "South", "East", "North"))
What I'd like to do is create one new column for each unique value in Quality and populate it with whether the ID matches that value and then aggregate the ID's. I want to do the same for Area.
This is what I have for when Quality == Good:
d$Quality.Good <- 0
d$Quality.Good[d$Quality=="Good"] <- 1
e <- d %>%
group_by(ID) %>%
summarise(n=n(), MAX.Quality.Good = max(Quality.Good))
e
Output
A tibble: 4 x 3
ID MAX.Quality.Good
<dbl> <dbl>
1 1 1
2 2 0
3 3 1
4 4 0
Is it possible to build a function that will loop through each character column and build an indicator column for Good, Bad, Ugly, North, East, South instead of copy pasting the above many more times?
Here's where I'm stuck:
library(stringr)
#vector of each Quality
e <-d %>%
group_by(Quality) %>%
summarise(n=n()) %>%
select(Quality)
e<-as.data.frame(e)
#create new column names
f <- str_c(names(e),".",e[,1])
#initialize list of new columns
d[f] <- 0
#I'm stuck after this...
Thank you!
We can do this in base R using table by replicating the 'ID' column by the number of columns of dataset minus 1, and pasteing the column names with the unlisted values (excluding the 'ID' column)
table(rep(d$ID, 2), paste0(names(d)[-1][col(d[-1])], unlist(d[-1])))
# AreaEast AreaNorth AreaSouth QualityBad QualityGood QualityUgly
# 1 1 1 0 1 1 0
# 2 0 1 0 0 0 1
# 3 1 0 1 0 2 0
# 4 0 1 0 0 0 1
or with tidyverse, gather into 'long' format, unite the 'key', 'val' columns to a single column, get the distinct rows, and spread into 'wide' format after creating a column of 1s.
library(tidyverse)
gather(d, key, val, -ID) %>%
unite(kv, key, val) %>%
distinct %>%
mutate(n = 1) %>%
spread(kv, n, fill = 0)
#ID Area_East Area_North Area_South Quality_Bad Quality_Good Quality_Ugly
#1 1 1 1 0 1 1 0
#2 2 0 1 0 0 0 1
#3 3 1 0 1 0 1 0
#4 4 0 1 0 0 0 1
1) Base R Create the model matrix for each column (using function make_mm) and bind them together as a data frame m. Finally aggregate on ID. No packages are used.
make_mm <- function(nm, data) model.matrix(~ . - 1, data[nm])
m <- do.call("data.frame", lapply(names(d)[-1], make_mm, d))
with(d, aggregate(. ~ ID, m, max))
giving:
ID QualityBad QualityGood QualityUgly AreaEast AreaNorth AreaSouth
1 1 1 1 0 1 1 0
2 2 0 0 1 0 1 0
3 3 0 1 0 1 0 1
4 4 0 0 1 0 1 0
2) dplyr/purrr This could alternately be written as the following which is close to the code in the question but generalizes to all required columns. Note that here we make model data frames using make_md rather than making model matrices with make_mm. Also note that the dot in group_by(m, ID = .$ID) refers to d and not to m.
library(dplyr)
library(purrr)
make_md <- function(nm, data) {
data %>%
select(nm) %>%
model.matrix(~ . - 1, .) %>%
as.data.frame
}
d %>% {
m <- map_dfc(names(.)[-1], make_md, .)
group_by(m, ID = .$ID) %>%
summarize_all(max) %>%
ungroup
}
I have the following data frame
df <- data.frame(Gender = c(rep(c("M","F"),each=4)),
DiffA=c(1,1,-1,-1,1,1,1,-1),
DiffB=c(1,-1,1,-1,1,1,1,-1))
I would like to create 2 new variables which summarize for each gender i)the number of rows for which DiffA and DiffB are positive and ii) the number of rows for which DiffA and DiffB are negative in order to obtain:
df2 <- data.frame(Gender = c("M","F"),
Diff_Pos=c(1,3),
Diff_Neg=c(1,1))
I have failed to combine the summary function from dplyr n() which returns the count of rows and the required logical statement. Thanks in advance
I would consider doing
library(tidyr)
df %>% filter(DiffA == DiffB) %>% count(Gender, DiffA) %>% spread(DiffA, n)
Gender -1 1
# (fctr) (int) (int)
# 1 F 1 3
# 2 M 1 1
The analogous data.table code is
dcast(df[DiffA == DiffB, .N, by=.(Gender, DiffA)], Gender ~ DiffA)
# Gender -1 1
# 1: F 1 3
# 2: M 1 1
If your real data goes beyond -1 and 1, wrap the relevant columns in sign().
Here is a base R option
with(subset(df, DiffA==DiffB), table(Gender, DiffA))
# DiffA
#Gender -1 1
# F 1 3
# M 1 1
This should work:
df %>%
dplyr::mutate(
Diff_Pos = DiffA > 0 & DiffB > 0,
Diff_Neg = DiffA < 0 & DiffB < 0) %>%
dplyr::group_by(Gender) %>%
dplyr::summarise(
Diff_Pos = sum(Diff_Pos),
Diff_Neg = sum(Diff_Neg))