Dplyr solution using slice and group - r

Hi, here is my reproducible example:
a=c(1,2,3,4,5,6)
a1=c(15,17,17,16,14,15)
a2=c(0,0,1,1,1,0)
b=c(1,0,NA,NA,0,NA)
c=c(2010,2010,2010,2010,2010,2010)
d=c(1,1,0,1,0,NA)
e=c(2012,2012,2012,2012,2012,2012)
f=c(1,0,0,0,0,NA)
g=c(2014,2014,2014,2014,2014,2014)
h=c(1,1,0,1,0,NA)
i=c(2010,2012,2014,2012,2014,2014)
mydata = data.frame(a,a1,a2,b,c,d,e,f,g,h,i)
names(mydata) = c("id","age","gender","drop1","year1","drop2","year2","drop3","year3","drop4","year4")
mydata2 <- reshape(mydata, direction = "long",
                   varying = list(c("year1", "year2", "year3", "year4"),
                                  c("drop1", "drop2", "drop3", "drop4")),
                   v.names = c("year", "drop"),
                   idvar = "X", timevar = "Year", times = 1:4)
library(dplyr)
x1 = mydata2 %>%
  group_by(id) %>%
  slice(which(drop == 1)[1])
x2 = mydata2 %>%
  group_by(id) %>%
  slice(which(drop == 0)[1])
I have a tall data set, mydata2, in which every ID has many rows.
I want to make a new data set, x, in which every ID has exactly one row, based on whether they ever dropped.
For the first of drop1, drop2, drop3, drop4 that equals 1, I want to take the year of that observation and put it in a variable dropYEAR. If none of drop1 to drop4 equals 1, I want to put the last data point of year1 to year4 in dropYEAR instead.
Ultimately every ID should have 1 row, and I want to create 2 new columns: didDROP equals 1 if the ID ever dropped and 0 if it never dropped; dropYEAR equals the year of the drop if didDROP equals 1, or the last reported year1 to year4 if the ID never dropped. My dplyr attempt above gives only part of what I want, because it gets rid of IDs whose drop values never equal 1.
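(For reference, the reason x1 and x2 lose IDs: when a group has no matching row, which() returns integer(0), so taking its first element gives NA, and slice(NA) keeps no rows for that group. A quick base R check:)
which(c(0, 0, NA) == 1)      # integer(0)
which(c(0, 0, NA) == 1)[1]   # NA -> slice(NA) drops the whole group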
This is the desired output; thank you to @Wimpel.

First run mydata2 %>% arrange(id) to understand the dataset. Then, using dplyr's first() and last(), we can pull the first year where drop == 1, or the last year with a non-missing drop in case drop never equals 1. case_when() is used to compute didDROP, as it deals gracefully with NAs.
library(dplyr)
mydata2 %>%
  group_by(id) %>%
  mutate(dropY = first(year[!is.na(drop) & drop == 1]),
         dropYEAR = if_else(is.na(dropY), last(year[!is.na(drop)]), dropY)) %>%
  slice(1)
Update:
mydata2 %>%
  group_by(id) %>%
  mutate(dropY = first(year[!is.na(drop) & drop == 1]),
         dropYEAR = if_else(is.na(dropY), last(year), dropY),
         didDROP = case_when(any(drop == 1) ~ 1,  # return 1 if there is any drop == 1, otherwise 0
                             TRUE ~ 0)) %>%
  select(-dropY) %>%
  slice(1)
# A tibble: 6 x 9
# Groups:   id [6]
     id   age gender  Year  year  drop     X dropYEAR didDROP
  <dbl> <dbl>  <dbl> <int> <dbl> <dbl> <int>    <dbl>   <dbl>
1     1    15      0     1  2010     1     1     2010       1
2     2    17      0     1  2010     0     2     2012       1
3     3    17      1     1  2010    NA     3     2014       0
4     4    16      1     1  2010    NA     4     2012       1
5     5    14      1     1  2010     0     5     2014       0
6     6    15      0     1  2010    NA     6     2014       0
I hope this is what you're looking for.
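A side note on the design: since dropYEAR and didDROP are constant within each id, summarise() can replace the mutate() + slice(1) pattern. A minimal sketch, assuming age and gender are constant within id:
library(dplyr)
mydata2 %>%
  group_by(id, age, gender) %>%
  summarise(
    didDROP  = as.numeric(any(drop == 1, na.rm = TRUE)),  # 1 if the id ever dropped
    dropYEAR = if (any(drop == 1, na.rm = TRUE))
                 first(year[!is.na(drop) & drop == 1])    # year of the first drop
               else last(year),                           # otherwise the last reported year
    .groups = "drop"
  )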

You can sort by id, then by drop (descending), then by year, with the year's sort direction conditional on dropping or not:
library(dplyr)
mydata2 %>%
  mutate(drop = ifelse(is.na(drop), 0, drop)) %>%
  # year * (2*drop - 1) is +year for droppers and -year for non-droppers,
  # so slice(1) picks the earliest drop year or, failing that, the latest year
  arrange(id, -drop, year * (2 * drop - 1)) %>%
  group_by(id) %>%
  slice(1) %>%
  select(id, age, gender, didDROP = drop, dropYEAR = year)
# A tibble: 6 x 5
# Groups:   id [6]
     id   age gender didDROP dropYEAR
  <dbl> <dbl>  <dbl>   <dbl>    <dbl>
1     1    15      0       1     2010
2     2    17      0       1     2012
3     3    17      1       0     2014
4     4    16      1       1     2012
5     5    14      1       0     2014
6     6    15      0       0     2014
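To see how that third sort key behaves, check the ordering on hypothetical values: +year when drop == 1, -year when drop == 0.
drop <- c(1, 1, 0, 0)
year <- c(2012, 2010, 2012, 2014)
order(-drop, year * (2 * drop - 1))
# [1] 2 1 4 3 -- earliest drop year first; among non-droppers, latest year first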

Related

How to create a column with a value of 1 if another column goes past a threshold from one year to another in panel data , 0 otherwise in R

I want to create a column that is 1 if the value went over a threshold (10, for example) in a year where the previous year did not go over the threshold, and 0 otherwise, as shown in the last column of the sample image. This should be done within each unique item across all the years.
My intuition is to use dplyr and case_when, but I don't know how to write the case_when part:
# my attempt (pseudocode, not valid R):
df <- df %>%
  group_by(unique, year) %>%
  mutate(value_turned = case_when(value10 == 1 in year but not in previous year ~ 1, otherwise 0))
library(dplyr)
df <- data.frame(unique = c(rep("10ab", 3), rep("12cc", 3)),
                 year = rep(2017:2019, 2),
                 value10 = c(0, 1, 1, 1, 1, 1))
df %>%
  group_by(unique) %>%
  arrange(year) %>%
  mutate(value_turned = as.integer(value10 == 1 & lag(value10, default = 1) == 0)) %>%
  ungroup() %>%
  arrange(unique)
#> # A tibble: 6 × 4
#>   unique  year value10 value_turned
#>   <chr>  <int>   <dbl>        <int>
#> 1 10ab    2017       0            0
#> 2 10ab    2018       1            1
#> 3 10ab    2019       1            0
#> 4 12cc    2017       1            0
#> 5 12cc    2018       1            0
#> 6 12cc    2019       1            0
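Since the question asked about case_when() specifically, the same logic can be phrased that way (a sketch; as.integer() above is the more compact idiom):
library(dplyr)
df %>%
  group_by(unique) %>%
  arrange(year, .by_group = TRUE) %>%
  mutate(value_turned = case_when(
    value10 == 1 & lag(value10, default = 1) == 0 ~ 1,  # over the threshold now, not in the previous year
    TRUE ~ 0                                            # otherwise (default = 1 keeps the first year at 0)
  )) %>%
  ungroup()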

How to count repetitions in a dataframe column in R using dplyr?

I have shrunk a df and kept only two columns, which are airport names (origin and destination):
   Origin Destination
   <chr>  <chr>
 1 LPPD   LEMD
 2 DAAE   LFML
 3 EDDH   UUEE
 4 LFLL   DAAS
 5 LFPO   LFSL
 6 UMKK   ULLI
 7 LFPO   LFBA
 8 LFPG   EDDN
 9 LFLL   LFRN
10 LFPG   EDDW
# … with more rows
Airport names are repeated in both columns. I would like to summarise the repeated airport names and output the following:
Airports totalMovements takeoffs landings
Airports are the airport names (each appearing once) that appear in either column. totalMovements is the number of times an airport name appears in the Origin column plus the number of times it appears in the Destination column. takeoffs is the number of times an airport name appears in the Origin column, and landings is the number of times it appears in the Destination column.
We can use data.table:
library(data.table)
melt(setDT(df), measure = 1:2)[, .(.N, sum(variable == 'Origin'),
                                   sum(variable == 'Destination')), value]
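The same call with named output columns, for readability (a sketch):
library(data.table)
melt(setDT(df), measure = 1:2)[
  , .(totalMovements = .N,
      takeoffs = sum(variable == "Origin"),
      landings = sum(variable == "Destination")),
  by = .(Airports = value)]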
You could try:
library(dplyr)
library(tidyr)
pivot_longer(df, everything()) %>%
  group_by(Airports = value) %>%
  summarise(
    totalMovements = n(),
    takeoffs = sum(name == 'Origin'),
    landings = sum(name == 'Destination')
  )
Output (based on the rows shown in your question):
# A tibble: 17 x 4
   Airports totalMovements takeoffs landings
   <fct>             <int>    <int>    <int>
 1 DAAE                  1        1        0
 2 EDDH                  1        1        0
 3 LFLL                  2        2        0
 4 LFPG                  2        2        0
 5 LFPO                  2        2        0
 6 LPPD                  1        1        0
 7 UMKK                  1        1        0
 8 DAAS                  1        0        1
 9 EDDN                  1        0        1
10 EDDW                  1        0        1
11 LEMD                  1        0        1
12 LFBA                  1        0        1
13 LFML                  1        0        1
14 LFRN                  1        0        1
15 LFSL                  1        0        1
16 ULLI                  1        0        1
17 UUEE                  1        0        1
If you'd like to stick to only using dplyr, you can also emulate the behaviour of pivot_longer by:
library(dplyr)
bind_rows(
df %>% transmute(Airports = Origin, name = 'Origin'),
df %>% transmute(Airports = Destination, name = 'Destination')
) %>%
group_by(Airports) %>%
summarise(
totalMovements = n(),
takeoffs = sum(name == 'Origin'),
landings = sum(name == 'Destination')
)
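A join-based dplyr alternative is to count each column separately and combine the counts (a sketch; airports that only ever appear in one column get a zero for the other):
library(dplyr)
full_join(
  count(df, Airports = Origin, name = "takeoffs"),
  count(df, Airports = Destination, name = "landings"),
  by = "Airports"
) %>%
  mutate(across(c(takeoffs, landings), ~ coalesce(.x, 0L)),  # missing counts become 0
         totalMovements = takeoffs + landings)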

Removing rows with dates conditional on specific IDs

Basically, I have a data frame that contains IDs, Dates, VolumeX, and VolumeY.
I want to split the VolumeX data into before and after the max date of VolumeY, separately for each ID.
Ex.
df looks like (with many different IDs) :
ID Date VolX VolY
1 2018 - 02- 01 5 -
1 2018 - 03- 01 6 -
1 2018 - 08- 01 3 -
1 2018 - 10- 01 1 -
1 2017 - 02- 01 - 1
1 2014 - 10- 01 - 0
1 2014 - 11- 01 - 5
1 2018 - 02- 01 - 0
So, for the max date of VolY for every ID, I'd like to split the data frame in two, before and after that date for each ID, so as to sum VolX before and after the VolY max date.
It seems like this needs some kind of nested for loop. I am able to extract max dates and total volume; I'm just having a hard time selecting out the ID-specific subsets.
Is this what you're after?
library(dplyr)
df %>%
  replace(., . == "-", NA) %>%
  mutate(Date = as.Date(gsub("\\s", "", Date))) %>%
  mutate_at(vars(VolX, VolY), as.numeric) %>%
  group_by(ID, Before_After = cumsum(c(0, lag(+(Date == max(Date)))[-1]))) %>%
  mutate(
    sum_Volx = sum(VolX[Date != max(Date)], na.rm = TRUE),
    sum_VolY = sum(VolY[Date != max(Date)], na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(-Before_After)
Output:
# A tibble: 8 x 6
     ID Date        VolX  VolY sum_Volx sum_VolY
  <int> <date>     <dbl> <dbl>    <dbl>    <dbl>
1     1 2018-02-01     5    NA       14        0
2     1 2018-03-01     6    NA       14        0
3     1 2018-08-01     3    NA       14        0
4     1 2018-10-01     1    NA       14        0
5     1 2017-02-01    NA     1        0        6
6     1 2014-10-01    NA     0        0        6
7     1 2014-11-01    NA     5        0        6
8     1 2018-02-01    NA     0        0        6
You could also make separate columns for before/after, like this:
df %>%
  replace(., . == "-", NA) %>%
  mutate_at(vars(VolX, VolY), as.numeric) %>%
  group_by(ID) %>%
  mutate(
    Date = as.Date(gsub("\\s", "", Date)),
    Before_After = cumsum(c(0, lag(+(Date == max(Date)))[-1])),
    sum_Volx_Before = sum(VolX[Date != max(Date) & Before_After == 0], na.rm = TRUE),
    sum_VolY_Before = sum(VolY[Date != max(Date) & Before_After == 0], na.rm = TRUE),
    sum_Volx_After = sum(VolX[Date != max(Date) & Before_After == 1], na.rm = TRUE),
    sum_VolY_After = sum(VolY[Date != max(Date) & Before_After == 1], na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(-Before_After)
Output:
# A tibble: 8 x 8
     ID Date        VolX  VolY sum_Volx_Before sum_VolY_Before sum_Volx_After sum_VolY_After
  <int> <date>     <dbl> <dbl>           <dbl>           <dbl>          <dbl>          <dbl>
1     1 2018-02-01     5    NA              14               0              0              6
2     1 2018-03-01     6    NA              14               0              0              6
3     1 2018-08-01     3    NA              14               0              0              6
4     1 2018-10-01     1    NA              14               0              0              6
5     1 2017-02-01    NA     1              14               0              0              6
6     1 2014-10-01    NA     0              14               0              0              6
7     1 2014-11-01    NA     5              14               0              0              6
8     1 2018-02-01    NA     0              14               0              0              6
On the other hand, you could just create two separate data frames in your environment, named Before and After, that exclude the maximum date and summarise the information, like below:
df_list <- df %>%
  replace(., . == "-", NA) %>%
  mutate_at(vars(VolX, VolY), as.numeric) %>%
  group_by(ID) %>%
  mutate(
    Date = as.Date(gsub("\\s", "", Date)),
    Before_After = cumsum(c(0, lag(+(Date == max(Date)))[-1]))
  ) %>%
  filter(!Date == max(Date)) %>%
  group_by(ID, Before_After) %>%
  summarise(
    sum_VolX = sum(VolX, na.rm = TRUE),
    sum_VolY = sum(VolY, na.rm = TRUE)
  ) %>%
  split(., .$Before_After)
names(df_list) <- c("Before", "After")
list2env(df_list, envir = .GlobalEnv)
Let's go through it step by step:
- first we replace the - signs with NA (not strictly needed, just to avoid errors later on);
- then we transform VolX and VolY into numeric;
- we group by ID so that everything is applied to each group separately;
- we transform Date into a proper Date format;
- then comes the crucial part: we calculate the flag column Before_After, where we first flag with 1 the row whose previous row held the maximum date, and then take a cumulative sum of that flag, so that everything before this event is 0 and everything after is 1;
- we filter out the maximum Date;
- we group again, by ID and the Before_After indicator;
- we shrink the data frame with summarise so that it only contains the sums of the respective columns;
- we split the data frame in two on the Before_After column;
- as the result is a list of two data frames, we assign names to each and then move them into the global environment with list2env.
Output:
Before
# A tibble: 1 x 4
# Groups:   ID [1]
     ID Before_After sum_VolX sum_VolY
  <int>        <dbl>    <dbl>    <dbl>
1     1            0       14        0
After
# A tibble: 1 x 4
# Groups:   ID [1]
     ID Before_After sum_VolX sum_VolY
  <int>        <dbl>    <dbl>    <dbl>
1     1            1        0        6
Note that 0 corresponds to Before and 1 to After.
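For comparison, here is a minimal sketch that phrases the stated goal directly, taking each ID's last date with a VolY value as the cutoff. It assumes Date is already parsed and the dashes already converted to numeric NAs, and it sums VolX strictly before and after the cutoff:
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(cutoff = max(Date[!is.na(VolY)])) %>%                 # max VolY date per ID
  summarise(
    sum_VolX_before = sum(VolX[Date < cutoff], na.rm = TRUE),  # VolX strictly before the cutoff
    sum_VolX_after  = sum(VolX[Date > cutoff], na.rm = TRUE),  # VolX strictly after the cutoff
    .groups = "drop"
  )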

Identify repeats in R

Hi, I have a data frame that looks like this:
test <- data.frame(Year = c("2015", "2015", "2016", "2017", "2018"),
                   UserID = c(1, 2, 1, 1, 3),
                   PurchaseValue = c(1, 5, 3, 3, 5))
where "Year" is the time of purchase and "UserID" is the buyer.
I want to create a variable "RepeatedPurchase" that is 1 if a purchase is a repeated purchase and 0 otherwise (if it is the user's only or first purchase).
Thus, the desired output would have RepeatedPurchase equal to 0, 0, 1, 1, 0 for the rows above.
I tried to achieve this by first creating a variable "Se" that tells whether a purchase is the 1st/2nd/3rd... purchase of that buyer, but my code didn't work. What's wrong with my code, and is there a better way to identify repeated purchases? Thanks!
library(dplyr)
# note: the data frame above is named test, not df
df %>% arrange(UserID, Year) %>% group_by(UserID) %>% mutate(Se = seq(n())) %>% ungroup()
You do not need dplyr. You can use duplicated(), as follows:
test <- data.frame(Year = c("2015", "2015", "2016", "2017", "2018"),
                   UserID = c(1, 2, 1, 1, 3),
                   PurchaseValue = c(1, 5, 3, 3, 5))
repeated <- duplicated(test$UserID)
# [1] FALSE FALSE  TRUE  TRUE FALSE
test$RepeatedPurchase <- ifelse(repeated == TRUE, 1, 0)
test
#   Year UserID PurchaseValue RepeatedPurchase
# 1 2015      1             1                0
# 2 2015      2             5                0
# 3 2016      1             3                1
# 4 2017      1             3                1
# 5 2018      3             5                0
Cheers!
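Since ifelse(repeated == TRUE, 1, 0) is just an integer conversion of a logical vector, the whole thing can be a one-liner:
test$RepeatedPurchase <- as.integer(duplicated(test$UserID))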
We can also number the purchases within each UserID and assign 1 to every purchase after the first:
test %>% group_by(UserID) %>% mutate(RepeatedPurchase = ifelse(1:n() > 1, 1, 0))
# A tibble: 5 x 4
# Groups:   UserID [3]
  Year  UserID PurchaseValue RepeatedPurchase
  <fct>  <dbl>         <dbl>            <dbl>
1 2015       1             1                0
2 2015       2             5                0
3 2016       1             3                1
4 2017       1             3                1
5 2018       3             5                0
Here is another dplyr solution. We can group by UserID and PurchaseValue, then use as.integer(n() > 1) to flag combinations that occur more than once. Note that this flags repeated purchases of the same value, which happens to match the desired output for this data:
library(dplyr)
test2 <- test %>%
group_by(UserID, PurchaseValue) %>%
mutate(RepeatedPurchase = as.integer(n() > 1)) %>%
ungroup()
test2
# # A tibble: 5 x 4
#   Year  UserID PurchaseValue RepeatedPurchase
#   <fct>  <dbl>         <dbl>            <int>
# 1 2015       1             1                0
# 2 2015       2             5                0
# 3 2016       1             3                1
# 4 2017       1             3                1
# 5 2018       3             5                0

Keep only consecutive observations

As said in the title, I have a data.frame like below:
df <- data.frame(id = c('1', '1', '1', '1', '1', '1', '1'),
                 time = c('1998', '2000', '2001', '2002', '2003', '2004', '2007'))
df
  id time
1  1 1998
2  1 2000
3  1 2001
4  1 2002
5  1 2003
6  1 2004
7  1 2007
There are other cases with shorter or longer time windows than this; this one is just for illustration's sake.
I want to do two things with this data set. First, find all ids that have at least five consecutive observations; this can be done by following existing solutions. Second, keep only the observations in the runs of at least five consecutive rows for the ids selected in the first step. The ideal result would be:
df
  id time
1  1 2000
2  1 2001
3  1 2002
4  1 2003
5  1 2004
I could write a complex function using a for loop and diff(), but that could be very time consuming, both to write and to run on a bigger data set with lots of ids. That doesn't seem like the R way, and I believe there should be a one- or two-line solution.
Does anyone know how to achieve this? Your time and knowledge would be deeply appreciated. Thanks in advance.
You can use dplyr to group by id and by runs of consecutive time, and filter out groups with fewer than 5 entries:
# read data with stringsAsFactors = FALSE
df <- data.frame(id = c('1', '1', '1', '1', '1', '1', '1'),
                 time = c('1998', '2000', '2001', '2002', '2003', '2004', '2007'),
                 stringsAsFactors = FALSE)
library(dplyr)
df %>%
  mutate(time = as.integer(time)) %>%
  group_by(id, grp = cumsum(c(1, diff(time) != 1))) %>%
  filter(n() >= 5)
which gives
# A tibble: 5 x 3
# Groups:   id, grp [1]
  id     time   grp
  <chr> <int> <dbl>
1 1      2000     2
2 1      2001     2
3 1      2002     2
4 1      2003     2
5 1      2004     2
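To see how the grouping key works, apply it to the example years: diff(time) != 1 marks the start of each new run, and the cumulative sum gives every run its own id.
time <- c(1998, 2000, 2001, 2002, 2003, 2004, 2007)
cumsum(c(1, diff(time) != 1))
# [1] 1 2 2 2 2 2 3 -- consecutive years share a group id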
Similar to @Sotos' answer, this solution instead uses seqle() (from cgwtools) as the grouping variable:
library(dplyr)
library(cgwtools)
df %>%
  mutate(time = as.numeric(time)) %>%
  group_by(id, consec = rep(seqle(time)$lengths, seqle(time)$lengths)) %>%
  filter(consec >= 5)
Result:
# A tibble: 5 x 3
# Groups:   id, consec [1]
  id     time consec
  <chr> <dbl>  <int>
1 1      2000      5
2 1      2001      5
3 1      2002      5
4 1      2003      5
5 1      2004      5
To remove grouping variable:
df %>%
  mutate(time = as.numeric(time)) %>%
  group_by(id, consec = rep(seqle(time)$lengths, seqle(time)$lengths)) %>%
  filter(consec >= 5) %>%
  ungroup() %>%
  select(-consec)
Result:
# A tibble: 5 x 2
  id     time
  <chr> <dbl>
1 1      2000
2 1      2001
3 1      2002
4 1      2003
5 1      2004
Data:
df <- data.frame(id = c('1', '1', '1', '1', '1', '1', '1'),
                 time = c('1998', '2000', '2001', '2002', '2003', '2004', '2007'),
                 stringsAsFactors = FALSE)
Try this on your data:
library(magrittr)  # for the %>% pipe
df[,] <- lapply(df, function(x) type.convert(as.character(x), as.is = TRUE))
IND1 <- (df$time - c(df$time[-1], df$time[length(df$time) - 1])) %>% abs(.)  # gap to the next year (last row: previous year)
IND2 <- (df$time - c(df$time[2], df$time[-(length(df$time))])) %>% abs(.)    # gap to the previous year (first row: next year)
df <- df[IND1 %in% 1 | IND2 %in% 1, ]          # keep rows adjacent to a consecutive year
df[ave(df$time, df$id, FUN = length) >= 5, ]   # keep ids with at least 5 such rows
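To see what IND1 and IND2 compute, run them on the example time vector: each is the absolute gap to a neighbouring year (the endpoints fall back to their only neighbour), and a row survives when either gap equals 1.
time <- c(1998, 2000, 2001, 2002, 2003, 2004, 2007)
IND1 <- abs(time - c(time[-1], time[length(time) - 1]))  # 2 1 1 1 1 3 3
IND2 <- abs(time - c(time[2], time[-length(time)]))      # 2 2 1 1 1 1 3
IND1 %in% 1 | IND2 %in% 1
# [1] FALSE TRUE TRUE TRUE TRUE TRUE FALSE -- keeps the years 2000:2004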
A solution using dplyr, tidyr, and data.table:
library(dplyr)
library(tidyr)
library(data.table)
df2 <- df %>%
  mutate(time = as.numeric(as.character(time))) %>%
  arrange(id, time) %>%
  right_join(data_frame(time = full_seq(.$time, 1)), by = "time") %>%
  mutate(RunID = rleid(id)) %>%
  group_by(RunID) %>%
  filter(n() >= 5, !is.na(id)) %>%
  ungroup() %>%
  select(-RunID)
df2
df2
# A tibble: 5 x 2
  id     time
  <fctr> <dbl>
1 1       2000
2 1       2001
3 1       2002
4 1       2003
5 1       2004
