I have a data set of this format
Order_Name Frequency Order_Dt
A 2 2016-01-20
A 2 2016-05-01
B 1 2016-02-12
C 3 2016-03-04
C 3 2016-07-01
C 3 2016-08-09
I need to find the average difference between the dates of those orders which have been placed more than once, i.e., Frequency > 1.
require(dplyr)
# loading the data
df0 <- read.table(text =
'Order_Name Frequency Order_Dt
A 2 2016-01-20
A 2 2016-05-01
B 1 2016-02-12
C 3 2016-03-04
C 3 2016-07-01
C 3 2016-08-09',
stringsAsFactors = F,
header = T)
# putting the date in the right format
df0$Order_Dt <- as.Date(df0$Order_Dt)
# obtaining the averages
df0 %>% filter(Frequency > 1) %>%
  arrange(Order_Name, Order_Dt) %>%
  group_by(Order_Name) %>%
  mutate(diff_date = Order_Dt - lag(Order_Dt)) %>%
  summarise(avg_days = mean(diff_date, na.rm = TRUE))
# A tibble: 2 × 2
Order_Name avg_days
<chr> <time>
1 A 102 days
2 C 79 days
Grouping by Order_Name before taking lag() keeps the differences within each order; for C the gaps are 119 days (2016-03-04 to 2016-07-01) and 39 days (2016-07-01 to 2016-08-09), so the average is 79 days.
I am trying to count how many consecutive previous days a value has appeared (across two columns). In the example, this would be counting the consecutive days a team made an appearance (either in Hteam or Ateam) prior to that date. The aim is to produce additional columns for both the home and away teams showing these new values.
Test data:
data<- data.frame(
Date= c("2018-01-01", "2018-01-01", "2018-01-02", "2018-01-03", "2018-01-04", "2018-01-05"),
Hteam= c("A","D","B","A","C","A"),
Ateam= c("B","C","A","C","B","C"))
Date Hteam Ateam
1 2018-01-01 A B
2 2018-01-01 D C
3 2018-01-02 B A
4 2018-01-03 A C
5 2018-01-04 C B
6 2018-01-05 A C
The aim would end up looking like:
Date Hteam Ateam Hdays Adays
1 2018-01-01 A B 0 0
2 2018-01-01 D C 0 0
3 2018-01-02 B A 1 1
4 2018-01-03 A C 2 0
5 2018-01-04 C B 1 0
6 2018-01-05 A C 0 2
In my searching I haven't found an example close enough that I am able to adapt to this situation. I feel like I should be using a rollapply or dplyr grouping, but I can't get close to a solution.
Thanks.
Maybe the following gives what you want, assuming that the data is sorted by Date and missing days are not considered.
# all team names appearing in either column
t1 <- unique(unlist(data[-1]))
# for each date, a logical row: did each team appear on that date?
t2 <- do.call(rbind, lapply(split(data[-1], data$Date), function(x) t1 %in% unlist(x)))
# per team, running count of consecutive appearance dates (reset on a miss), minus 1 = prior days
t3 <- apply(t2, 2, function(x) ave(x, cumsum(!x), FUN = cumsum)) - 1
# look up the streak for the home and away team of each row
data.frame(data
  , Hdays = t3[cbind(match(data$Date, rownames(t3)), match(data$Hteam, t1))]
  , Adays = t3[cbind(match(data$Date, rownames(t3)), match(data$Ateam, t1))])
# Date Hteam Ateam Hdays Adays
#1 2018-01-01 A B 0 0
#2 2018-01-01 D C 0 0
#3 2018-01-02 B A 1 1
#4 2018-01-03 A C 2 0
#5 2018-01-04 C B 1 0
#6 2018-01-05 A C 0 2
I think your expected output is incorrect. Namely, row 5's "C" occurs twice above it, but has a 1.
Here's a tidyverse version:
library(dplyr)
library(tidyr)
data %>%
  mutate(rn = row_number()) %>%
  pivot_longer(-c(Date, rn), names_to = "x", values_to = "team") %>%
  mutate(x = gsub("team$", "", x)) %>%
  group_by(team) %>%
  mutate(days = row_number() - 1) %>%
  ungroup() %>%
  pivot_wider(id_cols = c(Date, rn), names_from = x, values_from = c(team, days)) %>%
  select(-rn)
# # A tibble: 6 x 5
# Date team_H team_A days_H days_A
# <chr> <chr> <chr> <dbl> <dbl>
# 1 2018-01-01 A B 0 0
# 2 2018-01-01 D C 0 0
# 3 2018-01-02 B A 1 1
# 4 2018-01-03 A C 2 1
# 5 2018-01-04 C B 2 2
# 6 2018-01-05 A C 3 3
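If the OP's original "consecutive prior days" reading is the one that matters (the interpretation the base R answer above follows), here is a hedged tidyverse sketch of that logic; the intermediate appearances frame and its column names are my own construction, not part of either answer:
library(dplyr)
library(tidyr)
# one row per (Date, team) with a flag for whether the team played that day
appearances <- data %>%
  pivot_longer(c(Hteam, Ateam), values_to = "team") %>%
  distinct(Date, team) %>%
  mutate(appeared = TRUE) %>%
  complete(Date, team, fill = list(appeared = FALSE)) %>%
  arrange(team, Date) %>%
  group_by(team) %>%
  mutate(run = cumsum(!appeared)) %>%        # a new run starts each time a team misses a date
  group_by(team, run) %>%
  mutate(days = cumsum(appeared) - 1) %>%    # streak up to this date, excluding the current day
  ungroup() %>%
  select(Date, team, days)
# attach the streaks for the home and away team of each fixture
data %>%
  left_join(appearances, by = c("Date", "Hteam" = "team")) %>%
  rename(Hdays = days) %>%
  left_join(appearances, by = c("Date", "Ateam" = "team")) %>%
  rename(Adays = days)
Like the base R answer, this treats the listed dates as the days being counted, so calendar gaps between dates are not considered.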
I have data that looks like this:
sample <- data.frame(
group = c("A","A","A","B","B","B"),
date = c(as.Date("2014-12-31"),
as.Date("2015-01-31"),
as.Date("2015-02-28"),
as.Date("2015-01-31"),
as.Date("2015-03-31"),
as.Date("2015-04-30")),
obs = c(100, 200, 300, 50, 100, 150)
)
Note that the date variable always takes the last date of the month. In table format, the data looks like this:
group date obs
1 A 2014-12-31 100
2 A 2015-01-31 200
3 A 2015-02-28 300
4 B 2015-01-31 50
5 B 2015-03-31 100
6 B 2015-04-30 150
I want to create a fourth column that counts the number of observations in the group. HOWEVER, I want the count to start over if a month doesn't immediately follow the month before. This is what I want it to look like:
group date obs num
1 A 2014-12-31 100 1
2 A 2015-01-31 200 2
3 A 2015-02-28 300 3
4 B 2015-01-31 50 1
5 B 2015-03-31 100 1
6 B 2015-04-30 150 2
So far all I can get is the following:
library(tidyverse)
sample <- sample %>%
arrange(date) %>%
group_by(group) %>%
mutate(num = row_number())
group date obs num
1 A 2014-12-31 100 1
2 A 2015-01-31 200 2
3 A 2015-02-28 300 3
4 B 2015-01-31 50 1
5 B 2015-03-31 100 2
6 B 2015-04-30 150 3
Any help would be much appreciated. I also want to be able to do the same thing but with quarterly data (instead of monthly).
We can use lubridate::days_in_month to get the number of days in a month and compare it with the difference between the current and previous date to create a new group. We can then assign row_number() within each group.
library(dplyr)
sample %>%
group_by(group) %>%
mutate(diff_days = cumsum(as.numeric(date - lag(date, default = first(date))) !=
lubridate::days_in_month(date))) %>%
group_by(diff_days, add = TRUE) %>%
mutate(num = row_number()) %>%
ungroup() %>%
select(-diff_days)
# group date obs num
# <fct> <date> <dbl> <int>
#1 A 2014-12-31 100 1
#2 A 2015-01-31 200 2
#3 A 2015-02-28 300 3
#4 B 2015-01-31 50 1
#5 B 2015-03-31 100 1
#6 B 2015-04-30 150 2
We can create a group based on the difference of the month of 'date', starting a new group whenever that difference is not equal to 1, i.e., not a one-month gap.
library(dplyr)
library(lubridate)
sample %>%
arrange(group, date) %>%
group_by(group, mth = cumsum(c(TRUE, diff(month(date)) != 1))) %>%
mutate(num = row_number()) %>%
ungroup %>%
select(-mth)
# A tibble: 6 x 4
# group date obs num
# <fct> <date> <dbl> <int>
#1 A 2015-01-31 100 1
#2 A 2015-02-28 200 2
#3 A 2015-03-31 300 3
#4 B 2015-01-31 50 1
#5 B 2015-03-31 100 1
#6 B 2015-04-30 150 2
If the year also needs to be considered
library(zoo)
sample %>%
arrange(group, date) %>%
mutate(yearmon = as.yearmon(date)) %>%
group_by(group) %>%
group_by(grp = cumsum(c(TRUE, as.integer(diff(yearmon) * 12) > 1)),
add = TRUE) %>%
mutate(num = row_number()) %>%
ungroup %>%
select(-grp, -yearmon)
# A tibble: 6 x 4
# group date obs num
# <fct> <date> <dbl> <int>
#1 A 2015-01-31 100 1
#2 A 2015-02-28 200 2
#3 A 2015-03-31 300 3
#4 B 2015-01-31 50 1
#5 B 2015-03-31 100 1
#6 B 2015-04-30 150 2
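The question also asks about quarterly data. The same idea should carry over by grouping on zoo::as.yearqtr instead of as.yearmon; below is a minimal sketch, assuming quarter-end dates and a hypothetical qsample frame that is not part of the original question:
library(dplyr)
library(zoo)
# hypothetical quarterly analogue of `sample`: quarter-end dates, with a gap for group B
qsample <- data.frame(
  group = c("A", "A", "A", "B", "B", "B"),
  date = as.Date(c("2014-12-31", "2015-03-31", "2015-06-30",
                   "2015-03-31", "2015-09-30", "2015-12-31")),
  obs = c(100, 200, 300, 50, 100, 150)
)
qsample %>%
  arrange(group, date) %>%
  mutate(yearqtr = as.yearqtr(date)) %>%
  group_by(group) %>%
  # start a new run whenever the gap is larger than one quarter (.add in recent dplyr, add in older versions)
  group_by(grp = cumsum(c(TRUE, round(as.numeric(diff(yearqtr)) * 4) > 1)), .add = TRUE) %>%
  mutate(num = row_number()) %>%
  ungroup() %>%
  select(-grp, -yearqtr)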
My data set looks like this:
ID start.date end.date program
1 2016.05.05 2017.05.05 A
1 2017.05.06 2019.06.16 A
2 2012.06.05 2013.06.18 B
3 2014.09.09 2017.07.01 B
3 2017.09.09 2018.09.09 B
I want to identify the people who were present in a program (character variable) consecutively, and then calculate the time between each end.date and the following start.date (if the occurrences were consecutive).
So the resulting data should look like this:
ID start.date end.date program days
1 2016.05.05 2017.05.05 A NA
1 2017.05.06 2019.06.16 A 1
2 2012.06.05 2013.06.18 B NA
3 2014.09.09 2017.07.01 B NA
3 2017.09.09 2018.09.09 B 63
Don't know how to start on this!
library(dplyr)
dat %>%
group_by(ID, program) %>%
arrange(start.date) %>% # Added in case the data isn't sorted
mutate(days = start.date - lag(end.date))
I get slightly different results, though:
# A tibble: 5 x 5
# Groups: ID, program [3]
ID start.date end.date program days
<int> <date> <date> <chr> <time>
1 1 2016-05-05 2017-05-05 A NA
2 1 2017-05-06 2019-06-16 A 1
3 2 2012-06-05 2013-06-18 B NA
4 3 2014-09-09 2017-07-01 B NA
5 3 2017-09-09 2018-09-09 B 70
To bring the data in, I converted to dates:
dat <- read.table(header = T, stringsAsFactors = F,
text = "ID start.date end.date program
1 2016.05.05 2017.05.05 A
1 2017.05.06 2019.06.16 A
2 2012.06.05 2013.06.18 B
3 2014.09.09 2017.07.01 B
3 2017.09.09 2018.09.09 B") %>%
mutate_at(vars(matches("date")), lubridate::ymd)
Let's suppose I have this dataframe:
Date A B
2010-01-01 NA 1
2010-01-02 2 NA
2010-01-05 3 NA
2010-01-07 NA 4
2010-01-20 5 NA
2010-01-25 6 7
I want to collapse rows, merging NA values into the row with the closest Date. So the result would be:
Date A B
2010-01-02 2 1
2010-01-07 3 4
2010-01-20 5 NA
2010-01-25 6 7
I saw a Stack Overflow question that solves collapsing using a key value, but I could not find a similar case that collapses using close date values.
Obs1: It would be good if there were a way to not collapse the rows if the dates are too far apart (for example, more than 15 days apart).
Obs2: It would be good if the collapsed rows kept the later date rather than the earlier one, as shown in the example above.
Using the dplyr package, an option could be to group_by on a combination of A and B in such a way that they form complete values.
Considering Obs2, the max of Date is taken for the combined row.
library(dplyr)
library(lubridate)
df %>% mutate(Date = ymd(Date)) %>%
mutate(GrpA = cumsum(!is.na(A)), GrpB = cumsum(!is.na(B))) %>%
rowwise() %>%
mutate(Grp = max(GrpA, GrpB)) %>%
ungroup() %>%
select(-GrpA, -GrpB) %>%
group_by(Grp) %>%
summarise(Date = max(Date), A = A[!is.na(A)][1], B = B[!is.na(B)][1])
# # A tibble: 4 x 4
# Grp Date A B
# <int> <date> <int> <int>
# 1 1 2010-01-02 2 1
# 2 2 2010-01-07 3 4
# 3 3 2010-01-20 5 NA
# 4 4 2010-01-25 6 7
Data:
df <- read.table(text =
"Date A B
2010-01-01 NA 1
2010-01-02 2 NA
2010-01-05 3 NA
2010-01-07 NA 4
2010-01-20 5 NA
2010-01-25 6 7",
stringsAsFactors = FALSE, header = TRUE)
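Regarding Obs1 (not collapsing rows whose dates are too far apart), a hedged sketch building on the same grouping idea; the 15-day threshold and the far helper column are assumptions for illustration:
library(dplyr)
library(lubridate)
df %>%
  mutate(Date = ymd(Date)) %>%
  mutate(GrpA = cumsum(!is.na(A)),
         GrpB = cumsum(!is.na(B)),
         Grp = pmax(GrpA, GrpB),
         # start a new block whenever the gap from the previous row exceeds 15 days
         far = cumsum(c(0, diff(Date)) > 15)) %>%
  group_by(Grp, far) %>%
  summarise(Date = max(Date),
            A = A[!is.na(A)][1],
            B = B[!is.na(B)][1],
            .groups = "drop") %>%
  select(-Grp, -far)
For this particular data no gap exceeds 15 days, so the result is the same as above; a larger gap would simply leave those rows uncollapsed.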
I want to insert rows between two dates by group. My current way of doing it is complicated: I insert missing values by last observation carried forward and then merge. I was wondering whether there is an easier way to achieve it.
# sample data
user<-c("A","A","B","B","B")
dummy<-c(1,1,1,1,1)
date<-as.Date(c("2017/1/3","2017/1/6","2016/5/1","2016/5/3","2016/5/5"))
dt<-data.frame(user,dummy,date)
user dummy date
1 A 1 2017-01-03
2 A 1 2017-01-06
3 B 1 2016-05-01
4 B 1 2016-05-03
5 B 1 2016-05-05
Desired output
user dummy date
1 A 1 2017-01-03
2 A 0 2017-01-04
3 A 0 2017-01-05
4 A 1 2017-01-06
5 B 1 2016-05-01
6 B 0 2016-05-02
7 B 1 2016-05-03
8 B 0 2016-05-04
9 B 1 2016-05-05
Using dplyr and tidyr (a one-line solution):
library(dplyr)
library(tidyr)
dt %>% group_by(user) %>% complete(date=full_seq(date,1),fill=list(dummy=0))
# A tibble: 9 x 3
# Groups: user [2]
user date dummy
<fctr> <date> <dbl>
1 A 2017-01-03 1
2 A 2017-01-04 0
3 A 2017-01-05 0
4 A 2017-01-06 1
5 B 2016-05-01 1
6 B 2016-05-02 0
7 B 2016-05-03 1
8 B 2016-05-04 0
9 B 2016-05-05 1
You can try this with data.table:
library(data.table)
setDT(dt)
tmp <- dt[, .(date = seq.Date(min(date), max(date), by = '1 day')), by = 'user']
dt <- merge(tmp, dt, by = c('user', 'date'), all.x = TRUE)
dt[, dummy := ifelse(is.na(dummy), 0, dummy)]
We can use the tidyverse to achieve this task.
library(tidyverse)
dt2 <- dt %>%
group_by(user) %>%
do(date = seq(from = min(.$date), to = max(.$date), by = 1)) %>%
unnest() %>%
left_join(dt, by = c("user", "date")) %>%
replace_na(list(dummy = 0)) %>%
select(colnames(dt))
dt2
# A tibble: 9 x 3
user dummy date
<fctr> <dbl> <date>
1 A 1 2017-01-03
2 A 0 2017-01-04
3 A 0 2017-01-05
4 A 1 2017-01-06
5 B 1 2016-05-01
6 B 0 2016-05-02
7 B 1 2016-05-03
8 B 0 2016-05-04
9 B 1 2016-05-05
The simplest way that I have found to do this is with the padr library.
library(padr)
library(tidyr)  # replace_na() (and %>%) come from tidyr
dt_padded <- pad(dt, group = "user", by = "date") %>%
  replace_na(list(dummy = 0))
A Base R (not quite as elegant) solution:
# Data
user<-c("A","A","B","B","B")
dummy<-c(1,1,1,1,1)
date<-as.Date(c("2017/1/3","2017/1/6","2016/5/1","2016/5/3","2016/5/5"))
df1 <-data.frame(user,dummy,date)
# Solution: for each user, build the full daily sequence with dummy = 0,
# then set dummy for the dates that actually occur in the original data
do.call(rbind, lapply(split(df1, df1$user), function(df) {
  dff <- data.frame(user = df$user[1], dummy = 0,
                    date = seq.Date(min(df$date), max(df$date), 'day'))
  dff[dff$date %in% df$date, "dummy"] <- df$dummy[1]
  dff
}))
# user dummy date
# A 1 2017-01-03
# A 0 2017-01-04
# A 0 2017-01-05
# A 1 2017-01-06
# B 1 2016-05-01
# B 0 2016-05-02
# B 1 2016-05-03
# B 0 2016-05-04
# B 1 2016-05-05
Assuming your data is called df1 and you want to add the dates between two days, try this:
library(dplyr)
# put the date sequence in a data frame so it can be joined onto the existing data
df2 <- data.frame(date = seq.Date(as.Date("2015-01-03"), as.Date("2015-01-06"), by = "day"))
left_join(df2, df1, by = "date")
If you're simply trying to add a new record, I suggest using rbind().
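For example, a minimal sketch of appending one record with rbind (the new values are made up for illustration; columns must match df1):
# append a single new record; column names and types must match df1 (user, dummy, date)
new_row <- data.frame(user = "A", dummy = 0, date = as.Date("2017-01-04"))
df1 <- rbind(df1, new_row)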