I have a table of gas refuelings, a, like this:
a = setDT(structure(list(date = structure(c(NA, 16837, 16843, 16847, 16852,
16854, 16858, 16862, 16867, 16871, 16874), class = "Date"), km = c(NA,
NA, 421, 351, 286, 350, 414, 332, 401, 321, 350)), .Names = c("date",
"km"), class = c("data.table", "data.frame"), row.names = c(NA,
-11L)), key = "date")
It has dates of refuel and km drove with that refuel. I'm also given a different table with dates of tire pressure adjusting and oil changes, actions, like this:
actions = setDT(structure(list(date = structure(c(16841, 16843, 16858, 16869), class = "Date"),
action = structure(c(1L, 2L, 2L, 2L), .Label = c("oil", "tires"
), class = "factor")), .Names = c("date", "action"), row.names = c(NA,
-4L), class = c("data.table", "data.frame")), key = "action")
I need to relate the fuel consumption (in the real version of a I also have gallons) to the days elapsed since last tire pressure checking and since last oil change. There must be a simple way to achieve this, but after some hours trying I'm stuck.
This is what I've tried:
library(data.table)
library(lubridate)
library(reshape2)
# Reshape actions wide: one row per date, one column per action type,
# holding that action's date
b <- dcast(actions, date ~ action, value.var = "date")
# Full daily calendar spanning both tables
d <- seq(min(a$date, b$date, na.rm = TRUE), max(a$date, b$date, na.rm = TRUE), by = "day")
d <- data.table(date=d)
# Join the wide actions onto the calendar (relies on b being keyed by date
# after dcast — TODO confirm)
d <- b[d,]
# Days since the (single) oil change; dates before it come out negative...
d$daysOil <- as.double(difftime(d$date, d$date[! is.na(d$oil)], units = "days"))
# ...and are blanked to NA
d$daysOil[which(d$daysOil < 0)] <- NA
The thing gets a lot more complicated if I try to calculate the number of days elapsed since the last "tires" event (the one that's closer before the refuel date), and that's where I'm stuck.
My expected output is:
expected
date km daysOil daysTires
1 <NA> NA NA NA
2 2016-02-06 NA NA NA
3 2016-02-12 421 2 0
4 2016-02-16 351 6 4
5 2016-02-21 286 11 9
6 2016-02-23 350 13 11
7 2016-02-27 414 17 0
8 2016-03-02 332 21 4
9 2016-03-07 401 26 9
10 2016-03-11 321 30 2
11 2016-03-14 350 33 5
I'd appreciate any solution, but preferably by using data.table or dplyr packages.
########## EDIT ##########
If you can think of a better information (tables) structure to facilitate this task, it will be highly appreciated too!
Here's one option:
# Keep a copy of the action date: after the rolling join below, the 'date'
# column carries the refuel date, while date.copy still holds the action's
# own (rolled-back) date.
actions[, date.copy := date]
# Per action type, roll each refuel date in 'a' back to the most recent
# prior action (roll = T), so days = refuel date - last action date.
# N = .I numbers the refuel rows so dcast output lines back up with 'a'.
cbind(a,
dcast(actions[, .SD[a, .(days = date - date.copy, N = .I), roll = T, on = 'date']
, by = action],
N ~ action, value.var = 'days'))
# date km N oil tires
# 1: <NA> NA 1 NA days NA days
# 2: 2016-02-06 NA 2 NA days NA days
# 3: 2016-02-12 421 3 2 days 0 days
# 4: 2016-02-16 351 4 6 days 4 days
# 5: 2016-02-21 286 5 11 days 9 days
# 6: 2016-02-23 350 6 13 days 11 days
# 7: 2016-02-27 414 7 17 days 0 days
# 8: 2016-03-02 332 8 21 days 4 days
# 9: 2016-03-07 401 9 26 days 9 days
#10: 2016-03-11 321 10 30 days 2 days
#11: 2016-03-14 350 11 33 days 5 days
Several simple things are going on in the above - run in pieces to understand.
Related
Let's suppose I have two dataframes that look like this:
df1 = structure(list(X1 = c(0.659588465514883, 0.47368422669833, -0.0422047052887636,
-1.75642936005977, 0.339813114272074, 1.09341750942405, 0.327672990051479,
-0.893507823167616, -0.661285321563594, -0.569673784617002, -0.983369868281376,
-2.53659592825309, 0.396220995581641, -1.1994504350227, -0.553343957714012,
1.30884516680972, -0.120561033997931, 0.971506981390537, 0.815610612704566,
1.53103368033727, -0.808956975392184, -1.27332589061096, -1.89082047917723,
0.249755375966669, -0.704051599213331), X2 = c(0.659588465514883,
0.47368422669833, -0.0422047052887636, -1.75642936005977, 0.339813114272074,
1.09341750942405, 0.327672990051479, -0.893507823167616, -0.661285321563594,
-0.569673784617002, -0.983369868281376, -2.53659592825309, 0.396220995581641,
-1.1994504350227, -0.553343957714012, 1.30884516680972, -0.120561033997931,
0.971506981390537, 0.815610612704566, 1.53103368033727, -0.808956975392184,
-1.27332589061096, -1.89082047917723, 0.249755375966669, -0.704051599213331
), Date = structure(c(10957,
10988, 11017, 11048, 11078, 11109, 11139, 11170, 11201, 11231,
11262, 11292, 11323, 11354, 11382, 11413, 11443, 11474, 11504,
11535, 11566, 11596, 11627, 11657, 11688), class = "Date")), class = "data.frame", row.names = c(NA,
-25L))
X1 X2
1 -1.633636896 -1.633636896
2 1.793766808 1.793766808
3 0.440697771 0.440697771
4 0.330091148 0.330091148
5 -1.234246285 -1.234246285
6 0.044951993 0.044951993
7 -2.831295687 -2.831295687
8 -0.735371579 -0.735371579
9 -0.412580789 -0.412580789
10 0.001848622 0.001848622
11 1.480684731 1.480684731
12 -1.088999830 -1.088999830
13 -0.465903929 -0.465903929
14 -0.010743010 -0.010743010
15 1.420995930 1.420995930
16 -0.789190729 -0.789190729
17 -0.750476176 -0.750476176
18 -0.314079067 -0.314079067
19 -0.324779959 -0.324779959
20 -1.192471909 -1.192471909
21 -0.170325813 -0.170325813
22 0.890941125 0.890941125
23 0.863875448 0.863875448
24 -0.088048086 -0.088048086
25 0.021239226 0.021239226
Date
1 2000-01-01
2 2000-02-01
3 2000-03-01
4 2000-04-01
5 2000-05-01
6 2000-06-01
7 2000-07-01
8 2000-08-01
9 2000-09-01
10 2000-10-01
11 2000-11-01
12 2000-12-01
13 2001-01-01
14 2001-02-01
15 2001-03-01
16 2001-04-01
17 2001-05-01
18 2001-06-01
19 2001-07-01
20 2001-08-01
21 2001-09-01
22 2001-10-01
23 2001-11-01
24 2001-12-01
25 2002-01-01
df2 = structure(list(X1 = c(-0.0712460200169048, 1.0131741924359, 0.28590272354409,
-0.835911047943257, -0.146890264431744), X2 = c(-0.0712460200169048,
1.0131741924359, 0.28590272354409, -0.835911047943257, -0.146890264431744
), Date = structure(c(10984, 11120, 11441, 11488, 11712), class = "Date")), class = "data.frame", row.names = c(NA,
-5L))
X1 X2 Date
1 0.03815189 0.03815189 2000-01-28
2 -0.22665838 -0.22665838 2000-06-12
3 0.36459588 0.36459588 2001-04-29
4 0.32772746 0.32772746 2001-06-15
5 -1.22891784 -1.22891784 2002-01-25
What I would like to do is to reduce the number of rows in df1 (so that the number of rows in df1 = number of rows in df2) on the basis of df2. In particular, I would like to remove those rows of df1 whose Date is not present in the Date column of df2. It is easier to see the output I would like to get:
# DF1 shall become like this (n stays for the numbers corresponding to each date row):
X1 X2 Date
1 n n 2000-01-01
2 n n 2000-06-01
3 n n 2001-04-01
4 n n 2001-06-01
5 n n 2002-01-01
# not really important which day is displayed in the final output. What matters is just year and month
I tried to use semi_join but the problem is that the differing days make the function unable to grasp what I need. Ideally, I would need to ignore days and match by year and month.
This is what I tried:
library(dplyr)
# semi_join keeps the rows of df1 that have a match in df2 (the original
# post misspelled it 'semin_join'). Exact-date matching still returns
# 0 rows here, because the days of the month differ between the tables.
semi_join(df1, df2, by = "Date")
[1] X1 X2 Date
<0 rows> (or 0-length row.names)
Can anyone help me?
Thanks!
Using the great suggestion from #arg0naut91 here a possible solution in base R. First format the variables Date and then you can use %in% to check which dates are present or not. Next the code using your df1 and df2:
#Format dates
df1$I1 <- format(df1$Date,'%Y-%m')
df2$I2 <- format(df2$Date,'%Y-%m')
Now this makes the contrast:
df1[df1$I1 %in% df2$I2,]
Output:
X1 X2 Date I1
1 0.6595885 0.6595885 2000-01-01 2000-01
6 1.0934175 1.0934175 2000-06-01 2000-06
16 1.3088452 1.3088452 2001-04-01 2001-04
18 0.9715070 0.9715070 2001-06-01 2001-06
25 -0.7040516 -0.7040516 2002-01-01 2002-01
In the end you could assign that result to a new dataframe and remove I1.
I would appreciate your help in calculating the number of days since the last purchase per user Id. I attached the dataset with the expected target.
Thank you,
We can group by 'USERID' and get the difftime of the current and past 'Datetime' converted 'date' column
library(lubridate)
library(dplyr)
# Parse the "m/d/Y H:M" strings into POSIXct, then, per user, compute the
# number of whole days between each purchase and the previous one.
df1 %>%
  mutate(date = mdy_hm(date)) %>%     # convert to Datetime class
  group_by(USERID) %>%                # one group per USERID
  mutate(numberofdays = as.integer(difftime(date,
    # lag() is the previous purchase; default = first(date) makes the
    # first purchase of each user yield 0 rather than NA
    lag(date, default = first(date)),
    # spell the argument and value out in full — the original relied on
    # partial matching with unit = 'day'
    units = "days")))
# A tibble: 8 x 5
# Groups: USERID [3]
# ID date USERID SALES numberofdays
# <int> <dttm> <dbl> <dbl> <int>
#1 1 2018-11-19 10:36:00 500 1000 0
#2 2 2018-11-19 10:41:00 520 1450 0
#3 3 2018-11-23 10:59:00 500 1390 4
#4 4 2018-11-23 11:12:00 530 1778 0
#5 5 2018-11-29 11:52:00 530 1966 6
#6 6 2018-12-05 12:23:00 520 1100 16
#7 7 2018-12-19 12:24:00 520 700 14
#8 8 2018-12-25 21:24:00 520 900 6
data
df1 <- structure(list(ID = 1:8, date = c("11/19/2018 10:36", "11/19/2018 10:41",
"11/23/2018 10:59", "11/23/2018 11:12", "11/29/2018 11:52", "12/5/2018 12:23",
"12/19/2018 12:24", "12/25/2018 21:24"), USERID = c(500, 520,
500, 530, 530, 520, 520, 520), SALES = c(1000, 1450, 1390, 1778,
1966, 1100, 700, 900)), class = "data.frame", row.names = c(NA,
-8L))
I'm trying to use R to measure how many days supply of a prescription an individual already has on-hand when they make a refill, taking into account all previous prescriptions. For example, if I had this table...
member rx_id fill_date to_date days_supply
1 A 1 2018-10-01 2018-10-02 2
2 B 1 2016-11-07 2016-11-10 4
3 B 2 2016-11-07 2016-12-04 28
4 B 3 2016-11-08 2016-11-09 2
5 B 4 2016-11-10 2016-12-03 24
I'd expect the following output
member rx_id fill_date to_date days_supply_on_hand
1 A 1 2018-10-01 2018-10-02 0
2 B 1 2016-11-07 2016-11-10 0
3 B 2 2016-11-07 2016-12-04 4
4 B 3 2016-11-08 2016-11-09 30
5 B 4 2016-11-10 2016-12-03 26
For member B, when the second script is filled on the same day as the first script, the individual already has 4 days worth of RX on hand. When the third script is filled, the individual has 3 days left from the first script and 27 left from the second (30 total). When the fourth script is filled, the third script is depleted, but there is 1 day left from the first script and 25 from the third script (26 total).
I know how to do rolling totals in both dplyr and data.table, but I can't figure out how to take into account variable levels of depletion based on previous records on an individual by individual basis. Below is code to remake the original table, thanks in advance for any suggestions!
structure(list(member = structure(c(1L, 2L, 2L, 2L, 2L), .Label =
c("A",
"B"), class = "factor"), rx_id = c(1, 1, 2, 3, 4), fill_date =
structure(c(17805,
17112, 17112, 17113, 17115), class = "Date"), to_date =
structure(c(17806,
17115, 17139, 17114, 17138), class = "Date"), days_supply = c(2,
4, 28, 2, 24)), .Names = c("member", "rx_id", "fill_date",
"to_date",
"days_supply"), row.names = c(NA, -5L), class = "data.frame")
library(data.table)
dt <- as.data.table(your_df)  # or setDT(your_df) to convert in place

# Non-equi self-join: for every fill (i.*) find the same member's earlier
# scripts (rx_id < rx_id) that were already started and not yet expired
# (fill_date <= fill_date, to_date >= fill_date). Per fill (.EACHI), the
# supply on hand is the total days supplied minus, for each matched script,
# the days already elapsed since it was filled.
dt[dt, on = .(member, fill_date <= fill_date, to_date >= fill_date, rx_id < rx_id),
   by = .EACHI,
   sum(days_supply, na.rm = TRUE) - sum(i.fill_date - x.fill_date, na.rm = TRUE)]
# member fill_date to_date rx_id V1
#1: A 2018-10-01 2018-10-01 1 0 days
#2: B 2016-11-07 2016-11-07 1 0 days
#3: B 2016-11-07 2016-11-07 2 4 days
#4: B 2016-11-08 2016-11-08 3 30 days
#5: B 2016-11-10 2016-11-10 4 26 days
Using a simple loop
# Initialise the result column, then fill it in member by member.
# Assumes rows are ordered by fill_date within each member — TODO confirm.
dt$days_supply_on_hand <- 0
for (a in unique(dt$member)) {
  # Row indices for this member (column 1 = member, 3 = fill_date,
  # 4 = to_date; .subset2/as.integer are used purely for speed)
  I <- which(.subset2(dt, 1) == a)
  flDate <- as.integer(.subset2(dt, 3)[I])
  toDate <- as.integer(.subset2(dt, 4)[I])
  # For fill k, each earlier script j still covers
  # to_date[j] - fill_date[k] + 1 days, floored at 0. The original summed
  # the raw difference, which goes negative for long-expired scripts and
  # silently subtracts supply; it also used 1:(k-1), which is c(1, 0) when
  # k == 1 (seq_len(k - 1) is empty there, so no c(0, V[-1]) patch-up needed).
  V <- vapply(seq_along(I), function(k) {
    left <- toDate[seq_len(k - 1)] - flDate[k] + 1
    sum(left[left > 0L])
  }, numeric(1))
  dt$days_supply_on_hand[I] <- V
}
dt
member rx_id fill_date to_date days_supply days_supply_on_hand
1 A 1 2018-10-01 2018-10-02 2 0
2 B 1 2016-11-07 2016-11-10 4 0
3 B 2 2016-11-07 2016-12-04 28 4
4 B 3 2016-11-08 2016-11-09 2 30
5 B 4 2016-11-10 2016-12-03 24 26
where dt is data frame provided above. (Note that the use of .subset2 or as.integer is for efficiency purposes - they can be changed for more readability).
So here is an example of my data
> d
customer date revenue
1: A 2016-01-01 32
2: A 2016-01-03 88
3: A 2016-01-04 80
4: A 2016-02-01 38
5: B 2016-01-13 44
6: B 2016-01-24 11
7: B 2016-01-25 50
8: B 2016-02-26 46
> dput(d)
structure(list(customer = c("A", "A", "A", "A", "B", "B", "B",
"B"), date = structure(c(16801, 16803, 16804, 16832, 16813, 16824,
16825, 16857), class = "Date"), revenue = c(32, 88, 80, 38, 44,
11, 50, 46)), .Names = c("customer", "date", "revenue"), row.names = c(NA,
-8L), class = c("data.table", "data.frame"))
# (the .internal.selfref attribute printed by dput() was dropped:
# "<pointer: 0x...>" is not valid R syntax, so the reproducible example
# would not parse with it; setDT()/as.data.table() restores it anyway)
What i want to do is, I want to create a column, let call it roll_sum_3days.
This column is the rolling sum of revenue that happen afterward. The window size is conditioned on date column. In this case, roll_sum_3days is the sum of revenue that happen afterward and must not be later than 3 days.
The expected out come would look like this
customer date revenue roll_sum_3days
1: A 2016-01-01 32 168
2: A 2016-01-03 88 80
3: A 2016-01-04 80 0
4: A 2016-02-01 38 0
5: B 2016-01-13 44 0
6: B 2016-01-24 11 96
7: B 2016-01-25 50 46
8: B 2016-01-26 46 0
A possible solution:
library(lubridate) # for the '%m+%'-function
# Per customer: non-equi self-join of .SD against its own 3-day-ahead
# windows (date, date + 3 days], then sum the revenue falling in each
# window and assign it back by reference.
d[, roll_sum_3d := .SD[.SD[, .(date, date2 = date %m+% days(3), revenue)]
, on = .(date > date, date <= date2)
][, sum(revenue, na.rm = TRUE), by = date]$V1
, by = customer][]
which gives:
customer date revenue roll_sum_3d
1: A 2016-01-01 32 168
2: A 2016-01-03 88 80
3: A 2016-01-04 80 0
4: A 2016-02-01 38 0
5: B 2016-01-13 44 0
6: B 2016-01-24 11 96
7: B 2016-01-25 50 46
8: B 2016-01-26 46 0
What this does:
Group `d` by customer with `by = customer`.
Add roll_sum_3d by reference with :=.
Calculate roll_sum_3d by joining .SD (Subset of Data) for each group with a date-window of that group (.SD[, .(date, date2 = date %m+% days(3), revenue)] with a non-equi join on = .(date > date, date <= date2), summarise the revenue for each date and give that back.
An alternative based on #Arun's comment:
# Same idea without per-customer grouping: one non-equi join keyed on
# customer plus the 3-day window, aggregating per row of i with by = .EACHI.
d[, roll_sum_3d := d[d[, .(customer, date, date2 = date %m+% days(3), revenue)]
, on = .(customer, date > date, date <= date2)
, sum(revenue, na.rm = TRUE), by=.EACHI]$V1][]
Hi, I guess there is another mistake in your example: observation number 8 won't add to the count of the two previous observations, as it is from February. Never mind — here is a solution, if you want, using apply() and the as.POSIXct() function:
df <- data.frame(customer = c("A", "A", "A", "A", "B", "B", "B", "B"),
date = structure(c(16801, 16803, 16804, 16832, 16813, 16824,
16825, 16857), class = "Date"),
revenue = c(32, 88, 80, 38, 44, 11, 50, 46))
df$date <- as.POSIXct(df$date)
# For one row x of df, sum the revenue of the same customer strictly after
# x's date and within the following 3 days.
# NOTE(review): apply() coerces the data frame to a character matrix, so x
# arrives as a named character vector; the as.POSIXct()/unlist() calls below
# rely on that coercion round-tripping the date — verify against the caller.
calc <- function(x){
date <- as.POSIXct(unlist(x["date"]),origin = "1970-01-01")
customer <- unlist(x["customer"])
# There you choose what you want to sum (here conditions are between the day and 3 days later and same customer)
# 86400 is the number of seconds in a day!
output <- sum(df[df$date > date & df$date <= (date+86400*3) & df$customer==customer,"revenue"])
return(output)
}
df$sum <- apply(df,1,calc)
# if you want to come back with your date format.
df$date <- as.Date(df$date)
df
customer date revenue sum
1 A 2016-01-01 32 168
2 A 2016-01-03 88 80
3 A 2016-01-04 80 0
4 A 2016-02-01 38 0
5 B 2016-01-13 44 0
6 B 2016-01-24 11 50
7 B 2016-01-25 50 0
8 B 2016-02-26 46 0
I couldn't keep your date format as the operator > won't work with it.
I have the following kind of data in my datafile
DriveNo Date and Time
12 2017-01-31 23:00:00 //Start time of a trip for Driver12
134 2017-01-31 23:00:01
12 2017-01-31 23:10:00 //End time ( 10 min trip)
345 (some date/time)
12 2017-01-31 23:20:00 //Start Time
12 2017-01-31 23:35:00 //End Time (15 min trip)
.
.
.
millions of similar data follow
The total number of records is around 3 million. Now, I need to get the time driven by each of the drivers (there are around 500 drivers). My ideal output would be like
DriveNo TotalTimeDriven
12 35mins
134 ........(in days/hours/mins)
.
.
(for all other Drivers as well)
Above, DriveNo 12 has four entries, suggesting start and end of two rides. Is there an efficient R way to do this?
A data.table solution:
# Sample data
df <- data.table(DriveNo = c(12, 134, 12, 134), Time = c("2017-01-31 23:00:00", "2017-01-31 23:00:01", "2017-01-31 23:10:00", "2017-01-31 23:20:01"))
# Span between each driver's first and last timestamp, assigned to every row
# of that driver. NOTE(review): this equals total driving time only when a
# driver has a single start/end pair; with several trips it also counts the
# idle time between trips.
df[, duration := max(as.POSIXct(Time)) - min(as.POSIXct(Time)), by = DriveNo]
df
DriveNo Time duration
1: 12 2017-01-31 23:00:00 10 mins
2: 134 2017-01-31 23:00:01 20 mins
3: 12 2017-01-31 23:10:00 10 mins
4: 134 2017-01-31 23:20:01 20 mins
range returns the maximum and minimum, and diff subtracts sequential numbers in a vector, so you could just do
aggregate(DateTime ~ DriveNo, df, function(x){diff(range(x))})
## DriveNo DateTime
## 1 12 10
## 2 134 0
or in dplyr,
library(dplyr)
df %>% group_by(DriveNo) %>% summarise(TimeDriven = diff(range(DateTime)))
## # A tibble: 2 × 2
## DriveNo TimeDriven
## <int> <time>
## 1 12 10 mins
## 2 134 0 mins
or in data.table,
library(data.table)
setDT(df)[, .(TimeDriven = diff(range(DateTime))), by = DriveNo]
## DriveNo TimeDriven
## 1: 12 10 mins
## 2: 134 0 mins
To change the units, it may be simpler to call difftime directly.
Data
df <- structure(list(DriveNo = c(12L, 134L, 12L), DateTime = structure(c(1485921600,
1485921601, 1485922200), class = c("POSIXct", "POSIXt"), tzone = "")), class = "data.frame", row.names = c(NA,
-3L), .Names = c("DriveNo", "DateTime"))
For the edit, you can make a variable identifying starts and stops, reshape, and summarise with difftime and sum.
library(tidyverse)
set.seed(47)
drives <- data_frame(DriveNo = sample(rep(1:5, 4)),
DateTime = seq(as.POSIXct("2017-04-13 12:00:00"),
by = '10 min', length.out = 20))
drives %>% str()
#> Classes 'tbl_df', 'tbl' and 'data.frame': 20 obs. of 2 variables:
#> $ DriveNo : int 5 3 4 3 5 1 1 2 3 5 ...
#> $ DateTime: POSIXct, format: "2017-04-13 12:00:00" "2017-04-13 12:10:00" ...
# Label each driver's timestamps alternately start/stop, pair them up as
# trips, and total the per-trip durations.
elapsed <- drives %>%
group_by(DriveNo) %>%
mutate(event = rep(c('start', 'stop'), n() / 2), # assumes an even row count per driver
i = cumsum(event == 'start')) %>% # trip number within each driver
spread(event, DateTime) %>% # one row per (driver, trip) with start/stop columns
summarise(TimeDriven = sum(difftime(stop, start, units = 'mins')))
elapsed
#> # A tibble: 5 × 2
#> DriveNo TimeDriven
#> <int> <time>
#> 1 1 60 mins
#> 2 2 110 mins
#> 3 3 120 mins
#> 4 4 130 mins
#> 5 5 80 mins
It would be faster to index by recycled Boolean vectors, but in dplyr they get unclassed at some point. In data.table,
library(data.table)
set.seed(47)
drives <- data.table(DriveNo = sample(rep(1:5, 4)),
DateTime = seq(as.POSIXct("2017-04-13 12:00:00"),
by = '10 min', length.out = 20))
# Same pairing via recycled logical indexing within each driver's group:
# DateTime[c(FALSE, TRUE)] picks the stop rows, DateTime[c(TRUE, FALSE)]
# the start rows; difftime of the two vectors gives per-trip durations.
elapsed <- drives[, .(TimeDriven = sum(difftime(DateTime[c(FALSE, TRUE)],
DateTime[c(TRUE, FALSE)],
units = 'mins'))),
keyby = DriveNo]
elapsed
#> DriveNo TimeDriven
#> 1: 1 60 mins
#> 2: 2 110 mins
#> 3: 3 120 mins
#> 4: 4 130 mins
#> 5: 5 80 mins
or in base,
set.seed(47)
drives <- data.frame(DriveNo = sample(rep(1:5, 4)),
DateTime = seq(as.POSIXct("2017-04-13 12:00:00"),
by = '10 min', length.out = 20))
elapsed <- aggregate(DateTime ~ DriveNo, drives,
function(x){sum(difftime(x[c(FALSE, TRUE)], x[c(TRUE, FALSE)], units = 'mins'))})
elapsed
#> DriveNo DateTime
#> 1 1 60
#> 2 2 110
#> 3 3 120
#> 4 4 130
#> 5 5 80
All forms will likely have issues if there are an odd number of times for a driver, which is not possible under the assumptions given. If it is, more cleaning is necessary.