Time elapsed between first and second record - r

My objective is to find the amount of time that elapsed between the first and second time a user used a service. Users who have used the service only once should be excluded.
For example, I have a sample dataset test:
> test
start_time user_id
1 2018-01-17 22:10:21 1
2 2018-01-17 22:10:08 2
3 2018-01-18 07:02:36 3
4 2018-01-18 07:24:18 4
5 2018-01-18 15:08:45 2
6 2018-01-18 15:26:57 1
7 2018-01-18 15:37:47 1
8 2018-01-18 20:12:43 3
9 2018-01-18 20:01:08 2
10 2018-01-18 22:42:02 2
I am able to go one by one with difftime:
output$time_lapse[1] <- abs(difftime(test$start_time[1], test$start_time[6]))
But this will take a long time on a large dataset. How could one do this with data.table or dplyr?
The output would be in hours as below, from the test dataset above:
> output
user_id time_lapse
1 1 17.27667
2 2 16.97694
3 3 13.16861
4 4 NA
Any suggestions would be appreciated! Here is the sample data:
> dput(test)
structure(list(start_time = structure(c(1516255821, 1516255808,
1516287756, 1516289058, 1516316925, 1516318017, 1516318667, 1516335163,
1516334468, 1516344122), class = c("POSIXct", "POSIXt"), tzone = ""),
user_id = c(1, 2, 3, 4, 2, 1, 1, 3, 2, 2)), .Names = c("start_time",
"user_id"), row.names = c(NA, 10L), class = "data.frame")

Here is a data.table method:
library(data.table)
test <- structure(list(start_time = structure(c(1516255821, 1516255808,
1516287756, 1516289058, 1516316925, 1516318017, 1516318667, 1516335163,
1516334468, 1516344122), class = c("POSIXct", "POSIXt"), tzone = ""),
user_id = c(1, 2, 3, 4, 2, 1, 1, 3, 2, 2)), .Names = c("start_time",
"user_id"), row.names = c(NA, 10L), class = "data.frame")
setDT(test)
test[, .(time_lapse = difftime(start_time[2], start_time[1])), by = user_id]
# user_id time_lapse
# 1: 1 17.27667 hours
# 2: 2 16.97694 hours
# 3: 3 13.16861 hours
# 4: 4 NA hours
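If users with only a single record should be dropped entirely, as the question asks, a possible variant (my own sketch, not part of the original answer) returns nothing for groups with fewer than two rows; the order() call guards against unsorted input:
test[order(start_time),
     if (.N >= 2) .(time_lapse = difftime(start_time[2], start_time[1], units = "hours")),
     keyby = user_id]
# user_id time_lapse
# 1: 1 17.27667 hours
# 2: 2 16.97694 hours
# 3: 3 13.16861 hours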

Another possible solution with dplyr:
df = structure(list(start_time = structure(c(1516255821, 1516255808,
1516287756, 1516289058, 1516316925, 1516318017, 1516318667, 1516335163,
1516334468, 1516344122), class = c("POSIXct", "POSIXt"), tzone = ""),
user_id = c(1, 2, 3, 4, 2, 1, 1, 3, 2, 2)), .Names = c("start_time",
"user_id"), row.names = c(NA, 10L), class = "data.frame")
library(dplyr)
df %>%
  group_by(user_id) %>%
  mutate(dif = difftime(start_time, lag(start_time), units = "hours")) %>%
  filter(row_number() == 2)
Result:
# A tibble: 3 x 3
# Groups: user_id [3]
start_time user_id dif
<dttm> <dbl> <time>
1 2018-01-18 21:08:45 2 16.97694 hours
2 2018-01-18 21:26:57 1 17.27667 hours
3 2018-01-19 02:12:43 3 13.16861 hours
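If the rows are not guaranteed to be in chronological order within each user, an explicit arrange() step guards against that, and slice(2) is an equivalent way to keep only the second visit (a sketch, not part of the original answer; users with a single record drop out automatically because they have no second row):
df %>%
  group_by(user_id) %>%
  arrange(start_time, .by_group = TRUE) %>%
  mutate(dif = difftime(start_time, lag(start_time), units = "hours")) %>%
  slice(2)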

Merge Columns with different number of rows into one data frame in R [duplicate]

This question already has answers here: How to join (merge) data frames (inner, outer, left, right).
I am trying to dynamically determine the number of columns imported from an Excel sheet and add them to another data frame. The two data frames have different numbers of rows, so cbind() does not work. Can someone help me understand what I am doing wrong?
The data frame I am working with is given below:
dput(Book1)
structure(list(`Row Labels` = structure(c(1667260800, 1669852800,
1672531200, 1675209600, 1677628800, 1680307200, 1682899200, 1685577600,
1688169600), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
X1 = c(1, 2, 2, 3, 3, 4, 4, 5, 5), X2 = c(2, 3, 2, 3, 2,
3, 2, 3, 2), X3 = c(3, 4, 3, 4, 3, 4, 3, 4, 3)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L))
dput(Book2)
structure(list(`Row Labels` = structure(c(1667260800, 1669852800,
1672531200, 1675209600, 1677628800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), X6 = c(1, 1, 1, 1, 1), X7 = c(1, 1, 1, 1,
1), X8 = c(1, 1, 1, 1, 1), X9 = c(1, 1, 1, 1, 1)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
The code which I have written is:
library(readxl)
Book1 <- read_excel("C:/X/X/X/X/Book1.xlsx",sheet = "Sheet1")
View(Book1)
Book2 <- read_excel("C:/X/X/X/X/Book1.xlsx",sheet = "Sheet2")
dput(Book1)
dput(Book2)
ncol(Book2)
colnames(Book2)
cbind(Book1,Book2)
The expected output combines the columns of both tables by Row Labels; the error I am getting from cbind() is about the differing numbers of rows (9 vs. 5).
You can use full_join:
library(dplyr)
Book1 %>%
  full_join(Book2, by = "Row Labels")
# A tibble: 9 × 8
`Row Labels` X1 X2 X3 X6 X7 X8 X9
<dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2022-11-01 00:00:00 1 2 3 1 1 1 1
2 2022-12-01 00:00:00 2 3 4 1 1 1 1
3 2023-01-01 00:00:00 2 2 3 1 1 1 1
4 2023-02-01 00:00:00 3 3 4 1 1 1 1
5 2023-03-01 00:00:00 3 2 3 1 1 1 1
6 2023-04-01 00:00:00 4 3 4 NA NA NA NA
7 2023-05-01 00:00:00 4 2 3 NA NA NA NA
8 2023-06-01 00:00:00 5 3 4 NA NA NA NA
9 2023-07-01 00:00:00 5 2 3 NA NA NA NA
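If the number of sheets is itself dynamic, one possible approach (a sketch that assumes every sheet contains the shared Row Labels column, not part of the answer above) is to read every sheet into a list and reduce it with full_join:
library(readxl)
library(dplyr)

path   <- "C:/X/X/X/X/Book1.xlsx"    # path as in the question
sheets <- excel_sheets(path)         # all sheet names in the workbook
books  <- lapply(sheets, function(s) read_excel(path, sheet = s))

# join every sheet on the shared "Row Labels" column
combined <- Reduce(function(x, y) full_join(x, y, by = "Row Labels"), books)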

Creating a list of column values by group

I have data as follows:
dat_in <- structure(list(Name = c("Name_a", "Name_a", "Name_a", "Name_a",
"Name_a", "Name_b", "Name_b", "Name_b", "Name_b"), freq = c(5,
4, 3, 2, 14, 1, 6, 0, 7), colspan = c(bb.25 = 1, bb.100 = 2,
bb.500 = 2, bb.Infinity = 4, bb.SUM = 1, aa.25 = 1, aa.3000 = 7,
aa.Infinity = 1, aa.SUM = 1), width = c(bb.25 = 50, bb.100 = 100,
bb.500 = 100, bb.Infinity = 200, bb.SUM = 50, aa.25 = 50, aa.3000 = 350,
aa.Infinity = 50, aa.SUM = 50)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L))
# A tibble: 9 x 4
Name freq colspan width
<chr> <dbl> <dbl> <dbl>
1 Name_a 5 1 50
2 Name_a 4 2 100
3 Name_a 3 2 100
4 Name_a 2 4 200
5 Name_a 14 1 50
6 Name_b 1 1 50
7 Name_b 6 7 350
8 Name_b 0 1 50
9 Name_b 7 1 50
I would like to create a column containing the list of colspan values for each Name:
dat_out <- structure(list(rn = c("Name_a", "Name_b"), colspan= list(
c(1, 2, 2, 4, 1), c(1, 1, 7, 1))), row.names = c(NA,
-2L), class = c("tbl_df", "tbl", "data.frame"))
How do I create a list of colspan values by Name?
With dplyr, you can group the data by Name and then summarise each group with list().
library(dplyr)
dat_in %>%
  group_by(Name) %>%
  summarise(colspan = list(colspan))
# # A tibble: 2 × 2
# Name colspan
# <chr> <list>
# 1 Name_a <dbl [5]>
# 2 Name_b <dbl [4]>
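Note that colspan in dat_in is a named vector (bb.25, bb.100, ...), so each list element keeps those names. If the plain unnamed vectors of dat_out are wanted, unname() can be added; a small sketch:
dat_out <- dat_in %>%
  group_by(Name) %>%
  summarise(colspan = list(unname(colspan)))

dat_out$colspan[[1]]
# [1] 1 2 2 4 1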

How to do a conditional NA fill in R dataframe

It may be simple, but I could not figure it out.
How do I fill the NA values in the feature column of the data frame dt according to the conditions below?
The conditions for filling NA are:
If the difference in Date is 1, fill the NA with the previous row's value (easily done with the fill function from the tidyverse):
library(tidyverse)
dt_fl <- dt %>%
  fill(feature, .direction = "down")
dt_fl
If the difference in Date is > 1, fill the NA with the previous feature value + 1 and increment all following feature values by 1 so that the feature numbers stay continuous.
dt_output below shows what I expect from dt after filling the NA values and renumbering the features accordingly.
dt<-structure(list(Date = structure(c(15126, 15127, 15128, 15129,
15130, 15131, 15132, 15133, 15134, 15138, 15139, 15140, 15141,
15142, 15143, 15144, 15145, 15146, 15147, 15148, 15149), class = "Date"),
feature = c(1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, NA, NA, NA,
2, 2, 2, 2, 2, 2, NA)), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame"))
dt
dt_output<-structure(list(Date = structure(c(15126, 15127, 15128, 15129,
15130, 15131, 15132, 15133, 15134, 15138, 15139, 15140, 15141,
15142, 15143, 15144, 15145, 15146, 15147, 15148, 15149), class = "Date"),
feature = c(1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, NA, NA, NA,
2, 2, 2, 2, 2, 2, NA), finaloutput = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3)), row.names = c(NA,
-21L), spec = structure(list(cols = list(Date = structure(list(), class = c("collector_character",
"collector")), feature = structure(list(), class = c("collector_double",
"collector")), finaloutput = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
dt_output
Also, following Ben's suggestion: if the data frame starts with NA in feature, as in dt2, how can that be fixed? The expected output for dt2 is in dt2_output.
dt2<-structure(list(Date = structure(c(13675, 13676, 13677, 13678,
13679, 13689, 13690, 13691, 13692, 13693, 13694, 13695), class = "Date"),
feature = c(NA, NA, NA, NA, NA, 1, 1, 1, 1, 1, NA, 2)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
dt2_output<-structure(list(Date = structure(c(13675, 13676, 13677, 13678,
13679, 13689, 13690, 13691, 13692, 13693, 13694, 13695), class = "Date"),
feature = c(NA, NA, NA, NA, NA, 1, 1, 1, 1, 1, NA, 2), output_feature = c(1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3)), row.names = c(NA, -12L
), spec = structure(list(cols = list(Date = structure(list(), class = c("collector_character",
"collector")), feature = structure(list(), class = c("collector_double",
"collector")), output_feature = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
The solution Ben provides works fine for all the conditions except one, illustrated in dt3 below, and I am wondering why that is. My assumption was that the second solution should give dt3_expected for dt3.
dt3<-structure(list(Date = structure(c(10063, 10064, 10065, 10066,
10067, 10068, 10069, 10070, 10079, 10080, 10081, 10082, 10083,
10084, 10085, 10086, 10087, 10088, 10089), class = "Date"), feature = c(1,
1, 1, 1, 1, 1, 1, NA, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA)), row.names = c(NA,
-19L), class = c("tbl_df", "tbl", "data.frame"))
dt3
dt3_expected<-structure(list(Date = structure(c(10063, 10064, 10065, 10066,
10067, 10068, 10069, 10070, 10079, 10080, 10081, 10082, 10083,
10084, 10085, 10086, 10087, 10088, 10089), class = "Date"), feature = c(1,
1, 1, 1, 1, 1, 1, NA, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA), output_feature = c(1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)), row.names = c(NA,
-19L), spec = structure(list(cols = list(Date = structure(list(), class = c("collector_character",
"collector")), feature = structure(list(), class = c("collector_double",
"collector")), output_feature = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
The help is greatly appreciated, thank you.
You could try creating an "offset" that is added whenever you have missing values and a difference in dates greater than 1 day. This cumulative offset can be added to your feature value to determine the finaloutput.
library(dplyr)
library(tidyr)

dt %>%
  mutate(offset = cumsum(is.na(feature) & Date - lag(Date) > 1)) %>%
  fill(feature, .direction = "down") %>%
  mutate(finaloutput = feature + offset)
Output
# A tibble: 21 x 4
Date feature offset finaloutput
<date> <dbl> <int> <dbl>
1 2011-06-01 1 0 1
2 2011-06-02 1 0 1
3 2011-06-03 1 0 1
4 2011-06-04 1 0 1
5 2011-06-05 1 0 1
6 2011-06-06 1 0 1
7 2011-06-07 1 0 1
8 2011-06-08 1 0 1
9 2011-06-09 1 0 1
10 2011-06-13 1 1 2
11 2011-06-14 1 1 2
12 2011-06-15 1 1 2
13 2011-06-16 1 1 2
14 2011-06-17 1 1 2
15 2011-06-18 2 1 3
16 2011-06-19 2 1 3
17 2011-06-20 2 1 3
18 2011-06-21 2 1 3
19 2011-06-22 2 1 3
20 2011-06-23 2 1 3
21 2011-06-24 2 1 3
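The same offset idea can also be written with data.table; this is my own sketch, not part of the answer above, with nafill(type = "locf") playing the role of fill():
library(data.table)
dtt <- as.data.table(dt)
dtt[, offset := cumsum(is.na(feature) & c(FALSE, diff(Date) > 1))]  # bump offset at each gap > 1 day
dtt[, feature := nafill(feature, type = "locf")]                    # fill NA downward
dtt[, finaloutput := feature + offset]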
Edit: With the second example dt2 that begins with NA, you can try the following.
First, you can add a default for lag. When the first row is NA, the condition still evaluates the difference in Date; since there is no prior Date to compare with, you can supply a default more than 1 day earlier, so that an offset is added and these initial NA rows are treated as the "first" feature.
The second issue is filling the NA values that cannot be filled downward (there is no prior feature value when the data start with NA). You can simply replace these with 0; given the offset, the finaloutput becomes 0 + 1 = 1.
dt2 %>%
  mutate(offset = cumsum(is.na(feature) & Date - lag(Date, default = first(Date) - 2) > 1)) %>%
  fill(feature, .direction = "down") %>%
  replace_na(list(feature = 0)) %>%
  mutate(finaloutput = feature + offset)
Output
Date feature offset finaloutput
<date> <dbl> <int> <dbl>
1 2007-06-11 0 1 1
2 2007-06-12 0 1 1
3 2007-06-13 0 1 1
4 2007-06-14 0 1 1
5 2007-06-15 0 1 1
6 2007-06-25 1 1 2
7 2007-06-26 1 1 2
8 2007-06-27 1 1 2
9 2007-06-28 1 1 2
10 2007-06-29 1 1 2
11 2007-06-30 1 1 2
12 2007-07-01 2 1 3
Edit: With the additional comment, there is another criterion to consider.
If the difference in Date is > 1 and there are only 2 NA, the first NA should be filled from the previous feature and the second from the following feature. In particular, the second of two NA values at a gap has to be dealt with differently.
One approach is to count the number of consecutive NA values. Then feature can be filled in for this particular circumstance, where the second of two NA values coincides with a Date gap.
dt3 %>%
  mutate(grp = cumsum(c(1, abs(diff(is.na(feature))) == 1))) %>%
  add_count(grp) %>%
  ungroup() %>%
  mutate(feature = ifelse(is.na(feature) & n == 2 & is.na(lag(feature)), lead(feature), feature)) %>%
  mutate(offset = cumsum(is.na(feature) & Date - lag(Date, default = first(Date) - 2) > 1)) %>%
  fill(feature, .direction = "down") %>%
  replace_na(list(feature = 0)) %>%
  mutate(finaloutput = feature + offset)
Output
Date feature grp n offset finaloutput
<date> <dbl> <dbl> <int> <int> <dbl>
1 1997-07-21 1 1 7 0 1
2 1997-07-22 1 1 7 0 1
3 1997-07-23 1 1 7 0 1
4 1997-07-24 1 1 7 0 1
5 1997-07-25 1 1 7 0 1
6 1997-07-26 1 1 7 0 1
7 1997-07-27 1 1 7 0 1
8 1997-07-28 1 2 2 0 1
9 1997-08-06 2 2 2 0 2
10 1997-08-07 2 3 9 0 2
11 1997-08-08 2 3 9 0 2
12 1997-08-09 2 3 9 0 2
13 1997-08-10 2 3 9 0 2
14 1997-08-11 2 3 9 0 2
15 1997-08-12 2 3 9 0 2
16 1997-08-13 2 3 9 0 2
17 1997-08-14 2 3 9 0 2
18 1997-08-15 2 3 9 0 2
19 1997-08-16 2 4 1 0 2
Note that this could be simplified, but before doing so we would need to be sure it meets your needs.

Dataframe: Divide each group by a vector corresponding to each group in R?

I have a data frame like this:
df1 <- structure(list(user_id = c(1, 1, 1, 2, 2, 2, 3, 3, 3), param_a = c(123,
2.3, -9, 1, -0.03333, 4, -41, -12, 0.89)), .Names = c("user_id",
"param_a"), row.names = c(NA, -9L), class = c("tbl_df", "tbl",
"data.frame"))
and another dataframe of vectors:
df2 <- structure(list(user_id = c(1, 2, 3), param_b = c(34, 12, -0.89
)), .Names = c("user_id", "param_b"), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
Now I want to divide each group in df1 by the corresponding value in df2.
For example, for user 1, divide each row of param_a by that user's param_b value:
user_id param_a
1 123/34
1 2.3/34
1 -9/34
2 1/12
2 -0.03333/12
2 4/12
....
For user 2, divide each row by the second param_b value, and so on.
Please advise: how can I divide a data frame, grouped by user, by a per-group value?
P.S.
If df1 is extended with param_a, param_k, param_p
and df2 accordingly with param_b, param_l, param_r,
how can I perform this kind of operation? @nicola suggested a very nice solution, but I want to extend it.
Something like this?
df1 %>%
  left_join(df2) %>%
  mutate(result = param_a / param_b)
Joining, by = "user_id"
# A tibble: 9 x 4
user_id param_a param_b result
<dbl> <dbl> <dbl> <dbl>
1 1 123 34 3.62
2 1 2.3 34 0.0676
3 1 -9 34 -0.265
4 2 1 12 0.0833
5 2 -0.0333 12 -0.00278
6 2 4 12 0.333
7 3 -41 -0.89 46.1
8 3 -12 -0.89 13.5
9 3 0.89 -0.89 -1
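For the extended case in the P.S., the same join works; the extra column names below (param_k, param_l, param_p, param_r) are the hypothetical ones from the question, so this is only a sketch:
library(dplyr)

df1 %>%
  left_join(df2, by = "user_id") %>%
  mutate(result_a = param_a / param_b
         # , result_k = param_k / param_l   # with the extended data
         # , result_p = param_p / param_r
         )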

R: Efficient extraction of events (continuous increase in a variable)

The task is to efficiently extract events from this data:
data <- structure(
list(i = c(1, 1, 1, 2, 2, 2), t = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("i", "t", "x"), row.names = c(NA, -6L), class = "data.frame"
)
> data
i t x
1 1 1 1
2 1 2 1
3 1 3 2
4 2 1 1
5 2 3 2
6 2 4 3
Let's call i the fact, t the time, and x the number of selections of fact i at time t.
An event is an uninterrupted sequence of selections of one fact. Fact 1 is selected all throughout t=1 to t=3 with a sum of 4 selections. But fact 2 is split into two events, the first from t=1 to t=1 (sum=1) and the second from t=3 to t=4 (sum=5). Therefore, the event data frame is supposed to look like this:
> event
i from to sum
1 1 1 3 4
2 2 1 1 1
3 2 3 4 5
This code does what is needed:
event <- structure(
list(i = logical(0), from = logical(0), to = logical(0), sum = logical(0)),
.Names = c("i", "from", "to", "sum"), row.names = integer(0),
class = "data.frame"
)
l <- nrow(data) # get rows of data frame
c <- 1 # set counter
d <- 1 # set initial row of data to start with
e <- 1 # set initial row of event to fill
repeat{
  event[e,1] <- data[d,1]              # store "i" in event data frame
  event[e,2] <- data[d,2]              # store "from" in event data frame
  while((data[d+1,1] == data[d,1]) & (data[d+1,2] == data[d,2]+1)){
    c <- c+1
    d <- d+1
    if(d >= l) break
  }
  event[e,3] <- data[d,2]              # store "to" in event data frame
  event[e,4] <- sum(data[(d-c+1):d,3]) # store "sum" in event data frame
  c <- 1
  d <- d+1
  e <- e+1
}
The problem is that this code takes 3 days to extract the events from a data frame with 1 million rows and my data frame has 5 million rows.
How can I make this more efficient?
P.S.: There's also a minor bug in my code related to termination.
P.P.S.: The data is sorted first by i, then by t.
Can you try whether this dplyr implementation is faster?
library(dplyr)
data <- structure(
list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)
group_by(data, fact) %>%
  mutate(fromto = cumsum(c(0, diff(timing) > 1))) %>%
  group_by(fact, fromto) %>%
  summarize(from = min(timing), to = max(timing), sumx = sum(x)) %>%
  select(-fromto) %>%
  ungroup()
How about this data.table implementation?
library(data.table)
data <- structure(
list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)
setDT(data)[, fromto:=cumsum(c(0, diff(timing) > 1)), by=fact]
event <- data[, .(from=min(timing), to=max(timing), sumx=sum(x)), by=c("fact", "fromto")][,fromto:=NULL]
## results when entering event in the R console; my data.table package version is data.table_1.9.6
> event
fact from to sumx
1: 1 1 3 4
2: 2 1 1 1
3: 2 3 4 5
> str(event)
Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables:
$ fact: num 1 2 2
$ from: num 1 1 3
$ to : num 3 1 4
$ sumx: num 4 1 5
- attr(*, ".internal.selfref")=<externalptr>
> dput(event)
structure(list(fact = c(1, 2, 2), from = c(1, 1, 3), to = c(3,
1, 4), sumx = c(4, 1, 5)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .Names = c("fact", "from", "to", "sumx"), .internal.selfref = <pointer: 0x0000000000120788>)
Reference
detect intervals of the consequent integer sequences
Assuming the data frame is sorted by t within each i, you can try something like this:
event <- NULL
for (i in unique(data$i)) {
  x <- data[data$i == i, ]
  ev <- cumsum(c(1, diff(x$t)) > 1)
  smry <- lapply(split(x, ev), function(z) c(i, range(z$t), sum(z$x)))
  event <- c(event, smry)
}
event <- do.call(rbind, event)
rownames(event) <- NULL
colnames(event) <- c('i', 'from', 'to', 'sum')
The result is a matrix, not a data frame.
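If a data frame is preferred, the matrix can be converted afterwards:
event <- as.data.frame(event)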
