My data is structured as follows:
price machine timestamp date hour weekday month year trans_id
1: 3.1 179 2017-01-11 15:53:58 2017-01-11 15 Wednesday 1 2017 2017-01-11 15:53:58,179
2: 3.1 179 2017-01-11 15:53:45 2017-01-11 15 Wednesday 1 2017 2017-01-11 15:53:45,179
3: 3.1 179 2017-01-28 00:31:20 2017-01-28 0 Saturday 1 2017 2017-01-28 00:31:20,179
4: 3.1 179 2017-02-04 02:08:42 2017-02-04 2 Saturday 2 2017 2017-02-04 02:08:42,179
5: 3.1 179 2017-03-03 06:34:04 2017-03-03 6 Friday 3 2017 2017-03-03 06:34:04,179
---
1840473: 2.3 2707 2017-04-01 17:06:42 2017-04-01 17 Saturday 4 2017 2017-04-01 17:06:42,2707
1840474: 2.3 2707 2017-04-01 07:55:11 2017-04-01 7 Saturday 4 2017 2017-04-01 07:55:11,2707
1840475: 2.3 2709 2017-02-19 00:28:08 2017-02-19 0 Sunday 2 2017 2017-02-19 00:28:08,2709
1840476: 2.3 2709 2017-03-19 07:34:21 2017-03-19 7 Sunday 3 2017 2017-03-19 07:34:21,2709
1840477: 2.3 2709 2017-03-29 05:56:19 2017-03-29 5 Wednesday 3 2017 2017-03-29 05:56:19,2709
What I am trying to do is calculate the average number of transactions per day for each machine. Then, for every hour in which the machine made a sale, I want to add a column with the difference between the number of transactions in that hour and the daily average.
I have managed to get this when I subset my total data to a single machine and day, e.g.:
ex=dt_2017[(machine=='179')&(date=='2017-01-11')]
total_hours=ex[,unique(hour)]
total_day_transaction=nrow(ex)
average_hour_transaction=total_day_transaction/length(total_hours)
change_hour=vector(mode='list')
counterk=1
for (k in total_hours){
  hour_transac=nrow(ex[hour==k])
  change=(hour_transac-average_hour_transaction)/average_hour_transaction
  change_hour[[counterk]]=change
  counterk=counterk+1
}
avg_matrix=cbind(as.data.frame(total_hours),transpose(as.data.frame(change_hour)))
ex2=setDT(merge(x=ex,
y=avg_matrix,
by.x='hour',
by.y='total_hours'))
colnames(ex2)[ncol(ex2)]<-'hour_change'
trans_id=ex2[,trans_id]
dyna_price=vector(mode='list')
counterl=1
for (l in trans_id){
  if (ex2[trans_id==l,hour_change]>0){
    dyna_price[counterl]=ex2[trans_id==l,price]*(1+ex2[trans_id==l,hour_change])
  } else {
    dyna_price[counterl]=ex2[trans_id==l,price]
  }
  counterl=counterl+1
}
dyna_price_matrix=cbind(as.data.frame(trans_id),transpose(as.data.frame(dyna_price)))
ex3=merge(x=dt_2017,
y=dyna_price_matrix,
by='trans_id',
all.x=TRUE)
colnames(ex3)[ncol(ex3)]<-'dynamic_price'
However, I would like to iterate this over every machine and every day. I believe I would need to find a way to name my data table with a variable, but I cannot find anything online.
Any help is appreciated.
Thank you very much
We can use different by = groupings and assign new variables with :=. .N is a special symbol that contains the number of rows in the current group.
library(data.table)
setDT(Data)[,hour.trans := .N, by = c("machine","date","hour")][
,daily.avg := .N / 24,by = c("machine","date")][
,difference := hour.trans - daily.avg, by = c("machine","date")][
,.(machine,date,hour,daily.avg,difference)]
# machine date hour daily.avg difference
# 1: 179 2017-01-11 15 0.08333333 1.9166667
# 2: 179 2017-01-11 15 0.08333333 1.9166667
# 3: 179 2017-01-28 0 0.04166667 0.9583333
# 4: 179 2017-02-04 2 0.04166667 0.9583333
# 5: 179 2017-03-03 6 0.04166667 0.9583333
# 6: 2707 2017-04-01 17 0.08333333 0.9166667
# 7: 2707 2017-04-01 7 0.08333333 0.9166667
# 8: 2709 2017-02-19 0 0.04166667 0.9583333
# 9: 2709 2017-03-19 7 0.04166667 0.9583333
#10: 2709 2017-03-29 5 0.04166667 0.9583333
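If the goal is also to reproduce the question's dynamic_price across every machine and day without the per-subset loops, a rough sketch along the same grouped lines (assuming the full dt_2017 from the question, and taking hour_change to be the relative deviation from the per-day hourly average, as in the question's code) could be:
library(data.table)
setDT(dt_2017)

# transactions per active hour for each machine/day
# (the question's average_hour_transaction)
dt_2017[, avg_hour_trans := .N / uniqueN(hour), by = .(machine, date)]

# relative change of each hour versus that average
# (the question's hour_change)
dt_2017[, hour_change := (.N - avg_hour_trans) / avg_hour_trans,
        by = .(machine, date, hour)]

# mark the price up only when the hour is busier than average
dt_2017[, dynamic_price := fifelse(hour_change > 0,
                                   price * (1 + hour_change),
                                   price)]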
Data
structure(list(price = c(3.1, 3.1, 3.1, 3.1, 3.1, 2.3, 2.3, 2.3,
2.3, 2.3), machine = c(179L, 179L, 179L, 179L, 179L, 2707L, 2707L,
2709L, 2709L, 2709L), timestamp = structure(c(2L, 1L, 3L, 4L,
6L, 10L, 9L, 5L, 7L, 8L), .Label = c("2017-01-11 15:53:45", "2017-01-11 15:53:58",
"2017-01-28 00:31:20", "2017-02-04 02:08:42", "2017-02-19 00:28:08",
"2017-03-03 06:34:04", "2017-03-19 07:34:21", "2017-03-29 05:56:19",
"2017-04-01 07:55:11", "2017-04-01 17:06:42"), class = "factor"),
date = structure(c(1L, 1L, 2L, 3L, 5L, 8L, 8L, 4L, 6L, 7L
), .Label = c("2017-01-11", "2017-01-28", "2017-02-04", "2017-02-19",
"2017-03-03", "2017-03-19", "2017-03-29", "2017-04-01"), class = "factor"),
hour = c(15L, 15L, 0L, 2L, 6L, 17L, 7L, 0L, 7L, 5L), weekday = structure(c(4L,
4L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 4L), .Label = c("Friday",
"Saturday", "Sunday", "Wednesday"), class = "factor"), month = c(1L,
1L, 1L, 2L, 3L, 4L, 4L, 2L, 3L, 3L), year = c(2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L),
trans_id = structure(c(2L, 1L, 3L, 4L, 6L, 10L, 9L, 5L, 7L,
8L), .Label = c("2017-01-11 15:53:45,179", "2017-01-11 15:53:58,179",
"2017-01-28 00:31:20,179", "2017-02-04 02:08:42,179", "2017-02-19 00:28:08,2709",
"2017-03-03 06:34:04,179", "2017-03-19 07:34:21,2709", "2017-03-29 05:56:19,2709",
"2017-04-01 07:55:11,2707", "2017-04-01 17:06:42,2707"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
I have been given a table of data to analyse.
I have a column with start times and a column with finish times given as 3 or 4 digits; for example, 3:40 pm is 1540 and 7:25 am is 725.
How can I obtain a new column in my table with the difference of these times, given in minutes?
There are over 2000 entries.
Thanks for any help.
structure(list(schedtime = c(1455L, 1640L, 1245L, 1715L, 1039L,
840L), deptime = c(1455L, 1640L, 1245L, 1709L, 1035L, 839L),
distance = c(184L, 213L, 229L, 229L, 229L, 228L), flightnumber =
c(5935L,
6155L, 7208L, 7215L, 7792L, 7800L), weather = c(0L, 0L, 0L,
0L, 0L, 0L), dayweek = c(4L, 4L, 4L, 4L, 4L, 4L), daymonth = c(1L,
1L, 1L, 1L, 1L, 1L)), row.names = c(NA, 6L), class = "data.frame")
This is an example of the dataset (I'm unsure as to why there are L's after each number; they are not shown in the table). I want, in minutes, deptime (finish) - schedtime (start).
Given the new column, there are 2 values with a schedtime before midnight and a deptime after midnight, for example schedtime 2120 and deptime 0010. The difference is then given as -1270, treating it as an extremely early departure. How could I change this so it is calculated as +170, a late departure?
An efficient way for a larger dataset:
library(lubridate)  # minutes() comes from lubridate
data.table::setDT(dt)[,time_diff:=minutes(deptime-schedtime)]
> dt
schedtime deptime distance flightnumber weather dayweek daymonth time_diff
1: 1455 1455 184 5935 0 4 1 0S
2: 1640 1640 213 6155 0 4 1 0S
3: 1245 1245 229 7208 0 4 1 0S
4: 1715 1709 229 7215 0 4 1 -6M 0S
5: 1039 1035 229 7792 0 4 1 -4M 0S
6: 840 839 228 7800 0 4 1 -1M 0S
EDIT - (to handle cases like 1730 - 1600 = 130 mins, when actually it is 90 mins):
library(data.table)
library(stringr)
setDT(dt)
dt[,schedtime:=str_pad(schedtime, 4, pad = "0")]
dt[,deptime:=str_pad(deptime, 4, pad = "0")]
dt[,time_diff:=difftime(as.ITime(strptime(x = schedtime, format = "%H%M")),as.ITime(strptime(x = deptime, format = "%H%M")),units = "mins")]
> dt
schedtime deptime distance flightnumber weather dayweek daymonth time_diff
1: 1455 1455 184 5935 0 4 1 0 mins
2: 1640 1640 213 6155 0 4 1 0 mins
3: 1245 1245 229 7208 0 4 1 0 mins
4: 1715 1709 229 7215 0 4 1 6 mins
5: 1039 1035 229 7792 0 4 1 4 mins
6: 1730 1600 228 7800 0 4 1 90 mins
dat <- data.frame(c(1540,1820,1330,545,100),c(1850,2150,2325,1330,101))
60*(floor(dat[,2]/100) - floor(dat[,1]/100)) - dat[,1] %% 100 + dat[,2] %% 100
Taking the floor of the hundreds gives the hours. Taking the difference of the hours and multiplying by 60 gives the minutes contributed by the hour difference. Then you subtract the start time's minutes and add the finish time's minutes to get the total minutes passed.
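To handle the over-midnight cases from the question (schedtime 2120, deptime 0010), one hedged extension of the same arithmetic is to add a full day whenever the raw difference is strongly negative; the -720 cutoff below is an assumption so that genuinely early departures (e.g. -6 minutes) are left alone:
mins <- 60 * (floor(dat[, 2] / 100) - floor(dat[, 1] / 100)) -
  dat[, 1] %% 100 + dat[, 2] %% 100

# assume a gap more than 12 hours "early" really crossed midnight
mins_adjusted <- ifelse(mins < -720, mins + 24 * 60, mins)
# e.g. schedtime 2120, deptime 0010: raw -1270, adjusted 170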
You can use the lubridate library to find the difference in minutes; lubridate provides very good functionality for time-related data. Hope this helps.
library(lubridate)
df$deptime_new <- minutes(df$deptime-df$schedtime)
Data
df <- structure(list(schedtime = c(1455L, 1640L, 1245L, 1715L, 1039L,
840L), deptime = c(1455L, 1640L, 1245L, 1709L, 1035L, 839L),
distance = c(184L, 213L, 229L, 229L, 229L, 228L), flightnumber =
c(5935L,
6155L, 7208L, 7215L, 7792L, 7800L), weather = c(0L, 0L, 0L,
0L, 0L, 0L), dayweek = c(4L, 4L, 4L, 4L, 4L, 4L), daymonth = c(1L,
1L, 1L, 1L, 1L, 1L)), row.names = c(NA, 6L), class = "data.frame")
I have the same query: is there a way to calculate the time difference of the times in a column and display the answers, in minutes, in a new column?
I want to split an irregular time series into separate events and assign each event a unique numerical ID for each site.
Here is an example data frame:
structure(list(site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("AllenBrook", "Eastberk"), class =
"factor"),
timestamp = structure(c(10L, 13L, 8L, 4L, 5L, 6L, 7L, 9L,
11L, 12L, 1L, 2L, 3L), .Label = c("10/1/12 11:29", "10/1/12 14:29",
"10/1/12 17:29", "10/20/12 16:30", "10/20/12 19:30", "10/21/12 1:30",
"10/21/12 4:30", "9/5/12 12:30", "9/5/12 4:14", "9/5/12 6:30",
"9/5/12 7:14", "9/5/12 7:44", "9/5/12 9:30"), class = "factor")), class
= "data.frame", row.names = c(NA,
-13L))
The events are not all the same length or number of timestamps, so I want to start a new event whenever more than 12 hours elapse between one timestamp and the next timestamp at that site. Each event at a site should receive a unique numerical ID. Here's the outcome I would like:
site timestamp eventid
1 AllenBrook 9/5/12 6:30 1
2 AllenBrook 9/5/12 9:30 1
3 AllenBrook 9/5/12 12:30 1
4 AllenBrook 10/20/12 16:30 2
5 AllenBrook 10/20/12 19:30 2
6 AllenBrook 10/21/12 1:30 2
7 AllenBrook 10/21/12 4:30 2
8 Eastberk 9/5/12 4:14 1
9 Eastberk 9/5/12 7:14 1
10 Eastberk 9/5/12 7:44 1
11 Eastberk 10/1/12 11:29 2
12 Eastberk 10/1/12 14:29 2
13 Eastberk 10/1/12 17:29 2
Any coding solution will do, but bonus points for a tidyverse or data.table solution. Thanks for any help you can provide!
Using data.table, you can perhaps do the following:
library(data.table)
setDT(tmp)[, timestamp := as.POSIXct(timestamp, format="%m/%d/%y %H:%M")][,
eventid := 1L+cumsum(c(0L, diff(timestamp)>720)), by=.(site)]
diff(timestamp) calculates the time difference between adjacent rows. Then we check whether the difference is greater than 12 hours (720 minutes). A common trick in R is to use cumsum to mark where a new event starts in a series and to group the subsequent elements together until the next event boundary. Since diff returns one fewer element than its input, we use 0L to pad the beginning. The 1L+ merely starts the indexing from 1 instead of 0. (A dplyr version of the same idea is sketched after the data below.)
output:
site timestamp eventid
1: AllenBrook 2012-09-05 06:30:00 1
2: AllenBrook 2012-09-05 09:30:00 1
3: AllenBrook 2012-09-05 12:30:00 1
4: AllenBrook 2012-10-20 16:30:00 2
5: AllenBrook 2012-10-20 19:30:00 2
6: AllenBrook 2012-10-21 01:30:00 2
7: AllenBrook 2012-10-21 04:30:00 2
8: Eastberk 2012-09-05 04:14:00 1
9: Eastberk 2012-09-05 07:14:00 1
10: Eastberk 2012-09-05 07:44:00 1
11: Eastberk 2012-10-01 11:29:00 2
12: Eastberk 2012-10-01 14:29:00 2
13: Eastberk 2012-10-01 17:29:00 2
data:
tmp <- structure(list(site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("AllenBrook", "Eastberk"), class =
"factor"),
timestamp = structure(c(10L, 13L, 8L, 4L, 5L, 6L, 7L, 9L,
11L, 12L, 1L, 2L, 3L), .Label = c("10/1/12 11:29", "10/1/12 14:29",
"10/1/12 17:29", "10/20/12 16:30", "10/20/12 19:30", "10/21/12 1:30",
"10/21/12 4:30", "9/5/12 12:30", "9/5/12 4:14", "9/5/12 6:30",
"9/5/12 7:14", "9/5/12 7:44", "9/5/12 9:30"), class = "factor")), class
= "data.frame", row.names = c(NA,
-13L))
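A roughly equivalent dplyr sketch (assuming the same tmp data; the timestamps are compared as numeric seconds so the 12-hour check does not depend on difftime's automatic units) could be:
library(dplyr)

tmp %>%
  mutate(timestamp = as.POSIXct(timestamp, format = "%m/%d/%y %H:%M")) %>%
  group_by(site) %>%
  arrange(timestamp, .by_group = TRUE) %>%
  # a gap of more than 12 hours (12 * 3600 seconds) starts a new event
  mutate(eventid = 1L + cumsum(c(0L, diff(as.numeric(timestamp)) > 12 * 3600))) %>%
  ungroup()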
I have a dataset of game sessions (id, count of sessions, average seconds per session, and date of session for each id).
Here is a sample of mydat:
mydat=read.csv("C:/Users/Admin/desktop/rty.csv", sep=";",dec=",")
structure(list(udid = c(74385162L, 79599601L, 79599601L, 91475825L,
91475825L, 91492531L, 92137561L, 96308016L, 96308016L, 96308016L,
96308016L, 96308016L, 96495076L, 97135620L, 97135620L, 97135620L,
97135620L, 97135620L, 97135620L, 97135620L, 97135620L, 97135620L,
97135620L, 97165942L), count = c(1L, 1L, 1L, 1L, 3L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), avg_duration = c(39L, 1216L, 568L, 5L, 6L, 79L, 9L, 426L,
78L, 884L, 785L, 785L, 22L, 302L, 738L, 280L, 2782L, 5L, 2284L,
144L, 234L, 231L, 539L, 450L), date = structure(c(13L, 3L, 3L,
1L, 1L, 14L, 2L, 11L, 11L, 11L, 12L, 12L, 9L, 7L, 4L, 4L, 5L,
6L, 8L, 8L, 8L, 8L, 8L, 10L), .Label = c("11.10.16", "12.12.16",
"15.11.16", "15.12.16", "16.12.16", "17.12.16", "18.10.16", "18.12.16",
"21.10.16", "26.10.16", "28.11.16", "29.11.16", "31.10.16", "8.10.16"
), class = "factor")), .Names = c("udid", "count", "avg_duration",
"date"), class = "data.frame", row.names = c(NA, -24L))
I need to calculate the time difference between the first date a player appeared and the last date they were seen.
For example, uid 97135620 first played on 18.10.2016 and was last seen on 18.12.2016, which means the difference between the first and last day is 61 days;
meanwhile uid 74385162 started on 31.10.2016 and never played again (i.e. he played only once), so the difference between the first and last date is 0.
uid 79599601 has two sessions on the same day (i.e. played twice in one day), so the difference is 1.
In the output I expect this format, keeping only the last date and the value of the difference between the last day and the first day.
udid count avg_duration date datediff
74385162 1 39 31.10.2016 0
79599601 1 568 15.11.2016 1
91475825 1 5 11.10.2016 1
91492531 1 79 08.10.2016 0
92137561 1 9 12.12.2016 0
96308016 1 785 29.11.2016 1
96495076 1 22 21.10.2016 0
97135620 1 539 18.12.2016 61
97165942 1 450 26.10.2016 0
How do I do that?
This function calculates the difference between the first and last session, and returns only the row for the last session:
get_datediff <- function(x) {
  dates <- as.Date(as.character(x$date), "%d.%m.%y")
  x <- x[order(dates), ]
  if (length(x$date) == 1) {
    x$datediff <- 0
  } else {
    x$datediff <- max(1, diff(range(dates)))
  }
  x[nrow(x), ]
}
This can then be applied to the data for each user, making use of the dplyr and magrittr packages:
group_by(mydat, udid) %>% do(get_datediff(.))
# A tibble: 9 x 5
# Groups: udid [9]
udid count avg_duration date datediff
<int> <int> <int> <fctr> <dbl>
1 74385162 1 39 31.10.16 0
2 79599601 1 568 15.11.16 1
3 91475825 3 6 11.10.16 1
4 91492531 1 79 8.10.16 0
5 92137561 1 9 12.12.16 0
6 96308016 1 785 29.11.16 1
7 96495076 1 22 21.10.16 0
8 97135620 1 539 18.12.16 61
9 97165942 1 450 26.10.16 0
The way you describe how your metrics are calculated is confusing, but following what you wrote as closely as possible, I ended up with the following:
dplyr solution:
timeData %>%
  mutate(dateFormat = as.Date(date, format = "%d.%m.%y")) %>%
  group_by(udid) %>%
  arrange(udid, dateFormat) %>%
  summarise(dateBetween = difftime(last(dateFormat), first(dateFormat), units = "days"),
            mean(avg_duration)) %>%
  left_join((timeData %>%
               mutate(dateFormat = as.Date(date, format = "%d.%m.%y")) %>%
               select(udid, count, dateFormat) %>%
               group_by(udid) %>%
               slice(which.min(dateFormat))))
Result:
# A tibble: 9 x 5
udid dateBetween `mean(avg_duration)` count dateFormat
<int> <time> <dbl> <int> <date>
1 74385162 0 days 39.0 1 2016-10-31
2 79599601 0 days 892.0 1 2016-11-15
3 91475825 0 days 5.5 1 2016-10-11
4 91492531 0 days 79.0 1 2016-10-08
5 92137561 0 days 9.0 1 2016-12-12
6 96308016 1 days 591.6 1 2016-11-29
7 96495076 0 days 22.0 1 2016-10-21
8 97135620 61 days 753.9 1 2016-12-18
9 97165942 0 days 450.0 1 2016-10-26
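For comparison, a compact data.table sketch of the same first-to-last difference (assuming the mydat shown in the question; it keeps only the last date and the difference per udid, mirroring the max(1, ...) rule used above) could be:
library(data.table)

setDT(mydat)[, d := as.Date(as.character(date), "%d.%m.%y")]

mydat[order(d),
      .(date     = last(d),
        datediff = if (.N == 1) 0 else max(1, as.numeric(diff(range(d))))),
      by = udid]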
Here is my dataset:
structure(list(Date = structure(c(14609, 14609, 14609, 14609, 14699, 14699, 14699, 14699, 14790, 14790, 14790, 14790), class = "Date"),
ID = structure(c(5L, 4L, 6L, 10L, 9L, 3L, 10L, 8L, 7L, 1L,
10L, 2L), .Label = c("B00NYQ2", "B03J9L7", "B05DZD1", "B06HC42",
"B09V3X7", "B09YCC8", "X6114659", "X6478816", "X6556701",
"X6812555"), class = "factor"), Name = structure(c(10L, 4L,
9L, 8L, 7L, 3L, 8L, 6L, 2L, 5L, 8L, 1L), .Label = c("AIRA",
"BOUS", "CSCS", "EVF", "GTB", "JER", "MGB", "MPR", "NVB",
"TTNP"), class = "factor"), Score = c(55.075, 54.5, 53.325,
52.175, 70.275, 69.825, 60.15, 60.025, 56.175, 52.65, 52.175,
52.125), Score.rank = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L)), .Names = c("Date", "ID", "Name", "Score", "Score.rank"), row.names = c(1L, 2L, 3L, 4L, 71L, 72L, 73L, 74L, 156L, 157L, 158L, 159L), class = "data.frame")
I'm trying to find which IDs come in and out when we go into a new period.
What I mean by that is: I want to check whether the ID was present in the previous period, denoted by "Date".
If it existed in the previous period (date), it should not return anything.
If it did not exist in the previous period, it should return "IN".
I also want to show that if it does not exist in the next period, it should return an "OUT".
I.e. this period's OUTs should be equal to the next period's INs.
My expected data frame is supposed to look like this:
Date ID Name Score Score.rank THIS PERIOD NEXT PERIOD
31/12/2009 B09V3X7 TTNP 55.075 1 OUT
31/12/2009 B06HC42 EVF 54.5 2 OUT
31/12/2009 B09YCC8 NVB 53.325 3 OUT
31/12/2009 X6812555 MPR 52.175 4
31/3/2010 X6556701 MGB 70.275 1 IN
31/3/2010 B05DZD1 CSCS 69.825 2 IN OUT
31/3/2010 X6812555 MPR 60.15 3
31/3/2010 X6478816 JER 60.025 4 IN OUT
30/6/2010 X6114659 BOUS 56.175 1 IN
30/6/2010 B00NYQ2 GTB 52.65 2 IN
30/6/2010 X6812555 MPR 52.175 3
30/6/2010 B03J9L7 AIRA 52.125 4 IN
Can somebody point me in the right direction as to how to do this?
Thanks in advance
Your description and example don't match, unfortunately.
Considering your description, it seems you want to tag entry and exit conditions for the IDs, which can be achieved as:
dft %>%
  group_by(ID) %>%
  dplyr::mutate(This_period = if_else(Date == min(Date), "IN", NA_character_)) %>%  # NA_character_ keeps the column character
  dplyr::mutate(Next_period = if_else(Date == max(Date), "OUT", NA_character_))
and returns:
#Source: local data frame [12 x 7]
#Groups: ID [10]
#
# Date ID Name Score Score.rank This_period Next_period
# <date> <fctr> <fctr> <dbl> <int> <chr> <chr>
#1 2009-12-31 B09V3X7 TTNP 55.075 1 IN OUT
#2 2009-12-31 B06HC42 EVF 54.500 2 IN OUT
#3 2009-12-31 B09YCC8 NVB 53.325 3 IN OUT
#4 2009-12-31 X6812555 MPR 52.175 4 IN <NA>
#5 2010-03-31 X6556701 MGB 70.275 1 IN OUT
#6 2010-03-31 B05DZD1 CSCS 69.825 2 IN OUT
#7 2010-03-31 X6812555 MPR 60.150 3 <NA> <NA>
#8 2010-03-31 X6478816 JER 60.025 4 IN OUT
#9 2010-06-30 X6114659 BOUS 56.175 1 IN OUT
#10 2010-06-30 B00NYQ2 GTB 52.650 2 IN OUT
#11 2010-06-30 X6812555 MPR 52.175 3 <NA> OUT
#12 2010-06-30 B03J9L7 AIRA 52.125 4 IN OUT
However, your example suggests you want to exclude the min(Date) from the This_period check and the max(Date) from the Next_period check. Is that so? If yes, is Score.rank somehow related to Date?
Please clarify.
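For reference, a sketch that follows the written description literally (an ID gets "IN" when it is absent from the immediately preceding period and "OUT" when it is absent from the immediately following period; periods are taken to be the distinct Date values) could look like this. Because the posted example is internally inconsistent, the result will not match it row for row:
library(dplyr)

periods <- sort(unique(dft$Date))   # the distinct reporting dates, in order

dft %>%
  group_by(ID) %>%
  mutate(
    idx = match(Date, periods),     # which period each row belongs to
    This_period = case_when(
      idx == 1L                                 ~ NA_character_,  # no earlier period exists
      !(periods[pmax(idx - 1L, 1L)] %in% Date)  ~ "IN",           # absent from the previous period
      TRUE                                      ~ NA_character_
    ),
    Next_period = case_when(
      idx == length(periods)                                 ~ NA_character_,  # no later period exists
      !(periods[pmin(idx + 1L, length(periods))] %in% Date)  ~ "OUT",          # absent from the next period
      TRUE                                                   ~ NA_character_
    )
  ) %>%
  ungroup() %>%
  select(-idx)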