R for loops creating a dataTable - r

my data is structured as follows:
price machine timestamp date hour weekday month year trans_id
1: 3.1 179 2017-01-11 15:53:58 2017-01-11 15 Wednesday 1 2017 2017-01-11 15:53:58,179
2: 3.1 179 2017-01-11 15:53:45 2017-01-11 15 Wednesday 1 2017 2017-01-11 15:53:45,179
3: 3.1 179 2017-01-28 00:31:20 2017-01-28 0 Saturday 1 2017 2017-01-28 00:31:20,179
4: 3.1 179 2017-02-04 02:08:42 2017-02-04 2 Saturday 2 2017 2017-02-04 02:08:42,179
5: 3.1 179 2017-03-03 06:34:04 2017-03-03 6 Friday 3 2017 2017-03-03 06:34:04,179
---
1840473: 2.3 2707 2017-04-01 17:06:42 2017-04-01 17 Saturday 4 2017 2017-04-01 17:06:42,2707
1840474: 2.3 2707 2017-04-01 07:55:11 2017-04-01 7 Saturday 4 2017 2017-04-01 07:55:11,2707
1840475: 2.3 2709 2017-02-19 00:28:08 2017-02-19 0 Sunday 2 2017 2017-02-19 00:28:08,2709
1840476: 2.3 2709 2017-03-19 07:34:21 2017-03-19 7 Sunday 3 2017 2017-03-19 07:34:21,2709
1840477: 2.3 2709 2017-03-29 05:56:19 2017-03-29 5 Wednesday 3 2017 2017-03-29 05:56:19,2709
What I am trying to do is calculate the average number of transactions per day for each machine. Then I look at every hour the machine has made a sale. I want to add a column with the difference of transactions in the hour compared to the daily average.
I have managed to get this when I subset my total data per day and per machine setting ex:
# Worked example for ONE machine on ONE date: measure how far each hour's
# transaction count deviates from that day's hourly average, then derive a
# per-transaction "dynamic" price from that deviation.

# Keep only the rows for machine 179 on 2017-01-11.
ex=dt_2017[(machine=='179')&(date=='2017-01-11')]
# Distinct hours that appear in the subset.
total_hours=ex[,unique(hour)]
# Row count of the subset, used as the day's transaction total.
total_day_transaction=nrow(ex)
# Average per hour, dividing by the number of distinct hours present
# (not by 24).
average_hour_transaction=total_day_transaction/length(total_hours)
# Collect, for every hour in total_hours, the relative deviation of that
# hour's row count from the average: (count - average) / average.
change_hour=vector(mode='list')
counterk=1
for (k in total_hours){
hour_transac=nrow(ex[hour==k])
change=(hour_transac-average_hour_transaction)/average_hour_transaction
change_hour[[counterk]]=change
counterk=counterk+1
}
# Pair each hour with its deviation (one row per hour).
# NOTE(review): transpose() is presumably data.table::transpose — confirm.
avg_matrix=cbind(as.data.frame(total_hours),transpose(as.data.frame(change_hour)))
# Attach the per-hour deviation to every transaction row by joining on hour.
ex2=setDT(merge(x=ex,
y=avg_matrix,
by.x='hour',
by.y='total_hours'))
# The deviation is the last column after the merge; give it a proper name.
colnames(ex2)[ncol(ex2)]<-'hour_change'
trans_id=ex2[,trans_id]
# For every transaction id: scale the price up by (1 + hour_change) when the
# deviation is positive, otherwise keep the original price.
dyna_price=vector(mode='list')
counterl=1
for (l in trans_id){
if (ex2[trans_id==l,hour_change]>0){
dyna_price[counterl]=ex2[trans_id==l,price]*(1+ex2[trans_id==l,hour_change])
}else{
dyna_price[counterl]=ex2[trans_id==l,price]
}
counterl=counterl+1
}
# Pair each transaction id with its computed price (one row per id).
dyna_price_matrix=cbind(as.data.frame(trans_id),transpose(as.data.frame(dyna_price)))
# Left-join the prices back onto the full table; rows outside this
# machine/day subset have no match in dyna_price_matrix.
ex3=merge(x=dt_2017,
y=dyna_price_matrix,
by='trans_id',
all.x=TRUE)
# The joined price is the last column; name it.
colnames(ex3)[ncol(ex3)]<-'dynamic_price'
However, I would like to iterate this over every machine and every day. I believe I would need to find a way to name my data table with a variable, but I cannot find anything online.
Any help is appreciated.
Thank you very much

We can use different group by= and assign to variables with :=. .N is a special symbol that contains the number of rows in the group.
library(data.table)
# Convert to a data.table by reference, then add one column per step.
setDT(Data)
# Number of rows for each machine within each hour of each date.
Data[, hour.trans := .N, by = .(machine, date, hour)]
# Rows for each machine on each date, spread evenly over 24 hours.
Data[, daily.avg := .N / 24, by = .(machine, date)]
# Gap between the hourly count and that day's hourly average.
Data[, difference := hour.trans - daily.avg, by = .(machine, date)]
# Display only the columns of interest.
Data[, .(machine, date, hour, daily.avg, difference)]
# machine date hour daily.avg difference
# 1: 179 2017-01-11 15 0.08333333 1.9166667
# 2: 179 2017-01-11 15 0.08333333 1.9166667
# 3: 179 2017-01-28 0 0.04166667 0.9583333
# 4: 179 2017-02-04 2 0.04166667 0.9583333
# 5: 179 2017-03-03 6 0.04166667 0.9583333
# 6: 2707 2017-04-01 17 0.08333333 0.9166667
# 7: 2707 2017-04-01 7 0.08333333 0.9166667
# 8: 2709 2017-02-19 0 0.04166667 0.9583333
# 9: 2709 2017-03-19 7 0.04166667 0.9583333
#10: 2709 2017-03-29 5 0.04166667 0.9583333
Data
structure(list(price = c(3.1, 3.1, 3.1, 3.1, 3.1, 2.3, 2.3, 2.3,
2.3, 2.3), machine = c(179L, 179L, 179L, 179L, 179L, 2707L, 2707L,
2709L, 2709L, 2709L), timestamp = structure(c(2L, 1L, 3L, 4L,
6L, 10L, 9L, 5L, 7L, 8L), .Label = c("2017-01-11 15:53:45", "2017-01-11 15:53:58",
"2017-01-28 00:31:20", "2017-02-04 02:08:42", "2017-02-19 00:28:08",
"2017-03-03 06:34:04", "2017-03-19 07:34:21", "2017-03-29 05:56:19",
"2017-04-01 07:55:11", "2017-04-01 17:06:42"), class = "factor"),
date = structure(c(1L, 1L, 2L, 3L, 5L, 8L, 8L, 4L, 6L, 7L
), .Label = c("2017-01-11", "2017-01-28", "2017-02-04", "2017-02-19",
"2017-03-03", "2017-03-19", "2017-03-29", "2017-04-01"), class = "factor"),
hour = c(15L, 15L, 0L, 2L, 6L, 17L, 7L, 0L, 7L, 5L), weekday = structure(c(4L,
4L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 4L), .Label = c("Friday",
"Saturday", "Sunday", "Wednesday"), class = "factor"), month = c(1L,
1L, 1L, 2L, 3L, 4L, 4L, 2L, 3L, 3L), year = c(2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L),
trans_id = structure(c(2L, 1L, 3L, 4L, 6L, 10L, 9L, 5L, 7L,
8L), .Label = c("2017-01-11 15:53:45,179", "2017-01-11 15:53:58,179",
"2017-01-28 00:31:20,179", "2017-02-04 02:08:42,179", "2017-02-19 00:28:08,2709",
"2017-03-03 06:34:04,179", "2017-03-19 07:34:21,2709", "2017-03-29 05:56:19,2709",
"2017-04-01 07:55:11,2707", "2017-04-01 17:06:42,2707"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))

Related

How do you split a time series into separate events and assign an event ID?

I want to split an irregular time series into separate events and assign each event a unique numerical ID for each site.
Here is an example data frame:
structure(list(site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("AllenBrook", "Eastberk"), class =
"factor"),
timestamp = structure(c(10L, 13L, 8L, 4L, 5L, 6L, 7L, 9L,
11L, 12L, 1L, 2L, 3L), .Label = c("10/1/12 11:29", "10/1/12 14:29",
"10/1/12 17:29", "10/20/12 16:30", "10/20/12 19:30", "10/21/12 1:30",
"10/21/12 4:30", "9/5/12 12:30", "9/5/12 4:14", "9/5/12 6:30",
"9/5/12 7:14", "9/5/12 7:44", "9/5/12 9:30"), class = "factor")), class
= "data.frame", row.names = c(NA,
-13L))
Each event is not the same length or number of timestamps, so I want to split them into separate events if more than 12 hours elapsed between a timestamp and the next timestamp at that site. Each event at the site should receive a unique numerical ID. Here's the outcome I would like:
site timestamp eventid
1 AllenBrook 9/5/12 6:30 1
2 AllenBrook 9/5/12 9:30 1
3 AllenBrook 9/5/12 12:30 1
4 AllenBrook 10/20/12 16:30 2
5 AllenBrook 10/20/12 19:30 2
6 AllenBrook 10/21/12 1:30 2
7 AllenBrook 10/21/12 4:30 2
8 Eastberk 9/5/12 4:14 1
9 Eastberk 9/5/12 7:14 1
10 Eastberk 9/5/12 7:44 1
11 Eastberk 10/1/12 11:29 2
12 Eastberk 10/1/12 14:29 2
13 Eastberk 10/1/12 17:29 2
Any coding solution will do, but bonus points for a tidyverse or data.table solution. Thanks for any help you can provide!
Using data.table, you can perhaps do the following:
library(data.table)
# Parse the timestamps, then number the events within each site: a new event
# starts whenever the gap to the previous timestamp exceeds 12 hours.
# diff() on POSIXct yields a difftime whose units (secs/mins/hours/days) are
# picked automatically from the data, so a bare numeric threshold would mean
# different things for different groups. The gaps are therefore taken on the
# numeric (seconds) representation and compared against 12 * 3600 seconds.
# NOTE: assumes rows are already in chronological order within each site.
setDT(tmp)[, timestamp := as.POSIXct(timestamp, format = "%m/%d/%y %H:%M")][,
  eventid := 1L + cumsum(c(0L, diff(as.numeric(timestamp)) > 12 * 3600)),
  by = .(site)]
The diff() call calculates the time difference between adjacent rows. Then we check whether that gap is greater than 12 hours (720 minutes); note that diff() on date-time values picks its units automatically, so the comparison has to be made in a known unit. A common trick in R is to use cumsum to identify when an event happens in a series and group subsequent elements together with this event until the next event happens again. Since diff returns one element fewer than its input, we use 0L to pad the beginning. 1+ merely starts the indexing from 1 instead of 0.
output:
site timestamp eventid
1: AllenBrook 2012-09-05 06:30:00 1
2: AllenBrook 2012-09-05 09:30:00 1
3: AllenBrook 2012-09-05 12:30:00 1
4: AllenBrook 2012-10-20 16:30:00 2
5: AllenBrook 2012-10-20 19:30:00 2
6: AllenBrook 2012-10-21 01:30:00 2
7: AllenBrook 2012-10-21 04:30:00 2
8: Eastberk 2012-09-05 04:14:00 1
9: Eastberk 2012-09-05 07:14:00 1
10: Eastberk 2012-09-05 07:44:00 1
11: Eastberk 2012-10-01 11:29:00 2
12: Eastberk 2012-10-01 14:29:00 2
13: Eastberk 2012-10-01 17:29:00 2
data:
tmp <- structure(list(site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("AllenBrook", "Eastberk"), class =
"factor"),
timestamp = structure(c(10L, 13L, 8L, 4L, 5L, 6L, 7L, 9L,
11L, 12L, 1L, 2L, 3L), .Label = c("10/1/12 11:29", "10/1/12 14:29",
"10/1/12 17:29", "10/20/12 16:30", "10/20/12 19:30", "10/21/12 1:30",
"10/21/12 4:30", "9/5/12 12:30", "9/5/12 4:14", "9/5/12 6:30",
"9/5/12 7:14", "9/5/12 7:44", "9/5/12 9:30"), class = "factor")), class
= "data.frame", row.names = c(NA,
-13L))

how to avoid For loop to calculate new variable by group

I hope someone here could help me. I am trying to learn R coding for my work.
I am following the growth of plants ( called Ecotype) over the time, they are Treated with Mock or a bacteria Xcc. I have 2 different experiments (done at different time) and after image processing I get the Area.
I would like to compute Normalized_Area = Area(t1)/Area(t0) for each ecotype, for each treatment for each experiment (Manip) which is the Area at a time divide by the Area of this ecotype at the beginning of the experiment(t0). Each plant have a different Area at time 0 and the different experiments have a different starting time. (example of expected results in Normalized_Area column)
Please find below a piece of my df
# A tibble: 24 x 6
Manip Traitment Ecotype Date Area Normalized_Area
<dbl> <chr> <chr> <dttm> <dbl> <dbl>
1 1 mock a1-2 2017-12-12 00:00:00 17699 1
2 1 mock a1-2 2017-12-13 00:00:00 24538 1.39
3 1 mock a1-2 2017-12-14 00:00:00 27958 1.58
4 1 xcc a1-2 2017-12-12 00:00:00 19857 1
5 1 xcc a1-2 2017-12-13 00:00:00 27973 1.41
6 1 xcc a1-2 2017-12-14 00:00:00 35875 1.81
7 2 mock a1-2 2018-03-20 00:00:00 18177 1
8 2 mock a1-2 2018-03-21 00:00:00 20251 1.11
9 2 mock a1-2 2018-03-23 00:00:00 36679 2.02
10 2 xcc a1-2 2018-03-20 00:00:00 17261 1
11 2 xcc a1-2 2018-03-21 00:00:00 18697 1.08
12 2 xcc a1-2 2018-03-23 00:00:00 35345 2.05
13 1 mock a1-10 2017-12-12 00:00:00 22853 1
14 1 mock a1-10 2017-12-13 00:00:00 34641 1.52
15 1 mock a1-10 2017-12-14 00:00:00 40311 1.76
16 1 xcc a1-10 2017-12-12 00:00:00 23754 1
17 1 xcc a1-10 2017-12-13 00:00:00 33247 1.40
18 1 xcc a1-10 2017-12-14 00:00:00 40603 1.71
19 2 mock a1-10 2018-03-20 00:00:00 28201 1
20 2 mock a1-10 2018-03-21 00:00:00 30306 1.07
21 2 mock a1-10 2018-03-23 00:00:00 49086 1.74
22 2 xcc a1-10 2018-03-20 00:00:00 27217 1
23 2 xcc a1-10 2018-03-21 00:00:00 29844 1.10
24 2 xcc a1-10 2018-03-23 00:00:00 46540 1.71
I wrote a piece of code using For loops, but it raises some errors and I would like to turn it into more readable code with dplyr
# Normalise Area by the Area measured on the experiment's start date, looping
# over every experiment (Manip) x ecotype x treatment combination.
# NOTE(review): the sample data shown with this code names the treatment
# column `Traitment`, while the code below reads `Traitement` — confirm the
# real column name, since a mismatch would make these lookups fail.
date_debut=c("2017-12-12", "2018-03-20") # start date of each experiment
data$Normalized_Area = NA
for(manips in levels(as.factor(data$Manip))){ # for each manip
for(ecoty in levels(as.factor(data$Ecotype))){ # for each ecotype
for(traity in levels(as.factor(data$Traitement))){ # for each treatment
for( dd in levels(as.factor(date_debut))){ # for each candidate start date
tmp = subset(data,subset=c(Traitement==traity & Ecotype == ecoty & Manip == manips)) # create a temporary subset for this combination
if(dim(tmp)[1] != 0){
#tmp = ordered(tmp$date[1:length(tmp$date-1)])
# take the Area recorded on the start date (D=0) for this combination
# only proceed when this start date actually occurs in the subset
if(dd %in% as.character(tmp$Date)!=F){
A0 = tmp$Area[as.character(tmp$Date)== dd] # Select A0 in tmp$Area corresponding to dd
Norm_Area = tmp$Area /A0
# write the ratios back into the rows of `data` for this combination
data$Normalized_Area[data$Traitement == traity & data$Ecotype== ecoty & data$Manip == manips] = Norm_Area
}
}
}
}
}
# NOTE(review): six `{` are opened above but only five `}` close them — one
# closing brace appears to be missing.
Here the beginning of my new code, but I get stuck
# Attempted dplyr version: within each treatment / ecotype / experiment group,
# divide every Area by the Area on the row whose Date is in date_debut.
# NOTE(review): `mutate_()` is presumably the standard-evaluation variant that
# expects quoted expressions; an unquoted expression like this one likely
# belongs in `mutate()` — confirm against the dplyr version in use.
gpeData %>%
group_by(Traitement, Ecotype, Manip ) %>%
mutate_( Normalized_Area = Area / Area[which(Date %in% date_debut)] )
Does someone have any idea how to do that? I apologize for the ugly code, but I learned alone.
You were very close to solving the problem yourself. Here is my solution: I used which.min to find the index of the earliest date in each group, then I used this index value in the calculation.
gpeData<-structure(list(Manip = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L),
Traitment = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L
), .Label = c("mock", "xcc"), class = "factor"), Ecotype = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("a1-10", "a1-2"
), class = "factor"), Date = structure(c(1513036800, 1513123200,
1513209600, 1513036800, 1513123200, 1513209600, 1521504000,
1521590400, 1521763200, 1521504000, 1521590400, 1521763200,
1513036800, 1513123200, 1513209600, 1513036800, 1513123200,
1513209600, 1521504000, 1521590400, 1521763200, 1521504000,
1521590400, 1521763200), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Area = c(17699L, 24538L, 27958L, 19857L, 27973L, 35875L,
18177L, 20251L, 36679L, 17261L, 18697L, 35345L, 22853L, 34641L,
40311L, 23754L, 33247L, 40603L, 28201L, 30306L, 49086L, 27217L,
29844L, 46540L), Normalized_Area = c(1, 1.39, 1.58, 1, 1.41,
1.81, 1, 1.11, 2.02, 1, 1.08, 2.05, 1, 1.52, 1.76, 1, 1.4,
1.71, 1, 1.07, 1.74, 1, 1.1, 1.71)), row.names = c(NA, -24L
), class = "data.frame")
library(dplyr)
# Normalise Area within each treatment / ecotype / experiment group by the
# Area measured on that group's earliest Date (which.min returns the index of
# the first minimum). NormArea holds that baseline; Normalized is the ratio.
# The result is ungrouped so later verbs on `ans` do not silently run per
# group.
ans <- gpeData %>%
  group_by(Traitment, Ecotype, Manip) %>%
  mutate(
    NormArea = Area[which.min(Date)],
    Normalized = Area / NormArea
  ) %>%
  ungroup()

r How to check if values exist in a previous period (rolling)

Here is my dataset:
structure(list(Date = structure(c(14609, 14609, 14609, 14609, 14699, 14699, 14699, 14699, 14790, 14790, 14790, 14790), class = "Date"),
ID = structure(c(5L, 4L, 6L, 10L, 9L, 3L, 10L, 8L, 7L, 1L,
10L, 2L), .Label = c("B00NYQ2", "B03J9L7", "B05DZD1", "B06HC42",
"B09V3X7", "B09YCC8", "X6114659", "X6478816", "X6556701",
"X6812555"), class = "factor"), Name = structure(c(10L, 4L,
9L, 8L, 7L, 3L, 8L, 6L, 2L, 5L, 8L, 1L), .Label = c("AIRA",
"BOUS", "CSCS", "EVF", "GTB", "JER", "MGB", "MPR", "NVB",
"TTNP"), class = "factor"), Score = c(55.075, 54.5, 53.325,
52.175, 70.275, 69.825, 60.15, 60.025, 56.175, 52.65, 52.175,
52.125), Score.rank = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L)), .Names = c("Date", "ID", "Name", "Score", "Score.rank"), row.names = c(1L, 2L, 3L, 4L, 71L, 72L, 73L, 74L, 156L, 157L, 158L, 159L), class = "data.frame")
I'm trying to find which IDs come in and out when we go into a new period.
What i mean by that is..i want to compare if the ID was present in the previous period, denoted by "Date".
If it existed in the previous period (date), It should not return anything.
If it did not exist in the previous period, it should return "IN".
I also want to show that if it does not exist in the next period, it should return an "OUT".
i.e. this period's OUTs should be equal to the next period's INs
my expected dataframe is supposed to look like this
Date ID Name Score Score.rank THIS PERIOD NEXT PERIOD
31/12/2009 B09V3X7 TTNP 55.075 1 OUT
31/12/2009 B06HC42 EVF 54.5 2 OUT
31/12/2009 B09YCC8 NVB 53.325 3 OUT
31/12/2009 X6812555 MPR 52.175 4
31/3/2010 X6556701 MGB 70.275 1 IN
31/3/2010 B05DZD1 CSCS 69.825 2 IN OUT
31/3/2010 X6812555 MPR 60.15 3
31/3/2010 X6478816 JER 60.025 4 IN OUT
30/6/2010 X6114659 BOUS 56.175 1 IN
30/6/2010 B00NYQ2 GTB 52.65 2 IN
30/6/2010 X6812555 MPR 52.175 3
30/6/2010 B03J9L7 AIRA 52.125 4 IN
Can somebody point me in the right direction as to how to do this?
Thanks in advance
Unfortunately, your description and example don't match.
Considering your description, it seems you want to tag entry and exit conditions for the IDs.
Which can be achieved as:
# Per ID, tag the row with the earliest Date as "IN" and the row with the
# latest Date as "OUT"; every other row gets NA.
# A typed NA is used for the non-matching branch: if_else() needs `true` and
# `false` to share a type, and current dplyr releases reject NULL there.
dft %>%
  group_by(ID) %>%
  dplyr::mutate(This_period = if_else(Date == min(Date), "IN", NA_character_)) %>%
  dplyr::mutate(Next_period = if_else(Date == max(Date), "OUT", NA_character_))
and returns:
#Source: local data frame [12 x 7]
#Groups: ID [10]
#
# Date ID Name Score Score.rank This_period Next_period
# <date> <fctr> <fctr> <dbl> <int> <chr> <chr>
#1 2009-12-31 B09V3X7 TTNP 55.075 1 IN OUT
#2 2009-12-31 B06HC42 EVF 54.500 2 IN OUT
#3 2009-12-31 B09YCC8 NVB 53.325 3 IN OUT
#4 2009-12-31 X6812555 MPR 52.175 4 IN <NA>
#5 2010-03-31 X6556701 MGB 70.275 1 IN OUT
#6 2010-03-31 B05DZD1 CSCS 69.825 2 IN OUT
#7 2010-03-31 X6812555 MPR 60.150 3 <NA> <NA>
#8 2010-03-31 X6478816 JER 60.025 4 IN OUT
#9 2010-06-30 X6114659 BOUS 56.175 1 IN OUT
#10 2010-06-30 B00NYQ2 GTB 52.650 2 IN OUT
#11 2010-06-30 X6812555 MPR 52.175 3 <NA> OUT
#12 2010-06-30 B03J9L7 AIRA 52.125 4 IN OUT
However, your example suggests you want to exclude the min(Date) from this_period check and the max(Date) from the Next_period check. Is it so? if yes, is score.rank somehow related to Date?
please clarify.

using r to check the shift in manufacturing plant based on the timestamp

By using the timestamp of produced unit, i want to check in which shift it was produced. Basically the production is carried out in two shifts per day. shifts timings are 06:00 to 18:00 and 18:00 to 06:00. shifts data frame below shows the planning of shifts of december month.
let me make it more clear
2015-12-01 A shift(2015-12-01 06:00:00 to 2015-12-01 17:59:59)
2015-12-01 D shift(2015-12-01 18:00:00 to 2015-12-02 05:59:59)
2015-12-02 A shift(2015-12-02 06:00:00 to 2015-12-02 17:59:59)
2015-12-02 D shift(2015-12-02 18:00:00 to 2015-12-03 05:59:59)
and so on..
head(shifts)
date day_shift night_shift
1 2015-12-01 A D
2 2015-12-02 A D
3 2015-12-03 B A
4 2015-12-04 B A
5 2015-12-05 C B
6 2015-12-06 C B
shifts <- structure(list(date = structure(1:31, .Label = c("2015-12-01",
"2015-12-02", "2015-12-03", "2015-12-04", "2015-12-05", "2015-12-06",
"2015-12-07", "2015-12-08", "2015-12-09", "2015-12-10", "2015-12-11",
"2015-12-12", "2015-12-13", "2015-12-14", "2015-12-15", "2015-12-16",
"2015-12-17", "2015-12-18", "2015-12-19", "2015-12-20", "2015-12-21",
"2015-12-22", "2015-12-23", "2015-12-24", "2015-12-25", "2015-12-26",
"2015-12-27", "2015-12-28", "2015-12-29", "2015-12-30", "2015-12-31"
), class = "factor"), day_shift = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 1L, 1L, 2L, 2L, 3L, 3L, 4L), .Label = c("A",
"B", "C", "D"), class = "factor"), night_shift = structure(c(4L,
4L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 1L, 1L, 2L, 2L, 3L), .Label = c("A",
"B", "C", "D"), class = "factor")), .Names = c("date", "day_shift",
"night_shift"), class = "data.frame", row.names = c(NA, -31L))
In the check data frame, i have the timestamp of each unit produced. by using these timestamp i want to check in which shift the unit is produced.
head(check)
eventtime
1 2015-12-01 06:10:08
2 2015-12-01 10:10:24
3 2015-12-01 19:01:15
4 2015-12-02 01:54:54
5 2015-12-02 06:24:14
6 2015-12-02 08:15:47
check <- structure(list(eventtime = structure(c(1448946608, 1448961024,
1448992875, 1449017694, 1449033854, 1449040547, 1449076903, 1449085710,
1449100168, 1449119720), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = "eventtime", row.names = c(NA,
-10L), class = "data.frame")
Desired Result:
ds
eventtime shift
1 2015-12-01 06:10:08 A
2 2015-12-01 10:10:24 A
3 2015-12-01 19:01:15 D
4 2015-12-02 01:54:54 D
5 2015-12-02 06:24:14 A
6 2015-12-02 08:15:47 A
7 2015-12-02 18:21:43 D
8 2015-12-02 20:48:30 D
9 2015-12-03 00:49:28 D
10 2015-12-03 06:15:20 B
To keep it simple, i showed only shifts plan of december month. In reality i need to check the complete year.
Here's an answer using lubridate and its %within% operator to check if a date-time falls within an interval. Depending upon whether your raw data is actually stored as factors or not, you can simplify the code by removing some of the conversions.
library(lubridate)
# Build one interval per planned shift. The day shift covers
# 06:00:00-17:59:59 on the calendar date; the night shift covers 18:00:00 on
# that date through 05:59:59 on the following day.
day_shift_start <- as.POSIXct(shifts$date) + hms("06:00:00")
day_shift_end <- as.POSIXct(shifts$date) + hms("17:59:59")
night_shift_start <- as.POSIXct(shifts$date) + hms("18:00:00")
night_shift_end <- as.POSIXct(shifts$date) + days(1) + hms("05:59:59")
# Stack day intervals followed by night intervals, with the shift label that
# belongs to each one in the same order.
shift_intervals <- data.frame(
  intervals = c(
    interval(day_shift_start, day_shift_end),
    interval(night_shift_start, night_shift_end)
  ),
  shift = c(
    as.character(shifts$day_shift),
    as.character(shifts$night_shift)
  )
)
# Look up the shift for each event. vapply() guarantees exactly one label per
# event: a timestamp that falls in no planned interval yields NA instead of
# being dropped (which would make the assignment fail on a length mismatch),
# and only the first matching interval is used.
check$shift <- vapply(check$eventtime, function(event_time) {
  hit <- which(event_time %within% shift_intervals$intervals)
  if (length(hit) == 0L) {
    return(NA_character_)
  }
  as.character(shift_intervals$shift[hit[1L]])
}, character(1))
check
# eventtime shift
# 1 2015-12-01 06:10:08 A
# 2 2015-12-01 10:10:24 A
# 3 2015-12-01 19:01:15 D
# 4 2015-12-02 01:54:54 D
# 5 2015-12-02 06:24:14 A
# 6 2015-12-02 08:15:47 A
# 7 2015-12-02 18:21:43 D
# 8 2015-12-02 20:48:30 D
# 9 2015-12-03 00:49:28 D
# 10 2015-12-03 06:15:20 B

How to create a datetime object from separate date fields?

I have a dataset like this:
Year MM DD HH
158 2010 7 1 5
159 2010 7 1 5
160 2010 7 1 6
161 2010 7 1 6
structure(list(Year = c(2010L, 2010L, 2010L, 2010L), MM = c(7L,
7L, 7L, 7L), DD = c(1L, 1L, 1L, 1L), HH = c(5L, 5L, 6L, 6L)), .Names = c("Year",
"MM", "DD", "HH"), row.names = 158:161, class = "data.frame")
How can I create a single datetime object from this data set (as a new column in this data)?
There are a few options, here's one (where x is your data.frame):
# Assemble a date-time column from the year / month / day / hour columns;
# minutes and seconds are fixed at zero.
x[["datetime"]] <- with(x, ISOdatetime(Year, MM, DD, HH, min = 0, sec = 0))
You can pass in the correct time zone if need be, see ?ISOdatetime.
You can now do this in lubridate using make_date or make_datetime:
From the cran doc:
make_datetime(year = 1970L, month = 1L, day = 1L, hour = 0L, min = 0L,
sec = 0, tz = "UTC")
make_date(year = 1970L, month = 1L, day = 1L)
Assuming you have your data in a data frame x:
# Build a "Y-M-D H:00" string for every row and let as.POSIXct() parse it
# into a new `datetime` column.
transform(
  x,
  datetime = as.POSIXct(paste0(Year, "-", MM, "-", DD, " ", HH, ":00"))
)
Year MM DD HH datetime
158 2010 7 1 5 2010-07-01 05:00:00
159 2010 7 1 5 2010-07-01 05:00:00
160 2010 7 1 6 2010-07-01 06:00:00
161 2010 7 1 6 2010-07-01 06:00:00

Resources