I have longitudinal data where respondents were recruited as a cohort. Right now, I have the year in which they took the survey, but I want to create a new column that simply counts whether it is the first, second, or third time a person took the survey.
Original Table
PersonID SurveyYear SurveyQ1Rating SurveyQ2Rating Gender
      12       2013              5              4      f
      12       2012              4              4      f
      12       2010              3              3      f
       2       2007              4              4      m
       2       2008              3              3      m
       2       2009              3              5      m
       2       2010              5              5      m
       2       2013              2              2      m
       5       2013              4              4      f
       5       2014              5              5      f
Target Table (where I created a new column SurveyTime to mark the ith time a person took the survey)
PersonID SurveyYear SurveyTime SurveyQ1Rating SurveyQ2Rating Gender
      12       2013          3              5              4      f
      12       2012          2              4              4      f
      12       2010          1              3              3      f
       2       2007          1              4              4      m
       2       2008          2              3              3      m
       2       2009          3              3              5      m
       2       2010          4              5              5      m
       2       2013          5              2              2      m
       5       2013          1              4              4      f
       5       2014          2              5              5      f
A base solution:
df |>
  transform(SurveyTime = ave(SurveyYear, PersonID, FUN = rank))
Its dplyr equivalent:
library(dplyr)
df %>%
  group_by(PersonID) %>%
  mutate(SurveyTime = dense_rank(SurveyYear)) %>%
  ungroup()
Data
df <- structure(list(PersonID = c(12L, 12L, 12L, 2L, 2L, 2L, 2L, 2L,
5L, 5L), SurveyYear = c(2013L, 2012L, 2010L, 2007L, 2008L, 2009L,
2010L, 2013L, 2013L, 2014L), SurveyQ1Rating = c(5L, 4L, 3L, 4L,
3L, 3L, 5L, 2L, 4L, 5L), SurveyQ2Rating = c(4L, 4L, 3L, 4L, 3L,
5L, 5L, 2L, 4L, 5L), Gender = c("f", "f", "f", "m", "m", "m",
"m", "m", "f", "f")), class = "data.frame", row.names = c(NA, -10L))
Using data.table
library(data.table)
setDT(df)[, SurveyTime := frank(SurveyYear), by = PersonID]
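(A side note, not from the original answers: rank, dense_rank and frank agree here only because each person has at most one survey per year. If the same SurveyYear could repeat for a person, a dense ranking keeps SurveyTime integer-valued, e.g. with data.table:)
library(data.table)
## "dense" makes repeated years share one SurveyTime instead of producing
## fractional ranks (frank's default ties.method is "average")
setDT(df)[, SurveyTime := frank(SurveyYear, ties.method = "dense"), by = PersonID]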
Hi all, this should be a straightforward question, but I just can't seem to figure it out. I would like to break up this data set biweekly in order to look at the annual cycle in two-week intervals. I do not want to summarize or aggregate the data; I would like to do exactly what the week() function does, but every two weeks instead. Below is an example of the data and code. Any help would be greatly appreciated!
DF <- dput(head(indiv))
structure(list(event.id = 1142811808:1142811813, timestamp = structure(c(1323154800,
1323200450, 1323202141, 1323203545, 1323208151, 1323209966), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), argos.altitude = c(43, 43, 39, 43,
44, 42), argos.best.level = c(0, -136, -128, -136, -126, -137
), argos.calcul.freq = c(0, 676813.1, 676802.4, 676813.1, 676810,
676811.8), argos.lat1 = c(43.857, 43.916, 43.87, 43.89, 43.891,
43.89), argos.lat2 = c(43.857, 35.141, 49.688, 35.254, 40.546,
54.928), argos.lc = structure(c(7L, 6L, 2L, 3L, 4L, 3L), .Label = c("0",
"1", "2", "3", "A", "B", "G", "Z"), class = "factor"), argos.lon1 = c(-77.244,
-77.326, -77.223, -77.21, -77.208, -77.21), argos.lon2 = c(-77.244,
-121.452, -46.86, -118.496, -94.12, -16.159), argos.nb.mes.identical = c(0L,
2L, 6L, 4L, 5L, 6L), argos.nopc = c(0L, 1L, 2L, 3L, 4L, 4L),
argos.sensor.1 = c(0L, 149L, 194L, 1L, 193L, 193L), argos.sensor.2 = c(0L,
220L, 216L, 1L, 216L, 212L), argos.sensor.3 = c(0L, 1L, 1L,
0L, 3L, 1L), argos.sensor.4 = c(0L, 1L, 5L, 1L, 5L, 5L),
tag.local.identifier = c(112571L, 112571L, 112571L, 112571L,
112571L, 112571L), utm.easting = c(319655.836066914, 313250.096346666,
321382.422921619, 322486.41178559, 322650.029658403, 322486.41178559
), utm.northing = c(4858437.89950188, 4865173.18448801, 4859836.18321128,
4862029.54057323, 4862136.31345349, 4862029.54057323), utm.zone = structure(c(7L,
7L, 7L, 7L, 7L, 7L), .Label = c("12N", "13N", "14N", "15N",
"16N", "17N", "18N", "19N", "20N", "22N", "39N"), class = "factor"),
study.timezone = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Eastern Daylight Time",
"Eastern Standard Time"), class = "factor"), study.local.timestamp = structure(c(1323154800,
1323200450, 1323202141, 1323203545, 1323208151, 1323209966
), class = c("POSIXct", "POSIXt"), tzone = "")), row.names = 1120:1125, class = "data.frame")
weeknumber <- week(timestamps(DF))
I don't use lubridate, but here's a base R solution to subset your data fortnightly. We keep rows whose week number (as numeric) modulo 2 is nonzero and whose year-week combination is not yet duplicated, all using strftime.
res <- DF[as.numeric(strftime(DF$timestamp, "%U")) %% 2 != 0 &
            !duplicated(strftime(DF$timestamp, "%U %y")), ]
res
# timestamp x
# 1 2011-12-06 01:00:00 0.73178884
# 13 2011-12-18 01:00:00 -0.19310018
# 27 2012-01-01 01:00:00 1.13017531
# 41 2012-01-15 01:00:00 1.06546084
# 55 2012-01-29 01:00:00 -0.16664011
# 69 2012-02-12 01:00:00 -1.86596108
# 83 2012-02-26 01:00:00 0.59200189
# 97 2012-03-11 01:00:00 1.08327366
# 111 2012-03-25 01:00:00 -0.71291090
# 125 2012-04-08 02:00:00 0.51984052
# 139 2012-04-22 02:00:00 0.32738506
# 153 2012-05-06 02:00:00 2.50837829
# 167 2012-05-20 02:00:00 0.75116168
# 181 2012-06-03 02:00:00 -0.56359736
# 195 2012-06-17 02:00:00 0.60658448
# 209 2012-07-01 02:00:00 -0.07242813
# 223 2012-07-15 02:00:00 0.13811301
# 237 2012-07-29 02:00:00 0.19454153
# 251 2012-08-12 02:00:00 0.23119092
# 265 2012-08-26 02:00:00 -0.97278351
# 279 2012-09-09 02:00:00 -1.18143276
# 293 2012-09-23 02:00:00 -0.43294048
# 307 2012-10-07 02:00:00 0.05664472
# 321 2012-10-21 02:00:00 -0.90725782
# 335 2012-11-04 01:00:00 0.78939068
# 349 2012-11-18 01:00:00 -0.46047924
# 363 2012-12-02 01:00:00 1.45941339
Check by differencing.
## check
diff(res$timestamp)
# Time differences in days
# [1] 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14
# [21] 14 14 14 14 14
Data:
DF <- data.frame(timestamp = as.POSIXct(seq(as.Date("2011-12-06"), as.Date("2012-12-06"), "day")),
                 x = rnorm(367))
As I had said in my comment to your previous (since deleted) question, use seq.Date and either cut or findInterval.
I'll create a vector of "every other Monday", starting on the first Monday of 2011 (January 3rd). This is arbitrary, but you will want to ensure that you choose (1) a day that is meaningful to you, (2) a start point that is before your earliest data, and (3) a length.out= that extends beyond your latest data.
every_other_monday <- seq(as.Date("2011-01-03"), by = "14 days", length.out = 26)
every_other_monday
# [1] "2011-01-03" "2011-01-17" "2011-01-31" "2011-02-14" "2011-02-28" "2011-03-14" "2011-03-28" "2011-04-11" "2011-04-25"
# [10] "2011-05-09" "2011-05-23" "2011-06-06" "2011-06-20" "2011-07-04" "2011-07-18" "2011-08-01" "2011-08-15" "2011-08-29"
# [19] "2011-09-12" "2011-09-26" "2011-10-10" "2011-10-24" "2011-11-07" "2011-11-21" "2011-12-05" "2011-12-19"
every_other_monday[ findInterval(as.Date(DF$timestamp), every_other_monday) ]
# [1] "2011-12-05" "2011-12-05" "2011-12-05" "2011-12-05" "2011-12-05" "2011-12-05"
(The choice to start on Jan 3 was conditioned on the assumption that your real data spans a much larger length of time. You don't need a full year's worth of biweeks in every_other_monday, nor does it need to be a Monday, it can be whatever base-date you choose. So long as it includes at least one date before and after the actual DF dates, you should be covered.)
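For completeness, here is a minimal sketch of the cut() route mentioned above (my addition, not part of the original answer). It assumes every_other_monday spans the full range of DF$timestamp; extend length.out= if it does not, since dates falling outside the breaks come back as NA.
## Label each row with the Monday that opens its biweek; cut() uses the
## left-hand break as the factor label
DF$biweek <- cut(as.Date(DF$timestamp), breaks = every_other_monday)
table(droplevels(DF$biweek))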
Alternative: round down to the week level, then shift the weeks whose julian day is odd back by 7 days, so that pairs of adjacent weeks collapse onto a single biweek start date. (The reason I chose julian-day parity is to reduce the chance that the grouping shifts with slight changes in the data range.)
weeks <- lubridate::floor_date(as.Date(DF$timestamp), unit = "weeks")
weeks
# [1] "2011-12-04" "2011-12-04" "2011-12-04" "2011-12-04" "2011-12-04" "2011-12-04"
isodd <- as.POSIXlt(weeks)$yday %% 2 == 1
weeks[isodd] <- weeks[isodd] - 7L
weeks # technically, now "biweeks"
# [1] "2011-11-27" "2011-11-27" "2011-11-27" "2011-11-27" "2011-11-27" "2011-11-27"
See the example below. This function uses which.max and sapply to round the date variable to the nearest Sunday within two-week intervals.
library(lubridate)
## Create Data Frame
DF <- data.frame(timestamp=as.POSIXct(seq(as.Date("2011-12-06"), as.Date("2012-12-06"), "day")))
## Create two week intervals (change the start date if you don't want to start on Sundays)
every_other_sunday <- seq(as.Date("2011-12-18"), by = "14 days", length.out = 27)
## Make the date variable
DF$date <- as.Date(DF$timestamp)
## Function to find the closest Sunday from the intervals created above
find_closest_sunday <- function(index){
  which.max(abs(every_other_sunday - DF$date[index] - 7) <= min(abs(every_other_sunday - DF$date[index] - 7)))
}
## Add the new variable to your dataset
DF$every_two_weeks <- every_other_sunday[sapply(seq_along(DF$date), function(i) find_closest_sunday(i))]
## Check that the function worked correctly
DF[,c("date", "every_two_weeks")]
## If you want the week number instead of a date, wrap the every_two_weeks variable in the week() function
week(DF$every_two_weeks)
My data is structured as follows:
price machine timestamp date hour weekday month year trans_id
1: 3.1 179 2017-01-11 15:53:58 2017-01-11 15 Wednesday 1 2017 2017-01-11 15:53:58,179
2: 3.1 179 2017-01-11 15:53:45 2017-01-11 15 Wednesday 1 2017 2017-01-11 15:53:45,179
3: 3.1 179 2017-01-28 00:31:20 2017-01-28 0 Saturday 1 2017 2017-01-28 00:31:20,179
4: 3.1 179 2017-02-04 02:08:42 2017-02-04 2 Saturday 2 2017 2017-02-04 02:08:42,179
5: 3.1 179 2017-03-03 06:34:04 2017-03-03 6 Friday 3 2017 2017-03-03 06:34:04,179
---
1840473: 2.3 2707 2017-04-01 17:06:42 2017-04-01 17 Saturday 4 2017 2017-04-01 17:06:42,2707
1840474: 2.3 2707 2017-04-01 07:55:11 2017-04-01 7 Saturday 4 2017 2017-04-01 07:55:11,2707
1840475: 2.3 2709 2017-02-19 00:28:08 2017-02-19 0 Sunday 2 2017 2017-02-19 00:28:08,2709
1840476: 2.3 2709 2017-03-19 07:34:21 2017-03-19 7 Sunday 3 2017 2017-03-19 07:34:21,2709
1840477: 2.3 2709 2017-03-29 05:56:19 2017-03-29 5 Wednesday 3 2017 2017-03-29 05:56:19,2709
What I am trying to do is calculate the average number of transactions per day for each machine. Then, for every hour in which the machine made a sale, I want to add a column with the difference between the number of transactions in that hour and the daily average.
I have managed to get this when I subset my total data to a single machine and a single day, stored in ex:
library(data.table)  # for setDT() and transpose()

ex = dt_2017[(machine == '179') & (date == '2017-01-11')]
total_hours = ex[, unique(hour)]
total_day_transaction = nrow(ex)
average_hour_transaction = total_day_transaction / length(total_hours)

## relative change of each hour's transaction count vs. the daily average
change_hour = vector(mode = 'list')
counterk = 1
for (k in total_hours) {
  hour_transac = nrow(ex[hour == k])
  change = (hour_transac - average_hour_transaction) / average_hour_transaction
  change_hour[[counterk]] = change
  counterk = counterk + 1
}
avg_matrix = cbind(as.data.frame(total_hours), transpose(as.data.frame(change_hour)))
ex2 = setDT(merge(x = ex,
                  y = avg_matrix,
                  by.x = 'hour',
                  by.y = 'total_hours'))
colnames(ex2)[ncol(ex2)] <- 'hour_change'

## raise the price in hours that are busier than average
trans_id = ex2[, trans_id]
dyna_price = vector(mode = 'list')
counterl = 1
for (l in trans_id) {
  if (ex2[trans_id == l, hour_change] > 0) {
    dyna_price[counterl] = ex2[trans_id == l, price] * (1 + ex2[trans_id == l, hour_change])
  } else {
    dyna_price[counterl] = ex2[trans_id == l, price]
  }
  counterl = counterl + 1
}
dyna_price_matrix = cbind(as.data.frame(trans_id), transpose(as.data.frame(dyna_price)))
ex3 = merge(x = dt_2017,
            y = dyna_price_matrix,
            by = 'trans_id',
            all.x = TRUE)
colnames(ex3)[ncol(ex3)] <- 'dynamic_price'
However, I would like to iterate this over every machine and every day. I believe what I would need is a way to name my data table with a variable, but I cannot find anything online.
Any help is appreciated.
Thank you very much.
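One way to avoid naming a separate table for every machine/day combination, sketched here with compute_dynamic_price as a hypothetical helper that wraps the per-subset logic above and assuming dt_2017 is the full data.table, is to split the data by machine and date, apply the helper to each piece, and bind the results back together:
library(data.table)

## Hypothetical helper: takes one machine/day subset and returns it with
## hour_change and dynamic_price columns, mirroring the loops above
compute_dynamic_price <- function(sub) {
  daily_avg <- nrow(sub) / length(unique(sub$hour))
  sub[, hour_change := (.N - daily_avg) / daily_avg, by = hour]
  sub[, dynamic_price := ifelse(hour_change > 0, price * (1 + hour_change), price)]
  sub
}

## Apply it to every observed machine/date combination and recombine;
## no per-subset object ever needs its own name
result <- rbindlist(lapply(split(dt_2017, by = c("machine", "date"), drop = TRUE),
                           compute_dynamic_price))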
We can group by different column sets with by= and assign new variables with :=. .N is a special symbol that contains the number of rows in the current group.
library(data.table)
setDT(Data)[, hour.trans := .N, by = c("machine", "date", "hour")][
  , daily.avg := .N / 24, by = c("machine", "date")][
  , difference := hour.trans - daily.avg, by = c("machine", "date")][
  , .(machine, date, hour, daily.avg, difference)]
# machine date hour daily.avg difference
# 1: 179 2017-01-11 15 0.08333333 1.9166667
# 2: 179 2017-01-11 15 0.08333333 1.9166667
# 3: 179 2017-01-28 0 0.04166667 0.9583333
# 4: 179 2017-02-04 2 0.04166667 0.9583333
# 5: 179 2017-03-03 6 0.04166667 0.9583333
# 6: 2707 2017-04-01 17 0.08333333 0.9166667
# 7: 2707 2017-04-01 7 0.08333333 0.9166667
# 8: 2709 2017-02-19 0 0.04166667 0.9583333
# 9: 2709 2017-03-19 7 0.04166667 0.9583333
#10: 2709 2017-03-29 5 0.04166667 0.9583333
Data
Data <- structure(list(price = c(3.1, 3.1, 3.1, 3.1, 3.1, 2.3, 2.3, 2.3,
2.3, 2.3), machine = c(179L, 179L, 179L, 179L, 179L, 2707L, 2707L,
2709L, 2709L, 2709L), timestamp = structure(c(2L, 1L, 3L, 4L,
6L, 10L, 9L, 5L, 7L, 8L), .Label = c("2017-01-11 15:53:45", "2017-01-11 15:53:58",
"2017-01-28 00:31:20", "2017-02-04 02:08:42", "2017-02-19 00:28:08",
"2017-03-03 06:34:04", "2017-03-19 07:34:21", "2017-03-29 05:56:19",
"2017-04-01 07:55:11", "2017-04-01 17:06:42"), class = "factor"),
date = structure(c(1L, 1L, 2L, 3L, 5L, 8L, 8L, 4L, 6L, 7L
), .Label = c("2017-01-11", "2017-01-28", "2017-02-04", "2017-02-19",
"2017-03-03", "2017-03-19", "2017-03-29", "2017-04-01"), class = "factor"),
hour = c(15L, 15L, 0L, 2L, 6L, 17L, 7L, 0L, 7L, 5L), weekday = structure(c(4L,
4L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 4L), .Label = c("Friday",
"Saturday", "Sunday", "Wednesday"), class = "factor"), month = c(1L,
1L, 1L, 2L, 3L, 4L, 4L, 2L, 3L, 3L), year = c(2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L),
trans_id = structure(c(2L, 1L, 3L, 4L, 6L, 10L, 9L, 5L, 7L,
8L), .Label = c("2017-01-11 15:53:45,179", "2017-01-11 15:53:58,179",
"2017-01-28 00:31:20,179", "2017-02-04 02:08:42,179", "2017-02-19 00:28:08,2709",
"2017-03-03 06:34:04,179", "2017-03-19 07:34:21,2709", "2017-03-29 05:56:19,2709",
"2017-04-01 07:55:11,2707", "2017-04-01 17:06:42,2707"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
I am new to dplyr and trying to calculate the average sales going 3 weeks back from each week of a product. If we don't have all 3 weeks of historical data, we will consider however many data points are present.
Edit:
I am sorry I missed this earlier, but there is an additional factor we need to consider: we should only take the lag over those weeks where there was no Promo.
Here is some sample data:
df = structure(list(Product = structure(c(1L, 2L, 1L, 1L, 1L, 2L,
2L, 1L), .Label = c("A", "B"), class = "factor"), Promo = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
Week = structure(c(1L, 1L, 2L, 3L, 4L, 2L, 3L, 5L), .Label = c("2017-01-01",
"2017-01-02", "2017-01-03", "2017-01-04", "2017-01-05"), class = "factor"),
Sales = c(50, 50, 60, 70, 50, 50, 80, 70)), .Names = c("Product",
"Promo", "Week", "Sales"), row.names = c(NA, -8L), class = "data.frame")
Product Promo Week Sales
1 A 0 2017-01-01 50
2 B 0 2017-01-01 50
3 A 1 2017-01-02 60
4 A 0 2017-01-03 70
5 A 0 2017-01-04 50
6 B 1 2017-01-02 50
7 B 0 2017-01-03 80
8 A 0 2017-01-05 70
I want to go back 3 non-promo (Promo = 0) weeks for each product, so the output would be something like this:
  Product Promo       Week Sales Avg_Non_Promo_Sales
1       A     0 2017-01-01    50                  50
2       B     0 2017-01-01    50                  50
3       A     1 2017-01-02    60                  50   # only 1 non-promo week before
4       A     0 2017-01-03    70                  60   # (70 + 50) / 2
5       A     0 2017-01-04    50                56.6   # (50 + 70 + 50) / 3, non-promo
6       B     1 2017-01-02    50                  50
7       B     0 2017-01-03    80                  65   # (50 + 80) / 2
8       A     0 2017-01-05    70                  60   # 240 / 4
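Based on the sample output above, the expected column is a running mean of all non-promo weeks seen so far for each product (a promo week contributes nothing to the mean but still receives it). Under that reading, which is my interpretation rather than something stated in the question, a minimal dplyr sketch is:
library(dplyr)

df %>%
  group_by(Product) %>%
  arrange(Week, .by_group = TRUE) %>%   # Week levels are already in date order
  mutate(non_promo = Promo == "0",
         ## running sum of non-promo Sales divided by running count of non-promo weeks
         Avg_Non_Promo_Sales = cumsum(ifelse(non_promo, Sales, 0)) / cumsum(non_promo)) %>%
  ungroup()
If a strict three-week look-back is required instead, the same grouped, week-ordered frame could feed a rolling function (e.g. zoo::rollapplyr over the non-promo rows), but the sample output above does not appear to truncate at three weeks.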
I have a data set ProductTable. I want to return, for each ProductsFamily, the date it was ordered for the first time and the very last time. Example:
ProductTable
OrderPostingYear OrderPostingMonth OrderPostingDate ProductsFamily Sales QTY
2008 1 20 R1 5234 1
2008 1 12 R2 223 2
2009 1 30 R3 34 1
2008 2 1 R1 1634 3
2010 4 23 R3 224 1
2009 3 20 R1 5234 1
2010 7 12 R2 223 2
Result as follows:
OrderTime
ProductsFamily OrderStart OrderEnd SumSales
R1 2008/1/20 2009/3/20 12102
R2 2008/1/12 2010/7/12 446
R3 2009/1/30 2010/4/23 258
I have no idea how to do it. Any suggestions?
ProductTable <- structure(list(OrderPostingYear = c(2008L, 2008L, 2009L, 2008L,
2010L, 2009L, 2010L), OrderPostingMonth = c(1L, 1L, 1L, 2L, 4L,
3L, 7L), OrderPostingDate = c(20L, 12L, 30L, 1L, 23L, 20L, 12L
), ProductsFamily = structure(c(1L, 2L, 3L, 1L, 3L, 1L, 2L), .Label = c("R1",
"R2", "R3"), class = "factor"), Sales = c(5234L, 223L, 34L, 1634L,
224L, 5234L, 223L), QTY = c(1L, 2L, 1L, 3L, 1L, 1L, 2L)), .Names = c("OrderPostingYear",
"OrderPostingMonth", "OrderPostingDate", "ProductsFamily", "Sales",
"QTY"), class = "data.frame", row.names = c(NA, -7L))
We can also use dplyr/tidyr to do this. We arrange the rows, concatenate the 'OrderPostingYear' through 'OrderPostingDate' columns into a single 'Date' column with unite, group by 'ProductsFamily', and take the first and last 'Date' and the sum of 'Sales' within summarise.
library(dplyr)
library(tidyr)
ProductTable %>%
  arrange(ProductsFamily, OrderPostingYear, OrderPostingMonth, OrderPostingDate) %>%
  unite(Date, OrderPostingYear:OrderPostingDate, sep = '/') %>%
  group_by(ProductsFamily) %>%
  summarise(OrderStart = first(Date), OrderEnd = last(Date), SumSales = sum(Sales))
# Source: local data frame [3 x 4]
# ProductsFamily OrderStart OrderEnd SumSales
# (fctr) (chr) (chr) (int)
# 1 R1 2008/1/20 2009/3/20 12102
# 2 R2 2008/1/12 2010/7/12 446
# 3 R3 2009/1/30 2010/4/23 258
You can first set up the date in a new column, and then aggregate your data using the data.table package (taking the first and last date by ProductsFamily, as well as the sum of sales):
library(data.table)
# First build up the date
ProductTable$date = with(ProductTable,
                         as.Date(paste(OrderPostingYear,
                                       OrderPostingMonth,
                                       OrderPostingDate, sep = "."),
                                 format = "%Y.%m.%d"))
# In a second step, aggregate your data
setDT(ProductTable)[, list(OrderStart = sort(date)[1],
                           OrderEnd   = sort(date)[.N],
                           SumSales   = sum(Sales)),
                    by = ProductsFamily]
# ProductsFamily OrderStart OrderEnd SumSales
#1: R1 2008-01-20 2009-03-20 12102
#2: R2 2008-01-12 2010-07-12 446
#3: R3 2009-01-30 2010-04-23 258
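A small design note: since date is a proper Date column, sort(date)[1] and sort(date)[.N] can be written more directly as min(date) and max(date); the result is the same:
## equivalent aggregation using min/max on the Date column
setDT(ProductTable)[, .(OrderStart = min(date),
                        OrderEnd   = max(date),
                        SumSales   = sum(Sales)),
                    by = ProductsFamily]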