I have a file in RData format: https://stepik.org/media/attachments/course/724/all_data.Rdata. This file contains 7 lists with the ids and temperatures of patients.
I need to make one data.frame from these lists and then remove all rows containing NA. The combined data looks like this:
id temp i.temp i.temp.1 i.temp.2 i.temp.3 i.temp.4 i.temp.5
1: 1 36.70378 36.73161 36.22944 36.05907 35.66014 37.32798 35.88121
2: 2 36.43545 35.96814 36.86782 37.20890 36.45172 36.82727 36.83450
3: 3 36.87599 36.38842 36.70508 37.44710 36.73362 37.09359 35.92993
4: 4 36.17120 35.95853 36.33405 36.45134 37.17186 36.87482 35.45489
5: 5 37.20341 37.04881 36.53252 36.22922 36.78106 36.89219 37.13207
6: 6 36.12201 36.53433 37.29784 35.96451 36.70838 36.58684 36.60122
7: 7 36.92314 36.16220 36.48154 37.05324 36.57829 36.24955 37.23835
8: 8 35.71390 37.26879 37.01673 36.65364 36.89143 36.46331 37.15398
9: 9 36.63558 37.03452 36.40129 37.53705 36.03568 36.78083 36.71873
10: 10 36.77329 36.07161 36.42992 36.20715 36.78880 36.79875 36.15004
11: 11 36.66199 36.74958 36.28661 36.72539 36.17700 37.47495 35.60980
12: 12 NA 36.97689 36.00473 36.64292 35.96789 36.73904 36.93957
13: 13 NA NA NA NA NA 36.63760 36.83916
14: 14 37.40307 35.89668 36.30619 36.64382 37.21882 35.87420 35.45550
15: 15 NA NA NA 37.03758 36.72512 36.45281 37.54388
16: 16 NA 36.44912 36.57126 36.20703 36.83076 36.48287 35.99391
17: 17 NA NA NA 36.39900 36.54043 36.75989 36.47079
18: 18 36.51696 37.09903 37.31166 36.51000 36.42414 36.87976 36.45736
19: 19 37.05117 37.42526 36.15820 36.11824 37.07024 36.60699 36.80168
20: 20 NA NA NA NA NA NA 36.74118
I wrote:
load("https://stepik.org/media/attachments/course/724/all_data.Rdata")
library(data.table)
day1 <- as.data.table(all_data[[1]])
day2 <- as.data.table(all_data[[2]])
day3 <- as.data.table(all_data[[3]])
day4 <- as.data.table(all_data[[4]])
day5 <- as.data.table(all_data[[5]])
day6 <- as.data.table(all_data[[6]])
day7 <- as.data.table(all_data[[7]])
setkey(day1, id)
setkey(day2, id)
setkey(day3, id)
setkey(day4, id)
setkey(day5, id)
setkey(day6, id)
setkey(day7, id)
all_day<-day1[day2,][day3, ][day4,][day5,][day6,][day7,]
all_day<-na.omit(all_day)
But it takes too long. How can I make it faster?
Here is a data.table solution:
library( data.table )
#set names for all_data
names( all_data ) <- paste0( "day", 1:length(all_data))
#bind lists to data.table
DT <- data.table::rbindlist( all_data, use.names = TRUE, fill = TRUE, idcol = "day" )
#cast to wide
ans <- dcast( DT, id ~ day, value.var = "temp" )
#only keep complete rows and present output (using [] at the end)
ans[ complete.cases( ans ), ][]
# id day1 day2 day3 day4 day5 day6 day7
# 1: 1 36.70378 36.73161 36.22944 36.05907 35.66014 37.32798 35.88121
# 2: 2 36.43545 35.96814 36.86782 37.20890 36.45172 36.82727 36.83450
# 3: 3 36.87599 36.38842 36.70508 37.44710 36.73362 37.09359 35.92993
# 4: 4 36.17120 35.95853 36.33405 36.45134 37.17186 36.87482 35.45489
# 5: 5 37.20341 37.04881 36.53252 36.22922 36.78106 36.89219 37.13207
# 6: 6 36.12201 36.53433 37.29784 35.96451 36.70838 36.58684 36.60122
# 7: 7 36.92314 36.16220 36.48154 37.05324 36.57829 36.24955 37.23835
# 8: 8 35.71390 37.26879 37.01673 36.65364 36.89143 36.46331 37.15398
# 9: 9 36.63558 37.03452 36.40129 37.53705 36.03568 36.78083 36.71873
# 10: 10 36.77329 36.07161 36.42992 36.20715 36.78880 36.79875 36.15004
# 11: 11 36.66199 36.74958 36.28661 36.72539 36.17700 37.47495 35.60980
# 12: 14 37.40307 35.89668 36.30619 36.64382 37.21882 35.87420 35.45550
# 13: 18 36.51696 37.09903 37.31166 36.51000 36.42414 36.87976 36.45736
# 14: 19 37.05117 37.42526 36.15820 36.11824 37.07024 36.60699 36.80168
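As an aside, the seven copy-paste joins in the question can be collapsed with Reduce(). A minimal sketch, assuming all_data is a list of data.frames that each contain id and temp (the same assumption the rbindlist call above relies on); this removes the repetition, though rbindlist + dcast is still likely the faster route:
library(data.table)
# convert each list element and give its temp column a unique name
tabs <- lapply(all_data, as.data.table)
for (i in seq_along(tabs)) setnames(tabs[[i]], "temp", paste0("day", i))
# fold the joins: x[y, on = "id"] joins the next day onto the accumulated table
all_day <- Reduce(function(x, y) x[y, on = "id"], tabs)
all_day <- na.omit(all_day)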
I am currently working in R to build a for loop which will add the year to 7 columns that contain partial dates (dd/mm). I have been attempting to run the following for-loop and have not been successful. What am I doing wrong?
Here's a sample of what my data set looks like (The actual data set includes columns HomDate - HomDate_7 but I only included the first few as I know you'll get the point...)
Participant DateVisit HomDate HomDate_2 HomeDate_3 year_flag
1 2012-04-25 18/04 19/04 20/04 NA
2 2012-01-04 28/12 29/12 30/12 1
3 2012-01-05 31/12 01/01 01/02 1
4 2012-06-13 06/06 07/06 08/06 NA
5 2012-02-12 05/02 06/02 07/02 NA
Here's the code I've been trying to use:
hom_date <- list("HomDate", "HomDate_2", "HomDate_3", "HomDate_4", "HomDate_5", "HomDate_6",
"HomDate_7")
set_dates <- function(x){
home_morbid[,x:=as.character(x)]
home_morbid[(substr(x, 4, 5)==12) & (year_flag==1), x:=paste(x, "/2011", sep="")]
home_morbid[(substr(x, 4, 5)==01) & (year_flag==1), x:=paste(x, "/2012", sep="")]
home_morbid[is.na(year_flag), x:=paste(x, "/", substr(DateVisit, 1, 4), sep="")]
}
for(i in 1:length(hom_date)){
x <- hom_date[i]
home_morbid_2<-set_dates(x)
}
I'm not sure what happens to those with an NA flag. Here is an approach:
to_replace<-grep("^Hom",names(df))
df[,(to_replace):=lapply(.SD, function(x) ifelse(is.na(year_flag),x,
ifelse(substr(x, 4, 5)==12,
paste0(x,"/","2011"),
paste0(x,"/","2012")))),
.SDcols=HomDate:HomeDate_3][]
Participant DateVisit HomDate HomDate_2 HomeDate_3 year_flag
1: 1 2012-04-25 18/04 19/04 20/04 NA
2: 2 2012-01-04 28/12/2011 29/12/2011 30/12/2011 1
3: 3 2012-01-05 31/12/2011 01/01/2012 01/02/2012 1
4: 4 2012-06-13 06/06 07/06 08/06 NA
5: 5 2012-02-12 05/02 06/02 07/02 NA
To replace NA flagged years with the year from DateVisit:
library(lubridate)
to_replace <- grep("^Hom", names(df))
df[, (to_replace) := lapply(.SD, function(x)
       ifelse(is.na(year_flag),
              paste0(x, "/", year(ymd(DateVisit))),  # year taken from DateVisit
              ifelse(substr(x, 4, 5) == "12",
                     paste0(x, "/2011"),
                     paste0(x, "/2012")))),
   .SDcols = HomDate:HomeDate_3][]
Participant DateVisit HomDate HomDate_2 HomeDate_3 year_flag
1: 1 2012-04-25 18/04/2012 19/04/2012 20/04/2012 NA
2: 2 2012-01-04 28/12/2011 29/12/2011 30/12/2011 1
3: 3 2012-01-05 31/12/2011 01/01/2012 01/02/2012 1
4: 4 2012-06-13 06/06/2012 07/06/2012 08/06/2012 NA
5: 5 2012-02-12 05/02/2012 06/02/2012 07/02/2012 NA
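For completeness, the reason the OP's loop appears to do nothing is that the x on the left-hand side of := is taken literally as a column named "x". A sketch of the repaired function, wrapping the name in parentheses on the left and reading it with get() on the right (assuming home_morbid is a data.table):
set_dates <- function(x){
  # (x) evaluates x to get the column name; get(x) reads that column
  home_morbid[, (x) := as.character(get(x))]
  home_morbid[(substr(get(x), 4, 5) == "12") & (year_flag == 1), (x) := paste0(get(x), "/2011")]
  home_morbid[(substr(get(x), 4, 5) == "01") & (year_flag == 1), (x) := paste0(get(x), "/2012")]
  home_morbid[is.na(year_flag), (x) := paste0(get(x), "/", substr(DateVisit, 1, 4))]
}
# := modifies home_morbid by reference, so no assignment is needed
for (x in hom_date) set_dates(x)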
My code executes just fine; however, it takes an enormous amount of time to finish. I would like some help optimizing the code and, if possible, a way to execute a rolling aggregation on multiple columns.
I've been trying a few other ways, creating a function and vectorizing my data.frame with library(data.table), but with no success: I actually get half of what I should get, and I can only do one column at a time.
# Creating functions
fun <- function(x, date, thresh) {
D <- as.matrix(dist(date)) #distance matrix between dates
D <- D <= thresh
D[lower.tri(D)] <- FALSE #don't sum to future
R <- D * x #FALSE is treated as 0
colMeans(R, na.rm = TRUE)
}
setDT(df_2)
df_2[, invoiceDate := as.Date(invoiceDate, format = "%m/%d/%Y")]
setkey(df_2, cod_unb, cod_pdv, invoiceDate)
df_2[, volume_total_diario_RT30 := fun(volume_total_diario, invoiceDate, 30), by = list(cod_unb, cod_pdv)]
This is my current code; it works fine but takes too much time (over 8 hours to process 30 days):
years <- c(2017:2019)
months <- c(1:12)
days <- c(1:31)
df_final <- df_n[1,c('cod_unb','cod_pdv','cpf_cnpj','idade_pdv_meses','status_telefone','col1','col2','col3','year','month','day')] # placeholder first row; dropped at the end
for (i in years) {
for (j in months) {
for (k in days) {
if (j == 1){
df_temp <- df_n[(df_n$years == i & df_n$months == j & df_n$days <= k) | (df_n$years == (i-1) & df_n$months == 12 & df_n$days >= k),]
}
if (j != 1){
df_temp <- df_n[(df_n$years == i & df_n$months == j & df_n$days <= k) | (df_n$years == i & df_n$months == (j - 1) & df_n$days >= k),]
}
# Aggregate.
if(nrow(df_temp) >= 1){
df_temp <- aggregate(df_temp[, c('col1','col2','col3')], by = list(df_temp$cod_unb,df_temp$cod_pdv,df_temp$cpf_cnpj,df_temp$idade_pdv_meses,df_temp$status_telefone), FUN = mean)
names(df_temp)[names(df_temp) == "Group.1"] <- "cod_unb"
names(df_temp)[names(df_temp) == "Group.2"] <- "cod_pdv"
names(df_temp)[names(df_temp) == "Group.3"] <- "cpf_cnpj"
names(df_temp)[names(df_temp) == "Group.4"] <- "idade_pdv_meses"
names(df_temp)[names(df_temp) == "Group.5"] <- "status_telefone"
df_temp$years <- i
df_temp$months <- j
df_temp$days <- k
df_final <- rbind(df_final,df_temp)
}
}
}
}
df_final <- df_final[-1,]
The output should contain the new column R30:
cod_unb;cod_pdv;Years;Months;Days;date;volume_total_diario;R30
111;1005;2018;11;3;03/11/2018;0.48;
111;1005;2018;11;9;09/11/2018;0.79035;
111;1005;2018;11;16;16/11/2018;1.32105;
111;1005;2018;11;24;24/11/2018;0.6414;
111;1005;2018;11;30;30/11/2018;0.6;
111;1005;2018;12;7;07/12/2018;1.79175;1.02891
111;1005;2018;12;15;15/12/2018;1.4421;1.15926
111;1005;2018;12;21;21/12/2018;0.48;0.99105
111;1005;2018;12;28;28/12/2018;0.5535;0.97347
111;1005;2019;1;4;04/01/2019;0.36;0.92547
If I understand correctly, the OP has requested to aggregate values over a rolling period of 30 days and to append these aggregates to the original data.
This can be solved efficiently by aggregating in a non-equi join.
Here is an example for one variable using sample data provided by the OP:
library(data.table)
# coerce to data.table, coerce character date to class IDate
setDT(df_n)[, date := as.IDate(date, "%d/%m/%Y")]
# intermediate result for demonstration:
df_n[.(upper = date, lower = date - 30), on = .(date <= upper, date >= lower),
mean(volume_total_diario), by = .EACHI]
date date V1
1: 2018-11-03 2018-10-04 0.480000
2: 2018-11-09 2018-10-10 0.635175
3: 2018-11-16 2018-10-17 0.863800
4: 2018-11-24 2018-10-25 0.808200
5: 2018-11-30 2018-10-31 0.766560
6: 2018-12-07 2018-11-07 1.028910
7: 2018-12-15 2018-11-15 1.159260
8: 2018-12-21 2018-11-21 0.991050
9: 2018-12-28 2018-11-28 0.973470
10: 2019-01-04 2018-12-05 0.925470
The intermediate result shows the upper and lower limits of the date range included in the aggregation and the aggregated values for the respective periods. This can be used to add a new column to df_n:
# update df_n by appending new column
setDT(df_n)[, R30_new := df_n[.(upper = date, lower = date - 30), on = .(date <= upper, date >= lower),
mean(volume_total_diario), by = .EACHI]$V1]
df_n
cod_unb cod_pdv Years Months Days date volume_total_diario R30 R30_new
1: 111 1005 2018 11 3 2018-11-03 0.48000 NA 0.480000
2: 111 1005 2018 11 9 2018-11-09 0.79035 NA 0.635175
3: 111 1005 2018 11 16 2018-11-16 1.32105 NA 0.863800
4: 111 1005 2018 11 24 2018-11-24 0.64140 NA 0.808200
5: 111 1005 2018 11 30 2018-11-30 0.60000 NA 0.766560
6: 111 1005 2018 12 7 2018-12-07 1.79175 1.02891 1.028910
7: 111 1005 2018 12 15 2018-12-15 1.44210 1.15926 1.159260
8: 111 1005 2018 12 21 2018-12-21 0.48000 0.99105 0.991050
9: 111 1005 2018 12 28 2018-12-28 0.55350 0.97347 0.973470
10: 111 1005 2019 1 4 2019-01-04 0.36000 0.92547 0.925470
The values of R30 and R30_new are identical; R30_new also contains results for the first 5 rows.
Caveat
Additional grouping variables have been ignored for the sake of clarity but can be included easily. Also, the solution can be extended to aggregate multiple value columns.
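For illustration, a sketch of how the grouping variables from the question (cod_unb, cod_pdv) could enter the join; the column names are taken from the OP's code, and R30_grp is a hypothetical name for the new column:
# lookup table: one row per observation, with group columns and the date window
lookup <- df_n[, .(cod_unb, cod_pdv, upper = date, lower = date - 30)]
# non-equi join on the groups as well as the date range
df_n[, R30_grp := df_n[lookup,
                       on = .(cod_unb, cod_pdv, date <= upper, date >= lower),
                       mean(volume_total_diario), by = .EACHI]$V1]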
Data
library(data.table)
df_n <- fread("
cod_unb;cod_pdv;Years;Months;Days;date;volume_total_diario;R30
111;1005;2018;11;3;03/11/2018;0.48;
111;1005;2018;11;9;09/11/2018;0.79035;
111;1005;2018;11;16;16/11/2018;1.32105;
111;1005;2018;11;24;24/11/2018;0.6414;
111;1005;2018;11;30;30/11/2018;0.6;
111;1005;2018;12;7;07/12/2018;1.79175;1.02891
111;1005;2018;12;15;15/12/2018;1.4421;1.15926
111;1005;2018;12;21;21/12/2018;0.48;0.99105
111;1005;2018;12;28;28/12/2018;0.5535;0.97347
111;1005;2019;1;4;04/01/2019;0.36;0.92547
")
EDIT: Aggregating multiple variables
As the OP has asked for a way to execute a rolling aggregation on multiple columns, here is an example.
First, we need to create an additional value column in the OP's sample dataset:
df_n <- fread("
cod_unb;cod_pdv;Years;Months;Days;date;volume_total_diario;R30
111;1005;2018;11;3;03/11/2018;0.48;
111;1005;2018;11;9;09/11/2018;0.79035;
111;1005;2018;11;16;16/11/2018;1.32105;
111;1005;2018;11;24;24/11/2018;0.6414;
111;1005;2018;11;30;30/11/2018;0.6;
111;1005;2018;12;7;07/12/2018;1.79175;1.02891
111;1005;2018;12;15;15/12/2018;1.4421;1.15926
111;1005;2018;12;21;21/12/2018;0.48;0.99105
111;1005;2018;12;28;28/12/2018;0.5535;0.97347
111;1005;2019;1;4;04/01/2019;0.36;0.92547
")[
, date := as.IDate(date, "%d/%m/%Y")][, var2 := .I][]
df_n
cod_unb cod_pdv Years Months Days date volume_total_diario R30 var2
1: 111 1005 2018 11 3 2018-11-03 0.48000 NA 1
2: 111 1005 2018 11 9 2018-11-09 0.79035 NA 2
3: 111 1005 2018 11 16 2018-11-16 1.32105 NA 3
4: 111 1005 2018 11 24 2018-11-24 0.64140 NA 4
5: 111 1005 2018 11 30 2018-11-30 0.60000 NA 5
6: 111 1005 2018 12 7 2018-12-07 1.79175 1.02891 6
7: 111 1005 2018 12 15 2018-12-15 1.44210 1.15926 7
8: 111 1005 2018 12 21 2018-12-21 0.48000 0.99105 8
9: 111 1005 2018 12 28 2018-12-28 0.55350 0.97347 9
10: 111 1005 2019 1 4 2019-01-04 0.36000 0.92547 10
So, a column var2 has been added (which simply contains the row number).
This is the code to aggregate multiple columns using the same aggregation function:
cols <- c("volume_total_diario", "var2")
setDT(df_n)[, paste0("mean_", cols) :=
df_n[.(upper = date, lower = date - 30),
on = .(date <= upper, date >= lower),
lapply(.SD, mean),
.SDcols = cols, by = .EACHI][
, .SD, .SDcols = cols]][]
df_n
cod_unb cod_pdv Years Months Days date volume_total_diario R30 var2 mean_volume_total_diario mean_var2
1: 111 1005 2018 11 3 2018-11-03 0.48000 NA 1 0.480000 1.0
2: 111 1005 2018 11 9 2018-11-09 0.79035 NA 2 0.635175 1.5
3: 111 1005 2018 11 16 2018-11-16 1.32105 NA 3 0.863800 2.0
4: 111 1005 2018 11 24 2018-11-24 0.64140 NA 4 0.808200 2.5
5: 111 1005 2018 11 30 2018-11-30 0.60000 NA 5 0.766560 3.0
6: 111 1005 2018 12 7 2018-12-07 1.79175 1.02891 6 1.028910 4.0
7: 111 1005 2018 12 15 2018-12-15 1.44210 1.15926 7 1.159260 5.0
8: 111 1005 2018 12 21 2018-12-21 0.48000 0.99105 8 0.991050 6.0
9: 111 1005 2018 12 28 2018-12-28 0.55350 0.97347 9 0.973470 7.0
10: 111 1005 2019 1 4 2019-01-04 0.36000 0.92547 10 0.925470 8.0
Note that the new columns have been named programmatically.
I have some data with ID and date, and I'm trying to create a new field for the semester.
df:
id date
1 20160822
2 20170109
3 20170828
4 20170925
5 20180108
6 20180402
7 20160711
8 20150831
9 20160111
10 20160502
11 20160829
12 20170109
13 20170501
I also have a semester table:
start end season_year
20120801 20121222 Fall-2012
20121223 20130123 Winter-2013
20130124 20130523 Spring-2013
20130524 20130805 Summer-2013
20130806 20131228 Fall-2013
20131229 20140122 Winter-2014
20140123 20140522 Spring-2014
20140523 20140804 Summer-2014
20140805 20141227 Fall-2014
20141228 20150128 Winter-2015
20150129 20150528 Spring-2015
20150529 20150803 Summer-2015
20150804 20151226 Fall-2015
20151227 20160127 Winter-2016
20160128 20160526 Spring-2016
20160527 20160801 Summer-2016
20160802 20161224 Fall-2016
20161225 20170125 Winter-2017
20170126 20170525 Spring-2017
20170526 20170807 Summer-2017
20170808 20171230 Fall-2017
20171231 20180124 Winter-2018
20180125 20180524 Spring-2018
20180525 20180806 Summer-2018
20180807 20181222 Fall-2018
20181223 20190123 Winter-2019
20190124 20190523 Spring-2019
20190524 20180804 Summer-2019
If df$date falls between semester$start and semester$end, I'd like to place the corresponding semester$season_year value in a new field in df.
I tried to see if the lubridate package could help, but that seems to be more for calculations.
I saw this question and it seems to be the closest to what I want, but, to make things more complicated, not all of our semesters are six months long.
Does this work?
library(lubridate)
semester$start <- ymd(semester$start)
semester$end <- ymd(semester$end)
df$date <- ymd(df$date)
# expand each semester into the full run of days it covers
LU <- Map(`:`, semester$start, semester$end)
# day-level lookup: each day mapped to the row index of its semester
LU <- data.frame(value = unlist(LU),
                 index = rep(seq_along(LU), lengths(LU)))
df$semester <- semester$season_year[LU$index[match(df$date, LU$value)]]
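A related base R sketch using findInterval(), building on the ymd() conversions above and assuming the semester table is sorted by start with non-overlapping periods (a date before the first start would need an extra guard):
# index of the last semester whose start is <= the date
idx <- findInterval(df$date, semester$start)
# accept the match only if the date is also on or before that semester's end
df$semester <- ifelse(df$date <= semester$end[idx], semester$season_year[idx], NA)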
A solution using a non-equi update join with the data.table package:
library(data.table)
setDT(df)
setDT(semester)
df[,date:=as.IDate(as.character(date), format = "%Y%m%d")]
semester[,':='(start = as.IDate(as.character(start), format = "%Y%m%d"),
end=as.IDate(as.character(end), format = "%Y%m%d"))]
df[semester, on=.(date >= start, date <= end), season_year := i.season_year]
df
# id date season_year
# 1: 1 2016-08-22 Fall-2016
# 2: 2 2017-01-09 Winter-2017
# 3: 3 2017-08-28 Fall-2017
# 4: 4 2017-09-25 Fall-2017
# 5: 5 2018-01-08 Winter-2018
# 6: 6 2018-04-02 Spring-2018
# 7: 7 2016-07-11 Summer-2016
# 8: 8 2015-08-31 Fall-2015
# 9: 9 2016-01-11 Winter-2016
# 10: 10 2016-05-02 Spring-2016
# 11: 11 2016-08-29 Fall-2016
# 12: 12 2017-01-09 Winter-2017
# 13: 13 2017-05-01 Spring-2017
Data:
df <- read.table(text="
id date
1 20160822
2 20170109
3 20170828
4 20170925
5 20180108
6 20180402
7 20160711
8 20150831
9 20160111
10 20160502
11 20160829
12 20170109
13 20170501",
header = TRUE, stringsAsFactors = FALSE)
semester <- read.table(text="
start end season_year
20120801 20121222 Fall-2012
20121223 20130123 Winter-2013
20130124 20130523 Spring-2013
20130524 20130805 Summer-2013
20130806 20131228 Fall-2013
20131229 20140122 Winter-2014
20140123 20140522 Spring-2014
20140523 20140804 Summer-2014
20140805 20141227 Fall-2014
20141228 20150128 Winter-2015
20150129 20150528 Spring-2015
20150529 20150803 Summer-2015
20150804 20151226 Fall-2015
20151227 20160127 Winter-2016
20160128 20160526 Spring-2016
20160527 20160801 Summer-2016
20160802 20161224 Fall-2016
20161225 20170125 Winter-2017
20170126 20170525 Spring-2017
20170526 20170807 Summer-2017
20170808 20171230 Fall-2017
20171231 20180124 Winter-2018
20180125 20180524 Spring-2018
20180525 20180806 Summer-2018
20180807 20181222 Fall-2018
20181223 20190123 Winter-2019
20190124 20190523 Spring-2019
20190524 20180804 Summer-2019",
header = TRUE, stringsAsFactors = FALSE)
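Since the OP wondered about lubridate: it can express this too, via interval() and %within%. A sketch that loops row-wise (fine at this size), starting from the raw data above:
library(lubridate)
dts <- ymd(df$date)  # integer yyyymmdd -> Date
ivals <- interval(ymd(semester$start), ymd(semester$end))
# for each date, the index of the (first) interval containing it, or NA
hit <- sapply(seq_along(dts), function(i) which(dts[i] %within% ivals)[1])
df$season_year <- semester$season_year[hit]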
Apologies in advance...I couldn't articulate a better title.
Here is the problem:
I am working with a data.table and have grouped rows using 'by'. This results in the same number of rows as the unique values of the column of interest. For each unique 'by' value (in this example, 'lat_lon'), I want to take the unique values in another column (ID) and add them to the same row as the unique by column.
Here is an example:
lat_lon ID
1: 42.04166667_-80.4375 26D25
2: 42.04166667_-80.4375 26D26
3: 42.04166667_-80.3125 26D34
4: 42.04166667_-80.3125 26D35
5: 42.04166667_-80.3125 26D36
6: 42.125_-80.1875 26D41
7: 42.125_-80.1875 27C46
8: 42.125_-80.1875 27D42
9: 42.04166667_-80.1875 26D43
10: 42.04166667_-80.1875 26D45
11: 42.04166667_-80.1875 27D44
12: 42.04166667_-80.1875 27D46
13: 42.29166667_-79.8125 27B76
14: 42.20833333_-80.0625 27C53
15: 42.20833333_-80.0625 27C54
16: 42.125_-80.0625 27C55
17: 42.125_-80.0625 27C56
18: 42.125_-80.0625 27D51
19: 42.125_-80.0625 27D52
What I really want is this:
lat_lon ID.1 ID.2 ID.3 ID.4 ID.5 ID.6 ID.7 ID.8 ID.9 ID.10
42.04166667_-80.4375 26D25 26D26 NA NA NA NA NA NA NA NA
42.04166667_-80.3125 26D34 26D35 26D36 NA NA NA NA NA NA NA
...
42.125_-80.0625 27C55 27C56 27D51 27D52 NA NA NA NA NA NA
Thank you for your patience and helpful comments.
For a data.table solution, adding an index column (rn) first and then pivoting with dcast.data.table will help:
dcast.data.table(dat[, rn := paste0("ID.", seq_len(.N)), by=.(lat_lon)],
lat_lon ~ rn, value.var="ID")
# lat_lon ID.1 ID.2 ID.3 ID.4
# 1: 42.04166667_-80.1875 26D43 26D45 27D44 27D46
# 2: 42.04166667_-80.3125 26D34 26D35 26D36 NA
# 3: 42.04166667_-80.4375 26D25 26D26 NA NA
# 4: 42.125_-80.0625 27C55 27C56 27D51 27D52
# 5: 42.125_-80.1875 26D41 27C46 27D42 NA
# 6: 42.20833333_-80.0625 27C53 27C54 NA NA
# 7: 42.29166667_-79.8125 27B76 NA NA NA
data:
dat <- fread("lat_lon ID
42.04166667_-80.4375 26D25
42.04166667_-80.4375 26D26
42.04166667_-80.3125 26D34
42.04166667_-80.3125 26D35
42.04166667_-80.3125 26D36
42.125_-80.1875 26D41
42.125_-80.1875 27C46
42.125_-80.1875 27D42
42.04166667_-80.1875 26D43
42.04166667_-80.1875 26D45
42.04166667_-80.1875 27D44
42.04166667_-80.1875 27D46
42.29166667_-79.8125 27B76
42.20833333_-80.0625 27C53
42.20833333_-80.0625 27C54
42.125_-80.0625 27C55
42.125_-80.0625 27C56
42.125_-80.0625 27D51
42.125_-80.0625 27D52")
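As an aside, if a single delimited column is acceptable instead of ID.1 ... ID.n, the collapse is a one-liner (sketch):
dat[, .(IDs = toString(ID)), by = lat_lon]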
This is a departure from data.table (not that it can't be done there, I'm sure, but I'm less familiar with it) into the tidyverse:
require(tidyr)
require(dplyr)
wide_data <- dat %>%
  group_by(lat_lon) %>%
  mutate(IDno = paste0("ID.", row_number())) %>%
  spread(IDno, ID)
This assumes that there are no duplicated rows with an ID repeated for a lat_lon. You could add distinct() to the chain before the grouping if that isn't the case.
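With tidyr 1.0 or later, spread() is superseded by pivot_wider(); the same chain would look like this (a sketch, with the same assumption about duplicates):
wide_data <- dat %>%
  group_by(lat_lon) %>%
  mutate(IDno = paste0("ID.", row_number())) %>%
  ungroup() %>%
  pivot_wider(names_from = IDno, values_from = ID)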
I have two data tables that I'm trying to merge. One is data on company market values through time and the other is company dividend history through time. I'm trying to find out how much each company has paid each quarter and put that value next to the market value data through time.
library(magrittr)
library(data.table)
library(zoo)
library(lubridate)
set.seed(1337)
# data table of company market values
companies <-
data.table(companyID = 1:10,
Sedol = rep(c("91772E", "7A662B"), each = 5),
Date = (as.Date("2005-04-01") + months(seq(0, 12, 3))) - days(1),
MktCap = c(100 + cumsum(rnorm(5,5)),
50 + cumsum(rnorm(5,1,5)))) %>%
setkey(Sedol, Date)
# data table of dividends
dividends <-
data.table(DivID = 1:7,
Sedol = c(rep('91772E', each = 4), rep('7A662B', each = 3)),
Date = as.Date(c('2004-11-19', '2005-01-13', '2005-01-29',
'2005-10-01', '2005-06-29', '2005-06-30',
'2006-04-17')),
DivAmnt = rnorm(7, .8, .3)) %>%
setkey(Sedol, Date)
I believe this is a situation where you could use a data.table rolling join, something like:
dividends[companies, roll = "nearest"]
to try and get a dataset that looks like
DivID Sedol Date DivAmnt companyID MktCap
1: NA 7A662B <NA> NA 6 61.21061
2: 5 7A662B 2005-06-29 0.7772631 7 66.92951
3: 6 7A662B 2005-06-30 1.1815343 7 66.92951
4: NA 7A662B <NA> NA 8 78.33914
5: NA 7A662B <NA> NA 9 88.92473
6: NA 7A662B <NA> NA 10 87.85067
7: 2 91772E 2005-01-13 0.2964291 1 105.19249
8: 3 91772E 2005-01-29 0.8472649 1 105.19249
9: NA 91772E <NA> NA 2 108.74579
10: 4 91772E 2005-10-01 1.2467408 3 113.42261
11: NA 91772E <NA> NA 4 120.04491
12: NA 91772E <NA> NA 5 124.35588
(note that I've matched the dividends to the company market values by the exact quarter)
But I'm not exactly sure how to execute it. The CRAN pdf is rather vague about what the number is or should be if roll is a value (Can you pass dates? Does a number quantify the days forward to carry? The number of observations?), and changing rollends around doesn't seem to get me what I want.
In the end, I ended up mapping the dividend dates to their quarter end and then joining on that. A good solution, but not useful if I end up needing to know how to perform rolling joins. In your answer, could you describe a situation where rolling joins are the only solution as well as help me understand how to perform them?
Instead of a rolling join, you may want to use an overlap join with the foverlaps function of data.table:
# create an interval in the 'companies' datatable
companies[, `:=` (start = compDate - days(90), end = compDate + days(15))]
# create a second date in the 'dividends' datatable
dividends[, Date2 := divDate]
# set the keys for the two datatable
setkey(companies, Sedol, start, end)
setkey(dividends, Sedol, divDate, Date2)
# create a vector of columnnames which can be removed afterwards
deletecols <- c("Date2","start","end")
# perform the overlap join and remove the helper columns
res <- foverlaps(companies, dividends)[, (deletecols) := NULL]
the result:
> res
Sedol DivID divDate DivAmnt companyID compDate MktCap
1: 7A662B NA <NA> NA 6 2005-03-31 61.21061
2: 7A662B 5 2005-06-29 0.7772631 7 2005-06-30 66.92951
3: 7A662B 6 2005-06-30 1.1815343 7 2005-06-30 66.92951
4: 7A662B NA <NA> NA 8 2005-09-30 78.33914
5: 7A662B NA <NA> NA 9 2005-12-31 88.92473
6: 7A662B NA <NA> NA 10 2006-03-31 87.85067
7: 91772E 2 2005-01-13 0.2964291 1 2005-03-31 105.19249
8: 91772E 3 2005-01-29 0.8472649 1 2005-03-31 105.19249
9: 91772E NA <NA> NA 2 2005-06-30 108.74579
10: 91772E 4 2005-10-01 1.2467408 3 2005-09-30 113.42261
11: 91772E NA <NA> NA 4 2005-12-31 120.04491
12: 91772E NA <NA> NA 5 2006-03-31 124.35588
In the meantime the data.table authors have introduced non-equi joins (v1.9.8). You can also use that to solve this problem. Using a non-equi join you just need:
companies[, `:=` (start = compDate - days(90), end = compDate + days(15))]
dividends[companies, on = .(Sedol, divDate >= start, divDate <= end)]
to get the intended result.
Used data (the same as in the question, but with the date columns renamed to compDate and divDate, and without the creation of the keys):
set.seed(1337)
companies <- data.table(companyID = 1:10, Sedol = rep(c("91772E", "7A662B"), each = 5),
compDate = (as.Date("2005-04-01") + months(seq(0, 12, 3))) - days(1),
MktCap = c(100 + cumsum(rnorm(5,5)), 50 + cumsum(rnorm(5,1,5))))
dividends <- data.table(DivID = 1:7, Sedol = c(rep('91772E', each = 4), rep('7A662B', each = 3)),
divDate = as.Date(c('2004-11-19','2005-01-13','2005-01-29','2005-10-01','2005-06-29','2005-06-30','2006-04-17')),
DivAmnt = rnorm(7, .8, .3))
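To answer the roll question directly: in X[i, roll = ...], rolling applies to the last join column. roll = TRUE (or Inf) carries the last value of X forward, roll = -Inf carries backward, roll = "nearest" picks the closer side, and a number limits how far a match may be carried, in the units of that column (days for a Date, not a count of observations). A sketch on the data above:
# match each company date to the most recent dividend, but only if it
# occurred within the preceding 90 days; otherwise NA
dividends[companies, on = .(Sedol, divDate = compDate), roll = 90]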