how to sum two table "tk_xts" by date - r

I need to sum two times series tables`
df1 <- data.frame(date =c("20191228","20191229","20191230","20191231"), value=c(1,2,3,4), income= c(12,14,51,12))
df1$date<-ymd(df1$date)
df1ts <- tk_xts(df1,data_var =date)
df2 <- data.frame(date =c("20191226","20191227","20191228","20191229"), value=c(4,5,6,7), income= c(14,11,53,13))
df2$date<-ymd(df2$date)
df2ts <- tk_xts(df2,data_var =date)
I want to get this
df3ts value
2019-12-26 4
2019-12-27 5
2019-12-28 7
2019-12-29 10
2019-12-30 3
2019-12-31 4
What kind of funtion I need to apply to get this?

You can rbind the dataframes and use aggregate.
library("tidyverse")
rbind(df1,df2) %>%
aggregate(. ~ date, ., FUN=sum)
If you only want the column 'value', replace the first '.' with 'value'.

Related

How to create subsets of multiple date ranges in R

I have a data frame with dates and numbers called 'df'. I have another data frame with start and end dates called 'date_ranges'.
My goal is to filter/subset df so that it only shows for the start/end dates in each row of the date_ranges column. Here is my code so far:
df_date <- as.Date((as.Date('2010-01-01'):as.Date('2010-04-30')))
df_numbers <- c(1:120)
df <- data.frame(df_date, df_numbers)
start_dates <- as.Date(c("2010-01-06", "2010-02-01", '2010-04-15'))
end_dates <- as.Date(c("2010-01-23", "2010-02-06", '2010-04-29'))
date_ranges <- data.frame(start_dates, end_dates)
# Attempting to filter df by start and end dates
for (i in range(date_ranges$start_dates)){
for (j in range(date_ranges$end_dates)){
print (
df %>%
filter(between(df_date, i, j)))
}
}
The first and third result of the nested for loop is what I want, but not the second result. The first and third give me the dates and values for df between their respective rows, but the second result is the range from the earliest date to the latest date. How can I fix this loop to exclude the second result?
A tidyverse approach could be to create a sequence between start and end_dates and join with df to keep only the dates which lie in the range.
library(dplyr)
date_ranges %>%
mutate(df_date = purrr::map2(start_dates, end_dates, seq, "day")) %>%
tidyr::unnest(df_date) %>%
select(-start_dates, -end_dates) %>%
left_join(df, by = 'df_date')
# A tibble: 39 x 2
# df_date df_numbers
# <date> <int>
# 1 2010-01-06 6
# 2 2010-01-07 7
# 3 2010-01-08 8
# 4 2010-01-09 9
# 5 2010-01-10 10
# 6 2010-01-11 11
# 7 2010-01-12 12
# 8 2010-01-13 13
# 9 2010-01-14 14
#10 2010-01-15 15
# … with 29 more rows
You can try looping through index
for (i in seq_along(date_ranges$start_dates)){
print (
df %>%
filter(between(df_date, date_ranges$start_dates[i], date_ranges$end_dates[i])))
}
Base R solution:
# Your data creation can be simplified:
df <- data.frame(df_date = seq.Date(as.Date('2010-01-01', "%Y-%m-%d"), as.Date('2010-04-30', "%Y-%m-%d"),
by = 1), df_numbers = c(1:120))
# Store start and end date vectors to filter the data.frame:
start_dates <- as.Date(c("2010-01-06", "2010-02-01", '2010-04-15'))
end_dates <- as.Date(c("2010-01-23", "2010-02-06", '2010-04-29'))
# Subset the data to extract records with matching dates: df => stdout (Console
df[df$df_date %in% c(start_dates, end_dates),]

I want filter by previous near date my dataframe to a given date

I have this dataframe
Data <- c("2013-08-03", "2013-09-04", "2013-09-08", "2013-09-12", "2013-11-01")
prezzi <- c(1,2,3,4,5)
t <- data.frame(Data, prezzi)
Time <- "2013-09-11"
I want filter the dataframe Data by previous near date to a given date Time
The output is
Data Prezzi
2013-09-08 3
I use this code filter(Data == t$Data[which.min(abs(Time - as.Date(t$Data)))])
and it doesn't work. The result is 2013-09-12 but I want previous near date (2013-09-08.
We can subtract 1 from the which.min. If the 'Data' is not arranged, we may need to the order it first
t1 <- t[order(t$as.Date(Data)),]
i1 <- with(t1, which.min(abs(as.Date(Time) - as.Date(Data)))-1)
t1[i1, , drop = FALSE]
# Data prezzi
#3 2013-09-08 3
If we are using dplyr
library(dplyr)
t %>%
mutate(Data = as.Date(Data)) %>%
arrange(Data) %>%
slice(which.min(abs(as.Date(Time) - Data)) - 1)
# Data prezzi
#1 2013-09-08 3
Update
If the OP is not considering based on previous row, but just the rows that is from a minimal difference from 'Time', then use findInterval
Time <- "2013-09-09"
t[findInterval(as.Date(Time), as.Date(t$Data)),]
# Data prezzi
#3 2013-09-08 3
Less straightforward than #akrun's answer, you can filter first for date below "Time" and then keep the maximum date in the subset:
library(dplyr)
t %>% filter(as.Date(Data) < as.Date(Time)) %>%
filter(as.Date(Data) == max(as.Date(Data)))
Data prezzi
1 2013-09-08 3
1) Using base R only this gives the largest date prior to Time:
tail(subset(t, Time > format(Data)), 1)
## Data prezzi
## 3 2013-09-08 3
2) or using dplyr:
t %>%
filter(Time > format(Data)) %>%
slice(n())
3) or using sqldf:
library(sqldf)
fn$sqldf("select max(Data) Data, prezzi from t
where '$Time' > Data")
## Data prezzi
## 1 2013-09-08 3
One way in base R would be :
#Convert to date class
t$Data <- as.Date(t$Data)
Time <- as.Date("2013-09-11")
#Get difference between date and time
inds <- t$Data - Time
#Select the one with minimum difference
t[inds == max(inds[inds < 0 ]), ]
# Data prezzi
#3 2013-09-08 3

Check if date is between two dates in another data frame, and manipulate date if it is

I have two data frames (df1 and df2); they each have an ID column, and are organized by ID number with many rows per ID for each dataframe. df1 has a "unique_posix" column, and df2 has a "date.time.start" and "date.time.end" column, as well as a column of "depth" and "shape".
I would like, for each ID, to take my "unique_posix" column from df1 and go to df2 and find the "date.time.start" and "date.time.end" that it falls between or on. When I find the row that it corresponds to, I want to pull the "depth" and "shape" from df2 and copy it to new columns in df1 for that unique date/time.
I have tried doing this as a for loop with if/else, and I have tried doing this in dplyr.
df1<-data.frame(ID=c('SW12','SW12','SW12','SW12','SW12','SW13','SW13','SW13','SW13','SW13'), unique_posix=c('5/3/10 16:47','5/3/10 16:53','5/3/10 17:00', '5/3/10 18:00','5/3/10/ 18:12','8/15/10 17:13','8/15/10 17:18','8/15/10 17:37','8/15/10 18:00','8/15/10 18:52'))
df2<- data.frame(ID=c('SW12','SW12','SW12','SW12','SW12','SW13','SW13','SW13','SW13','SW13'), Date.Time.Start=c('5/3/10 15:57','5/3/10 16:18', '5/3/10 16:55','5/3/10 17:36','5/3/10 18:17','8/15/10 16:55','8/15/10 17:28','8/15/10 17:54', '8/15/10 18:55','8/15/10 19:20'), Date.Time.End=c('5/3/10 16:09','5/3/10 16:44','5/3/10 17:28', '5/3/10 18:08', '5/3/10 18:49', '8/15/10 17:22', '8/15/10 17:52','8/15/10 18:06','8/15/10 19:15','8/15/10 19:40'), Shape=c('U','U','V','Square','U','U','U','Square','V','U'), Depth=c(1,2,3,4,5,6,7,8,9,10))
I would like df1 to end up looking like:
df1b<-data.frame(ID=c('SW12','SW12','SW12','SW12','SW12','SW13','SW13','SW13','SW13','SW13'), unique_posix=c('5/3/10 16:47','5/3/10 16:53','5/3/10 17:00', '5/3/10 18:00','5/3/10/ 18:12','8/15/10 17:13','8/15/10 17:18','8/15/10 17:37','8/15/10 18:00','8/15/10 18:52'), Dive.Shape=c(NA,NA,'V','Square',NA,'U','U','U','Square', NA),Dive.Depth=c(NA,NA,3,4,NA,6,6,7,8,NA))
I've converted the date/times to POSIXct/lt:
library(dplyr)
df1 <- df1 %>%
mutate(
ID = factor(ID),
unique_posix = mdy_hm(unique_posix)
)
class(df1$unique_posix)
df2 <- df2 %>%
mutate(
ID = factor(ID),
Date.Time.Start = mdy_hm(Date.Time.Start),
Date.Time.End = mdy_hm(Date.Time.End)
)
class(df2$Date.Time.Start)
As a for-loop I have tried:
df1b<-df1
for (i in 1:nrow(df1)) {
if (df1$unique_posix %within% interval(df2$Date.Time.Start, df2$Date.Time.End)) {
df1b$Dive.Shape<-df2$Shape
df1b$Dive.Depth<-df2$Depth
}
else {
df1b$Dive.Shape<-NA
df2b$Dive.Depth<-NA
}
}
In dplyr I was trying something like this:
df1b<-inner_join(df1, df2, by="DeployID")
df1b %>% rowwise() %>%
mutate(Dive.Shape=ifelse(between(unique_posix, Date.Time.Start, Date.Time.End),Shape,NA )) %>%
mutate(Dive.Depth=ifelse(between(unique_posix, Date.Time.Start, Date.Time.End),Depth,NA ))
arrange(DeployID,desc(unique_posix)) %>%
distinct(unique_posix)
None of this seems to be working, but I feel like I'm close?
I would like to end up with my df1b having two extra columns of Dive.Shape and Dive.Depth, that would contain an "NA" if the unique_posix date/time didn't fall within or on a Date.Time.Start and Date.Time.End range in the df2 frame [for each ID]. The columns would contain values from df2's Shape and df2's Depth column if the df1's unique_posix fell between or on the df2's Date.Time.Start or Date.Time.End columns.
Thank you for any help I can get on this!
With data.table this is relatively simple with a non-equi update join:
library(data.table)
setDT(df1)
setDT(df2)
df1[df2
, on = .(ID
, unique_posix > Date.Time.Start
, unique_posix < Date.Time.End)
, `:=`(Dive.Shape = Shape, Dive.Depth = Depth)]
df1
> df1
ID unique_posix Dive.Shape Dive.Depth
1: SW12 2010-05-03 16:47:00 <NA> NA
2: SW12 2010-05-03 16:53:00 <NA> NA
3: SW12 2010-05-03 17:00:00 V 3
4: SW12 2010-05-03 18:00:00 Square 4
5: SW12 2010-05-03 18:12:00 <NA> NA
6: SW13 2010-08-15 17:13:00 U 6
7: SW13 2010-08-15 17:18:00 U 6
8: SW13 2010-08-15 17:37:00 U 7
9: SW13 2010-08-15 18:00:00 Square 8
10: SW13 2010-08-15 18:52:00 <NA> NA
See also: How to do a data.table rolling join?
I think you are. The issue is that in the data.frames, the dates/times are saved as characters.
apply(df1, 2, class)
ID unique_posix
> "character" "character"
apply(df2, 2, class)
ID Date.Time.Start Date.Time.End Shape Depth
"character" "character" "character" "character" "character"
In reality, you want to convert unique_posix, Date.Time.Start and Date.Time.End to dates/times. Possibly use strptime()? I think that the comparisons would work, but I didn't verify them yet. I need to go soon, but I wanted to give you something anyway.
If you still want to pursue the dplyr solution, try this:
inner_join(df1, df2, by = "ID") %>%
rowwise() %>%
filter (between(unique_posix, Date.Time.Start, Date.Time.End)) %>%
right_join(df1, by = c("ID", "unique_posix")) %>%
dplyr::select (-c(Date.Time.Start, Date.Time.End), Dive.Shape = Shape, Dive.Depth = Depth)

Fill in missing date and fill with the data above

I've researched enough until i ask this here but can you please help me with some ideas for this issue?
My data table (df) looks like this:
client id value repmonth
123 100 2012-01-31
123 200 2012-02-31
123 300 2012-05-31
Therefore I have 2 missing months. And i want my data table to look like this:
client id value repmonth
123 100 2012-01-31
123 200 2012-02-31
123 200 2012-03-31
123 200 2012-04-31
123 300 2012-05-31
The code should be filling in the missing repmonth and fill the rows with the last value, in this case 200 and the came client id.
I have tried the following:
zoo library
tidyr library
dlpyr library
posixct
As for codes: ...plenty of fails
library(tidyr)
df %>%
mutate (repmonth = as.Date(repmonth)) %>%
complete(repmonth = seq.Date(min(repmonth), max(repmonth),by ="month"))
or
library(dplyr)
df$reportingDate.end.month <- as.POSIXct(df$datetime, tz = "GMT")
df <- tbl_df(df)
list_df <- list(df, df) # fake list of data.frames
seq_df <- data_frame(datetime = seq.POSIXt(as.POSIXct("2012-01-31"),
as.POSIXct("2018-12-31"),
by="month"))
lapply(list_df, function(x){full_join(total_loan_portfolios_3$reportingDate.end.month, seq_df, by=reportingDate.end.month)})
total_loan_portfolios_3$reportingmonth_notmissing <- full_join(seq_df,total_loan_portfolios_3$reportingDate.end.month)
or
library(dplyr)
ts <- seq.POSIXt(as.POSIXct("2012-01-01",'%d/%m/%Y'), as.POSIXct("2018/12/01",'%d/%m/%Y'), by="month")
ts <- seq.POSIXt(as.POSIXlt("2012-01-01"), as.POSIXlt("2018-12-01"), by="month")
ts <- format.POSIXct(ts,'%d/%m/%Y')
df <- data.frame(timestamp=ts)
total_loan_portfolios_3 <- full_join(df,total_loan_portfolios_3$Reporting_date)
Finally, I have plenty of errors like
the format is not date
or
Error in seq.int(r1$mon, 12 * (to0$year - r1$year) + to0$mon, by) :
'from' must be a finite number
and others.
The following solution uses lubridate and tidyr packages. Note that in OP example, dates are malformed, but implies having data with last-day-of-month input, so tried to replicate it here. Solution creates a sequence of dates from min input date to max input date to get all possible months of interest. Note that input dates are normalized to first-day-of-month to ensure proper sequence generation. With the sequence created, a left-join merge is done to merge data we have and identify missing data. Then fill() is applied to columns to fill in the missing NAs.
library(lubridate)
library(tidyr)
#Note OP has month of Feb with 31 days... Corrected to 28 but this fails to parse as a date
df <- data.frame(client_id=c(123,123,123),value=c(100,200,300),repmonth=c("2012-01-31","2012-02-29","2012-05-31"),stringsAsFactors = F)
df$repmonth <- ymd(df$repmonth) #convert character dates to Dates
start_month <- min(df$repmonth)
start_month <- start_month - days(day(start_month)-1) #first day of month to so seq.Date sequences properly
all_dates <- seq.Date(from=start_month,to=max(df$repmonth),by="1 month")
all_dates <- (all_dates %m+% months(1)) - days(1) #all end-of-month-day since OP suggests having last-day-of-month input?
all_dates <- data.frame(repmonth=all_dates)
df<-merge(x=all_dates,y=df,by="repmonth",all.x=T)
df <- fill(df,c("client_id","value"))
Solution yields:
> df
repmonth client_id value
1 2012-01-31 123 100
2 2012-02-29 123 200
3 2012-03-31 123 200
4 2012-04-30 123 200
5 2012-05-31 123 300

Using lapply to output values between date ranges within different factor levels

I have 2 dataframes, one representing daily sales figures of different stores (df1) and one representing when each store has been audited (df2). I need to create a new dataframe displaying sales information from each site taken 1 week before each audit (i.e. the information in df2). Some example data, firstly for the daily sales figures from different stores across a certain period:
Dates <- as.data.frame(seq(as.Date("2015/12/30"), as.Date("2016/4/7"),"day"))
Sales <- as.data.frame(matrix(sample(0:50, 30*10, replace=TRUE), ncol=3))
df1 <- cbind(Dates,Sales)
colnames(df1) <- c("Dates","Site.A","Site.B","Site.C")
And for the dates of each audit across different stores:
Store<- c("Store.A","Store.A","Store.B","Store.C","Store.C")
Audit_Dates <- as.data.frame(as.POSIXct(c("2016/1/4","2016/3/1","2016/2/1","2016/2/1","2016/3/1")))
df2 <- as.data.frame(cbind(Store,Audit_Dates ))
colnames(df2) <- c("Store","Audit_Dates")
Of note is that there will be an uneven amount of dates within each output (i.e. there may not be a full weeks worth of information prior to some store audits). I have previously asked a question addressing a similar problem Creating a dataframe from an lapply function with different numbers of rows. Below shows an answer from this which would work for an example if I was to consider information from only 1 store:
library(lubridate)
##Data input
Store.A_Dates <- as.data.frame(seq(as.Date("2015/12/30"), as.Date("2016/4/7"),"day"))
Store.A_Sales <- as.data.frame(matrix(sample(0:50, 10*10, replace=TRUE), ncol=1))
Store.A_df1 <- cbind(Store.A_Dates,Store.A_Sales)
colnames(Store.A_df1) <- c("Store.A_Dates","Store.A_Sales")
Store.A_df2 <- as.Date(c("2016/1/3","2016/3/1"))
##Output
Store.A_output<- lapply(Store.A_df2, function(x) {Store.A_df1[difftime(Store.A_df1[,1], x - days(7)) >= 0 & difftime(Store.A_df1[,1], x) <= 0, ]})
n1 <- max(sapply(Store.A_output, nrow))
output <- data.frame(lapply(Store.A_output, function(x) x[seq_len(n1),]))
But I don't know how I would get this for multiple sites.
Try this:
# Renamed vars for my convenience...
colnames(df1) <- c("t","Store.A","Store.B","Store.C")
colnames(df2) <- c("Store","t")
library(tidyr)
library(dplyr)
# Gather df1 so that df1 and df2 have the same format:
df1 = gather(df1, Store, Sales, -t)
head(df1)
t Store Sales
1 2015-12-30 Store.A 16
2 2015-12-31 Store.A 24
3 2016-01-01 Store.A 8
4 2016-01-02 Store.A 42
5 2016-01-03 Store.A 7
6 2016-01-04 Store.A 46
# This lapply call does not iterate over actual values, just indexes, which allows
# you to subset the data comfortably:
r <- lapply(1:nrow(df2), function(i) {
audit.t = df2[i, "t"] #time of audit
audit.s = df1[, "Store"] == df2[i, "Store"] #store audited
df = df1[audit.s, ] #data from audited store
df[, "audited"] = audit.t #add extra column with audit date
week_before = difftime(df[, "t"], audit.t - (7*24*3600)) >= 0
week_audit = difftime(df[, "t"], audit.t) <= 0
df[week_before & week_audit, ]
})
Does this give you the proper subsets?
Also, to summarise your results:
r = do.call("rbind", r) %>%
group_by(audited, Store) %>%
summarise(sales = sum(Sales))
r
audited Store sales
<time> <chr> <int>
1 2016-01-04 Store.A 97
2 2016-02-01 Store.B 156
3 2016-02-01 Store.C 226
4 2016-03-01 Store.A 115
5 2016-03-01 Store.C 187

Resources