How to impute data from one dataframe to another in R

I have two data frames df1 and df2
df1 <- data.frame(ts = c('2020-01-15', '2020-01-16', '2020-01-17', '2020-01-20', '2020-01-22',
                         '2020-01-24', '2020-01-27', '2020-01-30', '2020-01-31'),
                  lla = c(12, 13, 14, 15, 16, 17, 18, 19, 20),
                  llb = c(1, 2, 3, 4, 6, 5, 9, 8, 7),
                  llc = c(0.6, 1.6, 2.6, 3.6, 4.6, 5.6, 6.6, 7.6, 8.6),
                  lld = c(10, 11, 12, 13, 14, 15, 16, 154, 167))
df2 <- data.frame(ts = c('2020-01-17', '2020-01-24', '2020-01-31'),
                  lla = NA, llb = NA, llc = NA, lld = NA)
If a date in df2$ts matches a date in df1$ts, it should look back 4 days and impute the max value of every column into df2.
For example:
The 1st value of df2$ts, "2020-01-17", matches df1$ts. "Look back" means filtering df1$ts for the dates up to and including 2020-01-17 (the snippet below uses a 2-day offset, i.e. 2020-01-15 to 2020-01-17), so we get:
# code
df1[(as.Date(df1$ts) >= (as.Date(df2[1, 1]) - 2)) &
    (as.Date(df1$ts) <= (as.Date(df2[1, 1]))), ]
# I am writing this in a loop so that it iterates over every date of df2
df1 <- data.frame(ts = c('2020-01-15', '2020-01-16', '2020-01-17'),
                  lla = c(12, 13, 14),
                  llb = c(1, 2, 3),
                  llc = c(0.6, 1.6, 2.6),
                  lld = c(10, 11, 12))
So now we have to get the max of every column, which we can achieve with this code:
# would return the maximum of every column
apply(df1[(as.Date(df1$ts) >= (as.Date(df2[1, 1]) - 2)) &
          (as.Date(df1$ts) <= (as.Date(df2[1, 1]))), ], 2, max)
But I don't know how to insert these values into df2 for the matching date, i.e. "2020-01-17", and likewise for the other dates of df2.
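In other words, what I am after is roughly this base R sketch (hypothetical code, using the 2-day offset from my snippet above; widen the offset for a larger window):
df1$ts <- as.Date(df1$ts)
df2$ts <- as.Date(df2$ts)
cols <- c("lla", "llb", "llc", "lld")
for (i in seq_len(nrow(df2))) {
  # rows of df1 inside the window ending at df2$ts[i]
  win <- df1[df1$ts >= (df2$ts[i] - 2) & df1$ts <= df2$ts[i], cols]
  df2[i, cols] <- apply(win, 2, max)
}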

Try:
library(dplyr)
df1 <- df1 %>% mutate(ts = as.Date(ts))
df2 <- df2 %>% mutate(ts = as.Date(ts))

my_function <- function(x, df) {
  df %>%
    filter(ts >= (x$ts - 3) & ts <= x$ts) %>%
    summarise(across(.cols = lla:lld, .fns = max)) %>%
    mutate(ts = x$ts)
}

lapply(split(df2, df2$ts), my_function, df = df1) %>% do.call(rbind, .)

Here is an option with a rolling join (roll) after creating a new column 4 days back:
library(data.table)
library(lubridate)
# // convert columns to Date class
df1$ts <- as.Date(df1$ts)
df2$ts <- as.Date(df2$ts)
nm1 <- names(df2)[-1]
# // change the type of NA columns from logical to numeric
setDT(df2)[, (nm1) := lapply(.SD, as.numeric), .SDcols = nm1]
# // subtract 4 days from ts to create ts1
setDT(df1)[, ts1 := ts %m-% days(4)]
# // do a rolling join while getting the `max` for each column
df2[df1, (nm1) := lapply(mget(paste0("i.", nm1)), max),
    on = .(ts = ts1), roll = -Inf, by = .EACHI]
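If the roll argument is unfamiliar, here is a tiny illustration of what roll = -Inf (a backward roll: next observation carried backward) does; the toy tables are only for demonstration:
library(data.table)
lookup <- data.table(t = c(1, 5, 10), label = c("a", "b", "c"))
query  <- data.table(t = c(2, 6))
# each query$t without an exact match takes the next lookup$t at or after it
lookup[query, on = .(t), roll = -Inf]
#>    t label
#> 1: 2     b
#> 2: 6     c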

Alternate approach using runner
df1$ts <- as.Date(df1$ts)
df2$ts <- as.Date(df2$ts)

library(runner)
library(dplyr)
df2 %>%
  mutate(across(!ts, ~max_run(x = df1[[cur_column()]],
                              k = 4,
                              idx = df1$ts,
                              at = cur_data()[[1]])))
#> ts lla llb llc lld
#> 1 2020-01-17 14 3 2.6 12
#> 2 2020-01-24 17 6 5.6 15
#> 3 2020-01-31 20 8 8.6 167
Created on 2021-06-06 by the reprex package (v2.0.0)
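Note that dplyr 1.1.0 and later deprecate cur_data(); if that warning appears, one equivalent spelling of the at argument (my adaptation, not part of the original answer) is:
df2 %>%
  mutate(across(!ts, ~max_run(x = df1[[cur_column()]],
                              k = 4,
                              idx = df1$ts,
                              at = pick(ts)[[1]])))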

Related

How to create a new variable in df1 with the sum of a variable in df2 based on a start and end date in df1?

The perfect answer to my question already exists as the first answer by @akrun to this question: Sum variable between dates in R?
The answer by @akrun is exactly what I am looking for, but when I run the code with the example data in the original question, I do not get the sum of the value column between the two dates; instead I get the last value in the date interval...
Any suggestions?
Example data:
df1 <- data.frame(Start = as.Date(c('1/1/20', '5/1/20', '10/1/20', '2/2/21', '3/20/21'), "%m/%d/%y"),
                  End   = as.Date(c('1/7/20', '5/7/20', '10/7/20', '2/7/21', '3/30/21'), "%m/%d/%y"))
df2 <- data.frame(Date = as.Date(c('1/1/20', '1/3/20', '5/1/20', '5/2/20', '6/2/20', '6/4/20',
                                   '10/1/20', '2/2/21', '3/20/21'), "%m/%d/%y"),
                  value = as.numeric(c('1', '2', '5', '15', '20', '2', '3', '78', '100')))
# @akrun's code:
library(data.table)
setDT(df1)[df2, value := sum(value),
           on = .(Start <= Date, End >= Date), by = .EACHI]
The problem is the direction of the join: with df1[df2, ...], by = .EACHI groups per row of df2, so each group sees only that single df2 value (sum(value) is just that value), and when several df2 rows match the same df1 interval, each assignment overwrites the last, leaving the final value in the interval. We could do the join in reverse:
library(data.table)
setDT(df2)[df1, .(value = sum(value)),
           on = .(Date >= Start, Date <= End), by = .EACHI]
Here is a fuzzyjoin solution; the key point is the match_fun argument:
library(fuzzyjoin)
library(dplyr)

fuzzy_left_join(
  df2, df1,
  by = c(
    "Date" = "Start",
    "Date" = "End"
  ),
  match_fun = list(`>=`, `<=`)
) %>%
  group_by(Start, End) %>%
  summarise(sum = sum(value))
Start End sum
<date> <date> <dbl>
1 2020-01-01 2020-01-07 3
2 2020-05-01 2020-05-07 20
3 2020-10-01 2020-10-07 3
4 2021-02-02 2021-02-07 78
5 2021-03-20 2021-03-30 100
6 NA NA 22
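The last row, with NA bounds, collects the df2 dates that fall inside no df1 interval (2020-06-02 and 2020-06-04, whose values sum to 22); drop it with filter(!is.na(Start)) if it is not wanted.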

Get maximum value of last 3 days if date matches from another dataframe in R?

I have two dataframes:
df1 <- data.frame(ts = c('2020-01-15', '2020-01-16', '2020-01-17', '2021-01-14', '2021-01-15',
                         '2021-01-16', '2021-01-24', '2021-01-25', '2021-01-26'),
                  aa_h = c(1, 2, 3, 6, 4, 5, 7, 9, 8),
                  bh   = c(12, 13, 14, 11, 11, 11, 122, 12, 56))
df2_mx <- data.frame(ts = c('2020-01-17', '2021-01-16', '2021-01-26'),
                     aa = NA)
Now I want to compare the dates of df2_mx with df1 and, where they match, take the max value of aa_h over the current day and the two days before it, and insert it into the "aa" column of df2_mx.
Example:
The 1st row of df2_mx, '2020-01-17', matches the 3rd row of df1; looking up 2 days above gives max(c(1, 2, 3)) --> 3, which is inserted into the "aa" column of df2_mx.
Expected Output:
df2_mx <- data.frame(ts = c('2020-01-17', '2021-01-16', '2021-01-26'),
                     aa = c(3, 6, 9))
Tryout Code
for (i in 1:nrow(df1)) {
  j <- which(as.Date(df2_mx[, 1]) == as.Date(df1[i, 1]))
  if (length(j) > 0) {
    df2_mx[j, "aa"] <- max(df1[(i - 2):i, "aa_h"])
  }
}
An option with the fuzzyjoin package:
library(dplyr)
df1 %>%
  mutate(ts = as.Date(ts)) %>%
  fuzzyjoin::fuzzy_right_join(df2_mx %>%
                                mutate(ts = as.Date(ts), ts_2_day = ts - 2),
                              by = c('ts', 'ts' = 'ts_2_day'),
                              match_fun = c(`<=`, `>=`)) %>%
  group_by(ts = ts.y) %>%
  summarise(aa_h = max(aa_h, na.rm = TRUE))
# ts aa_h
# <date> <dbl>
#1 2020-01-17 3
#2 2021-01-16 6
#3 2021-01-26 9
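A compact base R alternative along the same lines (my sketch, not from the thread; it assumes the 2-day look-back described in the question):
df1$ts <- as.Date(df1$ts)
df2_mx$ts <- as.Date(df2_mx$ts)
# for each target date, take the max of aa_h over that day and the two before it
df2_mx$aa <- sapply(df2_mx$ts, function(d)
  max(df1$aa_h[df1$ts >= d - 2 & df1$ts <= d]))
df2_mx
#>           ts aa
#> 1 2020-01-17  3
#> 2 2021-01-16  6
#> 3 2021-01-26  9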

group_by summarise outside of group_by in dplyr

I'm trying to group ids with date in this dataset, but I want to summarise based on one of the features outside of the group.
library(dplyr)
library(lubridate)

set.seed(100)
df <- data.frame(ids = sample(c('436247', '2465347', '346654645'), 10000, replace = TRUE),
                 date = sample(seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by = 1),
                               10000, replace = TRUE))

new_df <- df %>%
  group_by(ids, date) %>%
  summarise(events = length(ids[date >= date - 30 & date <= date]))
I'm trying to take this dataframe and answer the question: "for each of the ids, and each date, how many other records within that id are within the past 30 days of that date?" Unfortunately, when I group_by both ids and date, it only looks within the grouped date. I've created the solution below, but I'm not sure if there is a better one with dplyr.
groupby_function <- function(df, spec_date) {
  result <- df %>%
    group_by(ids) %>%
    summarise(events = length(ids[date >= spec_date - 30 & date <= spec_date])) %>%
    mutate(date = spec_date)
  return(result)
}

date_vector <- seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by = 1)
list_results <- lapply(date_vector, groupby_function, df = df)
x <- do.call(rbind, list_results)
"for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date"
For that, a "join by" condition makes sense, but at the time of writing it isn't included in dplyr (dplyr 1.1.0 later added non-equi joins via join_by()). Until then, you could use data.table inside your dplyr chain:
# enumerate id-date combos of interest
grid_df = expand.grid(
  id = unique(df$ids),
  d  = seq(min(df$date), max(df$date), by = "day")
)

# helper function
library(data.table)
count_matches = function(DF, targetDF, ...) {
  onexpr = substitute(list(...))
  data.table(targetDF)[DF, on = eval(onexpr), .N, by = .EACHI]$N
}

# use a non-equi join to count matching rows
res = grid_df %>%
  mutate(d_dn = d - 30) %>%
  mutate(n = count_matches(., df, ids = id, date >= d_dn, date <= d)) %>%
  as_tibble
# A tibble: 186 x 4
id d d_dn n
<fctr> <date> <date> <int>
1 436247 2018-03-01 2018-01-30 72
2 2465347 2018-03-01 2018-01-30 69
3 346654645 2018-03-01 2018-01-30 51
4 436247 2018-03-02 2018-01-31 123
5 2465347 2018-03-02 2018-01-31 120
6 346654645 2018-03-02 2018-01-31 100
7 436247 2018-03-03 2018-02-01 170
8 2465347 2018-03-03 2018-02-01 166
9 346654645 2018-03-03 2018-02-01 154
10 436247 2018-03-04 2018-02-02 228
# ... with 176 more rows
For equality conditions, it should work fine to write either ids = id or ids == id, I think.
If you're interested, the syntax is x[i, on=, j, by=.EACHI] where x and i are tables. For each row of i, we look up rows of x based on the on= criteria (left-hand side refers to columns in x; right-hand to columns in i); then we do j for each ("by each row of i" so by=.EACHI). In this case, j = .N means that we count matched rows of x, returned as a column of counts N.
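To make the pattern concrete, here is a tiny self-contained illustration (toy tables of my own, not from the answer):
library(data.table)
x <- data.table(id = c(1, 1, 2), val = c(10, 20, 30))
i <- data.table(id = c(1, 2), cutoff = c(15, 40))
# for each row of i, count the rows of x with the same id and val <= cutoff
x[i, on = .(id, val <= cutoff), .N, by = .EACHI]
#>    id val N
#> 1:  1  15 1
#> 2:  2  40 1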
You can look at the "ungrouped" data by just going back to the original data frame (calling df$date or df$ids). So I think what you are after is:
test_df <- df %>%
  group_by(ids, date) %>%
  summarise(events = length(df$ids[df$date >= date[1] - 30 &
                                   df$date <= date[1] &
                                   df$ids == ids[1]]))
Also, I ran your proposed function, but I did not see any difference in the result from your original group_by solution, so I don't think that is what you want.
If a non-dplyr solution is acceptable, this gives you what you want.
# counts, for each record, the other records within the same id that are
# at least 30 days away (assumes the rows of df are ordered by ids)
df$diff <- unlist(
  sapply(unique(df$ids), function(x)
    sapply(df$date[df$ids == x], function(y)
      sum(abs(y - df$date[df$ids == x]) >= 30)
    )
  )
)
Alternatively, in dplyr, you can get a result like the above using:
f <- function(x) {
  sapply(x, function(y) sum(abs(y - x) >= 30))
}

df$diff <- unlist(
  df %>%
    group_by(ids) %>%
    do(diff = f(.$date)) %>%
    .$diff
)
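As an aside, do() is marked superseded in current dplyr; a sketch of the same computation with mutate (my rewording, reusing f from above):
df <- df %>%
  group_by(ids) %>%
  mutate(diff = f(date)) %>%
  ungroup()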
Here's an answer. But it assumes there's a continuous sequence of dates in each id.
df %>%
  group_by(ids, date) %>%
  count() %>%
  arrange(ids, date) %>%
  group_by(ids) %>%
  mutate(
    events = cumsum(n) - cumsum(lag(n, 30, 0))
  )
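To spell out that assumption: cumsum(n) - cumsum(lag(n, 30, 0)) is a rolling sum over the previous 30 rows of per-day counts, which equals a 30-day window only when every id has a row for every date; if some dates are missing, filling them in first (for example with tidyr::complete()) restores that guarantee.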

How to filter by dates and grouping months together in R using dplyr

I have a dataframe (let's call it df1) that looks something like this...
Date Price
2014-08-06 22
2014-08-06 89
2014-09-15 56
2014-06-04 41
2015-01-19 11
2015-05-23 5
2014-07-21 108
There are other variables in the dataframe but we will ignore them for now, as I do not require them.
I have previously ordered it using
df2 <- df1[order(as.Date(df1$Date, format = "%Y-%m-%d")), ]
And then created a dataframe containing the values for just one month, for example, just September 2015 dates...
september2015 <- df2[df2$Date >= "2015-09-01" & df2$Date <= "2015-09-30",]
I have done this for all the months in 2015 and 2014.
Then I need to create an average of prices within each given month. I have done this by...
mean(september2015$Price, na.rm = TRUE)
Obviously, this is very long and tedious and involves many lines of code. I am trying to make my code more efficient using the dplyr package.
So far I have...
datesandprices <- select(df2, Date, Price)
datesandprices <- arrange(datesandprices, Date)
summarise(datesandprices, avg = mean(Price, na.rm = TRUE))
Or in a simpler form...
df1 %>%
select(Date, Price) %>%
arrange(Date) %>%
filter(Date >= 2014-08-06 & Date =< 2014-08-30)
summarise(mean(Price, na.rm = TRUE))
The filter line is not working for me and I can't figure out how to filter by dates using this method. I would like to get the mean for each month without having to calculate it one by one - and ideally extract the monthly means into a new dataframe or column that looks like...
Month Average
Jan 2014 x
Feb 2014 y
...
Nov 2015 z
Dec 2015 a
I hope this makes sense. I can't find anything on Stack Overflow that does something similar with dates (unless I am searching for the wrong functions). Many thanks!
I made a separate column in your data set that contains only year and month. Then, I did a group_by on that column to get the means for each month.
Date <- c("2014-08-06", "2014-08-06", "2014-09-15", "2014-06-04", "2015-01-19", "2015-05-23", "2014-07-21")
Price <- c(22, 89, 56, 41, 11, 5, 108)
Date <- as.Date(Date, format = "%Y-%m-%d")
df <- data.frame(Date, Price)
df$Month_Year <- substr(df$Date, 1, 7)

library(dplyr)
df %>%
  # select(Date, Price) %>%
  group_by(Month_Year) %>%
  summarise(mean(Price, na.rm = TRUE))
For the sake of completeness, here is also a data.table solution:
library(data.table)
# in case Date is of type character
setDT(df1)[, .(Average = mean(Price, na.rm = TRUE)), keyby = .(Yr.Mon = substr(Date, 1,7))]
# in case Date is of class Date or POSIXct
setDT(df2)[, .(Average = mean(Price, na.rm = TRUE)), keyby = .(Yr.Mon = format(Date, "%Y-%m"))]
Yr.Mon Average
1: 2014-06 41.0
2: 2014-07 108.0
3: 2014-08 55.5
4: 2014-09 56.0
5: 2015-01 11.0
6: 2015-05 5.0
Note that the grouping variable Yr.Mon is created "on-the-fly" in the keyby clause.
Data
library(data.table)
df1 <- fread(
"Date Price
2014-08-06 22
2014-08-06 89
2014-09-15 56
2014-06-04 41
2015-01-19 11
2015-05-23 5
2014-07-21 108")
df2 <- df1[, Date := as.Date(Date)]
I managed to do it using all dplyr functions, with help from @user108636:
df %>%
  select(Date, Price) %>%
  arrange(Date) %>%
  mutate(Month_Year = substr(Date, 1, 7)) %>%
  group_by(Month_Year) %>%
  summarise(mean(Price, na.rm = TRUE))
The select function selects the Date and Price columns.
The arrange function arranges my dataframe according to the date, with the earliest date first. The mutate function adds another column, Month_Year, which excludes the day and leaves us with, for example...
Month_Year
2015-10
2015-10
2015-11
2015-12
2015-12
The group_by function groups all the months together, and the summarise function calculates the mean price of each month.
This should give the mean of your price data by month-year.
library(zoo)
library(xts)  # apply.monthly() comes from xts

# Pull out columns as vectors
Price <- df1$Price
Date  <- df1$Date
# Put in a zoo object
zooPrice <- zoo(Price, order.by = as.Date(Date))
# Monthly mean with year (vector)
monthly.avg <- apply.monthly(zooPrice, mean)

# function to change back to a data frame
zooToDf <- function(z) {
  df <- as.data.frame(z)
  df$Date <- time(z)  # create a Date column
  rownames(df) <- NULL  # so row names are not filled with dates
  df <- df[, c(ncol(df), 1:(ncol(df) - 1))]  # reorder columns so Date is first
  return(df)
}
# Apply function to create new df with the data!
MonthYearAvg <- zooToDf(monthly.avg)
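As an aside, the conversion helper can be avoided: zoo can produce the data frame directly, e.g. (a sketch using the same monthly.avg object as above):
MonthYearAvg <- data.frame(Date = zoo::index(monthly.avg),
                           Price = zoo::coredata(monthly.avg))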
Convert your column to a Date object and use format
df <- data.frame(
  Date  = c("2014-08-06", "2014-08-06", "2014-09-15", "2014-06-04", "2015-01-19", "2015-05-23", "2014-07-21"),
  Price = c(22, 89, 56, 41, 11, 5, 108))

library(dplyr)
df %>%
  group_by(Month_Year = as.Date(Date) %>% format("%b %Y")) %>%
  summarise(avg = mean(Price, na.rm = TRUE))
# A tibble: 6 x 2
Month_Year avg
<chr> <dbl>
1 août 2014 55.5
2 janv. 2015 11
3 juil. 2014 108
4 juin 2014 41
5 mai 2015 5
6 sept. 2014 56
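Note that the %b month abbreviations follow the session locale (French in the output above), and the result sorts alphabetically rather than chronologically; running Sys.setlocale("LC_TIME", "C") first gives English abbreviations.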

Cumulative total by group

For the following dataset:
d <- data.frame(date = as.Date(as.Date('2015-01-01'):as.Date('2015-04-10'), origin = "1970-01-01"),
                group = rep(c('A', 'B', 'C', 'D'), 25),
                value = sample(1:100))
head(d)
date group value
1: 2015-01-01 A 4
2: 2015-01-02 B 32
3: 2015-01-03 C 46
4: 2015-01-04 D 40
5: 2015-01-05 A 93
6: 2015-01-06 B 10
Can anyone advise a more elegant way to calculate a cumulative total of values by group than this data.table method?
library(data.table)
setDT(d)
d.cast <- dcast.data.table(d, group ~ date, value.var = 'value', fun.aggregate = sum)
c.sum  <- d.cast[, as.list(cumsum(unlist(.SD))), by = group]
... which is pretty clunky and yields a wide table that needs tidyr::gather or reshape2::melt to reformat.
Surely R can do better than this??
If you just want cumulative sums per group, then with base R you can do
transform(d, new = ave(value, group, FUN = cumsum))
ave() applies cumsum within each level of group and returns the result aligned with the original row order.
This should work
library(dplyr)
d %>%
  group_by(group) %>%
  arrange(date) %>%
  mutate(Total = cumsum(value))
As this question was tagged with data.table, you are probably looking for (a modification of @Frank's comment):
setDT(d)[order(date), new := cumsum(value), by = group]
This will simultaneously rearrange the data by date (if that is not needed, you can drop order(date)) and update your data set in place using the := operator.
Is this it?
sp <- split(d, d$group)
res <- lapply(seq_along(sp), function(i) cumsum(sp[[i]]$value))
res <- lapply(seq_along(res), function(i){
sp[[i]]$c.sum <- res[[i]]
sp[[i]]
})
res <- do.call(rbind, res)
res
