df <- data.frame("Date"=seq(as.Date("2020/1/1"),by="day", length.out = 20),events=sample(0:100,20))
I am trying to sum over the closest dates:
df <- df %>% mutate(seven_sum=sum(events[Date <= Date & Date > Date-7]) )
I want to sum everything that has happened during the last 7 days. I can understand why this is not working, but not really how to solve it.
So basically, for each row, I would like to sum that date's events and all others within the previous 7 days. It is easy to fix if I use a fixed date range, but I would like it to change for each row...
Any advice on how to continue would be very helpful.
Using purrr::map_int:
library(dplyr)
library(purrr)
df %>% mutate(seven_sum = map_int(Date, ~ sum(events[Date <= .x & Date > (.x - 7)])))
# Date events seven_sum
#1 2020-01-01 66 66
#2 2020-01-02 94 160
#3 2020-01-03 49 209
#4 2020-01-04 39 248
#5 2020-01-05 84 332
#6 2020-01-06 29 361
#7 2020-01-07 36 397
#8 2020-01-08 20 351
#9 2020-01-09 40 297
#10 2020-01-10 25 273
#11 2020-01-11 3 237
#12 2020-01-12 97 250
#13 2020-01-13 22 243
#14 2020-01-14 63 270
#15 2020-01-15 58 308
#16 2020-01-16 91 359
#17 2020-01-17 26 360
#18 2020-01-18 47 404
#19 2020-01-19 35 342
#20 2020-01-20 38 358
And the same logic in base R:
sapply(df$Date, function(x) sum(df$events[df$Date <= x & df$Date > (x-7)]))
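Since this sample happens to have exactly one row per day, a cumulative-sum sketch (assuming consecutive daily dates, unlike the general solutions above) gives the same seven-day window:
cs <- cumsum(df$events)
# subtract the running total from 7 rows earlier (0 for the first 7 rows)
df$seven_sum <- cs - c(rep(0, 7), head(cs, -7))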
We can use data.table to do a non-equi join, which would be more efficient:
library(data.table)
v1 <- setDT(df)[df[, Date1 := Date - 7], sum(events),
                on = .(Date <= Date, Date > Date1), allow.cartesian = TRUE, by = .EACHI]$V1
df[, seven_sum := v1][]
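Note that the inner df[, Date1 := Date - 7] added the helper column Date1 to df by reference; it can be dropped afterwards:
df[, Date1 := NULL]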
For example, I have the following dataframe (df1):
Date Price
2020-01-01 500
2020-01-02 550
2020-01-03 480
2020-01-04 420
2020-01-07 450
2020-01-08 390
2020-01-09 430
2020-01-11 480
2020-01-12 490
2020-01-13 485
And I want to insert a Price2 column into the previous data frame, repeating each value on every row up to its date; note that these dates are not all in the df1 dataframe:
Date Price2
2020-01-05 50
2020-01-10 20
2020-01-13 90
It would be like this:
Date Price Price2
2020-01-01 500 50
2020-01-02 550 50
2020-01-03 480 50
2020-01-04 420 50
2020-01-07 450 20
2020-01-08 390 20
2020-01-09 430 20
2020-01-11 480 90
2020-01-12 490 90
2020-01-13 485 90
Can someone help me?
Here's a tidyverse approach with tidyr::fill:
library(dplyr)
library(tidyr)
full_join(df1, df2) %>%
  arrange(Date) %>%
  tidyr::fill(Price2, .direction = "up") %>%
  dplyr::filter(!is.na(Price))
Date Price Price2
1 2020-01-01 500 50
2 2020-01-02 550 50
3 2020-01-03 480 50
4 2020-01-04 420 50
5 2020-01-07 450 20
6 2020-01-08 390 20
7 2020-01-09 430 20
8 2020-01-11 480 90
9 2020-01-12 490 90
10 2020-01-13 485 90
This appears to work:
library(data.table)
# Create Data
#df1 with all dates
df1 <- data.frame(Date = seq(as.Date("2020-01-01"),as.Date("2020-01-13"),by = "days"),
Price = c(500,550,480,420,450,390,430,480,490,485,485,490,450))
# drop some dates so df1 has gaps:
df1 <- df1[!df1$Date %in% as.Date(c("2020-01-05","2020-01-08","2020-01-13")), ]
# Set up df2
df2 <- data.frame(Date = as.Date(c("2020-01-05","2020-01-08","2020-01-13")),
Price2 = c(50,20,90))
setDT(df1)
setDT(df2)
setkey(df2, Date)
df2[df1, roll='nearest']
Although my Price2 and Price columns are reversed from your expected output, and roll = 'nearest' matches the nearest df2 date rather than the next one, I think this otherwise lines up:
Date Price2 Price
1: 2020-01-01 50 500
2: 2020-01-02 50 550
3: 2020-01-03 50 480
4: 2020-01-04 50 420
5: 2020-01-06 50 390
6: 2020-01-07 50 430
7: 2020-01-09 20 490
8: 2020-01-10 20 485
9: 2020-01-11 20 485
10: 2020-01-12 90 490
Assuming the second dataframe with Price2 is called df2:
library(magrittr)
# for each df1 date, take the Price2 of the first df2 date on or after it
df1$Price2 <- df1$Date %>%
  sapply(function(x) df2$Price2[which(df2$Date >= x)[1]])
It should work; let me know if it doesn't.
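A compact base R alternative in the same spirit, assuming df2$Date is sorted ascending (a sketch):
# bin each df1 date to the first df2 date on or after it
df1$Price2 <- df2$Price2[findInterval(df1$Date, df2$Date + 1) + 1]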
dplyr + zoo:
library("dplyr")
library("zoo")
df <- df1 %>%
  full_join(df2, by = "Date") %>%
  arrange(Date) %>%
  mutate(Price2 = na.locf(Price2, fromLast = TRUE)) %>%
  filter(Date %in% df1$Date)
> df
Date Price Price2
1 2020-01-01 500 50
2 2020-01-02 550 50
3 2020-01-03 480 50
4 2020-01-04 420 50
5 2020-01-07 450 20
6 2020-01-08 390 20
7 2020-01-09 430 20
8 2020-01-11 480 90
9 2020-01-12 490 90
10 2020-01-13 485 90
Using cut. The breaks are the running counts of df1 rows falling on or before each df2 date, so each row of df1 is binned up to its next df2 date and labelled with the corresponding Price2.
res <-
transform(df1,
Price2=cut(1:nrow(df1),
c(0, rowSums(sapply(df1$Date, function(x)
df2$Date >= x))),
labels=df2$Price2))
res
# Date Price Price2
# 1 2020-01-01 500 50
# 2 2020-01-02 550 50
# 3 2020-01-03 480 50
# 4 2020-01-04 420 50
# 5 2020-01-07 450 20
# 6 2020-01-08 390 20
# 7 2020-01-09 430 20
# 8 2020-01-11 480 90
# 9 2020-01-12 490 90
# 10 2020-01-13 485 90
Data:
df1 <- read.table(text="Date Price
2020-01-01 500
2020-01-02 550
2020-01-03 480
2020-01-04 420
2020-01-07 450
2020-01-08 390
2020-01-09 430
2020-01-11 480
2020-01-12 490
2020-01-13 485", header=TRUE)
df1$Date <- as.Date(df1$Date)
df2 <- read.table(text="Date Price2
2020-01-05 50
2020-01-10 20
2020-01-13 90", header=TRUE)
df2$Date <- as.Date(df2$Date)
I have a dataframe df with 3 columns: id, first and last.
id <- c(27,27,134,134)
first <- c(14,20,9,16)
last <- c(17,24,13,20)
df <- data.frame(id, first, last)
df
Each row corresponds to a chunk of data from another dataframe that I want to keep.
first and last indicate the first and last frames of the relevant chunk.
I want to use this to subset the other dataframe dat, which is structured as below:
dat_id <- c(rep(27, 30), rep(134, 30))
dat_frame <- c(1:30, 1:30)
dat_data <- sample(1:60)
dat <- data.frame(dat_id, dat_frame, dat_data)
dat
The only way I know to extract the relevant portion is with a for loop as below (this produces the expected output), but I expect this is a horribly inefficient way to do it. What's a better way?
# header row
new_df <- data.frame(id = numeric(), frame = numeric(), data = numeric())
# populate
for (i in seq_len(nrow(df))) {
  new_df <- rbind(new_df, subset(dat, dat_id == df[i, "id"])[df[i, "first"]:df[i, "last"], ])
}
new_df
This can be done with a complex join in SQL. It avoids creating a large intermediate data frame (from joining only on id) that then has to be cut down.
library(sqldf)
sqldf("
select dat.*
from dat
join df on dat.dat_id = df.id and
dat.dat_frame between df.first and df.last
")
Update
The example in the question changed and the solution has been simplified assuming the new example.
Using dplyr, we can do a left_join on dat and df and keep only those rows which lie between first and last for their respective id.
library(dplyr)
left_join(dat, df, by = c("dat_id" = "id")) %>%
filter(between(dat_frame, first, last)) %>%
select(-first, -last)
Or using the same logic in base R
subset(merge(dat, df, by.x = "dat_id", by.y = "id", all.x = TRUE),
dat_frame >= first & dat_frame <= last)
We can use a non-equi join for this. It would be faster and more efficient:
library(data.table)
setDT(dat)[, newcol := dat_frame][df, on = .(dat_id = id,
    newcol >= first, newcol <= last)][, .(dat_id, dat_frame, dat_data)]
# dat_id dat_frame dat_data
# 1: 27 14 26
# 2: 27 15 56
# 3: 27 16 30
# 4: 27 17 49
# 5: 27 20 23
# 6: 27 21 37
# 7: 27 22 7
# 8: 27 23 40
# 9: 27 24 12
#10: 134 9 57
#11: 134 10 35
#12: 134 11 31
#13: 134 12 53
#14: 134 13 38
#15: 134 16 15
#16: 134 17 14
#17: 134 18 33
#18: 134 19 54
#19: 134 20 43
Or another option is fuzzyjoin
library(fuzzyjoin)
library(dplyr)
dat %>%
mutate(newcol = dat_frame) %>%
fuzzy_left_join(df, by = c("dat_id" = 'id', 'newcol' = 'first',
'newcol' = 'last'), match_fun = list(`==`, `>=`, `<=`)) %>%
na.omit %>%
select(dat_id, dat_frame, dat_data)
# dat_id dat_frame dat_data
#14 27 14 26
#15 27 15 56
#16 27 16 30
#17 27 17 49
#20 27 20 23
#21 27 21 37
#22 27 22 7
#23 27 23 40
#24 27 24 12
#39 134 9 57
#40 134 10 35
#41 134 11 31
#42 134 12 53
#43 134 13 38
#46 134 16 15
#47 134 17 14
#48 134 18 33
#49 134 19 54
#50 134 20 43
Or using base R
out <- do.call(rbind, Map(function(x, y) do.call(rbind,
Map(function(u, v) subset(x, dat_frame >= u & dat_frame <= v),
y$first, y$last)), split(dat, dat$dat_id), split(df, df$id)))
row.names(out) <- NULL
out
# dat_id dat_frame dat_data
#1 27 14 26
#2 27 15 56
#3 27 16 30
#4 27 17 49
#5 27 20 23
#6 27 21 37
#7 27 22 7
#8 27 23 40
#9 27 24 12
#10 134 9 57
#11 134 10 35
#12 134 11 31
#13 134 12 53
#14 134 13 38
#15 134 16 15
#16 134 17 14
#17 134 18 33
#18 134 19 54
#19 134 20 43
NOTE: All of the above solutions work.
Also, note that the accepted solution in the other post gives an error:
left_join(dat, df, by = c("dat_id" = "id")) %>%
filter(between(dat_frame, first, last)) %>%
select(-first, -last)
#Error: Expecting a single value: [extent=120].
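If that error comes from between() requiring scalar bounds (as older dplyr versions did), spelling the comparisons out avoids it:
left_join(dat, df, by = c("dat_id" = "id")) %>%
  filter(dat_frame >= first, dat_frame <= last) %>%
  select(-first, -last)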
I’m a newbie in R.
I have two dataset A and B.
A <- data.table::fread(
"
V1 DATE ID
1 7/16/11 a
2 2/18/09 b
3 3/25/08 c
")
B <- data.table::fread(
"
V1 DATE ID Value
1 2013-06-13 a 109
2 2017-08-22 a 86
3 2017-09-15 a 88
4 2008-11-05 a 78
5 2009-02-17 a 74
6 2009-03-09 a 84
7 2009-03-17 a 81
8 2009-04-14 a 57
9 2009-04-21 a 65
10 2009-05-12 a 54
11 2009-06-08 a 54
12 2009-08-27 a 68
13 2011-08-26 b 199
14 2011-12-07 b 174
15 2012-01-31 b 66
16 2012-02-15 b 58
17 2012-04-17 b 59
18 2012-12-21 b 78
19 2013-01-14 b 91
20 2014-03-12 b 74
21 2014-08-28 b 98
22 2014-10-18 b 112
23 2010-12-15 b 36
24 2011-08-26 b 199
25 2011-12-07 b 174
26 2012-01-31 b 66
27 2012-02-15 b 58
28 2012-04-17 b 59
29 2015-05-08 c 105
30 2006-03-27 c 69
31 2007-03-12 c 104
32 2007-11-09 c 63
33 2008-03-25 c 239
34 2008-04-04 c 446
35 2008-04-09 c 354
36 2008-04-10 c 365
37 2008-04-11 c 366
38 2008-04-18 c 273
39 2008-04-28 c 271
40 2008-05-06 c 262
41 2008-05-19 c 72
42 2008-05-24 c 86
43 2008-06-20 c 47
44 2008-07-10 c 46
45 2008-08-06 c 55
46 2008-09-01 c 58
47 2008-09-29 c 56
48 2008-10-30 c 53
49 2008-12-09 c 71
50 2008-12-18 c 63
51 2009-01-14 c 60
52 2009-02-21 c 58
53 2009-03-28 c 54
54 2009-04-29 c 56
55 2009-04-30 c 59
56 2009-06-23 c 64
57 2009-07-24 c 69
58 2009-08-17 c 73
59 2009-10-04 c 127
60 2009-11-26 c 289
61 2009-12-02 c 277
62 2009-12-08 c 230
")
I have tried for weeks to use R to:
find the value in B where ID == A$ID and B$DATE is the closest date before, or the same as, A$DATE;
The expected result is: ID=c, DATE=2008-03-25, Value=239
find the value in B where ID == A$ID and B$DATE is 14 days after A$DATE. If there is no date exactly 14 days after, find the value for the closest later date (like 15, 16 or 17 days after A$DATE).
The expected result is: ID=c, DATE=2008-04-09, Value=354
Both questions can be answered using a rolling join from data.table.
However, there are two important steps in preparing the data.
The date strings need to be converted to class IDate (or Date) to allow for date arithmetic. (IDate uses an integer representation to save memory).
The dataframes need to be coerced to data.table to enable the enhanced syntax. setDT() coerces a dataframe or tibble to data.table by reference, i.e., without copying.
BTW: The sample datasets provided by the OP were already data.tables as the OP had used the data.table::fread() function.
Data preparation:
library(data.table)
setDT(A)[, DATE := as.IDate(DATE, "%m/%d/%y")]
setDT(B)[, DATE := as.IDate(DATE)]
Now, we can apply the rolling join:
B[A, on = .(ID, DATE), roll = +Inf, .(ID, DATE, Value)]
ID DATE Value
1: a 2011-07-16 68
2: b 2009-02-18 NA
3: c 2008-03-25 239
The result can be verified by printing B in order with B[order(ID, DATE)]. The earliest date for ID == "b" in B is 2010-12-15, so there is no date in B on or before 2009-02-18.
Please note that the value in the DATE column is the reference date A$DATE, not the matching B$DATE.
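If the matching B$DATE is wanted instead, it can be selected explicitly with the x. prefix (the same device the second join below uses):
B[A, on = .(ID, DATE), roll = +Inf, .(ID, DATE = x.DATE, Value)]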
Edit after clarification of the expected result by the OP:
Also, the second question can be solved by a rolling join, but the code requires three modifications:
The reference dates A$DATE need to be shifted 14 days later.
We need a backward rolling join because the OP wants to find the closest date in B on or after the shifted reference date.
According to OP's expected result the result should contain the matching B$DATE.
With the additional requirements we get:
B[A[, .(ID, DATE = DATE + 14)], on = .(ID, DATE), roll = -Inf, .(ID, DATE = x.DATE, Value)]
ID DATE Value
1: a 2013-06-13 109
2: b 2010-12-15 36
3: c 2008-04-09 354
A solution using dplyr:
q1 and q2 correspond to your two questions.
library(dplyr)
A$DATE <- as.Date(A$DATE,format = "%m/%d/%y")
B$DATE <- as.Date(B$DATE)
BA <- left_join(B, A, by = c("ID" = "ID"))
q1 <- BA %>%
  filter(ID %in% A$ID) %>%
  filter(DATE.x <= DATE.y) %>%
  group_by(ID) %>%
  arrange(desc(DATE.x)) %>%
  slice(1)
q2 <- BA %>%
filter(ID %in% A$ID) %>%
group_by(ID) %>%
filter(as.numeric(DATE.x) - as.numeric(DATE.y) >= 14)
q1
#> # A tibble: 2 x 6
#> # Groups: ID [2]
#> V1.x DATE.x ID Value V1.y DATE.y
#> <int> <date> <chr> <int> <int> <date>
#> 1 12 2009-08-27 a 68 1 2011-07-16
#> 2 33 2008-03-25 c 239 3 2008-03-25
q2
#> # A tibble: 48 x 6
#> # Groups: ID [3]
#> V1.x DATE.x ID Value V1.y DATE.y
#> <int> <date> <chr> <int> <int> <date>
#> 1 1 2013-06-13 a 109 1 2011-07-16
#> 2 2 2017-08-22 a 86 1 2011-07-16
#> 3 3 2017-09-15 a 88 1 2011-07-16
#> 4 13 2011-08-26 b 199 2 2009-02-18
#> 5 14 2011-12-07 b 174 2 2009-02-18
#> 6 15 2012-01-31 b 66 2 2009-02-18
#> 7 16 2012-02-15 b 58 2 2009-02-18
#> 8 17 2012-04-17 b 59 2 2009-02-18
#> 9 18 2012-12-21 b 78 2 2009-02-18
#> 10 19 2013-01-14 b 91 2 2009-02-18
#> # ... with 38 more rows
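Note that q2 as written keeps every date at least 14 days after A$DATE. A sketch for reducing it to the single closest such date per ID (q2 is already grouped by ID, so slice(1) acts within each group):
q2 %>%
  arrange(DATE.x) %>%
  slice(1)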
I'm having trouble calculating the average time interval (in days) between appearances of the same value in another column.
My data looks like this:
dt subject_id
2016-09-13 77
2016-11-07 1791
2016-09-18 1332
2016-08-31 84
2016-08-23 89
2016-08-23 41
2016-09-15 41
2016-10-12 93
2016-10-05 93
2016-11-09 94
2016-10-25 94
2016-11-03 94
2016-10-09 375
2016-10-14 11
2016-09-27 11
2016-09-13 11
2016-08-23 11
2016-08-27 11
And I want to get something like this:
subject_id mean_day
41 23
93 7
94 7.5
11 13
I tried to use:
aggregate(dt~subject_id, data, mean)
But it can't calculate a mean from Date values. Any ideas?
My first approach would be something like this:
df$dt <- as.Date(df$dt)
library(dplyr)
df %>%
group_by(subject_id) %>%
summarise((max(dt) - min(dt))/(n()-1))
# <int> <time>
#1 11 13.0 days
#2 41 23.0 days
#3 77 NaN days
#4 84 NaN days
#5 89 NaN days
#6 93 7.0 days
#7 94 7.5 days
#8 375 NaN days
#9 1332 NaN days
#10 1791 NaN days
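Equivalently, since the gaps between consecutive dates sum to max(dt) - min(dt), averaging the successive differences gives the same result:
df %>%
  group_by(subject_id) %>%
  summarise(mean_day = mean(diff(sort(dt))))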
I think it's a starting point for you; you can modify it as you want.
I have the following sample data.frame:
Date <- seq(as.Date("2016/9/1"), as.Date("2016/9/10"), "days")
A <- sample(0:200, 10)
B <- sample(0:400, 10)
A_limit <- rep(200,10)
B_limit <- rep(400,10)
data_sample <- data.frame(Date,A,B,A_limit,B_limit)
> Date A B A_limit B_limit
1 2016-09-01 175 270 200 400
2 2016-09-02 160 50 200 400
3 2016-09-03 173 25 200 400
...
and I would like to reshape it into the form:
> Date limit variable value
1 2016-09-01 200 A 175
2 2016-09-02 200 A 160
3 2016-09-03 200 A 173
...
31 2016-09-01 400 B 270
32 2016-09-02 400 B 50
33 2016-09-03 400 B 25
....
I managed to get it done, but it seems to me my way is far too complicated:
library("reshape2")
data_sample_2 <- data_sample %>% melt(id=c("Date","A","B"))
levels(data_sample_2$variable) <- c(levels(data_sample_2$variable),"A","B")
data_sample_2$variable[data_sample_2$variable == "A_limit"] <- as.factor("A")
data_sample_2$variable[data_sample_2$variable == "B_limit"] <- as.factor("B")
names(data_sample_2)[names(data_sample_2) == "value"] <- "limit"
names(data_sample_2)[names(data_sample_2) == "variable"] <- "variable_1"
data_sample_3 <- data_sample_2 %>% melt(id=c("Date","variable_1","limit"))
data_sample_3 <- droplevels(data_sample_3)
data_sample_4 <- data_sample_3[data_sample_3$variable_1 == data_sample_3$variable,]
data_sample_4$variable_1 <- NULL
I just started using the reshape2 package, so please let me know about any way I can improve this data.frame transformation (no matter how obvious it may seem).
You can do this via base R simply by stacking everything, i.e.
df1 <- data.frame(Date = data_sample$Date, limit = stack(data_sample[-(1:3)])[[1]],
variable = stack(data_sample[2:3])[[2]],
value = stack(data_sample[2:3])[[1]],
stringsAsFactors = FALSE)
head(df1)
# Date limit variable value
#1 2016-09-01 200 A 67
#2 2016-09-02 200 A 100
#3 2016-09-03 200 A 166
#4 2016-09-04 200 A 116
#5 2016-09-05 200 A 89
#6 2016-09-06 200 A 138
tail(df1)
# Date limit variable value
#15 2016-09-05 400 B 208
#16 2016-09-06 400 B 387
#17 2016-09-07 400 B 125
#18 2016-09-08 400 B 116
#19 2016-09-09 400 B 120
#20 2016-09-10 400 B 241
Is this what you want?
data_sample_2 <- melt(data_sample, id.vars = c("Date", "A_limit", "B_limit"))
data_sample_2$limit <- ifelse(data_sample_2$variable == "A", data_sample_2$A_limit, data_sample_2$B_limit)
data_sample_2[, c("Date", "limit", "variable", "value")]
Since you used reshape2 in your example, it might interest you to see how to handle it in the (more updated) tidyverse setup.
I'll repeat your generation code:
Date <- seq(as.Date("2016/9/1"), as.Date("2016/9/10"), "days")
A <- sample(0:200, 10)
B <- sample(0:400, 10)
A_limit <- rep(200,10)
B_limit <- rep(400,10)
data_sample <- data.frame(Date,A,B,A_limit,B_limit)
# Preview
head(data_sample)
#> Date A B A_limit B_limit
#> 1 2016-09-01 39 53 200 400
#> 2 2016-09-02 96 193 200 400
#> 3 2016-09-03 143 75 200 400
#> 4 2016-09-04 60 241 200 400
#> 5 2016-09-05 126 225 200 400
#> 6 2016-09-06 184 349 200 400
Now we can use dplyr and tidyr (which take on many of the responsibilities of reshape2) to manipulate the data in a "clear" way.
library(dplyr)
library(tidyr)
data_clean <- data_sample %>%
gather(variable, value, A, B) %>%
mutate(limit = if_else(variable == "A", A_limit, B_limit)) %>%
select(Date, limit, variable, value)
# Inspect results
head(data_clean)
#> Date limit variable value
#> 1 2016-09-01 200 A 39
#> 2 2016-09-02 200 A 96
#> 3 2016-09-03 200 A 143
#> 4 2016-09-04 200 A 60
#> 5 2016-09-05 200 A 126
#> 6 2016-09-06 200 A 184
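As a side note, in tidyr 1.0.0 and later pivot_longer() supersedes gather(); an equivalent sketch (row order differs from gather(), hence the final arrange()):
data_clean2 <- data_sample %>%
  pivot_longer(c(A, B), names_to = "variable", values_to = "value") %>%
  mutate(limit = if_else(variable == "A", A_limit, B_limit)) %>%
  select(Date, limit, variable, value) %>%
  arrange(variable, Date)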