I know there must be a one-line data.table solution for this, probably with dcast, but I can't figure it out.
I have data like this:
library(data.table)
# First batch of observations: one (code, date) pair per id.
data1 <- data.table(
  id = seq_len(5),
  code = c("A", "A", "B", "A", "B"),
  date = as.Date(c(
    "2021-08-11", "2021-01-05", "2021-02-18", "2021-02-13", "2021-12-13"
  ))
)
# Second batch of observations for the same ids.
data2 <- data.table(
  id = seq_len(5),
  code = c("B", "B", "A", "B", "A"),
  date = as.Date(c(
    "2021-08-13", "2021-01-05", "2021-02-19", "2021-02-14", "2021-12-13"
  ))
)
# Stack both batches into a single long table (two rows per id).
data3 <- rbindlist(list(data1, data2), use.names = TRUE)
I simply wish to reshape to a wide format like this
# Target wide layout: one row per id, with the code/date pair from each
# source table placed side by side.
# NOTE(review): the `data1`/`data2` columns hold date strings, so they were
# presumably meant to be named `date1`/`date2` -- confirm. They are character
# vectors here, not Date objects.
data_want <- data.table(
id = seq(1:5),
code1 = c("A", "A","B","A","B"),
data1 = c("2021-08-11", "2021-01-05","2021-02-18","2021-02-13","2021-12-13"),
code2 = c("B", "B","A","B","A"),
data2 = c("2021-08-13", "2021-01-05","2021-02-19","2021-02-14","2021-12-13")
)
How to do it with dcast?
You could also make use of rowid as follows
# rowid(id) numbers the rows within each id (1, 2, ...); that number becomes
# the suffix of the widened code_* / date_* columns.
dcast(data3, id ~ rowid(id), value.var = c("code", "date"))
# id code_1 code_2 date_1 date_2
#1: 1 A B 2021-08-11 2021-08-13
#2: 2 A B 2021-01-05 2021-01-05
#3: 3 B A 2021-02-18 2021-02-19
#4: 4 A B 2021-02-13 2021-02-14
#5: 5 B A 2021-12-13 2021-12-13
# load package
library(data.table)
# number the rows within each id (1, 2, ...); used as the wide-column suffix
data3[, batch := seq_len(.N), by = id]
# spread code and date into one column per batch, one row per id
data4 <- dcast(data3, id ~ batch, value.var = c("code", "date"))
data4
id code_1 code_2 date_1 date_2
1: 1 A B 2021-08-11 2021-08-13
2: 2 A B 2021-01-05 2021-01-05
3: 3 B A 2021-02-18 2021-02-19
4: 4 A B 2021-02-13 2021-02-14
5: 5 B A 2021-12-13 2021-12-13
Related
I'm trying to cross join a data.table by three variables (group, id, and date). The R code below accomplishes exactly what I want to do, i.e., each id within each group is expanded to include all of the dates_wanted. But is there a way to do the same thing more efficiently using the excellent data.table package?
library(data.table)
# Toy data: two groups, two ids per group, five consecutive days per id.
data <- data.table(
  group = rep(c("A", "B"), each = 10),
  id = rep(c("frank", "tony", "arthur", "edward"), each = 5),
  date = seq(as.IDate("2020-01-01"), as.IDate("2020-01-20"), by = "day")
)
data

# Every day of January 2020.
dates_wanted <- seq(as.IDate("2020-01-01"), as.IDate("2020-01-31"), by = "day")

# Per group: every unique id paired with every wanted date.
names_A <- CJ(
  group = "A",
  id = data[group == "A", id],
  date = dates_wanted,
  unique = TRUE
)
names_B <- CJ(
  group = "B",
  id = data[group == "B", id],
  date = dates_wanted,
  unique = TRUE
)

# Full grid for both groups.
alldates <- rbind(names_A, names_B)
alldates

# Join the original rows onto the full group/id/date grid.
data[alldates, on = .(group, id, date)]
You can also do this:
# For every (group, id) pair present in data, return one row per entry of
# dates_wanted.
data[, .(date=dates_wanted), .(group,id)]
Output:
group id date
1: A frank 2020-01-01
2: A frank 2020-01-02
3: A frank 2020-01-03
4: A frank 2020-01-04
5: A frank 2020-01-05
---
120: B edward 2020-01-27
121: B edward 2020-01-28
122: B edward 2020-01-29
123: B edward 2020-01-30
124: B edward 2020-01-31
We can use do.call with CJ on the id and date transformed grouped by group:
# Within each group, cross join that group's unique ids with every wanted date.
out <- data[, CJ(id = id, date = dates_wanted, unique = TRUE), by = group]
... checking:
> dim(out)
[1] 124 3
> out0 <- data[alldates, on = .(group, id, date)]
> dim(out0)
[1] 124 3
> all.equal(out, out0)
[1] TRUE
Here are some data:
library(data.table)
library(lubridate)
# Two groups ("a" and "b"), each with the same three month-start dates and
# the values 1, 2, 3.
foo <- data.table(
  date = rep(
    seq.Date(
      from = as_date("2020-01-01"),
      to = as_date("2020-03-01"),
      by = "1 month"
    ),
    times = 2
  ),
  a = rep(1:3, times = 2),
  group = rep(c("a", "b"), each = 3)
)
> foo
date a group
1: 2020-01-01 1 a
2: 2020-02-01 2 a
3: 2020-03-01 3 a
4: 2020-01-01 1 b
5: 2020-02-01 2 b
6: 2020-03-01 3 b
The desired output is the following:
date a group diff
1: 2020-01-01 1 a 1
2: 2020-02-01 2 a 1
3: 2020-03-01 3 a 1
4: 2020-04-01 0 a -3
5: 2020-01-01 1 b 1
6: 2020-02-01 2 b 1
7: 2020-03-01 3 b 1
8: 2020-04-01 0 b -3
And here follows my own solution.
# Pick, for each group, the row with the latest date (.I yields its row index).
bar <- foo[foo[, .I[which.max(date)], by = group]$V1]
# Drop rows whose a is already 0, then turn the rest into "next month" rows
# with a = 0.
bar <- bar[a != 0][, c('date', 'a') := .(date %m+% months(1), 0)]
# Append the synthetic rows to the original data.
foo <- rbindlist(list(foo, bar))
# Per group: difference between a and the previous row's a (0 before the
# first row).
foo[, diff := a - shift(a, fill = 0), by = group]
# Show the result sorted by group and date.
foo[order(group, date)]
I wonder if a more compact solution exists in data.table, such as a fill option able to look at the past with a shift operation performed from the point of view of future non-existent data.
This is more compact but it's largely similar:
# Step 1: with rows sorted by date, rebuild each group with one extra row
# dated one month after the group's last date and a = 0.
# Step 2: compute diff as a minus the previous row's a (0 before the very
# first row), then print via the trailing [].
# NOTE: the second step has no `by = group`. That is still correct here because
# every group's block ends with the appended a = 0 row, so the value preceding
# any group's first row is 0 -- the same as the fill value.
foo[
order(date),
.(date = c(date, date[.N] %m+% months(1)), a = c(a, 0)),
by = group
][ , diff := a - shift(a, fill=0)][]
I guess we could also do things in one query:
# Single grouped query: per group (rows sorted by date) append a trailing row
# one month after the last date with a = 0, and build diff in the same pass.
foo[
  order(date),
  {
    next_month <- date[.N] %m+% months(1)
    # diff of the existing rows, then the drop from the last a down to 0
    step <- c(a - shift(a, fill = 0), -a[.N])
    list(
      date = c(date, next_month),
      a = c(a, 0),
      diff = step
    )
  },
  by = group
]
another option that may be more palatable:
# Per group, with rows sorted by date: append one synthetic row, then compute
# diff on the extended block.
foo[
order(date),
{
# .SD is the current group's rows; add a row one month after the group's
# last date with a = 0
out <- rbind(
.SD,
data.table(
date = date[.N] %m+% months(1),
a = 0
)
)
# a minus the previous row's a within this block (0 before the first row)
out[ , diff := a - shift(a, fill=0)]
# the extended block is the group's result
out
},
by = group
]
I want to keep empty groups (with a default value like NA or 0) when grouping by multiple conditions.
# Toy log: user "B" has no entry for date "t2".
dt <- data.table(
  user = c("A", "A", "B"),
  date = c("t1", "t2", "t1"),
  duration = c(1, 2, 1)
)
# Total duration per (date, user); only combinations present in dt appear.
dt[, list(total = sum(duration)), by = list(date, user)]
Result:
date user total
1: t1 A 1
2: t2 A 2
3: t1 B 1
Desired result:
date user total
1: t1 A 1
2: t2 A 2
3: t1 B 1
4: t2 B NA
One solution could be to add rows with 0 values before grouping, but that would require creating the Cartesian product of many columns and manually checking whether a value already exists for each combination; I would prefer a built-in or simpler approach.
You can try:
# CJ builds every unique user x date combination; joining dt onto that grid
# leaves NA where dt has no matching row. The raw duration column is returned
# (no aggregation is done here).
dt[CJ(user = user, date = date, unique = TRUE), on = .(user, date)]
user date duration
1: A t1 1
2: A t2 2
3: B t1 1
4: B t2 NA
Here is an option with complete from tidyr
library(tidyr)
library(dplyr)
# Aggregate first, then let complete() add the missing user/date pairs
# (their total is NA).
dt1 <- dt[, .(total = sum(duration)), by = .(date, user)]
complete(dt1, user, date)
# user date total
# <chr> <chr> <dbl>
#1 A t1 1
#2 A t2 2
#3 B t1 1
#4 B t2 NA
Or using dcast/melt
# Widen to one column per date (summing duration), then melt back to long
# form so that every user/date pair appears. Combinations absent from dt come
# out as 0 (the sum of an empty set), not NA.
# Arguments are spelled out in full: `id.vars` rather than the partially
# matched `id.var`, and `fun.aggregate` named rather than positional.
melt(
  dcast(dt, user ~ date, value.var = "duration", fun.aggregate = sum),
  id.vars = "user",
  variable.name = "date",
  value.name = "total"
)
Data:
# Reproducible sample data: 30 daily observations (df1) and the start date of
# each 7-day window (df2).
set.seed(42)
df1 <- data.frame(
  Date = seq.Date(as.Date("2018-01-01"), as.Date("2018-01-30"), by = 1),
  value = sample(1:30),
  Y = sample(c("yes", "no"), size = 30, replace = TRUE)
)
df2 <- data.frame(
  Date = seq.Date(as.Date("2018-01-01"), as.Date("2018-01-30"), by = 7)
)
For sum if data falls within range this works (from my previous question):
library(data.table)
# Give both tables an interval: a single day in df1, a 7-day window in df2.
df1 <- transform(df1, start = Date, end = Date)
df2 <- transform(df2, start = Date, end = Date + 6)
# Convert to data.table in place and key both on the interval columns.
setDT(df1)
setkeyv(df1, c("start", "end"))
setDT(df2)
setkeyv(df2, c("start", "end"))
# Match each day to the window it falls in, then total value per window start.
d <- foverlaps(df1, df2)[, .(mySum = sum(value)), by = Date]
How can I do countif ?
because when I try
d = foverlaps(df1, df2)[, list(mySum = count(value)), by = Date ]
I get error
no applicable method for 'groups' applied to an object of class "c('double', 'numeric')"
We can use .N:
foverlaps(df1, df2)[, list(myCount = .N), by = Date ]
# Date myCount
# 1: 2018-01-01 7
# 2: 2018-01-08 7
# 3: 2018-01-15 7
# 4: 2018-01-22 7
# 5: 2018-01-29 2
d = foverlaps(df1, df2)[, .N, by = Date]
If you want to count the number of rows per Date, you can try .N
foverlaps(df1, df2)[, .(mysum = .N), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
If you want the count of unique values per Date you can try uniqueN()
foverlaps(df1, df2)[, .(mysum = uniqueN(value)), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
Both .N and uniqueN() are from {data.table}.
Instead of list(mySum = count(value)) try c(mySum = count(value)). The Code runs for me then.
d2 <- foverlaps(df1, df2)[, c(mySum = count(value)), by = Date ]
Data:
# Reproducible sample data: 30 daily observations (df1) and the start date of
# each 7-day window (df2).
set.seed(42)
df1 <- data.frame(
  Date = seq.Date(as.Date("2018-01-01"), as.Date("2018-01-30"), by = 1),
  value = sample(1:30),
  Y = sample(c("yes", "no"), size = 30, replace = TRUE)
)
df2 <- data.frame(
  Date = seq.Date(as.Date("2018-01-01"), as.Date("2018-01-30"), by = 7)
)
For each date in df2$Date, I want to calculate the sum of df1$value over the rows whose df1$Date falls between df2$Date and df2$Date + 6.
In short, I need to calculate weekly sums.
Using data.table, create a range start/end, then merge on overlap, then get sum over group:
library(data.table)
# Give both tables an interval: a single day in df1, a 7-day window in df2.
df1 <- transform(df1, start = Date, end = Date)
df2 <- transform(df2, start = Date, end = Date + 6)
# Convert to data.table in place and key both on the interval columns.
setDT(df1)
setkeyv(df1, c("start", "end"))
setDT(df2)
setkeyv(df2, c("start", "end"))
# Match each day to the window it falls in, then total value per window start.
foverlaps(df1, df2)[, .(mySum = sum(value)), by = Date]
# Date mySum
# 1: 2018-01-01 138
# 2: 2018-01-08 96
# 3: 2018-01-15 83
# 4: 2018-01-22 109
# 5: 2018-01-29 39
Check out the lubridate and dplyr libraries; those two are quite common.
library(lubridate)
library(dplyr)
# Label each day with the last day (Sunday) of its Monday-to-Sunday week, then
# sum value per week. The sample data starts on Monday 2018-01-01, so these
# weeks coincide with the df2$Date .. df2$Date + 6 windows from the question.
# ceiling_date(Date, "week") is avoided: by default it starts weeks on Sunday
# and moves a Date that is already on a boundary to the next boundary, which
# produces Sunday-to-Saturday buckets instead.
df1$last_week_day <- floor_date(df1$Date, "week", week_start = 1) + 6
df1 %>%
  group_by(last_week_day) %>%
  summarize(week_value = sum(value))
We can use fuzzyjoin
library(dplyr)
library(fuzzyjoin)
# Close each window six days after its start date.
df2$EndDate <- df2$Date+6
# Keep every df1 row and attach the df2 window for which
# df1 Date >= df2 Date and df1 Date <= df2 EndDate.
fuzzy_left_join(
df1, df2,
by = c(
"Date" = "Date",
"Date" = "EndDate"
), match_fun = list(`>=`, `<=`)) %>%
# Date.y is the window start that came from df2; total value per window.
group_by(Date.y) %>% summarise(Sum=sum(value))
# A tibble: 5 x 2
Date.y Sum
<date> <int>
1 2018-01-01 138
2 2018-01-08 96
3 2018-01-15 83
4 2018-01-22 109
5 2018-01-29 39