Computing rolling mean in data.table with adaptive window lengths

I am looking to compute a moving average by group in a data.table, with an adaptive window so that there are no NAs at the beginning of the time series. I know how to do this with frollmean by setting adaptive = TRUE (see for instance jangorecki's response in this thread). The code works when all groups in my data.table are the same length, but it errors when the groups are of different sizes.
For example, if my data is
tmp = data.table(Gp = c(rep('A',6),rep('B',4)), Val = c(1,3,4,6,2,2,8,5,7,10))
and I compute a moving average of window length 3, the desired output is
> desired_output
Gp Val
1: A 1.00
2: A 2.00
3: A 2.67
4: A 4.33
5: A 4.00
6: A 3.33
7: B 8.00
8: B 6.50
9: B 6.67
10: B 7.33
I tried the following:
mov_window_len = vector("list",2)
mov_window_len[[1]] = c(1,2,rep(3,4))
mov_window_len[[2]] = c(1,2,rep(3,2))
tmp[,lapply(.SD, frollmean, n = mov_window_len, align = "right", adaptive = TRUE), by = Gp]
but I get an error: "length of integer vector(s) provided as list to 'n' argument must be equal to number of observations provided in 'x'"
Any help in resolving this will be much appreciated. Thanks in advance.

You can use the group index .GRP to subset mov_window_len; this gives the right window lengths for each group. You only want to take frollmean of Val, so there is no need for lapply.
tmp[, frollmean(Val, n = mov_window_len[.GRP], align = "right", adaptive = TRUE), by = Gp]
# Gp V1
# 1: A 1.000000
# 2: A 2.000000
# 3: A 2.666667
# 4: A 4.333333
# 5: A 4.000000
# 6: A 3.333333
# 7: B 8.000000
# 8: B 6.500000
# 9: B 6.666667
# 10: B 7.333333
Alternatively, the window lengths can be stored in the input data.table (the Len field below), since there is one per row.
tmp[Gp=="A", Len:=mov_window_len[[1]]
][Gp=="B", Len:=mov_window_len[[2]]
][, .(Val, Len, RollVal=frollmean(Val, Len, adaptive=TRUE)), by=Gp]
# Gp Val Len RollVal
# 1: A 1 1 1.000000
# 2: A 3 2 2.000000
# 3: A 4 3 2.666667
# 4: A 6 3 4.333333
# 5: A 2 3 4.000000
# 6: A 2 3 3.333333
# 7: B 8 1 8.000000
# 8: B 5 2 6.500000
# 9: B 7 3 6.666667
#10: B 10 3 7.333333
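As an aside, if the window lengths always follow this grow-up-to-k pattern, mov_window_len doesn't need to be precomputed at all: the lengths can be derived inside each group from .N (a sketch, assuming a fixed target window of 3; the MA column name is arbitrary):
library(data.table)
# pmin(seq_len(.N), 3L) yields 1, 2, 3, 3, ... within each group
tmp[, MA := frollmean(Val, n = pmin(seq_len(.N), 3L), adaptive = TRUE), by = Gp]
This keeps the lengths in sync with the group sizes automatically.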

Related

Add new blank rows into dataset by group (in R)

I use R. I have dataframe like this:
dat <- data.frame(
group = c(1,1,1,1,1,1,2,2,2,2,2),
horizon = c(1,3,5,6,7,10,1,3,5,9,10),
value = c(1.0,0.9,0.8,0.6,0.3,0.0,0.5,0.6,0.8,0.9,0.8),
other = c(a,a,a,a,a,a,b,b,b,b,b)
)
And I would like to add a row for every horizon that is missing (2, 4, 8 and 9 for the first group and 2, 4, 6, 7 and 8 for the second group). Values (value) for the missing horizons would be blank.
I would like to get something like this:
datx <- data.frame(
group = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
horizon = c(1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10),
value = c(1.0,"na",0.9,"na",0.8,0.6,0.3,"na","na",0.0,0.5,"na",0.6,"na",0.8,"na","na","na",0.9,0.8)
other = c(a,a,a,a,a,a,a,a,a,a,b,b,b,b,b,b,b,b,b,b)
)
i.e. an enlarged dataset with the new horizons, blank or "na" entries in the value variable, and the other variable retained.
This is just an example. I am actually working with a much larger dataset.
Without the groups, the problem would be much easier to solve; I would use something like this:
newdat <- merge(data.frame(horizon=seq(1,10,1)),dat,all=TRUE)
newdat <- newdat[order(newdat$horizon),]
Thanks for help!
I'll assume that the values in the variable other are the characters a or b, and that this is completely redundant with your variable group. If this is the case, you could accomplish this with full_join in the dplyr package.
a="a"
b="b"
dat <- data.frame(
group = c(1,1,1,1,1,1,2,2,2,2,2),
horizon = c(1,3,5,6,7,10,1,3,5,9,10),
value = c(1.0,0.9,0.8,0.6,0.3,0.0,0.5,0.6,0.8,0.9,0.8),
other = c(a,a,a,a,a,a,b,b,b,b,b)
)
library(dplyr)  # needed for %>%
groups <- expand.grid(group=c(1,2),horizon=1:10)
groups <- groups %>% dplyr::mutate(other=ifelse(group==1,"a","b"))
dat %>%
dplyr::full_join(groups,by=c('group','horizon','other')) %>%
dplyr::arrange(group,horizon)
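If building the groups grid by hand feels brittle, tidyr's complete with nesting should produce the same expansion in one step (a sketch; it assumes other is fully determined by group and that horizons should span 1:10):
library(dplyr)
library(tidyr)
# nesting(group, other) keeps only the observed group/other pairs,
# then complete crosses them with the full horizon range
dat %>%
  complete(nesting(group, other), horizon = 1:10) %>%
  arrange(group, horizon)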
Using data.table:
library(data.table)
setDT(dat)
fill = c("other")
RES =
  dat[CJ(group = group, horizon = min(horizon):max(horizon), unique = TRUE),
      on = .(group, horizon)  # join onto the complete group x horizon grid; new rows get NA
      ][, (fill) := lapply(.SD, \(x) x[which.min(is.na(x))]),  # fill "other" with each group's first non-NA value
        by = group, .SDcols = fill]
RES[]
# group horizon value other
# <num> <int> <num> <char>
# 1: 1 1 1.0 a
# 2: 1 2 NA a
# 3: 1 3 0.9 a
# 4: 1 4 NA a
# 5: 1 5 0.8 a
# 6: 1 6 0.6 a
# 7: 1 7 0.3 a
# 8: 1 8 NA a
# 9: 1 9 NA a
# 10: 1 10 0.0 a
# 11: 2 1 0.5 b
# 12: 2 2 NA b
# 13: 2 3 0.6 b
# 14: 2 4 NA b
# 15: 2 5 0.8 b
# 16: 2 6 NA b
# 17: 2 7 NA b
# 18: 2 8 NA b
# 19: 2 9 0.9 b
# 20: 2 10 0.8 b
# group horizon value other

Assign bin value according to vector of thresholds

I have a vector of thresholds that I want to use for creating bins of a column on a data.table
library(data.table)

thrshlds <- seq(from = 0, to = 1, by = 0.05)
test <- data.table(
  A = rnorm(1000, 0.7, 1),
  B = rbinom(1000, 3, 0.6)
)
The logic that I'm looking to implement is:
If the value of column A is less than or equal to a threshold, assign it the smallest such threshold value. Similar to a SQL CASE WHEN, but without manually writing out each threshold.
Something like:
test[, new_category := fcase(A <= thrshlds[1], thrshlds[1],
A <= thrshlds[2], thrshlds[2],
.....)]
But I don't know how to do this kind of iteration inside a data.table query.
Thanks!
You can use cut:
library(data.table)
# intervals are (-Inf, 0], (0, 0.05], ..., (0.95, 1]; values above 1 become NA
test[, new_category := cut(A, breaks = c(-Inf, thrshlds), labels = thrshlds)]
test
# A B new_category
# 1: 0.220744413 3 0.25
# 2: -0.814886795 3 0
# 3: 1.134536656 2 <NA>
# 4: 0.180463333 1 0.2
# 5: -0.134559033 1 0
# ---
# 996: -0.332559649 1 0
# 997: 0.585641110 0 0.6
# 998: 0.765738832 2 0.8
# 999: 2.167632026 2 <NA>
#1000: 0.008935421 2 0.05
Not sure if this is an appropriate method or not, but here's a rolling join option that seems to work:
# roll = -Inf joins each value of A to the nearest threshold at or above it
test[, new_category := data.table(thrshlds)[test, on="thrshlds==A", x.thrshlds, roll=-Inf] ]
#test[sample(1000, 12)]
# A B new_category
# 1: -1.1317742 3 0.00
# 2: 0.2926608 2 0.30
# 3: 1.5441214 2 NA
# 4: 0.9249706 1 0.95
# 5: 1.2663975 2 NA
# 6: 0.6472989 0 0.65
# 7: -0.5606153 2 0.00
# 8: 0.4439064 2 0.45
# 9: 0.8182938 1 0.85
#10: 0.8461909 2 0.85
#11: 1.0237554 1 NA
#12: 0.7752323 1 0.80
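A vectorised base R alternative is findInterval (a sketch, assuming thrshlds is sorted increasingly): with left.open = TRUE the intervals are (lo, hi], so adding 1 to the returned index picks the smallest threshold at or above A, while values above the last threshold index past the end of thrshlds and become NA, as with cut:
# returns a numeric column rather than a factor
test[, new_category := thrshlds[findInterval(A, thrshlds, left.open = TRUE) + 1L]]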

R moving average function to deal with fewer values than the window size

Following this answer, I have used a moving average function with window sizes of 2, 3 and 4.
require(zoo)
#MOVING AVERAGE FUNCTION
get.mav <- function(df, n = 2){
  if(length(df) < n){
    return(df)
  }
  c(df[1:(n-1)], rollapply(df, width = n, mean, align = "right"))
}
#DATA FRAME (dummy)
ID <- c("d","b","a","a","c","e","b","d","b","b")
Value <- c(4,5,5,3,2,1,6,9,5,5)
df <-data.frame(ID,Value)
# FUNCTION IMPLEMENTATION
df <- with(df,df[order(ID),])
df$mav2 <- unlist(aggregate(Value~ID,df,get.mav,na.action = NULL,n=2)$Value)
df$mav3 <- unlist(aggregate(Value~ID,df,get.mav,na.action = NULL,n=3)$Value)
df$mav4 <- unlist(aggregate(Value~ID,df,get.mav,na.action = NULL,n=4)$Value)
#OUTPUT
ID Value mav2 mav3 mav4
a 5 5 5 5
a 3 4 3 3
b 5 5 5 5
b 6 5.5 6 6
b 5 5.5 5.3 5
b 5 5 5.3 5.25
c 2 2 2 2
d 4 4 4 4
d 9 6.5 9 9
e 1 1 1 1
The function get.mav works exactly the way it should. I want to change this function such that
For window size 3, if df has length 2, it should take the mean of those two elements rather than simply returning df.
Similarly, for window size 4, if the length is 3 or 2, it should take the mean of those three or two elements rather than simply returning df.
I tried if statements, but the comparisons are not working correctly. Any help would be appreciated.
Thanks.
For each width, use ave to invoke rollapplyr by ID. partial = TRUE in rollapplyr causes it to average over however many points are available near the beginning of each group.
library(zoo)
roll <- function(x, group, w) {
  ave(x, group, FUN = function(x) rollapplyr(x, w, mean, partial = TRUE))
}
transform(df[order(df$ID), ],
mav2 = roll(Value, ID, 2),
mav3 = roll(Value, ID, 3),
mav4 = roll(Value, ID, 4)
)
or alternatively:
w <- 2:4
names(w) <- paste0("mav", w)
with(df[order(df$ID), ],
data.frame(ID, Value, lapply(w, roll, x = Value, group = ID), check.names = FALSE)
)
Either gives:
ID Value mav2 mav3 mav4
1 a 5 5.0 5.000000 5.000000
2 a 3 4.0 4.000000 4.000000
3 b 5 5.0 5.000000 5.000000
4 b 6 5.5 5.500000 5.500000
5 b 5 5.5 5.333333 5.333333
6 b 5 5.0 5.333333 5.250000
7 c 2 2.0 2.000000 2.000000
8 d 4 4.0 4.000000 4.000000
9 d 9 6.5 6.500000 6.500000
10 e 1 1.0 1.000000 1.000000
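For completeness, data.table users can get the same partial-window averages from frollmean with adaptive = TRUE, along the lines of the adaptive-window question at the top of this page (a sketch, assuming df is the data frame from the question):
library(data.table)
setDT(df)
setorder(df, ID)
for (w in 2:4) {
  # within each ID the window grows 1, 2, ..., capped at w
  df[, (paste0("mav", w)) := frollmean(Value, n = pmin(seq_len(.N), w), adaptive = TRUE), by = ID]
}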

How do I apply a function to row subsets of a data.table where each call returns a data.table

Here's a data.table
dt <- data.table(group = c("a","a","a","b","b","b"), x = c(1,3,5,1,3,5), y= c(3,5,8,2,8,9))
dt
group x y
1: a 1 3
2: a 3 5
3: a 5 8
4: b 1 2
5: b 3 8
6: b 5 9
And here's a function that operates on a data.table and returns a data.table
myfunc <- function(dt){
  # Hyman spline interpolation (which preserves monotonicity)
  newdt <- data.table(x = seq(min(dt$x), max(dt$x)))
  newdt$y <- spline(x = dt$x, y = dt$y, xout = newdt$x, method = "hyman")$y
  return(newdt)
}
How do I apply myfunc to each subset of dt defined by the "group" column? In other words, I want an efficient, generalized way to do this
result <- rbind(myfunc(dt[group=="a"]), myfunc(dt[group=="b"]))
result
x y
1: 1 3.000
2: 2 3.875
3: 3 5.000
4: 4 6.375
5: 5 8.000
6: 1 2.000
7: 2 5.688
8: 3 8.000
9: 4 8.875
10: 5 9.000
EDIT: I've updated my sample dataset and myfunc because I think it was initially too simplistic and invited work-arounds to the actual problem I'm trying to solve.
The whole idea of data.table is to be both memory efficient and fast. Thus we never use $ within the data.table scope (only in very rare situations), and we don't create data.table objects within the data.table environment (currently, even .SD has an overhead).
In your case you can take advantage of data.table's non-standard evaluation capabilities and define your function as follows:
myfunc <- function(x, y){
  temp = seq(min(x), max(x))
  y = spline(x = x, y = y, xout = temp, method = "hyman")$y
  list(x = temp, y = y)
}
Then the implementation within the dt scope is straightforward:
dt[, myfunc(x, y), by = group]
# group x y
# 1: a 1 3.0000
# 2: a 2 3.8750
# 3: a 3 5.0000
# 4: a 4 6.3750
# 5: a 5 8.0000
# 6: b 1 2.0000
# 7: b 2 5.6875
# 8: b 3 8.0000
# 9: b 4 8.8750
# 10: b 5 9.0000
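That said, if you prefer to keep the original data.table-in, data.table-out myfunc, it can still be applied per group via .SD, at the overhead cost mentioned above (a sketch):
# .SD holds the current group's rows (columns x and y),
# so the question's original myfunc works on it unchanged
dt[, myfunc(.SD), by = group]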

R: Data.table on subset excluding by value

Using data.table in R, I'm trying to perform an operation on each subset while excluding the selected element. I'm using the by operator, but I don't know if this is the right approach.
Here's an example: the value for Delta in IAH:SNA is (3+3)/2, which is the mean of Stops in IAH:SNA once Delta has been excluded.
library(data.table)
s1 <- "Market Carrier Stops
IAH:SNA Delta 1
IAH:SNA Delta 1
IAH:SNA Southwest 3
IAH:SNA Southwest 3
MSP:CLE Southwest 2
MSP:CLE Southwest 2
MSP:CLE American 2
MSP:CLE JetBlue 1"
d <- data.table(read.table(textConnection(s1), header=TRUE))
setkey(d, Carrier, Market)
f <- function(x, y){
  subset(d, !(Carrier %in% x) & Market == y, Stops)[, mean(Stops)]
}
d[, s := f(.BY[[1]], .BY[[2]]), by=list(Carrier, Market)]
## Market Carrier Stops s
## 1: MSP:CLE American 2 1.666667
## 2: IAH:SNA Delta 1 3.000000
## 3: IAH:SNA Delta 1 3.000000
## 4: MSP:CLE JetBlue 1 2.000000
## 5: IAH:SNA Southwest 3 1.000000
## 6: IAH:SNA Southwest 3 1.000000
## 7: MSP:CLE Southwest 2 1.500000
## 8: MSP:CLE Southwest 2 1.500000
The above solution performs very poorly on large data sets (it's essentially a mapply), but I'm not sure how to do it in a fast, data.table-like way.
Perhaps one could (dynamically) generate a factor that does this? I'm just not sure how...
Is there a way to improve it?
Edit: Just for the heck of it, here's a way to get a bigger version of the above
library(data.table)
dl.dta <- function(...){
  ## input years ..
  years <- gsub("\\.", "_", c(...))
  baseurl <- "http://www.transtats.bts.gov/Download/"
  names <- paste("Origin_and_Destination_Survey_DB1BMarket", years, sep="_")
  info <- t(sapply(names, function(x) file.exists(paste(x, c("zip", "csv"), sep="."))))
  to.download <- paste(baseurl, names, ".zip", sep="")[!apply(info, 1, any)]
  if (length(to.download) > 0){
    message("starting download...")
    sapply(to.download,
           function(x) download.file(x, rev(strsplit(x, "/")[[1]])[1]))}
  to.unzip <- paste(names, "zip", sep=".")[!info[, 2]]
  if (length(to.unzip) > 0){
    message("starting to unzip...")
    sapply(to.unzip, unzip)}
  paste(names, "csv", sep=".")}
countWords.split <- function(x, s=":"){
  ## Faster on my machine than grep for some reason
  sapply(strsplit(as.character(x), s), length)}
countWords.grep <- function(x){
  sapply(gregexpr("\\W+", x), length)+1}
fname <- dl.dta(2013.1)
cols <- rep("NULL", 41)
## Columns to keep: 9 is Origin, 18 is Dest, 24 is groups of airports in travel
## 30 is RPcarrier (reporting carrier).
## For more columns: 35 is market fare and 36 is distance.
cols[9] <- cols[18] <- cols[24] <- cols[30] <- NA
d <- data.table(read.csv(file=fname, colClasses=cols))
d[, Market := paste(Origin, Dest, sep=":")]
## should probably
d[, Stops := -2 + countWords.split(AirportGroup)]
d[, Carrier := RPCarrier]
d[, c("RPCarrier", "Origin", "Dest", "AirportGroup") := NULL]
Use a tiny bit of elementary maths:
d[, c("tmp.mean", "N") := list(mean(Stops), .N), by = Market]
d[, exep.mean := (tmp.mean * N - sum(Stops)) / (N - .N), by = list(Market,Carrier)]
# Market Carrier Stops tmp.mean N exep.mean
# 1: IAH:SNA Delta 1 2.00 4 3.000000
# 2: IAH:SNA Delta 1 2.00 4 3.000000
# 3: IAH:SNA Southwest 3 2.00 4 1.000000
# 4: IAH:SNA Southwest 3 2.00 4 1.000000
# 5: MSP:CLE Southwest 2 1.75 4 1.500000
# 6: MSP:CLE Southwest 2 1.75 4 1.500000
# 7: MSP:CLE American 2 1.75 4 1.666667
# 8: MSP:CLE JetBlue 1 1.75 4 2.000000
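The identity being used: the mean of a market's Stops excluding carrier g equals (market sum minus g's sum) divided by (market count minus g's count), which is exactly (tmp.mean * N - sum(Stops)) / (N - .N). A quick base R sanity check for the IAH:SNA Delta case:
stops <- c(1, 1, 3, 3)                  # Stops in IAH:SNA
is_delta <- c(TRUE, TRUE, FALSE, FALSE) # Delta's rows
(sum(stops) - sum(stops[is_delta])) / (length(stops) - sum(is_delta))
# [1] 3  (matches exep.mean for Delta above)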
@Roland's answer will work for some functions (and when it does it will be best), but not in general. Unfortunately you can't apply the split-apply-combine strategy to the data as-is to do the task, but you can if you make the data larger. Let's start with a simpler example:
dt = data.table(a = c(1,1,2,2,3,3), b = c(1:6), key = 'a')
# now let's extend this table the following way
# take the unique a's and construct all the combinations excluding one element
combinations = dt[, combn(unique(a), 2)]
# now combine this into a data.table with the excluded element as the index
# and merge it back into the original data.table
extension = rbindlist(apply(combinations, 2,
function(x) data.table(a = x, index = setdiff(c(1,2,3), x))))
setkey(extension, a)
dt.extended = extension[dt, allow.cartesian = TRUE]
dt.extended[order(index)]
# a index b
# 1: 2 1 3
# 2: 2 1 4
# 3: 3 1 5
# 4: 3 1 6
# 5: 1 2 1
# 6: 1 2 2
# 7: 3 2 5
# 8: 3 2 6
# 9: 1 3 1
#10: 1 3 2
#11: 2 3 3
#12: 2 3 4
# Now we have everything we need:
dt.extended[, mean(b), by = list(a = index)]
# a V1
#1: 3 2.5
#2: 2 3.5
#3: 1 4.5
Going back to the original data (and doing some operations slightly differently, to simplify the expressions):
extension = d[, {Carrier.uniq = unique(Carrier);
.SD[, rbindlist(combn(Carrier.uniq, length(Carrier.uniq)-1,
function(x) data.table(Carrier = x,
index = setdiff(Carrier.uniq, x)),
simplify = FALSE))]}, by = Market]
setkey(extension, Market, Carrier)
extension[d, allow.cartesian = TRUE][, mean(Stops), by = list(Market, Carrier = index)]
# Market Carrier V1
#1: IAH:SNA Southwest 1.000000
#2: IAH:SNA Delta 3.000000
#3: MSP:CLE JetBlue 2.000000
#4: MSP:CLE Southwest 1.500000
#5: MSP:CLE American 1.666667
