Row mean and standard deviation using data.table - R

x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)
library(data.table)
setDT(x)
I want to calculate the mean, the standard deviation, and the number of data points per row, but I am getting an error:
x[, c("meanY",'sdY',"nY") := .(rowMeans(.SD, na.rm = TRUE), sd(.SD, na.rm = TRUE), rowSums(!is.na(.SD))), .SDcols=c(2:10)]

The issue lies in sd(), which doesn't work row-wise; apply() with MARGIN = 1 applies it to each row instead:
x[, c("meanY", "sdY", "nY") := .(
    rowMeans(.SD, na.rm = TRUE),
    apply(.SD, 1, sd, na.rm = TRUE),
    rowSums(!is.na(.SD))
  ),
  .SDcols = 2:10]
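If apply() over rows becomes a bottleneck on large tables, a vectorised row-wise standard deviation is available in the matrixStats package. This is a sketch of that substitution, not part of the original answer:

library(matrixStats)
# rowSds() operates on a matrix, so .SD is converted first
x[, c("meanY", "sdY", "nY") := .(
    rowMeans(.SD, na.rm = TRUE),
    rowSds(as.matrix(.SD), na.rm = TRUE),
    rowSums(!is.na(.SD))
  ),
  .SDcols = 2:10]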

Assuming you want the output as a list, you can try the following code:
op <- c("mean", "sd", "length")
# match.fun() looks each function up by name, avoiding eval(parse())
r <- lapply(op, function(v) apply(x, 1, match.fun(v)))
names(r) <- op
It should work with your data.frame example:
x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)
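Note that apply() here runs over every column, including ID. A hedged tweak (an assumption about the intent, since the question computes statistics over the year columns only) is to drop the first column:

# Restrict the row-wise statistics to the year columns (assumes ID is column 1)
r <- lapply(op, function(v) apply(x[, -1], 1, match.fun(v)))
names(r) <- op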

Related

R: how to specify my own CV folds in SuperLearner

library(SuperLearner)
library(MASS)
set.seed(23432)
## training set
n <- 500
p <- 50
X <- matrix(rnorm(n*p), nrow = n, ncol = p)
colnames(X) <- paste("X", 1:p, sep="")
X <- data.frame(X)
Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n)
sl_cv <- SuperLearner(Y = Y, X = X, family = gaussian(),
                      SL.library = c("SL.mean", "SL.ranger"),
                      verbose = TRUE, cvControl = list(V = 5))
In the above code, I'm performing a 5-fold CV to train a SuperLearner. However, what if I want to create my own folds in the data manually? I'm interested in trying this because I know there are clusters in my data, and I would like to perform CV on the folds that I've created.
Take, for example, the five folds below for my toy data: split1, ..., split5. Is there a way to use these five folds for cross-validation instead of letting SuperLearner split up the data by itself?
set.seed(1)
index <- sample(1:5, size = nrow(X), replace = TRUE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2))
split1 <- X[index == 1, ]
split2 <- X[index == 2, ]
split3 <- X[index == 3, ]
split4 <- X[index == 4, ]
split5 <- X[index == 5, ]
split1.y <- Y[index == 1]
split2.y <- Y[index == 2]
split3.y <- Y[index == 3]
split4.y <- Y[index == 4]
split5.y <- Y[index == 5]
Repeating the data preparation, here is a full solution. The last lines verify that each training set excludes its validation rows.
library(SuperLearner)
library(MASS)
set.seed(23432)
## training set
n <- 500
p <- 50
X <- matrix(rnorm(n*p), nrow = n, ncol = p)
colnames(X) <- paste("X", 1:p, sep="")
X <- data.frame(X)
Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n)
set.seed(1)
index <- sample(1:5, size = nrow(X), replace = TRUE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2))
validRows <- list()
for (v in 1:5)
  validRows[[v]] <- which(index == v)
sl_cv <- SuperLearner(Y = Y, X = X, family = gaussian(),
                      SL.library = c("SL.mean", "SL.ranger"),
                      verbose = TRUE,
                      control = SuperLearner.control(saveCVFitLibrary = TRUE),
                      cvControl = list(V = 5, shuffle = FALSE, validRows = validRows))
# training-set sizes deduced from the length of the declared validRows
n - sapply(sl_cv$validRows, length)
# training-set sizes deduced from the fitted models themselves
sapply(1:5, function(i) length(sl_cv$cvFitLibrary[[i]]$SL.ranger_All$object$predictions))
There are some control parameters for the cross-validation procedure, and you can use the validRows parameter. You will need a list with 5 elements, each containing a vector of the row indices that belong to one of your predefined clusters. Assuming you have added a column indicating which cluster each observation belongs to, you could write something like:
cluster1_ids <- which(df$cluster == 1)  # similarly for the other cluster values
L <- list(cluster1_ids, cluster2_ids, cluster3_ids, cluster4_ids, cluster5_ids)
X <- df[, setdiff(names(df), "cluster")]  # drop the cluster column before fitting
sl_cv <- SuperLearner(Y = Y, X = X, family = gaussian(),
                      SL.library = c("SL.mean", "SL.ranger"),
                      verbose = TRUE, cvControl = list(V = 5, validRows = L))
Hope this helps!

Getting NA irrespective of the factor conversion method used

I see this question has been asked many times, and I have tried all the methods to convert my factors to numeric, including:
as.numeric(as.character(x))
lapply(x, function(y) as.numeric(levels(y))[y])
as.numeric.factor <- function(y) as.numeric(levels(y))[y]
lapply(x, as.numeric.factor)
I cannot use the unlist function because I want the output in the wide format and unlist converts it to long.
But all of them are giving NA values as output.
When I specifically check x, it shows the values appropriately.
> class(x)
[1] "data.frame"
> class(x$Col2)
[1] "factor"
> class(x$Col1)
[1] "Date"
Please find a reproducible example:
set.seed(354)
df <- data.frame(Product_Id = rep(1:100, each = 50),
                 Date = seq(from = as.Date("2014/1/1"), to = as.Date("2018/2/1"), by = "month"),
                 Sales = rnorm(100, mean = 50, sd = 20))
df <- df[-c(251:256, 301:312, 2551:2562, 2651:2662, 2751:2762), ]
library(zoo)
z <- read.zoo(df, index = "Date", split = "Product_Id", FUN = as.yearmon)
tt <- as.ts(z)
fcast <- matrix(NA, ncol = ncol(tt), nrow = 12)
library(forecast)
for (i in 1:ncol(tt)) {
  fc1 <- forecast(stlf(na.trim(tt[, i]), h = 12))
  fcast[, i] <- fc1$mean
}
forecasted <- format(round(rbind(tt, fcast), 2), nsmall = 2)
dd <- data.frame(date = seq(as.Date('2014-01-01'), by = 'months', length = nrow(forecasted)),
                 Prod_Id = forecasted)
dd_f <- lapply(dd, function(x) as.numeric(levels(x))[x])
Any suggestion as to what I might be missing?
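One hedged observation, offered as a guess since no accepted answer appears here: dd mixes column types, and as.numeric(levels(x))[x] only makes sense for factor columns. Applied to the date column, whose levels() are NULL, it yields all NA for that column. Converting only the factor columns may be worth trying:

# Convert only the factor columns; leave the Date column untouched
is_fac <- sapply(dd, is.factor)
dd[is_fac] <- lapply(dd[is_fac], function(x) as.numeric(levels(x))[x])
str(dd[, 1:3])  # spot-check: date stays Date, the value columns become numeric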

How to calculate "1+1" without using eval(parse(...?

I have been looking for quite a while, but the answer always seems to be to use eval(parse(text = "1+1")).
I have a column in my data frame containing strings such as "1+1*6" and "1*4/3". I wish to compute these into a new column without using eval(parse()), as I need to do it over 8 million rows.
It is basically an attempt to answer the question: given the numbers 1:9, find all the solutions where (A_B_C) / (D_E_F) = GHI, where A:I are the numbers 1:9 (without repeating) and the underscores are one of the four operators *, /, +, - (also without repeating).
I created a dataframe with all the permutations of 1:9 and for each of these I calculated the permutations of the four operators.
require(gtools)
x <- permutations(n = 9, r = 9, v = 1:9)
y <- permutations(n = 4, r = 4, v = c("*", "/", "+", "-"))
for (i in 1:nrow(x)) {
  for (j in 1:nrow(y)) {
    math <- paste("(", x[i, 1], y[j, 1], x[i, 2], y[j, 2], x[i, 3], ")", "/",
                  "(", x[i, 4], y[j, 3], x[i, 5], y[j, 4], x[i, 6], ")")
    equals <- eval(parse(text = math))
    sum <- as.numeric(paste0(x[i, 7], x[i, 8], x[i, 9]))
    if (sum == equals) {
      print(c(i, j))
    }
  }
}
However, this takes far too long, hence I am trying to remove the time-consuming eval(parse()).
Any help would be really appreciated. Thanks!
Freddie
Vectorisation is key:
# For each operator permutation j, build the expressions for every digit permutation at once
math <- apply(
  y,
  1,
  function(j) {
    paste("(", x[, 1], j[1], x[, 2], j[2], x[, 3], ")/(", x[, 4], j[3], x[, 5], j[4], x[, 6], ")")
  }
)
# Collapse each column into a single "c(...)" expression so parse() runs once per operator set
math <- apply(math, 2, paste, collapse = ",")
math <- paste("c(", math, ")")
equals <- sapply(parse(text = math), eval)
# GHI as a three-digit number, replicated to match the dimensions of `equals`
sum <- matrix(x[, 7] * 100 + x[, 8] * 10 + x[, 9], nrow = nrow(x), ncol = nrow(y))
abs(sum - equals) < 1e-8
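To recover the actual (i, j) solution pairs rather than a logical matrix, which() with arr.ind can be used (a small addition, not part of the original answer):

# Rows index the digit permutations (x), columns the operator permutations (y)
hits <- which(abs(sum - equals) < 1e-8, arr.ind = TRUE)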
Let's see what the difference in speed is:
require(gtools)
x <- permutations(n = 9, r = 9, v = 1:9)
y <- permutations(n = 4, r = 4, v = c("*", "/", "+", "-"))
x <- x[sample(nrow(x), 40), ]
y <- y[sample(nrow(y), 20), ]
library(microbenchmark)
microbenchmark(
  loop = for (i in 1:nrow(x)) {
    for (j in 1:nrow(y)) {
      math <- paste("(", x[i, 1], y[j, 1], x[i, 2], y[j, 2], x[i, 3], ")", "/",
                    "(", x[i, 4], y[j, 3], x[i, 5], y[j, 4], x[i, 6], ")")
      equals <- eval(parse(text = math))
      sum <- as.numeric(paste0(x[i, 7], x[i, 8], x[i, 9]))
      if (sum == equals) {
        print(c(i, j))
      }
    }
  },
  vectorised = {
    math <- apply(
      y,
      1,
      function(j) {
        paste("(", x[, 1], j[1], x[, 2], j[2], x[, 3], ")/(", x[, 4], j[3], x[, 5], j[4], x[, 6], ")")
      }
    )
    math <- apply(math, 2, paste, collapse = ",")
    math <- paste("c(", math, ")")
    equals <- sapply(parse(text = math), eval)
    sum <- matrix(x[, 7] * 100 + x[, 8] * 10 + x[, 9], nrow = nrow(x), ncol = nrow(y))
    abs(sum - equals) < 1e-8
  }
)
The results:
Unit: milliseconds
       expr        min         lq       mean     median         uq       max neval cld
       loop 158.666383 162.084918 167.477490 165.880665 170.258076 240.43746   100   b
 vectorised   8.540623   8.966214   9.613615   9.142515   9.413117  17.88282   100  a
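Since the question asks to avoid eval(parse()) entirely, here is a hedged sketch of a parse-free evaluator; it is not from the original answer, and it assumes every expression has the shape (A op1 B op2 C) used in this puzzle. The trick is to map the operator strings straight to the operator functions and handle the one precedence case (* or / to the right of + or -) by hand:

OPS <- list(`*` = `*`, `/` = `/`, `+` = `+`, `-` = `-`)

# Evaluate a op1 b op2 c with R's operator precedence, vectorised over a, b, c
eval3 <- function(a, op1, b, op2, c) {
  hi <- c("*", "/")
  if (op1 %in% hi || !(op2 %in% hi)) {
    # op1 binds at least as tightly as op2: left-to-right is correct
    OPS[[op2]](OPS[[op1]](a, b), c)
  } else {
    # op2 binds tighter (e.g. "1+1*6"): evaluate b op2 c first
    OPS[[op1]](a, OPS[[op2]](b, c))
  }
}

eval3(1, "+", 1, "*", 6)  # 7, matching eval(parse(text = "1+1*6"))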

R data.table Performing a by aggregation on a joined column

I want to do an aggregation on the result of a join (x and i are keyed data.tables) without materializing the whole result. It is similar to what .EACHI tries to resolve but for columns that are added by the join.
D1 <- data.table(x = rnorm(1:100), i = sample(1:100, 100, replace = TRUE), k = sample(1:100, 100, replace = TRUE))
D2 <- data.table(x = rnorm(1:100), j = sample(1:100, 100, replace = TRUE), k = sample(1:100, 100, replace = TRUE))
setkey(D1, k)
setkey(D2, k)
I would like to sum x over the result of the join of D1 and D2, grouped by the pairs (i, j), without materializing the result of D1[D2]:
D1[D2, list(x = sum(x * i.x)), by = list(i, j), allow.cartesian = TRUE] ## Fails
D1[D2, allow.cartesian = TRUE][, list(x = sum(x * i.x)), by = list(i, j)] ## Is the result I want but uses up more memory
Is there a way to do this? If not will it be implemented in a future version?
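There is no built-in way to group by join-added columns without materializing, at least in the data.table versions this question concerns. One hedged workaround (a sketch, not an official feature) is to materialize the join one key group at a time, so peak memory is bounded by the largest k group rather than the full cartesian result:

# Join and aggregate one key value at a time, then combine the partial sums
partial <- rbindlist(lapply(unique(D2$k), function(kk) {
  D1[.(kk), nomatch = 0L][D2[.(kk), nomatch = 0L], on = "k",
     allow.cartesian = TRUE][, .(x = sum(x * i.x)), by = .(i, j)]
}))
res <- partial[, .(x = sum(x)), by = .(i, j)]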

Optimizing rollapplyr custom function

I have the following data:
y <- data.table(week = rep(1:61, 5352),
                ID = rep(1:5352, each = 61),
                w = runif(326472),
                v = runif(326472))
y$v[sample(1:326472, 10000, replace = FALSE)] <- NA
for which I'm running the code below to create a rolling mean of the variable v, ignoring outliers and NAs.
The code works, but its performance is poor.
I'm sure there is a more efficient way to run it using apply or something similar, but I've been unsuccessful in creating a faster version. Can anyone shed some light on how to make it more efficient?
library(zoo)  # for rollapplyr()
IDs <- unique(y$ID)
y$vol_m12 <- 0
for (i in 1:length(IDs)) {
  x <- y[ID == IDs[i]]
  outlier <- 0.2
  w_outlier <- quantile(x$w, outlier, na.rm = TRUE)
  v_outlier <- quantile(x$v, 1 - outlier, na.rm = TRUE)
  # Ignore outliers
  x$v_temp <- x$v
  x$v_temp[(x$v_temp >= v_outlier) & (x$w <= w_outlier)] <- NA
  # Creating the rolling mean
  y$vol_m12[y$ID == IDs[i]] <- x[, rollapplyr(v_temp, 12, mean, fill = NA, na.rm = TRUE)]
}
Thanks for the replies.
Following 42's advice, I've produced the following code:
library(RcppRoll)
outlier <- 0.2  # needed here as well; it was previously defined inside the loop
# Ignore outliers
y[, w_out := quantile(w, outlier, na.rm = TRUE), by = ID]
y[, v_out := quantile(v, 1 - outlier, na.rm = TRUE), by = ID]
y[(v <= v_out) & (w >= w_out), v_temp := v]
y[, w_out := NULL]
y[, v_out := NULL]
y[, v_m12 := roll_mean(as.matrix(v_temp), n = 12L, fill = NA,
                       align = "right", normalize = TRUE, na.rm = TRUE), by = ID]
System time is about 0.59 seconds against 10.36 for the solution below, which uses rollapplyr (though the outlier removal could probably be made more efficient there too).
y[, v_m12 :=rollapplyr(v_temp, 12, (mean), fill = NA, na.rm=T), by = ID]
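For completeness, a hedged alternative worth benchmarking, not part of the original thread: recent data.table versions (1.12.0 and later) ship their own rolling mean, frollmean(), which avoids the RcppRoll dependency:

# frollmean() mirrors the roll_mean call above
y[, v_m12 := frollmean(v_temp, n = 12L, fill = NA, align = "right", na.rm = TRUE), by = ID]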
