I use R. I have a data frame like this:
dat <- data.frame(
  group   = c(1,1,1,1,1,1,2,2,2,2,2),
  horizon = c(1,3,5,6,7,10,1,3,5,9,10),
  value   = c(1.0,0.9,0.8,0.6,0.3,0.0,0.5,0.6,0.8,0.9,0.8),
  other   = c("a","a","a","a","a","a","b","b","b","b","b")
)
And I would like to add a row for every horizon that is missing (2, 4, 8, and 9 for the first group; 2, 4, 6, 7, and 8 for the second group). The value entries for the missing horizons would be blank (NA).
I would like to get something like this:
datx <- data.frame(
  group   = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
  horizon = c(1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10),
  value   = c(1.0,NA,0.9,NA,0.8,0.6,0.3,NA,NA,0.0,0.5,NA,0.6,NA,0.8,NA,NA,NA,0.9,0.8),
  other   = c("a","a","a","a","a","a","a","a","a","a","b","b","b","b","b","b","b","b","b","b")
)
i.e. an enlarged dataset with the new horizons, NA (or blank) entries in the value variable, and the other variable retained.
This is just an example. I am actually working with a much larger dataset.
Without the groups, the problem would be much easier to solve; I would use something like this:
newdat <- merge(data.frame(horizon=seq(1,10,1)),dat,all=TRUE)
newdat <- newdat[order(newdat$horizon),]
Thanks for the help!
I'll assume that the values in the variable other are the characters "a" or "b", and that other is completely redundant with your variable group. If that is the case, you can accomplish this with full_join from the dplyr package.
a="a"
b="b"
dat <- data.frame(
group = c(1,1,1,1,1,1,2,2,2,2,2),
horizon = c(1,3,5,6,7,10,1,3,5,9,10),
value = c(1.0,0.9,0.8,0.6,0.3,0.0,0.5,0.6,0.8,0.9,0.8),
other = c(a,a,a,a,a,a,b,b,b,b,b)
)
groups <- expand.grid(group = c(1, 2), horizon = 1:10)
groups <- groups %>% dplyr::mutate(other = ifelse(group == 1, "a", "b"))
dat %>%
  dplyr::full_join(groups, by = c("group", "horizon", "other")) %>%
  dplyr::arrange(group, horizon)
Using data.table:
library(data.table)
setDT(dat)

fill = c("other")   # columns to carry over into the newly created rows
RES =
  dat[CJ(group = group, horizon = min(horizon):max(horizon), unique = TRUE),
      on = .(group, horizon)   # cross join: every group with every horizon in range
      ][, (fill) := lapply(.SD, \(x) x[which.min(is.na(x))]),   # first non-NA value per group
        by = group, .SDcols = fill]
RES[]
# group horizon value other
# <num> <int> <num> <char>
# 1: 1 1 1.0 a
# 2: 1 2 NA a
# 3: 1 3 0.9 a
# 4: 1 4 NA a
# 5: 1 5 0.8 a
# 6: 1 6 0.6 a
# 7: 1 7 0.3 a
# 8: 1 8 NA a
# 9: 1 9 NA a
# 10: 1 10 0.0 a
# 11: 2 1 0.5 b
# 12: 2 2 NA b
# 13: 2 3 0.6 b
# 14: 2 4 NA b
# 15: 2 5 0.8 b
# 16: 2 6 NA b
# 17: 2 7 NA b
# 18: 2 8 NA b
# 19: 2 9 0.9 b
# 20: 2 10 0.8 b
# group horizon value other
Here is a sample dataset:
data <- data.frame(x=c(4,3,4,4,99),
y=c(4,NA,3,2,4),
z = c(88,NA,4,4,5),
w = c(4,5,2,3,4))
I would like to create a new column of means using rowMeans. I would like to keep na.rm = FALSE because if a value is truly NA, the mean for that row should stay NA.
But if a value is 88 or 99, I would like R to ignore it while calculating the mean and still use the remaining valid values. So far I have the below.
data$mean <- rowMeans(subset(data, select = c(x, y, z, w)), na.rm = TRUE)
But I am not sure how to add in a function where it would just ignore the 88 and 99 from calculations.
This is what I am hoping to get:
data <- data.frame(x = c(4,3,4,4,99),
                   y = c(4,NA,3,2,4),
                   z = c(88,NA,4,4,5),
                   w = c(4,5,2,3,4),
                   mean = c(4,NA,3.25,3.25,4.33))  # last value is 13/3, approx. 4.33
Any help is appreciated - thank you!
Using rowMeans nevertheless with na.rm=TRUE, but on a row subset, and temporarily replacing 88 and 99 with NA.
s <- rowSums(is.na(data)) == 0  ## rows without true NAs
v <- c("x", "y", "z", "w")      ## column subset to calculate the mean over
data$mean <- NA                 ## initialize column
m <- as.matrix(data[v])         ## we'll need a matrix
data$mean[s] <- rowMeans(replace(m[s, v], m[s, v] %in% c(88, 99), NA), na.rm=TRUE)
data
# x y z w mean
# 1 4 4 88 4 4.000000
# 2 3 NA NA 5 NA
# 3 4 3 4 2 3.250000
# 4 4 2 4 3 3.250000
# 5 99 4 5 4 4.333333
Or, more simply, using apply, though it is much slower.
f <- \(x) if (any(is.na(x))) NA else mean(x[!x %in% c(88, 99)])
cbind(data[v], mean = apply(data[v], 1, f))
# x y z w mean
# 1 4 4 88 4 4.000000
# 2 3 NA NA 5 NA
# 3 4 3 4 2 3.250000
# 4 4 2 4 3 3.250000
# 5 99 4 5 4 4.333333
Benchmark results from microbenchmark:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# apply 35.018162 35.941815 38.834333 36.394632 36.960161 212.469412 100 b
# rowMeans 1.097393 1.119396 1.493563 1.193787 1.226691 9.352118 100 a
data <- data.frame(x = c(4,3,4,4,99),
                   y = c(4,NA,3,2,4),
                   z = c(88,NA,4,4,5),
                   w = c(4,5,2,3,4))

data$mean <- apply(data, 1, function(x) {
  idx <- which(!(x %in% c(88, 99)))  # drop the sentinel codes 88 and 99
  mean(x[idx], na.rm = TRUE)
})
   x  y  z w     mean
1  4  4 88 4 4.000000
2  3 NA NA 5 4.000000
3  4  3  4 2 3.250000
4  4  2  4 3 3.250000
5 99  4  5 4 4.333333
Suppose there are many simulations (and other variables) in a data.table:
library(data.table)
data <- setDT(data.frame(sim1 = c(1,1,1), sim2 = c(2,2,2), sim3 = c(3,3,3),
                         sim4 = c(4,4,4), sim5 = c(5,5,5), index = c(2,2,2)))
sim1 sim2 sim3 sim4 sim5 index
1: 1 2 3 4 5 2
2: 1 2 3 4 5 2
3: 1 2 3 4 5 2
I want to calculate, per row, the mean of the simulation values at or above the index column:
data[, higher.than.index.ave := rowMeans(.SD[.SD > index]),
.SDcols = names(data[, grepl(paste(paste("sim", 1:5, sep=""),
collapse = "|") , names(data)), with=FALSE])]
I have tried other solutions as well, with no luck. Any suggestions on how I can perform such a task?
data <- data.table(sim1 = c(1,1,1), sim2 = c(2,2,2), sim3 = c(3,3,3),
                   sim4 = c(4,4,4), sim5 = c(5,5,5), index = c(2,2,2))
data[, means :=
       rowMeans(data[, lapply(.SD, function(x) ifelse(x < index, NA, x))
                     ][, -'index'],
                na.rm = TRUE)]
Or, using .SDcols to select only the sim columns:
data[, means :=
       rowMeans(data[, lapply(.SD, function(x) ifelse(x < index, NA, x)),
                     .SDcols = intersect(paste0('sim', 1:5), names(data))],
                na.rm = TRUE)]
Output:
data
sim1 sim2 sim3 sim4 sim5 index means
1: 1 2 3 4 5 2 3.5
2: 1 2 3 4 5 2 3.5
3: 1 2 3 4 5 2 3.5
data$higher.than.index.ave <- apply(data, 1, function(x) {y <- x[1:5]; mean(y[y >= x[6]])})  # columns 1:5 are the sims, column 6 the index
# sim1 sim2 sim3 sim4 sim5 index higher.than.index.ave
# 1: 1 2 3 4 5 2 3.5
# 2: 1 2 3 4 5 2 3.5
# 3: 1 2 3 4 5 2 3.5
A minimal working example (MWE):
library(data.table)
x <- data.table(
g=rep(c("x", "y"), each=4), # grouping variable
time=c(1,3,5,7,2,4,6,8), # time index
val=1:8) # value
setkeyv(x, c("g", "time"))
# for each position i, take sd() of the first i-1 values (NA when fewer than 2)
cumsd <- function(x) sapply(sapply(seq_along(x) - 1, head, x = x), sd)
x[, cumsd(val), by=g]
## Output
# g V1
# 1: x NA
# 2: x NA
# 3: x 0.7071068
# 4: x 1.0000000
# 5: y NA
# 6: y NA
# 7: y 0.7071068
# 8: y 1.0000000
I want to compute the standard deviation (or more generally, a mathematical function) of all prior values (not including the current value), per observation, by group, in R.
The cumsd ("cumulative sd") function above does what I need. For example, for row 3, V1 = sd(c(1, 2)), corresponding to the values in rows 1 and 2; for row 7, V1 = sd(c(5, 6)), corresponding to the values in rows 5 and 6.
However, cumsd is very slow (too slow to use in my real-world application). Any ideas on how to do the computation more efficiently?
Edit
For sd we can use runSD from the TTR package, as discussed here: Calculating cumulative standard deviation by group using R
Gabor's answer below addresses the more general case of an arbitrary mathematical function applied to prior values, though the generality potentially comes at some cost in efficiency.
We can specify the window widths as a vector and then omit the last value of each window for each application of sd.
library(zoo)
x[, sd:=rollapplyr(val, seq_along(val), function(x) sd(head(x, -1)), fill = NA), by = g]
giving:
> x
g time val sd
1: x 1 1 NA
2: x 3 2 NA
3: x 5 3 0.7071068
4: x 7 4 1.0000000
5: y 2 5 NA
6: y 4 6 NA
7: y 6 7 0.7071068
8: y 8 8 1.0000000
Alternatively, we can specify the offsets in a list. Negative offsets, used here, refer to prior values, so -1 is the immediately prior value, -2 the value before that, and so on.
negseq <- function(x) -seq_len(x)
x[, sd:=rollapplyr(val, lapply(seq_along(val)-1, negseq), sd, fill = NA), by = g]
giving:
> x
g time val sd
1: x 1 1 NA
2: x 3 2 NA
3: x 5 3 0.7071068
4: x 7 4 1.0000000
5: y 2 5 NA
6: y 4 6 NA
7: y 6 7 0.7071068
8: y 8 8 1.0000000
We can use TTR::runSD with cumulative = TRUE, lagging the result with data.table::shift so the current value is excluded:
library(TTR)
setDT(x)[, cum_sd := shift(runSD(val, n = 2, cumulative = TRUE)), g]
# g time val cum_sd
#1: x 1 1 NA
#2: x 3 2 NA
#3: x 5 3 0.7071068
#4: x 7 4 1.0000000
#5: y 2 5 NA
#6: y 4 6 NA
#7: y 6 7 0.7071068
#8: y 8 8 1.0000000
It turned out that neither option was fast enough for my application (millions of groups and observations). But your comments inspired me to write a small function in Rcpp that did the trick. Thanks, everyone!
library(data.table)
library(Rcpp)
x <- data.table(
g=rep(c("x", "y"), each=4), # grouping variable
time=c(1,3,5,7,2,4,6,8), # time index
val=1:8) # value
setkeyv(x, c("g", "time"))
cumsd <- function(x) sapply(sapply(seq_along(x)-1, head, x=x), sd)
x[, v1:=cumsd(val), by=g]
cppFunction('
Rcpp::NumericVector rcpp_cumsd(Rcpp::NumericVector inputVector){
  int len = inputVector.size();
  // initialize every position to NA; the first two can never be computed
  Rcpp::NumericVector outputVector(len, NumericVector::get_na());
  if (len < 3) return outputVector;
  for (int i = 2; i < len; ++i){
    // sd of all values strictly before position i
    outputVector(i) = Rcpp::sd(inputVector[Rcpp::seq(0, i - 1)]);
  }
  return outputVector;
}
')
x[, v2:= rcpp_cumsd(val), by=g]
all.equal(x$v1, x$v2)
## TRUE
The speed difference seems to depend on the number of groups vs. the number of observations per group in the data.table. I won't post benchmarks but in my case, the Rcpp version was much, much faster.
I have an R data.table:
DT = data.table(x=rep(c("b","a",NA_character_),each=3), y=rep(c('A', NA_character_, 'C'), each=3), z=c(NA_character_), v=1:9)
DT
# x y z v
#1: b A NA 1
#2: b A NA 2
#3: b A NA 3
#4: a NA NA 4
#5: a NA NA 5
#6: a NA NA 6
#7: NA C NA 7
#8: NA C NA 8
#9: NA C NA 9
For each column, I want to extract the maximum value of column v over the rows where that column is not NA. I am using
sapply(DT, function(x) { ifelse(all(is.na(x)), NA_integer_, max(DT[['v']][!is.na(x)])) })
#x y z v
#6 9 NA 9
Is there a simpler way to achieve this?
Here is a way, giving you -Inf (and a warning) if all values of a column are NA (you can later replace that with NA if you prefer):
DT[, lapply(.SD, function(x) max(v[!is.na(x)]))]
# x y z v
# 1: 6 9 -Inf 9
As suggested by @DavidArenburg, to ensure that everything goes well even when all values are NA (no warning, and NA directly as the result), you can do:
DT[, lapply(.SD, function(x) {
temp <- v[!is.na(x)]
if(!length(temp)) NA else max(temp)
})]
# x y z v
#1: 6 9 NA 9
We can use summarise_each from dplyr:
library(dplyr)
DT %>%
summarise_each(funs(max(v[!is.na(.)])))
# x y z v
#1: 6 9 -Inf 9