Row mean of selected columns conditional on a different column - r

Suppose, there are many simulations (and other variables) in a data.table:
data <- setDT(data.frame(sim1=c(1,1,1), sim2= c(2,2,2), sim3=c(3,3,3),
sim4=c(4,4,4), sim5=c(5,5,5), index=c(2,2,2)))
sim1 sim2 sim3 sim4 sim5 index
1: 1 2 3 4 5 2
2: 1 2 3 4 5 2
3: 1 2 3 4 5 2
I want to calculate the mean of the simulations higher than index column:
data[, higher.than.index.ave := rowMeans(.SD[.SD > index]),
.SDcols = names(data[, grepl(paste(paste("sim", 1:5, sep=""),
collapse = "|") , names(data)), with=FALSE])]
I have tried other solutions as well, no luck. Any suggestion how I can perform such a task?

data <- data.table(sim1=c(1,1,1), sim2= c(2,2,2), sim3=c(3,3,3),
sim4=c(4,4,4), sim5=c(5,5,5), index=c(2,2,2))
data[, means :=
rowMeans(data[, lapply(.SD, function(x) ifelse(x < index, NA, x))
][, -'index'],
na.rm = T)]
Or, using .SDcols to select only sim columns:
data[, means :=
rowMeans(data[, lapply(.SD, function(x) ifelse(x < index, NA, x))
, .SDcols = intersect(paste0('sim', 1:5), names(data))],
na.rm = T)]
Output:
data
sim1 sim2 sim3 sim4 sim5 index means
1: 1 2 3 4 5 2 3.5
2: 1 2 3 4 5 2 3.5
3: 1 2 3 4 5 2 3.5

data$higher.than.index.ave <- apply(data,1,function(x) {y <- x[1:5]; mean(y[y>=x[6]])})
# sim1 sim2 sim3 sim4 sim5 index higher.than.index.ave
# 1: 1 2 3 4 5 2 3.5
# 2: 1 2 3 4 5 2 3.5
# 3: 1 2 3 4 5 2 3.5

Related

rowmeans but ignore certain values when calculating the mean but na.rm=F

Here is a sample dataset:
data <- data.frame(x=c(4,3,4,4,99),
y=c(4,NA,3,2,4),
z = c(88,NA,4,4,5),
w = c(4,5,2,3,4))
I would like to create a new column for means using rowMeans. I would like to keep na.rm=F because if its truly NA I do not want to include that into my means calculation.
But if its either 88/99 I would like R to ignore it while calculating the mean and still use the remaining valid values. So far I have the below.
data$mean <- rowMeans(subset(data, select = c(`x`,`y`,`z`,`w`)), na.rm = T)
But I am not sure how to add in a function where it would just ignore the 88 and 99 from calculations.
This is what I am hoping to get
data <- data.frame(x=c(4,3,4,4,99),
y=c(4,NA,3,2,4),
z = c(88,NA,4,4,5),
w = c(4,5,2,3,4),
mean=c(4,NA,3.25,3.25,4.3))
Any help is appreciated - thank you!
Using rowMeans nevertheless with na.rm=TRUE, but on a subset and temporally replaceing 88 and 99 with NA.
s <- rowSums(is.na(data)) == 0 ## store row subset
v <- c("x", "y", "z", "w") ## col subset to calc. mean
data$mean <- NA ## ini column
m <- as.matrix(data[v]) ## we'll ned a matrix
data$mean[s] <- rowMeans(replace(m[s, v], m[s, v] %in% c(88, 99), NA), na.rm=TRUE)
data
# x y z w mean
# 1 4 4 88 4 4.000000
# 2 3 NA NA 5 NA
# 3 4 3 4 2 3.250000
# 4 4 2 4 3 3.250000
# 5 99 4 5 4 4.333333
Or simply using apply but is much slower.
f <- \(x) if (any(is.na(x))) NA else mean(x[!x %in% c(88, 99)])
cbind(data, mean=apply(data, 1, f))
# x y z w mean
# 1 4 4 88 4 4.000000
# 2 3 NA NA 5 NA
# 3 4 3 4 2 3.250000
# 4 4 2 4 3 3.250000
# 5 99 4 5 4 4.333333
From microbenchmark.
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# apply 35.018162 35.941815 38.834333 36.394632 36.960161 212.469412 100 b
# rowMeans 1.097393 1.119396 1.493563 1.193787 1.226691 9.352118 100 a
data <- data.frame(x=c(4,3,4,4,99),
y=c(4,NA,3,2,4),
z = c(88,NA,4,4,5),
w = c(4,5,2,3,4))
df$mean <- apply(data, 1, function(x) {
idx <- which((x %in% c(88, 89)) == FALSE)
mean(x[ idx ], na.rm = TRUE)
})
x y z w mean
1 4 4 88 4 4.00
2 3 NA NA 5 4.00
3 4 3 4 2 3.25
4 4 2 4 3 3.25
5 99 4 5 4 28.00

Post-processing of full_join output to remove multiplicity

I have two data frames(df1, df2) and performed full_join using the common column of interest col1.
df1 <- data.frame(col1=c('A','D','C','C','E','E','I'),col2=c(4,7,8,3,2,4,9))
df2 <- data.frame(col1=c('A','A','B','C','C','E','E','I'),col2=c(4,1,6,8,3,2,1,9))
df1 %>% full_join(df2, by = "col1")
# col1 col2.x col2.y
# 1 A 4 4
# 2 A 4 1
# 3 D 7 NA
# 4 C 8 8
# 5 C 8 3
# 6 C 3 8
# 7 C 3 3
# 8 E 2 2
# 9 E 2 1
# 10 E 4 2
# 11 E 4 1
# 12 I 9 9
# 13 B NA 6
As expected the full_join provides multiplicty of the joining column values and I wish to avoid it. I wish to arrive at the following output. What kind of post-processing approaches do you suggest?
# col1 col2.x col2.y
# 1 A 4 4
# 2 A NA 1
# 3 D 7 NA
# 4 C 8 8
# 5 C 3 3
# 6 E 2 2
# 7 E 4 1
# 8 I 9 9
# 9 B NA 6
More information:
Case 1: I do not need four rows in the output for two same values in both input objects:
# 4 C 8 8
# 5 C 8 3
# 6 C 3 8
# 7 C 3 3
instead, I want only two as:
# 4 C 8 8
# 5 C 3 3
Case 2: Similarly, I need same row for the difference in values:
# 8 E 2 2
# 9 E 2 1
# 10 E 4 2
# 11 E 4 1
instead, I want only two rows as below:
# 8 E 2 2
# 9 E 4 1
A possible solution in 2 steps using the data.table-package:
0) load package & convert to data.table's
library(data.table)
setDT(df1)
setDT(df2)
1) define helper function
unlistSD <- function(x) {
l <- length(x)
ls <- sapply(x, lengths)
m <- max(ls)
newSD <- vector(mode = "list", length = l)
for (i in 1:l) {
u <- unlist(x[[i]])
lu <- length(u)
if (lu < m) {
u <- c(u, rep(NA_real_, m - lu))
}
newSD[[i]] <- u
}
return(setNames(as.list(newSD), names(x)))
}
2) merge and apply helper function
merge(df1[, .(col2 = list(col2)), by = col1],
df2[, .(col2 = list(col2)), by = col1],
by = "col1", all = TRUE
)[, unlistSD(.SD), by = col1]
which gives the following result:
col1 col2.x col2.y
1: A 4 4
2: A NA 1
3: C 8 8
4: C 3 3
5: D 7 NA
6: E 2 2
7: E 4 1
8: I 9 9
9: B NA 6
Another possibiliy with base R:
unlistDF <- function(d, groupcols) {
ds <- split(d[, setdiff(names(d), groupcols)], d[,groupcols])
ls <- lapply(ds, function(x) max(sapply(x, lengths)))
dl <- lapply(ds, function(x) lapply(as.list(x), unlist))
du <- Map(function(x, y) {
lapply(x, function(i) {
if(length(i) < y) {
c(i, rep(NA_real_, y - length(i)))
} else i
})
}, x = dl, y = ls)
ld <- lapply(du, as.data.frame)
cbind(d[rep(1:nrow(d), ls), groupcols, drop = FALSE],
do.call(rbind.data.frame, c(ld, make.row.names = FALSE)),
row.names = NULL)
}
Now you can use this function as follows in combination with merge:
df <- merge(aggregate(col2 ~ col1, df1, as.list),
aggregate(col2 ~ col1, df2, as.list),
by = "col1", all = TRUE)
unlistDF(df, "col1")

Unexpected behavior of data.table := when subsetting

Given the data.table dt <- data.table(a=c(1,NA,3), b = c(4:6))
a b
1: 1 4
2: NA 5
3: 3 6
... , the result for dt[is.na(a), a := sum(a, na.rm = T)] is:
a b
1: 1 4
2: 0 5
3: 3 6
... , instead of the expected:
a b
1: 1 4
2: 4 5
3: 3 6
What is going on? I am using data.table 1.12.8
We could use fcoalesce
library(data.table)
dt[, a := fcoalesce(a, sum(a, na.rm = TRUE))]

Calculate cummean() and cumsd() while ignoring NA values and filling NAs

My goal is to obtain the cum mean (and cumsd) of a dataframe while ignoring NA and filling those with the previous cum means:
df:
var1 var2 var3
x1 y1 z1
x2 y2 z2
NA NA NA
x3 y3 z3
cummean:
var1 var2 var3
x1/1 y1/1 z1/1
(x1+x2)/2 (y1+y2)/2 (z1+z2)/2
(x1+x2)/2 (y1+y2)/2 (z1+z2)/2
(x1+x2+x3)/3 (y1+y2+y3)/3 (z1+z2+z3)/3
So for row 3 where df has NA, I want the new matrix to contain the cum mean from the line above (numerator should not increase).
So far, I am using this to compute the cum mean (I am aware that somewhere a baby seal gets killed because I used a for loop and not something from the apply family)
for(i in names(df){
df[i][!is.na(df[i])] <- GMCM:::cummean(df[i][!is.na(df[i])])
}
I have also tried this:
setDT(posRegimeReturns)
cols<-colnames((posRegimeReturns))
posRegimeReturns[, (cols) := lapply(.SD, cummean) , .SD = cols]
But both of those leave the NAs empty.
Note: this question is similar to this post Calculate cumsum() while ignoring NA values
but unlike the solution there, I don't want to leave the NAs but rather fill those with the same values as the last row above that was not NA.
You might want to use the definition of variance to calculate this
library(data.table)
dt <- data.table(V1=c(1,2,NA,3), V2=c(1,2,NA,3), V3=c(1,2,NA,3))
cols <- copy(names(dt))
#means
dt[ , paste0("mean_",cols) := lapply(.SD, function(x) {
#get the num of non-NA observations
lens <- cumsum(!is.na(x))
#set NA to 0 before doing cumulative sum
x[is.na(x)] <- 0
cumsum(x) / lens
}), .SDcols=cols]
#sd
dt[ , paste0("sd_",cols) := lapply(.SD, function(x) {
lens <- cumsum(!is.na(x))
x[is.na(x)] <- 0
#use defn of variance mean of sum of squares minus square of means and also n-1 in denominator
sqrt(lens/(lens-1) * (cumsum(x^2)/lens - (cumsum(x) / lens)^2))
}), .SDcols=cols]
Using data table. In particular:
library(data.table)
DT <- data.table(z = sample(N),idx=1:N,key="idx")
z idx
1: 4 1
2: 10 2
3: 9 3
4: 6 4
5: 1 5
6: 8 6
7: 3 7
8: 7 8
9: 5 9
10: 2 10
We now make use of the use of -apply function and data.table.
DT[,cummean:=sapply(seq(from=1,to=nrow(DT)) ,function(iii) mean(DT$z[1:iii],na.rm = TRUE))]
DT[,cumsd:=sapply(seq(from=1,to=nrow(DT)) ,function(iii) sd(DT$z[1:iii],na.rm = TRUE))]
resulting in:
z idx cummean cumsd
1: 4 1 4.000000 NA
2: 10 2 7.000000 4.242641
3: 9 3 7.666667 3.214550
4: 6 4 7.250000 2.753785
5: 1 5 6.000000 3.674235
6: 8 6 6.333333 3.386247
7: 3 7 5.857143 3.338092
8: 7 8 6.000000 3.116775
9: 5 9 5.888889 2.934469
10: 2 10 5.500000 3.027650

R: Data.table on subset excluding by value

Using data.table in R, I'm trying to make an operation on the subset excluding selected element. I'm using the by operator, but I don't know if this is the right approach.
Here's an example. E.g. the value for Delta in IAH:SNA is (3+3)/2 which is the mean of Stops in IAH:SNA once Delta has been excluded.
library(data.table)
s1 <- "Market Carrier Stops
IAH:SNA Delta 1
IAH:SNA Delta 1
IAH:SNA Southwest 3
IAH:SNA Southwest 3
MSP:CLE Southwest 2
MSP:CLE Southwest 2
MSP:CLE American 2
MSP:CLE JetBlue 1"
d <- data.table(read.table(textConnection(s1), header=TRUE))
setkey(d, Carrier, Market)
f <- function(x, y){
subset(d, !(Carrier %in% x) & Market == y, Stops)[, mean(Stops)]}
d[, s := f(.BY[[1]], .BY[[2]]), by=list(Carrier, Market)]
## Market Carrier Stops s
## 1: MSP:CLE American 2 1.666667
## 2: IAH:SNA Delta 1 3.000000
## 3: IAH:SNA Delta 1 3.000000
## 5: IAH:SNA Southwest 3 1.000000
## 6: IAH:SNA Southwest 3 1.000000
## 7: MSP:CLE Southwest 2 1.500000
## 8: MSP:CLE Southwest 2 1.500000
The above solution performs very poorly on large data sets (it's essentially an mapply), but I'm not sure how to do it in a fast data.table-like way.
Perhaps one could (dynamically) generate a factor that does this? I'm just not sure how. . .
Is there a way to improve it?
Edit: Just for the heck of it, here's a way to get a bigger version of the above
library(data.table)
dl.dta <- function(...){
## input years ..
years <- gsub("\\.", "_", c(...))
baseurl <- "http://www.transtats.bts.gov/Download/"
names <- paste("Origin_and_Destination_Survey_DB1BMarket", years, sep="_")
info <- t(sapply(names, function(x) file.exists(paste(x, c("zip", "csv"), sep="."))))
to.download <- paste(baseurl, names, ".zip", sep="")[!apply(info, 1, any)]
if (length(to.download) > 0){
message("starting download...")
sapply(to.download,
function(x) download.file(x, rev(strsplit(x, "/")[[1]])[1]))}
to.unzip <- paste(names, "zip", sep=".")[!info[, 2]]
if (length(to.unzip > 0)){
message("starting to unzip...")
sapply(to.unzip, unzip)}
paste(names, "csv", sep=".")}
countWords.split <- function(x, s=":"){
## Faster on my machine than grep for some reanon
sapply(strsplit(as.character(x), s), length)}
countWords.grep <- function(x){
sapply(gregexpr("\\W+", x), length)+1}
fname <- dl.dta(2013.1)
cols <- rep("NULL", 41)
## Columns to keep: 9 is Origin, 18 is Dest, 24 is groups of airports in travel
## 30 is RPcarrier (reporting carrier).
## For more columns: 35 is market fare and 36 is distance.
cols[9] <- cols[18] <- cols[24] <- cols[30] <- NA
d <- data.table(read.csv(file=fname, colClasses=cols))
d[, Market := paste(Origin, Dest, sep=":")]
## should probably
d[, Stops := -2 + countWords.split(AirportGroup)]
d[, Carrier := RPCarrier]
d[, c("RPCarrier", "Origin", "Dest", "AirportGroup") := NULL]
Use a tiny bit of elementary maths:
d[, c("tmp.mean", "N") := list(mean(Stops), .N), by = Market]
d[, exep.mean := (tmp.mean * N - sum(Stops)) / (N - .N), by = list(Market,Carrier)]
# Market Carrier Stops tmp.mean N exep.mean
# 1: IAH:SNA Delta 1 2.00 4 3.000000
# 2: IAH:SNA Delta 1 2.00 4 3.000000
# 3: IAH:SNA Southwest 3 2.00 4 1.000000
# 4: IAH:SNA Southwest 3 2.00 4 1.000000
# 5: MSP:CLE Southwest 2 1.75 4 1.500000
# 6: MSP:CLE Southwest 2 1.75 4 1.500000
# 7: MSP:CLE American 2 1.75 4 1.666667
# 8: MSP:CLE JetBlue 1 1.75 4 2.000000
#Roland's answer will work for some functions (and when it does it will be best) but not in general. Unfortunately you can't apply the split-apply-combine strategy to the data as is to do the task, but you can if you make the data larger. Let's start with a simpler example:
dt = data.table(a = c(1,1,2,2,3,3), b = c(1:6), key = 'a')
# now let's extend this table the following way
# take the unique a's and construct all the combinations excluding one element
combinations = dt[, combn(unique(a), 2)]
# now combine this into a data.table with the excluded element as the index
# and merge it back into the original data.table
extension = rbindlist(apply(combinations, 2,
function(x) data.table(a = x, index = setdiff(c(1,2,3), x))))
setkey(extension, a)
dt.extended = extension[dt, allow.cartesian = TRUE]
dt.extended[order(index)]
# a index b
# 1: 2 1 3
# 2: 2 1 4
# 3: 3 1 5
# 4: 3 1 6
# 5: 1 2 1
# 6: 1 2 2
# 7: 3 2 5
# 8: 3 2 6
# 9: 1 3 1
#10: 1 3 2
#11: 2 3 3
#12: 2 3 4
# Now we have everything we need:
dt.extended[, mean(b), by = list(a = index)]
# a V1
#1: 3 2.5
#2: 2 3.5
#3: 1 4.5
Going back to original data (and doing some operations slightly differently, to simplify expressions):
extension = d[, {Carrier.uniq = unique(Carrier);
.SD[, rbindlist(combn(Carrier.uniq, length(Carrier.uniq)-1,
function(x) data.table(Carrier = x,
index = setdiff(Carrier.uniq, x)),
simplify = FALSE))]}, by = Market]
setkey(extension, Market, Carrier)
extension[d, allow.cartesian = TRUE][, mean(Stops), by = list(Market, Carrier = index)]
# Market Carrier V1
#1: IAH:SNA Southwest 1.000000
#2: IAH:SNA Delta 3.000000
#3: MSP:CLE JetBlue 2.000000
#4: MSP:CLE Southwest 1.500000
#5: MSP:CLE American 1.666667

Resources