How to create a combination variable in an R data.frame?

I have a data.frame that has several variables with zero values. I need to construct an extra variable that would return the combination of variables that are not zero for each observation. E.g.
df <- data.frame(firm = c("firm1", "firm2", "firm3", "firm4", "firm5"),
                 A = c(0, 0, 0, 1, 2),
                 B = c(0, 1, 0, 42, 0),
                 C = c(1, 1, 0, 0, 0))
Now I would like to generate the new variable:
df$varCombination <- c("C", "B-C", NA, "A-B", "A")
I thought up something like this, which obviously did not work:
for (i in 1:nrow(df)){
  df$varCombination[i] <- paste(names(df[i, 2:ncol(df) & > 0]), collapse = "-")
}

This could probably be solved easily using apply(df, 1, fun), but here is an attempt to solve it column-wise instead of row-wise for performance's sake (I once saw something similar done by @alexis_laz but can't find it right now).
## Create a logical matrix
tmp <- df[-1] != 0
## or tmp <- sapply(df[-1], `!=`, 0)
## Preallocate the result
res <- rep(NA, nrow(tmp))
## Run per column instead of per row
for(j in colnames(tmp)){
  res[tmp[, j]] <- paste(res[tmp[, j]], j, sep = "-")
}
## Remove the pre-allocated `NA` values from non-NA entries
gsub("NA-", "", res, fixed = TRUE)
# [1] "C" "B-C" NA "A-B" "A"
Some benchmarks on a bigger data set
set.seed(123)
BigDF <- as.data.frame(matrix(sample(0:1, 1e4, replace = TRUE), ncol = 10))
library(microbenchmark)
MM <- function(df) {
  var_names <- names(df)[-1]
  res <- character(nrow(df))
  for (i in 1:nrow(df)){
    non_zero_names <- var_names[df[i, -1] > 0]
    res[i] <- paste(non_zero_names, collapse = '-')
  }
  res
}
ZX <- function(df) {
  res <- apply(df[, 2:ncol(df)] > 0, 1,
               function(i) paste(colnames(df[, 2:ncol(df)])[i], collapse = "-"))
  res[res == ""] <- NA
  res
}
DA <- function(df) {
  tmp <- df[-1] != 0
  res <- rep(NA, nrow(tmp))
  for(j in colnames(tmp)){
    res[tmp[, j]] <- paste(res[tmp[, j]], j, sep = "-")
  }
  gsub("NA-", "", res, fixed = TRUE)
}
microbenchmark(MM(BigDF), ZX(BigDF), DA(BigDF))
# Unit: milliseconds
#      expr       min         lq       mean     median         uq        max neval cld
# MM(BigDF) 239.36704 248.737408 253.159460 252.177439 255.144048 289.340528   100   c
# ZX(BigDF)  35.83482  37.617473  38.295425  38.022897  38.357285  76.619853   100   b
# DA(BigDF)   1.62682   1.662979   1.734723   1.735296   1.761695   2.725659   100   a

Using apply:
# paste column names
df$varCombination <-
  apply(df[, 2:ncol(df)] > 0, 1,
        function(i) paste(colnames(df[, 2:ncol(df)])[i], collapse = "-"))
# convert blank to NA
df$varCombination[df$varCombination == ""] <- NA
# result
df
#    firm A  B C varCombination
# 1 firm1 0  0 1              C
# 2 firm2 0  1 1            B-C
# 3 firm3 0  0 0           <NA>
# 4 firm4 1 42 0            A-B
# 5 firm5 2  0 0              A

You had the right idea but the logical comparison in your loop wasn't correct.
I've attempted to keep the code fairly similar to what you had before; this should work:
var_names <- names(df)[-1]
df$varCombination <- character(nrow(df))
for (i in 1:nrow(df)){
  non_zero_names <- var_names[df[i, -1] > 0]
  df$varCombination[i] <- paste(non_zero_names, collapse = '-')
}
> df
   firm A  B C varCombination
1 firm1 0  0 1              C
2 firm2 0  1 1            B-C
3 firm3 0  0 0
4 firm4 1 42 0            A-B
5 firm5 2  0 0              A
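If an actual NA is preferred for firm3 (as in the desired output in the question), the empty strings can be converted afterwards:
df$varCombination[df$varCombination == ""] <- NA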

Related

For a dataset of 0's and 1's, set all but the first 1 in each row to 0's

I have a data.frame of 1,480 rows and 1,400 columns like:
  1 2 3 4 5 6 ..... 1399 1400
1 0 0 0 1 0 0 .....    1    0   # first occurrence would be at 4
2 0 0 0 0 0 1 .....    0    1
3 1 0 0 1 0 0 .....    0    0
## and etc
Each row contains a series of 0's and 1's - predominantly 0's. For each row, I want to find at which column the first 1 shows up and set the remaining values to 0's.
My current implementation can efficiently find the occurrence of the first 1, but I've only figured out how to zero out the remaining values iteratively by row. In repeated simulations, this iterative process is taking too long.
Here is the current implementation:
N <- length(df[which(df$arm == 0), "pt_id"]) # of patients
M <- max_days
#
# df is like the data frame shown above
#
df[which(df$arm == 0), 5:length(colnames(df))] <-
  unlist(lapply(matrix(data = rep(pbo_hr, M*N), nrow = N, ncol = M),
                rbinom, n = 1, size = 1))
event_day_post_rand <- apply(df[, 5:length(colnames(df))], MARGIN = 1,
                             FUN = function(x) which(x > 0)[1])
df <- add_column(df, "event_day_post_rand" = event_day_post_rand, .after = "arm_id")
##
## From here trial days start on column 6 for df
##
# zero out events that occurred after the first event, since each patient can only
# have 1 max event, which will be taken as the earliest event
for (pt_id in df[which(!is.na(df$event_day_post_rand)), "pt_id"]){
  event_idx <- df[which(df$pt_id == pt_id), "event_day_post_rand"]
  df[which(df$pt_id == pt_id), as.character(5 + event_idx + 1):"1400"] <- 0
}
We can do
mat <- as.matrix(df) ## data frame to matrix
j <- max.col(mat, ties.method = "first")
mat[] <- 0
mat[cbind(1:nrow(mat), j)] <- 1
df <- data.frame(mat) ## matrix to data frame
I also suggest just using a matrix to store these values. In addition, the result will be a sparse matrix, so I recommend:
library(Matrix)
sparseMatrix(i = 1:nrow(mat), j = j, x = rep(1, length(j)))
We can get a little more performance by setting to 0 those 1 elements whose row indices are duplicates, i.e., every 1 after the first in each row.
Since the OP is open to starting with a matrix rather than a data.frame, I'll do the same.
# dummy data
m <- matrix(sample(0:1, 1480L*1400L, TRUE, c(0.9, 0.1)), 1480L, 1400L)
# proposed solution
f1 <- function(m) {
  ones <- which(m == 1L)
  m[ones[duplicated((ones - 1L) %% nrow(m), nmax = nrow(m))]] <- 0L
  m
}
# Zheyuan Li's solution
f2 <- function(m) {
  j <- max.col(m, ties.method = "first")
  m[] <- 0L
  m[cbind(1:nrow(m), j)] <- 1L
  m
}
microbenchmark::microbenchmark(f1 = f1(m),
                               f2 = f2(m),
                               check = "identical")
#> Unit: milliseconds
#>  expr     min       lq     mean  median      uq     max neval
#>    f1  9.1457 11.45020 12.04258 11.9011 12.3529 37.6716   100
#>    f2 12.8424 14.92955 17.31811 15.3251 16.0550 43.6314   100
Zheyuan Li's suggestion to go with a sparse matrix is a good idea.
# convert to a memory-efficient nsparseMatrix
library(Matrix)
m1 <- as(Matrix(f1(m), dimnames = list(NULL, NULL), sparse = TRUE), "nsparseMatrix")
object.size(m)
#> 8288216 bytes
object.size(m1)
#> 12864 bytes
# proposed function to go directly to a sparse matrix
f3 <- function(m) {
  n <- nrow(m)
  ones <- which(m == 1L) - 1L
  i <- ones %% n
  idx <- which(!duplicated(i, nmax = n))
  sparseMatrix(i[idx], ones[idx] %/% n, dims = dim(m), index1 = FALSE, repr = "C")
}
# going directly to a sparse matrix using Zheyuan Li's solution
f4 <- function(m) {
  sparseMatrix(1:nrow(m), max.col(m, ties.method = "first"), dims = dim(m), repr = "C")
}
identical(m1, f3(m))
#> [1] TRUE
identical(m1, f4(m))
#> [1] TRUE
microbenchmark::microbenchmark(f1 = f1(m),
                               f3 = f3(m),
                               f4 = f4(m))
#> Unit: milliseconds
#>  expr    min      lq     mean  median       uq     max neval
#>    f1 9.1719 9.30715 11.12569 9.52300 11.92740 83.8518   100
#>    f3 7.4330 7.59875 12.62412 7.69610 11.08815 84.8291   100
#>    f4 8.9607 9.31115 14.01477 9.49415 11.44825 87.1577   100

For loop in R with no extra computation taking too long (Code efficiency)

I have a large data frame and a for loop over it is taking too long to compute. I've tried removing all computations just to time the bare loop, but the code is still inefficient. I'm new to R, but I think there should be a better way of coding my for loop.
If you could provide some guidance it would be appreciated.
My dataFrame has 2,772,807 obs of 6 variables.
Simplified code (Still takes long):
library("tictoc")
tic()
dataFlights <- read_delim("U.S._DOT_O&D_Monthly_Traffic_Report.tsv",
"\t", escape_double = FALSE, trim_ws = TRUE)
dataFlights["Connections"] = ""
pb <- txtProgressBar(min = 0, max = nrow(dataFlights), style = 3)
for (row in 1:nrow(dataFlights)) {
dataFlights[row,7] <- 1
setTxtProgressBar(pb, row)
}
close(pb)
toc()
Original Code:
#Reads DOT public flight information for 2017 & 2018,
#and computes the number of connections
#per route (Cp#1 or Cp#2) into a new column. Possible results 0,1, or 2 connections.
library("tictoc")
tic()
dataFlights <- read_delim("U.S._DOT_O&D_Monthly_Traffic_Report.tsv",
"\t", escape_double = FALSE, trim_ws = TRUE)
dataFlights["Connections"] = ""
pb <- txtProgressBar(min = 0, max = nrow(dataFlights), style = 3)
for (row in 1:nrow(dataFlights)) {
if(is.na(dataFlights[row,2]) & is.na(dataFlights[row,3])){
dataFlights[row,7] <- 0
} else if (is.na(dataFlights[row,2]) | is.na(dataFlights[row,3])) {
dataFlights[row,7] <- 1
} else {
dataFlights[row,7] <- 2
}
setTxtProgressBar(pb, row)
}
close(pb)
toc()
As indicated in the comments, this can be done effortlessly with ifelse
# data
set.seed(123)
n <- 1e+6
dataFlights <- data.frame(x1 = runif(n),
                          x2 = sample(c(runif(n/2), rep(NA, n/2)), n),
                          x3 = sample(c(runif(n/2), rep(NA, n/2)), n),
                          stringsAsFactors = FALSE)
# conditions
na_2 <- is.na(.subset2(dataFlights, 2))
na_3 <- is.na(.subset2(dataFlights, 3))
na_sum <- na_2 + na_3
# ifelse
dataFlights$x4 <- ifelse(na_sum == 2, 0, ifelse(na_sum == 1, 1, 2))
head(dataFlights)
#          x1        x2        x3 x4
# 1 0.2875775        NA        NA  0
# 2 0.7883051 0.4415287        NA  1
# 3 0.4089769        NA 0.3130298  1
# 4 0.8830174 0.3077688        NA  1
# 5 0.9404673        NA        NA  0
# 6 0.0455565 0.5718788        NA  1
where for simplicity I set column 4 as opposed to column 7.
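As an aside, since the desired value is just 2 minus the number of NAs, the nested ifelse could be replaced by simple arithmetic; a minimal sketch, reusing na_sum from above:
# 2 NAs -> 0, 1 NA -> 1, 0 NAs -> 2
dataFlights$x4 <- 2L - na_sum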
A few suggestions:
dataFlights["Connections"] = ""
In this piece, if you use NA instead of "", it will keep the data size smaller. For comparison, I created a 3,000,000 x 3 matrix to check sizes. With only one column different, the one with "" had size 268 Mb while the one with NA was only about 60 Mb. The smaller the object, the faster it is to index.
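A rough sketch of how one might check this size difference (the exact numbers will vary with the R version and the other columns):
nr <- 3e6
with_blank <- data.frame(a = runif(nr), b = runif(nr), flag = "")   # "" column
with_na    <- data.frame(a = runif(nr), b = runif(nr), flag = NA)   # NA column
format(object.size(with_blank), units = "Mb")
format(object.size(with_na), units = "Mb")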
pb <- txtProgressBar(min = 0, max = nrow(dataFlights), style = 3)
for (row in 1:nrow(dataFlights)) {
  dataFlights[row, 7] <- 1
  setTxtProgressBar(pb, row)
}
In each iteration, you are assigning 1 to a single matrix / data.frame cell, and that assignment is a computationally expensive step. For your example, this can be completely vectorized. Here are a few ways to build the 7th column and replace your for loop:
rowSums
col7.rowSums = rowSums(!is.na(dataFlights[, 2:3]))
sapply
col7.sapply = sapply(1:nrow(dataFlights), function(x) sum(!is.na(dataFlights[x, 2:3])))
apply
col7.apply = apply(!is.na(dataFlights[, 2:3]), 1, sum)
Microbenchmark
Unit: microseconds
     expr      min         lq        mean    median        uq        max neval
 for.loop 52604.86 56768.5590 58810.55595 58137.651 60064.056  81958.717   100
  rowSums    35.87    49.2225    61.23889    53.845    72.010    139.409   100
   sapply 49756.32 53131.1065 55778.95541 54414.455 56154.496 102558.473   100
    apply   997.21  1060.5380  1225.48577  1135.066  1254.936   3864.779   100
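The benchmark code itself isn't shown; a minimal sketch of how such a comparison could be set up (the data size used for the original timings is an assumption, and the explicit loop is run on a subset to keep it tractable):
library(microbenchmark)
small <- dataFlights[1:1000, ]   # subset so the cell-by-cell loop finishes quickly
microbenchmark(
  for.loop = for (row in 1:nrow(small)) small[row, "x4"] <- sum(!is.na(small[row, 2:3])),
  rowSums  = rowSums(!is.na(small[, 2:3])),
  sapply   = sapply(1:nrow(small), function(x) sum(!is.na(small[x, 2:3]))),
  apply    = apply(!is.na(small[, 2:3]), 1, sum),
  times = 10
)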

Alternative nested for loops for counting value occurrence in R dataframe

I am working on a large dataset; I want to count how many times two columns have the same values. Here is an example of the dataset:
id = rep(replicate(4, paste(sample(LETTERS, 3, replace=F), collapse="")), 12500)
names = rep(replicate(3125, paste(sample(letters, 5, replace=T), collapse="")), 16)
times = sample(c(3,6,24), 50000, replace = T)
df = data.frame(id=id, names=names, times=times)
count <- list()
ids <- as.vector(unique(df$id))
nms <- as.vector(unique(df$names))
for(i in 1:length(ids)){
  vec <- c()
  for(j in 1:length(nms)){
    vec[j] <- nrow(df[df$id == ids[i] & df$names == nms[j], ])
  }
  count[[i]] <- vec
}
My real data have dimensions of about 50000 x 10, and the id and name values are randomly scattered. Can anyone suggest a better way to handle this? My approach works but is too slow. dplyr or plyr methods?
Thanks,
EDIT:
short version of my dataframe:
id = rep(replicate(3, paste(sample(LETTERS, 3, replace=F), collapse="")), 5)
names = rep(replicate(3, paste(sample(letters, 5, replace=T), collapse="")), 5)
times = sample(c(3,6,24), 15, replace = T)
df = data.frame(id=id, names=names, times=times)
df
    id names times
1  DEW xxsre    24
2  QHY xkbhr    24
3  DQE tuyfk     6
4  DEW xxsre    24
5  QHY xkbhr    24
6  DQE tuyfk     3
7  DEW xxsre     3
8  QHY xkbhr    24
9  DQE tuyfk     3
10 DEW xxsre    24
11 QHY xkbhr    24
12 DQE tuyfk     3
13 DEW xxsre    24
14 QHY xkbhr     3
15 DQE tuyfk     3
output:
> count
[[1]]
[1] 5 0 0
[[2]]
[1] 0 5 0
[[3]]
[1] 0 0 5
Each list item corresponds to an id, and each vector holds the counts per name; in other words, they follow the order of as.vector(unique(df$id)) and as.vector(unique(df$names)) respectively.
You can use data.table, which is likely the fastest solution:
library(data.table)
# convert your dataset into a data.table
setDT(df)
output <- df[, .N, by = .(id, names)]
head(output)
> id names N
> 1: FYG vlrcd 4
> 2: FAL mjhhs 4
> 3: BZU rfnvc 4
> 4: HJA zhssf 4
> 5: FYG pxtne 4
> 6: FAL qgeqr 4
If you want the output to be a list, you can convert the output in different ways:
L1 <- as.list(as.data.frame(t(output))) # or
L2 <- split(output, list(output$id, output$names)) # or
L3 <- split(output, seq(nrow(output)))
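If the original list-of-count-vectors layout (one vector per id, one count per name) is required, a base R contingency table gives it directly; a small sketch (note that table() orders by factor level rather than by unique() order):
tab <- table(df$id, df$names)   # id x names matrix of counts
count <- lapply(seq_len(nrow(tab)), function(i) as.vector(tab[i, ]))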
Does this do what you want?
library(dplyr)
count <- df %>%
  group_by(id, names) %>%
  summarise(n = sum(times))
count
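Note that summarise(n = sum(times)) totals the times column; if a plain count of matching rows is wanted, as in the original nested loops, n() gives that instead:
count <- df %>%
  group_by(id, names) %>%
  summarise(n = n())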
Without using plyr and dplyr you can reduce computing time by 25%.
To keep computing time reasonable, I subsetted the first 1000 rows of your data.
library(microbenchmark)
id = rep(replicate(4, paste(sample(LETTERS, 3, replace=F), collapse="")), 12500)
names = rep(replicate(3125, paste(sample(letters, 5, replace=T), collapse="")), 16)
times = sample(c(3,6,24), 50000, replace = T)
df = data.frame(id=id, names=names, times=times)
df = df[1:1000,]
ids <- as.vector(unique(df$id))
nms <- as.vector(unique(df$names))
Then I define 3 functions: default, summation, and summation with preallocation.
default <- function(ids, nms, df){
  count <- list()
  for(i in 1:length(ids)){
    vec <- c()
    for(j in 1:length(nms)){
      vec[j] <- nrow(df[df$id == ids[i] & df$names == nms[j], ])
    }
    count[[i]] <- vec
  }
  count
}
summation <- function(ids, nms, df){
  count <- list()
  for(i in 1:length(ids)){
    vec <- c()
    for(j in 1:length(nms)){
      vec[j] <- sum(df$id == ids[i] & df$names == nms[j])
    }
    count[[i]] <- vec
  }
  count
}
summation_and_preallocation <- function(ids, nms, df){
  count <- list()
  for(i in 1:length(ids)){
    vec <- integer(length = length(nms))
    for(j in 1:length(nms)){
      vec[j] <- sum(df$id == ids[i] & df$names == nms[j])
    }
    count[[i]] <- vec
  }
  count
}
Tests with microbenchmark show:
m <- microbenchmark(default(ids, nms, df),
                    summation(ids, nms, df),
                    summation_and_preallocation(ids, nms, df),
                    times = 10)
Unit: milliseconds
                                      expr      min        lq      mean    median        uq       max neval
                     default(ids, nms, df) 994.5040 1012.1560 1040.7012 1042.5689 1072.4689 1074.8893    10
                   summation(ids, nms, df) 735.0831  740.6620  741.2254  742.1361  742.9321  743.7806    10
 summation_and_preallocation(ids, nms, df) 729.1192  733.0536  753.8661  736.8319  791.5001  804.2335    10
How does it compare with the dplyr solution from @Adrian?
dplyr_count(ids, nms, df) 3.154741 3.206819 49.06034 3.275624 3.701375 457.943 10
So about 200 times faster for dplyr!
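dplyr_count is not defined in the post; presumably it wraps @Adrian's dplyr answer above, roughly along these lines (the unused ids and nms arguments only mirror the benchmarked call):
library(dplyr)
dplyr_count <- function(ids, nms, df) {
  df %>%
    group_by(id, names) %>%
    summarise(n = sum(times))
}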

R: recode previous/following n observations

I have a dataframe of 0/1 dummy variables. Each dummy variable takes the value 1 only once. For each column, I would like to replace the n preceding/following observations, counting from the observation with the value 1, with a particular value (say 1).
So for single vector, with n=1:
c(0, 0, 1, 0, 0)
I would want to get
c(0, 1, 1, 1, 0)
What would be a good general approach with n columns, allowing for a different number of preceding/following observations to replace (e.g. n-1 before & n after)?
Thanks for help!
x <- c(0, 0, 1, 0, 0)
ind <- which(x == 1)
x[(ind - 1):(ind + 1)] <- 1
Another option:
f <- function(x, pre, post) {
  idx <- which.max(x)
  x[max(1, (idx - pre)):min(length(x), (idx + post))] <- 1
  x
}
Sample data:
df <- data.frame(x = c(0, 0, 1, 0, 0), y = c(0, 1, 0, 0, 0))
Application:
df[] <- lapply(df, f, pre=2, post=1)
#df
# x y
#1 1 1
#2 1 1
#3 1 1
#4 1 0
#5 0 0
What you can do is the following:
vec <- c(0, 0, 1, 0, 0)
sapply(1:length(vec), function(i) {
  minval <- max(0, i - 1)
  maxval <- min(i + 1, length(vec))
  return(sum(vec[minval:maxval]))
})
# [1] 0 1 1 1 0
Or to put it in a function (same code but a bit more compact)
f <- function(vec){
  sapply(1:length(vec), function(i)
    sum(vec[max(0, i-1):min(i+1, length(vec))]))
}
f(vec)
# [1] 0 1 1 1 0
Speedtest
To compare the different solutions, I quickly ran a benchmark using microbenchmark, and the winner is clearly @Shenglin's code.... Always nice to see simple solutions (as well as to see how complicated some (my) solutions can be).
fDavid <- function(vec){
  sapply(1:length(vec), function(i)
    sum(vec[max(0, i-1):min(i+1, length(vec))]))
}
fHeroka <- function(vec){
  res <- vec
  test <- which(vec == 1)
  # create indices to be replaced
  n <- 1 # variable n
  replace_indices <- c(test + (1:n), test - (1:n))
  # filter out negatives (may happen with larger n)
  replace_indices <- replace_indices[replace_indices > 0]
  # replace items in 'res' that need to be replaced with 1
  res[replace_indices] <- 1
  res
}
fShenglin <- function(vec){
  ind <- which(vec == 1)
  vec[(ind - 1):(ind + 1)] <- 1
  vec
}
vect <- sample(0:1, size = 1000, replace = T)
library(microbenchmark)
# note: fShenglin is passed without parentheses here, so only the symbol lookup
# is timed, which explains the near-zero timings and the warning below
microbenchmark(fHeroka(vect), fDavid(vect), fShenglin)
# Unit: nanoseconds
#          expr     min      lq       mean  median        uq     max neval cld
# fHeroka(vect)   38929   42999   54422.57   49546   61755.5  145451   100  a
#  fDavid(vect) 2463805 2577935 2875024.99 2696844 2849548.5 5994596   100   b
#     fShenglin       0       0     138.63       1     355.0    1063   100  a
# Warning message:
# In microbenchmark(fHeroka(vect), fDavid(vect), fShenglin) :
#   Could not measure a positive execution time for 30 evaluations.
This might be a start:
myv <- c(0, 0, 1, 0, 0)
#make a copy
res <- myv
#check where the ones are
test <- which(myv==1)
#create indices to be replaced
n=1 #variable n
replace_indices <- c(test+(1:n),test-(1:n))
#filter out negatives (may happen with larger n)
replace_indices <- replace_indices[replace_indices>0]
#replace items in 'res' that need to be replaced with 1
res[replace_indices] <- 1
res
> res
[1] 0 1 1 1 0
This could be a solution:
dat <- data.frame(x = c(0,0,1,0,0,0), y = c(0,0,0,1,0,0), z = c(0,1,0,0,0,0))
which_to_change <- data.frame(prev = c(2,2,1), foll = c(1,1,3))
for(i in 1:nrow(which_to_change)){
  dat[(which(dat[,i] == 1) - which_to_change[i,1]):(which(dat[,i] == 1) + which_to_change[i,2]), i] <- 1
}

Fill NA values with the trailing row value times a growth rate?

What would be a good way to populate NA values with the previous value times (1 + growth)?
df <- data.frame(
  year = 0:6,
  price1 = c(1.1, 2.1, 3.2, 4.8, NA, NA, NA),
  price2 = c(1.1, 2.1, 3.2, NA, NA, NA, NA)
)
growth <- .02
In this case, I would want the missing values in price1 to be filled with 4.8*1.02, 4.8*1.02^2, and 4.8*1.02^3. Similarly, I would want the missing values in price2 to be filled with 3.2*1.02, 3.2*1.02^2, 3.2*1.02^3, and 3.2*1.02^4.
I've tried this, but I think it needs to be set to repeat somehow (apply?):
library(dplyr)
df %>%
  mutate(price1 = ifelse(is.na(price1),
                         lag(price1) * (1 + growth),
                         price1))
I'm not using dplyr for anything else (yet), so something from base R or plyr or similar would be appreciated.
Assuming only trailing NAs:
NAgrow <- function(x, growth = 0.02) {
  isna <- is.na(x)
  lastval <- tail(x[!isna], 1)
  x[isna] <- lastval * (1 + growth)^seq(sum(isna))
  return(x)
}
If there are interior NA values as well this would get a little trickier.
Apply to all columns except the first:
df[-1] <- lapply(df[-1],NAgrow)
## year price1 price2
## 1 0 1.100000 1.100000
## 2 1 2.100000 2.100000
## 3 2 3.200000 3.200000
## 4 3 4.800000 3.264000
## 5 4 4.896000 3.329280
## 6 5 4.993920 3.395866
## 7 6 5.093798 3.463783
A compact base R solution can be obtained using Reduce:
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
replace(df, TRUE, lapply(df, Reduce, f = growthfun, acc = TRUE))
giving:
year price1 price2
1 0 1.100000 1.100000
2 1 2.100000 2.100000
3 2 3.200000 3.200000
4 3 4.800000 3.264000
5 4 4.896000 3.329280
6 5 4.993920 3.395866
7 6 5.093798 3.463783
Note: The data in the question has no non-trailing NA values but if there were some then we could use na.fill from zoo to first replace the trailing NAs with a special value, such as NaN, and look for it instead of NA:
library(zoo)
DF <- as.data.frame(na.fill(df, c(NA, NA, NaN)))
growthfun <- function(x, y) if (is.nan(y)) (1+growth)*x else y
replace(DF, TRUE, lapply(DF, Reduce, f = growthfun, acc = TRUE))
The following solution based on rle works with NA in any position and does not rely on looping to fill in the missing values:
NAgrow.rle <- function(x) {
  if (is.na(x[1])) stop("Can't have NA at beginning")
  r <- rle(is.na(x))
  na.loc <- which(r$values)
  b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
  x[is.na(x)] <- ave(x[b], b, FUN = function(y) y[1]*(1+growth)^seq_along(y))
  x
}
df[,-1] <- lapply(df[,-1], NAgrow.rle)
# year price1 price2
# 1 0 1.100000 1.100000
# 2 1 2.100000 2.100000
# 3 2 3.200000 3.200000
# 4 3 4.800000 3.264000
# 5 4 4.896000 3.329280
# 6 5 4.993920 3.395866
# 7 6 5.093798 3.463783
I'll drop in two additional solutions using for loops, one in base R and one in Rcpp:
NAgrow.for <- function(x) {
  for (i in which(is.na(x))) {
    x[i] <- x[i-1] * (1 + growth)
  }
  x
}
library(Rcpp)
cppFunction(
  "NumericVector NAgrowRcpp(NumericVector x, double growth) {
     const int n = x.size();
     NumericVector y(x);
     for (int i = 1; i < n; ++i) {
       if (R_IsNA(x[i])) {
         y[i] = (1.0 + growth) * y[i-1];
       }
     }
     return y;
   }")
The solutions based on rle (crimson and josilber.rle) take about twice as long as the simple solution based on a for loop (josilber.for), and as expected the Rcpp solution is the fastest, running in about 0.002 seconds.
set.seed(144)
big.df <- data.frame(ID = 1:100000,
                     price1 = sample(c(1:10, NA), 100000, replace = TRUE),
                     price2 = sample(c(1:10, NA), 100000, replace = TRUE))
crimson <- function(df) apply(df[,-1], 2, function(x){
  if(sum(is.na(x)) == 0){return(x)}
  ## updated with optimized portion from @josilber
  r <- rle(is.na(x))
  na.loc <- which(r$values)
  b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
  lastValIs <- 1:length(x)
  lastValIs[is.na(x)] <- b
  x[is.na(x)] <-
    sapply(which(is.na(x)), function(i){
      return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
    })
  return(x)
})
ggrothendieck <- function(df) {
  growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
  lapply(df[,-1], Reduce, f = growthfun, acc = TRUE)
}
josilber.rle <- function(df) lapply(df[,-1], NAgrow.rle)
josilber.for <- function(df) lapply(df[,-1], NAgrow.for)
josilber.rcpp <- function(df) lapply(df[,-1], NAgrowRcpp, growth = growth)
library(microbenchmark)
microbenchmark(crimson(big.df), ggrothendieck(big.df), josilber.rle(big.df),
               josilber.for(big.df), josilber.rcpp(big.df))
# Unit: milliseconds
#                   expr        min         lq       mean     median         uq         max neval
#        crimson(big.df)  98.447546 131.063713 161.494366 152.477661 183.175840  379.643222   100
#  ggrothendieck(big.df) 437.015693 667.760401 822.530745 817.864707 925.974019 1607.352929   100
#   josilber.rle(big.df)  59.678527 115.220519 132.874030 127.476340 151.665657  262.003756   100
#   josilber.for(big.df)  21.076516  57.479169  73.860913  72.959536  84.846912  178.412591   100
#  josilber.rcpp(big.df)   1.248793   1.894723   2.373469   2.190545   2.697246    5.646878   100
It looks like dplyr can't handle access to newly assigned lag values. Here is a solution that should work even if the NAs are in the middle of a column.
df <- apply(
  df, 2, function(x){
    if(sum(is.na(x)) == 0){return(x)}
    ## updated with optimized portion from @josilber
    r <- rle(is.na(x))
    na.loc <- which(r$values)
    b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
    lastValIs <- 1:length(x)
    lastValIs[is.na(x)] <- b
    x[is.na(x)] <-
      sapply(which(is.na(x)), function(i){
        return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
      })
    return(x)
  })
You can try a recursive function like this:
test <- function(x, n) {
  if (!is.na(df[x, n])) return(df[x, n])
  else return(test(x - 1, n) * (1 + growth))
}
a <- 1:nrow(df)
lapply(a, FUN = function(i) test(i, 2))
unlist(lapply(a, FUN = function(i) test(i, 2)))
[1] 1.100000 2.100000 3.200000 4.800000 4.896000 4.993920 5.093798
