What would be a good way to populate NA values with the previous value times (1 + growth)?
df <- data.frame(
year = 0:6,
price1 = c(1.1, 2.1, 3.2, 4.8, NA, NA, NA),
price2 = c(1.1, 2.1, 3.2, NA, NA, NA, NA)
)
growth <- .02
In this case, I would want the missing values in price1 to be filled with 4.8*1.02, 4.8*1.02^2, and 4.8*1.02^3. Similarly, I would want the missing values in price2 to be filled with 3.2*1.02, 3.2*1.02^2, 3.2*1.02^3, and 3.2*1.02^4.
I've tried this, but I think it needs to be set to repeat somehow (apply?):
library(dplyr)
df %>%
mutate(price1 = ifelse(is.na(price1),
lag(price1) * (1 + growth), price1
))
I'm not using dplyr for anything else (yet), so something from base R or plyr or similar would be appreciated.
Assuming only trailing NAs:
NAgrow <- function(x, growth=0.02) {
  isna <- is.na(x)
  ## last non-NA value
  lastval <- tail(x[!isna], 1)
  ## grow it geometrically across the trailing NA positions
  x[isna] <- lastval*(1+growth)^seq(sum(isna))
  return(x)
}
If there are interior NA values as well this would get a little trickier.
Apply to all columns except the first:
df[-1] <- lapply(df[-1],NAgrow)
## year price1 price2
## 1 0 1.100000 1.100000
## 2 1 2.100000 2.100000
## 3 2 3.200000 3.200000
## 4 3 4.800000 3.264000
## 5 4 4.896000 3.329280
## 6 5 4.993920 3.395866
## 7 6 5.093798 3.463783
A compact base R solution can be obtained using Reduce:
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
replace(df, TRUE, lapply(df, Reduce, f = growthfun, accumulate = TRUE))
giving:
year price1 price2
1 0 1.100000 1.100000
2 1 2.100000 2.100000
3 2 3.200000 3.200000
4 3 4.800000 3.264000
5 4 4.896000 3.329280
6 5 4.993920 3.395866
7 6 5.093798 3.463783
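To see the mechanics, Reduce with accumulate = TRUE walks down each column carrying the last computed value forward, so every NA is replaced by the grown previous result. A small illustration, assuming growth and growthfun as defined above:
Reduce(growthfun, c(1, 2, NA, NA), accumulate = TRUE)
## [1] 1.0000 2.0000 2.0400 2.0808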
Note: The data in the question has no non-trailing NA values, but if there were some, we could use na.fill from zoo to first replace the trailing NAs with a special value, such as NaN, and look for it instead of NA:
library(zoo)
DF <- as.data.frame(na.fill(df, c(NA, NA, NaN)))
growthfun <- function(x, y) if (is.nan(y)) (1+growth)*x else y
replace(DF, TRUE, lapply(DF, Reduce, f = growthfun, accumulate = TRUE))
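A small sketch of what the na.fill step does, assuming the fill argument is interpreted as c(left, interior, right): leading and interior NAs stay NA while trailing NAs become NaN, so the growth rule fires only on the trailing run:
library(zoo)
na.fill(c(1, NA, 3, NA, NA), c(NA, NA, NaN))
## [1]   1  NA   3 NaN NaN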
The following solution based on rle works with NA in any position and does not rely on looping to fill in the missing values:
NAgrow.rle <- function(x) {
  if (is.na(x[1])) stop("Can't have NA at beginning")
  ## run-length encode the NA pattern and locate the runs of NAs
  r <- rle(is.na(x))
  na.loc <- which(r$values)
  ## for each NA, b holds the index of the last non-NA value before its run
  b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
  ## within each run, grow that value geometrically
  x[is.na(x)] <- ave(x[b], b, FUN=function(y) y[1]*(1+growth)^seq_along(y))
  x
}
df[,-1] <- lapply(df[,-1], NAgrow.rle)
# year price1 price2
# 1 0 1.100000 1.100000
# 2 1 2.100000 2.100000
# 3 2 3.200000 3.200000
# 4 3 4.800000 3.264000
# 5 4 4.896000 3.329280
# 6 5 4.993920 3.395866
# 7 6 5.093798 3.463783
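Since the fill is computed per run of NAs, interior gaps are handled as well; a quick check with growth = 0.02:
NAgrow.rle(c(1, NA, NA, 5, NA))
## [1] 1.0000 1.0200 1.0404 5.0000 5.1000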
I'll drop in two additional solutions using for loops, one in base R and one in Rcpp:
NAgrow.for <- function(x) {
  ## each NA is filled from the previous (already updated) element
  for (i in which(is.na(x))) {
    x[i] <- x[i-1] * (1+growth)
  }
  x
}
library(Rcpp)
cppFunction(
"NumericVector NAgrowRcpp(NumericVector x, double growth) {
const int n = x.size();
NumericVector y(x);
for (int i=1; i < n; ++i) {
if (R_IsNA(x[i])) {
y[i] = (1.0 + growth) * y[i-1];
}
}
return y;
}")
The solutions based on rle (crimson and josilber.rle) take about twice as long as the simple solution based on a for loop (josilber.for), and as expected the Rcpp solution is the fastest, running in about 0.002 seconds.
set.seed(144)
big.df <- data.frame(ID=1:100000,
price1=sample(c(1:10, NA), 100000, replace=TRUE),
price2=sample(c(1:10, NA), 100000, replace=TRUE))
crimson <- function(df) apply(df[,-1], 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from @josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValIs[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
ggrothendieck <- function(df) {
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
lapply(df[,-1], Reduce, f = growthfun, accumulate = TRUE)
}
josilber.rle <- function(df) lapply(df[,-1], NAgrow.rle)
josilber.for <- function(df) lapply(df[,-1], NAgrow.for)
josilber.rcpp <- function(df) lapply(df[,-1], NAgrowRcpp, growth=growth)
library(microbenchmark)
microbenchmark(crimson(big.df), ggrothendieck(big.df), josilber.rle(big.df), josilber.for(big.df), josilber.rcpp(big.df))
# Unit: milliseconds
# expr min lq mean median uq max neval
# crimson(big.df) 98.447546 131.063713 161.494366 152.477661 183.175840 379.643222 100
# ggrothendieck(big.df) 437.015693 667.760401 822.530745 817.864707 925.974019 1607.352929 100
# josilber.rle(big.df) 59.678527 115.220519 132.874030 127.476340 151.665657 262.003756 100
# josilber.for(big.df) 21.076516 57.479169 73.860913 72.959536 84.846912 178.412591 100
# josilber.rcpp(big.df) 1.248793 1.894723 2.373469 2.190545 2.697246 5.646878 100
It looks like dplyr can't access newly assigned lag values. Here is a solution that should work even if the NAs are in the middle of a column.
df <- apply(
df, 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from @josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValIs[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
You can try a recursive function like this:
test <- function(x,n) {
if (!is.na(df[x,n])) return (df[x,n])
else return (test(x-1,n)*(1+growth))
}
a <- 1:nrow(df)
unlist(lapply(a, FUN = function(i) test(i, 2)))
[1] 1.100000 2.100000 3.200000 4.800000 4.896000 4.993920 5.093798
Related
I have two matrices, one is generated out of the other by deleting some rows. For example:
m = matrix(1:18, 6, 3)
m1 = m[c(-1, -3, -6),]
Suppose I do not know which rows in m were eliminated to create m1; how can I find out by comparing the two matrices? The result I want looks like this:
1, 3, 6
The actual matrix I am dealing with is very big, so I was wondering if there is an efficient way of doing this.
Here are some approaches:
1) If we can assume that there are no duplicated rows in m -- this is the case in the example in the question -- then:
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
## [1] 1 3 6
2) Transpose m and m1 giving tm and tm1 since it is more efficient to work on columns than rows.
Define match_indexes(i) which returns a vector r such that each row in m[r, ] matches m1[i, ].
Apply that to each i in 1:n1 and remove the result from 1:n.
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
## [1] 1 3 6
3) Calculate an interaction vector for each matrix and then use setdiff and finally match to get the indexes:
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
## [1] 1 3 6
Added: If there can be duplicates in m, then (1) and (3) will only return the first of any multiply-occurring row in m that is not in m1.
m <- matrix(1:18, 6, 3)
m1 <- m[c(2, 4, 5),]
m <- rbind(m, m[1:2, ])
# 1
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
## 1 3 6
# 2
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
## 1 3 6 7
# 3
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
## 1 3 6
A possible way is to represent each row as a string:
x1 <- apply(m, 1, paste0, collapse = ';')
x2 <- apply(m1, 1, paste0, collapse = ';')
which(!x1 %in% x2)
# [1] 1 3 6
Some benchmarks with a large matrix, using my solution and G. Grothendieck's solutions:
set.seed(123)
m <- matrix(rnorm(20000 * 5000), nrow = 20000)
m1 <- m[-sample.int(20000, 1000), ]
system.time({
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
})
# user system elapsed
# 339.888 2.368 342.204
system.time({
x1 <- apply(m, 1, paste0, collapse = ';')
x2 <- apply(m1, 1, paste0, collapse = ';')
which(!x1 %in% x2)
})
# user system elapsed
# 395.428 0.568 395.955
system.time({
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
})
# > 15 min, did not finish
system.time({
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
})
# ran out of memory; my 32 GB RAM machine crashed.
We can also use do.call
which(!do.call(paste, as.data.frame(m)) %in% do.call(paste, as.data.frame(m1)))
#[1] 1 3 6
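For intuition, do.call(paste, as.data.frame(m)) pastes the columns element-wise, producing one key string per row, which can then be compared with %in%:
m <- matrix(1:18, 6, 3)
head(do.call(paste, as.data.frame(m)), 2)
## [1] "1 7 13" "2 8 14"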
I have a data.frame that has several variables with zero values. I need to construct an extra variable that would return the combination of variables that are not zero for each observation. E.g.
df <- data.frame(firm = c("firm1", "firm2", "firm3", "firm4", "firm5"),
A = c(0, 0, 0, 1, 2),
B = c(0, 1, 0, 42, 0),
C = c(1, 1, 0, 0, 0))
Now I would like to generate the new variable:
df$varCombination <- c("C", "B-C", NA, "A-B", "A")
I thought up something like this, which obviously did not work:
for (i in 1:nrow(df)){
df$varCombination[i] <- paste(names(df[i,2:ncol(df) & > 0]), collapse = "-")
}
This could probably be solved easily using apply(df, 1, fun), but here is an attempt to solve it column-wise instead of row-wise for performance's sake (I once saw something similar done by @alexis_laz but can't find it right now).
## Create a logical matrix
tmp <- df[-1] != 0
## or tmp <- sapply(df[-1], `!=`, 0)
## Preallocate result
res <- rep(NA, nrow(tmp))
## Run per column instead of per row
for(j in colnames(tmp)){
res[tmp[, j]] <- paste(res[tmp[, j]], j, sep = "-")
}
## Remove the pre-allocated `NA` values from non-NA entries
gsub("NA-", "", res, fixed = TRUE)
# [1] "C" "B-C" NA "A-B" "A"
Some benchmarks on a bigger data set
set.seed(123)
BigDF <- as.data.frame(matrix(sample(0:1, 1e4, replace = TRUE), ncol = 10))
library(microbenchmark)
MM <- function(df) {
var_names <- names(df)[-1]
res <- character(nrow(df))
for (i in 1:nrow(df)){
non_zero_names <- var_names[df[i, -1] > 0]
res[i] <- paste(non_zero_names, collapse = '-')
}
res
}
ZX <- function(df) {
res <-
apply(df[,2:ncol(df)]>0, 1,
function(i)paste(colnames(df[, 2:ncol(df)])[i], collapse = "-"))
res[res == ""] <- NA
res
}
DA <- function(df) {
tmp <- df[-1] != 0
res <- rep(NA, nrow(tmp))
for(j in colnames(tmp)){
res[tmp[, j]] <- paste(res[tmp[, j]], j, sep = "-")
}
gsub("NA-", "", res, fixed = TRUE)
}
microbenchmark(MM(BigDF), ZX(BigDF), DA(BigDF))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# MM(BigDF) 239.36704 248.737408 253.159460 252.177439 255.144048 289.340528 100 c
# ZX(BigDF) 35.83482 37.617473 38.295425 38.022897 38.357285 76.619853 100 b
# DA(BigDF) 1.62682 1.662979 1.734723 1.735296 1.761695 2.725659 100 a
Using apply:
# paste column names
df$varCombination <-
apply(df[,2:ncol(df)]>0, 1,
function(i)paste(colnames(df[, 2:ncol(df)])[i], collapse = "-"))
# convert blank to NA
df$varCombination[df$varCombination == ""] <- NA
# result
df
# firm A B C varCombination
# 1 firm1 0 0 1 C
# 2 firm2 0 1 1 B-C
# 3 firm3 0 0 0 <NA>
# 4 firm4 1 42 0 A-B
# 5 firm5 2 0 0 A
You had the right idea but the logical comparison in your loop wasn't correct.
I've attempted to keep the code fairly similar to what you had before; this should work:
var_names <- names(df)[-1]
df$varCombination <- character(nrow(df))
for (i in 1:nrow(df)){
non_zero_names <- var_names[df[i, -1] > 0]
df$varCombination[i] <- paste(non_zero_names, collapse = '-')
}
> df
firm A B C varCombination
1 firm1 0 0 1 C
2 firm2 0 1 1 B-C
3 firm3 0 0 0
4 firm4 1 42 0 A-B
5 firm5 2 0 0 A
I am working on a large dataset; I want to count how many times two columns have the same values. Here is an example of the dataset:
id = rep(replicate(4, paste(sample(LETTERS, 3, replace=F), collapse="")), 12500)
names = rep(replicate(3125, paste(sample(letters, 5, replace=T), collapse="")), 16)
times = sample(c(3,6,24), 50000, replace = T)
df = data.frame(id=id, names=names, times=times)
count <- list()
ids <- as.vector(unique(df$id))
nms <- as.vector(unique(df$names))
for(i in 1:length(ids)){
vec <- c()
for(j in 1:length(nms)){
vec[j] <- nrow(df[df$id == ids[i] & df$names == nms[j], ])
}
count[[i]] <- vec
}
My real data has dimensions of about 50000 x 10, and the id and names fields are randomly scattered. Can anyone suggest a better way to handle this? My approach works but is too slow. dplyr or plyr methods?
Thanks,
EDIT:
short version of my dataframe:
id = rep(replicate(3, paste(sample(LETTERS, 3, replace=F), collapse="")), 5)
names = rep(replicate(3, paste(sample(letters, 5, replace=T), collapse="")), 5)
times = sample(c(3,6,24), 15, replace = T)
df = data.frame(id=id, names=names, times=times)
df
id names times
1 DEW xxsre 24
2 QHY xkbhr 24
3 DQE tuyfk 6
4 DEW xxsre 24
5 QHY xkbhr 24
6 DQE tuyfk 3
7 DEW xxsre 3
8 QHY xkbhr 24
9 DQE tuyfk 3
10 DEW xxsre 24
11 QHY xkbhr 24
12 DQE tuyfk 3
13 DEW xxsre 24
14 QHY xkbhr 3
15 DQE tuyfk 3
output:
> count
[[1]]
[1] 5 0 0
[[2]]
[1] 0 5 0
[[3]]
[1] 0 0 5
each list item corresponds to an id, and the vector within it holds the counts per name; in other words, as.vector(unique(df$id)) and as.vector(unique(df$names)) respectively.
You can use data.table, which is likely the fastest solution:
library(data.table)
# convert your dataset into a data.table
setDT(df)
output <- df[, .N, by = .(id, names)]
head(output)
> id names N
> 1: FYG vlrcd 4
> 2: FAL mjhhs 4
> 3: BZU rfnvc 4
> 4: HJA zhssf 4
> 5: FYG pxtne 4
> 6: FAL qgeqr 4
If you want the output to be a list, you can convert the output in different ways:
L1 <- as.list(as.data.frame(t(output))) # or
L2 <- split(output, list(output$id, output$names)) # or
L3 <- split(output, seq(nrow(output)))
Does this do what you want?
library(dplyr)
count <- df %>%
group_by(id, names) %>%
summarise(n=sum(times))
count
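Note that sum(times) totals the times column within each group. If the goal is the number of rows per (id, names) pair, the usual dplyr idiom would be n() instead (a small variant, not part of the answer above):
count <- df %>%
group_by(id, names) %>%
summarise(n = n())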
Without using plyr or dplyr, you can reduce computing time by 25%.
To keep the computing time reasonable, I subsetted the first 1000 rows of your data.
library(microbenchmark)
id = rep(replicate(4, paste(sample(LETTERS, 3, replace=F), collapse="")), 12500)
names = rep(replicate(3125, paste(sample(letters, 5, replace=T), collapse="")), 16)
times = sample(c(3,6,24), 50000, replace = T)
df = data.frame(id=id, names=names, times=times)
df = df[1:1000,]
ids <- as.vector(unique(df$id))
nms <- as.vector(unique(df$names))
Then I define 3 functions: default, summation, and summation with preallocation.
default<-function(ids,nms,df){
count <- list()
for(i in 1:length(ids)){
vec <- c()
for(j in 1:length(nms)){
vec[j] <- nrow(df[df$id == ids[i] & df$names == nms[j], ])
}
count[[i]] <- vec
}
}
summation<-function(ids,nms,df){
count <- list()
for(i in 1:length(ids)){
vec <- c()
for(j in 1:length(nms)){
vec[j] <- sum(df$id == ids[i] & df$names == nms[j])
}
count[[i]] <- vec
}
}
summation_and_preallocation<-function(ids,nms,df){
count <- list()
for(i in 1:length(ids)){
vec <- integer(length = length(nms))
for(j in 1:length(nms)){
vec[j] <- sum(df$id == ids[i] & df$names == nms[j])
}
count[[i]] <- vec
}
}
Tests with microbenchmark show:
m<-microbenchmark(default(ids,nms,df),summation(ids,nms,df),summation_and_preallocation(ids,nms,df),times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
default(ids, nms, df) 994.5040 1012.1560 1040.7012 1042.5689 1072.4689 1074.8893 10
summation(ids, nms, df) 735.0831 740.6620 741.2254 742.1361 742.9321 743.7806 10
summation_and_preallocation(ids, nms, df) 729.1192 733.0536 753.8661 736.8319 791.5001 804.2335 10
How does it compare with the dplyr solution from @Adrian?
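The dplyr_count wrapper isn't defined in the answer; presumably it wraps @Adrian's solution from above to match the signature used here, something along these lines (a hypothetical reconstruction):
library(dplyr)
dplyr_count <- function(ids, nms, df) {
df %>%
group_by(id, names) %>%
summarise(n = sum(times))
}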
dplyr_count(ids, nms, df) 3.154741 3.206819 49.06034 3.275624 3.701375 457.943 10
So about 200 times faster for dplyr!
I am looking for ways to speed up my code. I am looking into the apply/*ply methods as well as data.table. Unfortunately, I am running into problems.
Here is a small sample data:
ids1 <- c(1, 1, 1, 1, 2, 2, 2, 2)
ids2 <- c(1, 2, 3, 4, 1, 2, 3, 4)
chars1 <- c("aa", " bb ", "__cc__", "dd ", "__ee", NA,NA, "n/a")
chars2 <- c("vv", "_ ww_", " xx ", "yy__", " zz", NA, "n/a", "n/a")
data <- data.frame(col1 = ids1, col2 = ids2,
col3 = chars1, col4 = chars2,
stringsAsFactors = FALSE)
Here is a solution using loops:
library("plyr")
cols_to_fix <- c("col3","col4")
for (i in 1:length(cols_to_fix)) {
data[,cols_to_fix[i]] <- gsub("_", "", data[,cols_to_fix[i]])
data[,cols_to_fix[i]] <- gsub(" ", "", data[,cols_to_fix[i]])
data[,cols_to_fix[i]] <- ifelse(data[,cols_to_fix[i]]=="n/a", NA, data[,cols_to_fix[i]])
}
I initially looked at ddply, but some methods I want to use only take vectors. Hence, I cannot figure out how to do ddply across just certain columns one-by-one.
Also, I have been looking at laply, but I want to return the original data.frame with the changes. Can anyone help me? Thank you.
Based on the suggestions from earlier, here is what I tried to use from the plyr package.
Option 1:
data[,cols_to_fix] <- aaply(data[,cols_to_fix],2, function(x){
x <- gsub("_", "", x,perl=TRUE)
x <- gsub(" ", "", x,perl=TRUE)
x <- ifelse(x=="n/a", NA, x)
},.progress = "text",.drop = FALSE)
Option 2:
data[,cols_to_fix] <- alply(data[,cols_to_fix],2, function(x){
x <- gsub("_", "", x,perl=TRUE)
x <- gsub(" ", "", x,perl=TRUE)
x <- ifelse(x=="n/a", NA, x)
},.progress = "text")
Option 3:
data[,cols_to_fix] <- adply(data[,cols_to_fix],2, function(x){
x <- gsub("_", "", x,perl=TRUE)
x <- gsub(" ", "", x,perl=TRUE)
x <- ifelse(x=="n/a", NA, x)
},.progress = "text")
None of these are giving me the correct answer.
apply works great, but my data is very large and the progress bars from the plyr package would be very nice. Thanks again.
Here's a data.table solution using set.
require(data.table)
DT <- data.table(data)
for (j in cols_to_fix) {
set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
}
DT
# col1 col2 col3 col4
# 1: 1 1 aa vv
# 2: 1 2 bb ww
# 3: 1 3 cc xx
# 4: 1 4 dd yy
# 5: 2 1 ee zz
# 6: 2 2 NA NA
# 7: 2 3 NA NA
# 8: 2 4 NA NA
The first line reads: set in DT, for all rows (i=NULL) and column j, the value gsub(...).
The second line reads: set in DT, at the rows i where the condition holds and column j, the value NA_character_.
Note: Using PCRE (perl=TRUE) gives a nice speed-up, especially on bigger vectors.
Here is a data.table solution, which should be faster if your table is large.
The concept of := is an "update" of the columns. I believe that because of this you aren't copying the table internally again, as a "normal" data.frame solution would.
require(data.table)
DT <- data.table(data)
fxn = function(col) {
col = gsub("[ _]", "", col, perl = TRUE)
col[which(col == "n/a")] <- NA_character_
col
}
cols = c("col3", "col4");
# lapply your function
DT[, (cols) := lapply(.SD, fxn), .SDcols = cols]
print(DT)
No need for loops (for or *ply):
tmp <- gsub("[_ ]", "", as.matrix(data[,cols_to_fix]), perl=TRUE)
tmp[tmp=="n/a"] <- NA
data[,cols_to_fix] <- tmp
Benchmarks
I only benchmark Arun's data.table solution and my matrix solution. I assume that many columns need to be fixed.
Benchmark code:
options(stringsAsFactors=FALSE)
set.seed(45)
K <- 1000; N <- 1e5
foo <- function(K) paste(sample(c(letters, "_", " "), 8, replace=TRUE), collapse="")
bar <- function(K) replicate(K, foo(), simplify=TRUE)
data <- data.frame(id1=sample(5, K, TRUE),
id2=sample(5, K, TRUE)
)
data <- cbind(data, matrix(sample(bar(K), N, TRUE), ncol=N/K))
cols_to_fix <- as.character(seq_len(N/K))
library(data.table)
benchfun <- function() {
time1 <- system.time({
DT <- data.table(data)
for (j in cols_to_fix) {
set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
}
})
data2 <- data
time2 <- system.time({
tmp <- gsub("[_ ]", "", as.matrix(data2[,cols_to_fix]), perl=TRUE)
tmp[tmp=="n/a"] <- NA
data2[,cols_to_fix] <- tmp
})
list(identical= identical(as.data.frame(DT), data2),
data.table_timing= time1[[3]],
matrix_timing=time2[[3]])
}
replicate(3, benchfun())
Benchmark results:
#100 columns to fix, nrow=1e5
# [,1] [,2] [,3]
#identical TRUE TRUE TRUE
#data.table_timing 6.001 5.571 5.602
#matrix_timing 17.906 17.21 18.343
#1000 columns to fix, nrow=1e4
# [,1] [,2] [,3]
#identical TRUE TRUE TRUE
#data.table_timing 4.509 4.574 4.857
#matrix_timing 13.604 14.219 13.234
#1000 columns to fix, nrow=100
# [,1] [,2] [,3]
#identical TRUE TRUE TRUE
#data.table_timing 0.052 0.052 0.055
#matrix_timing 0.134 0.128 0.127
#100 columns to fix, nrow=1e5 and including
#data1 <- as.data.frame(DT) in the timing
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#identical TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#data.table_timing 5.642 5.58 5.762 5.382 5.419 5.633 5.508 5.578 5.634 5.397
#data.table_returnDF_timing 5.973 5.808 5.817 5.705 5.736 5.841 5.759 5.833 5.689 5.669
#matrix_timing 20.89 20.3 19.988 20.271 19.177 19.676 20.836 20.098 20.005 19.409
data.table is faster only by a factor of three. This advantage could probably be even smaller if we decided to change the data structure (as the data.table solution does) and keep it a matrix.
I think you can do this with regular old apply, which will call your cleanup function on each column (margin=2):
fxn = function(col) {
col <- gsub("_", "", col)
col <- gsub(" ", "", col)
col <- ifelse(col=="n/a", NA, col)
return(col)
}
data[,cols_to_fix] <- apply(data[,cols_to_fix], 2, fxn)
data
# col1 col2 col3 col4
# 1 1 1 aa vv
# 2 1 2 bb ww
# 3 1 3 cc xx
# 4 1 4 dd yy
# 5 2 1 ee zz
# 6 2 2 <NA> <NA>
# 7 2 3 <NA> <NA>
# 8 2 4 <NA> <NA>
Edit: it sounds like you're requiring the use of the plyr package. I'm not an expert in plyr, but this seemed to work:
library(plyr)
data[,cols_to_fix] <- t(laply(data[,cols_to_fix], fxn))
Here's a benchmark of all the different answers:
First, all the answers as separate functions:
1) Arun's
arun <- function(data, cols_to_fix) {
DT <- data.table(data)
for (j in cols_to_fix) {
set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
}
return(DT)
}
2) Martin's
martin <- function(data, cols) {
DT <- data.table(data)
colfun = function(col) {
col <- gsub("_", "", col)
col <- gsub(" ", "", col)
col <- ifelse(col=="n/a", NA, col)
}
DT[, (cols) := lapply(.SD, colfun), .SDcols = cols]
return(DT)
}
3) Roland's
roland <- function(data, cols_to_fix) {
tmp <- gsub("[_ ]", "", as.matrix(data[,cols_to_fix]))
tmp[tmp=="n/a"] <- NA
data[,cols_to_fix] <- tmp
return(data)
}
4) BrodieG's
brodieg <- function(data, cols_to_fix) {
fix_fun <- function(x) gsub("(_| )", "", ifelse(x == "n/a", NA_character_, x))
data[, cols_to_fix] <- apply(data[, cols_to_fix], 2, fix_fun)
return(data)
}
5) Josilber's
josilber <- function(data, cols_to_fix) {
colfun2 <- function(col) {
col <- gsub("_", "", col)
col <- gsub(" ", "", col)
col <- ifelse(col=="n/a", NA, col)
return(col)
}
data[,cols_to_fix] <- apply(data[,cols_to_fix], 2, colfun2)
return(data)
}
Next, the benchmarking function. We'll run it 3 times and take the minimum of the runs (this removes cache effects) as the runtime:
bench <- function(data, cols_to_fix) {
ans <- c(
system.time(arun(data, cols_to_fix))["elapsed"],
system.time(martin(data, cols_to_fix))["elapsed"],
system.time(roland(data, cols_to_fix))["elapsed"],
system.time(brodieg(data, cols_to_fix))["elapsed"],
system.time(josilber(data, cols_to_fix))["elapsed"]
)
}
Finally, on (slightly) big data with just 2 cols to fix (as in the OP's example here):
require(data.table)
set.seed(45)
K <- 1000; N <- 1e5
foo <- function(K) paste(sample(c(letters, "_", " "), 8, replace=TRUE), collapse="")
bar <- function(K) replicate(K, foo(), simplify=TRUE)
data <- data.frame(id1=sample(5, N, TRUE),
id2=sample(5, N, TRUE),
col3=sample(bar(K), N, TRUE),
col4=sample(bar(K), N, TRUE)
)
rown <- c("arun", "martin", "roland", "brodieg", "josilber")
coln <- paste("run", 1:3, sep="")
cols_to_fix <- c("col3","col4")
ans <- matrix(0L, nrow=5L, ncol=3L)
for (i in 1:3) {
print(i)
ans[, i] <- bench(data, cols_to_fix)
}
rownames(ans) <- rown
colnames(ans) <- coln
# run1 run2 run3
# arun 0.149 0.140 0.142
# martin 0.643 0.629 0.621
# roland 1.741 1.708 1.761
# brodieg 1.926 1.919 1.899
# josilber 2.067 2.041 2.162
The apply version is the way to go. Looks like @josilber came up with the same answer, but this one is slightly different (note the regexp).
fix_fun <- function(x) gsub("(_| )", "", ifelse(x == "n/a", NA_character_, x))
data[, cols_to_fix] <- apply(data[, cols_to_fix], 2, fix_fun)
More importantly, generally you want to use ddply and data.table when you want to do split-apply-combine analysis. In this case, all your data belongs to the same group (there aren't any subgroups you're doing anything different with), so you might as well use apply.
The 2 at the center of the apply statement means we want to subset the input by the 2nd dimension, and pass the result (in this case vectors, each representing a column from your data frame in cols_to_fix) to the function that does the work. apply then re-assembles the result, and we assign it back to the columns in cols_to_fix. If we had used 1 instead, apply would have passed the rows in our data frame to the function. Here is the result:
data
# col1 col2 col3 col4
# 1 1 1 aa vv
# 2 1 2 bb ww
# 3 1 3 cc xx
# 4 1 4 dd yy
# 5 2 1 ee zz
# 6 2 2 <NA> <NA>
# 7 2 3 <NA> <NA>
# 8 2 4 <NA> <NA>
If you do have sub-groups, then I recommend you use data.table. Once you get used to the syntax it's hard to beat for convenience and speed. It will also do efficient joins across data sets.
I occasionally need to extract specific rows from a data.frame based on values from one of the variables. R has built-in functions for maximum (which.max()) and minimum (which.min()) that allow me to easily extract those rows.
Is there an equivalent for median? Or is my best bet to just write my own function?
Here's an example data.frame and how I would use which.max() and which.min():
set.seed(1) # so you can reproduce this example
dat = data.frame(V1 = 1:10, V2 = rnorm(10), V3 = rnorm(10),
V4 = sample(1:20, 10, replace=T))
# To return the first row, which contains the max value in V4
dat[which.max(dat$V4), ]
# To return the seventh row, which contains the min value in V4
dat[which.min(dat$V4), ]
For this particular example, since there are an even number of observations, I would need to have two rows returned, in this case, rows 2 and 10.
Update
It would seem that there is not a built-in function for this. As such, using the reply from Sacha as a starting point, I wrote this function:
which.median = function(x) {
if (length(x) %% 2 != 0) {
which(x == median(x))
} else if (length(x) %% 2 == 0) {
a = sort(x)[c(length(x)/2, length(x)/2+1)]
c(which(x == a[1]), which(x == a[2]))
}
}
I'm able to use it as follows:
# make one data.frame with an odd number of rows
dat2 = dat[-10, ]
# Median rows from 'dat' (even number of rows) and 'dat2' (odd number of rows)
dat[which.median(dat$V4), ]
dat2[which.median(dat2$V4), ]
Are there any suggestions to improve this?
While Sacha's solution is quite general, the median (and other quantiles) are order statistics, so you can calculate the corresponding indices from order(x) (instead of sort(x) for the quantile values).
Looking into quantile, types 1 or 3 could be used; all others lead to (weighted) averages of two values in certain cases.
I chose type 3, and a bit of copy & paste from quantile leads to:
which.quantile <- function(x, probs, na.rm = FALSE) {
  if (!na.rm & any(is.na(x)))
    return(rep(NA_integer_, length(probs)))
  o <- order(x)
  n <- sum(!is.na(x))
  o <- o[seq_len(n)]
  ## type-3 index computation, adapted from stats::quantile
  nppm <- n * probs - 0.5
  j <- floor(nppm)
  h <- ifelse((nppm == j) & ((j %% 2L) == 0L), 0, 1)
  j <- j + h
  j[j == 0] <- 1
  o[j]
}
A little test:
> x <-c (2.34, 5.83, NA, 9.34, 8.53, 6.42, NA, 8.07, NA, 0.77)
> probs <- c (0, .23, .5, .6, 1)
> which.quantile (x, probs, na.rm = TRUE)
[1] 10 1 6 6 4
> x [which.quantile (x, probs, na.rm = TRUE)] == quantile (x, probs, na.rm = TRUE, type = 3)
0% 23% 50% 60% 100%
TRUE TRUE TRUE TRUE TRUE
Here's your example:
> dat [which.quantile (dat$V4, c (0, .5, 1)),]
V1 V2 V3 V4
7 7 0.4874291 -0.01619026 1
2 2 0.1836433 0.38984324 13
1 1 -0.6264538 1.51178117 17
I think just:
which(dat$V4 == median(dat$V4))
But be careful there since the median takes the mean of two numbers if there isn't a single middle number. E.g. median(1:4) gives 2.5 which doesn't match any of the elements.
Edit
Here is a function which will give you either the position of the median or, when the median is the mean of the two middle numbers, the position of the first closest match, similar to how which.min() gives you only the first element that is equal to the minimum:
whichmedian <- function(x) which.min(abs(x - median(x)))
For example:
> whichmedian(1:4)
[1] 2
I've written a more comprehensive function that serves my needs:
row.extractor = function(data, extract.by, what) {
# data = your data.frame
# extract.by = the variable that you are extracting by, either
# as its index number or by name
# what = either "min", "max", "median", or "all", with quotes
if (is.numeric(extract.by)) {
extract.by = extract.by
} else {
extract.by = which(colnames(data) %in% extract.by)
}
which.median = function(data, extract.by) {
a = data[, extract.by]
if (length(a) %% 2 != 0) {
which(a == median(a))
} else if (length(a) %% 2 == 0) {
b = sort(a)[c(length(a)/2, length(a)/2+1)]
c(max(which(a == b[1])), min(which(a == b[2])))
}
}
X1 = data[which(data[extract.by] == min(data[extract.by])), ]
X2 = data[which(data[extract.by] == max(data[extract.by])), ]
X3 = data[which.median(data, extract.by), ]
if (what == "min") {
X1
} else if (what == "max") {
X2
} else if (what == "median") {
X3
} else if (what == "all") {
rbind(X1, X3, X2)
}
}
Some example usage:
> row.extractor(dat, "V4", "max")
V1 V2 V3 V4
1 1 -0.6264538 1.511781 17
> row.extractor(dat, 4, "min")
V1 V2 V3 V4
7 7 0.4874291 -0.01619026 1
> row.extractor(dat, "V4", "all")
V1 V2 V3 V4
7 7 0.4874291 -0.01619026 1
2 2 0.1836433 0.38984324 13
10 10 -0.3053884 0.59390132 14
1 1 -0.6264538 1.51178117 17
Suppose the vector from which you want to get the median is x.
The expression which(x == min(x[x >= median(x)])) gives the index of the median if length(x)=2*n+1, or of the larger of the two middle values if length(x)=2*n. You can tweak it slightly if you want to get the smaller of the two middle values instead.
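For example, with an even-length vector:
x <- 1:4
median(x)
## [1] 2.5
which(x == min(x[x >= median(x)]))
## [1] 3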
Building on the answers given by Sacha and cbeleites, here is a function to get inclusive quantile indices. One difference from the previous answers is that the type argument is exposed and will produce slightly different quantile results (see ?quantile). If performance is an issue, one could replace the sapply with a version from the parallel package, something like unlist(mclapply(...)).
# Extract indices corresponding to inclusive quantiles
# EXAMPLE:
#
# x <- c(2.34, 5.83, NA, 9.34, 8.53, 6.42, NA, 8.07, NA, 0.77)
# probs <- c(0, .23, .5, .6, 1)
# which.quantile(x, probs, na.rm = TRUE)
#
# OUTPUT: 10 1 6 8 4
#
# x[ which.quantile(x, probs, na.rm = TRUE) ]
#
# OUTPUT: 0.77 2.34 6.42 8.07 9.34
#
# x <- c(2, 1, 3)
# p <- c(0.5)
# x[ which.quantile(x, p) ]
#
# OUTPUT: 2
which.quantile <- function (x,
probs,
na.rm = FALSE,
type = 7) {
stopifnot(all(probs >= 0.0))
stopifnot(all(probs <= 1.0))
quants = quantile(x,
probs = probs,
na.rm = na.rm,
type = type)
which.nearest <- function(quant) {
return(which.min(abs(x - quant)))
}
return(sapply(X = quants, FUN = which.nearest))
}
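As a sketch of the parallel variant mentioned above (for Unix-alikes only, since mclapply relies on forking; the mc.cores value is illustrative):
library(parallel)
which.quantile.par <- function(x, probs, na.rm = FALSE, type = 7, mc.cores = 2) {
  quants <- quantile(x, probs = probs, na.rm = na.rm, type = type)
  which.nearest <- function(quant) which.min(abs(x - quant))
  ## same as the sapply above, but fanned out across mc.cores processes
  unlist(mclapply(quants, which.nearest, mc.cores = mc.cores))
}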
We only need a function that returns the locations of values by matching approximately:
match.approx <- function(x, y) {
## Purpose: Match Approximately for Numerical Data
## Arguments:
## "x": a vector of numeric values.
## "y": a vector of numeric values.
## RETURN:
## The index in "y" that indicates the closest y value to each of "x" value.
## ________________________________________________
sapply(x, function(x0) which.min(abs(x0 - y)))
}
if (F) {
match.approx(c(4.2, 1.2, 15), 1:10) # 4 1 10
}
Here is an example of finding the locations of quantiles:
set.seed(1)
a <- rnorm(100)
match.approx(quantile(a), a)
# 0% 25% 50% 75% 100%
# 14 29 23 63 61