Using base R, I wonder how to answer the following question:
Is there any value of X or Y (the variables of interest, named in mods below) that occurs in only one element of m (as a cluster) but not the others? If yes, produce my desired output below.
For example:
Here we see X == 3 occurs only in element m[[3]], not in m[[1]] or m[[2]].
We also see Y == 99 occurs only in m[[1]] and not in the others.
Note: the following is a toy example; a functional answer is appreciated. Also, X and Y may or may not be numeric (e.g., they could be strings).
f <- data.frame(id = c(rep("AA", 4), rep("BB", 2), rep("CC", 2)),
                X = c(1, 1, 1, 1, 1, 1, 3, 3),
                Y = c(99, 99, 99, 99, 6, 6, 6, 6))
m <- split(f, f$id) # Here is `m`
mods <- names(f)[-1] # variables of interest names
Desired output:
list(AA = c(Y = 99), CC = c(X = 3))
# $AA
# Y
# 99
# $CC
# X
# 3
This is a solution based on rapply() and table().
ux <- rapply(m, unique)
tb <- table(uxm <- ux[gsub(rx <- "^.*\\.(.*)$", "\\1", names(ux)) %in% mods])
r <- Map(setNames, n <- uxm[uxm %in% names(tb)[tb == 1]], gsub(rx, "\\1", names(n)))
setNames(r, gsub("^(.*)\\..*$", "\\1", names(r)))
# $AA
# Y
# 99
#
# $CC
# X
# 3
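For reference, assuming the toy data above, the intermediate objects look roughly like this (everything becomes character, because rapply() unlists across the character id column):
ux
#  AA.id   AA.X   AA.Y  BB.id   BB.X   BB.Y  CC.id   CC.X   CC.Y
#   "AA"    "1"   "99"   "BB"    "1"    "6"   "CC"    "3"    "6"
uxm   # only the entries whose suffix is in mods
#  AA.X  AA.Y  BB.X  BB.Y  CC.X  CC.Y
#   "1"  "99"   "1"   "6"   "3"   "6"
tb    # value counts; "3" and "99" are the singletons kept by tb == 1
#  1  3  6 99
#  2  1  2  1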
tmp = do.call(rbind, lapply(names(f)[-1], function(x){
d = unique(f[c("id", x)])
names(d) = c("id", "val")
transform(d, nm = x)
}))
tmp = tmp[ave(as.numeric(as.factor(tmp$val)), tmp$val, FUN = length) == 1,]
lapply(split(tmp, tmp$id), function(a){
setNames(a$val, a$nm)
})
# $AA
#  Y
# 99
# $BB
# named numeric(0)
# $CC
#  X
#  3
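The ave() call computes, for each row, how many times that row's val occurs across the stacked table, so == 1 keeps the singletons. A quick illustration of the idiom:
vals <- c("a", "a", "b")
ave(seq_along(vals), vals, FUN = length)  # per-element group size
# [1] 2 2 1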
This utilizes @jay.sf's idea of rapply() together with an idea from a previous answer:
vec <- rapply(lapply(m, '[', , mods), unique)
unique_vec <- vec[!duplicated(vec) & !duplicated(vec, fromLast = T)]
vec_names <- do.call(rbind, strsplit(names(unique_vec), '.', fixed = T))
names(unique_vec) <- vec_names[, 2]
split(unique_vec, vec_names[, 1])
# $AA
#  Y
# 99
#
# $CC
#  X
#  3
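The !duplicated(x) & !duplicated(x, fromLast = TRUE) idiom keeps exactly the elements that occur once; a quick check:
x <- c(1, 1, 3, 6, 6, 99)
x[!duplicated(x) & !duplicated(x, fromLast = TRUE)]
# [1]  3 99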
I am looking for a speed improvement for a function that imports several ".txt" files into one data frame (adding the file name as a column). There are more than 10,000 ".txt" files, all with the same structure, located in one directory with several subdirectories. The files total around 800 MB, and loading them all into a data frame takes a couple of hours.
My PC: Toshiba P50t with 8 GB RAM and a 1 TB HDD.
Please see the code I am using below.
I am happy to hear suggestions on how to improve the loading speed (I would prefer not to use an intermediary tool, e.g., loading the data into MS SQL and importing it into R from there). I have tried fread instead of read_csv without a significant speed difference.
files_to_df_v01 <- function(directory, Output_file_name, What_stocks) {
  library(readr)
  library(dplyr)    # needed for arrange(), filter(), select(), %>%
  library(stringr)  # needed for str_count(), str_locate_all()
  List <- data.frame(dir(directory, pattern = "*.txt", recursive = T))
  names(List)[1] <- "Path_file"
  List <- arrange(List, List$Path_file)
  List_wse_stocks <- filter(List, str_count(List$Path_file, pattern = What_stocks) > 0)
  rownumber = 1
  setwd(directory)
  ############## LOOP ################
  for (i in List_wse_stocks$Path_file) {
    if (file.info(i)$size != 0) {
      dat <- read_csv(i, col_types = cols(Ticker = col_character(),
                                          Date = col_date(format = "%Y-%m-%d"),
                                          Open = col_double(), High = col_double(),
                                          Low = col_double(), Close = col_double(),
                                          Volume = col_integer(),
                                          OpenInt = col_integer()))
      L_ <- str_locate_all(i, "/")
      File_name <- substr(i, sapply(L_, max) + 1, nchar(i))
      dat$Ticker <- substr(File_name, 1, nchar(File_name) - 4)
      datt <- dat %>% select(Ticker, Date, Open, High, Low, Close, Volume, OpenInt)
      if (rownumber == 1) {
        rownumber <- rownumber + 1
        GPW_wse_stocks <- datt
      } else {
        GPW_wse_stocks <- rbind(GPW_wse_stocks, datt)
      }
    }
  }
  ############## END of LOOP ##############
  save(GPW_wse_stocks, file = Output_file_name)
  return(data.frame(GPW_wse_stocks))
}
Using data.table I managed to get a roughly four-times-faster solution:
# Creating test data:
dir.create("Test")
dd <- "Test/csvReadingTest2"
dir.create(dd)
dir.create(file.path(dd, "v1"))
dir.create(file.path(dd, "v2"))
n <- 3000
f <- function(x) sample(x, n, replace = T)
require(data.table)
set.seed(123)
d1 <- data.table(Ticker = f(LETTERS),
Date = f(seq.Date(as.Date("2016-01-01"), by = "month",
length.out = n/100)),
Open = f(c(1.2, 1.3)), High = f(c(1.2, 1.3)),
Low = f(c(1.2, 1.3)), Close = f(c(1.2, 1.3)),
Volume = f(1:10), OpenInt = f(1:10))
d1
# Ticker Date Open High Low Close Volume OpenInt
# 1: H 2203-04-01 1.2 1.3 1.2 1.2 6 4
# 2: N 2121-05-01 1.2 1.3 1.2 1.2 9 6
# 3: E 2060-04-01 1.3 1.2 1.2 1.3 1 3
# 4: V 2132-04-01 1.3 1.3 1.3 1.2 7 8
# 5: F 2253-04-01 1.2 1.3 1.3 1.2 3 10
# ---
# 2996: J 2027-05-01 1.3 1.3 1.2 1.2 7 6
# 2997: K 2177-05-01 1.2 1.3 1.2 1.2 5 4
# 2998: S 2200-03-01 1.2 1.2 1.2 1.2 6 2
# 2999: V 2110-05-01 1.3 1.3 1.3 1.2 4 3
# 3000: Q 2043-05-01 1.2 1.3 1.2 1.2 3 5
invisible(lapply(1:100, function(x) fwrite(d1, paste0(dd, "/v1/d", x, ".txt"))))
invisible(lapply(1:100, function(x) fwrite(d1, paste0(dd, "/v2/d", x, ".txt"))))
Here is a slightly modified version of your function:
################################################################################
yourFunction_modified <- function(directory, Output_file_name, What_stocks) {
# require(plyr)
require(dplyr)
require(stringr)
library(readr)
# List <- data.frame(dir(directory, pattern = "*.txt", recursive = T))
# names(List)[1] <- "Path_file"
# List <- arrange(List, List$Path_file)
# List_wse_stocks <- (filter(List , str_count(List$Path_file ,
# pattern = What_stocks ) > 0 ))
l <- list.files(directory, recursive = T, full.names = T, pattern = "*.txt")
l <- l[grepl(What_stocks, l)]
rownumber = 1
for (i in l) {
if (file.info(i)$size != 0) {
dat <- read_csv(i,
col_types = cols(Ticker = col_character(),
Date = col_date(format = "%Y-%m-%d"),
Open = col_double(), High = col_double(),
Low = col_double(), Close = col_double(),
Volume = col_integer(),
OpenInt = col_integer()))
L_ = (str_locate_all(i,"/"))
File_name <- substr(i,sapply(L_,max) + 1, nchar(i))
dat$Ticker <- substr(File_name,1,nchar(File_name) - 4)
datt = dat %>% select(Ticker, Date, Open, High, Low, Close,
Volume, OpenInt)
if (rownumber == 1) {
rownumber = rownumber + 1
GPW_wse_stocks <- datt
} else {
GPW_wse_stocks <- rbind(GPW_wse_stocks, datt)
}
}
}
save(GPW_wse_stocks, file = Output_file_name)
return(data.frame(GPW_wse_stocks))
}
system.time(
x <- yourFunction_modified(dd, file.path(dirname(dd), "csvReadingTest2.Rdat"),
"/d[0-9]")
)
# 25 - 18 sec
My function:
myFun <- function(directory, Output_file_name, What_stocks) {
require(data.table)
require(Hmisc)
l <- list.files(directory, recursive = T, full.names = T, pattern = "*.txt")
l <- l[grepl(What_stocks, l)]
l <- l[file.info(l)$size != 0]
dtList <- lapply(l, function(i) {
dat <- fread(i)
File_name <- basename(i)
dat$Ticker <- substr(File_name, 1, nchar(File_name) - 4)
necessary <- Cs(Ticker, Date, Open, High, Low, Close, Volume, OpenInt)
    # Delete unnecessary columns:
for (ii in setdiff(colnames(dat), necessary)) {
set(dat, j = ii, value = NULL)
}
dat
})
dt <- rbindlist(dtList, use.names = T, fill = T, idcol = F)
require(fasttime)
dt[, Date := as.Date(fastPOSIXct(Date))]
save(dt, file = Output_file_name)
return(dt[])
}
system.time(
x2 <- myFun(dd, file.path(dirname(dd), "csvReadingTest2v2.Rdat"),
"/d[0-9]")
)
# 6 - 4 sec
all.equal(as.data.table(x), x2)
# [1] TRUE
rbindlist(lapply(files, fread)) is pretty quick, though if you have a large number of small files and you don't care about preserving the file names, you may be better off using the operating system directly.
Set up the data, because the OP didn't: 10,000 files of 100 rows each.
setwd(tempdir())
dir.create("48492154")
setwd("48492154")
dates <- as.character(seq.Date(as.Date("2012-01-01"),
as.Date(Sys.Date()),
length.out = 500))
library(data.table)
for (i in 1:1e4) {
DT <- data.table(Ticker = 1:100,
Date = sample(dates, size = 100),
Open = round(runif(100) + 100, 1),
Close = round(runif(100) + 100, 1),
Volume = sample(1:100),
OpenInt = 1:100)
cat(i, "of 10,000\r")
flush.console()
fwrite(DT, paste0(i, ".csv"), showProgress = FALSE)
}
A simple method (which also handles repeated headers and gets the colClasses nearer to the truth):
system.time({
res <- rbindlist(lapply(dir(pattern = "\\.csv"), fread))
})
#> user system elapsed
#> 5.46 3.17 8.62
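As an aside, if you do want to preserve the file name, rbindlist()'s idcol argument can record it from the list names; a small sketch building on the code above:
files <- dir(pattern = "\\.csv")
# naming the list makes idcol = "file" record each row's source file
res_named <- rbindlist(setNames(lapply(files, fread), files), idcol = "file")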
Using Windows's system copy:
system.time({
# Windows only
shell("copy /b *.csv out.txt > dump.log")
new_res <- fread("out.txt")
# Delete the headers mixed in (whereas rbindlist() above
# handles this automatically -- and better)
for (j in names(new_res)) {
new_res <- new_res[.subset2(new_res, j) != j]
}
})
#> user system elapsed
#> 0.76 0.13 3.31
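The same trick should work on Unix-alikes with cat; a sketch under the same assumptions (one directory of .csv files with identical headers):
system.time({
  # Unix-alikes only: concatenate everything with the shell, read once
  system("cat *.csv > out.txt")
  new_res <- fread("out.txt")
  # Delete the headers mixed in, as above
  for (j in names(new_res)) {
    new_res <- new_res[.subset2(new_res, j) != j]
  }
})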
I'm trying to set names for a vector using the names() function, but R gives me an error. I want to create a vector with a function and then set a name for each number in that vector, doing all of this in a single call (for example v(x)). Here is an example of my script:
v <- c(2,6,5)
d <- function(x) x*9
names(d(x))<-paste("q=", 1:3, sep="")
and R says
Error in names(d(x)) <- paste("q=", 1:3, sep = "") :
could not find function "d<-"
I don't really know what you are doing, but assuming this approximates it I can reproduce the error:
qn <- 1:11
div_1 <- function(x) { x <- x + 1 }
div_2 <- function(x) { x <- x + 2 }
div<- function(x) c(div_1(x)[1],div_2(x),div_1(x)[2:10])
x <- 1
names(div(x))<- paste("q=", qn, sep="" )
# Error in names(div(x)) <- paste("q=", qn, sep = "") :
# could not find function "div<-"
and I can fix it with this (breaking it into two steps):
qn <- 1:11
div_1 <- function(x) { x <- x + 1 }
div_2 <- function(x) { x <- x + 2 }
div<- function(x) c(div_1(x)[1],div_2(x),div_1(x)[2:10])
x <- 1
v <- div(x)
names(v)<- paste("q=", qn, sep="" )
# q=1 q=2 q=3 q=4 q=5 q=6 q=7 q=8 q=9 q=10 q=11
# 2 3 NA NA NA NA NA NA NA NA NA
This is not really a bug, but rather how assignment into a function call works in R: names(div(x)) <- ... makes R look for a replacement function div<-, which doesn't exist (and note that the result would be thrown away in your version anyway, since it is never assigned). Out of curiosity, what are you doing with div?
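If the goal is a single expression, base R's setNames() sidesteps the replacement-function machinery entirely; a minimal sketch using v and d from the question (with d(v) rather than the undefined d(x)):
v <- c(2, 6, 5)
d <- function(x) x * 9
setNames(d(v), paste("q=", 1:3, sep = ""))  # returns the named vector directly
# q=1 q=2 q=3
#  18  54  45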
data:
sam: res = 0.25, res1 = 0.30
bad: res = 0.30, res1 = 0.23
code:
write.table(sam, file = "C:\\Users\\data1.txt", append = F, sep = " ", row.names = TRUE,col.names = TRUE)
write.table(bad, file = "C:\\Users\\data1.txt", append = T, sep = " ",row.names = TRUE, col.names = TRUE)
output of data1:
"x"
"1" 0.25
"x"
"1" 0.3
In fact, I want the output in the text file data1 to be something like this:
res res1
sam 0.25 0.30
bad 0.3 0.23
Any idea is appreciated!
You can supply a named vector to write.table(). You'll just need to set col.names to FALSE in the second call, since the column names have already been written by the initial call.
sam <- 0.25; bad <- 0.30
write.table(c(sam = sam), col.names = "res", file = "data")
write.table(c(bad = bad), col.names = FALSE, file = "data", append = TRUE)
## read it back in
read.table("data")
# res
# sam 0.25
# bad 0.30
In response to your comment, you can write a helper function to do the appending once the file is initialized. Then we can read it as a list so we have a choice between returning a data frame or a matrix.
sam <- 0.25
bad <- 0.30
## initial file creation
write.table(cbind(sam, bad), "data", row.names = FALSE)
## function to append to 'data'
wtFun <- function(x) {
write.table(x, "data", append = TRUE, col.names = FALSE, row.names = FALSE)
}
## new values
sam2 <- 0.99
bad2 <- 25
## append new values
wtFun(cbind(sam2, bad2))
## read the file as a list and set the names
res <- setNames(
scan("data", what = list(0, 0), skip = 1L),
scan("data", what = "", nlines = 1L)
)
## 'res' as a matrix
do.call(rbind, res)
# [,1] [,2]
# sam 0.25 0.99
# bad 0.30 25.00
## 'res' as a data frame
as.data.frame(res)
# sam bad
# 1 0.25 0.3
# 2 0.99 25.0
This will work better if you write sam and bad as data.frames (or matrices) instead of atomic vectors. For example,
sam <- 0.25
bad <- 0.30
##
write.table(
data.frame(res=sam,row.names="sam"),
file="F:/temp/data1.txt",
append=F,sep=" ",
col.names=TRUE,
row.names=TRUE)
##
write.table(
data.frame(res=bad,row.names="bad"),
file="F:/temp/data1.txt",
append=T,sep=" ",
col.names=FALSE,
row.names=TRUE)
##
R> read.table("F:/temp/data1.txt",header=TRUE)
res
sam 0.25
bad 0.30
IMO though, it's not a good idea to force the row.names attribute like this, because if you append an object to the file with a row name that already exists, you will get an error when you try to read it back in: the row.names attribute cannot contain duplicate values. You would be better off doing something like
write.table(
data.frame(name="sam",res=sam),
file="F:/temp/data1.txt",
append=F,sep=" ",
col.names=TRUE,
row.names=FALSE)
##
write.table(
data.frame(name="bad",res=bad),
file="F:/temp/data1.txt",
append=T,sep=" ",
col.names=FALSE,
row.names=FALSE)
##
R> read.table("F:/temp/data1.txt",header=TRUE)
name res
1 sam 0.25
2 bad 0.30
I am looking for ways to speed up my code. I am looking into the apply/*ply functions as well as data.table. Unfortunately, I am running into problems.
Here is some small sample data:
ids1 <- c(1, 1, 1, 1, 2, 2, 2, 2)
ids2 <- c(1, 2, 3, 4, 1, 2, 3, 4)
chars1 <- c("aa", " bb ", "__cc__", "dd ", "__ee", NA,NA, "n/a")
chars2 <- c("vv", "_ ww_", " xx ", "yy__", " zz", NA, "n/a", "n/a")
data <- data.frame(col1 = ids1, col2 = ids2,
col3 = chars1, col4 = chars2,
stringsAsFactors = FALSE)
Here is a solution using loops:
library("plyr")
cols_to_fix <- c("col3","col4")
for (i in 1:length(cols_to_fix)) {
data[,cols_to_fix[i]] <- gsub("_", "", data[,cols_to_fix[i]])
data[,cols_to_fix[i]] <- gsub(" ", "", data[,cols_to_fix[i]])
data[,cols_to_fix[i]] <- ifelse(data[,cols_to_fix[i]]=="n/a", NA, data[,cols_to_fix[i]])
}
I initially looked at ddply, but some of the methods I want to use take only vectors, so I cannot figure out how to use ddply on just certain columns one by one.
I have also been looking at laply, but I want to return the original data.frame with the changes applied. Can anyone help me? Thank you.
Based on the suggestions from earlier, here is what I tried with the plyr package.
Option 1:
data[,cols_to_fix] <- aaply(data[,cols_to_fix],2, function(x){
x <- gsub("_", "", x,perl=TRUE)
x <- gsub(" ", "", x,perl=TRUE)
x <- ifelse(x=="n/a", NA, x)
},.progress = "text",.drop = FALSE)
Option 2:
data[,cols_to_fix] <- alply(data[,cols_to_fix],2, function(x){
x <- gsub("_", "", x,perl=TRUE)
x <- gsub(" ", "", x,perl=TRUE)
x <- ifelse(x=="n/a", NA, x)
},.progress = "text")
Option 3:
data[,cols_to_fix] <- adply(data[,cols_to_fix],2, function(x){
x <- gsub("_", "", x,perl=TRUE)
x <- gsub(" ", "", x,perl=TRUE)
x <- ifelse(x=="n/a", NA, x)
},.progress = "text")
None of these gives me the correct answer.
apply works great, but my data is very large, so the progress bars from the plyr package would be very nice to have. Thanks again.
Here's a data.table solution using set.
require(data.table)
DT <- data.table(data)
for (j in cols_to_fix) {
set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
}
DT
# col1 col2 col3 col4
# 1: 1 1 aa vv
# 2: 1 2 bb ww
# 3: 1 3 cc xx
# 4: 1 4 dd yy
# 5: 2 1 ee zz
# 6: 2 2 NA NA
# 7: 2 3 NA NA
# 8: 2 4 NA NA
The first line reads: set, in DT, for all rows (i = NULL) and column j, the value gsub(..).
The second line reads: set, in DT, at the rows satisfying the condition (i = which(..)) and column j, the value NA_character_.
Note: using PCRE (perl = TRUE) gives a nice speed-up, especially on bigger vectors.
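A quick way to sanity-check the PCRE claim on your own machine; a sketch on a large character vector:
x <- rep(c("__aa__", " bb "), 5e5)
system.time(gsub("[ _]", "", x))               # default TRE engine
system.time(gsub("[ _]", "", x, perl = TRUE))  # PCRE, typically faster here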
Here is a data.table solution; it should be faster if your table is large.
The concept of := is an update of the columns by reference. I believe that because of this the table isn't internally copied again, as it would be in a "normal" data.frame solution.
require(data.table)
DT <- data.table(data)
fxn = function(col) {
col = gsub("[ _]", "", col, perl = TRUE)
col[which(col == "n/a")] <- NA_character_
col
}
cols = c("col3", "col4");
# lapply your function
DT[, (cols) := lapply(.SD, fxn), .SDcols = cols]
print(DT)
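To convince yourself that := updates by reference rather than copying, one quick check is to compare the table's memory address (via data.table's address()) before and after:
addr_before <- data.table::address(DT)
DT[, (cols) := lapply(.SD, fxn), .SDcols = cols]
identical(addr_before, data.table::address(DT))
# [1] TRUE   (same object, modified in place)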
No need for loops (for or *ply):
tmp <- gsub("[_ ]", "", as.matrix(data[,cols_to_fix]), perl=TRUE)
tmp[tmp=="n/a"] <- NA
data[,cols_to_fix] <- tmp
Benchmarks
I only benchmark Arun's data.table solution and my matrix solution. I assume that many columns need to be fixed.
Benchmark code:
options(stringsAsFactors=FALSE)
set.seed(45)
K <- 1000; N <- 1e5
foo <- function(K) paste(sample(c(letters, "_", " "), 8, replace=TRUE), collapse="")
bar <- function(K) replicate(K, foo(), simplify=TRUE)
data <- data.frame(id1=sample(5, K, TRUE),
id2=sample(5, K, TRUE)
)
data <- cbind(data, matrix(sample(bar(K), N, TRUE), ncol=N/K))
cols_to_fix <- as.character(seq_len(N/K))
library(data.table)
benchfun <- function() {
time1 <- system.time({
DT <- data.table(data)
for (j in cols_to_fix) {
set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
}
})
data2 <- data
time2 <- system.time({
tmp <- gsub("[_ ]", "", as.matrix(data2[,cols_to_fix]), perl=TRUE)
tmp[tmp=="n/a"] <- NA
data2[,cols_to_fix] <- tmp
})
list(identical= identical(as.data.frame(DT), data2),
data.table_timing= time1[[3]],
matrix_timing=time2[[3]])
}
replicate(3, benchfun())
Benchmark results:
#100 columns to fix, nrow=1e5
# [,1] [,2] [,3]
#identical TRUE TRUE TRUE
#data.table_timing 6.001 5.571 5.602
#matrix_timing 17.906 17.21 18.343
#1000 columns to fix, nrow=1e4
# [,1] [,2] [,3]
#identical TRUE TRUE TRUE
#data.table_timing 4.509 4.574 4.857
#matrix_timing 13.604 14.219 13.234
#1000 columns to fix, nrow=100
# [,1] [,2] [,3]
#identical TRUE TRUE TRUE
#data.table_timing 0.052 0.052 0.055
#matrix_timing 0.134 0.128 0.127
#100 columns to fix, nrow=1e5 and including
#data1 <- as.data.frame(DT) in the timing
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#identical TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#data.table_timing 5.642 5.58 5.762 5.382 5.419 5.633 5.508 5.578 5.634 5.397
#data.table_returnDF_timing 5.973 5.808 5.817 5.705 5.736 5.841 5.759 5.833 5.689 5.669
#matrix_timing 20.89 20.3 19.988 20.271 19.177 19.676 20.836 20.098 20.005 19.409
data.table is faster by only a factor of three. The advantage would probably be even smaller if we changed the data structure up front (as the data.table solution does) and kept it a matrix.
I think you can do this with regular old apply, which will call your cleanup function on each column (MARGIN = 2):
fxn = function(col) {
col <- gsub("_", "", col)
col <- gsub(" ", "", col)
col <- ifelse(col=="n/a", NA, col)
return(col)
}
data[,cols_to_fix] <- apply(data[,cols_to_fix], 2, fxn)
data
# col1 col2 col3 col4
# 1 1 1 aa vv
# 2 1 2 bb ww
# 3 1 3 cc xx
# 4 1 4 dd yy
# 5 2 1 ee zz
# 6 2 2 <NA> <NA>
# 7 2 3 <NA> <NA>
# 8 2 4 <NA> <NA>
Edit: it sounds like you need to use the plyr package. I'm not a plyr expert, but this seemed to work:
library(plyr)
data[,cols_to_fix] <- t(laply(data[,cols_to_fix], fxn))
Here's a benchmark of all the different answers:
First, all the answers as separate functions:
1) Arun's
arun <- function(data, cols_to_fix) {
DT <- data.table(data)
for (j in cols_to_fix) {
set(DT, i=NULL, j=j, value=gsub("[ _]", "", DT[[j]], perl=TRUE))
set(DT, i=which(DT[[j]] == "n/a"), j=j, value=NA_character_)
}
return(DT)
}
2) Martin's
martin <- function(data, cols) {
DT <- data.table(data)
colfun = function(col) {
col <- gsub("_", "", col)
col <- gsub(" ", "", col)
col <- ifelse(col=="n/a", NA, col)
}
DT[, (cols) := lapply(.SD, colfun), .SDcols = cols]
return(DT)
}
3) Roland's
roland <- function(data, cols_to_fix) {
tmp <- gsub("[_ ]", "", as.matrix(data[,cols_to_fix]))
tmp[tmp=="n/a"] <- NA
data[,cols_to_fix] <- tmp
return(data)
}
4) BrodieG's
brodieg <- function(data, cols_to_fix) {
fix_fun <- function(x) gsub("(_| )", "", ifelse(x == "n/a", NA_character_, x))
data[, cols_to_fix] <- apply(data[, cols_to_fix], 2, fix_fun)
return(data)
}
5) Josilber's
josilber <- function(data, cols_to_fix) {
colfun2 <- function(col) {
col <- gsub("_", "", col)
col <- gsub(" ", "", col)
col <- ifelse(col=="n/a", NA, col)
return(col)
}
data[,cols_to_fix] <- apply(data[,cols_to_fix], 2, colfun2)
return(data)
}
Second, the benchmarking function:
We'll run this function 3 times and take the minimum across runs (which removes cache effects) as the runtime:
bench <- function(data, cols_to_fix) {
ans <- c(
system.time(arun(data, cols_to_fix))["elapsed"],
system.time(martin(data, cols_to_fix))["elapsed"],
system.time(roland(data, cols_to_fix))["elapsed"],
system.time(brodieg(data, cols_to_fix))["elapsed"],
system.time(josilber(data, cols_to_fix))["elapsed"]
)
}
Third, on (slightly) big data with just 2 columns to fix (as in the OP's example):
require(data.table)
set.seed(45)
K <- 1000; N <- 1e5
foo <- function(K) paste(sample(c(letters, "_", " "), 8, replace=TRUE), collapse="")
bar <- function(K) replicate(K, foo(), simplify=TRUE)
data <- data.frame(id1=sample(5, N, TRUE),
id2=sample(5, N, TRUE),
col3=sample(bar(K), N, TRUE),
col4=sample(bar(K), N, TRUE)
)
rown <- c("arun", "martin", "roland", "brodieg", "josilber")
coln <- paste("run", 1:3, sep="")
cols_to_fix <- c("col3","col4")
ans <- matrix(0L, nrow=5L, ncol=3L)
for (i in 1:3) {
print(i)
ans[, i] <- bench(data, cols_to_fix)
}
rownames(ans) <- rown
colnames(ans) <- coln
# run1 run2 run3
# arun 0.149 0.140 0.142
# martin 0.643 0.629 0.621
# roland 1.741 1.708 1.761
# brodieg 1.926 1.919 1.899
# josilber 2.067 2.041 2.162
The apply version is the way to go. It looks like @josilber came up with the same answer, but this one is slightly different (note the regexp).
fix_fun <- function(x) gsub("(_| )", "", ifelse(x == "n/a", NA_character_, x))
data[, cols_to_fix] <- apply(data[, cols_to_fix], 2, fix_fun)
More importantly, you generally want ddply and data.table when you are doing split-apply-combine analysis. In this case, all your data belongs to the same group (there aren't any subgroups you're treating differently), so you might as well use apply.
The 2 at the center of the apply statement means we want to subset the input by the 2nd dimension, and pass the result (in this case vectors, each representing a column from your data frame in cols_to_fix) to the function that does the work. apply then re-assembles the result, and we assign it back to the columns in cols_to_fix. If we had used 1 instead, apply would have passed the rows in our data frame to the function. Here is the result:
data
# col1 col2 col3 col4
# 1 1 1 aa vv
# 2 1 2 bb ww
# 3 1 3 cc xx
# 4 1 4 dd yy
# 5 2 1 ee zz
# 6 2 2 <NA> <NA>
# 7 2 3 <NA> <NA>
# 8 2 4 <NA> <NA>
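A tiny standalone illustration of the MARGIN argument (1 = rows, 2 = columns):
mat <- matrix(1:6, nrow = 2)
apply(mat, 1, sum)  # one sum per row
# [1]  9 12
apply(mat, 2, sum)  # one sum per column
# [1]  3  7 11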
If you do have sub-groups, then I recommend you use data.table. Once you get used to the syntax it's hard to beat for convenience and speed. It will also do efficient joins across data sets.
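For completeness, a minimal sketch of the split-apply-combine pattern in data.table, using hypothetical grp/val columns:
library(data.table)
DT2 <- data.table(grp = c("a", "a", "b"), val = c(1, 2, 3))
DT2[, .(mean_val = mean(val)), by = grp]  # one row per group
#    grp mean_val
# 1:   a      1.5
# 2:   b      3.0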