I am running this for loop without any problems but it takes a long time. I guess it can be faster with apply family but not sure how. Any hints?
set.seed(1)
nrows <- 1200
ncols <- 1000
outmat <- matrix(NA, nrows, ncols)
dat <- matrix(5, nrows, ncols)
for (nc in 1 : ncols){
for(nr in 1 : nrows){
val <- dat[nr, nc]
if(!is.na(val)){
file <- readBin(dir2[val], numeric(), size = 4, n = 1200*1000)
# my real data where dir2 is a list of files
# "dir2 <- list.files("/data/dir2", "*.dat", full.names = TRUE)"
file <- matrix((data = file), ncol = 1000, nrow = 1200) #my real data
outmat[nr, nc] <- file[nr, nc]
}
}
}
Two solutions.
The first fills more memory, but is more efficient and I guess feasible if you have 24 files, as you stated. You read all the files at once, then you subset properly according to dat. Something like:
allContents<-do.call(cbind,lapply(dir2,readBin,n=nrows*ncol,size=4,"numeric")
res<-matrix(allContents[cbind(1:length(dat),c(dat+1))],nrows,ncols)
The second one can handle a slightly bigger number of files (say 50-100). It reads chunks of each file and subset consequently. You have to open as many connections as the number of files you got. For instance:
outmat <- matrix(NA, nrows, ncols)
connections<-lapply(dir2,file,open="rb")
for (i in 1:ncols) {
values<-vapply(connections,readBin,what="numeric",n=nr,size=4,numeric(nr))
outmat[,i]<-values[cbind(seq_len(nrows),dat[,i]+1)]
}
The +1 after dat is due to the fact that, as you stated in the comments, the values in dat range from 0 to 23 and R indexing is 1-based.
Related
I am trying to read in chunks of a a large data set:
find the mean of each chunk (representing a larger column)
add the mean into a matrix column
then find the mean of the means to give me the overall mean of the column.
I have the set up, but my while-loop is not repeating its cycle. I think it may be with how I am referring to "chunks" and "chunk".
This is a practice using "iris.csv" in R
fl <- file("iris.csv", "r")
clname <- readLines(fl, n=1) # read the header
r <- unlist(strsplit(clname,split = ","))
length(r) # get the number of columns in the matrix
cm <- matrix(NA, nrow=1000, ncol=length(r)) # need a matrix that can be filled on each #iteration.
numchunk = 0 #set my chunks of code to build up
while(numchunk <= 0){ #stop when no more chunks left to run
numchunk <- numchunk + 1 # keep on moving through chunks of code
x <- readLines(fl, n=100) #read 100 lines at a time
chunk <- as.numeric(unlist(strsplit(x,split = ","))) # readable chunk of code
m <- matrix(chunk, ncol=length(r), byrow = TRUE) # put chunk in a matrix
cm[numchunk,] <- colMeans(m) #get the column means of the matrix and fill in larger matrix
print(numchunk) # print the number of chunks used
}
cm
close(fl)
final_mean <- colSums(cm)/nrow(cm)
return(final_mean)
--
This works when I set my n = 1000, but I want it to work for larger data sets, where the while will need to keep running.
Can anyone help me correct this please?
Perhaps, this helps
clname <- readLines(fl, n=1) # read the header
r <- unlist(strsplit(clname,split = ","))
length(r) # get the number of columns in the matrix
cm <- matrix(NA, nrow=1000, ncol=length(r)) #
numchunk = 0
flag <- TRUE
while(flag){
numchunk <- numchunk + 1 # keep on moving through chunks of code
x <- readLines(fl, n=5)
print(length(x))
if(length(x) == 0) {
flag <- FALSE
} else {
chunk <- as.numeric(unlist(strsplit(x,split = ","))) # readable chunk of code
m <- matrix(chunk, ncol=length(r), byrow = TRUE) # put chunk in a matrix
cm[numchunk,] <- colMeans(m) #get the column means of the matrix and fill in larger matrix
print(numchunk) # print the number of chunks used
}
}
cm
close(fl)
final_mean <- colSums(cm)/nrow(cm)
First, it might be helpful, to define a helper function r2v() to split raw lines into useful vectors.
r2v <- Vectorize(\(x) {
## splits raw lines to vectors
strsplit(gsub('\\"', '', x), split=",")[[1]][-1]
})
After opening file, check the size w/o the need to read it in, using system() and bash commands (for Windows see there.)
## open file
f <- 'iris.csv'
fl <- file(f, "r")
## rows
(nr <-
as.integer(gsub(paste0('\\s', f), '', system(paste('wc -l', f), int=T))) - 1)
# nr <- 150 ## alternatively define nrows manually
# [1] 150
## columns
nm <- readLines(fl, n=1) |> r2v()
(nc <- length(nm))
# [1] 5
Next, define a chunk size by which the rows can be divided.
## define chunk size
ch_sz <- 50
stopifnot(nr %% ch_sz == 0) ## all chunks should be filled
Then, using replicate(), we calculate chunk-wise rowMeans() (because we get the chunks transposed), and finally rowMeans() again on everything to get the column means of the entire matrix.
## calculate means chunk-wise
final_mean <-
replicate(nr / ch_sz,
rowMeans(type.convert(r2v(readLines(fl, n=ch_sz)), as.is=TRUE))) |>
rowMeans()
close(fl)
Vet's validate the result.
## test
all.equal(final_mean, as.numeric(colMeans(iris[-5])))
# [1] TRUE
Data:
iris[-5] |>
write.csv('iris.csv')
Background - I want to try and exhaustively search a set of all possible combinations of 250 rows taken 10 at a time. In order to iteratively get this, I use the following code
`
## Function definition
gen.next.cbn <- function(cbn, n){
## Generates the combination that follows the one provided as input
cbn.bin <- rep(0, n)
cbn.bin[cbn] <- 1
if (tail(cbn.bin, 1) == 0){
ind <- tail(which(cbn.bin == 1), 1)
cbn.bin[c(ind, ind+1)] <- c(0, 1)
}else{
ind <- 1 + tail(which(diff(cbn.bin) == -1), 1)
nb <- sum(cbn.bin[-c(1:ind)] == 1)
cbn.bin[c(ind-1, (n-nb+1):n)] <- 0
cbn.bin[ind:(ind+nb)] <- 1
}
cbn <- which(cbn.bin == 1)
}
## Example parameters
n <- 40
k <- 10
## Iteration example
for (i in 1:choose(n, k)){
if (i == 1){
cbn <- 1:k
}else{
cbn <- gen.next.cbn(cbn, n)
}
print(cbn)
}
`
I get the error "cannot allocate vector of size n GB" when I go beyond 40 rows.
Ideal Solution:
a) If the combinations can be dumped and memory can be flushed iteratively after every run in the loop (where I can check the further conditions)
b) If the combinations can be dumped to a csv file such that it does not cause a memory hog.
Thanks for your support.
As I said in the comments, iterpc is the way to go for such a task. You first need to initialize an iterator via the iterpc function. Next we can generate the next n combinations via getnext. After this, we simply append our results to a csv (or any file type you like).
getComboChunks <- function(n, k, chunkSize, totalCombos, myFile) {
myIter <- iterpc(n, k)
## initialized myFile
myCombs <- getnext(myIter, chunkSize)
write.table(myCombs, file = myFile, sep = ",", col.names = FALSE)
maxIteration <- (totalCombos - chunkSize) %/% chunkSize
for (i in 1:maxIteration) {
## get the next "chunkSize" of combinations
myCombs <- getnext(myIter, chunkSize)
## append the above combinations to your file
write.table(myCombs, file = myFile, sep = ",",
col.names = FALSE , append = TRUE)
}
}
For example, getComboChunks(250, 10, 100, 1000, "myCombos.csv") will write out 1000 combinations of 250 choose 10 to the file myCombos.csv 100 combinations at a time. Doing this in chunks will be more efficient than one at a time.
This library is written in C/C++ so it should be fairly efficient, but as #Florian points out in the comments, it won't produce all gmp::chooseZ(250, 10) = Big Integer ('bigz') : [1] 219005316087032475 combinations any time soon. I haven't tested it, but if you settle for 200 choose 5, I think you will be able to produce it in under a day (it is just over 2.5 billion results).
I have simple loop that generate a value at each step, and I want to save all results as a single table. Problem is that each step overwrites the previous.
for(i in 1:5){
x = 3*i
print(c(i,x))
}
This gives
[1] 1 3
[1] 2 6
[1] 3 9
[1] 4 12
[1] 5 15
Now I create a matrix that I will then save as a csv file, but it only shows the final step of the loop.
results = matrix(c(i,x), ncol = 2)
[,1] [,2]
[1,] 5 15
write.table(results, file = "Results.csv", col.names=NA, append = T)
How to show the entire list of results? Thanks in advance!
(ps.- I know that a similar question has been posted previously, e.g. Write output of R loop to file, but the problem was quite specific and I did not manage to adapt the answers to my case).
Your loop only prints, to the console, the results. The matrix you're creating only relies on the single (and last) value of i. There are many ways to do it but if you really want to write a matrix, then you need to store them somewhere to export all iteration intermediate results. You can try something like:
results <- matrix(NA, nrow=5, ncol=2)
for(i in 1:5){
results[i, ] <- c(i, 3*i)
}
write.table(results, file = "Results.csv", col.names=NA, append = T)
And by the way you don't really need a loop here:
i <- 1:5
m <- matrix(c(i, 3*i), nrow=5)
would do the job.
You can usually use sapply instead of for-loops:
results <- t(sapply(1:5, function(x) c(x, 3*x)))
write.table(results, file="Results.csv", col.names=NA, append=T)
Assuming you really want/need a for-loop
1) You store all the result into a matrix and then you write the whole matrix to file
n = 5;
results = matrix(NA, ncol=2, nrow=n);
for(i in 1:n) {
results[i, ] = c(i, x);
}
write.table(results, file = "Results.csv", col.names=NA, append = T);
This is a "good" solution if you don't have many results and you want to access the file just once.
2) You store current result only into a matrix and you write to file at each iteration
n = 5;
for(i in 1:n) {
results = matrix(c(i,x), ncol = 2)
write.table(results, file = "Results.csv", col.names=NA, append = T);
}
This is a "good" solution if you have many data and memory limits. Maybe slower than the previous one because you will open the file many times...
To append using a matrix you could use:
exampleMatrix <- matrix(ncol = 2)
for(i in 1:5){
x = 3*i
if(i ==1){
exampleMatrix<- rbind(c(x,i))
}else{
exampleMatrix<- rbind(exampleMatrix,c(x,i))
}}
To append to a dataframe using a loop you could use the following:
exampleDF <- data.frame()
for(i in 1:5){
x = 3*i
exampleDF <- rbind(exampleDF,c(x,i))
}
write.csv(exampleDF, "C:\\path")
So when you want to store you values while using a loop, it's important to index. Below, I created some code where a(the iteration) and x(the value x * 3) are each stored inside a vector.
After the loop has finished, I combine the two vectors into one data frame with the cbind() function
a <- vector()
x <- vector()
for(i in 1:5){
a[i] = i
x[i] = 3*i
}
df <- as.data.frame(cbind(a, x))
There are other ways to do this without loops. Once you start raising the number of iterations, or doing nested loops, the processing time starts to get really high. Other options are in the apply package.
Hope this helped!
I have read a series of 332 files like below by storing the data in each file as a data frame in List.
files <- list.files()
data <- list()
for (i in 1:332){
data[[i]] = read.csv(files[[i]])
}
The data has 3 columns with names id, city, town. Now I need to calculate the mean of all values under city corresponding to the id values 1:10 for which I wrote the below code
for(j in 1:10){
req.data <- data[[j]]$city
}
mean(na.omit(req.data))
But it is giving me a wrong value and when I call it in a function its transferring null values. Any help is highly appreciated.
Each time you iterate through j = 1:10 you assign data[[j]]$city to the object req.data. In doing so, for steps j = 2:10 you are overwriting the previous version of req.data with the contents of the jth data set. Hence req.data only ever contains at any one time a single city's worth of data and hence you are getting the wrong answer sa you are computing the mean for the last city only, not all 10.
Also note that you could do mean(req.data, na.rm = TRUE) to remove the NAs.
You can do this without an explicit loop at the user R level using lapply(), for example, with dummy data,
set.seed(42)
data <- list(data.frame(city = rnorm(100)),
data.frame(city = rnorm(100)),
data.frame(city = rnorm(100)))
mean(unlist(lapply(data, `[`, "city")), na.rm = TRUE)
which gives
> mean(unlist(lapply(data, `[`, "city")), na.rm = TRUE)
[1] -0.02177902
So in your case, you need:
mean(unlist(lapply(data[1:10], `[`, "city")), na.rm = TRUE)
If you want to write a loop, then perhaps
req.data <- vector("list", length = 3) ## allocate, adjust to length = 10
for (j in 1:3) { ## adjust to 1:10 for your data / Q
req.data[[j]] <- data[[j]]$city ## fill in
}
mean(unlist(req.data), na.rm = TRUE)
> mean(unlist(req.data), na.rm = TRUE)
[1] -0.02177902
is one way. Or alternatively, compute the mean of the individual cities and then average those means
vec <- numeric(length = 3) ## allocate, adjust to length = 10
for (j in 1:3) { ## adjust to 1:10 for your question
vec[j] <- mean(data[[j]]$city, na.rm = TRUE)
}
mean(vec)
I have a large number of csv files that I want to read into R. All the column headings in the csvs are the same. But I want to import only those rows from each file into the data frame for which a variable is within a given range (above min threshold & below max threshold), e.g.
v1 v2 v3
1 x q 2
2 c w 4
3 v e 5
4 b r 7
Filtering for v3 (v3>2 & v3<7) should results in:
v1 v2 v3
1 c w 4
2 v e 5
So fare I import all the data from all csvs into one data frame and then do the filtering:
#Read the data files
fileNames <- list.files(path = workDir)
mergedFiles <- do.call("rbind", sapply(fileNames, read.csv, simplify = FALSE))
fileID <- row.names(mergedFiles)
fileID <- gsub(".csv.*", "", fileID)
#Combining data with file IDs
combFiles=cbind(fileID, mergedFiles)
#Filtering the data according to criteria
resultFile <- combFiles[combFiles$v3 > min & combFiles$v3 < max, ]
I would rather apply the filter while importing each single csv file into the data frame. I assume a for loop would be the best way of doing it, but I am not sure how.
I would appreciate any suggestion.
Edit
After testing the suggestion from mnel, which worked, I ended up with a different solution:
fileNames = list.files(path = workDir)
mzList = list()
for(i in 1:length(fileNames)){
tempData = read.csv(fileNames[i])
mz.idx = which(tempData[ ,1] > minMZ & tempData[ ,1] < maxMZ)
mz1 = tempData[mz.idx, ]
mzList[[i]] = data.frame(mz1, filename = rep(fileNames[i], length(mz.idx)))
}
resultFile = do.call("rbind", mzList)
Thanks for all the suggestions!
Here is an approach using data.table which will allow you to use fread (which is faster than read.csv) and rbindlist which is a superfast implementation of do.call(rbind, list(..)) perfect for this situation. It also has a function between
library(data.table)
fileNames <- list.files(path = workDir)
alldata <- rbindlist(lapply(fileNames, function(x,mon,max) {
xx <- fread(x, sep = ',')
xx[, fileID := gsub(".csv.*", "", x)]
xx[between(v3, lower=min, upper = max, incbounds = FALSE)]
}, min = 2, max = 3))
If the individual files are large and v1 always integer values it might be worth setting v3 as a key then using a binary search, it may also be quicker to import everything and then run the filtering.
If you want to do "filtering" before importing the data try to use read.csv.sql from sqldf package
If you are really stuck for memory then the following solution might work. It uses LaF to read only the column needed for filtering; then calculates the total number of lines that will be read; initialized the complete data.frame and then read the required lines from the files. (It's probably not faster than the other solutions)
library("LaF")
colnames <- c("v1","v2","v3")
colclasses <- c("character", "character", "numeric")
fileNames <- list.files(pattern = "*.csv")
# First determine which lines to read from each file and the total number of lines
# to be read
lines <- list()
for (fn in fileNames) {
laf <- laf_open_csv(fn, column_types=colclasses, column_names=colnames, skip=1)
d <- laf$v3[]
lines[[fn]] <- which(d > 2 & d < 7)
}
nlines <- sum(sapply(lines, length))
# Initialize data.frame
df <- as.data.frame(lapply(colclasses, do.call, list(nlines)),
stringsAsFactors=FALSE)
names(df) <- colnames
# Read the lines from the files
i <- 0
for (fn in names(lines)) {
laf <- laf_open_csv(fn, column_types=colclasses, column_names=colnames, skip=1)
n <- length(lines[[fn]])
df[seq_len(n) + i, ] <- laf[lines[[fn]], ]
i <- i + n
}