R extremely slow processing loop

I am gathering data from a fairly large Excel file (~200k rows, 35 columns). The data is broken up into sections: a section name and cycle count, then the rows of data, and a blank line at the end of each section. Here is my function that gets the data: the file parameter is the vector of all the files in the directory ending in .csv that have to be parsed, and name is the name of the section of data you want to get. The function works, but it runs at a snail's pace; by that I mean it takes about 4 minutes to process 10k lines.
getData1 <- function(file, name) {
  for (i in 1:length(file)) {
    dat <- c()
    lines <- readLines(file[i])
    indx <- grep(name, lines)  # row number for anything with the search term in it
    counter <- 3
    # first data row is two lines below the matched section row
    dat <- c(read.table(text=lines[(indx+2)],
                        sep=",", header=FALSE, stringsAsFactors=FALSE, check.names=FALSE))
    # keep reading one line per iteration until the terminating blank/tab line
    while (dat[counter-2] != "\t") {
      dat <- c(dat, read.table(text=lines[(indx+counter)],  # read only one line per loop
                               sep=",", header=FALSE, stringsAsFactors=FALSE, check.names=FALSE))
      counter <- counter + 1
    }
    return(dat)  # note: this returns inside the for loop, so only the first file is ever processed
  }
}
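For what it's worth, here is a sketch of a faster approach under the same assumptions as the code above (data starts two lines below the row matching name, and a blank or tab-only line ends the section); the function name and the blank-line regex are mine, so adjust them to your files:
getDataFast <- function(files, name) {
  lapply(files, function(f) {
    lines <- readLines(f)
    start <- grep(name, lines)[1] + 2              # first data row of the section
    blanks <- which(grepl("^[\t ]*$", lines))      # blank or tab-only lines
    end <- blanks[blanks >= start][1] - 1          # last data row before the section terminator
    # parse the whole block with a single read.table call instead of one call per row
    read.table(text = lines[start:end], sep = ",",
               header = FALSE, stringsAsFactors = FALSE, check.names = FALSE)
  })
}
The main win is calling read.table once per section rather than once per row; readLines plus a vectorised grepl to find the end of the block keeps everything else out of the loop.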

Related

How to find UNIX times corresponding to a particular date in a very large file in R

I have a very large (10 million row x 12 column) comma-delimited text file. The first column contains UNIX times (in seconds, to 2 d.p.).
I would like to extract all rows corresponding to a particular date (e.g. 2014-06-26) and save the rows for each date in other, smaller files.
In the code below I scan through the file, reading in the first number in each row (the time), and spit out the row number whenever the date associated with the current row differs from that of the previous row:
## create fake data; there are many duplicate times, rows are not always in order
con <- "BigFile.txt"; if (file.exists(con)) file.remove(con)
Times <- seq(1581259391, 1581259391 + (7*24*3600), by=100)
write.table(data.frame(Time=Times, x=runif(n = length(Times))), file=con,
            sep=",", row.names=F, col.names=F, append=F)
## read in the fake data line-by-line and print the row number whenever the date changes
con <- file("BigFile.txt", open="r")
Row <- 0
Now <- 0
Last <- 0
while (length(myLine <- scan(con, what="numeric", nlines=1, sep=',', quiet=TRUE)) > 0) {
  Row <- Row + 1
  Now <- as.Date(as.POSIXct(as.numeric(myLine[1]), origin="1970-01-01", tz="GMT"), format="%Y-%m-%d")
  if (Now != Last) { print(data.frame(Row, Now)) }
  Last <- Now
}
close(con)
The idea would then be to save these indices, and use them to cut up the file into smaller daily chunks... However, I am sure there must be much more efficient approaches (I have tried opening these files using the data.table package, but still run into memory issues).
Any pointers will be greatly appreciated.
library(sqldf)
# data
con <- "BigFile.txt"
Times <- seq(1581259391, 1581259391 + (7*24*3600), by=100)
write.table(data.frame(Time=Times, x=runif(n = length(Times))), file=con,
            sep=",", row.names=F, col.names=F, append=F)
# solution
df <- read.csv.sql("BigFile.txt", header = F,
                   sql = "select * from file where V1 = 1403740800", eol = "\n")

Cumulative count instead of last file

I am trying to count all of the files cumulatively, but for some reason it is instead counting the last file and using that number for the rest of the analysis. How can I change this code to instead include the counts and unique counts of all files (there are 51 files)?
# Move all files to one list
file_list <- list.files(pattern="Dataset 2.*txt")
# Read files
for (i in 1:length(file_list)){
  file <- read.table(file_list[i], header=TRUE, sep=",")
  out.file <- rbind(file)
}
# Count total number phone call records
count_PHONECALLRECORDS <- length(out.file$CALLER_ID)
# Count number unique caller id's
count_CALLERID <- length(unique(out.file$CALLER_ID))
Here's the correction you need -
# Read files
out.file <- NULL
for (i in 1:length(file_list)){
  file <- read.table(file_list[i], header=TRUE, sep=",")
  out.file <- rbind(out.file, file)
}
Note that this way of growing the data, i.e. rbind-ing it to itself, is not efficient, but assuming you are a beginner I'd say don't worry about it until you have to.
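For reference, a sketch of the more efficient pattern: read each file into a list element, bind them once at the end, and then compute the counts on the combined data.
file_list <- list.files(pattern="Dataset 2.*txt")
all_files <- lapply(file_list, read.table, header=TRUE, sep=",")
out.file <- do.call(rbind, all_files)
count_PHONECALLRECORDS <- length(out.file$CALLER_ID)
count_CALLERID <- length(unique(out.file$CALLER_ID))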
You should move the counting code to the loop and initialize the counting variables before the loop:
setwd("~/Desktop/GEOG Research/Jordan/compression")
library(plyr)
library(rlang)
library(dplyr)
# Move all files to one list
file_list <- list.files(pattern="Dataset 2.*txt")
# Read files
count_PHONECALLRECORDS <- 0
count_CALLERID <- 0
for (i in 1:length(file_list)){
file <- read.table(file_list[i], header=TRUE, sep=",")
out.file <- rbind(file)
# Count total number phone call records
count_PHONECALLRECORDS <- count_PHONECALLRECORDS + length(out.file$CALLER_ID)
# Count number unique caller id's
count_CALLERID <- count_CALLERID + length(unique(out.file$CALLER_ID))
}
# Construct contingency matrix
tb_1 <- with(out.file, table(CALLEE_PREFIX, CALLER = substr(CALLER_ID, 0, 1)))
colnames(tb_1) <- c("Refugee Caller", "Non-Refugee Caller")
rownames(tb_1) <- c("Refugee Callee", "Non-Refugee Callee", "Unknown Callee")
tb_1

Mean values from multiple csv to data frame

After having searched for help in different threads on this topic, I am still none the wiser. Therefore: here comes another question on looping through multiple data files...
OK. I have multiple CSV files in one folder containing 5 columns of data. The filenames are as follows:
Moist yyyymmdd hh_mm_ss.csv
I would like to create a script that reads and processes the CSV files one by one, doing the following steps:
1) load file
2) check number of rows and exclude file if less than 3 registrations
3) calculate mean value of all measurements (=rows) for column 2
4) calculate mean value of all measurements (=rows) for column 4
5) output the filename timestamp, the mean of column 2, and the mean of column 4 to a data frame.
I have written the following function
moist.each.mean <- function() {
  library("tcltk")
  directory <- tk_choose.dir("", "Choose folder for Humidity data files")
  setwd(directory)
  filelist <- list.files(path = directory)
  filetitles <- regmatches(filelist, regexpr("[0-9].*[0-9]", filelist))
  mdf <- data.frame(timestamp=character(), humidity=numeric(), temp=numeric())
  for(i in 1:length(filelist)){
    file.in[[i]] <- read.csv(filelist[i], header=F)
    if (nrow(file.in[[i]]<3)){
      print("discard")
    } else {
      newrow <- c(filetitles[[i]], round(mean(file.in[[i]]$V2),1), round(mean(file.in[[i]]$V4),1))
      mdf <- rbind(mdf, newrow)
    }
  }
  names(mdf) <- c("timestamp", "humidity", "temp")
}
but I keep getting an error:
Error in `[[<-.data.frame`(`*tmp*`, i, value = list(V1 = c(10519949L, :
replacement has 18 rows, data has 17
Any ideas?
Thx, kruemelprinz
I'd also suggest using (l)apply... Here's my take:
getMeans <- function(fpath,
                     target_cols = c(2),
                     sep = ",",
                     dec = ".",
                     header = TRUE,
                     min_obs_threshold = 3) {
  f <- list.files(fpath)
  fcsv <- f[grepl("\\.csv", f)]    # keep only the csv files
  fcsv <- paste0(fpath, fcsv)      # assumes fpath ends in "/"
  csv_list <- lapply(fcsv, read.table, sep = sep,
                     dec = dec, header = header)
  csv_rows <- sapply(csv_list, nrow)
  rel_csv_list <- csv_list[!(csv_rows < min_obs_threshold)]  # drop files with too few rows
  lapply(rel_csv_list, function(x) colMeans(x[, target_cols, drop = FALSE]))
}
Also with that kind of error message, the debugger might be very helpful.
Just run debug(moist.each.mean) and execute the function stepwise.
Here's a slightly different approach. Use lapply to read each csv file, exclude it if necessary, otherwise create a summary. This gives you a list where each element is a data frame summary. Then use rbind to create the final summary data frame.
Without a sample of your data, I can't be sure the code below exactly matches your problem, but hopefully it will be enough to get you where you want to go.
# Get vector of filenames to read
filelist = list.files(path=directory, pattern="csv")
# Read all the csv files into a list and create summaries
df.list = lapply(filelist, function(f) {
  # no header row in these files, so columns come in as V1, V2, ...
  file.in = read.csv(f, header=FALSE, stringsAsFactors=FALSE)
  # Skip (just print a message) if file has less than 3 rows of data
  if (nrow(file.in) < 3) {
    print(paste("Discard", f))
  # Otherwise, capture file timestamp and summarise data frame
  } else {
    data.frame(timestamp=substr(f, 7, 22),
               humidity=round(mean(file.in$V2),1),
               temp=round(mean(file.in$V4),1))
  }
})
# Bind list into final summary data frame (excluding the list elements
# that don't contain a data frame because they didn't have enough rows
# to be included in the summary)
result = do.call(rbind, df.list[sapply(df.list, is.data.frame)])
One issue with your original code is that you create a vector of summary results rather than a data frame of results:
c(filetitles[[i]], round(mean(file.in[[i]]$V2),1), round(mean(file.in[[i]]$V4),1)) is a vector with three elements. What you actually want is a data frame with three columns:
data.frame(timestamp=filetitles[[i]],
           humidity=round(mean(file.in[[i]]$V2),1),
           temp=round(mean(file.in[[i]]$V4),1))
Thanks for the suggestions using lapply. This is definitely of value as it saves a whole lot of code! Meanwhile, I managed to fix my original code as well:
library("tcltk")
# directory: path to csv files
directory <-
tk_choose.dir("","Choose folder for Humidity data files")
setwd(directory)
filelist <- list.files(path = directory)
filetitles <-
regmatches(filelist, regexpr("[0-9].*[0-9]", filelist))
mdf <- data.frame()
for (i in 1:length(filelist)) {
file.in <- read.csv(filelist[i], header = F, skipNul = T)
if (nrow(file.in) < 3) {
print("discard")
} else {
newrow <-
matrix(
c(filetitles[[i]], round(mean(file.in$V2, na.rm=T),1), round(mean(file.in$V4, na.rm=T),1)), nrow = 1, ncol =
3, byrow = T
)
mdf <- rbind(mdf, newrow)
}
}
names(mdf) <- c("timestamp", "humidity", "temp")
Only I did not get it to work as a function, because then I would only have one row in mdf containing the last file's data. Somehow it did not add rows but overwrote row 1 with each iteration. Using it without a function wrapper worked fine, though...
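For what it's worth, one thing that stands out in the original function at the top of the question is that it never returns mdf (its last statement is the names(mdf) <- ... assignment), so the data frame built inside the function is not what ends up visible in the workspace afterwards. A minimal sketch of a wrapper that does return it (the directory argument and file.path call are my additions):
moist.each.mean <- function(directory) {
  filelist <- list.files(path = directory)
  filetitles <- regmatches(filelist, regexpr("[0-9].*[0-9]", filelist))
  mdf <- data.frame()
  for (i in seq_along(filelist)) {
    file.in <- read.csv(file.path(directory, filelist[i]), header = FALSE, skipNul = TRUE)
    if (nrow(file.in) < 3) next
    mdf <- rbind(mdf, data.frame(timestamp = filetitles[[i]],
                                 humidity  = round(mean(file.in$V2, na.rm = TRUE), 1),
                                 temp      = round(mean(file.in$V4, na.rm = TRUE), 1)))
  }
  mdf  # return the accumulated data frame to the caller
}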

R: selectively importing data from several csv files into single data frame while also changing data from rows to individual columns

I’m looking to do the following in R.
I have 250+ csv files of chromatographic data structured similarly to the example below, but with 21 rows instead of three:
1 4.708252 BB 9.946890 7.830349 0.01982016 4.684836 4.742056
2 4.970352 BB 1.792341 1.497008 0.01896829 4.945352 5.005390
3 6.393414 BB 6.599891 5.309925 0.01950091 6.368413 6.428723
What I want to do is read a subset of the data in all 250 files into a single data frame, which is easy enough — but I also need to restructure it a fair bit.
Every row in the table above is a peak. I only want the data from the first and fourth columns (which are ‘peak number’ and ‘area under the peak’, respectively), and in the output I need to make each peak an individual column, rather than a row as above, with the peak number as the header. Finally, I want to create a new column where each row (that is, the data from each individual csv file) is given the same name as the csv file name.
So, imagine I have 3 files: ABC1.csv, ABC2.csv, and ABC3.csv. Each file looks like my example above. I want to automatically take all those files and merge them into a single data frame such as the one below.
ID 1 2 3
ABC1 9.94689 1.792341 6.599891
ABC2 9.76651 1.932332 6.600022
ABC3 8.99193 2.556471 6.718934
I hope I’ve made this clear enough. I’ve been able to manage most of the steps but haven’t been successful writing them into a single script. And I have no idea how, if there is any way, to make the file name into a variable.
Cheers
I am assuming the working directory is set to where the files are. Then you can get the list of files below.
filenames <- list.files()
Have a helper function to read a file and keep just columns 1 and 4.
readdata <- function(filename) {
  # the sample rows have no header line, so read with header=FALSE
  # (adjust sep= if the files are not actually comma-separated)
  df <- read.csv(filename, header = FALSE)
  vec <- df[, 4]           # area under the peak
  names(vec) <- df[, 1]    # peak number becomes the element name
  return(vec)
}
Loop over all of the files and rbind them
result <- do.call(rbind, lapply(filenames, readdata))
Name them as you like
row.names(result) <- filenames
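If you would rather have the filename as an ID column (as in the desired output) instead of as row names, one extra step does it; the sub() call stripping the extension is my addition:
result <- data.frame(ID = sub("\\.csv$", "", filenames), result, check.names = FALSE)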
The following code can probably be of some help, though the file name is still not working properly:
path <- "C:\\Users\\Vidyut\\"
filenames <- list.files(path = path,pattern = ".csv")
l <- data.frame(ID=character(),col1=numeric(),col2=numeric(),col3=numeric(),stringsAsFactors=FALSE)
for (i in filenames) {
#i = filenames[1]
full = paste(path,i,sep="")
m <- read.csv(full, header=F)
# extract the subset of rows required from each file
# m <- m[c(),]
n<- m[,c(1,4)]
y <- gsub('.csv','',i)
print("y=")
print(y)
d <- list(ID=as.character(y),col1=n[1,2],col2=n[2,2],col3=n[3,2])
print("d=")
print(d)
l <- rbind.data.frame(l,d)
print("l=")
print(l)
}
Mind you, this is not very pretty code - just something hacked together to get the job done (as is visible from the multiple print calls scattered throughout).
Here's a solution for you. This only works if we can assume that there are exactly 21 peaks in each file and they are in order 1:21. If that's not the case a few changes to the code should remedy this.
folder = "c:/temp/"
files <- dir(folder)
first_loop <- TRUE
for (file in files) {
# Read one file, only the first and fourth columns
temp <- read.csv(file=paste0(folder,file),
header = FALSE,
colClasses = c("integer", "NULL", "NULL", "numeric", "NULL", "NULL", "NULL", "NULL"))
# Transpose the data
temp <- data.frame(t(temp))
# Remove the peak number
temp <- temp[2,]
# Concatenate the dataframes together
temp$file <- file
if (first_loop) {
data <- temp
first_loop <- FALSE
} else {
data <- rbind(data, temp)
}
}
data
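A small optional cleanup afterwards, if you want the column names to be the bare peak numbers and the file column to match the ID style in the desired output; the sub() calls are my additions, not part of the answer above:
names(data) <- sub("^X", "", names(data))   # data.frame(t(...)) prefixes the numeric peak names with "X"
data$file <- sub("\\.csv$", "", data$file)  # drop the extension so ABC1.csv becomes ABC1
rownames(data) <- NULL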

Scanning a csv file for a string in R

I would like to be able to scan a csv file row by row in R and exclude the rows that contain the word "target".
The problem is that the data comes from different places and the word "target" can come up in a number of different columns in the data frame.
So I need a line in a function that will look for this string, and if it is not present, then append that row to a new data frame (that I will then write out as a new csv).
Any and all help gratefully received.
Andrie's comment is probably the way most users would approach this, but if you want to do this at the reading in stage, you can try this:
Read in your csv using readLines and make any lines that have the text target blank:
temp = gsub(".*target.*", "", readLines("test.csv"))
Use read.table to convert temp to a data.frame. Since all lines that have the text target are now blank, the default blank.lines.skip=TRUE in read.table should correctly read in the rest of your data as a data.frame.
read.table(text=temp, sep=",", header=TRUE)
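And since the end goal is a new csv without those rows, the result can be captured and written back out; the file names here are just placeholders:
temp <- gsub(".*target.*", "", readLines("test.csv"))
cleaned <- read.table(text = temp, sep = ",", header = TRUE)
write.csv(cleaned, "test_no_target.csv", row.names = FALSE)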
Use readLines:
lines <- readLines(file)
n.lines <- length(lines)
vec.1 <- rep(0, n.lines)
vec.2 <- rep(0, n.lines)
# more vectors as necessary
counter <- 0
for (i in 1:n.lines){
  this.line <- strsplit(lines[i], ",")[[1]]   # split the row into its fields
  if ("target" %in% this.line) next
  counter <- counter + 1
  vec.1[counter] <- this.line[1]
  vec.2[counter] <- this.line[2]
  # etc.
}
df <- data.frame(vec.1[1:counter], vec.2[1:counter])
You may have to change n.lines slightly and change the indexing of the for loop if your file has headers; two lines would change as follows:
n.lines <- length(lines) - 1
and
for(i in 2:(n.lines+1)){
I would call from.readLines <- readLines(filename) and then just sub-select the rows that don't contain the target string: data <- read.csv(text = from.readLines[-grep('target', from.readLines)], header = F).
The faster way to do it (if your file is huge) would be to grep -v 'target' original.csv > new.csv first on the command line and then run read.csv(new.csv, ...) in R.
But anyway,
> #Without header
> from.readLines <- c('afaf,afasf,target', 'afaf,target,afasf', 'dagdg,asgst,sagga', 'dagdg,dg,sfafgsgg')
> data <- read.csv(text = from.readLines[-grep('target', from.readLines)], header = F)
> print(data)
V1 V2 V3
1 dagdg asgst sagga
2 dagdg dg sfafgsgg
>
> #With header
> from.readLines <- c('var1,var2,var3', 'afaf,afasf,target', 'afaf,target,afasf', 'dagdg,asgst,sagga', 'dagdg,dg,sfafgsgg')
> data <- read.csv(text = from.readLines[-(grep('target', from.readLines[-1]) + 1)])
> print(data)
var1 var2 var3
1 dagdg asgst sagga
2 dagdg dg sfafgsgg
