Modify columns in a data frame by using a function - R

I'm trying to modify my data frame's columns and their positions. I finally found a way to do that, but now I want to wrap the whole process in a function, apply it to every data set in the directory, and overwrite the original data.
kw <- as.data.frame(matrix(1:11400, ncol = 19)) # to make sample data (columns V1..V19)
kw <- kw[, !(colnames(kw) %in% c("V18","V19"))] # to remove last two cols
kw$V18 <- 0; kw$V19 <- 0                        # add new columns with all zero values
kw$V1 <- kw$V1 * 1000                           # to modify first col of data frame
kw <- kw[, c(1, 18:19, 2:17)]                   # to reorder col positions
Let's say I have data sets like this in the directory:
kw <- read.table("5LSTT-test10.avgm", header = FALSE, fill = FALSE) # example of how I read a single data set
5LSTT-test10.avgm
...
5LSTT-test10.avgm
How can I apply this column modification process to each data set separately and then overwrite it (or write new files)?
Edit: output of readLines("5LSTT-test10.avgm", n = 1)
You can see it has 19 columns; assume the data has 600 rows.
[1] " 9.0000E-02 0.0000E+00 2.3075E-03 -6.4467E-03 9.9866E-01 9.8648E-02 4.5981E-02 9.8004E-01 1.2359E-01 6.1175E-02 9.7701E-01 8.6662E-02 3.0034E-02 9.7884E-01 7.0891E-02 8.2247E-03 9.8564E-01 -8.7967E-11 4.3105E-02"

With "data.table" you would be able to do something like:
setcolorder(
  fread(yourfile)[, c("V1", "V18", "V19") := list(V1 * 1000, 0, 0)],
  c(1, 18:19, 2:17))
Thus, if you really need a function, you can do something like:
myFun <- function(infile) {
  require(data.table)
  write.table(
    setcolorder(
      fread(infile)[
        , c("V1", "V18", "V19") := list(V1 * 1000, 0, 0)],
      c(1, 18:19, 2:17)),
    file = gsub("(.*)(\\..*)", "\\1_new\\2", infile),
    row.names = FALSE)
}
You can then use myFun within lapply over a vector of the files you want to read and process.
In other words:
lapply(myListOfFilePaths, myFun)
By default, this function does not overwrite your file; instead it writes a new one with "_new" appended to the name, just before the extension.
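For example, the vector of file paths could be built with list.files(); the ".avgm" pattern below is an assumption based on the files mentioned in the question:
# full paths of all .avgm files in the working directory
myListOfFilePaths <- list.files(pattern = "\\.avgm$", full.names = TRUE)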

This could be another way
Read all the files and store them in a list like this:
# to list down all the files in the directory
files.new = list.files(directory.path, recursive = TRUE, pattern = "\\.avgm$")
# to read all the files and store them in a list
# (the files are whitespace-separated with no header, as in the question)
file.contents = lapply(paste(directory.path, files.new, sep = "/"),
                       read.table, header = FALSE)
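Since the writing step further down looks up the names of the list elements, it helps to name them after their source files right away (a small assumed convenience):
# name each list element after its source file so the names can be reused when writing
names(file.contents) <- files.new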
Next you can apply the modifications to each of the data sets in the list, something like this:
outlist = lapply(file.contents, function(x){
  # modifications
  kw <- x[, !(colnames(x) %in% c("V18","V19"))]
  kw$V18 <- 0
  kw$V19 <- 0
  kw$V1 <- kw$V1 * 1000
  kw <- kw[, c(1, 18:19, 2:17)]
  kw
})
and write the modified data into new files using the function below
# function to write files from a list object
# (assumes the list elements are named after the original files, as above)
write.files = function(modified.list, path){
  # keep only the elements that actually contain data
  outlist = modified.list[sapply(modified.list, function(x) length(x) > 1)]
  sapply(names(outlist), function(x)
    write.table(outlist[[x]], file = paste(path, x, sep = "/"),
                sep = "\t", row.names = FALSE))
}
Writing the data to files
write.files(outlist, "/directory/path")

Related

Loop through files in R and select rows by string

I have a large number of CSV files. I need to extract relevant data from each file, and compile all of the relevant data into a new file.
I have been copying/pasting the code below and changing relevant details (e.g., file name) to repeat the same process for many CSV files. After that, I use cbind()/write.xlsx() to combine all of the relevant data and write it to an excel file. I need a more efficient method to accomplish this task.
How can I:
1) incorporate a loop that imports a large number of CSV files (to replace #1 below)
2) select relevant rows based on a string instead of entering specific row numbers (to replace #2 below)
3) combine all of the relevant data into a single data frame with each file's data in one column?
library(tidyr)
# 1 - import raw data files
file1 <- read.csv ("1.csv", header = FALSE, sep = "\n")
# 2 - select relevant rows
file1 <- as.data.frame(file1[c(41:155),])
colnames(file1) <- c("file1")
#separate components of each line from raw csv file / isolate data
temp1 <- separate(file1, file1, into = c("Text", "IntNum", "Data"), sep = "\\s")
temp1 <- temp1$Data
temp1 <- as.data.frame(temp1)
If the number of relevant rows in each file is the same, you could do it like this. Option 1 shows a solution using a loop, option 2 shows a solution using sapply.
In a first step I generate three csv-files to make the code reproducible. The start row in each file is defined by "start", the end row by "end". I then get a list with the names of these files with dir().
#make csv-files, target vector always same length (3)
set.seed(1)
for (i in 1:3) {
  df <- data.frame(x = c(rep(0, sample(1:10, 1)), "start",
                         paste0("dat", i),
                         "end", rep(0, sample(1:10, 1))))
  write.csv(df, file = paste0("file", i, ".csv"), quote = FALSE, row.names = FALSE)
}
#get list of file names
allFiles <- dir(pattern = glob2rx("*.csv"))
Option 1 - loop
For the loop you could first initialize a result data frame ("outDF") with the number of columns set to the number of csv-files and the number of rows set to the length of the target vector in each file ("start" to "end"). You can then loop over the files and fill the data frame. The start and end rows can be indexed using which().
#initialise result data frame
outDF <- data.frame(matrix(nrow = 3, ncol = length(allFiles),
dimnames = list(NULL, allFiles)))
#loop over csv files
for (iFile in allFiles) {
  idat <- read.csv(iFile, stringsAsFactors = FALSE) #read csv
  outDF[, iFile] <- idat[which(idat$x == "start"):which(idat$x == "end"),]
}
Option 2 - sapply
Instead of a loop you could use sapply with a custom function to extract the relevant rows in each file. This returns a matrix which you could then transform into a dataframe.
out <- sapply(allFiles, FUN = function(x) {
  idat <- read.csv(x, stringsAsFactors = FALSE)
  return(idat[which(idat$x == "start"):which(idat$x == "end"),])
})
outDF <- as.data.frame(out)
If the number of rows between "start" and "end" differs between files, the above options won't work. In this case you could generate a data frame by first using lapply() (similar to option 2) to build a result list (whose elements have different lengths), then padding the shorter elements with NAs, and finally transforming the result into a data frame again.
#make csv-files with target vectors of different lengths (3:12)
set.seed(1)
for (i in 1:3) {
  df <- data.frame(x = c(rep(0, sample(1:10, 1)), "start",
                         rep(paste0("dat", i), sample(1:10, 1)),
                         "end", rep(0, sample(1:10, 1))))
  write.csv(df, file = paste0("file", i, ".csv"), quote = FALSE, row.names = FALSE)
}
#lapply
out <- lapply(allFiles, FUN = function(x) {
  idat <- read.csv(x, stringsAsFactors = FALSE)
  return(idat[which(idat$x == "start"):which(idat$x == "end"),])
})
out <- lapply(out, `length<-`, max(lengths(out)))
outDF <- as.data.frame(do.call(cbind, out))

Correlation between variables in multiple files

I have 280 *.csv files in a directory. Each file has 3 columns and 1000 rows. I want to estimate Pearson's correlation between column 2 and 3 of each file and put the correlation value in the first cell of column 4, and also all 280 correlation values in a separate file. How can I do this in R?
I have tried several pieces of code, including the one below, which I know is incorrect, but I do not know how to write it properly. Please help.
files <- list.files(path="mydirectory", pattern="*.csv", full.names=TRUE,
recursive=FALSE)
function(files)
lapply(files,function(x){
x <- read.csv(files, header = TRUE)
out <- function(cor(files[,2:3])
write.csv(out, sep = "\t", quote = FALSE, row.names = FALSE)
})
As for the first part, that's easy. You can calculate the correlations in an lapply loop and write them to a new file:
lapply(files, function(f) {
  # Read CSV data
  csv_data <- read.csv(f, header = TRUE)
  # Calculate correlation
  csv_data[, 4] <- cor(csv_data[, 2], csv_data[, 3])
  # Create a new filename by replacing the ending of the
  # input file (.csv) with (_cor.csv)
  newfile <- gsub("\\.csv$", "_cor.csv", f)
  write.csv(csv_data, file = newfile, quote = FALSE)
})
Since R wants columns in data.frames to have the same number of rows, this will fill every row of the 4th column with the correlation value. I would roll with this, but if you have a lot of data this can waste storage. Here's a not very elegant solution to only have the correlation in the first row:
lapply(files, function(f) {
  # Read CSV data
  csv_data <- read.csv(f, header = TRUE)
  # Calculate correlation
  csv_data[, 4] <- cor(csv_data[, 2], csv_data[, 3])
  # Now delete duplicate values of cor
  csv_data[2:nrow(csv_data), 4] <- NA
  # Create a new filename by replacing the ending of the
  # input file (.csv) with (_cor.csv)
  newfile <- gsub("\\.csv$", "_cor.csv", f)
  # Now when we write, we tell R to write an empty string when it encounters
  # missing values
  write.csv(csv_data, file = newfile, quote = FALSE, na = "")
})
Also:
You do not need to call function() when you use functions that already exist (like lapply() or cor()). You only need to use that when you want to define a new function yourself.
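For instance (a purely illustrative sketch):
# calling an existing function directly -- no function() needed
cor(c(1, 2, 3), c(2, 4, 6))
# function() is only needed when you define a new function of your own
col_cor <- function(dat) cor(dat[, 2], dat[, 3])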
If you want to have the output in a single data.frame try:
my_df <- do.call(rbind,
  lapply(files, function(f) {
    # Read CSV data
    csv_data <- read.csv(f, header = TRUE)
    # Calculate correlation
    data.frame(File = f, Correlation = cor(csv_data[, 2], csv_data[, 3]))
  })
)
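If you also want all 280 correlation values collected in one separate file, as the question asks, you could then write my_df out once (the file name here is just an example):
# one row per input file: file name plus its column 2 vs column 3 correlation
write.csv(my_df, "all_correlations.csv", row.names = FALSE, quote = FALSE)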

R: Split and write very large data frame into slices

I have a large dataframe my_df in R containing 1,983,000 records. The following lines of sample code take the chunk of 1000 rows starting from 25001, do some processing, and write the processed data to a file on the local disk.
my_df1 <- my_df[25001:26000,]
my_df1$end <- as.POSIXct(paste(my_df1$end,"23:59",sep = ""))
my_df1$year <- lubridate::year(my_df1$start)
str_data <- my_df1
setwd("path_to_local_dir/data25001_26000")
write.table(str_data, file = "data25001-26000.csv",row.names = F,col.names = F,quote = F)
and so on like this:
my_df2 <- my_df[26001:27000,]
...
I would like to automate this task so that the chunks of 1000 records are processed and written to a new directory. Any advice on how this could be done?
Consider generalizing your process in a function, data_to_disk, and calling the function with an iterator method like lapply, passing a sequence of integers from seq() for each subsequent thousand. Also, incorporate dynamic directory creation (but maybe dump all 1,000+ files into one directory instead of 1,000+ dirs?).
data_to_disk <- function(num) {
  str_data <- within(my_df[num:(num + 999), ], {
    end <- as.POSIXct(paste0(end, "23:59"))
    year <- lubridate::year(start)
  })
  my_dir <- paste0("path_to_local_dir/data", num, "_", num + 999)
  if (!dir.exists(my_dir)) dir.create(my_dir)
  write.table(str_data, file = paste0(my_dir, "/", "data", num, "-", num + 999, ".csv"),
              row.names = FALSE, col.names = FALSE, quote = FALSE)
  return(str_data)
}
seqs <- seq(25001, nrow(my_df), by=1000)
head(seqs)
# [1] 25001 26001 27001 28001 29001 30001
tail(seqs)
# [1] 1977001 1978001 1979001 1980001 1981001 1982001
# LIST OF 1,958 DATA FRAMES
df_list <- lapply(seqs, data_to_disk)
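Note that seqs starts at 25001 only to mirror the question's example; to cover the whole data frame from the first row you would instead use (an assumption about the intended range):
seqs <- seq(1, nrow(my_df), by = 1000)   # chunks 1-1000, 1001-2000, ...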
Here is my code doing the sliced loop:
step1 = 1000
runto = nrow(my_df)
nsteps = ceiling(runto/step1)
for (part in seq_len(nsteps)) { # part = 1
  cat(part, 'of', nsteps, '\n')
  fr = (part-1)*step1 + 1
  to = min(part*step1, runto)
  my_df1 = my_df[fr:to,]
  # ... process my_df1 here and assign the result to str_data
  write.table(str_data, file = paste0("data", fr, "-", to, ".csv"))
}
rm(part, step1, runto, nsteps, fr, to)
You can add a grouping variable to your data first (e.g., to identify every 1000 rows), then use d_ply() to split the data and write to file.
library(plyr)   # d_ply()
library(dplyr)  # filter(), distinct(), %>%
library(readr)  # write_csv()

df <- data.frame(var = runif(1000000))
df$fold <- cut(seq(1, nrow(df)), breaks = 100, labels = FALSE)
df %>% filter(fold <= 2) %>% # only writes first two files
  d_ply(., .(fold), function(i){
    # make filenames 'data1.csv', 'data2.csv'
    write_csv(i, paste0('data', distinct(i, fold), '.csv'))
  })
This is similar to @Parfait's answer but takes a lot of stuff out of the function. Specifically, it creates a copy of the entire dataset first and then performs the time manipulation functions on it.
my_df1 <- my_df
my_df1$end <- as.POSIXct(paste(my_df1$end, "23:59", sep = ""))
my_df1$year <- lubridate::year(my_df1$start)

lapply(seq(25001, nrow(my_df1), by = 1000),
       function(i) write.table(my_df1[i:(i + 1000 - 1), ],
                               file = paste0('path_to_local_dir/data',
                                             i, '-', i + 1000 - 1, '.csv'),
                               row.names = F, col.names = F, quote = F)
)
For me, I'd probably just do:
write.table(my_df1, file = ...)
and be done with it. I don't see the advantages of splitting it up - 1 million rows really isn't that many.
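If writing a couple of million rows with write.table() does feel slow, data.table::fwrite() is a much faster drop-in for this kind of one-shot export (a sketch; the file name is just an example):
# fast single-file export of the whole processed data frame
data.table::fwrite(my_df1, "data_all.csv")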

New variable from list.file

I am creating an object calling all .csv files in a directory, reading them in according to some specifications, and merging them.
Before merging them I want to take the first two letters of each file name and create a new column in the corresponding table reporting those two letters as a variable.
I got this far:
temp = list.files(pattern="*.csv")
myfiles = lapply(temp, function(x)
  read.csv(x,
           header = TRUE,
           #sep=";",
           stringsAsFactors = F,
           encoding = "UTF-8",
           na.strings = c("NA", ""),
           colClasses = c("code" = "character")))
myfiles.final = do.call(rbind, myfiles)
When I try to create the new variable though I generate a replacement that has double the rows of the data:
temp.2 <- lapply(temp, function(x) substr(x, start = 1, stop = 2))
myfiles.2 = lapply(myfiles,
function(x){
a <- temp.2[seq_along(myfiles)]
x$identifier <- rep(a,nrow(x))
return(x)
})
In the folder the files are named, for example, AA029893.csv, BB024593.csv, and so on. For the first table I just want a new column called "identifier" that has "AA" for all entries, for the second "BB", and so on.
Thanks a lot
lapply is good for iterating along 1 list (e.g., myfiles data frames). To add a column to each data frame, you want to iterate in parallel over two lists, the list of data frames and the list of names. Map does this (for an arbitrary number of lists):
myfiles.2 = Map(function(dd, nn) {dd$identifier = nn; return(dd)},
dd = myfiles, nn = temp.2)
An easier alternative is to add the column post-hoc:
myfiles.final = do.call(rbind, myfiles)
myfiles.final$identifier = rep(
  sapply(temp, function(x) substr(x, start = 1, stop = 2)),
  times = sapply(myfiles, nrow)
)
The easiest alternative is to use data.table::rbindlist or dplyr::bind_rows, either of which will automatically add an ID column based on the names of your list. Depending on the size of your data, they might be quite a bit faster as well.
names(myfiles) = sapply(temp, function(x) substr(x, start = 1, stop = 2))
# either of these adds an "identifier" column taken from the list names:
myfiles.2 = dplyr::bind_rows(myfiles, .id = "identifier")
myfiles.2 = data.table::rbindlist(myfiles, idcol = "identifier")

Mean values from multiple csv to data frame

After having searched for help in different threads on this topic, I still have not become wiser. Therefore: Here comes another question on looping through multiple data files...
OK. I have multiple CSV files in one folder containing 5 columns of data. The filenames are as follows:
Moist yyyymmdd hh_mm_ss.csv
I would like to create a script that reads and processes the CSV files one by one, doing the following steps:
1) load the file
2) check the number of rows and exclude the file if it has fewer than 3 registrations
3) calculate the mean value of all measurements (= rows) for column 2
4) calculate the mean value of all measurements (= rows) for column 4
5) output the filename timestamp, the mean of column 2 and the mean of column 4 to a data frame
I have written the following function:
moist.each.mean <- function() {
  library("tcltk")
  directory <- tk_choose.dir("","Choose folder for Humidity data files")
  setwd(directory)
  filelist <- list.files(path = directory)
  filetitles <- regmatches(filelist, regexpr("[0-9].*[0-9]", filelist))
  mdf <- data.frame(timestamp=character(), humidity=numeric(), temp=numeric())
  for(i in 1:length(filelist)){
    file.in[[i]] <- read.csv(filelist[i], header=F)
    if (nrow(file.in[[i]]<3)){
      print("discard")
    } else {
      newrow <- c(filetitles[[i]], round(mean(file.in[[i]]$V2),1), round(mean(file.in[[i]]$V4),1))
      mdf <- rbind(mdf, newrow)
    }
  }
  names(mdf) <- c("timestamp", "humidity", "temp")
}
but I keep getting an error:
Error in `[[<-.data.frame`(`*tmp*`, i, value = list(V1 = c(10519949L, :
replacement has 18 rows, data has 17
Any ideas?
Thx, kruemelprinz
I'd also suggest using (l)apply... Here's my take:
getMeans <- function(fpath,
                     target_cols = c(2),
                     sep = ",",
                     dec = ".",
                     header = TRUE,
                     min_obs_threshold = 3){
  f <- list.files(fpath)
  fcsv <- f[grepl("\\.csv", f)]           # keep only the csv files
  fcsv <- paste0(fpath, fcsv)             # assumes fpath ends with "/"
  csv_list <- lapply(fcsv, read.table, sep = sep,
                     dec = dec, header = header)
  csv_rows <- sapply(csv_list, nrow)
  rel_csv_list <- csv_list[!(csv_rows < min_obs_threshold)]
  lapply(rel_csv_list, function(x) colMeans(x[, target_cols, drop = FALSE]))
}
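A call could then look something like this (the folder path is a placeholder, and target columns 2 and 4 plus header = FALSE are assumptions matching the question's files):
# means of columns 2 (humidity) and 4 (temperature) for every csv with at least 3 rows
res <- getMeans("path/to/humidity/data/", target_cols = c(2, 4), header = FALSE)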
Also with that kind of error message, the debugger might be very helpful.
Just run debug(moist.each.mean) and execute the function stepwise.
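For instance (a minimal sketch):
debug(moist.each.mean)    # flag the function so the next call opens the browser
moist.each.mean()         # step through with n, inspect variables, quit with Q
undebug(moist.each.mean)  # clear the flag when finished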
Here's a slightly different approach. Use lapply to read each csv file, exclude it if necessary, otherwise create a summary. This gives you a list where each element is a data frame summary. Then use rbind to create the final summary data frame.
Without a sample of your data, I can't be sure the code below exactly matches your problem, but hopefully it will be enough to get you where you want to go.
# Get vector of filenames to read
filelist=list.files(path=directory, pattern="csv")
# Read all the csv files into a list and create summaries
df.list = lapply(filelist, function(f) {
  # Read the file (no header, as in the question, so columns are V1..V5)
  file.in = read.csv(f, header = FALSE, stringsAsFactors = FALSE)
  # Skip this file if it has less than 3 rows of data
  if (nrow(file.in) < 3) {
    print(paste("Discard", f))
  # Otherwise, capture file timestamp and summarise data frame
  } else {
    data.frame(timestamp = substr(f, 7, 22),
               humidity = round(mean(file.in$V2), 1),
               temp = round(mean(file.in$V4), 1))
  }
})
# Bind list into final summary data frame (excluding the list elements
# that don't contain a data frame because they didn't have enough rows
# to be included in the summary)
result = do.call(rbind, df.list[sapply(df.list, is.data.frame)])
One issue with your original code is that you create a vector of summary results rather than a data frame of results:
c(filetitles[[i]], round(mean(file.in[[i]]$V2),1), round(mean(file.in[[i]]$V4),1)) is a vector with three elements. What you actually want is a data frame with three columns:
data.frame(timestamp=filetitles[[i]],
humidity=round(mean(file.in[[i]]$V2),1),
temp=round(mean(file.in[[i]]$V4),1))
Thanks for the suggestions using lapply. This is definitely of value as it saves a whole lot of code! Meanwhile, I managed to fix my original code as well:
library("tcltk")
# directory: path to csv files
directory <- tk_choose.dir("", "Choose folder for Humidity data files")
setwd(directory)
filelist <- list.files(path = directory)
filetitles <- regmatches(filelist, regexpr("[0-9].*[0-9]", filelist))
mdf <- data.frame()
for (i in 1:length(filelist)) {
  file.in <- read.csv(filelist[i], header = F, skipNul = T)
  if (nrow(file.in) < 3) {
    print("discard")
  } else {
    newrow <- matrix(c(filetitles[[i]],
                       round(mean(file.in$V2, na.rm = T), 1),
                       round(mean(file.in$V4, na.rm = T), 1)),
                     nrow = 1, ncol = 3, byrow = T)
    mdf <- rbind(mdf, newrow)
  }
}
names(mdf) <- c("timestamp", "humidity", "temp")
Only I did not get it to work as a function, because then I would only have one row in mdf containing the last file's data. Somehow it did not add rows but overwrote row 1 with each iteration. But using it without a function wrapper worked fine...
