Split multiple csv files in chunks based on condition - r

I have 400+ fairly large CSV files (about a million rows each), all with a similar structure:
A long header, of which I only need the 2nd and 3rd rows
A first time series (always preceded by 'Target1')
A second time series (always preceded by 'Target2')
Here is an example of data :
#multiple rows of header#
Target 1
Timestamp,X,Y,Z
1553972886851,0.017578,-0.003052,-0.971375
1553972886851,0.017883,-0.003662,-0.980408
1553972886851,0.016418,-0.003174,-0.977295
1553972886999,0.017151,-0.002808,-0.978088
1553972886999,0.016785,-0.003113,-0.977051
1553972886999,0.017883,-0.002197,-0.975830
1553972887096,0.017517,-0.003113,-0.976624
1553972887096,0.017883,-0.003113,-0.977966
1553972887096,0.017883,-0.002869,-0.978210
1553972887243,0.017151,-0.003113,-0.976135
1553972887243,0.018250,-0.003235,-0.975647
1553972887243,0.017273,-0.002991,-0.976257
1553972887340,0.018372,-0.003235,-0.977722
1553972887340,0.017761,-0.003235,-0.978027
Target 2
Timestamp,X,Y,Z
1553972886753,-0.411585,0.072409,-0.849848
1553972886753,-0.339177,-0.053354,-0.556402
1553972886753,-0.411585,-0.262957,-0.483994
1553972886855,-0.506860,-0.057165,-0.472561
1553972886855,-0.499238,-0.007622,-0.529726
1553972886855,-0.472561,-0.041921,-0.560213
1553972887002,-0.510671,-0.083841,-0.480183
1553972887002,-0.525915,-0.057165,-0.480183
1553972887002,-0.544969,-0.038110,-0.522104
1553972887098,-0.510671,-0.030488,-0.510671
1553972887098,-0.529726,-0.026677,-0.525915
1553972887098,-0.510671,-0.068598,-0.518293
I need to split each CSV file into those 3 parts and name the parts accordingly.
I managed to do step 1) and step 3) but am struggling with step 2).
Here is what I did for step 3) :
# Extract the "Target2" section of every file below the working directory
# and write it next to the source file as "Target2-<name>.<extension>".

# Keep the relative paths returned by list.files(): with recursive = TRUE,
# stripping them with basename() makes files in sub-directories unreadable.
fileNames <- list.files(path = ".", recursive = TRUE)
extension <- "txt"

for (fileName in fileNames) {
  # Read the raw lines only to locate the section marker.
  Lines <- readLines(fileName)

  # Accept both spellings of the marker ("Target2" / "Target 2") and
  # tolerate stray whitespace around it.
  ix <- which(trimws(Lines) %in% c("Target2", "Target 2"))
  if (length(ix) == 0L) {
    # Without a marker, `skip = integer(0)` would make read.csv() fail;
    # this also skips output files produced by a previous run.
    warning("No 'Target2' marker found in ", fileName, "; file skipped",
            call. = FALSE)
    next
  }
  ix <- ix[1L]

  # Anchor the extension at the end of the name so that only a real
  # suffix is removed before the output name is built.
  stem <- sub(paste0("\\.", extension, "$"), "", basename(fileName))
  newFileName <- file.path(
    dirname(fileName),
    paste0("Target2-", stem, ".", extension)
  )

  # Skip everything up to and including the marker; the next line is the
  # column header of the second series.
  target2_data <- read.csv(fileName, header = TRUE, sep = ",", skip = ix)

  write.table(
    target2_data,
    newFileName,
    append = FALSE,
    quote = FALSE,
    sep = ",",
    row.names = FALSE,
    col.names = TRUE
  )
}
I'm quite sure this is not the most straightforward approach, and I can't get the data between Target 1 and Target 2 this way. It is also very slow, so I was wondering whether there is a more memory-efficient approach?

#' Split a file into one output file per "Target" section
#'
#' Reads `filename`, treats every line starting with "Target" as a section
#' marker, and writes the lines following each marker (up to the next marker)
#' to "<stem>_<i>.csv", where `i` is the running number of the section and
#' `<stem>` is `filename` without a trailing ".csv". Lines before the first
#' marker (the file header) and the marker lines themselves are not written.
#'
#' @param filename Path of the file to split.
#' @return Invisibly, a list with one element per section written.
foo <- function(filename) {
  cat("\nprocessing", filename, "...")
  lines <- readLines(con = filename)

  is_marker <- grepl("^Target", lines)
  # Number every line by the marker that precedes it; 0 means "before the
  # first marker". Selecting section_id > 0 drops the header explicitly, so a
  # file that starts directly with a marker does not lose its first section.
  section_id <- cumsum(is_marker)
  keep <- !is_marker & section_id > 0L
  sections <- split(lines[keep], section_id[keep])

  # Strip only a trailing ".csv". An unanchored pattern would rewrite the
  # first ".csv" found anywhere in the path, e.g. in a directory name, and
  # a name without ".csv" would make the output overwrite the input file.
  stem <- sub("\\.csv$", "", filename)

  invisible(lapply(seq_along(sections), function(i) {
    # The sections are plain text lines, so they are written back verbatim.
    writeLines(sections[[i]], paste0(stem, "_", i, ".csv"))
  }))
}
# Ask for full paths: list.files() returns bare file names by default, which
# readLines() inside foo() can only open when the working directory happens
# to be "path/to/files".
files <- list.files(
  path = "path/to/files",
  pattern = ".+\\.csv$",
  full.names = TRUE
)
# foo() is called for its side effect (writing files); hide the list of
# return values that lapply() would otherwise print.
invisible(lapply(files, foo))

Related

Rbind to csv file in R [duplicate]

I have a .csv file with 175 rows and 6 columns. I want to append a 176th row. My code is as follows:
# One-row data frame holding the record to append. The values are passed
# unnamed, so data.frame() makes up the column names itself.
x <- data.frame('1', 'ab', 'username', '<some.sentence>', '2017-05-04T00:51:35Z', '24')
# Appends to Tweets.csv, but sep, quote, row.names and col.names are not set,
# so write.table() falls back to its defaults (space separator, quoted
# strings, row and column names written) instead of producing one
# comma-separated record.
write.table(x, file = "Tweets.csv", append = T)
What I expect to see is:
Instead, my result is:
How should I change my code?
# Append `x` to Tweets.csv as a bare comma-separated record: no header line,
# no row names, no quoting.
# NOTE(review): with quote = FALSE a field that itself contains a comma would
# be split into two columns -- confirm the data cannot contain commas.
write.table(
  x,
  file = "Tweets.csv",
  append = TRUE,
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

for Loop in R not binding files

I'm fairly new to R, so my apologies if this is a very basic question.
I'm trying to read two Excel files in, using the list.files(pattern) method, then using a for loop to bind the files and replace values in the bound file. However, the output that my script is producing is the output from only one file, meaning that it is not binding.
The file names are fact_import_2020 and fact_import_20182019.
# Folder that holds the Excel files (note the trailing "/").
FilePath <- "//srdceld2/project2/"
# Bare file names (full.names = FALSE) of everything matching "fact_import_20".
FileNames <- list.files(path = FilePath, pattern = "fact_import_20", all.files = FALSE,
full.names = FALSE, recursive = FALSE,
ignore.case = FALSE, include.dirs = FALSE, no.. = FALSE)
FileCount <- length(FileNames)
# NOTE(review): the closing brace of this loop is not part of the excerpt.
for(i in 1:FileCount){
# Both reads below use FileNames[i], i.e. the same file is read twice.
# paste() also adds a second "/" after FilePath, which already ends in one.
MOH_TotalHC_1 <- read_excel(paste(FilePath, "/", FileNames[i], sep = ""), sheet = 1, range = cell_cols("A:I"))
MOH_TotalHC_2 <- read_excel(paste(FilePath, "/", FileNames[i], sep = ""), sheet = 1, range = cell_cols("A:I"))
# Stacks file i on top of itself; MOH_TotalHC is reassigned on every
# iteration, so only the data of the last file survives the loop.
MOH_TotalHC <- rbind(MOH_TotalHC_1, MOH_TotalHC_2)
# Keep only rows without missing values.
MOH_TotalHC <- MOH_TotalHC[complete.cases(MOH_TotalHC), ]
use full.names = TRUE in list.files().
After this, make sure FileNames has full path of the files.
Then loop through the filenames, instead of filecount.
I think, you are trying to do this. I am guessing here. Please see below.
You are getting data from one file, because you are overwriting the data from file-2 with data from file-1. The for() loop is indicating it.
# Collect the workbooks with their full paths so that each one can be opened
# directly, whatever the current working directory is. The remaining
# list.files() arguments are left at their defaults.
FileNames <- list.files(
  path = FilePath,
  pattern = "fact_import_20",
  full.names = TRUE
)
if (length(FileNames) == 0L) {
  # do.call(rbind, list()) would return NULL and fail later with an
  # unrelated error; report the real problem instead.
  stop("No files matching 'fact_import_20' found in ", FilePath,
       call. = FALSE)
}

# List of data from the Excel files: columns A:I of the first sheet,
# one data frame per file.
df_lst <- lapply(FileNames, function(fn) {
  readxl::read_excel(fn, sheet = 1, range = readxl::cell_cols("A:I"))
})

# Combine the data of all files.
MOH_TotalHC <- do.call(rbind, df_lst)

# Keep complete cases only. The result is assigned so that the filter is
# actually stored in MOH_TotalHC rather than just printed once.
MOH_TotalHC <- MOH_TotalHC[complete.cases(MOH_TotalHC), ]
MOH_TotalHC
The potential solution is below. This solution is taken from here and seems like a
duplicate question.
Potential solution:
library(readxl)
library(data.table)

# Set your path here
FilePath <- "//srdceld2/project2/"

# `pattern` is a regular expression, not a shell glob: "\\.xlsx$" matches
# names that end in ".xlsx". Update it to suit your needs.
file.list <- list.files(path = FilePath, pattern = "\\.xlsx$", full.names = TRUE)
# Skip the "~$..." lock files Excel creates while a workbook is open; they
# carry the .xlsx extension but are not readable workbooks.
file.list <- file.list[!startsWith(basename(file.list), "~$")]

# Read columns A:I of the first sheet of every workbook.
df.list <- lapply(file.list, read_excel, sheet = 1, range = cell_cols("a:i"))
# Name each element after its source file (setting the names once is enough).
# Pass e.g. idcol = "file" to rbindlist() to get these names as a column.
names(df.list) <- file.list

# Final data frame is here: rows of all files stacked, columns matched by
# name and missing columns filled with NA.
dfFinal <- rbindlist(df.list, use.names = TRUE, fill = TRUE)
Assumptions and call outs:
The files in the folder are similar file types. For example xlsx.
The files could have different set of columns and NULLs as well.
Note that the order of the columns matters, so if a new file has more columns, the number of output columns could be different.
Note: Like @Sathish, I am guessing what the input could look like.

R: Append multiple rows to dataframe within for-loop

I have PDF files that I made from these wikipedia pages (for example):
https://en.wikipedia.org/wiki/AIM-120_AMRAAM
https://en.wikipedia.org/wiki/AIM-9_Sidewinder
I have a list of keywords I want to search for within the document and extract the sentences in which they appear.
# Search terms; each one is later used as a grep() pattern against the
# sentences extracted from a PDF.
keywords <- c("altitude", "range", "speed")
I can call the file, extract the text from the PDF, pull the sentences with the keywords from the PDF. This works if I do this with each of the keywords individually, but when I try to do this in a loop I keep getting this issue where the rows aren't appending. Instead it's almost doing a cbind and then an error gets thrown regarding the number of columns. Here is my code and any help you can provide as to what I can do to make this work is much appreciated.
How do I get the rows to append correctly and appear in one file per PDF?
# Bare file names of the PDFs in the folder.
pdf.files <- list.files(path = "/path/to/file", pattern = "*.pdf", full.names = FALSE, recursive = FALSE)
for (i in 1:length(pdf.files)) {
for (j in 1:length(keywords)) {
# The text extraction and clean-up below do not depend on j, yet they are
# repeated for every keyword.
# NOTE(review): the file is read from the relative path "path/to/file",
# while the listing above uses the absolute "/path/to/file" -- confirm.
text <- pdf_text(file.path("path", "to", "file", pdf.files[i]))
text2 <- tolower(text)
text3 <- gsub("\r", "", text2)
text4 <- gsub("\n", "", text3)
# Split into sentences at ". " and keep those matching the current keyword.
# text5 is overwritten on every pass of the keyword loop.
text5 <- grep(keywords[j], unlist(strsplit(text4, "\\.\\s+")), value = TRUE)
}
# Runs after the keyword loop, so only the matches of the last keyword are
# left in text5; rbind() of a single vector lays them out as one row.
temp <- rbind(text5)
# Stores the result in a variable named after the PDF file.
assign(pdf.files[i], temp)
}
After I get the rows to append correctly the next step will be to add in the keywords as a variable to the left of the extracted sentences. Example of ideal output:
keywords sentence
altitude sentence1.1
altitude sentence1.2
range sentence2.1
range sentence2.2
range sentence2.3
speed sentence3.1
speed sentence3.2
Would this be done in the loop as well or post as a separate function?
Any help is appreciated.
Alright so it took some real thinking but I made it work and it's not pretty but it gets the job done:
# This first part initializes the files to be written to
# For every PDF in `pdf_dir`, write "<pdf name>.txt" next to it: a
# tab-separated table with the columns Title (the keyword) and x (a sentence
# containing that keyword).

# One definition of the folder, combined with file.path() everywhere, so the
# listing, the PDF reads and the output files all refer to the same place and
# a path separator is always inserted between folder and file name.
pdf_dir <- "/path/to/file"

# `pattern` is a regular expression, not a shell glob: "\\.pdf$" matches names
# ending in ".pdf". Only PDFs are processed, so unrelated files in the folder
# no longer get a header-only .txt written over them.
pdf.files <- list.files(path = pdf_dir, pattern = "\\.pdf$", full.names = FALSE, recursive = FALSE)

# Iterating over the values (not 1:length()) is a no-op for an empty folder.
for (pdf_file in pdf.files) {
  # Extract and normalise the text once per PDF; none of this depends on the
  # keyword, so it does not belong inside the keyword loop.
  text <- tolower(pdf_text(file.path(pdf_dir, pdf_file)))
  text <- gsub("\n", "", gsub("\r", "", text))
  sentences <- unlist(strsplit(text, "\\.\\s+"))

  # One data frame per keyword that has at least one matching sentence.
  per_keyword <- lapply(keywords, function(keyword) {
    hits <- unique(grep(keyword, sentences, value = TRUE))
    if (length(hits) == 0L) {
      return(NULL)
    }
    data.frame(Title = keyword, x = hits, stringsAsFactors = FALSE)
  })
  per_keyword <- Filter(Negate(is.null), per_keyword)

  # Bind once after the loop; fall back to an empty table so that a PDF
  # without any match still gets a header-only file.
  result <- if (length(per_keyword) > 0L) {
    do.call(rbind, per_keyword)
  } else {
    data.frame(Title = character(0), x = character(0), stringsAsFactors = FALSE)
  }

  out_file <- file.path(
    pdf_dir,
    paste0(tools::file_path_sans_ext(pdf_file), ".txt")
  )
  # Header and rows are written in a single call, so no separate
  # initialisation pass and no append mode are needed.
  write.table(result, out_file, sep = "\t", row.names = FALSE, quote = FALSE)
}

Using R to append a row to a .csv file

I have a .csv file with 175 rows and 6 columns. I want to append a 176th row. My code is as follows:
# One-row data frame holding the record to append. The values are passed
# unnamed, so data.frame() makes up the column names itself.
x <- data.frame('1', 'ab', 'username', '<some.sentence>', '2017-05-04T00:51:35Z', '24')
# Appends to Tweets.csv, but sep, quote, row.names and col.names are not set,
# so write.table() falls back to its defaults (space separator, quoted
# strings, row and column names written) instead of producing one
# comma-separated record.
write.table(x, file = "Tweets.csv", append = T)
What I expect to see is:
Instead, my result is:
How should I change my code?
# Append `x` to Tweets.csv as a bare comma-separated record: no header line,
# no row names, no quoting.
# NOTE(review): with quote = FALSE a field that itself contains a comma would
# be split into two columns -- confirm the data cannot contain commas.
write.table(
  x,
  file = "Tweets.csv",
  append = TRUE,
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

Maximum number of columns that can be read using read.csv

I want to read a CSV file with 4000 columns and 3000 rows, where the rows have different lengths. I'm currently using the code below, but the maximum number of columns that can be read is 2067.
# Reads a comma-separated file whose rows have different numbers of fields.
# NOTE(review): the function body is cut off in this excerpt (no closing
# brace, no visible return value).
read_data <- function(filename) {
# NOTE(review): `dir` is neither a parameter nor a local variable here;
# presumably a global holding the data folder -- confirm. setwd() also
# changes the working directory for the rest of the session.
setwd(dir)
# Largest number of comma-separated fields found on any line of the file.
no_col <- max(count.fields(filename, sep = ","))
# Read without a header, supplying no_col column names up front and padding
# shorter rows (fill = TRUE).
# NOTE(review): na.strings = 0 presumably makes "0" entries read as NA --
# confirm this is intended.
temp_data <- read.csv(filename, header = FALSE, sep = ",", row.names = NULL, na.strings = 0, fill = TRUE, col.names=1:no_col)
How do I solve this problem?

Resources