Read numeric input as string in R

So, I have an input CSV of the form:
id,No.,V,S,D
1,0100000109,623,233,331
2,0200000109,515,413,314
3,0600000109,611,266,662
I need to read the No. column as it is (i.e., as a character). I know I can use something like this for that:
data <- read.csv("input.csv", colClasses = c("No." = "character"))
I have code that I'm using to read the CSV file in chunks:
chunk_size <- 2
con <- file("input.csv", open = "r")
data_frame <- read.csv(con, nrows = chunk_size, colClasses = c("No." = "character"), quote = "", header = TRUE)
header <- names(data_frame)
print(header)
print(data_frame)
if (nrow(data_frame) == chunk_size) {
  repeat {
    data_frame <- read.csv(con, nrows = chunk_size, header = FALSE, quote = "")
    names(data_frame) <- header
    print(header)
    print(data_frame)
    if (nrow(data_frame) < chunk_size) {
      break
    }
  }
}
close(con)
The issue I'm facing is that only the first chunk reads the No. column as a character; the rest of the chunks do not.
How can I resolve this?
PS: the original input file has 150+ columns and about 20 million rows.

You can read the data as strings with readLines and split it:
fileName <- "input.csv"
df <- do.call(rbind.data.frame, strsplit(readLines(fileName), ",")[-1]) # drop the header line
colnames(df) <- c("id", "No.", "V", "S", "D") # add the column names
or the direct approach with read.csv:
fileName <- "input.csv"
col <- c("integer", "character", "integer", "integer", "integer")
df <- read.csv(file = fileName,
               sep = ",",
               colClasses = col,
               header = TRUE,
               stringsAsFactors = FALSE)

You need to pass the column types via colClasses to the read.csv() call inside the repeat loop as well.
Those chunks are read without the header, so you need to define an unnamed vector to specify colClasses.
Let's say the file has 150 columns, so colClasses has length 150:
myColClasses <- rep("numeric", 150)
myColClasses[2] <- "character"
repeat {
  data_frame <- read.csv(con, nrows = chunk_size, colClasses = myColClasses, header = FALSE, quote = "")
  ...
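Putting the two pieces together, here is a minimal sketch of the whole loop for the small sample above (five columns instead of 150; only the colClasses handling differs from the original code):
chunk_size <- 2
myColClasses <- c("integer", "character", "integer", "integer", "integer")
con <- file("input.csv", open = "r")
data_frame <- read.csv(con, nrows = chunk_size, colClasses = c("No." = "character"), quote = "", header = TRUE)
header <- names(data_frame)
print(data_frame)
if (nrow(data_frame) == chunk_size) {
  repeat {
    # unnamed colClasses: these chunks have no header, so types are matched by position
    data_frame <- read.csv(con, nrows = chunk_size, colClasses = myColClasses, header = FALSE, quote = "")
    names(data_frame) <- header
    print(data_frame)
    if (nrow(data_frame) < chunk_size) break
  }
}
# note: if the number of data rows is an exact multiple of chunk_size, the final read.csv
# call hits the end of the connection and stops with "no lines available in input"
close(con)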

Related

Read multiple .txt files and add new column identifying file name in R

I have 1500+ .txt files called data_{date from 2015070918 to today}, all with 7 columns of data and a variable number of rows. I have managed to use the following code to extract and merge the data into one table:
files <- list.files(pattern = ".txt")
myData <- lapply(files, function(x) {
  tryCatch(read.table(x, header = F, sep = ','), error = function(e) NULL)
})
Note: there are no headers on the columns; currently I don't even know which variable is which!
At the moment the date only appears in the file name, so it isn't possible to distinguish between the daily subsets of data. I want to add a column containing the date, which I can extract if I can include the file name in an additional column.
I searched on stackexchange and came across this possible solution: Importing multiple .csv files into R and adding a new column with file name
df <- do.call(rbind, lapply(files, function(x)
  cbind(read.csv(x, header = F, sep = ","), name = strsplit(x, '\\.')[[1]][1])))
However I get the following error:
Error in read.table(file = file, header = header, sep = sep, quote = quote, :
no lines available in input
I have used read.csv on individual files and they have imported without any issues. Any ideas to resolve this would be greatly appreciated!
This should work, if your read.table command is correct:
myData_list <- lapply(files, function(x) {
  out <- tryCatch(read.table(x, header = F, sep = ','), error = function(e) NULL)
  if (!is.null(out)) {
    out$source_file <- x
  }
  return(out)
})
myData <- data.table::rbindlist(myData_list)
In the past I have found that you can spare yourself a lot of headaches by using data.table::fread instead of read.table, so you could consider this:
myData_list <- lapply(files, function(x) {
  out <- data.table::fread(x, header = FALSE)
  out$source_file <- x
  return(out)
})
myData <- data.table::rbindlist(myData_list)
You can add the tryCatch part back if necessary. Depending on how the files vector looks, basename() might be worth applying to the source_file column.
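For the date column the question asks about, it could then be pulled out of source_file; a minimal sketch, assuming names like data_2015070918.txt (a hypothetical pattern, adjust to your actual file names):
myData <- data.table::rbindlist(myData_list)
# keep only the digits between "data_" and ".txt"
myData$date <- sub("^data_(\\d+)\\.txt$", "\\1", basename(myData$source_file))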
You could try using lapply with an index corresponding to each of the files:
files <- list.files(pattern = ".txt")
myData <- lapply(seq_along(files), function(x) {
  tryCatch(
    {
      dt <- read.table(files[x], header = F, sep = ',')
      dt$index <- x # or files[x] if you want to use the file name instead
      dt
    },
    error = function(e) { NULL }
  )
})

Removing new lines in R

I am trying to bring multiple rows into one cell in my CSV file. I first began by converting my text file into a CSV file; however, the final column needs to have all of its contents in one cell, and it's currently being split across multiple cells. The CSV file currently looks like the first picture and needs to look like the second picture.
I have the following code:
mydata <- read.table("rolled_swiftmessage_test.txt", sep = "|", allowEscapes = TRUE, fill = FALSE)
write.table(mydata, file = "rolled_swiftmessage_test.csv", sep = ",", col.names = FALSE, row.names = FALSE)
Currently it produces Picture 1, and I need it to produce Picture 2. How do I fix it? Thanks!
After corresponding with the OP and seeing the kind of data that she has, this is my updated answer:
mydata <- read.table("Test_TextFile.txt", sep = "|", allowEscapes = TRUE, fill = FALSE, stringsAsFactors = F)
# Remove rows consisting entirely of dashes (vectorised, so row indices don't shift mid-loop)
mydata <- mydata[!grepl('^-+$', mydata$V1), ]
# Find runs of empty rows and fold each run into the preceding row's message column (V12)
empty_rows <- which(grepl('^\\s*$', mydata$V1))
rows_to_squeeze <- split(empty_rows, cumsum(c(1, diff(empty_rows) != 1)))
for (i in length(rows_to_squeeze):1) {
  mydata$V12[rows_to_squeeze[[i]][1] - 1] <-
    paste(mydata$V12[seq(rows_to_squeeze[[i]][1] - 1,
                         rows_to_squeeze[[i]][length(rows_to_squeeze[[i]])])],
          collapse = ' ')
  mydata <- mydata[-seq(rows_to_squeeze[[i]][1],
                        rows_to_squeeze[[i]][length(rows_to_squeeze[[i]])]), ]
}
write.table(mydata, file = "rolled_swiftmessage_test.csv", sep = ",", col.names = FALSE, row.names = FALSE)
Original answer
Here is my attempt at this. It's not pretty, but I think it works. Basically, I read the file as lines of text rather than as a table, operate on the lines to join those that belong in the same 'message' cell, and then put them into a data frame that can be saved as a CSV file. Let me know if you need any other tweaks:
install.packages('stringr') ## if not installed yet
library(stringr)            ## in order to use str_detect and str_split below
mydata <- readLines("rolled_swiftmessage_test.txt")
new_mydata <- vector('character')
current <- 1
while (!is.na(mydata[current])) {
  if (str_detect(mydata[current], '\\{')) {
    i <- 1
    while (!str_detect(mydata[current + i], '\\}')) {
      mydata[current] <- paste(mydata[current], mydata[current + i], collapse = ' ')
      i <- i + 1
    }
    mydata[current] <- paste(mydata[current], mydata[current + i], collapse = ' ')
    mydata[current] <- gsub('\\| \\| \\| \\|', '', mydata[current])
    new_mydata <- c(new_mydata, mydata[current])
    current <- current + i + 1
  } else {
    new_mydata <- c(new_mydata, mydata[current])
    current <- current + 1
  }
}
new_mydata <- sapply(new_mydata, function(x) str_split(x, '\\|'))
new_mydata <- as.data.frame(t(as.data.frame(new_mydata)))
write.table(new_mydata, file = "rolled_swiftmessage_test.csv", sep = ",", col.names = FALSE, row.names = FALSE)
The resulting image after opening the csv file (notice that I added the same row to the original text file three times just so that I would have more lines for testing):

Write results sequentially in a loop in R

I have a bunch of single files to which I need to apply a test. I need to find a way to automatically write the results for each file into one output file. Here is what I do:
library(ape)
library(xts)  # needed for as.xts
stud_files <- list.files("path/dir/data", full.names = T)
for (f in stud_files) {
  df <- read.table(f, header = TRUE, sep = ";")
  df_xts <- as.xts(df$cola, order.by = as.Date(df$colb, "%m/%d/%Y"))
  pet <- testa(df_xts)
  res <- data.frame(estimate = pet$estimate,
                    p.value = pet$p.value,
                    logi = pet$alternative)
  write.dna(res, file = "res_testa.xls", format = "sequential")
}
This loop works well, except for the last command, which is meant to write the results of each file consecutively but only saves the last result. Also, the results are saved as a string, not as the table (data.frame) I defined above. Any ideas? Thanks in advance.
Check help(write.dna).
write.dna(x, file, format = "interleaved", append = FALSE,
          nbcol = 6, colsep = " ", colw = 10, indent = NULL,
          blocksep = 1)
append: a logical; if TRUE the data are appended to the file without erasing the data possibly existing in the file, otherwise the file (if it exists) is overwritten (FALSE, the default).
Set append = TRUE and you should be all set.
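In the loop above, that is just one extra argument:
write.dna(res, file = "res_testa.xls", format = "sequential", append = TRUE)
Each file's result is then appended instead of overwriting the previous one.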
As some of the comments point out, however, you are probably better off generating your table, and then writing it all at once to a file. Unless you have billions of files, you likely won't run out of memory.
Here is how I would approach this.
library(ape)
library(data.table)
library(xts)  # needed for as.xts

stud_files <- list.files("path/dir/data", full.names = T)

sumfunc <- function(f) {
  df <- read.table(f, header = TRUE, sep = ";")
  df_xts <- as.xts(df$cola, order.by = as.Date(df$colb, "%m/%d/%Y"))
  pet <- testa(df_xts)
  res <- data.table(estimate = pet$estimate,
                    p.value = pet$p.value,
                    logi = pet$alternative)
  return(res)
}

lres <- lapply(stud_files, sumfunc)
dat <- rbindlist(lres)
write.table(dat,
            file = "res_testa.csv",
            sep = ",",
            quote = FALSE,
            row.names = FALSE)

Stream processing large csv file in R

I need to make a couple of relatively simple changes to a very large CSV file (c. 8.5 GB). I initially tried various reader functions: read.csv, readr::read_csv, data.table::fread. However, they all run out of memory.
I'm thinking I need to use a stream-processing approach instead: read a chunk, update it, write it, repeat. I found this answer, which is on the right lines; however, I don't know how to terminate the loop (I'm relatively new to R).
So I have 2 questions:
What's the right way to make the while loop work?
Is there a better way (for some definition of 'better')? e.g. is there some way to do this using dplyr & pipes?
Current code as follows:
src_fname <- "testdata/model_input.csv"
tgt_fname <- "testdata/model_output.csv"

# Changes needed in file: rebase identifiers, set another col to a constant value
rebase_data <- function(data, offset) {
  data$'Unique Member ID' <- data$'Unique Member ID' - offset
  data$'Client Name' <- "TestClient2"
  return(data)
}

CHUNK_SIZE <- 1000

src_conn <- file(src_fname, "r")
data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names = FALSE)
cols <- colnames(data)
offset <- data$'Unique Member ID'[1] - 1
data <- rebase_data(data, offset)

# 1st time through, write the headers
tgt_conn <- file(tgt_fname, "w")
write.csv(data, tgt_conn, row.names = FALSE)

# loop over remaining data
end <- FALSE
while (end == FALSE) {
  data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names = FALSE, col.names = cols)
  data <- rebase_data(data, offset)
  # write.csv doesn't support col.names = FALSE, so use write.table, which does
  write.table(data, tgt_conn, row.names = FALSE, col.names = FALSE, sep = ",")
  # ??? How to test for EOF and set end = TRUE if so ???
  # This doesn't work, presumably because nrow() != CHUNK_SIZE on the final loop?
  if (nrow(data) < CHUNK_SIZE) {
    end <- TRUE
  }
}
close(src_conn)
close(tgt_conn)
Thanks for any pointers.
Sorry to poke a 2-year-old thread, but now with readr::read_csv_chunked (auto-loaded along with dplyr when loading tidyverse), we could also do it like this:
require(tidyverse)
## For non-exploratory code, as @antoine-sac suggested, use:
# require(readr)  # for read_csv_chunked and read_csv
# require(dplyr)  # for the pipe `%>%`, thus fewer parentheses

src_fname = "testdata/model_input.csv"
tgt_fname = "testdata/model_output.csv"
CHUNK_SIZE = 1000

offset = read_csv(src_fname, n_max = 1)$comm_code %>% as.numeric() - 1

rebase.chunk = function(df, pos) {
  df$comm_code = df$comm_code %>% as.numeric() - offset
  df$'Client Name' = "TestClient2"
  is.append = ifelse(pos > 1, T, F)
  df %>% write_csv(
    tgt_fname,
    append = is.append
  )
}

read_csv_chunked(
  src_fname,
  callback = SideEffectChunkCallback$new(rebase.chunk),
  chunk_size = CHUNK_SIZE,
  progress = T  # optional, show progress bar
)
Here the tricky part is setting is.append based on the parameter pos, which gives the row number at which the chunk df starts within the original file. With readr::write_csv, when append = FALSE the header (column names) is written to the file; otherwise it is not.
Try this out:
library("chunked")
read_chunkwise(src_fname, chunk_size=CHUNK_SIZE) %>%
rebase_data(offset) %>%
write_chunkwise(tgt_fname)
You may need to fiddle a bit with the colnames to get exactly what you want.
(Disclaimer: haven't tried the code)
Note that there is no vignette with the package but the standard usage is described on github: https://github.com/edwindj/chunked/
OK I found a solution, as follows:
# src_fname <- "testdata/model_input.csv"
# tgt_fname <- "testdata/model_output.csv"

CHUNK_SIZE <- 20000

# Changes needed in file: rebase identifiers, set another col to a constant value
rebase_data <- function(data, offset) {
  data$'Unique Member ID' <- data$'Unique Member ID' - offset
  data$'Client Name' <- "TestClient2"
  return(data)
}

#--------------------------------------------------------
# Get the structure first to speed things up
#--------------------------------------------------------
structure <- read.csv(src_fname, nrows = 2, check.names = FALSE)
cols <- colnames(structure)
offset <- structure$'Unique Member ID'[1] - 1

# Open the input & output files for reading & writing
src_conn <- file(src_fname, "r")
tgt_conn <- file(tgt_fname, "w")

lines_read <- 0
end <- FALSE
read_header <- TRUE
write_header <- TRUE

while (end == FALSE) {
  data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names = FALSE, col.names = cols, header = read_header)
  if (nrow(data) > 0) {
    lines_read <- lines_read + nrow(data)
    print(paste0("lines read this chunk: ", nrow(data), ", lines read so far: ", lines_read))
    data <- rebase_data(data, offset)
    # write.csv doesn't support col.names = FALSE, so use write.table, which does
    write.table(data, tgt_conn, row.names = FALSE, col.names = write_header, sep = ",")
  }
  if (nrow(data) < CHUNK_SIZE) {
    end <- TRUE
  }
  read_header <- FALSE
  write_header <- FALSE
}
close(src_conn)
close(tgt_conn)

Index elements of large CSV file when reading line by line

I'm reading a large CSV file (>15 GB) line by line in R. I'm using
con <- file("datafile.csv", open = "r")
while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
# code to be written
}
In the "code to be written" section, I need to be able to refer to individual elements in each row and save them to an array. The file has no headers if that's important.
Thanks!
You could use read.table with the argument text to parse the oneLine string as if it were a CSV file:
# set your arguments: separator, decimal separator etc...
x <- read.table(text=oneLine, sep=",", dec=".", header=F)
The returned x is a data.frame with only one row, which you can easily turn into an array.
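Put into the original loop, a minimal sketch (assuming comma-separated fields and no header, as stated in the question):
con <- file("datafile.csv", open = "r")
while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
  x <- read.table(text = oneLine, sep = ",", dec = ".", header = FALSE, stringsAsFactors = FALSE)
  fields <- unlist(x, use.names = FALSE)  # the one-row data.frame as a plain vector
  # refer to fields[1], fields[2], ... and store them wherever needed
}
close(con)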
You could do something like this:
CHUNK_SIZE <- 5000
con <- file('datafile.csv', 'rt')
res <- NULL
while (nrow(chunk <- read.csv(con, nrow = CHUNK_SIZE, header = FALSE, stringsAsFactors = FALSE)) > 0) {
  res <- rbind(res, chunk)
  if (nrow(chunk) < CHUNK_SIZE) break
}
close(con)
