ASCII file in R: as.numeric integers are incorrect

I have read an ASCII (.spe) file into R. This file contains one column of, mostly, integers. However, R is interpreting these integers incorrectly, probably because I am not specifying the correct format. The file was generated in Ortec Maestro software. Here is the code:
library(SDMTools)
strontium <- read.table("C:/Users/Hal 2/Desktop/beta_spec/strontium 90 spectrum.spe",
                        header = FALSE, skip = 2)
str_spc <- vector(mode = "numeric")
for (i in 1:2037) {
  str_spc[i] <- as.numeric(strontium$V1[i + 13])
}
Here, for example, strontium$V1[14] has the value 0, but R is interpreting it as a 10. I think I may have to convert the data to some other format, but I'm not sure, and I'm probably googling the wrong search terms.
Here are the first few lines from the file:
$SPEC_ID:
No sample description was entered.
$SPEC_REM:
DET# 1
DETDESC# MCB 129
AP# Maestro Version 6.08
$DATE_MEA:
10/14/2014 15:13:16
$MEAS_TIM:
1516 1540
$DATA:
0 2047
Here is a link to the file: https://www.dropbox.com/sh/y5x68jen487qnmt/AABBZyC6iXBY3e6XH0XZzc5ba?dl=0
Any help appreciated.
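A note on what is probably going on here: because the header lines above $DATA: are text, read.table reads the whole column as character (or, in older R, as a factor), and calling as.numeric() on a factor returns the internal level codes rather than the printed values. A minimal sketch of that gotcha and the usual fix, with made-up values rather than the actual file:
x <- factor(c("0", "10", "255"))
as.numeric(x)                 # 1 2 3   -- the level codes, not the values
as.numeric(as.character(x))   # 0 10 255 -- the values themselves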

I saw someone had made a parser for SPE spectra files in Python, and I can't let that stand without there being at least a minimally functioning R version, so here's one that parses some of the fields but gets you your data:
library(stringr)
library(gdata)
library(lubridate)
read.spe <- function(file) {
  # read the whole file and split it into $-delimited records
  tmp <- readLines(file)
  tmp <- paste(tmp, collapse="\n")
  records <- strsplit(tmp, "\\$")[[1]]
  records <- records[records != ""]
  spe <- list()
  # pull each field out of its record with a targeted regex
  spe[["SPEC_ID"]] <- str_match(records[which(startsWith(records, "SPEC_ID"))],
                                "^SPEC_ID:[[:space:]]*([[:print:]]+)[[:space:]]+")[2]
  spe[["SPEC_REM"]] <- strsplit(str_match(records[which(startsWith(records, "SPEC_REM"))],
                                          "^SPEC_REM:[[:space:]]*(.*)")[2], "\n")
  spe[["DATE_MEA"]] <- mdy_hms(str_match(records[which(startsWith(records, "DATE_MEA"))],
                                         "^DATE_MEA:[[:space:]]*(.*)[[:space:]]$")[2])
  spe[["MEAS_TIM"]] <- strsplit(str_match(records[which(startsWith(records, "MEAS_TIM"))],
                                          "^MEAS_TIM:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
  spe[["ROI"]] <- str_match(records[which(startsWith(records, "ROI"))],
                            "^ROI:[[:space:]]*(.*)[[:space:]]$")[2]
  spe[["PRESETS"]] <- strsplit(str_match(records[which(startsWith(records, "PRESETS"))],
                                         "^PRESETS:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
  spe[["ENER_FIT"]] <- strsplit(str_match(records[which(startsWith(records, "ENER_FIT"))],
                                          "^ENER_FIT:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
  spe[["MCA_CAL"]] <- strsplit(str_match(records[which(startsWith(records, "MCA_CAL"))],
                                         "^MCA_CAL:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
  spe[["SHAPE_CAL"]] <- str_match(records[which(startsWith(records, "SHAPE_CAL"))],
                                  "^SHAPE_CAL:[[:space:]]*(.*)[[:space:]]*$")[2]
  # the counts: one value per line, after the channel-range line
  spe_dat <- strsplit(str_match(records[which(startsWith(records, "DATA"))],
                                "^DATA:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
  spe[["SPE_DAT"]] <- as.numeric(gsub("[[:space:]]", "", spe_dat)[-1])
  return(spe)
}
dat <- read.spe("strontium 90 spectrum.Spe")
str(dat)
## List of 10
## $ SPEC_ID : chr "No sample description was entered."
## $ SPEC_REM :List of 1
## ..$ : chr [1:3] "DET# 1" "DETDESC# MCB 129" "AP# Maestro Version 6.08"
## $ DATE_MEA : POSIXct[1:1], format: "2014-10-14 15:13:16"
## $ MEAS_TIM : chr "1516 1540"
## $ ROI : chr "0"
## $ PRESETS : chr [1:3] "None" "0" "0"
## $ ENER_FIT : chr "0.000000 0.002529"
## $ MCA_CAL : chr [1:2] "3" "0.000000E+000 2.529013E-003 0.000000E+000 keV"
## $ SHAPE_CAL: chr "3\n3.100262E+001 0.000000E+000 0.000000E+000"
## $ SPE_DAT : num [1:2048] 0 0 0 0 0 0 0 0 0 0 ...
head(dat$SPE_DAT)
## [1] 0 0 0 0 0 0
It needs some polish and there's absolutely no error checking (e.g. for missing fields), but I have no time today to deal with that. I'll finish the parsing and make a minimal package wrapper for it over the next couple of days.

How to fix "Error: arguments imply differing number of rows"?

I'm trying to get an API response using a URL that exists in an API data frame I just got, but I'm receiving the error:
"Error: arguments imply differing number of rows"
Does someone know how to fix it?
SCRIPT SO FAR
install.packages(c("jsonlite", "httr"))
library(jsonlite)
library(httr)
### Generating URL and first request
url_deputados <- "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome"
get_deputados <- GET(url_deputados)
### Transforming it to text
deputados_text <- content(get_deputados, "text")
deputados_text
### Converting
deputados_json <- fromJSON(deputados_text, flatten = TRUE)
deputados_json
### Transforming it to table
deputados_df <- as.data.frame(deputados_json)
deputados_df
### And removing the two last columns which I don't need
deputados_df <- deputados_df[1:9]
### Now for the secondary requisitions, I'm creating a URL with the Id that is present in the first column of the data frame I just got
url_base <- "``https://dadosabertos.camara.leg.br/api/v2/``"
url_deputados <- "deputados/"
url_id <- deputados_df$dados.id
id_list <- c(url_id)
i <- 1
url <- paste0(url_base, url_deputados, id_list[i])
url
### Up to this point everything works, but I need to make sequential requests so I can GET the info for the next line of the existing data frame
while (i <= 531) {
  print("Próxima página encontrada, baixando...")
  get_deputados_id <- GET(paste0(url_base, url_deputados, id_list[i]))
  deputados_id_text <- content(get_deputados_id, "text")
  deputados_id_json <- fromJSON(deputados_id_text, flatten = TRUE)
  deputados_id_df <- as.data.frame(deputados_id_json)
  i <- i + 1
}
And this is where I receive the message error
When you run into problems at one line in your code, stop and look at the previous results. For instance, for me (since you didn't specify where the error occurs), I'm getting an error here:
deputados_df <- as.data.frame(deputados_json)
# Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, :
# arguments imply differing number of rows: 532, 3
So ... let's look at deputados_json:
str(deputados_json)
# List of 2
# $ dados:'data.frame': 532 obs. of 9 variables:
# ..$ id : int [1:532] 220593 204379 220714 221328 204560 204528 121948 74646 160508 136811 ...
# ..$ uri : chr [1:532] "https://dadosabertos.camara.leg.br/api/v2/deputados/220593" "https://dadosabertos.camara.leg.br/api/v2/deputados/204379" "https://dadosabertos.camara.leg.br/api/v2/deputados/220714" "https://dadosabertos.camara.leg.br/api/v2/deputados/221328" ...
# ..$ nome : chr [1:532] "Abilio Brunini" "Acácio Favacho" "Adail Filho" "Adilson Barroso" ...
# ..$ siglaPartido : chr [1:532] "PL" "MDB" "REPUBLICANOS" "PL" ...
# ..$ uriPartido : chr [1:532] "https://dadosabertos.camara.leg.br/api/v2/partidos/37906" "https://dadosabertos.camara.leg.br/api/v2/partidos/36899" "https://dadosabertos.camara.leg.br/api/v2/partidos/37908" "https://dadosabertos.camara.leg.br/api/v2/partidos/37906" ...
# ..$ siglaUf : chr [1:532] "MT" "AP" "AM" "SP" ...
# ..$ idLegislatura: int [1:532] 57 57 57 57 57 57 57 57 57 57 ...
# ..$ urlFoto : chr [1:532] "https://www.camara.leg.br/internet/deputado/bandep/220593.jpg" "https://www.camara.leg.br/internet/deputado/bandep/204379.jpg" "https://www.camara.leg.br/internet/deputado/bandep/220714.jpg" "https://www.camara.leg.br/internet/deputado/bandep/221328.jpg" ...
# ..$ email : chr [1:532] "dep.abiliobrunini@camara.leg.br" "dep.acaciofavacho@camara.leg.br" "dep.adailfilho@camara.leg.br" "dep.adilsonbarroso@camara.leg.br" ...
# $ links:'data.frame': 3 obs. of 2 variables:
# ..$ rel : chr [1:3] "self" "first" "last"
# ..$ href: chr [1:3] "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome" "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome&pagina=1&itens=1000" "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome&pagina=1&itens=1000"
(Hint: that's not unambiguously converted into a frame.)
My guess is that you just need to access $dados:
head(deputados_json$dados)
# id uri nome siglaPartido uriPartido siglaUf idLegislatura urlFoto email
# 1 220593 https://dadosabertos.camara.leg.br/api/v2/deputados/220593 Abilio Brunini PL https://dadosabertos.camara.leg.br/api/v2/partidos/37906 MT 57 https://www.camara.leg.br/internet/deputado/bandep/220593.jpg dep.abiliobrunini@camara.leg.br
# 2 204379 https://dadosabertos.camara.leg.br/api/v2/deputados/204379 Acácio Favacho MDB https://dadosabertos.camara.leg.br/api/v2/partidos/36899 AP 57 https://www.camara.leg.br/internet/deputado/bandep/204379.jpg dep.acaciofavacho@camara.leg.br
# 3 220714 https://dadosabertos.camara.leg.br/api/v2/deputados/220714 Adail Filho REPUBLICANOS https://dadosabertos.camara.leg.br/api/v2/partidos/37908 AM 57 https://www.camara.leg.br/internet/deputado/bandep/220714.jpg dep.adailfilho@camara.leg.br
# 4 221328 https://dadosabertos.camara.leg.br/api/v2/deputados/221328 Adilson Barroso PL https://dadosabertos.camara.leg.br/api/v2/partidos/37906 SP 57 https://www.camara.leg.br/internet/deputado/bandep/221328.jpg dep.adilsonbarroso@camara.leg.br
# 5 204560 https://dadosabertos.camara.leg.br/api/v2/deputados/204560 Adolfo Viana PSDB https://dadosabertos.camara.leg.br/api/v2/partidos/36835 BA 57 https://www.camara.leg.br/internet/deputado/bandep/204560.jpg dep.adolfoviana@camara.leg.br
# 6 204528 https://dadosabertos.camara.leg.br/api/v2/deputados/204528 Adriana Ventura NOVO https://dadosabertos.camara.leg.br/api/v2/partidos/37901 SP 57 https://www.camara.leg.br/internet/deputado/bandep/204528.jpg dep.adrianaventura@camara.leg.br
After that, make sure you fix your url_base; it should almost certainly not contain all of those backticks.
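For reference, the bare URL used in the first request is all it needs to be:
url_base <- "https://dadosabertos.camara.leg.br/api/v2/"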
Finally, you should do the same thing in your while loop:
while (i <= 531) {
  get_deputados_id <- GET(paste0(url_base, url_deputados, id_list[i]))
  deputados_id_text <- content(get_deputados_id, "text")
  deputados_id_json <- fromJSON(deputados_id_text, flatten = TRUE)
  # deputados_id_df <- as.data.frame(deputados_id_json)
  deputados_id_df <- deputados_id_json$dados
  i <- i + 1
}
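Note that, as written, the loop overwrites deputados_id_df on every pass. If the goal is to keep the result for every deputy, a minimal sketch (resultados is my name; same endpoints as above) collects them in a list instead:
resultados <- vector("list", length(id_list))
for (i in seq_along(id_list)) {
  resp <- GET(paste0(url_base, url_deputados, id_list[i]))
  resultados[[i]] <- fromJSON(content(resp, "text"), flatten = TRUE)$dados
}
# resultados[[1]] now holds the 'dados' payload for the first deputy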

Split PDF files into multiple files every 2 pages in R

I have a PDF document with 300 pages. I need to split this file into 150 files, each containing 2 pages. For example, the 1st document would contain pages 1 & 2 of the original file, the 2nd document pages 3 & 4, and so on.
Maybe I can use the "pdftools" package, but I don't know how.
1) pdftools Assuming that the input PDF is in the current directory and the outputs are to go into the same directory: change the inputs below, get the number of pages num, compute the st and en vectors of start and end page numbers, and call pdf_subset repeatedly. Note that the pdf_length and pdf_subset functions come from the qpdf R package but are also made available by the pdftools R package, which imports them and re-exports them.
library(pdftools)
# inputs
infile <- "a.pdf" # input pdf
prefix <- "out_" # output pdf's will begin with this prefix
num <- pdf_length(infile)
st <- seq(1, num, 2)
en <- pmin(st + 1, num)
for (i in seq_along(st)) {
  outfile <- sprintf("%s%0*d.pdf", prefix, nchar(num), i)
  pdf_subset(infile, pages = st[i]:en[i], output = outfile)
}
2) pdfbox The Apache pdfbox utility can split a PDF into files of 2 pages each. Download the .jar command-line utilities file from pdfbox and be sure you have Java installed. Then run the following, assuming your input file a.pdf is in the current directory (or run the quoted part directly from the command line, without the quotes and without R). The jar file name below may need to be changed if a later version is used; the one named below is currently the latest (not counting alpha versions).
system("java -jar pdfbox-app-2.0.26.jar PDFSplit -split 2 a.pdf")
3) animation/pdftk Another option is to install the pdftk program, change the inputs at the top of the script below, and run it. The script gets the number of pages in the input, num, using pdftk, computes the start and end page numbers, st and en, and then invokes pdftk once for each st/en pair to extract those pages into a separate file.
library(animation)
# inputs
PDFTK <- "~/../bin/pdftk.exe" # path to pdftk
infile <- "a.pdf" # input pdf
prefix <- "out_" # output pdf's will begin with this prefix
ani.options(pdftk = Sys.glob(PDFTK))
tmp <- tempfile()
dump_data <- pdftk(infile, "dump_data", tmp)
g <- grep("NumberOfPages", readLines(tmp), value = TRUE)
num <- as.numeric(sub(".* ", "", g))
st <- seq(1, num, 2)
en <- pmin(st + 1, num)
for (i in seq_along(st)) {
  outfile <- sprintf("%s%0*d.pdf", prefix, nchar(num), i)
  pdftk(infile, sprintf("cat %d-%d", st[i], en[i]), outfile)
}
Neither pdftools nor qpdf (on which the former depends) supports splitting PDF files by anything other than "every page". You will likely need to rely on an external program; I'm confident you can get pdftk to do it by calling it once for each 2-page output.
I have a 36-page PDF here named quux.pdf in the current working directory.
str(pdftools::pdf_info("quux.pdf"))
# List of 11
# $ version : chr "1.5"
# $ pages : int 36
# $ encrypted : logi FALSE
# $ linearized : logi FALSE
# $ keys :List of 8
# ..$ Producer : chr "pdfTeX-1.40.24"
# ..$ Author : chr ""
# ..$ Title : chr ""
# ..$ Subject : chr ""
# ..$ Creator : chr "LaTeX via pandoc"
# ..$ Keywords : chr ""
# ..$ Trapped : chr ""
# ..$ PTEX.Fullbanner: chr "This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022) kpathsea version 6.3.4"
# $ created : POSIXct[1:1], format: "2022-05-17 22:54:40"
# $ modified : POSIXct[1:1], format: "2022-05-17 22:54:40"
# $ metadata : chr ""
# $ locked : logi FALSE
# $ attachments: logi FALSE
# $ layout : chr "no_layout"
I also have pdftk installed and available on the PATH:
Sys.which("pdftk")
# pdftk
# "C:\\PROGRA~2\\PDFtk Server\\bin\\pdftk.exe"
With this, I can run an external script to create 2-page PDFs:
list.files(pattern = "pdf$")
# [1] "quux.pdf"
pages <- seq(pdftools::pdf_info("quux.pdf")$pages)
pages <- split(pages, (pages - 1) %/% 2)
pages[1:3]
# $`0`
# [1] 1 2
# $`1`
# [1] 3 4
# $`2`
# [1] 5 6
for (pg in pages) {
  system(sprintf("pdftk quux.pdf cat %s-%s output out_%02i-%02i.pdf",
                 min(pg), max(pg), min(pg), max(pg)))
}
list.files(pattern = "pdf$")
# [1] "out_01-02.pdf" "out_03-04.pdf" "out_05-06.pdf" "out_07-08.pdf"
# [5] "out_09-10.pdf" "out_11-12.pdf" "out_13-14.pdf" "out_15-16.pdf"
# [9] "out_17-18.pdf" "out_19-20.pdf" "out_21-22.pdf" "out_23-24.pdf"
# [13] "out_25-26.pdf" "out_27-28.pdf" "out_29-30.pdf" "out_31-32.pdf"
# [17] "out_33-34.pdf" "out_35-36.pdf" "quux.pdf"
str(pdftools::pdf_info("out_01-02.pdf"))
# List of 11
# $ version : chr "1.5"
# $ pages : int 2
# $ encrypted : logi FALSE
# $ linearized : logi FALSE
# $ keys :List of 2
# ..$ Creator : chr "pdftk 2.02 - www.pdftk.com"
# ..$ Producer: chr "itext-paulo-155 (itextpdf.sf.net-lowagie.com)"
# $ created : POSIXct[1:1], format: "2022-05-18 09:37:56"
# $ modified : POSIXct[1:1], format: "2022-05-18 09:37:56"
# $ metadata : chr ""
# $ locked : logi FALSE
# $ attachments: logi FALSE
# $ layout : chr "no_layout"

Scraping table from myneta using R

I am trying to scrape a table from http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary into RStudio.
Here's the code
library(rvest)
url <- 'http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary'
webpage <- read_html(url)
candidate_info <- html_nodes(webpage, xpath = '//*[@id="main"]/div/div[2]/div[2]/table')
candidate_info <- html_table(candidate_info)
head(candidate_info)
But I'm getting no output. Can you suggest what I am doing wrong?
That site has some very broken HTML. But, it's workable.
I find it better to target nodes in a slightly less fragile way. The XPath below finds it by content of the table.
html_table() croaks (or took forever and I didn't want to wait) so I ended up building the table "manually".
library(rvest)
# helper to clean column names
mcga <- function(x) { make.unique(gsub("(^_|_$)", "", gsub("_+", "_", gsub("[[:punct:][:space:]]+", "_", tolower(x)))), sep = "_") }
pg <- read_html("http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary")
# target the table
tab <- html_node(pg, xpath=".//table[contains(thead, 'Liabilities')]")
# get the rows so we can target columns
rows <- html_nodes(tab, xpath=".//tr[td[not(@colspan)]]")
# make a data frame
xdf <- do.call(
  cbind.data.frame,
  c(lapply(1:8, function(i) {
    html_text(html_nodes(rows, xpath=sprintf(".//td[%s]", i)), trim=TRUE)
  }), list(stringsAsFactors=FALSE))
)
# make nicer names
xdf <- setNames(xdf, mcga(html_text(html_nodes(tab, "th")))) # get the header to get column names
str(xdf)
## 'data.frame': 4823 obs. of 8 variables:
## $ sno : chr "1" "2" "3" "4" ...
## $ candidate : chr "A Hasiv" "A Wahid" "Aan Shikhar Shrivastava" "Aaptab Urf Aftab" ...
## $ constituency : chr "ARYA NAGAR" "GAINSARI" "GOSHAINGANJ" "MUBARAKPUR" ...
## $ party : chr "BSP" "IND" "Satya Shikhar Party" "Islam Party Hind" ...
## $ criminal_case: chr "0" "0" "0" "0" ...
## $ education : chr "12th Pass" "10th Pass" "Graduate" "Illiterate" ...
## $ total_assets : chr "Rs 3,94,24,827 ~ 3 Crore+" "Rs 75,106 ~ 75 Thou+" "Rs 41,000 ~ 41 Thou+" "Rs 20,000 ~ 20 Thou+" ...
## $ liabilities : chr "Rs 58,46,335 ~ 58 Lacs+" "Rs 0 ~" "Rs 0 ~" "Rs 0 ~" ...

R read.csv "More columns than column names" error

I have a problem when importing .csv file into R. With my code:
t <- read.csv("C:\\N0_07312014.CSV", na.string=c("","null","NaN","X"),
header=T, stringsAsFactors=FALSE,check.names=F)
R reports an error and does not do what I want:
Error in read.table(file = file, header = header, sep = sep, quote = quote, :
more columns than column names
I guess the problem is because my data is not well formatted. I only need data from [,1:32]. All others should be deleted.
Data can be downloaded from:
https://drive.google.com/file/d/0B86_a8ltyoL3VXJYM3NVdmNPMUU/edit?usp=sharing
Thanks so much!
Open the .csv as a text file (for example, use TextEdit on a Mac) and check to see if columns are being separated with commas.
csv is "comma separated vectors". For some reason when Excel saves my csv's it uses semicolons instead.
When opening your csv use:
read.csv("file_name.csv",sep=";")
Semicolon is just an example; as someone else previously suggested, don't assume that because your CSV looks good in Excel the underlying file is well formed.
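A quick base-R way to check which separator a file actually uses, and whether every line has the same number of fields ("file_name.csv" is a placeholder):
table(count.fields("file_name.csv", sep = ","))  # field counts per line, comma
table(count.fields("file_name.csv", sep = ";"))  # field counts per line, semicolon
A well-formed file shows a single field count; a spread of counts points to the kind of malformed file that triggers errors like this one.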
That's one wonky CSV file. Multiple headers tossed about (paste it into CSV Fingerprint to see what I mean).
Since I don't know the data, it's impossible to be sure the following produces accurate results for you, but it involves using readLines and other R functions to pre-process the text:
# use readLines to get the data
dat <- readLines("N0_07312014.CSV")
# i had to do this to fix grep errors
Sys.setlocale('LC_ALL','C')
# filter out the repeating, and wonky headers
dat_2 <- grep("Node Name,RTC_date", dat, invert=TRUE, value=TRUE)
# turn that vector into a text connection for read.csv
dat_3 <- read.csv(textConnection(paste0(dat_2, collapse="\n")),
header=FALSE, stringsAsFactors=FALSE)
str(dat_3)
## 'data.frame': 308 obs. of 37 variables:
## $ V1 : chr "Node 0" "Node 0" "Node 0" "Node 0" ...
## $ V2 : chr "07/31/2014" "07/31/2014" "07/31/2014" "07/31/2014" ...
## $ V3 : chr "08:58:18" "08:59:22" "08:59:37" "09:00:06" ...
## $ V4 : chr "" "" "" "" ...
## .. more
## $ V36: chr "" "" "" "" ...
## $ V37: chr "0" "0" "0" "0" ...
# grab the headers
headers <- strsplit(dat[1], ",")[[1]]
# how many of them are there?
length(headers)
## [1] 32
# limit it to the 32 columns you want (Which matches)
dat_4 <- dat_3[,1:32]
# and add the headers
colnames(dat_4) <- headers
str(dat_4)
## 'data.frame': 308 obs. of 32 variables:
## $ Node Name : chr "Node 0" "Node 0" "Node 0" "Node 0" ...
## $ RTC_date : chr "07/31/2014" "07/31/2014" "07/31/2014" "07/31/2014" ...
## $ RTC_time : chr "08:58:18" "08:59:22" "08:59:37" "09:00:06" ...
## $ N1 Bat (VDC) : chr "" "" "" "" ...
## $ N1 Shinyei (ug/m3): chr "" "" "0.23" "null" ...
## $ N1 CC (ppb) : chr "" "" "null" "null" ...
## $ N1 Aeroq (ppm) : chr "" "" "null" "null" ...
## ... continues
If you only need the first 32 columns, and you know how many columns there are in total, you can set the remaining columns' classes to "NULL".
read.csv("C:\\N0_07312014.CSV", na.string=c("","null","NaN","X"),
header=T, stringsAsFactors=FALSE,
colClasses=c(rep("character",32),rep("NULL",10)))
If you do not want to code up each colClass and you like the guesses read.csv makes, then just save that cleaned csv and open it again.
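That round trip could look like this (a sketch; "N0_clean.csv" is a hypothetical output name):
t32 <- read.csv("C:\\N0_07312014.CSV", na.strings = c("", "null", "NaN", "X"),
                header = TRUE, stringsAsFactors = FALSE,
                colClasses = c(rep("character", 32), rep("NULL", 10)))
write.csv(t32, "N0_clean.csv", row.names = FALSE)        # save the clean 32 columns
t <- read.csv("N0_clean.csv", stringsAsFactors = FALSE)  # classes guessed afresh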
Alternatively, you can skip the header, name the columns yourself, and remove the misbehaving rows.
A<-data.frame(read.csv("N0_07312014.CSV",
header=F,stringsAsFactors=FALSE,
colClasses=c(rep("character",32),rep("NULL",5)),
na.string=c("","null","NaN","X")))
Yournames<-as.character(A[1,])
names(A)<-Yournames
yourdata<-unique(A)[-1,]
The code above assumes you do not want any duplicate rows. You can alternatively remove the rows whose first entry equals the first column name, as sketched below.
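A minimal sketch of that alternative, reusing A and Yournames from above:
yourdata <- A[A[[1]] != Yournames[1], ]  # drop every repeated header row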
Try read.table() instead of read.csv().
I was also facing the same issue; it's solved now. Just use header = FALSE:
mydata <- read.csv("data.csv", header = FALSE)
I had the same problem. When I opened my data in a text editor, the double (decimal) values were written with semicolons; you should replace them with a period.
I was getting this error because of multiple rows of metadata at the top of the file. I was able to use read.csv by passing skip= to skip those rows.
data <- read.csv('/blah.csv',skip=3)
For me, the solution was using read.csv2 instead of read.csv.
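A minimal example (read.csv2 defaults to sep = ";" and dec = ","):
read.csv2("file_name.csv")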
read.csv("file_name.csv", header=F)
Setting header = FALSE will do the job perfectly for you...

How can I read a CSV more quickly in R?

I have to read 300 CSV files (each more than 120 MB). I use a for loop, but it is very, very slow. How can I read them more quickly?
My code:
H <- data.frame()
for (i in 201:225) {
  for (j in 1996:2007) {
    filename <- paste("D:/Hannah/CD/CD.R", i, "_cd", j, ".csv", sep = "")
    x <- read.csv(filename, stringsAsFactors = FALSE)
    I <- c("051", "041", "044", "54", "V0262")
    temp <- x[(x$A_1 %in% I) | (x$A_2 %in% I) | (x$A_3 %in% I), ]
    H <- rbind(H, temp)
  }
}
Each file has the same structure, like this:
> str(x)
'data.frame': 417691 obs. of 37 variables:
$ YM: int 199604 199612 199612 199612 199606 199606 199609 199601 ...
$ A_TYPE: int 1 1 1 1 1 1 1 1 1 1 ...
$ HOSP: chr "dd0516ed3e" "c53d67027e" ...
$ A_DATE: int 19960505 19970116 19970108 ...
$ C_TYPE: int 19 9 1 1 2 9 9 1 1 1 ...
$ S_NO : int 142 37974 4580 4579 833 6846 2272 667 447 211 ...
$ C_ITEM_1 : chr "P2" "P3" "A2"...
$ C_ITEM_2 : chr "R6" "I3" ""...
$ C_ITEM_3 : chr "W2" "" "A2"...
$ C_ITEM_4 : chr "Y1" "O3" ""...
$ F_TYPE: chr "40" "02" "02" "02" ...
$ F_DATE : int 19960415 19961223 19961227 ...
$ T_END_DATE: int NA NA NA ...
$ ID_B : int 19630526 19630526 19630526 ...
$ ID : chr "fff" "fac" "eab"...
$ CAR_NO : chr "B4" "B5" "C1" "B6" ...
$ GE_KI: int 4 4 4 4 4 4 4 4 4 4 ...
$ PT_N : chr "H10" "A10" "D10" "D10" ...
$ A_1 : chr "0521" "7948" "A310" "A312" ...
$ A_2 : chr "05235" "5354" "" "" ...
$ A_3 : chr "" "" "" "" ...
$ I_O_CE: chr "5210" "" "" "" ...
$ DR_DAY : int 0 7 3 3 0 0 3 3 3 3 ...
$ M_TYPE: int 2 0 0 0 2 2 0 0 0 0 ...
........
I think the big performance problem here is that you iteratively grow the H object. Each time the object grows, R needs to allocate more memory for it, and this takes quite a long time. A simple fix is to preallocate H to the correct number of rows. If the number of rows is not known beforehand, you can preallocate a good amount and resize as needed.
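A minimal sketch of that idea, using the same loop as in the question (pieces and k are my names): collect the chunks in a pre-sized list and bind once at the end instead of growing H.
pieces <- vector("list", 25 * 12)   # one slot per file
k <- 0
I <- c("051", "041", "044", "54", "V0262")
for (i in 201:225) {
  for (j in 1996:2007) {
    k <- k + 1
    x <- read.csv(paste0("D:/Hannah/CD/CD.R", i, "_cd", j, ".csv"),
                  stringsAsFactors = FALSE)
    pieces[[k]] <- x[(x$A_1 %in% I) | (x$A_2 %in% I) | (x$A_3 %in% I), ]
  }
}
H <- do.call(rbind, pieces)   # a single rbind instead of 300 incremental ones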
Alternatively, the following approach does not suffer from the problem I describe above:
list_of_files = list.files('dir_where_files_are', pattern = '\\.csv$', full.names = TRUE)
big_data_frame = do.call('rbind', lapply(list_of_files, read.csv))
You could also use the fread() function from the data.table package; it's pretty fast compared to read.csv. Also, try to just loop over list.files().
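A minimal sketch of that (assuming the same directory as in the question; fread and rbindlist are both from data.table):
library(data.table)
files <- list.files("D:/Hannah/CD", pattern = "\\.csv$", full.names = TRUE)
big <- rbindlist(lapply(files, fread))  # read each file fast, bind once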
This may not be the most efficient or most elegant approach, but here is what I would do, based on some assumptions, since more info is missing; in particular, I can't do any testing:
Make sure that RSQLite is installed (sqldf could be an option if you have enough memory, but personally I prefer having a "real" database that I can also access with other tools).
# make sqlite available
library( RSQLite )
db <- dbConnect( dbDriver("SQLite"), dbname = "hannah.sqlite" )
# create a vector with your filenames
filenames <- NULL
for (i in 201:225)
{
  for ( j in 1996:2007 )
  {
    fname <- paste( "D:/Hannah/CD/CD.R", i, "_cd", j, ".csv", sep="" )
    filenames <- c( filenames, fname )
  }
}
# extract the DB structure, create an empty table
# ("all" is a reserved word in SQLite, so the table is named "alldata")
x <- read.csv( filenames[1], stringsAsFactors = FALSE, nrows = 1 )
dbWriteTable( db, "alldata", x, row.names = FALSE )
dbGetQuery( db, "DELETE FROM alldata" )
# a small table for your selection criteria (build in flexibility for the future)
I <- data.frame( I = c( "051", "041", "044", "54", "V0262" ) )
dbWriteTable( db, "crit", I, row.names = FALSE )
# move your 300 .csv files into that table
# (you probably do that better using the sqlite CLI but more info would be needed)
for( f in filenames )
{
  x <- read.csv( f, stringsAsFactors = FALSE )
  dbWriteTable( db, "alldata", x, append = TRUE, row.names = FALSE )
}
# now you can extract the subset in one go
extract <- dbGetQuery( db, "SELECT * FROM alldata
                            WHERE A_1 IN ( SELECT I FROM crit ) OR
                                  A_2 IN ( SELECT I FROM crit ) OR
                                  A_3 IN ( SELECT I FROM crit )" )
This is not tested, but it should work (if not, tell me where it stops), and it should be faster and not run into memory problems. But again, without real data, no real solution!
