I am trying to scrape a table from http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary to my R studio.
Here's the code
url<-'http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary'
webpage<-read_html(url)
candidate_info<- html_nodes(webpage,xpath='//*[#id="main"]/div/div[2]/div[2]/table')
candidate_info<- html_table(candidate_info)
head(candidate_info)
But getting no output, suggest what I am doing wrong?
That site has some very broken HTML. But, it's workable.
I find it better to target nodes in a slightly less fragile way. The XPath below finds it by content of the table.
html_table() croaks (or took forever and I didn't want to wait) so I ended up building the table "manually".
library(rvest)
# helper to clean column names
mcga <- function(x) { make.unique(gsub("(^_|_$)", "", gsub("_+", "_", gsub("[[:punct:][:space:]]+", "_", tolower(x)))), sep = "_") }
pg <- read_html("http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary")
# target the table
tab <- html_node(pg, xpath=".//table[contains(thead, 'Liabilities')]")
# get the rows so we can target columns
rows <- html_nodes(tab, xpath=".//tr[td[not(#colspan)]]")
# make a data frame
do.call(
cbind.data.frame,
c(lapply(1:8, function(i) {
html_text(html_nodes(rows, xpath=sprintf(".//td[%s]", i)), trim=TRUE)
}), list(stringsAsFactors=FALSE))
) -> xdf
# make nicer names
xdf <- setNames(xdf, mcga(html_text(html_nodes(tab, "th")))) # get the header to get column names
str(xdf)
## 'data.frame': 4823 obs. of 8 variables:
## $ sno : chr "1" "2" "3" "4" ...
## $ candidate : chr "A Hasiv" "A Wahid" "Aan Shikhar Shrivastava" "Aaptab Urf Aftab" ...
## $ constituency : chr "ARYA NAGAR" "GAINSARI" "GOSHAINGANJ" "MUBARAKPUR" ...
## $ party : chr "BSP" "IND" "Satya Shikhar Party" "Islam Party Hind" ...
## $ criminal_case: chr "0" "0" "0" "0" ...
## $ education : chr "12th Pass" "10th Pass" "Graduate" "Illiterate" ...
## $ total_assets : chr "Rs 3,94,24,827 ~ 3 Crore+" "Rs 75,106 ~ 75 Thou+" "Rs 41,000 ~ 41 Thou+" "Rs 20,000 ~ 20 Thou+" ...
## $ liabilities : chr "Rs 58,46,335 ~ 58 Lacs+" "Rs 0 ~" "Rs 0 ~" "Rs 0 ~" ...
Related
I'm trying to get an API response using a URL that exists in an API data frame I just got, but I'm receiving the error:
"Error: arguments imply differing number of rows"
Does someone now how to fix it?
SCRIPT SO FAR
install.packages("jsonlite", "httr")
library(jsonlite)
library(httr)
### Generating URL and first request
url_deputados <- "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome"
get_deputados <- GET(url_deputados)
### Transforming it to text
deputados_text <- content(get_deputados, "text")
deputados_text
### Converting
deputados_json <- fromJSON(deputados_text, flatten = TRUE)
deputados_json
### Transforming it to table
deputados_df <- as.data.frame(deputados_json)
deputados_df
### And removing the two last columns which I don't need
deputados_df <- deputados_df[1:9]
### Now for the secondary requisitions, I'm creating a URL with the Id that is present in the first column of the data frame I just got
url_base <- "``https://dadosabertos.camara.leg.br/api/v2/``"
url_deputados <- "deputados/"
url_id <- deputados_df$dados.id
id_list <- c(url_id)
i <- 1
url <- paste0(url_base, url_deputados, id_list[i])
url
### Up to this point everything works, but I need to make sequential requests so I can GET the info for the next line of the existing data frame
while (i <= 531) {
print("Próxima página encontrada, baixando...")
get_deputados_id <- GET(paste0(url_base, url_deputados, id_list[i]))
deputados_id_text <- content(get_deputados_id, "text")
deputados_id_json <- fromJSON(deputados_id_text, flatten = TRUE)
deputados_id_df <- as.data.frame(deputados_id_json)
i <- i + 1
}
And this is where I receive the message error
When you run into problems at one line in your code, stop and look at the previous results. For instance, for me (since you didn't specify), I'm getting an error here:
deputados_df <- as.data.frame(deputados_json)
# Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, :
# arguments imply differing number of rows: 532, 3
So ... let's look at deputados_json:
str(deputados_json)
# List of 2
# $ dados:'data.frame': 532 obs. of 9 variables:
# ..$ id : int [1:532] 220593 204379 220714 221328 204560 204528 121948 74646 160508 136811 ...
# ..$ uri : chr [1:532] "https://dadosabertos.camara.leg.br/api/v2/deputados/220593" "https://dadosabertos.camara.leg.br/api/v2/deputados/204379" "https://dadosabertos.camara.leg.br/api/v2/deputados/220714" "https://dadosabertos.camara.leg.br/api/v2/deputados/221328" ...
# ..$ nome : chr [1:532] "Abilio Brunini" "Acácio Favacho" "Adail Filho" "Adilson Barroso" ...
# ..$ siglaPartido : chr [1:532] "PL" "MDB" "REPUBLICANOS" "PL" ...
# ..$ uriPartido : chr [1:532] "https://dadosabertos.camara.leg.br/api/v2/partidos/37906" "https://dadosabertos.camara.leg.br/api/v2/partidos/36899" "https://dadosabertos.camara.leg.br/api/v2/partidos/37908" "https://dadosabertos.camara.leg.br/api/v2/partidos/37906" ...
# ..$ siglaUf : chr [1:532] "MT" "AP" "AM" "SP" ...
# ..$ idLegislatura: int [1:532] 57 57 57 57 57 57 57 57 57 57 ...
# ..$ urlFoto : chr [1:532] "https://www.camara.leg.br/internet/deputado/bandep/220593.jpg" "https://www.camara.leg.br/internet/deputado/bandep/204379.jpg" "https://www.camara.leg.br/internet/deputado/bandep/220714.jpg" "https://www.camara.leg.br/internet/deputado/bandep/221328.jpg" ...
# ..$ email : chr [1:532] "dep.abiliobrunini#camara.leg.br" "dep.acaciofavacho#camara.leg.br" "dep.adailfilho#camara.leg.br" "dep.adilsonbarroso#camara.leg.br" ...
# $ links:'data.frame': 3 obs. of 2 variables:
# ..$ rel : chr [1:3] "self" "first" "last"
# ..$ href: chr [1:3] "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome" "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome&pagina=1&itens=1000" "https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura=57&ordem=ASC&ordenarPor=nome&pagina=1&itens=1000"
(Hint: that's not unambiguously converted into a frame.)
My guess is that you just need to access $dados:
head(deputados_json$dados)
# id uri nome siglaPartido uriPartido siglaUf idLegislatura urlFoto email
# 1 220593 https://dadosabertos.camara.leg.br/api/v2/deputados/220593 Abilio Brunini PL https://dadosabertos.camara.leg.br/api/v2/partidos/37906 MT 57 https://www.camara.leg.br/internet/deputado/bandep/220593.jpg dep.abiliobrunini#camara.leg.br
# 2 204379 https://dadosabertos.camara.leg.br/api/v2/deputados/204379 Acácio Favacho MDB https://dadosabertos.camara.leg.br/api/v2/partidos/36899 AP 57 https://www.camara.leg.br/internet/deputado/bandep/204379.jpg dep.acaciofavacho#camara.leg.br
# 3 220714 https://dadosabertos.camara.leg.br/api/v2/deputados/220714 Adail Filho REPUBLICANOS https://dadosabertos.camara.leg.br/api/v2/partidos/37908 AM 57 https://www.camara.leg.br/internet/deputado/bandep/220714.jpg dep.adailfilho#camara.leg.br
# 4 221328 https://dadosabertos.camara.leg.br/api/v2/deputados/221328 Adilson Barroso PL https://dadosabertos.camara.leg.br/api/v2/partidos/37906 SP 57 https://www.camara.leg.br/internet/deputado/bandep/221328.jpg dep.adilsonbarroso#camara.leg.br
# 5 204560 https://dadosabertos.camara.leg.br/api/v2/deputados/204560 Adolfo Viana PSDB https://dadosabertos.camara.leg.br/api/v2/partidos/36835 BA 57 https://www.camara.leg.br/internet/deputado/bandep/204560.jpg dep.adolfoviana#camara.leg.br
# 6 204528 https://dadosabertos.camara.leg.br/api/v2/deputados/204528 Adriana Ventura NOVO https://dadosabertos.camara.leg.br/api/v2/partidos/37901 SP 57 https://www.camara.leg.br/internet/deputado/bandep/204528.jpg dep.adrianaventura#camara.leg.br
After that, make sure you fix your url_base, It should almost certainly not contain so many backticks.
Finally, you should do the same thing in your while loop:
while (i <= 531) {
get_deputados_id <- GET(paste0(url_base, url_deputados, id_list[i]))
deputados_id_text <- content(get_deputados_id, "text")
deputados_id_json <- fromJSON(deputados_id_text, flatten = TRUE)
# deputados_id_df <- as.data.frame(deputados_id_json)
deputados_id_df <- deputados_id_json$dados
i <- i + 1
}
I'm trying to pull data directly from an API into R using the httr package. The API doesn't require any authentication, and accepts JSON strings of lat, long, elevation, variable sets, and time period to estimate climate variables for any location. This is my first time using an API, but the code below is what I've cobbled together from various Stack Overflow posts.
library(jsonlite)
library(httr)
url = "http://apibc.climatewna.com/api/clmApi"
body <- data.frame(lat = c(48.98,50.2), ##two example locations
lon = c(-115.02, -120),
el = c(1000,100),
prd = c("Normal_1961_1990.nrm","Normal_1961_1990.nrm"),
varYSM = c("Y","SST"))
requestBody <- toJSON(list("output" = body),auto_unbox = TRUE) ##convert to JSON string
result <- POST("http://apibc.climatewna.com/api/clmApi", ##post to API
body = requestBody,
add_headers(`Content-Type`="application/json"))
content(result)
I've tried various different versions of this (e.g. writing the JSON string manually, putting the body as a list in POST with encode = "json"), and it always runs, but the content always contains the below error message:
$Message
[1] "An error has occurred."
$ExceptionMessage
[1] "Object reference not set to an instance of an object."
$ExceptionType
[1] "System.NullReferenceException"
If I use GET and specify the variables directly in the URL
url = "http://apibc.climatewna.com/api/clmApi/LatLonEl?lat=48.98&lon=-115.02&el=1000&prd=Normal_1961_1990&varYSM=Y"
result <- GET(url)
content(result)
it produces the correct output, but then I can only obtain information for one location at a time. There isn't currently any public documentation about this API as it's very new, but I've attached a draft of the section explaining it using JS below. I would very much appreciate any help/suggestions on what I'm doing wrong!
Thank you!
The main problem is that jQuery.ajax encodes the data using jQuery.param before sending it to the API, so what it's sending looks something like [0][lat]=48.98&[0][lon]=-115.02.... I don't know of a package in R that does a similar encoding as jQuery.param, so we'll have to hack something together.
Modifying your example slightly:
library(httr)
body <- data.frame(lat = c(48.98,50.2), ##two example locations
lon = c(-115.02, -120),
el = c(1000,100),
prd = c("Normal_1961_1990","Normal_1961_1990"),
varYSM = c("Y","Y"))
Now, we do the encoding, like so:
out <- sapply(1:nrow(body), function(i) {
paste(c(
paste0(sprintf("[%d][lat]", i - 1), "=", body$lat[i]),
paste0(sprintf("[%d][lon]", i - 1), "=", body$lon[i]),
paste0(sprintf("[%d][el]", i - 1), "=", body$el[i]),
paste0(sprintf("[%d][prd]", i - 1), "=", body$prd[i]),
paste0(sprintf("[%d][varYSM]", i - 1), "=", body$varYSM[i])
), collapse = "&")
})
out <- paste(out, collapse = "&")
so now out is in a form that the API likes. Finally
result <- POST(url = "http://apibc.climatewna.com/api/clmApi", ##post to API
body = out,
add_headers(`Content-Type`="application/x-www-form-urlencoded"))
noting the Content-Type. We get
df <- do.call(rbind, lapply(content(result), as.data.frame, stringsAsFactors = FALSE))
str(df)
# 'data.frame': 2 obs. of 29 variables:
# $ lat : chr "48.98" "50.2"
# $ lon : chr "-115.02" "-120"
# $ elev : chr "1000" "100"
# $ prd : chr "Normal_1961_1990" "Normal_1961_1990"
# $ varYSM : chr "Y" "Y"
# $ MAT : chr "5.2" "8"
# $ MWMT : chr "16.9" "20.2"
# $ MCMT : chr "-6.7" "-5.6"
# $ TD : chr "23.6" "25.7"
# $ MAP : chr "617" "228"
# $ MSP : chr "269" "155"
# $ AHM : chr "24.7" "79.1"
# $ SHM : chr "62.9" "130.3"
# $ DD_0 : chr "690" "519"
# $ DD5 : chr "1505" "2131"
# $ DD_18 : chr "4684" "3818"
# $ DD18 : chr "60" "209"
# $ NFFD : chr "165" "204"
# $ bFFP : chr "150" "134"
# $ eFFP : chr "252" "254"
# $ FFP : chr "101" "120"
# $ PAS : chr "194" "34"
# $ EMT : chr "-36.3" "-32.7"
# $ EXT : chr "37.1" "41.2"
# $ Eref : chr "14.7" "13.6"
# $ CMD : chr "721" "862"
# $ MAR : chr "347" "679"
# $ RH : chr "57" "57"
# $ Version: chr "ClimateBC_API_v5.51" "ClimateBC_API_v5.51"
I am trying to find a way to retrieve data from Harvard Dataverse website through R. I am using "dataverse" and "dvn" packages, among others. Many of the data files end with ".tab", although they are not formatted as normal tab-delimited text.
I have done this:
library(dataverse)
## 01. Using the dataverse server and making a search
Sys.setenv("DATAVERSE_SERVER" ="dataverse.harvard.edu")
## 02. Loading the dataset that I chose, by url
doi_url <- "https://doi.org/10.7910/DVN/ZTCWYQ"
my_dataset <- get_dataset(doi_url)
## 03. Grabbing the first file of the dataset
## which is named "001_AppendixC.tab"
my_files <- my_dataset$files$label
my_file <- get_file(my_files[1], doi_url)
AppendixC <- tempfile()
writeBin(my_file, AppendixC)
read.table(AppendixC)
> Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
> line 1 did not have 2 elements
> In addition: Warning message:
> In read.table(AppendixC) :
> line 1 appears to contain embedded nulls
Any hint?
The problem is that dataverse::get_file() returns the file in a raw binary format. The easiest way to load it into memory is to write it to a tempfile with writeBin() and then read that file with the appropriate import/read function.
Here is a function that should automagically read it into memory:
# Uses rio, which automatically chooses the appropriate import/read
# function based on file type.
library(rio)
install_formats() # only needs to run once after
# pkg installation
load_raw_file <- function(raw, type) {
match.arg(
arg = type,
choices = c(
"csv", "tab", "psc", "tsv", "sas7bdat",
"sav", "dta", "xpt", "por", "xls", "xlsx",
"R", "RData", "rda", "rds", "rec", "mtb",
"feather", "csv.gz", "fwf"
)
)
tmp <- tempfile(fileext = paste0(".", type))
writeBin(as.vector(raw), tmp)
out <- import(tmp)
unlink(tmp)
out
}
Let's try it out with your file, which is a an excel file.
library(dataverse)
raw <- get_file(
"001_AppendixC.tab",
"https://doi.org/10.7910/DVN/ZTCWYQ"
)
data <- load_raw_file(raw, "xlsx")
And look at the data:
str(data)
> 'data.frame': 132 obs. of 17 variables:
> $ Country : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
> $ UN_9193 : chr "37.4" "7.7" "9.1" "65.400000000000006" ...
> $ UN_9901 : chr "46.1" "7.2" "10.7" "50" ...
> $ UN_0709 : chr "24.6" "9.6999999999999993" "7.5" "23.7" ...
> $ UN_1416 : chr "23" "4.9000000000000004" "4.5999999999999996" "14" ...
> $ stu90_94 : chr "51.3" "37.200000000000003" "22.9" "52.9" ...
> $ stu98_02 : chr "54.7" "39.200000000000003" "23.6" "47.1" ...
> $ stu06_10 : chr "51.3" "23.1" "13.2" "29.2" ...
> $ stu12_16 : chr "40.9" "17.899999999999999" "11.7" "37.6" ...
> $ wast90_94: chr "11.5" "9.4" "7.1" "7.9" ...
> $ wast98_02: chr "13.4" "12.2" "3.1" "8.6999999999999993" ...
> $ wast06_10: chr "8.9" "9.4" "4.0999999999999996" "8.1999999999999993" ...
> $ wast12_16: chr "9.5" "6.2" "4.0999999999999996" "4.9000000000000004" ...
> $ UM1992 : chr "16.8" "3.7" "4.5" "22.6" ...
> $ UM2000 : chr "13.7" "2.6" "4" "21.7" ...
> $ UM2008 : chr "11" "1.8" "2.9" "19.2" ...
> $ UM2015 : chr "9.1" "1.4" "2.6" "15.7" ...
I have read an ascii (.spe) file into R. This file contains one column of, mostly, integers. However R is interpreting these integers incorrectly, probably because I am not specifying the correct format or something like that. The file was generated in Ortec Maestro software. Here is the code:
library(SDMTools)
strontium<-read.table("C:/Users/Hal 2/Desktop/beta_spec/strontium 90 spectrum.spe",header=F,skip=2)
str_spc<-vector(mode="numeric")
for (i in 1:2037)
{
str_spc[i]<-as.numeric(strontium$V1[i+13])
}
Here, for example, strontium$V1[14] has the value 0, but R is interpreting it as a 10. I think I may have to convert the data to some other format, or something like that, but I'm not sure and I'm probably googling the wrong search terms.
Here are the first few lines from the file:
$SPEC_ID:
No sample description was entered.
$SPEC_REM:
DET# 1
DETDESC# MCB 129
AP# Maestro Version 6.08
$DATE_MEA:
10/14/2014 15:13:16
$MEAS_TIM:
1516 1540
$DATA:
0 2047
Here is a link to the file: https://www.dropbox.com/sh/y5x68jen487qnmt/AABBZyC6iXBY3e6XH0XZzc5ba?dl=0
Any help appreciated.
I saw someone had made a parser for SPE Spectra files in python and I can't let that stand without there being at least a minimally functioning R version, so here's one that parses some of the fields, but gets you your data:
library(stringr)
library(gdata)
library(lubridate)
read.spe <- function(file) {
tmp <- readLines(file)
tmp <- paste(tmp, collapse="\n")
records <- strsplit(tmp, "\\$")[[1]]
records <- records[records!=""]
spe <- list()
spe[["SPEC_ID"]] <- str_match(records[which(startsWith(records, "SPEC_ID"))],
"^SPEC_ID:[[:space:]]*([[:print:]]+)[[:space:]]+")[2]
spe[["SPEC_REM"]] <- strsplit(str_match(records[which(startsWith(records, "SPEC_REM"))],
"^SPEC_REM:[[:space:]]*(.*)")[2], "\n")
spe[["DATE_MEA"]] <- mdy_hms(str_match(records[which(startsWith(records, "DATE_MEA"))],
"^DATE_MEA:[[:space:]]*(.*)[[:space:]]$")[2])
spe[["MEAS_TIM"]] <- strsplit(str_match(records[which(startsWith(records, "MEAS_TIM"))],
"^MEAS_TIM:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
spe[["ROI"]] <- str_match(records[which(startsWith(records, "ROI"))],
"^ROI:[[:space:]]*(.*)[[:space:]]$")[2]
spe[["PRESETS"]] <- strsplit(str_match(records[which(startsWith(records, "PRESETS"))],
"^PRESETS:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
spe[["ENER_FIT"]] <- strsplit(str_match(records[which(startsWith(records, "ENER_FIT"))],
"^ENER_FIT:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
spe[["MCA_CAL"]] <- strsplit(str_match(records[which(startsWith(records, "MCA_CAL"))],
"^MCA_CAL:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
spe[["SHAPE_CAL"]] <- str_match(records[which(startsWith(records, "SHAPE_CAL"))],
"^SHAPE_CAL:[[:space:]]*(.*)[[:space:]]*$")[2]
spe_dat <- strsplit(str_match(records[which(startsWith(records, "DATA"))],
"^DATA:[[:space:]]*(.*)[[:space:]]$")[2], "\n")[[1]]
spe[["SPE_DAT"]] <- as.numeric(gsub("[[:space:]]", "", spe_dat)[-1])
return(spe)
}
dat <- read.spe("strontium 90 spectrum.Spe")
str(dat)
## List of 10
## $ SPEC_ID : chr "No sample description was entered."
## $ SPEC_REM :List of 1
## ..$ : chr [1:3] "DET# 1" "DETDESC# MCB 129" "AP# Maestro Version 6.08"
## $ DATE_MEA : POSIXct[1:1], format: "2014-10-14 15:13:16"
## $ MEAS_TIM : chr "1516 1540"
## $ ROI : chr "0"
## $ PRESETS : chr [1:3] "None" "0" "0"
## $ ENER_FIT : chr "0.000000 0.002529"
## $ MCA_CAL : chr [1:2] "3" "0.000000E+000 2.529013E-003 0.000000E+000 keV"
## $ SHAPE_CAL: chr "3\n3.100262E+001 0.000000E+000 0.000000E+000"
## $ SPE_DAT : num [1:2048] 0 0 0 0 0 0 0 0 0 0 ...
head(dat$SPE_DAT)
## [1] 0 0 0 0 0 0
It needs some polish and there's absolutely no error checking (i.e. for missing fields), but no time today to deal with that. I'll finish the parsing and make a minimal package wrapper for it over the next couple days.
I have to read a CSV (each more than 120MB). I use a for loop, but it was very very very slow. How can I read a CSV more quickly?
My code:
H=data.frame()
for (i in 201:225){
for (j in 1996:2007){
filename=paste("D:/Hannah/CD/CD.R",i,"_cd",j,".csv",sep="")
x=read.csv(filename,stringsAsFactor=F)
I=c("051","041","044","54","V0262")
temp=x[(x$A_1 %in% I)|(x$A_2 %in% I)|(x$A_3 %in% I), ]
H=rbind(H,temp)
}
}
each files structuration are same like this
> str(x)
'data.frame': 417691 obs. of 37 variables:
$ YM: int 199604 199612 199612 199612 199606 199606 199609 199601 ...
$ A_TYPE: int 1 1 1 1 1 1 1 1 1 1 ...
$ HOSP: chr "dd0516ed3e" "c53d67027e" ...
$ A_DATE: int 19960505 19970116 19970108 ...
$ C_TYPE: int 19 9 1 1 2 9 9 1 1 1 ...
$ S_NO : int 142 37974 4580 4579 833 6846 2272 667 447 211 ...
$ C_ITEM_1 : chr "P2" "P3" "A2"...
$ C_ITEM_2 : chr "R6" "I3" ""...
$ C_ITEM_3 : chr "W2" "" "A2"...
$ C_ITEM_4 : chr "Y1" "O3" ""...
$ F_TYPE: chr "40" "02" "02" "02" ...
$ F_DATE : int 19960415 19961223 19961227 ...
$ T_END_DATE: int NA NA NA ...
$ ID_B : int 19630526 19630526 19630526 ...
$ ID : chr "fff" "fac" "eab"...
$ CAR_NO : chr "B4" "B5" "C1" "B6" ...
$ GE_KI: int 4 4 4 4 4 4 4 4 4 4 ...
$ PT_N : chr "H10" "A10" "D10" "D10" ...
$ A_1 : chr "0521" "7948" "A310" "A312" ...
$ A_2 : chr "05235" "5354" "" "" ...
$ A_3 : chr "" "" "" "" ...
$ I_O_CE: chr "5210" "" "" "" ...
$ DR_DAY : int 0 7 3 3 0 0 3 3 3 3 ...
$ M_TYPE: int 2 0 0 0 2 2 0 0 0 0 ...
........
I think the big performance problem here is that you iteratively grow the H object. Each time the object grows, the OS needs to allocate more for it. This process takes quite long. A simple fix would be to preallocate H to the correct number of rows. If the number of rows is not known beforehand, you can preallocate a good amount, and resize as needed.
Alternatively, the following approach does not suffer form the problem I describe above:
list_of_files = list.files('dir_where_files_are', pattern = '*csv', full.names = TRUE)
big_data_frame = do.call('rbind', lapply(list_of_files, read.csv, sep = ""))
You could also use function fread() from the data.table package. It's pretty fast compared to read.csv. Also, try to just loop over list.files().
This may not be the most efficient or most elegant approach, but here is what I would do, based upon some assumptions where more info is missing; particularly, can't do any testing:
Make sure that RSQLite is installed (sqldf could be an option if you have enough memory, but personally I prefer having a "real" database that I also can access with other tools).
# make sqlite available
library( RSQLite )
db <- dbConnect( dbDriver("SQLite"), dbname = "hannah.sqlite" )
# create a vector with your filenames
filenames <- NULL
for (i in 201:225)
{
for ( j in 1996:2007 )
{
fname <- paste( "D:/Hannah/CD/CD.R", i, "_cd", j, ".csv", sep="" )
filenames <- c( filenames, fname )
}
}
# extract the DB structure, create empty table
x <- read.csv( filenames[1], stringsAsFactor = FALSE, nrows = 1 )
dbWriteTable( db, "all", x, row.names = FALSE )
dbGetQuery( db, "DELETE FROM all" )
# a small table for your selection criteria (build in flexibility for the future)
I <- as.data.frame( c( "051", "041", "044", "54", "V0262" ) )
dbWriteTable( db, "crit", I, row.names = FALSE )
# move your 300 .csv files into that table
# (you probably do that better using the sqlite CLI but more info would be needed)
for( f in filenames )
{
x <- read.csv( f, stringsAsFactor = FALSE )
dbWriteTable( db, "all", x, append = TRUE, row.names = FALSE )
}
# now you can extract the subset in one go
extract <- dbGetQuery( db, "SELECT * FROM all
WHERE A_1 IN (SELECT I FROM crit ) OR
A_2 IN (SELECT I FROM crit ) OR
A_3 IN (SELECT I FROM crit )" )
This is not tested but should work (if not, tell me where it stops) and it should be faster and not run into memory problems. But again, without real data no real solution!