Chunking txt files in R

Hi all,
I'm working from Matthew Jockers's code in his book "Text Analysis with R for Students of Literature".
In it he provides code to pull all <p> tags from XML documents, chop that content into 1000-word chunks, and apply a number of data-massaging steps. Once that's done, he inserts that chunking function into a loop that produces a data matrix ready to be used in mallet. Please see the code below.
My question is: how do I do the same thing with .txt files? Obviously, plain text files do not have tags like <p> to work from. I'm not an experienced programmer, so go easy on me please!
library(XML)  # provides xmlTreeParse(), getNodeSet(), xmlValue()

chunk.size <- 1000  # number of words per chunk

makeFlexTextChunks <- function(doc.object, chunk.size = 1000, percentage = TRUE) {
  paras <- getNodeSet(doc.object,
                      "/d:TEI/d:text/d:body//d:p",
                      c(d = "http://www.tei-c.org/ns/1.0"))
  words <- paste(sapply(paras, xmlValue), collapse = " ")
  words.lower <- tolower(words)
  words.lower <- gsub("[^[:alnum:][:space:]']", " ", words.lower)
  words.l <- strsplit(words.lower, "\\s+")
  word.v <- unlist(words.l)
  x <- seq_along(word.v)
  if (percentage) {
    max.length <- length(word.v) / chunk.size
    chunks.l <- split(word.v, ceiling(x / max.length))
  } else {
    chunks.l <- split(word.v, ceiling(x / chunk.size))
    # deal with small chunks at the end: merge a final chunk of at most
    # chunk.size/2 words into the previous chunk
    if (length(chunks.l[[length(chunks.l)]]) <= chunk.size / 2) {
      chunks.l[[length(chunks.l) - 1]] <-
        c(chunks.l[[length(chunks.l) - 1]],
          chunks.l[[length(chunks.l)]])
      chunks.l[[length(chunks.l)]] <- NULL
    }
  }
  chunks.l <- lapply(chunks.l, paste, collapse = " ")
  chunks.df <- do.call(rbind, chunks.l)
  return(chunks.df)
}

topic.m <- NULL
# input.dir and files.v (the XML directory and its file names) are defined earlier in the book
for (i in 1:length(files.v)) {
  doc.object <- xmlTreeParse(file.path(input.dir, files.v[i]),
                             useInternalNodes = TRUE)
  chunk.m <- makeFlexTextChunks(doc.object, chunk.size,
                                percentage = FALSE)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname,
                            segment = 1:nrow(chunk.m), sep = "_"), chunk.m)
  topic.m <- rbind(topic.m, segments.m)
}

Thank you everybody for your help. I think I found my answer after much trial and error! The key was to read the txt files with scan(paste(input.dir, files.v[i], sep="/")) inside the loop rather than inside the function. Please see my code here:
input.dir <- "data/plainText"
files.v <- dir(input.dir, ".*txt")
chunk.size <- 100  # number of words per chunk

makeFlexTextChunks <- function(doc.object, chunk.size = 100, percentage = TRUE) {
  words.lower <- tolower(paste(doc.object, collapse = " "))
  words.lower <- gsub("[^[:alnum:][:space:]']", " ", words.lower)
  words.l <- strsplit(words.lower, "\\s+")
  word.v <- unlist(words.l)
  x <- seq_along(word.v)
  if (percentage) {
    max.length <- length(word.v) / chunk.size
    chunks.l <- split(word.v, ceiling(x / max.length))
  } else {
    chunks.l <- split(word.v, ceiling(x / chunk.size))
    # deal with small chunks at the end: merge a final chunk of at most
    # chunk.size/2 words into the previous chunk
    if (length(chunks.l[[length(chunks.l)]]) <= chunk.size / 2) {
      chunks.l[[length(chunks.l) - 1]] <-
        c(chunks.l[[length(chunks.l) - 1]],
          chunks.l[[length(chunks.l)]])
      chunks.l[[length(chunks.l)]] <- NULL
    }
  }
  chunks.l <- lapply(chunks.l, paste, collapse = " ")
  chunks.df <- do.call(rbind, chunks.l)
  return(chunks.df)
}

topic.m <- NULL
for (i in 1:length(files.v)) {
  doc.object <- scan(paste(input.dir, files.v[i], sep = "/"), what = "character", sep = "\n")
  chunk.m <- makeFlexTextChunks(doc.object, chunk.size, percentage = FALSE)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname, segment = 1:nrow(chunk.m), sep = "_"), chunk.m)
  topic.m <- rbind(topic.m, segments.m)
}
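From there the matrix goes to mallet, roughly along these lines. This is only a sketch of the next step in the book's workflow: it assumes the mallet package is installed, and "data/stoplist.csv" is an illustrative path to a one-word-per-line stop-word file, not a file the code above creates.
library(mallet)

documents <- as.data.frame(topic.m, stringsAsFactors = FALSE)
colnames(documents) <- c("id", "text")

# "data/stoplist.csv" is an assumed stoplist path; adjust to your own file
mallet.instances <- mallet.import(documents$id, documents$text,
                                  "data/stoplist.csv",
                                  FALSE, token.regexp = "[\\p{L}']+")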

Maybe this can point you in the right direction. The following code reads in a txt file and splits the words up into elements of a vector.
library(readr)
library(stringr)

url <- "http://www.gutenberg.org/files/98/98-0.txt"
mystring <- read_file(url)            # read the whole file into a single string
res <- str_split(mystring, "\\s+")    # split on whitespace; res[[1]] is the word vector
Then you can split it into chunks of 1000 words and do your magic?
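For example, a minimal sketch of that chunking step, reusing the split/ceiling idiom from the code above (res comes from the str_split() call):
word.v <- res[[1]]
word.v <- word.v[word.v != ""]        # drop empty strings left over from the split
chunk.size <- 1000

# group the words into consecutive 1000-word chunks
chunks.l <- split(word.v, ceiling(seq_along(word.v) / chunk.size))

# collapse each chunk back into a single string, one element per chunk
chunk.v <- vapply(chunks.l, paste, character(1), collapse = " ")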

Related

R parentheses problems and an alternative way of simplifying CSV concatenations

I'm new to R and not yet very used to the syntax. I got the following error:
Error: unexpected '}' in "}"
So I know there is a problem with my parentheses somewhere.
The problem is, I have been looking for an hour now and I couldn't find any unmatched brackets.
While I was at it, the code itself also seemed quite convoluted for a task that should be simple.
My intention is to search a directory full of CSV files and concatenate (row-wise) those that have the same filename prefix. Is there already a function in R for this? Or is the following approach acceptable?
concated_CSV <- data.frame()
Data1 <- data.frame(n)
Data2 <- data.frame()

for (File in Filenames) {
  if (Data1$n == 1) {
    Data1 <- read.csv(File, header = TRUE, sep = ";", dec = ",")
    Filename_Data1 <- unlist(strsplit(File, ".csv"))
    Tendril_Nr_Data1 <- unlist(strsplit(File, "_"))[1]
  } else if (is.na(Data1$n)) {
    Data2 <- read.csv(File, header = TRUE, sep = ";", dec = ",")
    Filename_Data2 <- unlist(strsplit(File, ".csv"))
    Tendril_Nr_Data2 <- unlist(strsplit(File, "_"))[1]
  } else if (Tendril_Nr_Data1 == Tendril_Nr_Data2) {
    concated_CSV <- rbind(Data1, Data2)
    new_Filename <- paste0(trg_dir, "/", Tendril_Nr_Data1, ".csv")
    write.csv(concated_CSV, new_Filename, row.names = FALSE)
  }
}
Thank you very much, and best wishes.
Thanks for your answers. As you can see, I'm also new to Stack Overflow and have only been on the reading side so far.
Here is the code I tried to simplify so that you can use it.
The "Filenames" vector represents the filenames I'm dealing with.
# Stack Overflow example
Filenames <- c("6.1.3.1_1.CSV", "6.1.3.1_2.CSV", "6.4.3.1.CSV",
               "6.1.2.1_1.CSV", "6.1.2.1_2.CSV", "6.1.5.CSV")
Filename_Data1 <- "6.1.3.1_1.CSV"
Filename_Data2 <- "6.1.3.1_2.CSV"

# record file for an output
concated_CSV <- data.frame()
n <- 1
Data1 <- data.frame(n)
Data2 <- data.frame()

for (File in Filenames) {
  if (Data1$n == 1) {
    Data1 <- read.csv(File, header = TRUE, sep = ";", dec = ",")
    Filename_Data1 <- unlist(strsplit(File, ".csv"))
    Tendril_Nr_Data1 <- unlist(strsplit(Filename_Data1, "_"))[1]
  } else if (Data1$n =! 1) {
    Data2 <- read.csv(File, header = TRUE, sep = ";", dec = ",")
    Filename_Data2 <- unlist(strsplit(File, ".csv"))
    Tendril_Nr_Data2 <- unlist(strsplit(Filename_Data1, "_"))[1]
  } else if (identical(Tendril_Nr_Data1, Tendril_Nr_Data2)) {
    concated_CSV <- rbind(Data1, Data2)
    # this is the name and directory to which the file should be saved
    # new_Filename <- paste0(trg_dir, "/", Tendril_Nr_Data1, ".csv")
    n_Filename <- "hello"
    write.csv(concated_CSV, n_Filename, row.names = FALSE)
  }
}
The missing parenthesis still hasn't turned up.
My intention is to write a program that compares the CSV filenames in a given directory, and if a filename appears twice, for example "abc_1.csv" and "abc_2.csv", the program should concatenate the CSV data row-wise and save a file named "abc.csv" (I hope this is clearer).
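For what it's worth, a base-R sketch of that idea might group the files by their prefix and row-bind each group. The directories src_dir and trg_dir are hypothetical placeholders, and it assumes files in the same group share identical columns:
src_dir <- "path/to/csv/files"   # hypothetical source directory
trg_dir <- "path/to/output"      # hypothetical target directory

csv_files <- list.files(src_dir, pattern = "\\.csv$", ignore.case = TRUE, full.names = TRUE)

# grouping key: file name without extension and without a trailing "_<n>" suffix
prefix <- sub("_[0-9]+$", "", tools::file_path_sans_ext(basename(csv_files)))

for (p in unique(prefix)) {
  parts <- lapply(csv_files[prefix == p],
                  read.csv, header = TRUE, sep = ";", dec = ",")
  combined <- do.call(rbind, parts)   # row-wise concatenation of all parts
  write.csv(combined, file.path(trg_dir, paste0(p, ".csv")), row.names = FALSE)
}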

Web scraping data in R when there are a lot of links

I am trying to scrape baseball data from baseball-reference (e.g., https://www.baseball-reference.com/teams/NYY/2017.shtml). I have a huge vector of URLs that I created using a for loop, since the links follow a specific pattern. However, I am having trouble running my code, probably because I am making too many connections within R. There are over 17,000 elements in my vector, and my code stops working once it gets to around 16,000. Is there an easier and perhaps more efficient way to replicate my code?
require(Lahman)
library(XML)  # for htmlParse() and readHTMLTable()

teams <- unique(Teams$franchID)
years <- 1871:2017

# build every team/year URL
urls <- matrix(0, length(teams), length(years))
for (i in 1:length(teams)) {
  for (j in 1:length(years)) {
    urls[i, j] <- paste0("https://www.baseball-reference.com/teams/",
                         teams[i], "/", years[j], ".shtml")
  }
}
url_vector <- as.vector(urls)

list_of_batting <- list()
list_of_pitching <- list()

for (i in 1:length(url_vector)) {
  url <- url_vector[i]
  res <- try(readLines(url), silent = TRUE)
  ## check if website exists
  if (inherits(res, "try-error")) {
    list_of_batting[[i]] <- NA
    list_of_pitching[[i]] <- NA
  } else {
    # reuse the page already downloaded above instead of calling readLines() a second time
    urltxt <- gsub("-->", "", gsub("<!--", "", res))
    doc <- htmlParse(urltxt)
    tables_full <- readHTMLTable(doc)
    list_of_batting[[i]] <- tables_full$players_value_batting
    list_of_pitching[[i]] <- tables_full$players_value_pitching
  }
  print(i)
  closeAllConnections()
}
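One pragmatic workaround for long scraping runs (a sketch, not tested against baseball-reference, and assuming library(XML) is loaded as above) is to wrap the download in a small helper, pause between requests, and checkpoint partial results so a failure near element 16,000 does not cost the whole run. The file name scrape_checkpoint.rds is illustrative:
scrape_one <- function(url) {
  res <- try(readLines(url), silent = TRUE)
  if (inherits(res, "try-error")) return(NULL)     # page missing or connection failed
  txt <- paste(gsub("-->", "", gsub("<!--", "", res)), collapse = "\n")
  doc <- htmlParse(txt, asText = TRUE)
  tables <- readHTMLTable(doc)
  list(batting  = tables$players_value_batting,
       pitching = tables$players_value_pitching)
}

results <- vector("list", length(url_vector))
for (i in seq_along(url_vector)) {
  results[[i]] <- scrape_one(url_vector[i])
  Sys.sleep(1)                                     # pause between requests
  if (i %% 500 == 0) saveRDS(results, "scrape_checkpoint.rds")  # periodic checkpoint
}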

Working with large csv file in R

Any help will be appreciated.
I used the following code to break down my large CSV file (4 GB), and now I am trying to save the 2nd, 3rd, ... parts into CSV files. However, I can only access the first chunk of my data.
Is there anything wrong with my code?
How do I save the second chunk of my data into a CSV?
rgfile <- 'filename.csv'
index <- 0
chunkSize <- 100000
con <- file(description = rgfile, open = "r")
dataChunk <- read.table(con, nrows = chunkSize, header = TRUE, fill = TRUE, sep = ",")
actualColumnNames <- names(dataChunk)

repeat {
  index <- index + 1
  print(paste('Processing rows:', index * chunkSize))
  if (nrow(dataChunk) != chunkSize) {
    print('Processed all files!')
    break
  }
  dataChunk <- read.table(
    con, nrows = chunkSize, skip = 0, header = FALSE,
    fill = TRUE, sep = ",", col.names = actualColumnNames
  )
  break
}
library(tidyverse)
library(nycflights13)

# make the problem reproducible
rgfile <- 'flights.csv'
write_csv(flights, rgfile)

# now, get to work
lines <- as.numeric(R.utils::countLines(rgfile))
chunk_size <- 100000

hdr <- read_csv(rgfile, n_max = 2)   # read a couple of rows just to capture the column names

fnum <- 1
for (i in seq(1, lines, chunk_size)) {
  suppressMessages(
    read_csv(
      rgfile, col_names = colnames(hdr), skip = (i - 1), n_max = chunk_size
    )
  ) -> x
  if (i > 1) colnames(x) <- colnames(hdr)
  write_csv(x, sprintf("file%03d.csv", fnum))
  fnum <- fnum + 1
}
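readr also has a chunked reader, which avoids the manual skip arithmetic. Here is a sketch of writing each 100,000-row chunk straight to its own file; the callback interface shown is the one documented in readr, but it is worth checking against your installed version:
library(readr)

rgfile <- 'flights.csv'
fnum <- 0

read_csv_chunked(
  rgfile,
  SideEffectChunkCallback$new(function(x, pos) {
    fnum <<- fnum + 1
    write_csv(x, sprintf("chunk%03d.csv", fnum))   # one output file per chunk
  }),
  chunk_size = 100000
)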

Speed up text mining (and for loop) in R

I'm text-mining thousands of documents (basically doing frequency counts) and wondering if there are any other ways to speed up the following process? Currently it takes more than 10 hours to run the whole analysis. Thank you (from an R beginner).
sessionInfo()
# R version 3.2.3 (2015-12-10)

library(bitops)
library(RCurl)
library(XML)
library(stringr)
library(tm)

setwd("F:/testing_folder")
path <- "F:/testing_folder"
file.names <- dir(path, pattern = ".txt")

filename <- vector()
totalword <- vector()

system.time(
  for (i in 1:length(file.names)) {
    text.v <- scan(file.names[i], what = "character", sep = "\n", encoding = "UTF-8")
    report.v <- paste(text.v, collapse = " ")
    # count total number of words
    words.l <- strsplit(report.v, "\\W")
    word.v <- unlist(words.l)
    not.blanks.v <- which(word.v != "")
    word.v <- word.v[not.blanks.v]
    totalword <- append(totalword, length(word.v))
    filename <- append(filename, print(file.names[i]))
    x <- data.frame(filename, totalword)
    write.csv(x, file = "results.csv")  # export results
  }
)
What do you get from the following?
Rprof("profile1.out", line.profiling=TRUE)
source("http://pastebin.com/raw/kFGCse5s")
Rprof(NULL)
proftable("profile1.out", lines=10)

How to parse INI like configuration files with R?

Is there an R function for parsing INI-like configuration files?
While searching I only found this discussion.
Here is an answer that was given to exactly the same question on r-help in 2007 (thanks to @Spacedman for pointing this out):
Parse.INI <- function(INI.filename)
{
  connection <- file(INI.filename)
  Lines <- readLines(connection)
  close(connection)

  Lines <- chartr("[]", "==", Lines)  # change section headers

  connection <- textConnection(Lines)
  d <- read.table(connection, as.is = TRUE, sep = "=", fill = TRUE)
  close(connection)

  L <- d$V1 == ""                     # location of section breaks
  d <- subset(transform(d, V3 = V2[which(L)[cumsum(L)]])[1:3],
              V1 != "")

  ToParse <- paste("INI.list$", d$V3, "$", d$V1, " <- '",
                   d$V2, "'", sep = "")

  INI.list <- list()
  eval(parse(text = ToParse))

  return(INI.list)
}
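A quick usage sketch (the file contents below are made up for illustration):
ini_file <- tempfile(fileext = ".ini")
writeLines(c("[owner]", "name=john",
             "[database]", "server=192.0.2.62", "port=143"), ini_file)

cfg <- Parse.INI(ini_file)
cfg$database$port   # "143"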
Actually, I wrote a short and presumably buggy function (i.e. not covering all corner cases) which works for me now:
read.ini <- function(x) {
  # x can be a file name, a character vector of lines, or a single string of INI content
  if (length(x) == 1 && !any(grepl("\\n", x))) lines <- readLines(x) else lines <- x
  lines <- unlist(strsplit(lines, "\n", fixed = TRUE))  # split every element, not just the first
  lines <- lines[!grepl("^;", lines) & nchar(lines) >= 2]  # strip comments & blank lines
  lines <- gsub("\\r$", "", lines)
  idx <- which(grepl("^\\[.+\\]$", lines))
  if (idx[[1]] != 1) stop("invalid INI file. Must start with a section.")
  res <- list()
  fun <- function(from, to) {
    tups <- strsplit(lines[(from + 1):(to - 1)], "[ ]*=[ ]*")
    for (i in 1:length(tups))
      if (length(tups[[i]]) > 2)
        tups[[i]] <- c(tups[[i]][[1]],
                       gsub("\\=", "=", paste(tail(tups[[i]], -1), collapse = "=")))
    tups <- unlist(tups)
    keys <- strcap(tups[seq(from = 1, by = 2, length.out = length(tups) / 2)])
    vals <- tups[seq(from = 2, by = 2, length.out = length(tups) / 2)]
    sec <- strcap(substring(lines[[from]], 2, nchar(lines[[from]]) - 1))
    res[[sec]] <<- setNames(vals, keys)
  }
  mapply(fun, idx, c(tail(idx, -1), length(lines) + 1))
  return(res)
}
where strcap is a helper function that capitalizes a string:
strcap <- function(s) paste(toupper(substr(s,1,1)), tolower(substring(s,2)), sep="")
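And a quick check of read.ini() on an inline string; note that section and key names come back capitalised because of strcap():
ini_text <- "[owner]\nname=john\n[database]\nserver=192.0.2.62\nport=143"
cfg <- read.ini(ini_text)
cfg$Database[["Port"]]   # "143"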
There are also some C solutions for this, like inih or libini that might be useful. I did not try them out, though.
