rvest - Scraping a Table - r

I am scraping a webpage using the rvest library; my interest is to extract all the data from the tables present in the webpage.
library(rvest)
library(tidyr)
url <- ''
# Parsing the HTML Code from Website
hdb_webpage <- read_html(url)
## Grabbing Page Info - Table Input 1
dat_1 <- hdb_webpage %>%
html_table(header=FALSE) %>%
.[[2]] %>%
as.data.frame()
# Transposing
dat_1 <- as.data.frame(t(dat_1$X3))
# Changing colnames
colnames(dat_1) <- c("Name", "Address", "Category", "TradeType", "Contact")
I continue to do the same manually for the rest of the data frames present in the list. There are actually 18 data frames in the list, with varying variables and observations, which leads to spending much time cleaning the data.
Alternatively, for scraping the whole table, I use the following code:
tbls_ls <- hdb_webpage %>%
html_nodes("table") %>%
html_table(header = FALSE) %>%
.[2:18]
df <- data.frame(matrix(unlist(tbls_ls), nrow = 279, byrow = TRUE), stringsAsFactors = FALSE)
df <- unique(df)
This code extracts all the information from the tables into a list; I then use unlist to convert it into a data frame and apply unique to keep the relevant data.
Is there a way through which I can extract all the data from the tables without going through them one by one?

When you look at the raw list rw.list read in from html_table(), there are three if-cases to be handled differently.
library(rvest)
path <- 'https://services2.hdb.gov.sg/webapp/AA16RMSBusinessDirectory/AA16SLevelmap?SearchOption=1&BLK=166&STREET=WOODLANDS+STREET+13++++++++++++++++++++++++++++++++++++++++++++++++++%EF%BF%BD&pcode=730166&STREETLIST=--&MAIN_TRADE_CODE=0000Please+Select+Category%24&Forward=&FROMHOME=true&Slvl=1&SEARCHPANEL=1&MAIN_TRADE_DESC'
# Parsing the HTML Code from Website
rw <- read_html(path)
rw.list <- html_table(rw)[-1]
names(rw.list) <- lapply(rw.list, function(x) # attribute clean names
unique(gsub("\\n|\\r|\\t|\\s+(More Information)?", "", x[1, ])))
l <- lapply(rw.list, function(x) t(x[-(1:2), ]))
l1 <- lapply(1:length(l), function(x) {
d <- as.data.frame(l[[x]], stringsAsFactors=FALSE)
names(d) <- d[1, ]
if (length(d) == 10 || length(d) == 6)
out <- matrix(unlist(d[3, grep("Category|Trade|(Tel No)", names(d))]),
ncol=2,
dimnames=list(NULL, d[1, 1:2]))
else if (length(d) == 8)
out <- matrix(unlist(t(d[3, grep("Category|Trade|(Tel No)", names(d))])),
ncol=3, byrow=TRUE, dimnames=list(NULL, d[1, 1:3]))
else
out <- d[3, ]
return(cbind(id=names(l)[x], out))
})
We can merge the cleaned list with Reduce().
result <- Reduce(function(...) merge(..., all=TRUE), l1)
Result
head(result, 3)
# id Category Trade Tel No
# 1 1.GREENEMERALDAQUARIA Pets Aquarium Fish (freshwater/marine) And Accessories 68160208
# 2 2.SEEMRALICIOUS Beauty Beauty Salon 66357994
# 3 3.MORRISONOPTICALPTELTD Shopping Optical Goods & Eyewear 63666300

Related

R - Cumulative storage of looped cbind() results and possible lapply solution to double for-loop

I've found a workaround for a question I posted, based on @Ryan's recommendation, given by this code:
for (i in seq_along(url)){
webpage <- read_html(url[i]) #loop through URL list to access html data
fac_data <- html_nodes(webpage,'.tableunder') %>% html_text()
fac_data1 <- html_nodes(webpage,'.tableunder1') %>% html_text()
fac_data <- c(fac_data, fac_data1) #Store table data on each URL in a variable
x <- fac_data %>% matrix(ncol = length(headers[[i]]), byrow=TRUE) #make matrix to extract column data
for (j in seq_along(headers[[i]])){
y <- cbind(x[,j]) #extract column data and store in temporary variable
colnames(y) <- as.character(headers[[i]][j]) #add column name
print(cbind(y)) #loop through headers list to print column data in sequence. ** cbind(y) will be overwritten when I try to store the result on a list with 'z <- cbind(y)'.
}
}
I am now able to print out all values, complete with headers, for the data in question.
Some follow-up questions will be:
How do I save the output of cbind(y) cumulatively in a data.frame or a list? Looping through cbind(y) will overwrite values, which leaves me with only the last column from the last table. Like this:
退休年月
[1,] "82年8月"
Nor do these variations work:
z[[x]][j] <- cbind(y)
> source('~/Google 云端硬盘/R/scrapeFaculty.R')
Error in `*tmp*`[[x]] : attempt to select more than one element
z[j] <- cbind(y)
> source('~/Google 云端硬盘/R/scrapeFaculty.R')
There were 13 warnings (use warnings() to see them)
z[[j]] <- cbind(y)
> source('~/Google 云端硬盘/R/scrapeFaculty.R')
Error in z[[j]] <- cbind(y) : more elements supplied than there are to replace
Can the double for-loop be replaced by a simple lapply() function to resolve the above issue?
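As for the lapply() question: here is a sketch of how the double loop could collapse into a single lapply(), assuming the url and headers objects from the code above. Each iteration returns its finished matrix, so the results accumulate in a list and nothing is overwritten:
library(rvest)
## A sketch, not the poster's final code: build one named matrix per URL
## and let lapply() collect the results in a list.
ntu.hist <- lapply(seq_along(url), function(i) {
  webpage <- read_html(url[i])
  fac_data <- c(html_nodes(webpage, '.tableunder') %>% html_text(),
                html_nodes(webpage, '.tableunder1') %>% html_text())
  y <- matrix(fac_data, ncol = length(headers[[i]]), byrow = TRUE)
  colnames(y) <- as.character(headers[[i]])
  y
})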
EDIT:
Here's the final code I used to solve this:
ntu.hist <- vector("list", length(url)) # initialise the results list before the loop
for (i in seq_along(url)){
  webpage <- read_html(url[i])
  fac_data <- html_nodes(webpage,'.tableunder') %>% html_text()
  fac_data1 <- html_nodes(webpage,'.tableunder1') %>% html_text()
  fac_data <- c(fac_data, fac_data1)
  x <- fac_data %>% matrix(ncol = length(headers[[i]]), byrow=TRUE) #make matrix to extract column data
  y <- cbind(x[,1:length(headers[[i]])]) #extract column data
  colnames(y) <- as.character(headers[[i]]) #add column name
  ntu.hist[[i]] <- y #accumulate results in a list
}
I was wondering if it would be an option to cbind multiple columns at one time instead of looping. Would any of these syntax options help?
y <- data.frame(col1 = c(1:3), col2 = c(4:6), col3 = c(7:9))
cbind(y[,c(1:3)])
col1 col2 col3
1 1 4 7
2 2 5 8
3 3 6 9
#In R, you can use ":" to specify a range. So 1,2,3,4 is equal to 1:4.
#If you don't want number 3 in that range, you can use c(1,2,4).
#For example:
cbind(y[,c(1,3)])
col1 col3
1 1 7
2 2 8
3 3 9

R extract data from pdf

Here is the data
http://drdpat.bih.nic.in/Downloads/Rice-Varieties-1996-2012.pdf
It is a PDF. If you open it, you will see on page 2 a table that I need to extract and store in a dataframe. I followed this link to do this:
https://ropensci.org/blog/2016/03/01/pdftools-and-jeroen
library(pdftools)
text <- pdf_text("data.pdf")
dat<-text[2] # this reads the second page
After this, no matter what I try, it does not convert into a tabular format. I tried this:
dat1 <- matrix(dat, byrow = TRUE,nrow = 12, ncol = 8) # it didn't work
I also tried the scan function:
dat.s <- scan(dat, what = "character", sep = " ", skip = 2) # no use
Can anyone help me with this? Also, I am only looking to achieve this in R.
Thanks
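Note that pdf_text() returns each page as a single long character string, which is why matrix() and scan() get nowhere on their own; splitting the page into lines is the usual first step. A minimal sketch:
library(pdftools)
text <- pdf_text("Rice-Varieties-1996-2012.pdf")
## Each element of `text` is a whole page in one string;
## split page 2 into individual lines before any column parsing.
lines <- strsplit(text[2], "\n")[[1]]
head(lines)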
The structure of the tables in the PDF is a bit messed up: some columns overlap with each other and the tabulizer algorithm does not extract them correctly.
I was only able to extract the first 6 columns from page 2; the last 2 columns (Salient Features, "Recommended for cultivation") remain problematic...
library(tabulizer)
library(dplyr)
out1 <- extract_tables("Rice-Varieties-1996-2012.pdf", pages=2)[[1]]
## With a moderate amount of hacking,
## the following columns are correctly extracted:
## 1. Sl. No.
## 4. Year of Notification
## 5. Duration (in days)
## 6. Eco-System
sel <- gsub(" ","",out1[ ,c(1,4,5,6)])
## To extract Parentage column, you can use the `area` parameter:
## I figured out the values by trial and error
out2 <- extract_tables("Rice-Varieties-1996-2012.pdf", guess=FALSE,
pages=2,
area=list(c(80,120,2000,420) ) )[[1]]
sel <- cbind(sel,out2[1:nrow(sel),1])
## The header is contained in the first 3 rows of `sel`
## which can be aggregated by `paste0`
print(sel)
head <- aggregate(sel[1:3, ], by=list(rep(1,3)), paste0, collapse="") %>%
select(-Group.1)
## The body is a bit harder, because each record might be split across
## a variable number of rows, depending on the entries.
## I have used non-empty records for column 1 (Sl.No.)
## to identify the breakpoints where to split sel into row blocks
## pertaining to the same record.
body <- sel[-(1:3), ]
brks <- body[ ,1]!=""
ibrk <- c((1:nrow(body))[brks], nrow(body)+1)
ll <- unlist(sapply(1:(length(ibrk)-1), function(k) rep(ibrk[k],ibrk[k+1]-ibrk[k])))
stopifnot(length(ll)==nrow(body))
body <- data.frame(body, stringsAsFactors=FALSE)
colnames(body) <- head
tab <- aggregate(body, by=list(ll), paste0, collapse="") %>%
select(-Group.1)
print(tab)
## Using the same trick as above with brks and ibrk,
## one is able to extract column "Name of variety"
## (again, I found the values of area by trial and error).
out3 <- extract_tables("Rice-Varieties-1996-2012.pdf", guess=FALSE,
pages=2,
area=list(c(80,20,2000,130) ) )[[1]]
sel3 <- gsub(" ","",out3)
head3 <- aggregate(sel3[1:2, ], by=list(rep(1,2)), paste0, collapse="") %>%
select(-Group.1)
body3 <- sel3[-(1:2), ]
brks3 <- body3[ ,1]!=""
ibrk3 <- c((1:nrow(body3))[brks3], nrow(body3)+1)
ll3 <- unlist(sapply(1:(length(ibrk3)-1), function(k) rep(ibrk3[k],ibrk3[k+1]-ibrk3[k])))
stopifnot(length(ll3)==nrow(body3))
body3 <- data.frame(body3, stringsAsFactors=FALSE)
colnames(body3) <- head3
tab3 <- aggregate(body3, by=list(ll3), paste0, collapse="") %>%
select(-Group.1)
print(tab3)
## I have not managed to find a value of `area` which correctly splits
## the last two columns *and* allows to identify the rows in each record...
tab <- tab %>% left_join(tab3)

Loop over a subset, source a file and save results in a dataframe

Similar questions have been asked already, but none solved my specific problem. I have a .R file ("Mycalculus.R") containing many basic calculations that I need to apply to subsets of a dataframe: one subset for each year, where the modalities of "year" are factors (yearA, yearB, yearC), not numeric values. The file generates a new dataframe that I need to save in an .Rda file. Here is what I expect the code to look like with a for loop (this one obviously does not work):
id <- unique(df$year)
for (i in 1:length(id)){
data <- subset(df, year == id[i])
source ("Mycalculus.R", echo=TRUE)
save(content_df1,file="myresults.Rda")
}
Here is an extract of the main data.frame df:
obs year income gender ageclass weight
1 yearA 1000 F 1 10
2 yearA 1200 M 2 25
3 yearB 1400 M 2 5
4 yearB 1350 M 1 11
Here is what the sourced file "Mycalculus.R" does: it applies numerous basic calculations to columns of the dataframe called "data" and creates two new dataframes, df1 and then df2 based on df1. Here is an extract:
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
library(GiniWegNeg)
gini <- c(Gini_RSV(data$Income_gender, data$weight), Gini_RSV(data$Income_ageclass, data$weight))
df1 <- data.frame(t(gini))
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c("content_df1")
df2 <- data.frame((1/5)*df1$Income_gender + df1$Income_ageclass)
colnames(df2) <- c("myresult")
rownames(df2) <- c("content_df2")
So that in the end, I get two dataframes like this:
Income_Gender Income_Ageclass
content_df1 .... ....
And for df2:
myresult
content_df2 ....
But I need to save df1 and df2 in an .Rda file where the row names of content_df1 and content_df2 are given per subset, something like this:
Income_Gender Income_Ageclass
content_df1_yearA .... ....
content_df1_yearB .... ....
content_df1_yearC .... ....
and
myresult
content_df2_yearA ....
content_df2_yearB ....
content_df2_yearC ....
Currently, my program does not use any loop and does the job, but messily; the code is more than 2500 lines long (please don't throw tomatoes at me).
Anyone could help me with this specific request?
Thank you in advance.
Consider incorporating everything into one script with defined functions taking the needed arguments, called by lapply(). lapply() then returns a list of dataframes that you can row-bind into one final df.
library(dplyr)
library(GiniWegNeg)
runIncomeCalc <- function(data, y){
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_gender, data$weight), Gini_RSV(data$Income_ageclass, data$weight))
df1 <- data.frame(t(gini))
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c(paste0("content_df1_", y))
return(df1)
}
runResultsCalc <- function(df, y){
df2 <- data.frame((1/5) * df$Income_gender + df$Income_ageclass)
colnames(df2) <- c("myresult")
rownames(df2) <- c(paste0("content_df2_", y))
return(df2)
}
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
Now, if you need to source across scripts, create the same two functions, runIncomeCalc and runResultsCalc, in Mycalculus.R and then call each in the other script:
library(dplyr)
library(GiniWegNeg)
if(!exists("runIncomeCalc", mode="function")) source("Mycalculus.R")
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
If you functional-ize your steps you can create a workflow like the following:
calcFunc <- function(df) {
## Do something to the df, then return it
df
}
processFunc <- function(fname) {
## Read in your table
x <- read.table(fname)
## Do the calculation
x <- calcFunc(x)
## Make a new file name (remember to change the file extension)
new_fname <- sub("something", "else", fname)
## Write the .RData file
save(x, file = new_fname)
}
### Your workflow
## Generate a vector of files
my_files <- list.files()
## Do the work
res <- lapply(my_files, processFunc)
Alternatively, don't save the files: omit the save() call in processFunc and return a list of data.frame objects. Then use either data.table::rbindlist(res) or do.call(rbind, res) to make one large data.frame object.
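A minimal sketch of that variant, reusing calcFunc() from above (and assuming every file yields the same columns, so the results can be row-bound):
## Variant of processFunc that returns the data frame instead of saving it
processFunc2 <- function(fname) {
  x <- read.table(fname)
  calcFunc(x)
}
res <- lapply(my_files, processFunc2)
big_df <- do.call(rbind, res) # or data.table::rbindlist(res)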

Passing vector with multiple values into R function to generate data frame

I have a table, called table_wo_nas, with multiple columns, one of which is titled ID. For each value of ID there are many rows. I want to write a function that for input x will output a data frame containing the number of rows for each ID, with column headers ID and nobs respectively as below for x <- c(2,4,8).
## id nobs
## 1 2 1041
## 2 4 474
## 3 8 192
This is what I have. It works when x is a single value (e.g. 3), but not when it contains multiple values such as 1:10 or c(2,5,7). I receive the warning "In ID[counter] <- x : number of items to replace is not a multiple of replacement length". I've just started learning R and have been struggling with this for a week; I have searched manuals, this site, Google, everything. Can someone help, please?
counter <- 1
ID <- vector("numeric") ## contain x
nobs <- vector("numeric") ## contain nrow
for (i in x) {
r <- subset(table_wo_nas, ID %in% x) ## create subset for rows of ID=x
ID[counter] <- x ## add x to ID
nobs[counter] <- nrow(r) ## add nrow to nobs
counter <- counter + 1 } ## loop
result <- data.frame(ID, nobs) ## create data frame
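For reference, the warning comes from assigning the whole vector x to the single slot ID[counter]. Using the loop variable i both in the subset and in the assignment makes the original approach work; a corrected sketch:
counter <- 1
ID <- vector("numeric")
nobs <- vector("numeric")
for (i in x) {
  r <- subset(table_wo_nas, ID == i) # subset the rows for one ID at a time
  ID[counter] <- i                   # assign the scalar i, not the whole vector x
  nobs[counter] <- nrow(r)
  counter <- counter + 1
}
result <- data.frame(ID, nobs)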
In base R,
# To make a named vector, either:
tmp <- sapply(split(table_wo_nas, table_wo_nas$ID), nrow)
# OR just:
tmp <- table(table_wo_nas$ID)
# AND
# arrange into data.frame
nobs_df <- data.frame(ID = names(tmp), nobs = as.numeric(tmp))
Alternately, coerce the table into a data.frame directly, and rename:
nobs_df <- data.frame(table(table_wo_nas$ID))
names(nobs_df) <- c('ID', 'nobs')
If you only want certain IDs, subset on the ID column:
nobs_df[nobs_df$ID %in% c(2, 4, 8), ]
There are many, many more options; these are just a few.
With dplyr,
library(dplyr)
table_wo_nas %>% group_by(ID) %>% summarise(nobs = n())
If you only want certain IDs, add on a filter:
table_wo_nas %>% group_by(ID) %>% summarise(nobs = n()) %>% filter(ID %in% c(2, 4, 8))
Seems pretty straightforward if you just use table again:
tbl <- table( table_wo_nas[ , 'ID'] )
data.frame( IDs = names(tbl), nobs= tbl)
Could also get a quick answer although with different column names using:
as.data.frame(table( table_wo_nas[ , 'ID'] ))
Try this.
x <- c(2, 4, 8)
# df is your data frame table_wo_nas
count_of <- function(x) {
  count_of_id <- numeric(length(x))
  for (i in 1:length(x)) {
    count_of_id[i] <- length(which(df$ID == x[i])) # find the n of rows for each value of x
  }
  df_1 <- cbind(id = x, count_of_id)
  return(df_1)
}
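Called on the example vector, it returns a two-column matrix (counts taken from the question above):
count_of(c(2, 4, 8))
##      id count_of_id
## [1,]  2        1041
## [2,]  4         474
## [3,]  8         192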

Combine different tables in a list in R

Update: the code below seems to work.
I'm not entirely sure how to title this question, so I apologise if it is worded badly. I tried looking for "combine different elements of a list using apply", but that doesn't seem to find what I need.
Anyway, as the result of scraping a website, I have two vectors giving identifying information and a list that contains a number of different tables. A simplified version looks something like this:
respondents <- c("A", "B")
questions <- c("question1", "question2")
df1 <- data.frame(
option = c("yes", "no"),
percentage = c(70, 30), stringsAsFactors = FALSE)
df2 <- data.frame(
option= c("today", "yesterday"),
percentage =c(30, 70), stringsAsFactors = FALSE)
df3 <- data.frame(
option = c("yes", "no"),
percentage = c(60, 40), stringsAsFactors = FALSE)
df4 <- data.frame(
option= c("today", "yesterday"),
percentage =c(20, 80), stringsAsFactors = FALSE)
lst <- list(df1, df2, df3, df4)
The first two tables are questions and responses from the first participant, and the second two tables are the same questions answered by the second participant. What I would like to do is create two tables that contain the answers to each question for both participants. So I would like something that looks like this:
question1 <- data.frame(
option = c("yes", "no"),
A = c(70, 30),
B = c(60, 40), stringsAsFactors = FALSE)
question2 <- data.frame(
option = c("today", "yesterday"),
A = c(30, 70),
B = c(20, 80), stringsAsFactors = FALSE)
In my case, I have 122 responses from 51 participants, ordered so that tables 1-122 are from the first participant, the next 122 tables are from the second participant, and so on. Ultimately, then, I would like to have 122 tables (one table per question), with each table containing 51 columns that correspond to the participants. I am more or less at a loss as to how to do this, so I would appreciate any suggestions.
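With the simplified data above, one hedged sketch of the general pattern: tables for the same question recur every length(questions) positions, so they can be picked out with seq() and merged on option (assuming the respondent order is fixed across participants):
## Collect every 2nd table (question1 = tables 1 and 3, question2 = tables 2 and 4),
## merge on "option", and name the percentage columns after the respondents.
by_question <- lapply(seq_along(questions), function(i) {
  tabs <- lst[seq(i, length(lst), by = length(questions))]
  out <- Reduce(function(...) merge(..., by = "option", all = TRUE), tabs)
  names(out) <- c("option", respondents)
  out
})
names(by_question) <- questions
by_question$question1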
This should now work:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Extract the questions from the first column of the first table
Questions <- tb[[1]][,1]
## Remove empty strings
Questions <- Questions[Questions != '']
# Create objects to populate later
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Replace the pictures representing IPA symbols with their names (i.e., the file names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\\s+src=./images/" with _
tmp <- gsub("<img\\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\\(','',x[,2] )
x [,2 ] = gsub('%\\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
#get rid of extra tables
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to merge the tables for one question across states
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }
## Merge the tables belonging to each of the 122 questions
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("option", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[122]]
Thanks to akrun (see comments), I got this to work. The full code is here:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
url
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Extract the questions from the first column
Questions <- tb[[1]][,1]
##Remove empty strings
Questions <- Questions[Questions!= '']
# Create objects to populate later
survey <- vector(length(subURL), mode = "list")
i <- 1
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Replace the pictures representing IPA symbols with their names (i.e., the file names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\\s+src=./images/" with _
tmp <- gsub("<img\\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
#tb <- tb[-1]
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\\(','',x[,2] )
x [,2 ] = gsub('%\\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
## Remove unneeded dataframes in list
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to merge the tables for one question across states
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }
## Merge the tables belonging to each of the 122 questions
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("Options", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[1]]
