subscript out of bounds for list! R

I am learning web scraping and have been facing one hurdle after another. I want to create a data frame containing the first table on this page for all portfolio managers, for the month of August 2022.
So far, I have found a way to scrape a single table properly (I think! Please let me know if I can improve on this).
I haven't been able to bind all the tables into a data frame properly. I also wanted to find out whether there is a way to transform this form-type data into a proper data frame, with the first column of every table as the variable names and the second column as the row values (I know I can do the usual data wrangling, but I wanted to know if some function exists that transforms this form-type data into a data frame).
> library(tidyverse)
> library(rvest)
> library(httr)
> url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
> pm_id <- read_html(url) %>%
+ html_elements('select[name="pmrId"].f_control option') %>%
+ html_attr("value")
> pm_id <- pm_id[2:416]
> sebi_pm <- function(x) {
+ resp = POST(url,
+ body = list(
+ pmrId= x,
+ year="2022",
+ m .... [TRUNCATED]
> #s <- lapply(pm_id[i], sebi_pm)
> #v <- sebi_pm(pm_id[1])
> #v
> #do.call() lapply(pm_id[1:5], sebi_pm)
> ha <- do.call("rbind", lapply(pm_id, sebi_ .... [TRUNCATED]
#> Error in .[[1]] : subscript out of bounds

Normally I would be a stickler for a reproducible example, but I think I know what you're getting at here: for some manager/month combinations the page has no table, so taking the first element of an empty list throws the subscript out of bounds error. Try this...
# DEPENDENCIES -----------------------------------------------------------------
library(rvest)
library(httr)
library(stringr)
library(data.table)
# UTILITY FUNCTIONS ------------------------------------------------------------
get_pm_ids <- function() {
  url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
  # get list of portfolio manager ids
  pm_ids <- read_html(url) |>
    html_elements('select[name="pmrId"].f_control option') |>
    html_attr('value')
  pm_ids
}
get_monthly_report <- function(pmr_id, report_year, report_month) {
  msg <- sprintf('fetching report for portfolio manager: %s; year = %s; month = %s',
                 str_split(pmr_id, '##', simplify = TRUE)[ , 3] |> str_squish(),
                 report_year,
                 report_month)
  message(msg)
  url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
  params <- list(
    currdate = '',
    loginflag = 0,
    searchValue = '',
    pmrId = pmr_id,
    year = report_year,
    month = report_month,
    loginEmail = '',
    loginPassword = '',
    cap_login = '',
    moduleNo = -1,
    moduleId = '',
    link = '',
    yourName = '',
    friendName = '',
    friendEmail = '',
    mailmessage = '',
    cap_email = ''
  )
  resp <- POST(url, body = params)
  pg <- httr::content(resp)
  tbl <- html_nodes(pg, 'div.portlet:nth-child(3) > div:nth-child(1) > table:nth-child(1)')
  result_df <- data.frame()
  if (length(tbl) == 0) {
    # no records found
    result_df <- data.frame(id = pmr_id,
                            report_year = report_year,
                            report_month = report_month)
  } else {
    tr <- html_nodes(tbl, 'tr')
    cell_captions <- lapply(tr, html_children) |> lapply('[', 1) |> lapply(html_text) |> unlist()
    cell_contents <- lapply(tr, html_children) |> lapply('[', 2) |> lapply(html_text) |> unlist()
    result_df <- data.frame(t(cell_contents))
    colnames(result_df) <- cell_captions
    result_df$id <- pmr_id
    result_df$report_year <- report_year
    result_df$report_month <- report_month
  }
  return(result_df)
}
# MAIN -------------------------------------------------------------------------
## 1. fetch list of portfolio manager ids --------------------------------------
pm_ids <- get_pm_ids()
## 2. filter list of portfolio manager ids -------------------------------------
pm_ids <- pm_ids[ 2:416 ]
## 3. testing: fetch reports for a sample of managers in January 2022 ----------
set.seed(1234)
tmp <- sample(pm_ids, 5)
reports_list <- lapply(tmp, get_monthly_report, 2022, 1)
## 4. combine the results ------------------------------------------------------
reports_df <- rbindlist(reports_list, use.names = TRUE, fill = TRUE) |>
  as.data.frame()
## 5. inspect results ----------------------------------------------------------
View(reports_df, 'downloaded reports')
This code could be improved by adding input validation and more robust error handling. Hope this helps!
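For the error-handling part, one possible direction is to wrap each request in tryCatch() so a single failed POST does not abort the whole run. A minimal sketch (untested against the live site; get_monthly_report_safely is just an illustrative wrapper name):
get_monthly_report_safely <- function(pmr_id, report_year, report_month) {
  tryCatch(
    get_monthly_report(pmr_id, report_year, report_month),
    error = function(e) {
      message(sprintf('failed for %s: %s', pmr_id, conditionMessage(e)))
      # return a stub row so rbindlist(fill = TRUE) in step 4 still works
      data.frame(id = pmr_id, report_year = report_year, report_month = report_month)
    }
  )
}
reports_list <- lapply(tmp, get_monthly_report_safely, 2022, 1)
Returning a stub row on failure keeps the shape compatible with the rbindlist() call in step 4.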

Related

PowerBI R Script Runtime Error: Prefixing UQ() with the rlang namespace is deprecated as of rlang 0.3.0

Below is the reproducible code, which needs to be pasted into the PowerBI R script visualization.
I'm making some customizations to the default process_map object.
The visual works on my desktop but throws an error when published to PowerBI Web.
# The following code to create a dataframe and remove duplicated rows is always executed and acts as a preamble for your script:
# dataset <- data.frame(Column1)
# dataset <- unique(dataset)
# Paste or type your script code here:
library(bupaR)
library(DiagrammeR)
library(tidyverse)
library(lubridate)
# build the process map object (render = F so it can be customised below)
processMap <- patients %>%
  process_map(sec = performance(median, "hours"),
              type = frequency("relative_case"),
              type_edges = frequency("absolute_case"),
              rankdir = "LR",
              layout = layout_pm(edge_weight = TRUE),
              fixed_edge_width = F,
              render = F)
# customisation(label, color, font)
processMap$nodes_df$label <- stringr::str_replace_all(processMap$nodes_df$label, c('ARTIFICIAL_START' = 'Start', 'ARTIFICIAL_END' = 'End'))
processMap$nodes_df$color <- stringr::str_replace_all(processMap$nodes_df$color, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontcolor <- stringr::str_replace_all(processMap$nodes_df$fontcolor, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontname <- stringr::str_replace_all(processMap$nodes_df$fontname, c('Arial' = 'Calibri'))
processMap$edges_df$fontname <- stringr::str_replace_all(processMap$edges_df$fontname, c('Arial' = 'Calibri'))
# change edge width
processMap$edges_df$penwidth <- scales::rescale(processMap$edges_df$penwidth, to = c(0.5, 4))
# custom duration edges
edges_label <- processMap$edges_df$label
tmp <- regmatches(edges_label, gregexpr("\\(.*?\\)", edges_label))
tmp <- gsub("[\\(\\)]", "", tmp)
tmp <- stringr::str_replace_all(tmp, c('character0' = '0', ' hours'=''))
tmp <- as.numeric(tmp)*60*60
tmp <- lubridate::as.duration(tmp)
tmp <- regmatches(tmp, gregexpr("\\(.*?\\)", tmp))
tmp <- gsub("[\\(\\)]", "", tmp)
edges_label_duration <- stringr::str_replace_all(tmp, c('character0' = '<1 hour'))
edges_label_count_absolute_case <- gsub("\n.*","",edges_label)
edges_label_clean <- paste0(edges_label_count_absolute_case, '\n',edges_label_duration)
# remove those which should have no hour()
correct_label <- gsub("\n.*","",edges_label_clean[!grepl("hour", edges_label)])
edges_label_clean[!grepl("hour", edges_label)] <- correct_label
# assign to process object
processMap$edges_df$label <- edges_label_clean
DiagrammeR::export_graph(graph = processMap, file_type = 'png', file_name = 'processMap.png')
The visual renders as expected in PowerBI Desktop. When published to the PowerBI portal, it gives the following error:
Error in if (any(ind)) Encoding(x[ind]) <- "bytes" :
missing value where TRUE/FALSE needed
In addition: Warning message:
Prefixing `UQ()` with the rlang namespace is deprecated as of rlang 0.3.0.
Please use the non-prefixed form or `!!` instead.
# Bad:
rlang::expr(mean(rlang::UQ(var) * 100))
# Ok:
rlang::expr(mean(UQ(var) * 100))
# Good:
rlang::expr(mean(!!var * 100))
rlang on the desktop is rlang_0.4.10, while PowerBI Web has rlang_0.3.0.
I ended up rewriting the customisation code so that there is no function within a function of the kind referenced by the warning:
# Bad:
rlang::expr(mean(rlang::UQ(var) * 100))
I suspect the related call is this one:
regmatches(edges_label, gregexpr("\\(.*?\\)", edges_label))
I can't say definitively that this is the cause, but in any case the updated code below no longer generates the error when published to the PowerBI service/web.
library(bupaR)
library(DiagrammeR)
library(tidyverse)
library(lubridate)
# build the process map object (render = F so it can be customised below)
processMap <- patients %>%
  process_map(sec = performance(median, "hours"),
              type = frequency("relative_case"),
              type_edges = frequency("absolute_case"),
              rankdir = "LR",
              layout = layout_pm(edge_weight = TRUE),
              fixed_edge_width = F,
              render = F)
# customisation(label, color, font)
processMap$nodes_df$label <- stringr::str_replace_all(processMap$nodes_df$label, c('ARTIFICIAL_START' = 'Start', 'ARTIFICIAL_END' = 'End'))
processMap$nodes_df$color <- stringr::str_replace_all(processMap$nodes_df$color, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontcolor <- stringr::str_replace_all(processMap$nodes_df$fontcolor, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontname <- stringr::str_replace_all(processMap$nodes_df$fontname, c('Arial' = 'Calibri'))
processMap$edges_df$fontname <- stringr::str_replace_all(processMap$edges_df$fontname, c('Arial' = 'Calibri'))
# change edge width
processMap$edges_df$penwidth <- scales::rescale(processMap$edges_df$penwidth, to = c(0.5, 4))
# custom duration edges
edges_label <- processMap$edges_df$label
# extract the text within brackets
tmp <- stringr::str_extract(string = edges_label,
                            pattern = "(?<=\\().*(?=\\))")
tmp <- stringr::str_replace_all(tmp, c(' hours'=''))
tmp[is.na(tmp)] <- '0'
tmp <- as.numeric(tmp)*60*60
tmp <- lubridate::as.duration(tmp)
tmp <- stringr::str_extract(string = tmp,
                            pattern = "\\(.*?\\)")
tmp[is.na(tmp)] <- '(<1 hour)'
edges_label_duration <- tmp
edges_label_count_absolute_case <- gsub("\n.*","",edges_label)
edges_label_clean <- paste0(edges_label_count_absolute_case, '\n',edges_label_duration)
# remove those which should have no hour()
correct_label <- gsub("\n.*","",edges_label_clean[!grepl("hour", edges_label)])
edges_label_clean[!grepl("hour", edges_label)] <- correct_label
# assign to process object
processMap$edges_df$label <- edges_label_clean
DiagrammeR::export_graph(graph = processMap, file_type = 'png', file_name = 'processMap.png')
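For what it's worth, the behavioural difference between the two extraction approaches is easy to see on a toy label: regmatches() with gregexpr() returns a list whose unmatched elements are character(0), and those deparse to the literal string "character(0)" on coercion, whereas str_extract() returns a character vector with NA for non-matches, which can be handled explicitly. A minimal sketch (the labels are made up):
library(stringr)
labels <- c("12\n(3.5 hours)", "7")  # second label has no duration part
# old approach: list output; the unmatched element deparses to "character(0)"
old <- regmatches(labels, gregexpr("\\(.*?\\)", labels))
as.character(old)  # "(3.5 hours)" "character(0)"
# new approach: plain character vector with NA for the non-match
str_extract(labels, "(?<=\\().*(?=\\))")  # "3.5 hours" NA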

R function is looping over the same data in web scraper

This is the program I've written:
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#Getting the number of Page
getPageNumber <- function(URL){
  parsedDocument = read_html(URL)
  Sort1 <- html_nodes(parsedDocument, 'div')
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pageNumbers al-pageNumbers")]
  P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
  return(ifelse(length(P) == 0, 0, max(P)))
}
#Getting all articles based off of their DOI
getAllArticles <- function(URL){
  parsedDocument = read_html(URL)
  Sort1 <- html_nodes(parsedDocument, 'div')
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
  ArticleDOInumber = trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
  URL3 <- "https://doi.org/10.1093/dnares/"
  URL4 <- paste(URL3, ArticleDOInumber, sep = "")
  return(URL4)
}
Title <- function(parsedDocument){
  Sort1 <- html_nodes(parsedDocument, 'h1')
  Title <- gsub("<h1>\\n|\\n</h1>", "", Sort1)
  return(Title)
}
#main function with input as parameter year
findURL <- function(year_chosen){
  if(year_chosen >= 1994){
    noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl = "&fl_SiteID=5275&startpage="
    URL = paste(noYearURL, pagesURl, sep = "")
    #URL is working with parameter year_chosen
    Page <- getPageNumber(URL)
    Page2 <- 0
    while(Page < Page2 | Page != Page2){
      Page <- Page2
      URL3 <- paste(URL, Page-1, sep = "")
      Page2 <- getPageNumber(URL3)
    }
    R_Data <- data.frame()
    for(i in 1:Page){ #0:Page-1
      URL2 <- getAllArticles(paste(URL, i, sep = ""))
      for(j in 1:(length(URL2))){
        parsedDocument <- read_html(URL2[j])
        print(URL2[j])
        R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        R_Data <- rbind(R_Data, R)
      }
    }
    paste(URL2)
    suppressWarnings(write.csv(R_Data, "DNAresearch.csv", row.names = FALSE, sep = "\t"))
    #return(R_Data)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 2005 to present")
  }
}
findURL(2003)
findURL(2003)
The output for my code goes as follows:
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
[1] "https://doi.org/10.1093/dnares/10.6.277"
[1] "https://doi.org/10.1093/dnares/10.6.229"
[1] "https://doi.org/10.1093/dnares/10.6.239"
[1] "https://doi.org/10.1093/dnares/10.6.287"
[1] "https://doi.org/10.1093/dnares/10.5.221"
[1] "https://doi.org/10.1093/dnares/10.5.203"
[1] "https://doi.org/10.1093/dnares/10.5.213"
[1] "https://doi.org/10.1093/dnares/10.4.137"
[1] "https://doi.org/10.1093/dnares/10.4.147"
[1] "https://doi.org/10.1093/dnares/10.4.167"
[1] "https://doi.org/10.1093/dnares/10.4.181"
[1] "https://doi.org/10.1093/dnares/10.4.155"
[1] "https://doi.org/10.1093/dnares/10.3.115"
[1] "https://doi.org/10.1093/dnares/10.3.85"
[1] "https://doi.org/10.1093/dnares/10.3.123"
[1] "https://doi.org/10.1093/dnares/10.3.129"
[1] "https://doi.org/10.1093/dnares/10.3.97"
[1] "https://doi.org/10.1093/dnares/10.2.59"
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
I'm trying to scrape a journal with the year as a parameter. I've scraped one page, but when the loop is supposed to change pages it just goes back to the top of the page and loops over the same data. My code looks right to me, and I don't understand why this is happening. Thank you in advance.
It is not that it is reading the same URL; it is that you are selecting the wrong node, which happens to yield repeating info. As I mentioned in your last question, you need to rework your Title function. The rewrite below extracts the actual article title based on a class name and a single node match.
Please note the removal of your sep arg (write.csv ignores sep and warns about it, which is why the suppressWarnings was there). There are also some other areas of the code that look like they could be simplified in terms of logic.
Title function:
Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\\r\\n\\s+", "", .) %>%
    trimws(.)
  return(Title)
}
R:
library(rvest)
library(XML)
library(stringr)
# Getting the number of Page
getPageNumber <- function(URL) {
  # print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, "div")
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")]
  P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
  return(ifelse(length(P) == 0, 0, max(P)))
}
# Getting all articles based off of their DOI
getAllArticles <- function(URL) {
  print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, "div")
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
  URL3 <- "https://doi.org/10.1093/dnares/"
  URL4 <- paste(URL3, ArticleDOInumber, sep = "")
  return(URL4)
}
Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\\r\\n\\s+", "", .) %>%
    trimws(.)
  return(Title)
}
# main function with input as parameter year
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    Page <- getPageNumber(URL)
    if (Page == 5) {
      Page2 <- 0
      while (Page < Page2 | Page != Page2) {
        Page <- Page2
        URL3 <- paste(URL, Page - 1, sep = "")
        Page2 <- getPageNumber(URL3)
      }
    }
    R_Data <- data.frame()
    for (i in 1:Page) {
      URL2 <- getAllArticles(paste(URL, i, sep = ""))
      for (j in 1:(length(URL2))) {
        parsedDocument <- read_html(URL2[j])
        #print(URL2[j])
        #print(Title(parsedDocument))
        R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        #print(R)
        R_Data <- rbind(R_Data, R)
      }
    }
    write.csv(R_Data, "Group4.csv", row.names = FALSE)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 2005 to present")
  }
}
findURL(2003)
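As a quick spot-check of the rewritten Title function on a single article page, using one of the DOI URLs from the output above (requires a live connection):
doc <- read_html("https://doi.org/10.1093/dnares/10.6.249")
Title(doc)
# should return the article title as a single clean string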

pulling multiple entries in XML with different data using R

I have a set of XML files that I am reading in, and wanted to know the best way to deal with the following:
<MyDecision>
  <Decision>
    <DecisionID>X1234</DecisionID>
    <DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
      <a:Reason>
        <a:Description>DOBMismatch</a:Description>
      </a:Reason>
      <a:Reason>
        <a:Description>PrimaryChecksFail</a:Description>
      </a:Reason>
      <a:Reason>
        <a:Description>IncomeReferral</a:Description>
      </a:Reason>
    </DecisionReasons>
  </Decision>
</MyDecision>
At the moment, I am running some R code but get the response:
Error: Duplicate identifiers for rows (2, 3, 4)
The intended output is a dataframe that looks something like:
fieldname |contents
MyDecision_Decision_DecisionID |X1234
MyDecision_Decision_DecisionReasons_Reason_Description_DOBMismatch |DOBMismatch
MyDecision_Decision_DecisionReasons_Reason_Description_PrimaryChecksFail |PrimaryChecksFail
MyDecision_Decision_DecisionReasons_Reason_Description_IncomeReferral |IncomeReferral
My current code is as below:
library(profvis)
library(XML)
library(xml2)
library(plyr)
library(tidyverse)
library(reshape2)
library(foreign)
library(rio)
setwd('c:/temp/xml/t')
df <- data.frame()
transposed.df1 <- data.frame()
allxmldata <- data.frame()
inputfiles <- as.character('test.xml')
findchildren <- function(nodes, df) {
  numchild <- sapply(nodes, function(x){length(xml_children(x))})
  xmlvalue <- xml_text(nodes[numchild==0])
  xmlname <- xml_name(nodes[numchild==0])
  xmlpath <- sapply(nodes[numchild==0], function(x) {gsub(', ', '_', toString(rev(xml_name(xml_parents(x)))))})
  if (isTRUE(xmlpath == 'MyDecision_Decision_DecisionReasons_Reason')) {
    fieldname <- paste(xmlpath, xmlname, xmlvalue, sep = '_')
  } else {
    fieldname <- paste(xmlpath, xmlname, sep = '_')
  }
  contents <- sapply(xmlvalue, function(f){is.na(f) <- which(f == ''); f})
  dftemp <- data.frame(fieldname, contents)
  df <- rbind(df, dftemp)
  print(dim(df))
  if (sum(numchild) > 0){
    findchildren(xml_children(nodes[numchild>0]), df)
  } else {
    return(df)
  }
}
for (x in inputfiles) {
  df1 <- findchildren(xml_children(read_xml(x)), df)
  xml.df1 <- data.frame(spread(df1, key = fieldname, value = contents), fix.empty.names = TRUE)
  allxmldata <- rbind.fill(allxmldata, xml.df1)
}
I hope that there is someone that can point out what I have done wrong...
I came up with the following solution that works for your example dataset. (The Duplicate identifiers error, by the way, comes from spread(): the three Description rows share the same fieldname key, so they cannot be spread into columns.) Hopefully, this approach can also be used for your larger dataset.
# we need these two packages
library(xml2)
library(tidyverse)
# read in the xml-file
xml <- read_xml(
  '<MyDecision>
     <Decision>
       <DecisionID>X1234</DecisionID>
       <DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
         <a:Reason>
           <a:Description>DOBMismatch</a:Description>
         </a:Reason>
         <a:Reason>
           <a:Description>PrimaryChecksFail</a:Description>
         </a:Reason>
         <a:Reason>
           <a:Description>IncomeReferral</a:Description>
         </a:Reason>
       </DecisionReasons>
     </Decision>
   </MyDecision>'
)
xml %>%
  # first find all nodes that don't contain any child nodes
  xml_find_all(".//node()[not(node())]") %>%
  # find the parent of each node
  map(xml_parent) %>%
  # extract the name and text of each parent element
  map(~list(name = xml_name(.x), text = xml_text(.x))) %>%
  # bind rows
  bind_rows()
This produces the following output:
# A tibble: 4 x 2
name text
<chr> <chr>
1 DecisionID X1234
2 Description DOBMismatch
3 Description PrimaryChecksFail
4 Description IncomeReferral
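If you also want the fully qualified fieldname from your intended output, one possible extension of the same idea uses xml_path() on each parent. This is a sketch: stripping positional indices such as [2] is my assumption about how the duplicate Reason nodes should be collapsed, and it does not append the value suffix your example shows for the Description rows:
xml %>%
  xml_find_all(".//node()[not(node())]") %>%
  map_df(~ tibble(
    # xml_path() returns e.g. "/MyDecision/Decision/DecisionReasons/a:Reason[2]/a:Description"
    fieldname = xml_path(xml_parent(.x)) %>%
      str_remove_all("\\[\\d+\\]") %>%   # drop positional indices such as [2]
      str_replace_all("/", "_") %>%
      str_remove("^_"),
    contents = xml_text(xml_parent(.x))
  ))
The stringr and purrr helpers used here are attached by library(tidyverse) above.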

How to pass multiple values in a rvest submission form

This is a follow-up to a prior thread. The code works great for a single value, but when I try to pass more than one value I get an error based on the length of the function:
Error in vapply(elements, encode, character(1)) :
values must be length 1,
but FUN(X[1]) result is length 3
Here is a sample of the code. In most instances I have been able just to name an object and scrape that way.
library(httr)
library(rvest)
library(dplyr)
b <- c('48127','48180','49504')
POST(
  url = "http://www.nearestoutlet.com/cgi-bin/smi/findsmi.pl",
  body = list(zipcode = b),
  encode = "form"
) -> res
I was wondering if a loop to insert the values into the form would be the right way to go? However, my loop-writing skills are still in development and I am unsure where to place it; in addition, when I call the loop it doesn't print line by line, it just returns null results.
# d isn't listed in the above code as it returns null
d <- for(i in 1:3){ nrow(b) }
Here is an approach to send multiple POST requests:
library(httr)
library(rvest)
b <- c('48127','48180','49504')
For each element in b, perform a function that will send the appropriate POST request:
res <- lapply(b, function(x){
  res <- POST(
    url = "http://www.nearestoutlet.com/cgi-bin/smi/findsmi.pl",
    body = list(zipcode = x),
    encode = "form"
  )
  res <- read_html(content(res, as = "raw"))
})
Now for each element of the list res you should do the parsing steps explained by hrbrmstr: How can I Scrape a CGI-Bin with rvest and R?
library(tidyverse)
I will use hrbrmstr's code since he is king and it is already clear to you. The only thing we are doing here is applying it to each element of the res list.
res_list <- lapply(res, function(x){
  rows <- html_nodes(x, "table[width='300'] > tr > td")
  ret <- data_frame(
    record = !is.na(html_attr(rows, "bgcolor")),
    text = html_text(rows, trim = TRUE)
  ) %>%
    mutate(record = cumsum(record)) %>%
    filter(text != "") %>%
    group_by(record) %>%
    summarise(x = paste0(text, collapse = "|")) %>%
    separate(x, c("store", "address1", "city_state_zip", "phone_and_or_distance"), sep = "\\|", extra = "merge")
  return(ret)
})
or using map from purrr
res %>%
  map(function(x){
    rows <- html_nodes(x, "table[width='300'] > tr > td")
    data_frame(
      record = !is.na(html_attr(rows, "bgcolor")),
      text = html_text(rows, trim = TRUE)
    ) %>%
      mutate(record = cumsum(record)) %>%
      filter(text != "") %>%
      group_by(record) %>%
      summarise(x = paste0(text, collapse = "|")) %>%
      separate(x, c("store", "address1", "city_state_zip", "phone_and_or_distance"),
               sep = "\\|", extra = "merge") -> ret
    return(ret)
  })
If you would like this in a data frame:
res_df <- data.frame(do.call(rbind, res_list), # rbinds list elements
                     b = rep(b, times = sapply(res_list, nrow))) # labels each row with the zipcode it came from (nrow, not length: length() would count columns)
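Alternatively, since each element of res_list is a data frame, dplyr's bind_rows() can carry the zipcode along via list names (a small variation on the same idea):
names(res_list) <- b
res_df <- dplyr::bind_rows(res_list, .id = "zipcode")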
You can put the values inside the POST as below:
b<-c('48127','48180','49504')
for(i in 1:length(b)) {
  POST(
    url = "http://www.nearestoutlet.com/cgi-bin/smi/findsmi.pl",
    body = list(zipcode = b[i]),
    encode = "form"
  ) -> res
  # YOUR CODE HERE (for getting the content of the page etc.)
}
But since res will be different for every zipcode value, you need to put the rest of your code inside the area I commented; otherwise you get only the last value.
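For example, one way to keep every response rather than only the last one (a sketch; the read_html() line stands in for whatever per-page parsing you do) is to collect them in a pre-allocated list:
res_list <- vector("list", length(b))
for (i in seq_along(b)) {
  resp <- POST(
    url = "http://www.nearestoutlet.com/cgi-bin/smi/findsmi.pl",
    body = list(zipcode = b[i]),
    encode = "form"
  )
  # parse each response while it is still in scope
  res_list[[i]] <- read_html(content(resp, as = "raw"))
}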

R: Trouble in appending rows in a dataframe from webscraping in r [duplicate]

This question already has answers here:
Error in if/while (condition) {: missing Value where TRUE/FALSE needed
(4 answers)
Closed 5 years ago.
I have a dataframe with 7 rows and 1 column, which contains links to a website. I'm trying to extract data from those links and store it in a data frame, but I am not able to append the rows. I'm also checking that, if a link has no records (which I check through an HTML attribute of that link), the link is skipped and we proceed to the next one. I'm also trying to fetch data for multiple pages of each link.
Here is reproducible data:
text1="http://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom="
text3="&proptype="
text4="Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment"
text5="&cityName=Thane&BudgetMin="
text6="&BudgetMax="
bhk=c("1","2","3","4","5",">5")
budg_min=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
budg_max=c("5-Lacs","10-Lacs","20-Lacs","30-Lacs","40-Lacs","50-Lacs","60-Lacs","70-Lacs","80-Lacs","90-Lacs","1-Crores","1.2-Crores","1.4-Crores","1.6-Crores","1.8-Crores","2-Crores","2.3-Crores","2.6-Crores","3-Crores","3.5-Crores","4-Crores","4.5-Crores","5-Crores","10-Crores","20-Crores")
eg <- expand.grid(bhk = bhk, budg_min = budg_min, budg_max = budg_max)
eg <- eg[as.integer(eg$budg_min) <= as.integer(eg$budg_max),]
uuu <- sprintf("%s%s%s%s%s%s%s%s", text1,eg[,1],text3,text4,text5,eg[,2],text6,eg[,3])
uuu_df1 = data.frame(x = uuu[1:7]) # uuu is a vector, so index it without a comma
dput(uuu_df1)
I have 3 solutions for this but none seems to work.
Solution #1
urlList <- llply(uuu_df1[,1], function(url){
  this_pg <- read_html(url)
  results_count <- this_pg %>%
    xml_find_first(".//span[@id='resultCount']") %>%
    xml_text() %>%
    as.integer()
  if(results_count > 0){
    cards <- this_pg %>%
      xml_find_all('//div[@class="SRCard"]')
    df <- ldply(cards, .fun = function(x){
      y <- data.frame(wine = x %>% xml_find_first('.//span[@class="agentNameh"]') %>% xml_text(),
                      excerpt = x %>% xml_find_first('.//div[@class="postedOn"]') %>% xml_text(),
                      locality = x %>% xml_find_first('.//span[@class="localityFirst"]') %>% xml_text(),
                      society = x %>% xml_find_first('.//div[@class="labValu"]') %>% xml_text() %>% gsub('\\n', '', .))
      return(y)
    })
  } else {
    df <- NULL
  }
  return(df)
}, .progress = 'text')
names(urlList) <- uuu_df1[,1]
a <- bind_rows(urlList)
The above code gives me this error:
Error in if (results_count > 0) { : missing value where TRUE/FALSE needed
Solution #2
urlList <- lapply(uuu_df1[,1], function(url){
  UrlPage <- html(as.character(url))
  ImgNode <- UrlPage %>% html_node("div.noResultHead")
  u <- paste("No", word(string = as(ImgNode, "character"), start = 4, end = 5), sep = " ")
  cat(".")
  pg <- read_html(url)
  if(u != "No Results Found!") {
    df <- data.frame(wine = html_text(html_nodes(pg, ".agentNameh")),
                     excerpt = html_text(html_nodes(pg, ".postedOn")),
                     locality = html_text(html_nodes(pg, ".localityFirst")),
                     society = html_text(html_nodes(pg, '.labValu .stop-propagation:nth-child(1)')),
                     stringsAsFactors = FALSE)
  } else {
    # ASSIGN EMPTY DATAFRAME (FOR CONSISTENT STRUCTURE)
    df <- data.frame(wine = character(), excerpt = character(), locality = character(), society = character())
  }
  # RETURN NAMED LIST
  return(list(UrlPage = UrlPage, ImgNode = ImgNode, u = u, df = df))
})
# ROW BIND ONLY DATAFRAME ELEMENT FROM LIST
wines <- map_df(urlList, function(u) u$df)
The above code gives an empty dataframe.
Solution #3
uuu_df1 = data.frame(x = uuu_df[1:7,])
wines = data.frame()
url_test = c()
UrlPage_test = c()
u = c()
ImgNode = c()
pg = c()
for(i in 1:dim(uuu_df1)[1]) {
  url_test[i] = as.character(uuu_df1[i,])
  UrlPage_test[i] <- html(url_test[i])
  ImgNode[i] <- UrlPage_test[i] %>% html_node("div.noResultHead")
  u[i] = ImgNode[i]
  u[i] = as(u[i], "character")
  u[i] = paste("No", word(string = u, start = 4, end = 5), sep = " ")
  if(u[i] == "No Results Found!") next
  {
    map_df(1:5, function(i) { # here 1:5 is the number of webpages of a website
      # simple but effective progress indicator
      cat(".")
      pg[i] <- read_html(sprintf(url_test[i], i))
      data.frame(wine = html_text(html_nodes(pg[i], ".agentNameh")),
                 excerpt = html_text(html_nodes(pg[i], ".postedOn")),
                 locality = html_text(html_nodes(pg[i], ".localityFirst")),
                 society = html_text(html_nodes(pg[i], '.labValu .stop-propagation:nth-child(1)')),
                 stringsAsFactors = FALSE)
    }) -> wines
  }
}
The above code also gives an error:
Error in UseMethod("xml_find_first") :
no applicable method for 'xml_find_first' applied to an object of class "list"
In addition: Warning messages:
1: 'html' is deprecated.
Use 'read_html' instead.
See help("Deprecated")
2: In UrlPage_test[i] <- html(url_test[i]) :
number of items to replace is not a multiple of replacement length
Any suggestions on how the code can be corrected so that my requirement is met? Thanks in advance.
Solution #1
That missing value where TRUE/FALSE needed error is raised when you do something like this:
if (NA > 0) {
  # do something
}
So replace your if condition
if(results_count > 0)
with
(!is.na(results_count) & (results_count > 0))
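In context, the fixed check looks like this (a minimal sketch of the relevant branch inside the llply callback; && also works here and short-circuits, so the comparison is never evaluated when results_count is NA):
if (!is.na(results_count) && results_count > 0) {
  # parse the result cards as before
} else {
  df <- NULL  # treat pages with a missing or unreadable result count as empty
}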
