(...) is not a length 1 atomic vector error - r

I am trying to scrape IMDB data, and for one variable I keep getting an error.
Error: Result 28 is not a length 1 atomic vector
library(rvest)
library(purrr)
library(tidyr)
topmovies <- read_html("http://www.imdb.com/chart/top")
links <- topmovies %>%
html_nodes(".titleColumn") %>%
html_nodes("a") %>%
html_attr("href") %>%
xml2::url_absolute("http://imdb.com")
pages <- links %>% map(read_html)
budget <- try(pages %>%
map_chr(. %>%
html_nodes("#titleDetails .txt-block:nth-child(11)") %>%
html_text() %>%
#gsub("\\D", "", .) %>%
extract_numeric()),silent=TRUE)
When I do it with try (as in the code), I get
budget
[1] "Error : cannot convert object to a data frame\n" attr(,"class")
[1] "try-error" attr(,"condition") <Rcpp::exception: cannot convert
object to a data frame>
It would be great if anyone could tell me what is going wrong / why try isn't just skipping that result?

This can happen when one of the results is empty (NULL or a zero-length vector): map_chr() requires every result to have length 1, and because try() wraps the whole pipeline, the entire result becomes a single try-error object rather than the offending element being skipped. Try inserting
map_if(is_empty, ~ NA_character_)
into the pipe chain and see if it works.
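A hedged sketch of where that could slot in, collecting the raw text with map() first and only flattening to a character vector at the end. extract_numeric() is deprecated, so this swaps in readr::parse_number(); the selector and the assumption that each page yields at most one matching node are taken from the question:
library(rvest)
library(purrr)
budget <- pages %>%
  map(. %>%
        html_nodes("#titleDetails .txt-block:nth-child(11)") %>%
        html_text()) %>%
  map_if(is_empty, ~ NA_character_) %>% # pages with no matching node become NA
  map_chr(1) %>%                        # every element can now be reduced to length 1
  readr::parse_number()                 # pull the number out of strings like "$25,000,000 (estimated)"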

Related

Error when using a function: Error in df1[[1]] : subscript out of bounds

I'm trying to scrape the gun laws from https://www.statefirearmlaws.org/. However, I keep getting the following error:
Error in df1[[1]] : subscript out of bounds
I used selector gadget to copy the nodes for the table.
What can I do to fix it?
library(rvest)
library(tidyverse)
years <- lapply(c(2006:2018), function(x) {
link <- paste0('https://www.statefirearmlaws.org/national-data/', x)
df1 <- link %>% read_html() %>%
html_nodes('.js-view-dom-id-cc833ef0290cd127457401b760770f1411daa41fc70df5f12d07744fab0a173c > div > div') %>%
html_text(trim = TRUE)
df <- df1[[1]]
return(df)
}
)
df1 <- link %>% read_html() %>%
html_nodes('.js-view-dom-id-cc833ef0290cd127457401b760770f1411daa41fc70df5f12d07744fab0a173c > div > div')
This part results in {xml_nodeset (0)}, which later produces an empty list().
Are you selecting the correct thing in html_nodes? SelectorGadget can be helpful for choosing the node you need.
Also note that html_text() expects a node set as its input, while html_table() outputs a list of tibbles, so piping html_table() into html_text() fails.
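As a hedged alternative, assuming the yearly data is present in the static HTML as an ordinary <table> (the long .js-view-dom-id-... class is generated per page load, so it is unlikely to match on a fresh request, and if the table is built by JavaScript then read_html() will not see it at all), you could target the table element directly and let html_table() do the parsing:
library(rvest)
library(purrr)
years <- map(2006:2018, function(x) {
  link <- paste0('https://www.statefirearmlaws.org/national-data/', x)
  tbls <- link %>%
    read_html() %>%
    html_nodes('table')                 # every <table> in the static HTML
  if (length(tbls) == 0) return(NULL)   # nothing matched; skip this year instead of erroring
  html_table(tbls[[1]])                 # parse the first table into a data frame
})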

How do you subset data from a list in R?

I would like to extract the job descriptions, i.e. the "p" tag HTML elements, from all 16 pages generated by the last line of code.
"ret" is a list of 16 HTML pages generated by that line. I'm not used to working with lists of lists, so I'm confused about how to extract the data from them.
Normally I would use
ret %>%
html_elements("body p")
But I'm getting the error message, "Error in UseMethod("xml_find_all") :
no applicable method for 'xml_find_all' applied to an object of class "list"
library(tidyverse)
library(rvest)
library(xml2)
url<-"https://www.indeed.com/jobs?q=data%20analyst&l=San%20Francisco%2C%20CA&vjk=0c2a6008b4969776"
page<-xml2::read_html(url) # read in the code from the webpage and break it down into different elements (<div>, <span>, <p>, etc.)
#get job title
title<-page %>%
html_nodes(".jobTitle") %>%
html_text()
#get company Location
loc<-page %>%
html_nodes(".companyLocation") %>%
html_text()
#job snippet
page %>%
html_nodes(".job-snippet") %>%
html_text()
#Get link
desc<- page %>%
html_nodes("a[data-jk]") %>%
html_attr("href")
# Create combine link
combined_link <- paste("https://www.indeed.com", desc, sep="")
#Turn combined link into a session follow link
page1 <- html_session(combined_link[[1]])
page1 %>%
html_nodes(".iCIMS_JobContent, #jobDescriptionText") %>%
html_text()
#one<- page %>% html_elements("a[id*='job']")
#create function return a list of page-returns
ret <- lapply(paste0("https://www.indeed.com", desc), read_html)
We could either use lapply from base R
out <- lapply(ret, function(x) x %>%
html_nodes(".iCIMS_JobContent, #jobDescriptionText") %>%
html_text())
or loop with map from purrr
library(purrr)
out <- map(ret, ~ .x %>%
html_nodes(".iCIMS_JobContent, #jobDescriptionText") %>%
html_text())
NOTE: Both approaches loop over the elements of the list; .x (in map) or x (in lapply) stands for the individual element and is supplied via an anonymous function, i.e. a function created on the fly, written function(x) in base R or ~ in the tidyverse.
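If you then want a flat character vector rather than a list, a hedged follow-up that collapses each page's matches into one string (in case a page returns more than one node):
desc_text <- map_chr(out, ~ paste(.x, collapse = " ")) # one description string per scraped page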

Trouble mapping a function to a list of scraped links using rvest

I am trying to apply a function that extracts a table from a list of scraped links. I am at the final stage where I am applying the get_injury_data function to the links - I have been having issues with successfully executing this. I get the following error:
Error in matrix(unlist(values), ncol = width, byrow = TRUE) :
'data' must be of a vector type, was 'NULL'
I wonder if anyone can help me spot where I am going wrong. The code is as follows:
library(tidyverse)
library(rvest)
# create a function to grab the team links
get_team_links <- function(url){
url %>%
read_html %>%
html_nodes('td.hauptlink a') %>%
html_attr('href') %>%
.[. != '#'] %>% # drop rows that are just '#'
paste0('https://www.transfermarkt.com', .) %>% # paste the site domain onto the relative links
unique() %>% # keep only unique links
as_tibble() %>% # turn the strings into a tibble dataset
rename("links" = "value") %>% # rename the value column
filter(!grepl('profil', links)) %>% # drop player profile links that were picked up
filter(!grepl('spielplan', links)) %>% # drop links to additional team pages
mutate(links = gsub("startseite", "kader", links)) # change the link to point to the detailed squad page
}
# create a function to grab the player links
get_player_links <- function(url){
url %>%
read_html %>%
html_nodes('td.hauptlink a') %>%
html_attr('href') %>%
.[. != '#'] %>% # drop rows that are just '#'
paste0('https://www.transfermarkt.com', .) %>% # paste the site domain onto the relative links
unique() %>% # keep only unique links
as_tibble() %>% # turn the strings into a tibble dataset
rename("links" = "value") %>% # rename the value column
filter(grepl('profil', links)) %>% # keep only player profile links
mutate(links = gsub("profil", "verletzungen", links)) # change the link to point to the injury page
}
# create a function to get the injury dataset
get_injury_data <- function(url){
url %>%
read_html() %>%
html_nodes('#yw1') %>%
html_table()
}
# get team links and save it as team_links
team_links <- get_team_links('https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1')
# get player links and by mapping the function on to the player_injury_links dataset
# and then unnest the list of lists as a long list
player_injury_links <- team_links %>%
mutate(links = map(team_links$links, get_player_links)) %>%
unnest(links)
# using the player_injury_links list create a dataset by web scrapping the play injury pages
player_injury_data <- map(player_injury_links$links, get_injury_data)
Solution
So the issue that I was having was that some of the links that I was scraping did not have any data.
To overcome this issue, I used the possibly() function from the purrr package. This helped me create a new, error-free version of the function.
The line that was giving me trouble, rewritten with possibly(), becomes:
player_injury_data <- player_injury_links$links %>%
purrr::map(purrr::possibly(get_injury_data, otherwise = NULL, quiet = TRUE))
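From there, a hedged sketch of collapsing the results into one data frame: compact() drops the NULLs that possibly() returned for failing pages, and the binding assumes the successfully scraped tables share compatible column types:
library(purrr)
library(dplyr)
injury_tables <- player_injury_data %>%
  compact() %>%           # drop NULL results from pages that failed
  map(1) %>%              # get_injury_data() returns a list of tables; keep the first per page
  bind_rows(.id = "page") # stack them, tagging each row with its list position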

Error while using unnest_tokens() while passing a function to the token

Error in unnest_tokens.data.frame(., entity, text, token = tokenize_scispacy_entities, :
Expected output of tokenizing function to be a list of length 100
The unnest_tokens() works well for a sample of few observations but fails on the entire dataset.
https://github.com/dgrtwo/cord19
Reproducible example:
library(dplyr)
library(cord19)
library(tidyverse)
library(tidytext)
library(spacyr)
Install the model from here - https://github.com/allenai/scispacy
spacy_initialize("en_core_sci_sm")
tokenize_scispacy_entities <- function(text) {
spacy_extract_entity(text) %>%
group_by(doc_id) %>%
nest() %>%
pull(data) %>%
map("text") %>%
map(str_to_lower)
}
paragraph_entities <- cord19_paragraphs %>%
select(paper_id, text) %>%
sample_n(10) %>%
unnest_tokens(entity, text, token = tokenize_scispacy_entities)
I faced the same problem. I don't know the exact reason why, but after I filter out empty abstracts and very short abstract strings, everything seems to work just fine.
abstract_entities <- article_data %>%
filter(nchar(abstract) > 30) %>%
select(paper_id, title, abstract) %>%
sample_n(1000) %>%
unnest_tokens(entity, abstract, token = tokenize_scispacy_entities)
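The likely cause, judging from the error message: unnest_tokens() expects the tokenizing function to return a list with exactly one element per input row, but spacy_extract_entity() only returns rows for documents in which it found entities, so documents with no entities drop out and the list comes back too short. A hedged sketch of a tokenizer that pads those documents back in (it assumes spacyr's default naming of unnamed inputs as text1, text2, ... in order; tokenize_scispacy_entities_safe is just an illustrative name):
library(purrr)
library(stringr)
tokenize_scispacy_entities_safe <- function(text) {
  ents <- spacy_extract_entity(text)
  by_doc <- split(str_to_lower(ents$text), ents$doc_id)
  ids <- paste0("text", seq_along(text))      # assumed default doc_id naming
  map(ids, ~ by_doc[[.x]] %||% character(0))  # documents with no entities get an empty token vector
}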

R Webscraping: How to feed URLS into a function

My end goal is to be able to take all 310 articles from this page and its following pages and run it through this function:
library(tidyverse)
library(rvest)
library(stringr)
library(purrr)
library(lubridate)
library(dplyr)
scrape_docs <- function(URL){
doc <- read_html(URL)
speaker <- html_nodes(doc, ".diet-title a") %>%
html_text()
date <- html_nodes(doc, ".date-display-single") %>%
html_text() %>%
mdy()
title <- html_nodes(doc, "h1") %>%
html_text()
text <- html_nodes(doc, "div.field-docs-content") %>%
html_text()
all_info <- list(speaker = speaker, date = date, title = title, text = text)
return(all_info)
}
I assume the way forward would be to somehow create a list of the URLs I want and then iterate over that list with the scrape_docs function. As it stands, however, I'm having a hard time understanding how to go about that. I thought something like this would work, but I seem to be missing something key, given the following error:
xml_attr cannot be applied to object of class "character".
source_col <- "https://www.presidency.ucsb.edu/advanced-search?field-keywords=%22space%20exploration%22&field-keywords2=&field-keywords3=&from%5Bdate%5D=&to%5Bdate%5D=&person2=&items_per_page=100&page=0"
pages <- 4
all_links <- tibble()
for(i in seq_len(pages)){
page <- paste0(source_col,i) %>%
read_html() %>%
html_attr("href") %>%
html_attr()
tmp <- page[[1]]
all_links <- bind_rows(all_links, tmp)
}
all_links
You can get all the URLs by doing
library(rvest)
source_col <- "https://www.presidency.ucsb.edu/advanced-search?field-keywords=%22space%20exploration%22&field-keywords2=&field-keywords3=&from%5Bdate%5D=&to%5Bdate%5D=&person2=&items_per_page=100&page=0"
all_urls <- source_col %>%
read_html() %>%
html_nodes("td a") %>%
html_attr("href") %>%
.[c(FALSE, TRUE)] %>%
paste0("https://www.presidency.ucsb.edu", .)
Now do the same by changing the page number in source_col to get the remaining data.
You can then use a for loop or map to extract all the data.
purrr::map(all_urls, scrape_docs)
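A hedged sketch of collecting the URLs from all four result pages before that map() call, assuming the remaining results sit on pages 1 to 3 of the same query (reachable via the page= parameter) and that the td a / every-second-link trick holds on each page:
get_page_urls <- function(i) {
  sub("page=0$", paste0("page=", i), source_col) %>% # swap the page index into the query string
    read_html() %>%
    html_nodes("td a") %>%
    html_attr("href") %>%
    .[c(FALSE, TRUE)] %>%                            # keep every second link, as above
    paste0("https://www.presidency.ucsb.edu", .)
}
all_urls <- unlist(purrr::map(0:3, get_page_urls)) # 4 pages x 100 results covers all 310 articles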
Testing the function scrape_docs on 1 URL
scrape_docs(all_urls[1])
#$speaker
#[1] "Dwight D. Eisenhower"
#$date
#[1] "1958-04-02"
#$title
#[1] "Special Message to the Congress Relative to Space Science and Exploration."
#$text
#[1] "\n To the Congress of the United States:\nRecent developments in long-range
# rockets for military purposes have for the first time provided man with new mac......

Resources