In the image below you can see that there are 12 team objects and 6 time objects. When I add this variable to the data frame it obviously does not fit and errors out. Is there a way to duplicate the time so it shows up twice, once for the top name and once for the bottom name in the same matchup?
library(rvest)
library(dplyr)
library(tm)
library(stringi)
library(readr)
today <- Sys.Date()#+1
today <- gsub("-", "", today, fixed=TRUE)
url <- read_html(paste0('https://classic.sportsbookreview.com/betting-odds/nhl-hockey/?date=', as.character(today)))
gametime <- url %>%
  html_nodes('.eventLine-time .eventLine-book-value') %>%
  html_text()
teams <- url %>%
  html_nodes('.team-name a') %>%
  html_text()
roster <- data.frame(gametime = gametime, TEAM = teams)
EDIT: adding gametime <- rep(gametime, each = 2) after the gametime scrape:
library(rvest)
library(dplyr)
library(tm)
library(stringi)
library(readr)
today <- Sys.Date()#+1
today <- gsub("-", "", today, fixed=TRUE)
url <- read_html(paste0('https://classic.sportsbookreview.com/betting-odds/nhl-hockey/?date=', as.character(today)))
gametime <- url %>%
  html_nodes('.eventLine-time .eventLine-book-value') %>%
  html_text()
gametime <- rep(gametime, each = 2)
teams <- url %>%
  html_nodes('.team-name a') %>%
  html_text()
roster <- data.frame(gametime = gametime, TEAM = teams)
rep(gametime, each=2) will duplicate each element of gametime.
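For example, a quick sketch with made-up times (the values are only illustrative):

times <- c("7:00 PM", "7:35 PM", "8:00 PM")
rep(times, each = 2)
# [1] "7:00 PM" "7:00 PM" "7:35 PM" "7:35 PM" "8:00 PM" "8:00 PM"

Each game time then appears once for the away team and once for the home team, so the gametime vector has the same length as teams.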
My code looks like this:
library(rvest)      # read_html(), html_table()
library(tidyverse)  # dplyr/tibble/stringr helpers used below
te_earnings <- read_html("https://tradingeconomics.com/earnings")
te_earnings %>% html_table()
te_earnings_data <- te_earnings %>% html_table()
rm(te_earnings)
te_earnings_data <- te_earnings_data[[2]]
te_earnings_data <- te_earnings_data %>% as_tibble()
te_earnings_data
te_earnings_data <- te_earnings_data[,-c(12,13)]
new_names <- as.character(str_extract_all(te_earnings_data[1,], boundary("word")))
names(te_earnings_data)
new_names[1:2] <- c("Date","Company")
new_names <- new_names[-c(12:13)]
new_names
names(te_earnings_data) <- new_names
names(te_earnings_data)[8] <- "Previous" ; rm(new_names)
te_earnings_data <- te_earnings_data[-1,]
is_tibble(te_earnings_data)
te_earnings_data[te_earnings_data == ''] <- NA
trim <- function(x) {
  gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)
}
te_earnings_data <- apply(te_earnings_data,2,trim)
te_earnings_data <- te_earnings_data %>% as_tibble("both")
te_earnings_data
# extracting the ticker and create new column
te_earnings_data$ticker <- NA
pattern_country_strings <- paste0(c(":US",":CN:",":JP",":BS",":MM",":IN",":AU", ":SM",":LN",":FP"), collapse="|")
te_earnings_data$ticker <- sub(".*\r\n", "", te_earnings_data$Company)
te_earnings_data$ticker <- te_earnings_data$ticker %>% str_replace(pattern_country_strings, " ")
head(te_earnings_data$ticker)
te_earnings_data$ticker <- te_earnings_data$ticker %>% str_trim()
head(te_earnings_data$ticker)
paste0(c(":US",":CN:"), collapse="|")
# Remove tickers from company
te_earnings_data$Company <- te_earnings_data$Company %>% str_replace_all(" .*" , "")
# Remove \r\n from company
te_earnings_data$Company <- te_earnings_data$Company %>% str_replace_all(paste0(c("\n","\r"), collapse="|"), "")
I am trying to create a data.frame with the earnings from the page https://tradingeconomics.com/earnings, and I would like to change the date to the last month.
For example, on Yahoo you can change the date in the URL:
https://de.finance.yahoo.com/eventkalender/earnings?from=2023-01-08&to=2023-01-14&day=2023-01-09
But I can't find a date in the URL of the Trading Economics page; even if I change the custom date on the page, nothing in the URL changes.
I tried to find the date in the source code of the page but could not find it. I don't have a lot of experience with that.
Can anybody tell me if this is basically possible, or does it depend on the page?
Thanks.
I tried to download the page for a specific date, but the date doesn't change, and I don't know where to change it for web scraping.
EDIT:
I found a solution for Yahoo: just change the date in the URL with a for loop and paste0, for example.
library(httr)  # GET()
library(XML)   # htmlParse(), readHTMLTable()
url <- "https://finance.yahoo.com/calendar/earnings?from=2022-12-04&to=2022-12-10&day=2022-12-06"
download_table <- function(url) {
  url_file <- GET(url)
  web_page_parsed <- htmlParse(url_file)
  tables <- readHTMLTable(web_page_parsed)
  tables  # return the list of tables
}
url_file <- GET(url)
web_page_parsed <- htmlParse(url_file)
tables <- readHTMLTable(web_page_parsed)
print(head(tables))
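To walk the window back through earlier weeks, the URLs can be built inside a loop with paste0, as described above. A minimal sketch, assuming weekly from/to windows and reusing the download_table() helper; the example dates are arbitrary:

library(httr)
library(XML)
week_starts <- seq(as.Date("2022-11-06"), as.Date("2022-12-04"), by = "7 days")
all_tables <- lapply(week_starts, function(from) {
  to  <- from + 6
  url <- paste0("https://finance.yahoo.com/calendar/earnings?from=", from,
                "&to=", to, "&day=", from)
  Sys.sleep(2)  # small pause between requests
  download_table(url)
})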
After running the code below, NAs were introduced in some rows (34:39) and I do not know why.
Could you help? I tried another PC, but the same problem occurred.
# CZECH REPO SAZBA
library(rvest)
library(dplyr)
link <- "https://www.cnb.cz/cs/casto-kladene-dotazy/Jak-se-vyvijela-dvoutydenni-repo-sazba-CNB/"
page <- read_html(link)
date <- page %>% html_nodes('td:nth-child(1)') %>% html_text()
repo <- page %>% html_nodes('td+ td') %>% html_text()
Sazba <- data.frame(cbind(date, repo))
Sazba$repo <- as.numeric(gsub(",", ".", Sazba$repo))
Sazba$date <- gsub(" ", "", Sazba$date)
str(Sazba)
Sazba$date <- as.Date(gsub("[.]", "/", Sazba$date), "%d/%m/%Y")
Picture of the issue in RStudio
I found a solution: the problem was the encoding of these date values. I used the stringi package to escape them, so the strange spaces (non-breaking spaces) are written out as "\u00a0" and can then be removed.
#CZECH REPO SAZBA
library(rvest)
library(dplyr)
link <- "https://www.cnb.cz/cs/casto-kladene-dotazy/Jak-se-vyvijela-dvoutydenni-repo-sazba-CNB/"
page <- read_html(link)
date <- page %>% html_nodes('td:nth-child(1)') %>% html_text()
repo <- page %>% html_nodes('td+ td') %>% html_text()
date <- stringi::stri_escape_unicode(date)
Sazba <- data.frame(cbind(date, repo))
Sazba$repo <- as.numeric(gsub(",", ".", Sazba$repo))
Sazba$date <- gsub("\\\\u00a0", "", Sazba$date)
str(Sazba)
Sazba$date <- as.Date(Sazba$date, "%d.%m.%Y")
Sazba
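To see what the escaping does, here is a small illustration (the input string is made up, but contains the same non-breaking spaces as the scraped dates):

stringi::stri_escape_unicode("7.\u{00a0}8.\u{00a0}2022")
# [1] "7.\\u00a08.\\u00a02022"

After escaping, the literal "\\u00a0" sequences can be stripped with gsub as in the code above.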
Try using the parsedate package:
library(parsedate)
Sazba$date <- parse_date(Sazba$date)
I have code to scrape a senate website and extract all the information about representatives into a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns NA instead of the term assignments. I would really appreciate some help in figuring out what I'm doing wrong in the last block of code (base_link3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for(cycle in cycles) {
  member_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_text()
  memberlink_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_attr("href")
  photo_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
    html_nodes(".MemberInfoList-PhotoThumb img") %>%
    html_attr("src")
  cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(cycle_list2, member_list2, photo_list2, memberlink_list2) %>%
  rename(Cycle = cycle_list2,
         Member = member_list2,
         Photo = photo_list2,
         Link = memberlink_list2)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into dataframe
df <- data.frame(matrix(unlist(memberlink_list), nrow=394, byrow=TRUE),stringsAsFactors=FALSE)
colnames(df) <- "Link" #rename column to link
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
terminfo <- sapply(base_link3, function(x) {
  val <- x %>%
    read_html() %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Senate Term )\\d+')
  if(length(val)) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure what exactly you are looking for, but something like this might help you. Note that the page has a crawl delay of 5 seconds, which you did not implement or respect in your code above. See here. (A sketch of checking that delay with the robotstxt package follows after the code below.)
library(httr)
library(purrr)
extract_terminfo <- function(link) {
  html <- httr::GET(link)
  Sys.sleep(runif(1, 5, 6))  # respect the 5-second crawl delay
  val <- html %>%
    content(as = "parsed") %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Term Expires: )\\d+')
  if (length(val) > 0) {
    return(data.frame(terminfo = val, link = link))
  } else {
    return(data.frame(terminfo = "historic", link = link))
  }
}
link <- base_link3[1]
link
extract_terminfo(link)
term_info <- map_dfr(base_link3[1:3],extract_terminfo)
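As a side note, a minimal sketch of checking the crawl delay programmatically with the robotstxt package (the exact contents of crawl_delay depend on what the site's robots.txt declares):

library(robotstxt)
rt <- robotstxt(domain = "www.legis.state.pa.us")
rt$crawl_delay  # data frame listing the declared delay per user agent, if any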
For a school project I have to scrape a website, which isn't a problem. But for it to be called Big Data I wanted to scrape the whole archive (the past 5 years). The only thing that changes in the URL is the date at the end, but I don't know how to write a script that changes only that date.
The website I'm using is this: https://www.ongelukvandaag.nl/archief/ .
The dates I need are from 01-01-2015 until 24-09-2020. The first part of the code I already figured out, and I'm able to scrape 1 page. I'm a beginner at using R and would like to know if anyone could help me. The code is shown below. Thanks in advance!
This is what I have got so far; the errors are underneath the code.
install.packages("XML")
install.packages("reshape")
install.packages("robotstxt")
install.packages("Rcrawler")
install.packages("RSelenium")
install.packages("devtools")
install.packages("exifr")
install.packages("Publish")
devtools::install_github("r-lib/xml2")
library(rvest)
library(dplyr)
library(xml)
library(stringr)
library(jsonlite)
library(xml12)
library(purrr)
library(tidyr)
library(reshape)
library(XML)
library(robotstxt)
library(Rcrawler)
library(RSelenium)
library(ps)
library(devtools)
library(exifr)
library(Publish)
#Create an url object
url<-"https://www.ongelukvandaag.nl/archief/%d "
#Verify the web can be scraped
paths_allowed(paths = c(url))
#Obtain the links for every day from 2015 to 2020
map_df(2015:2020, function(i){
page<-read_html(sprintf(url,i))
data.frame(Links = html_attr(html_nodes(page, ".archief a"),"href"))
}) -> Links %>%
Links$Links<-paste("https://www.ongelukvandaag.nl/",Links$Links,sep = "")
#Scrape what you want from each link:
d<- map(Links$Links, function(x) {
Z <- read_html(x)
Date <- Z %>% html_nodes(".text-muted") %>% html_text(trim = TRUE) # Last update
All_title <- Z %>% html_nodes("h2") %>% html_text(trim = TRUE) # Title
return(tibble(All_title,Date))
})
The errors I get:
Error in open.connection(x, "rb") : HTTP error 400.
Error in paste("https://www.ongelukvandaag.nl/", Links$Links, sep = "") : object 'Links' not found
Error in map(Links$Links, function(x) { : object 'Links' not found
Also, the packages "xml12" and "xml" don't work in this version of RStudio.
Take a look at my code and my comments:
library(purrr)
library(rvest) # don't load a lot of libraries if you don't need them
url <- "https://www.ongelukvandaag.nl/archief/"
bigdata <-
  map_dfr(
    2015:2020,
    function(year) {
      year_pg <- read_html(paste0(url, year))
      # grab the date links from the year page, in case some dates are missing
      list_dates <- year_pg %>% html_nodes(xpath = "//div[@class='archief']/a") %>% html_text()
      map_dfr(
        list_dates,
        function(date) {
          pg <- read_html(paste0(url, date))
          items <- pg %>% html_nodes("div.full > div.row")
          # drop items without an h2 (NA items)
          items <- items[sapply(items, function(x) length(x %>% html_node(xpath = "./descendant::h2"))) > 0]
          data.frame(
            date = date,
            title = items %>% html_node(xpath = "./descendant::h2") %>% html_text(),
            update = items %>% html_node(xpath = "./descendant::h4") %>% html_text(),
            image = items %>% html_node(xpath = "./descendant::img") %>% html_attr("src")
          )
        }
      )
    }
  )
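Alternatively, since the question says the dates run from 01-01-2015 until 24-09-2020, the date part of the URL can be generated directly. A sketch, assuming the archive accepts a dd-mm-yyyy date after /archief/ (which is what the links on the year pages look like):

dates <- format(seq(as.Date("2015-01-01"), as.Date("2020-09-24"), by = "day"), "%d-%m-%Y")
day_urls <- paste0("https://www.ongelukvandaag.nl/archief/", dates)
head(day_urls)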
My end goal is to be able to take all 310 articles from this page and its following pages and run them through this function:
library(tidyverse)
library(rvest)
library(stringr)
library(purrr)
library(lubridate)
library(dplyr)
scrape_docs <- function(URL){
  doc <- read_html(URL)
  speaker <- html_nodes(doc, ".diet-title a") %>%
    html_text()
  date <- html_nodes(doc, ".date-display-single") %>%
    html_text() %>%
    mdy()
  title <- html_nodes(doc, "h1") %>%
    html_text()
  text <- html_nodes(doc, "div.field-docs-content") %>%
    html_text()
  all_info <- list(speaker = speaker, date = date, title = title, text = text)
  return(all_info)
}
I assume the way forward would be to somehow create a list of the URLs I want, then iterate that list through the scrape_docs function. As it stands, however, I'm having a hard time understanding how to go about that. I thought something like this would work, but I seem to be missing something key, given the following error:
xml_attr cannot be applied to an object of class "character".
source_col <- "https://www.presidency.ucsb.edu/advanced-search?field-keywords=%22space%20exploration%22&field-keywords2=&field-keywords3=&from%5Bdate%5D=&to%5Bdate%5D=&person2=&items_per_page=100&page=0"
pages <- 4
all_links <- tibble()
for(i in seq_len(pages)){
page <- paste0(source_col,i) %>%
read_html() %>%
html_attr("href") %>%
html_attr()
tmp <- page[[1]]
all_links <- bind_rows(all_links, tmp)
}
all_links
You can get all the URLs by doing:
library(rvest)
source_col <- "https://www.presidency.ucsb.edu/advanced-search?field-keywords=%22space%20exploration%22&field-keywords2=&field-keywords3=&from%5Bdate%5D=&to%5Bdate%5D=&person2=&items_per_page=100&page=0"
all_urls <- source_col %>%
  read_html() %>%
  html_nodes("td a") %>%
  html_attr("href") %>%
  .[c(FALSE, TRUE)] %>%  # keep every second link
  paste0("https://www.presidency.ucsb.edu", .)
Now do the same by changing the page number in source_col to get the remaining data.
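A minimal sketch of that, assuming the results span pages 0 through 3 (base_url is just source_col without the trailing page number):

library(purrr)
base_url <- "https://www.presidency.ucsb.edu/advanced-search?field-keywords=%22space%20exploration%22&field-keywords2=&field-keywords3=&from%5Bdate%5D=&to%5Bdate%5D=&person2=&items_per_page=100&page="
all_urls <- map(0:3, function(page) {
  paste0(base_url, page) %>%
    read_html() %>%
    html_nodes("td a") %>%
    html_attr("href") %>%
    .[c(FALSE, TRUE)] %>%
    paste0("https://www.presidency.ucsb.edu", .)
}) %>%
  unlist()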
You can then use a for loop or map to extract all the data.
purrr::map(all_urls, scrape_docs)
Testing the function scrape_docs on 1 URL
scrape_docs(all_urls[1])
#$speaker
#[1] "Dwight D. Eisenhower"
#$date
#[1] "1958-04-02"
#$title
#[1] "Special Message to the Congress Relative to Space Science and Exploration."
#$text
#[1] "\n To the Congress of the United States:\nRecent developments in long-range
# rockets for military purposes have for the first time provided man with new mac......