I'm trying to get a CSV file into R. I have written the following code, which builds the link to the CSV file. The generated link works perfectly when I paste it into my browser, but the code doesn't work properly when I run it in R. Hope somebody can help.
Jakob
library(httr)
library(tidyverse)
library(stringr)
library(jsonlite)
metadata.dst <- function(tableid){
  # fetch the table metadata from the Statbank API and parse the JSON response
  link.metadata <- "http://api.statbank.dk/v1/tableinfo/"
  GET(str_c(link.metadata, tableid)) %>%
    content("text") %>%
    fromJSON()
}
download.link.dst <- function(tableid){
  table <- tolower(tableid)
  base.link_start <- "http://api.statbank.dk/v1/data/"
  base.link_end <- "/CSV?delimiter=Semicolon"
  link_start <- str_c(base.link_start, table, base.link_end)
  # one "&VARIABLE=*" part per variable in the table
  variables <- metadata.dst(tableid)$variables$id
  link_end <- str_c("&", variables) %>%
    str_c("=*") %>%
    str_c(collapse = "")
  # percent-encode the Danish characters (str_replace_all encodes every occurrence, not just the first)
  download.link <- str_c(link_start, link_end) %>%
    str_replace_all("Å", "%C3%85") %>%
    str_replace_all("å", "%C3%A5") %>%
    str_replace_all("Ø", "%C3%98") %>%
    str_replace_all("ø", "%C3%B8") %>%
    str_replace_all("Æ", "%C3%86") %>%
    str_replace_all("æ", "%C3%A6")
  download.link
}
read_csv2(download.link.dst("FOLK1B"))
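One way to debug the download step is to fetch the file explicitly with httr and hand the text to read_csv2(), so the raw response can be inspected first. A minimal sketch reusing the function above; whether it fixes the original problem depends on what exactly fails:

library(httr)
library(readr)

csv_url <- download.link.dst("FOLK1B")
resp <- GET(csv_url)
stop_for_status(resp)  # fail loudly if the API rejects the request

# the Statbank CSV uses semicolons, so read_csv2() matches the delimiter
folk1b <- read_csv2(content(resp, as = "text", encoding = "UTF-8"))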
I created a function to scrape the fbref.com website. Sometimes on this website, or on others I'm trying to scrape, I receive a timeout error. I read about it, and the suggestion is to include a Sys.sleep between the requests or to use purrr::slowly. I tried to include it inside the map but I could not. How can I include a 10-second gap between each request inside the map? (It would be 7 requests and 6 intervals of 10 seconds.)
Thanks in advance, and if I did not include something please let me know!
#packages
library(rvest)
library(stringr)
library(dplyr)
library(tidyr)
library(purrr)
library(lubridate)
library(tm)
#function
funcao_extract <- function(link1, link2){
  fbref <- "https://fbref.com/pt/equipes/"
  dados <- fbref %>%
    map2_chr(rep(link1, 7), paste0) %>%
    map2_chr(seq(from = 2014, to = 2020), paste0) %>%
    map2_chr(rep(link2, 7), paste0) %>%
    map(. %>%
          read_html() %>%
          html_table() %>%
          .[[1]] %>% # select the first table
          dplyr::bind_rows() %>%
          janitor::clean_names() %>%
          slice(-1) %>%
          select(-21) %>%
          # rename the columns
          rename(nome = 1, nacionalidade = 2, posicao = 3, idade = 4, jogos = 5, inicios = 6, minutos = 7, minutos_90 = 8, gol = 9, assistencia = 10,
                 gol_normal = 11, gol_penalti = 12, penalti_batido = 13, amarelo = 14, vermelho = 15, gol_90 = 16, assistencia_90 = 17,
                 gol_assistencia_90 = 18, gol_normal_90 = 19, gol_assistencia_penalti_90 = 20) %>%
          as.data.frame() %>%
          format(scientific = FALSE) %>%
          mutate_at(., c(5:15), as.numeric) %>%
          mutate(nacionalidade = str_extract(nacionalidade, "[A-Z]+")) # keep only the capital letters
    ) %>%
    setNames(paste0(rep("Gremio_", 7), seq(from = 2014, to = 2020))) # name the list elements
  dados
}
#test with Grêmio
gremio <- funcao_extract("d5ae3703/","/Gremio-Estatisticas")
Just a small example
library(tidyverse)
example <- list(data.frame(a=1:10), data.frame(a=11:20))
example %>%
  map_df(~ {Sys.sleep(10); message(Sys.time()); .x} %>%
           summarise(a = sum(a)))
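The same pause can also be written with purrr::slowly, which the question mentions. A minimal sketch that wraps read_html (slow_read_html is just a helper name introduced here):

library(purrr)
library(rvest)

# slowly() returns a rate-limited version of read_html that pauses
# about 10 seconds between calls
slow_read_html <- slowly(read_html, rate = rate_delay(10))

# inside funcao_extract, swapping read_html() for slow_read_html() in the
# map() pipeline gives the 10-second gap between requests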
I have code to scrape a senate website and extract all the information about representatives into a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns NA instead of the term assignments. I would really appreciate some help in figuring out what I'm doing wrong in the last block of code (base_link3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for(cycle in cycles) {
  member_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_text()
  memberlink_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_attr("href")
  photo_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-PhotoThumb img") %>%
    html_attr("src")
  cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(cycle_list2, member_list2, photo_list2, memberlink_list2) %>%
  rename(Cycle = cycle_list2,
         Member = member_list2,
         Photo = photo_list2,
         Link = memberlink_list2)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into dataframe
df <- data.frame(matrix(unlist(memberlink_list), nrow=394, byrow=TRUE),stringsAsFactors=FALSE)
colnames(df) <- "Link" #rename column to link
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
terminfo <- sapply(base_link3, function(x) {
  val <- x %>%
    read_html %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Senate Term )\\d+')
  if(length(val)) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure what exactly you are looking for, but something like this might help you. Note that the page has a crawl delay of 5 seconds, which your code above does not respect.
library(httr)
library(purrr)
library(rvest)
library(stringr)
extract_terminfo <- function(link) {
  html <- httr::GET(link)
  # respect the site's 5-second crawl delay
  Sys.sleep(runif(1, 5, 6))
  val <- html %>%
    content(as = "parsed") %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Term Expires: )\\d+')
  if(length(val) > 0){
    return(data.frame(terminfo = val, link = link))
  } else {
    return(data.frame(terminfo = "historic", link = link))
  }
}
link <- base_link3[1]
link
extract_terminfo(link)
term_info <- map_dfr(base_link3[1:3],extract_terminfo)
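To attach the scraped terms back onto the directory built earlier, one option is a left join on the member link. A sketch only: full_link is a helper column introduced here, and term_info covers just the links that have actually been scraped:

library(dplyr)

senate_terms <- senate_directory %>%
  mutate(full_link = paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", Link)) %>%
  left_join(term_info, by = c("full_link" = "link"))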
I would like to scrape all the seasons from 2003-2004 to 2019-2020 of the Dutch football league, including the 34 playing rounds (I am using this website: https://www.voetbal.com/wedstrijdgegevens/ned-eredivisie-2003-2004-spieltag/). As you can see in my code, it only shows me the results of the last season. I think it's overwriting the other seasons. What am I doing wrong? What do I have to add to my code? Can anybody help me?
Here is the code I use:
library(tidyverse)
library(dplyr)
library(ggplot2)
library(caret)
library(rvest)
library(devtools)
library(httr)
library(tidyr)
library(tibble)
library(xml2)
library(tidyr)
library(stringr)
url <- sprintf("https://www.voetbal.com/wedstrijdgegevens/ned-eredivisie-%d-%d-spieltag/", 2003:2019, 2004:2020)
basis <- function(url){
  website <- read_html(url)
  Sys.sleep(2)
  datum <- website %>%
    html_nodes(".data .standard_tabelle td[nowrap]:nth-of-type(1)") %>%
    html_text()
  tijdstip <- website %>%
    html_nodes(".data .standard_tabelle td[nowrap]:nth-of-type(2)") %>%
    html_text()
  thuisclub <- website %>%
    html_nodes(".data .standard_tabelle [align='right'] a") %>%
    html_text()
  uitclub <- website %>%
    html_nodes(".standard_tabelle td:nth-of-type(5) a") %>%
    html_text()
  uitslag <- website %>%
    html_nodes(".data .standard_tabelle td[nowrap]:nth-of-type(6)") %>%
    html_text()
  return(tibble(datum = datum, tijdstip = tijdstip, thuisclub = thuisclub, uitclub = uitclub, uitslag = uitslag))
}
overige_seizoenen <- function(url){
  for (i in 1:17){
    list_of_pages <- str_c(url[[i]], 1:34)
    table <- list_of_pages %>%
      map(basis) %>%
      bind_rows()
  }
  return(table)
}
jochem <- overige_seizoenen(url)
I suspect there is an error in the for loop. In R, if you want a loop to iterate from element 1 to 10, you can't just say for (i in 10); you have to spell it out as for (i in 1:10). So try this version of the function now:
overige_seizoenen <- function(url){
  seizoenen <- length(url)  # number of seasons
  for (i in 1:seizoenen){
    list_of_pages <- str_c(url[[i]], 1:34)
    table <- list_of_pages %>%
      map(basis) %>%
      bind_rows()
  }
  return(table)
}
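If the result still only contains the last season, the reason is that table is reassigned on every pass of the loop, so only the final season survives. A sketch that keeps every season in one data frame, reusing the basis() function from the question (alle_seizoenen is just an illustrative name):

library(purrr)
library(stringr)
library(dplyr)

alle_seizoenen <- function(urls){
  # build the 34 round pages for every season, scrape them all,
  # and stack the results into a single tibble
  map_dfr(urls, function(u){
    str_c(u, 1:34) %>%
      map(basis) %>%
      bind_rows()
  })
}

# jochem <- alle_seizoenen(url)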
I am trying to get the 1980 matching stocks from this Yahoo Finance Screener:
https://finance.yahoo.com/screener/unsaved/38a77251-0996-439b-8be4-9d10ff18ff79?count=25&offset=0
using R and rvest.
I normally use XPath, but I can't get it with SelectorGadget on this website.
Could somebody help me with an alternative way to get all the pages with those data?
I would like code similar to this one, which worked with Investing. Please note that the Symbol, Name, and MarketCap parts are just examples:
library(rvest)
library(dplyr)
i <- 0
for(z in 1:80){
  # page through the screener 25 rows at a time
  url_base <- paste0("https://finance.yahoo.com/screener/unsaved/38a77251-0996-439b-8be4-9d10ff18ff79?count=25&offset=", (z - 1) * 25)
  zpg <- read_html(url_base)
  Symbol <- zpg %>% html_nodes("table") %>% html_nodes("span") %>% html_attr("data-id")
  Name <- zpg %>% html_nodes("table") %>% html_nodes("span") %>% html_attr("data-name")
  MarketCap <- zpg %>% html_nodes("table") %>% html_nodes("span") %>% html_attr("data-name")
  data <- data.frame(Symbol, Name, MarketCap)
  if(i == 0){
    USA <- data
  } else {
    USA <- rbind(USA, data)
  }
  i <- i + 1
}
You could try using quantmod or tidyquant.
library(tidyverse)
library(tidyquant)
# getting symbols for NASDAQ
nasdaq <- read_delim("https://nasdaqtrader.com/dynamic/SymDir/nasdaqlisted.txt", delim = "|")
# scraping the data
df <- nasdaq %>%
  head() %>% # only the first few rows, to keep the example quick
  rowwise() %>%
  mutate(data = list(tq_get(Symbol, from = "2020-08-01", to = "2020-08-07", warnings = FALSE)))
# getting the data ready
df2 <- df$data %>%
  bind_rows()
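If you also want the company name next to the prices, tidyr::unnest() keeps the NASDAQ columns alongside the fetched data. A sketch, assuming the nasdaqlisted.txt file exposes a "Security Name" column:

library(tidyr)

df3 <- df %>%
  ungroup() %>%                          # drop the rowwise grouping
  select(Symbol, `Security Name`, data) %>%
  unnest(cols = data)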
I'm trying to extract a table from a PDF with the R tabulizer package. The functions run fine, but they can't get all the data from the table.
Below is my code:
library(tabulizer)
library(tidyverse)
library(abjutils)
D_path = "https://github.com/financebr/files/raw/master/Compacto09-08-2019.pdf"
out <- extract_tables(D_path,encoding = 'UTF-8')
arrumar_nomes <- function(x) {
  x %>%
    tolower() %>%
    str_trim() %>%
    str_replace_all('[[:space:]]+', '_') %>%
    str_replace_all('%', 'p') %>%
    str_replace_all('r\\$', '') %>%
    abjutils::rm_accent()
}
tab_tidy <- out %>%
  map(as_tibble) %>%
  bind_rows() %>%
  set_names(arrumar_nomes(.[1,])) %>%
  slice(-1) %>%
  mutate_all(funs(str_replace_all(., '[[:space:]]+', ' '))) %>%
  mutate_all(str_trim)
Comparing the PDF table (D_path) with the tab_tidy data frame, you can see that some information is missing. The first column, whose cells are merged, is not found by extract_tables(). The rows that contain the "Boi Gordo" and "Boi Magro" information are not found by the function either.
The rest is in perfect condition. Would you know why this happens and how to solve it? The questions here on the forum dealing with this do not have many answers.
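One thing worth trying for the merged first column and the missing "Boi Gordo"/"Boi Magro" rows is to force a specific extraction algorithm instead of letting tabulizer guess, or to select the table area by hand. A sketch, not a guaranteed fix:

library(tabulizer)

D_path <- "https://github.com/financebr/files/raw/master/Compacto09-08-2019.pdf"

# try both extraction algorithms and compare what each one recovers
out_lattice <- extract_tables(D_path, method = "lattice", encoding = "UTF-8")
out_stream  <- extract_tables(D_path, method = "stream", encoding = "UTF-8")

# or draw the table region interactively and extract only that area
# out_area <- extract_areas(D_path)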