Download file from url R - r

I am having problems downloading data from the link below directly with the code into R:
kaggle.com/c/house-prices-advanced-regression-techniques/data
I tried with this code:
# NOTE: this URL serves the Kaggle web page (HTML), not the raw CSV file
data <- read.csv("https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=test.csv", skip = 1)
I tried most of the options listed here:
Access a URL and read Data with R
However, I only get html table and not tables with the relevant house-price data from the web-site. Not sure what I am doing wrong.
tnx

Here's a simple example, taken from a post on Kaggle, that shows how to achieve your goal.
Create a verified account
Log in
Go to your account (click the top right -> account)
Click "Create new API token"
Place the file somewhere sensible that you can access from R
library(httr)
library(jsonlite)
kgl_credentials <- function(kgl_json_path = "~/.kaggle/kaggle.json") {
  # Read the Kaggle API credentials from a token file.
  #
  # Args:
  #   kgl_json_path: path to the "kaggle.json" token file created with
  #     "Create new API token" on the Kaggle account page.
  # Returns:
  #   The parsed JSON content; callers read its `username` and `key` entries.
  # Raises:
  #   An error if no file exists at `kgl_json_path`.
  json_path <- path.expand(kgl_json_path)
  if (!file.exists(json_path)) {
    stop("Kaggle credentials file not found: ", kgl_json_path, call. = FALSE)
  }
  # read the file the caller asked for, not a hard-coded location
  jsonlite::fromJSON(json_path, flatten = TRUE)
}
kgl_dataset <- function(ref, file_name, type = "dataset",
                        kgl_json_path = "~/.kaggle/kaggle.json") {
  # Download a single file from the Kaggle API and return its content.
  #
  # Args:
  #   ref: depends on 'type':
  #     - dataset: "sudalairajkumar/novel-corona-virus-2019-dataset"
  #     - competition: competition ID, e.g. 8587 for
  #       "competitive-data-science-predict-future-sales"
  #   file_name: specific dataset wanted, e.g. "covid_19_data.csv"
  #   type: either "dataset" or "competition"
  #   kgl_json_path: path to the Kaggle API token file
  # Returns:
  #   A data.frame when the response is a zip archive; otherwise whatever
  #   httr::content() parses from the response as "text/csv".
  # Raises:
  #   An error for an unknown `type` or when the HTTP request fails.

  # fail early on an unknown type instead of leaving `url` undefined
  type <- match.arg(type, c("dataset", "competition"))
  .kaggle_base_url <- "https://www.kaggle.com/api/v1"
  user <- kgl_credentials(kgl_json_path)
  endpoint <- switch(type,
    dataset = "/datasets/download/",
    competition = "/competitions/data/download/"
  )
  url <- paste0(.kaggle_base_url, endpoint, ref, "/", file_name)
  # call
  rcall <- httr::GET(
    url,
    httr::authenticate(user$username, user$key, type = "basic")
  )
  # turn HTTP errors (bad credentials, unknown file, ...) into R errors
  httr::stop_for_status(rcall)
  # content type, looked up by name rather than by position in the response
  content_type <- httr::headers(rcall)$`content-type`
  if (!is.null(content_type) && grepl("zip", content_type)) {
    # write the already downloaded archive to disk and unzip it
    temp <- tempfile(fileext = ".zip")
    on.exit(unlink(temp), add = TRUE) # remove the archive even on error
    writeBin(httr::content(rcall, as = "raw"), temp)
    data <- read.csv(unz(temp, file_name))
  } else {
    # else read as text -- note: code this better
    data <- httr::content(rcall, type = "text/csv", encoding = "ISO-8859-1")
  }
  data
}
Then you can use the credentials to download the dataset as described in the post
kgl_dataset(file_name = 'test.csv',
type = 'competition',
ref = 'house-prices-advanced-regression-techniques',
kgl_json_path = 'kaggle.json')
Alternatively you can use the unofficial R api
library(devtools)
install_github('mkearney/kaggler')
library(kaggler)
kgl_auth(creds_file = 'kaggle.json')
kgl_competitions_data_download('house-prices-advanced-regression-techniques', 'test.csv')
However this fails, due to a mistake in the implementation of kgl_api_get
function (path, ..., auth = kgl_auth())
{
r <- httr::GET(kgl_api_call(path, ...), auth)
httr::warn_for_status(r)
if (r$status_code != 200) { # <== should be "=="
...
}

I downloaded the data (which you should just do too, it's quite easy), but just in case you don't want to, I uploaded the data to Pastebin and you can run the code below. This is for their "train" dataset, downloaded from the link you provided above
data <- read.delim("https://pastebin.com/raw/aGvwwdV0", header = TRUE)

Related

Downloading images from multiple URLs in R

Thanks to everyone who helped me with my previous query!
I have another question about how to proceed to download those images utilising the loop function!
I would like to download images from my data frame which consists of URL links that point directly to a .jpg image all at once.
I've attached the current code below:
This is the current code to load the URLs
# load libraries and packages
library("rvest")
library("ralger")
library("tidyverse")
library("jpeg")
library("here")
# how many result pages to scrape
num_pages <- 5
# photos are stored relative to this directory
setwd("~/Desktop/lab/male_generic")
# istockphoto search result pages, one URL per page
search_pages <- paste0("https://www.istockphoto.com/search/2/image?alloweduse=availableforalluses&mediatype=photography&phrase=man&page=", seq_len(num_pages))
# scrape every page and flatten the per-page results into a single vector
male <- unlist(lapply(search_pages, images_preview))
I only figured out how to download one image at a time, but I would like to learn how to do it all at once:
test = "https://media.istockphoto.com/id/1028900652/photo/man-meditating-yoga-at-sunset-mountains-travel-lifestyle-relaxation-emotional-concept.jpg?s=612x612&w=0&k=20&c=96TlYdSI8POnOrcqH10GlPgOeWFjEIoY-7G_yMV4Eco="
download.file(test,'test.jpg', mode = 'wb')
num_pages <- 10 # write the number of pages you want to download
dest_dir <- "C:/Users/USUARIO/Desktop" # change it to your directory
link <- paste0("https://www.istockphoto.com/search/2/image?alloweduse=availableforalluses&mediatype=photography&phrase=man&page=", seq_len(num_pages))
# The search pages themselves are HTML, so first collect the image URLs
# they show instead of saving the pages under a .jpg name.
image_urls <- unlist(lapply(link, ralger::images_preview))
# Number the files by position so that no download overwrites another one.
dest_files <- file.path(
  dest_dir,
  sprintf("image_%04d.jpg", seq_along(image_urls))
)
for (i in seq_along(image_urls)) {
  tryCatch(
    download.file(image_urls[i], destfile = dest_files[i], mode = "wb"),
    # keep going when a single image cannot be fetched
    error = function(e) {
      message("Skipping ", image_urls[i], ": ", conditionMessage(e))
    }
  )
}

How do I name file downloads in R using data from another column in dataframe?

I have a large dataset of unique file IDs and links to download the files. It looks like this:
file_id <- c("id:fghjs12:ws8c7/syx", "id:f7gnsfu:7a6#*s", "id:dug:shxgcvu:6sh")
link <- c("https://www.dynare.org/wp-repo/dynarewp028.pdf", "https://www.dynare.org/wp-repo/dynarewp029.pdf", "https://www.dynare.org/wp-repo/dynarewp020.pdf")
df <- data.frame(file_id, link, stringsAsFactors = FALSE)
I want to download each file using the name of the handle. Some of the links are broken. So I have the following loop to do the task but it's not working..
download_documents <- function(url, file_id) {
  # Download `url` into ~/Desktop/Dataset/files/ under the name `file_id`.
  #
  # Args:
  #   url: address of the file to download.
  #   file_id: name of the file to create; it is used verbatim, so it must
  #     already be a valid file name.
  # Returns:
  #   The status returned by download.file() on success, or NA when the
  #   download raised an error or a warning (e.g. a broken link).
  destfile <- paste0("~/Desktop/Dataset/files/", file_id)
  # report the failure instead of dropping it silently, then carry on
  report_failure <- function(cond) {
    message("Could not download ", url, ": ", conditionMessage(cond))
    NA
  }
  tryCatch(
    # binary mode keeps PDFs intact on Windows
    download.file(url, destfile, mode = "wb"),
    error = report_failure,
    warning = report_failure
  )
}
Map(download_documents, df$link, df$file_id)
Does anyone know what I'm doing wrong or have a better solution? Thanks in advance for your help!
You can turn the file_id to valid names using make.names.
Map(download_documents, df$link, make.names(df$file_id))

R trying to sort url in a list depending on their existence or not

I'm working on a project collecting data from https://www.hockey-reference.com/boxscores/. I'm trying to get every table of a season. I've generated a list of URLs by combining https://www.hockey-reference.com/boxscores/ with each date of the calendar and each team name, like "https://www.hockey-reference.com/boxscores/20171005WSH.html".
I've stored every URL in a list, but some lead to a 404 error. I'm trying to use the "RCurl" package with the function "url.exists" to find out whether a URL gives a 404 error and remove it from the list. The problem is that every URL in the list (including ones that really exist) returns FALSE with url.exists in a for loop... I've also tried calling this function in the console with url.exists(my list[i]), but it returns FALSE as well.
here's my code:
library(rvest)
library(RCurl)
##### Variables ####
team_names <- c("ANA","ARI","BOS","BUF","CAR","CGY","CHI","CBJ","COL","DAL","DET","EDM","FLA","LAK","MIN","MTL","NSH","NJD","NYI","NYR","OTT","PHI","PHX","PIT","SJS","STL","TBL","TOR","VAN","VGK","WPG","WSH")
S2017 <- read.table(file = "2018_season", header = TRUE, sep = ",")
#### date formatting ####
# strip the dashes from the first column and drop duplicated dates
dates <- unique(gsub("-", "", as.character(S2017[, 1])))
##### URL generation ####
# only the first two dates are used; raise the limit to cover more dates
n_dates <- min(2, length(dates))
# one URL per (date, team) pair: all teams for the first date, then all
# teams for the next date, and so on
url_list <- as.vector(outer(
  team_names,
  dates[seq_len(n_dates)],
  function(team, date) {
    paste0("https://www.hockey-reference.com/boxscores/", date, team, ".html")
  }
))
##### keep only the URLs that exist ####
# url.exists() returns a logical only without `.header = TRUE`; with that
# flag it returns the response header, which never compares equal to TRUE
url_found <- vapply(
  url_list,
  function(u) isTRUE(url.exists(u)),
  logical(1),
  USE.NAMES = FALSE
)
url_list_raffined <- url_list[url_found]
Any idea for my problems ?
thanks
Instead of RCurl, you could use the httr package:
library(httr)
library(rvest)
library(xml2)
resp <- httr::GET(url_address, httr::timeout(60))
# only parse pages that were actually served (anything else, e.g. 404, is skipped)
if (httr::status_code(resp) == 200) {
  html <- xml2::read_html(resp)
  # html_nodes() needs a selector: "body" takes the whole page, replace it
  # with the selector of the part you are interested in
  txt <- rvest::html_text(rvest::html_nodes(html, "body")) # or similar
  # save the results somewhere or do your operations..
}
here url_address is the address you are trying to download. Maybe you need to put this in a function or loop to iterate over all your addresses.

Loop through array of PDF files online and copy text from each

I see it is super-easy to grab a PDF file, save it, and fetch all the text from the file.
library(pdftools)
download.file("http://www2.sas.com/proceedings/sugi30/085-30.pdf", "sample.pdf", mode = "wb")
txt <- pdf_text("sample.pdf")
I am wondering how to loop through an array of PDF files, based on links, download each, and scrape the text from each. I want to go to the following link.
http://www2.sas.com/proceedings/sugi30/toc.html#dp
Then I want to download each file from 'Paper 085-30:' to 'Paper 095-30:'. Finally, I want to scrape the text out of each file. How can I do that?
I would think it would be something like this, but I suspect the paste function is not setup correctly.
library(pdftools)
for(i in values){'085-30',' 086-30','087-30','088-30','089-30'
paste(download.file("http://www2.sas.com/proceedings/sugi30/"i".pdf", i".pdf", mode = "wb")sep = "", collapse = NULL)
}
You can get a list of pdfs using rvest.
library(rvest)
x <- read_html("http://www2.sas.com/proceedings/sugi30/toc.html#dp")
href <- x %>% html_nodes("a") %>% html_attr("href")
# char vector of links, use regular expression to fetch only papers
links <- href[grepl("^http://www2.sas.com/proceedings/sugi30/\\d{3}.*\\.pdf$", href)]
I've added some error handling and don't forget to put R session to sleep so you don't flood the server. In case a download is unsuccessful, the link is stored into a variable which you can investigate after the loop has finished and perhaps adapt your code or just download them manually.
# write failed links to this variable
unsuccessful <- c()
for (link in links) {
  # tryCatch returns the condition object itself when the download fails
  out <- tryCatch(
    download.file(url = link, destfile = basename(link), mode = "wb"),
    error = function(e) e,
    warning = function(w) w
  )
  # a condition carries several classes, so test it with inherits(), which
  # always yields a single TRUE/FALSE for the `if`
  if (inherits(out, c("error", "warning"))) {
    message(sprintf("Unable to download %s ?", link))
    unsuccessful <- c(unsuccessful, link)
  }
  sleep <- abs(rnorm(1, mean = 10, sd = 10))
  message(sprintf("Sleeping for %f seconds", sleep))
  Sys.sleep(sleep) # don't flood the server, sleep for a while
}

How to download multiple files using loop in R?

I have to download multiple xlsx files about a country's census data from internet using R. Files are located in this
Link .The problems are:
I am unable to write a loop which will let me go back and forth to download
File being download has some weird name not districts name. So how can I change it to districts name dynamically.
I have used the below mentioned codes:
url<-"http://www.censusindia.gov.in/2011census/HLO/HL_PCA/HH_PCA1/HLPCA-28532-2011_H14_census.xlsx"
download.file(url, "HLPCA-28532-2011_H14_census.xlsx", mode="wb")
But this downloads one file at a time and doesnt change the file name.
Thanks in advance.
Assuming you want all the data without knowing all of the URLs, your question involves web parsing. The httr package provides useful functions for retrieving the HTML code of a given website, which you can then parse for links.
Maybe this bit of code is what you're looking for:
library(httr)
base_url <- "http://www.censusindia.gov.in/2011census/HLO/" # main website
r <- GET(paste0(base_url, "HL_PCA/Houselisting-housing-HLPCA.html"))
stop_for_status(r) # nothing to parse if the index page cannot be fetched
rc <- content(r, "text")
rcl <- unlist(strsplit(rc, "<a href =\\\"")) # find links
rcl <- rcl[grepl("Houselisting-housing-.+?\\.html", rcl)] # find links to houslistings
names <- gsub("^.+?>(.+?)</.+$", "\\1", rcl) # get names
names <- gsub("^\\s+|\\s+$", "", names) # trim names
links <- gsub("^(Houselisting-housing-.+?\\.html).+$", "\\1", rcl) # get links
# iterate over regions
for (i in seq_along(links)) {
  url_hh <- paste0(base_url, "HL_PCA/", links[i])
  # fetch the page once and skip the region if the request failed
  r <- GET(url_hh)
  if (http_error(r)) next
  rc <- content(r, "text")
  rcl <- unlist(strsplit(rc, "<a href =\\\"")) # find links
  rcl <- rcl[grepl("\\.xlsx", rcl)] # find links to xlsx files
  hh_names <- gsub("^.+?>(.+?)</.+$", "\\1", rcl) # get names
  hh_names <- gsub("^\\s+|\\s+$", "", hh_names) # trim names
  hh_links <- gsub("^(.+?\\.xlsx).+$", "\\1", rcl) # get links
  # iterate over subregions
  for (j in seq_along(hh_links)) {
    url_xlsx <- paste0(base_url, "HL_PCA/", hh_links[j])
    # skip files the server does not serve
    if (http_error(url_xlsx)) next
    # name the file after the region and subregion link texts
    filename <- paste0(names[i], "_", hh_names[j], ".xlsx")
    download.file(url_xlsx, filename, mode = "wb")
  }
}

Resources