How to rename files based on their IDs - r

I'm a beginner with R programming. I have downloaded many pictures whose file names are their IDs, for example "senador588", "senador3", "senador16", and so on. Each picture shows one senator of Brazil, and I need the file name to be the senator's name instead of the ID.
I also have a data frame which contains only the ID (id_senador) and the name (name_lower).
This first part of the code downloads all the pictures:
library(data.table)
library(rvest)
library(lubridate)
library(stringr)
library(dplyr)
library(RCurl)
library(XML)
library(httr)
library(purrr)
# all the senators of Brazil
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
# get all url on the webpage
url2 <- getURL(url)
parsed <- htmlParse(url2)
links <- xpathSApply(parsed,path = "//a",xmlGetAttr,"href")
links <- do.call(rbind.data.frame, links)
colnames(links)[1] <- "links"
# filtering to get the urls of the senators
links_senador <- links %>%
  filter(links %like% "/senadores/senador/")
links_senador <- data.frame(links_senador)
# creating a new directory for the pics
setwd("~/Downloads/")
dir.create("senadores-new")
setwd("~/Downloads/senadores-new")
# running a loop to download all pictures
i <- 1
while (i <= 81) {  # loop over the 81 senators
  tryCatch({
    # defining the row of each senator
    foto_webpage <- data.frame(links_senador$links[i])
    # renaming the column
    colnames(foto_webpage) <- "links"
    # getting all images of the html page and
    # filtering the official photo we want
    html <- as.character(foto_webpage$links) %>%
      httr::GET() %>%
      xml2::read_html() %>%
      rvest::html_nodes("img") %>%
      map(xml_attrs) %>%
      map_df(~as.list(.)) %>%
      filter(src %like% "senadores/img/fotos-oficiais/") %>%
      as.data.frame()
    # downloading the photo
    foto_senador <- html$src
    download.file(foto_senador, basename(foto_senador), mode = "wb")
    Sys.sleep(3)
  }, error = function(e) NULL)
  i <- i + 1
}
This second part creates a dataframe with the ID and name of each senator:
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
file <- read_html(url)
tables <- html_nodes(file, "table")
table1 <- html_table(tables[1], fill = TRUE, header = T)
table1_df <- as.data.frame(table1)[1]
table1_df_sem_acentuacao <- as.data.frame(iconv(table1_df$Nome, from = "UTF-8", to = "ASCII//TRANSLIT"))
colnames(table1_df_sem_acentuacao) <- "senador_lower"
table1_df_lower <- as.data.frame(tolower(table1_df_sem_acentuacao$senador_lower))
colnames(table1_df_lower) <- "senador_lower"
table_name_final <- as.data.frame(gsub(" ", "-", table1_df_lower$senador_lower))
id_split <- as.data.frame(gsub("https://www25.senado.leg.br/web/senadores/senador/-/perfil/", "senador", links_senador$links))
table_dfs_final <- cbind(table_name_final, id_split)
colnames(table_dfs_final)[1] <- "name_lower"
colnames(table_dfs_final)[2] <- "id_senador"
For the loop that replaces the ID with the name, I tried this:
photos <- list.files(pattern = "\\.jpg$")  # the downloaded pictures
for (p in photos) {
  id <- gsub("\\.jpg$", "", basename(p))
  name <- table_dfs_final$name_lower[match(id, table_dfs_final$id_senador)]
  fname <- paste0(name, ".jpg")
  file.rename(p, fname)
  # optional
  cat("renaming", basename(p), "to", name, "\n")
}

To do it in a more "R way", you can use one of the functions from the apply family: create your own function that changes the names and then apply it to the id and name columns you created.
changeName <- function(old_name, new_name){
  file.rename(paste0(old_name, '.jpg'), paste0(new_name, '.jpg'))
}
mapply(changeName, table_dfs_final$id_senador, table_dfs_final$name_lower)
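Since file.rename() is itself vectorized over the old and new paths, the same rename can also be written without an explicit apply call. A minimal sketch, assuming the downloaded pictures sit in the working directory and every id_senador has a matching .jpg file:
old_files <- paste0(table_dfs_final$id_senador, ".jpg")
new_files <- paste0(table_dfs_final$name_lower, ".jpg")
ok <- file.exists(old_files)               # skip ids whose photo was not downloaded
file.rename(old_files[ok], new_files[ok])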

Related

Web Scraping URL Earnings Calendar Stock Market with respect to the date (URL Static without Date)

My code looks like this:
library(rvest)
library(dplyr)
library(stringr)
library(tibble)
te_earnings <- read_html("https://tradingeconomics.com/earnings")
te_earnings %>% html_table()
te_earnings_data <- te_earnings %>% html_table()
rm(te_earnings)
te_earnings_data <- te_earnings_data[[2]]
te_earnings_data <- te_earnings_data %>% as_tibble()
te_earnings_data
te_earnings_data <- te_earnings_data[,-c(12,13)]
new_names<- as.character(str_extract_all(te_earnings_data[1,], boundary("word")))
names(te_earnings_data)
new_names[1:2] <- c("Date","Company")
new_names <- new_names[-c(12:13)]
new_names
names(te_earnings_data) <- new_names
names(te_earnings_data)[8] <- "Previous" ; rm(new_names)
te_earnings_data <- te_earnings_data[-1,]
is_tibble(te_earnings_data)
te_earnings_data[te_earnings_data == ''] <- NA
trim <- function(x) {
  gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)
}
te_earnings_data <- apply(te_earnings_data, 2, trim)
te_earnings_data <- te_earnings_data %>% as_tibble("both")
te_earnings_data
# extracting the ticker and create new column
te_earnings_data$ticker <- NA
pattern_country_strings <- paste0(c(":US",":CN:",":JP",":BS",":MM",":IN",":AU", ":SM",":LN",":FP"), collapse="|")
te_earnings_data$ticker <- sub(".*\r\n", "", te_earnings_data$Company)
te_earnings_data$ticker <- te_earnings_data$ticker %>% str_replace(pattern_country_strings, " ")
head(te_earnings_data$ticker)
te_earnings_data$ticker <- te_earnings_data$ticker %>% str_trim()
head(te_earnings_data$ticker)
paste0(c(":US",":CN:"), collapse="|")
# Remove tickers from company
te_earnings_data$Company <- te_earnings_data$Company %>% str_replace_all(" .*" , "")
# Remove \r\n from company
te_earnings_data$Company <- te_earnings_data$Company %>% str_replace_all(paste0(c("\n","\r"), collapse="|"), "")
I am trying to create a data.frame with the earnings from the page https://tradingeconomics.com/earnings, and I would like to change the date to the last month.
For example, on Yahoo you can change the date directly in the URL:
https://de.finance.yahoo.com/eventkalender/earnings?from=2023-01-08&to=2023-01-14&day=2023-01-09
But I don't find a date in the tradingeconomics URL; even if I change the custom date on the page, nothing in the URL changes.
I tried to find the date in the source code of the page but could not find it. I don't have much experience with that.
Can anybody tell me if this is possible in general, or does it depend on the page?
Thank you.
I tried to download the page for a specific date, but the date doesn't change, and I don't know where to set it for web scraping.
EDIT:
I found a solution for Yahoo: just change the date in the URL with a for loop and paste0, for example.
url <- "https://finance.yahoo.com/calendar/earnings?from=2022-12-04&to=2022-12-10&day=2022-12-06"
download_table <- function(url) {
url_file <- GET(url)
web_page_parsed <- htmlParse(url_file)
tables <- readHTMLTable(web_page_parsed)
}
url_file <- GET(url)
web_page_parsed <- htmlParse(url_file)
tables <- readHTMLTable(web_page_parsed)
print(head(tables))
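A hedged sketch of that loop, reusing the download_table() function above and assuming the from/to/day query parameters seen in the URL keep working (Yahoo may change the page layout or parameters at any time):
start_dates <- seq(as.Date("2022-12-04"), as.Date("2023-01-01"), by = "week")
all_tables <- list()
for (i in seq_along(start_dates)) {
  d <- start_dates[i]                      # indexing keeps the Date class
  u <- paste0("https://finance.yahoo.com/calendar/earnings?from=", d,
              "&to=", d + 6, "&day=", d + 2)
  all_tables[[as.character(d)]] <- download_table(u)
  Sys.sleep(2)                             # be polite between requests
}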

Google Search in R [duplicate]

I used the following code:
library(XML)
library(RCurl)
getGoogleURL <- function(search.term, domain = '.co.uk', quotes = TRUE)
{
  search.term <- gsub(' ', '%20', search.term)
  if(quotes) search.term <- paste('%22', search.term, '%22', sep = '')
  getGoogleURL <- paste('http://www.google', domain, '/search?q=',
                        search.term, sep = '')
}
getGoogleLinks <- function(google.url)
{
  doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
  nodes <- getNodeSet(html, "//a[@href][@class='l']")
  return(sapply(nodes, function(x) xmlAttrs(x)[[1]]))
}
search.term <- "cran"
quotes <- "FALSE"
search.url <- getGoogleURL(search.term=search.term, quotes=quotes)
links <- getGoogleLinks(search.url)
I would like to find all the links that resulted from my search and I get the following result:
> links
list()
How can I get the links?
In addition, I would like to get the headlines and summaries of the Google results. How can I get those?
And finally, is there a way to get the links that reside in the ChillingEffects.org results?
If you look at the html variable, you can see that the search result links are all nested in <h3 class="r"> tags.
Try to change your getGoogleLinks function to:
getGoogleLinks <- function(google.url) {
  doc <- getURL(google.url, httpheader = c("User-Agent" = "R (2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
  nodes <- getNodeSet(html, "//h3[@class='r']//a")
  return(sapply(nodes, function(x) xmlAttrs(x)[["href"]]))
}
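For the headlines, the same node set can be reused with xmlValue instead of the href attribute. A rough sketch, assuming Google still wraps result titles in <h3 class="r"> (the markup changes frequently, so the selector may need updating):
getGoogleHeadlines <- function(google.url) {
  doc <- getURL(google.url, httpheader = c("User-Agent" = "R (2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
  # xmlValue returns the visible text of each result title node
  sapply(getNodeSet(html, "//h3[@class='r']"), xmlValue)
}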
I created this function to read in a list of company names and then get the top website result for each. It will get you started; you can then adjust it as needed.
# libraries (URLencode() comes from base R's utils package, so no extra package is needed for it)
library(rvest)
# load data
d <- read.csv("P:\\needWebsites.csv")
c <- as.character(d$Company.Name)
# function for getting a website
getWebsite <- function(name)
{
  url <- URLencode(paste0("https://www.google.com/search?q=", name))
  page <- read_html(url)
  results <- page %>%
    html_nodes("cite") %>% # get all nodes of type cite; you can change this to grab other node types
    html_text()
  result <- results[1]
  return(as.character(result)) # return results if you want to see them all
}
# apply the function to a list of company names
websites <- data.frame(Website = sapply(c, getWebsite))
The other solutions here don't work for me. Here's my take on @Bryce-Chamberlain's answer, which works for me as of August 2019; it also answers another closed question: company name to URL in R.
# install.packages("rvest")
get_first_google_link <- function(name, root = TRUE) {
  url <- URLencode(paste0("https://www.google.com/search?q=", name))
  page <- xml2::read_html(url)
  # extract all links
  nodes <- rvest::html_nodes(page, "a")
  links <- rvest::html_attr(nodes, "href")
  # extract first link of the search results
  link <- links[startsWith(links, "/url?q=")][1]
  # clean it
  link <- sub("^/url\\?q\\=(.*?)\\&sa.*$", "\\1", link)
  # get root if relevant
  if (root) link <- sub("^(https?://.*?/).*$", "\\1", link)
  link
}
companies <- data.frame(company = c("apple acres llc","abbvie inc","apple inc"))
companies <- transform(companies, url = sapply(company,get_first_google_link))
companies
#> company url
#> 1 apple acres llc https://www.appleacresllc.com/
#> 2 abbvie inc https://www.abbvie.com/
#> 3 apple inc https://www.apple.com/
Created on 2019-08-10 by the reprex package (v0.2.1)
The free solutions don't work anymore, and they don't let you search for regions outside your location. Here's a solution using the Google Custom Search API. The API allows 100 free calls per day, and one call returns only 10 results, so the function below returns just the first page of 10 results.
Google.Search.API <- function(keyword, google.key, google.cx, country = "us")
{
  # keyword = keywords[10]; country = "us"
  url <- paste0("https://www.googleapis.com/customsearch/v1?"
                , "key=", google.key
                , "&q=", gsub(" ", "+", keyword)
                , "&gl=", country  # country
                , "&hl=en"         # language from browser, English
                , "&cx=", google.cx
                , "&fields=items(link)"
  )
  d2 <- url %>%
    httr::GET(ssl.verifypeer = TRUE) %>%
    httr::content(.) %>% .[["items"]] %>%
    data.table::rbindlist(.) %>%
    mutate(keyword, SERP = row_number(), search.engine = "Google API") %>%
    rename(source = link) %>%
    select(search.engine, keyword, SERP, source)
  pause <- round(runif(1, min = 1.1, max = 5), 1)
  if(nrow(d2) == 0)
  {cat("\nPausing", pause, "seconds. Failed for:", keyword)} else
  {cat("\nPausing", pause, "seconds. Successful for:", keyword)}
  Sys.sleep(pause)
  rm(keyword, country, pause, url, google.key, google.cx)
  return(d2)
}
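A usage sketch; my_key and my_cx are hypothetical placeholders for your own API key and Custom Search Engine id, and the function above assumes httr, dplyr and data.table are loaded:
library(httr)
library(dplyr)
library(data.table)
my_key <- "YOUR-API-KEY"            # hypothetical placeholder
my_cx  <- "YOUR-SEARCH-ENGINE-ID"   # hypothetical placeholder
res <- Google.Search.API("cran r project", google.key = my_key, google.cx = my_cx)
res  # one row per result: search.engine, keyword, SERP, source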

How to download multiple files with the same name from html page?

I want to download all the files named "listings.csv.gz" that refer to US cities from http://insideairbnb.com/get-the-data.html. I can do it by writing out each link, but is it possible to do it in a loop?
In the end I'll keep only a few columns from each file and merge them into one file.
Since the problem was solved thanks to @CodeNoob, I'd like to share how it all worked out:
library(rvest)
library(dplyr)
library(purrr)
page <- read_html("http://insideairbnb.com/get-the-data.html")
# Get all hrefs (i.e. all links present on the website)
links <- page %>%
  html_nodes("a") %>%
  html_attr("href")
# Filter for listings.csv.gz, USA cities, data for March 2019
wanted <- grep('listings.csv.gz', links)
USA <- grep('united-states', links)
wanted.USA = wanted[wanted %in% USA]
wanted.links <- links[wanted.USA]
wanted.links = grep('2019-03', wanted.links, value = TRUE)
wanted.cols = c("host_is_superhost", "summary", "host_identity_verified", "street",
"city", "property_type", "room_type", "bathrooms",
"bedrooms", "beds", "price", "security_deposit", "cleaning_fee",
"guests_included", "number_of_reviews", "instant_bookable",
"host_response_rate", "host_neighbourhood",
"review_scores_rating", "review_scores_accuracy","review_scores_cleanliness",
"review_scores_checkin" ,"review_scores_communication",
"review_scores_location", "review_scores_value", "space",
"description", "host_id", "state", "latitude", "longitude")
read.gz.url <- function(link) {
  con <- gzcon(url(link))
  df <- read.csv(textConnection(readLines(con)))
  close(con)
  df <- df %>% select(wanted.cols) %>%
    mutate(source.url = link)
  df
}
all.df = list()
for (i in seq_along(wanted.links)) {
  all.df[[i]] = read.gz.url(wanted.links[i])
}
all.df = map(all.df, as_tibble)
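Since the goal was to merge everything into one file, the list can then be collapsed into a single data frame. A small sketch assuming dplyr is loaded; the output file name is just an example:
combined <- dplyr::bind_rows(all.df)    # stack all city tables into one tibble
write.csv(combined, "listings_usa_2019-03.csv", row.names = FALSE)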
You can actually extract all links, filter for the ones containing listings.csv.gz and then download these in a loop:
library(rvest)
library(dplyr)
# Get all download links
page <- read_html("http://insideairbnb.com/get-the-data.html")
# Get all hrefs (i.e. all links present on the website)
links <- page %>%
  html_nodes("a") %>%
  html_attr("href")
# Filter for listings.csv.gz
wanted <- grep('listings.csv.gz', links)
wanted.links <- links[wanted]
for (link in wanted.links) {
  con <- gzcon(url(link))
  txt <- readLines(con)
  df <- read.csv(textConnection(txt))
  # do what you want with df here
}
Example: Download and combine the files
To get the result you want, I would suggest writing a download function that filters for the columns you want and then combining the results into a single data frame, for example something like this:
read.gz.url <- function(link) {
  con <- gzcon(url(link))
  df <- read.csv(textConnection(readLines(con)))
  close(con)
  df <- df %>% select(c('calculated_host_listings_count_shared_rooms', 'cancellation_policy')) %>% # random columns I chose
    mutate(source.url = link) # you may need to remember the origin of each row
  df
}
all.df <- do.call('rbind', lapply(head(wanted.links,2), read.gz.url))
Note: I only tested this on the first two files, since they are pretty large.

Trying to webscrape an unchanging URL with data spread over pages

I am new to web scraping. The URL I am working with is https://tsmc.tripura.gov.in/doc_list. At present, I am able to extract data from the first page. Since the URL is unchanging, I don't have an identifier for the other pages to create a loop for the data table extraction.
Here is my code:
install.packages("XML")
install.packages("RCurl")
install.packages("rlist")
install.packages("bitops")
library(bitops)
library(XML)
library(RCurl)
library(rlist)  # needed for list.clean()
url1 <- getURL("https://tsmc.tripura.gov.in/doc_list",
               .opts = list(ssl.verifypeer = FALSE))
table1 <- readHTMLTable(url1)
table1 <- list.clean(table1, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(table1, function(t) dim(t)[1]))
table1[[which.max(n.rows)]]
View(table1)
table11 <- table1[["NULL"]]
Please help. Thanks!
Perhaps try this solution:
url <- "https://tsmc.tripura.gov.in/doc_list?page="
sq <- seq(1, 30) # There appears to be 30 pages so we create a sequence of 1:30 results
links <- paste0(url, sq) #Paste the sequence after the url "page="
store <- NULL
tbl <- NULL
library(rvest) #extract the tables
for(i in links){
store[[i]] = read_html(i)
tbl[[i]] = html_table(store[[i]])
}
library(plyr)
df <- ldply(tbl, data.frame) #combine the list of data frames into one large data frame
df$`.id` <- gsub("https://tsmc.tripura.gov.in/doc_list?page=", " ", df$`.id`, fixed = TRUE)
This gives 846 observations across 8 variables.
EDIT: I found that the first URL does not take the page sequence. To add the first page and rbind it with the rest of the data, use the following:
firsturl <- "https://tsmc.tripura.gov.in/doc_list"
first_store = read_html(firsturl)
first_tbl = html_table(first_store)
first_df <- as.data.frame(first_tbl)
first_df$`.id` <- 0
df2 <- rbind(first_df, df)

Use R to do web Crawler and it can not capture content I need(text mining)(Taiwanese BBS, ptt)

This is Joe from National Taipei University of Business, Taiwan. I'm currently doing research on online games and e-sports by text mining social media. I chose to get the data from the most popular BBS in Taiwan, "PTT", but it seems my code can only capture the article titles and cannot reach the contents.
I tried to get the texts from www.ptt.cc/bbs/LoL/index6402.html down to index6391, and the code I used is below.
install.packages("httr")
install.packages("XML")
install.packages("RCurl")
install.packages("xml2")
library(httr)
library(XML)
library(RCurl)
library(xml2)
data <- list()
for (i in 6391:6402) {
  tmp <- paste(i, '.html', sep = '')
  url <- paste('https://www.ptt.cc/bbs/LoL/index', tmp, sep = '')
  tmp <- read_html(url)
  html <- htmlParse(getURL(url))
  url.list <- xml_find_all(tmp, "//div[@class='title']/a[@href]")
  data <- rbind(data, as.matrix(paste('https://www.ptt.cc', url.list, sep = '')))
}
data <- unlist(data)
getdoc <- function(line){
  start <- regexpr('https://www', line)[1]
  end <- regexpr('html', line)[1]
  if (start != -1 & end != -1) {
    url <- substr(line, start, end + 3)
    html <- htmlParse(getURL(url), encoding = 'UTF-8')
    doc <- xpathSApply(html, "//div[@id='main-content']", xmlValue)
    name <- strsplit(url, '/')[[1]][4]
    write(doc, gsub('html', 'txt', name))
  }
}
setwd("E:/data")
sapply(data, getdoc)
But this code can only capture the titles, and my txt files are empty. I'm not sure which part went wrong, so I need some advice from you at Stack Overflow.
Any advice will be very much appreciated; anyone helping me with this will be in the acknowledgements of my thesis, and if you're curious, I will let you know the research results after they are done. :)
Something like:
library(tidyverse)
library(rvest)
# change the end number
pages <- map(6391:6392, ~read_html(sprintf("https://www.ptt.cc/bbs/LoL/index%d.html", .)))
map(pages, ~xml_find_all(., "//div[@class='title']/a[@href]")) %>%
  map(xml_attr, "href") %>%
  flatten_chr() %>%
  map_df(function(x) {
    URL <- sprintf("https://www.ptt.cc%s", x)
    pg <- read_html(URL)
    data_frame(
      url = URL,
      text = html_nodes(pg, xpath = "//div[@id='main-content']") %>% html_text()
    )
  }) -> df
glimpse(df)
## Observations: 40
## Variables: 2
## $ url <chr> "https://www.ptt.cc/bbs/LoL/M.1481947445.A.17B.html", "https://www.ptt.cc/b...
## $ text <chr> "作者rainnawind看板LoL標題[公告] LoL 板 開始舉辦樂透!時間Sat Dec 17 12:04:03 2016\nIMT KDM 勝...
to make a data frame, or sub out the last part with:
dir.create("pttdocs")
map(pages, ~xml_find_all(., "//div[@class='title']/a[@href]")) %>%
  map(xml_attr, "href") %>%
  flatten_chr() %>%
  walk(function(x) {
    URL <- sprintf("https://www.ptt.cc%s", x)
    basename(x) %>%
      tools::file_path_sans_ext() %>%
      sprintf(fmt = "%s.txt") %>%
      file.path("pttdocs", .) -> fil
    pg <- read_html(URL)
    html_nodes(pg, xpath = "//div[@id='main-content']") %>%
      html_text() %>%
      writeLines(fil)
  })
to write files to a directory.
