I am new to web scraping and have tried several methods to scrape multiple pages with rvest. It is still not working: I only get 15 results instead of the 207 products listed in this category. What am I doing wrong?
# Scrape product titles and prices from pages 1 to 5 of the category listing
# at the URL below and combine them into one table.
library(rvest)
# Placeholder value; it is replaced inside the loop below.
all_df<-0
library(data.table)
for(i in 1:5){
# Build the URL of page i, then download and parse that page.
url_fonq <- paste0("https://www.fonq.nl/producten/categorie-lichtbronnen/?p=",i,sep="")
webpage_fonq <- read_html(url_fonq)
# NOTE(review): a bare expression inside a for loop is not auto-printed, so
# this and the other head()/length() calls below show nothing when run.
head(webpage_fonq)
# Text of every node matching the CSS selector '.product-title'.
product_title_data_html <- html_nodes(webpage_fonq, '.product-title')
product_title_data <- html_text(product_title_data_html)
head(product_title_data)
# Strip every newline and every space from the scraped titles.
product_title_data<-gsub("\n","",product_title_data)
product_title_data<-gsub(" ","",product_title_data)
head(product_title_data)
length(product_title_data)
# Same extraction and cleaning for the nodes matching '.product-price'.
product_price_data_html <- html_nodes(webpage_fonq, '.product-price')
product_price_data <- html_text(product_price_data_html)
head(product_price_data)
product_price_data<-gsub("\n","",product_price_data)
product_price_data<-gsub(" ","",product_price_data)
head(product_price_data)
product_price_data
length(product_price_data)
# One row per product on this page.
fonq.df <- data.frame(Procuct_title = product_title_data, Price = product_price_data)
# NOTE(review): this assignment replaces all_df on every iteration, so after
# the loop it holds only the data frame built from the last page (i = 5).
all_df <-list(fonq.df)
}
# Stack the data frames stored in all_df into one table and open it in the viewer.
final2<-rbindlist(all_df,fill = TRUE)
View(final2)
The problem is that you keep only the data scraped from the last page of the website, so only the last 15 products end up stored.
So instead of overwriting the all_df variable in every iteration
all_df <- list(fonq.df)
append the fonq.df dataframe at the end of the all_df:
all_df <- bind_rows(all_df, fonq.df)
Here is my complete solution:
# Scrape product titles and prices from pages 1 to 5 of the category listing
# and append each page's rows to one growing data frame.
library(rvest)
library(dplyr)

# Remove every newline and every space from scraped text.
strip_whitespace <- function(x) {
  gsub(" ", "", gsub("\n", "", x, fixed = TRUE), fixed = TRUE)
}

all_df <- list()
for (i in 1:5) {
  # paste0() has no separator argument, so the page number is simply appended.
  url_fonq <- paste0("https://www.fonq.nl/producten/categorie-lichtbronnen/?p=", i)
  webpage_fonq <- read_html(url_fonq)

  product_title_data_html <- html_nodes(webpage_fonq, ".product-title")
  product_title_data <- strip_whitespace(html_text(product_title_data_html))

  product_price_data_html <- html_nodes(webpage_fonq, ".product-price")
  product_price_data <- strip_whitespace(html_text(product_price_data_html))

  # data.frame() needs exactly one price per title; fail with a clear message
  # if the two selectors matched a different number of nodes on this page.
  if (length(product_title_data) != length(product_price_data)) {
    stop(
      "Page ", i, ": found ", length(product_title_data), " titles but ",
      length(product_price_data), " prices",
      call. = FALSE
    )
  }

  # Column name "Procuct_title" (sic) is kept unchanged so existing code that
  # refers to it keeps working.
  fonq.df <- data.frame(Procuct_title = product_title_data, Price = product_price_data)

  # Append this page's rows to everything collected so far.
  all_df <- bind_rows(all_df, fonq.df)
}
View(all_df)
Related
I'm trying to scrape the entire table of this website: https://sineb.mineducacion.gov.co/bcol/app
I need all records for the filter: Departamento:=BOGOTÁ, D.C.
I'm able to get the table on the first page, but not the rest of the table in pages 2 to 20.
library(tidyverse)
library(rvest)

# Open a session on the search page and take the first form found on it.
sineb <- html_session("https://sineb.mineducacion.gov.co/bcol/app")
my_form <- html_form(sineb)[[1]]

# All "departamento" options except the first; keep those whose name matches "D.C".
dept <- my_form$fields$departamento$options[-1]
is_bogota <- grepl("D.C", names(dept))
bogota <- dept[is_bogota]

# Select the first matching option and submit the form with the "consultar" button.
my_form <- set_values(my_form, departamento = bogota[1])
sineb <- submit_form(sineb, my_form, "consultar")

# Parse every table on the result page; the fourth one is kept as a data frame.
df_list <- html_table(sineb, header = TRUE, trim = TRUE, fill = TRUE)
table <- as.data.frame(df_list[[4]])
Thanks!
Let me first note that I used the updated syntax of rvest (See Functions renamed in rvest 1.0.0)
Your approach is pretty good. Using session_follow_link completes the solution: loop through the pages and select each page link using XPath:
library(tidyverse)
library(rvest)

# Open a session on the search page and take the first form found on it.
sineb <- session("https://sineb.mineducacion.gov.co/bcol/app")
my_form <- html_form(sineb)[[1]]

# All "departamento" options except the first one.
dept <- my_form$fields$departamento$options[-1]
# fixed = TRUE matches the literal text "D.C"; as a regex the dot would match
# any character.
bogota <- dept[grep("D.C", names(dept), fixed = TRUE)]
my_form <- html_form_set(my_form, departamento = bogota[1])
sineb <- session_submit(sineb, my_form, "consultar")

# Number of result pages to walk through (page 1 is the submitted form's result).
last_page <- 20L

# Collect one data frame per page and bind them once at the end, instead of
# copying the growing result on every iteration.
pages <- vector("list", last_page)
df_list <- html_table(sineb, header = TRUE, trim = TRUE)
pages[[1L]] <- as.data.frame(df_list[[4]]) # the fourth table on the page is kept

for (next_page in seq_len(last_page)[-1L]) {
  # Follow the pager link whose text is the page number; the session must be
  # updated so the next iteration starts from the page just loaded.
  sineb <- session_follow_link(sineb, xpath = paste0("//a[text() = '", next_page, "']"))
  df_list <- html_table(sineb, header = TRUE, trim = TRUE)
  pages[[next_page]] <- as.data.frame(df_list[[4]])
}

results <- do.call(rbind, pages)
I have an excel file that contains certain keywords that need to be searched in google through R.
The output to be created is a data frame which contains the following variables:
Keyword;Position(position of the url in the search results);Title(title of the ith search result);Text(text in that search result);URL;Domain
The keywords and some example of the output are given in the link below:
https://drive.google.com/file/d/1AM3d5Hbf5nBpbRG1ydnZM7ZG2AdUyy-6/view?usp=sharing
(Sheet 1 has the keywords and sheet 2 has the sample output)
I tried to create a similar output but there seems to be an error.
Code:
# Web Scraping in R
# For each keyword in an Excel sheet, run a Google search and try to record
# one row (keyword, position, title, text, URL, domain) per result link.
library(XML)
library(RCurl)
library(dplyr)
library(rvest)
library(urltools)
library(htm2txt)
library(readxl)
data <- read_excel(file.choose()) # Importing the data
# Empty 6-column frame that result rows are appended to.
output <- data.frame(matrix(ncol=6,nrow=0))
colnames(output) <- c("Name","Position","Title","Text","URL","Domain")
for (i in 1:nrow(data)) {
# Keyword taken from the first column of row i.
# NOTE(review): read_excel() presumably returns a tibble, in which case
# data[i,1] is a one-cell tibble rather than a plain string -- TODO confirm.
search.term <- data[i,1]
# Build a Google search URL for a term. This definition sits inside the loop,
# so it is re-created on every iteration.
getGoogleURL <- function(search.term, domain = '.com', quotes=TRUE)
{
search.term <- gsub(' ', '%20', search.term) # Cleaning the Search Term
# Wrap the term in %22 (URL-encoded double quotes) when quotes is true.
if(quotes) search.term <- paste('%22', search.term, '%22', sep='')
# The value of this final assignment is what the function returns.
getGoogleURL <- paste('http://www.google', domain, '/search?q=',
search.term, sep='')
}
# NOTE(review): this is the string "False", not the logical FALSE; it is the
# value passed to if(quotes) above.
quotes <- "False"
search.url <- getGoogleURL(search.term=search.term, quotes=quotes)
page <- read_html(search.url)
# href attribute of every <a> node on the result page.
links <- page %>% html_nodes("a") %>% html_attr("href")
# Keep hrefs that start with "/url?q=", then strip that prefix and everything
# from "&sa" onward.
# NOTE(review): html_attr() presumably yields NA for anchors without an href,
# and indexing with NA keeps NA entries -- verify.
link <- links[startsWith(links, "/url?q=")]
link <- sub("^/url\\?q\\=(.*?)\\&sa.*$","\\1", link)
for (j in 1:length(link)) {
# Download result j and collect the six fields for its row.
page1 <- read_html(link[j])
name <- data[i,1]
position <- j
title <- page1 %>% html_node("title") %>% html_text()
text <- gettxt(link[j])
url <- link[j]
domain <- suffix_extract(domain(link[j]))$host
# NOTE(review): if any piece is not a single atomic value (see the note on
# data[i,1] above), vect is not a plain length-6 vector and the rbind() below
# may not line up with the columns of output.
vect <- c(name,position,title,text,url,domain)
output <- rbind(output,vect)
}
}
The error being shown is:
Error in match.names(clabs, nmi) : names do not match previous names
Please help, I'm new to R.
That error comes from rbind when the columns don't line up perfectly. For instance, if there is a missing or extra column. In this case, it might be because one of your vect variables is empty/NULL or length over 1.
rbind(data.frame(a=1,b=2), data.frame(b=3))
# Error in rbind(deparse.level, ...) :
# numbers of columns of arguments do not match
Iteratively adding rows to a frame gets expensive: R makes a complete copy of the frame every time a row is added, which is grossly inefficient. It's generally better to append to a list and convert it into a frame in one call.
# Collect one 6-element record per search result in a list, then build the
# data frame in a single call after the loops.
out <- list()
for (i in seq_len(nrow(data))) {
  # ...
  for (j in seq_along(link)) {
    # ...
    vect <- c(name, position, title, text, url, domain)
    # Placeholder check: every record must have exactly six fields.
    stopifnot(length(vect) == 6L)
    out <- c(out, list(vect))
  }
}
# Assign to `output`, the same object whose column names are set on the next
# line; the combined frame was previously stored under a misspelled name.
output <- do.call(rbind.data.frame, out)
colnames(output) <- c("Name", "Position", "Title", "Text", "URL", "Domain")
(In reality, instead of stopifnot, one might record the url and data retrieved into a different list for forensic purposes. Or find the missing element and NA it before adding to the list. Either way, stopifnot is intended here as a placeholder for something more contextually relevant to you and your process.)
I am trying to scrape a website using the following:
industryurl <- "https://finance.yahoo.com/industries"
library(rvest)
# Parse every table found on the industries page.
read <- html_table(read_html(industryurl))
library(plyr)
# Stack the tables into one data frame, then drop its first row.
industries <- ldply(read, data.frame)
industries <- industries[-1, ]
# Download the page again and collect the href of every <a> node.
read <- read_html(industryurl)
industryurls <- read %>%
  html_nodes("a") %>%
  html_attr("href")
# Keep the hrefs containing "/industry/" and prefix them with the site root.
links <- grep("/industry/", industryurls, value = TRUE)
industryurl <- "https://finance.yahoo.com"
links <- paste0(industryurl, links)
links
##############################################################################################
# For each link, keep the parsed page in `store` and its tables in `tbl`,
# both keyed by the link itself.
store <- NULL
tbl <- NULL
for (i in links) {
  store[[i]] <- read_html(i)
  tbl[[i]] <- html_table(store[[i]])
}
#################################################################################################
I am mostly interested in the code between ########## and I want to apply a function instead of a for loop since I am running into time out issues with yahoo and I want to make it more human like to extract this data (it is not too much).
My question is, how can I take links apply a function and set a sort of delay timer to read in the contents of the for loop?
I can paste my own version of the for loop which does not work.
This is the function I came up with
##Wait, then download one page and return the tables found on it
##First argument is the link you need
##The second argument is the total time for Sys.sleep
extract_function <- function(define_link, define_time) {
  # message() is used for status output so it does not mix with printed results.
  message("The system will stop for: ", define_time, " seconds")
  Sys.sleep(define_time)
  first <- read_html(define_link)
  # A space now separates the text from the link in this message.
  message("It will now return the table for link ", define_link)
  html_table(first)
}
##I added the following tryCatch function
##It returns the result of extract_function, or NA (with a warning that names
##the link and the error) when extract_function fails
link_try_catch <- function(define_link, define_time) {
  tryCatch(
    extract_function(define_link, define_time),
    error = function(e) {
      # Report the failure instead of hiding it, but still return NA so a
      # surrounding lapply() keeps going.
      warning(
        "Could not read ", define_link, ": ", conditionMessage(e),
        call. = FALSE
      )
      NA
    }
  )
}
##You can now retrieve the data using the links vector in two ways
##Picking the first ten, so it should not crash on link 5
##head() takes at most ten links, so a shorter vector produces no NA entries
p <- lapply(head(links, 10), function(link) link_try_catch(link, 1))
##OR (I subset the vector just for demo purposes)
##This variant calls extract_function directly, so an error stops the run
p2 <- lapply(head(links, 10), function(link) extract_function(link, 1))
Hope it helps
I am new to web scraping. The URL I am working with is https://tsmc.tripura.gov.in/doc_list. At present, I am able to extract data from the first page. Since the URL does not change between pages, I don't have an identifier for the other pages with which to build a loop for extracting the data tables.
Here is my code:
# Install the required packages only when they are missing, so the script
# does not reinstall them on every run.
for (pkg in c("XML", "RCurl", "rlist", "bitops")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(bitops)
library(XML)
library(RCurl)
library(rlist) # provides list.clean(), used below

# Download the page source as text.
# NOTE(review): ssl.verifypeer = FALSE turns off certificate verification;
# confirm this is acceptable for this site.
url1 <- getURL("https://tsmc.tripura.gov.in/doc_list",
  .opts = list(ssl.verifypeer = FALSE)
)
# Parse every HTML table on the page and drop the NULL entries.
table1 <- readHTMLTable(url1)
table1 <- list.clean(table1, fun = is.null, recursive = FALSE)
# Row count of each table; show the table with the most rows.
n.rows <- unlist(lapply(table1, function(t) dim(t)[1]))
table1[[which.max(n.rows)]]
View(table1)
# Extract the list element whose name is the string "NULL".
table11 <- table1[["NULL"]]
Please help. Thanks!
Perhaps try this solution:
library(rvest) # extract the tables
library(plyr)

url <- "https://tsmc.tripura.gov.in/doc_list?page="
sq <- seq_len(30) # There appear to be 30 pages, so we create the sequence 1:30
links <- paste0(url, sq) # Paste the sequence after the url "page="

# Download every page once, keyed by its link, then parse the tables on each.
store <- lapply(links, read_html)
names(store) <- links
tbl <- lapply(store, html_table)

# Combine the list of data frames into one large data frame; ldply() stores
# each element's name (the link) in the `.id` column.
df <- ldply(tbl, data.frame)
# Reduce `.id` to the bare page number. The prefix is removed outright rather
# than replaced by a space, so no leading blank is left behind.
df$`.id` <- sub(url, "", df$`.id`, fixed = TRUE)
Which gives 846 observations across 8 variables.
EDIT: I found that the first url does not have a sequence. In order to add the first page and rbind it with the rest of the data use the following:
# Scrape the un-numbered first page separately, tag its rows with `.id` 0,
# and put them in front of the rows already collected in `df`.
firsturl <- "https://tsmc.tripura.gov.in/doc_list"
first_store <- read_html(firsturl)
first_tbl <- html_table(first_store)
first_df <- as.data.frame(first_tbl)
first_df[[".id"]] <- 0
df2 <- rbind(first_df, df)
I'm looking for some assistance in writing some R code to iterate through rows in a dataframe and pass the values in each row to a function and print the output either to an excel file, txt file or just in the console.
The purpose of this is to automate a bunch of distance/time queries (several hundred) to google maps using the function found at this website: http://www.nfactorialanalytics.com/r-vignette-for-the-week-finding-time-distance-between-two-places/
The function on that website is as follows:
library(XML)
library(RCurl)
# Query the Google Distance Matrix XML endpoint for one origin/destination
# pair and return list(time = ..., dist = ...).
# The raw duration is divided by 60 and the raw distance by 1000 --
# presumably seconds to minutes and metres to kilometres; confirm against the API.
distance2Points <- function(origin, destination) {
  request <- paste0(
    'http://maps.googleapis.com/maps/api/distancematrix/xml?origins=', origin,
    '&destinations=', destination,
    '&mode=driving&sensor=false'
  )
  doc <- xmlParse(getURL(request))

  # Text of the <value> child of the first node matching an XPath query.
  first_value <- function(query) {
    xmlValue(xmlChildren(xpathApply(doc, query)[[1]])$value)
  }
  dist_raw <- first_value("//distance")
  time_raw <- first_value("//duration")

  dist_num <- as.numeric(sub(" km", "", dist_raw))
  time_scaled <- as.numeric(time_raw) / 60

  list(time = time_scaled, dist = dist_num / 1000)
}
The dataframe will contain two columns: origin postal code and destination postal code (Canada, eh?). I'm a beginner R programmer, so I know how to use read.table to load a txt file into a dataframe. I'm just not sure how to iterate through the dataframe, each time passing values to the distance2Points function and executing it. I think this can be done using either a for loop or one of the apply calls?
Thanks for the help!
edit:
To keep it simple, let's assume I want to transform these two vectors into a dataframe
> a <- c("L5B4P2","L5B4P2")
> b <- c("M5E1E5", "A2N1T3")
> postcodetest <- data.frame(a,b)
> postcodetest
a b
1 L5B4P2 M5E1E5
2 L5B4P2 A2N1T3
How should I go about iterating over these two rows to return both distances and times from the distance2Points function?
Here's one way to do it, using lapply to produce a list with the results for each row in your data and using Reduce(rbind, [yourlist]) to concatenate that list into a data frame whose rows correspond to the ones in your original. To make this work, we also have to tweak the code in the original function to return a one-row data frame, so I've done that here.
# Query the Google Distance Matrix XML endpoint for one origin/destination
# pair and return a one-row data frame with columns `time` (raw duration / 60)
# and `distance` (raw distance / 1000).
# Both columns are NA when the response contains no matching node.
distance2Points <- function(origin, destination) {
  # Packages are referenced with `::` instead of require(), which only returns
  # FALSE when a package is missing and lets the function fail later.
  # URLencode() escapes characters that are not valid in a URL (e.g. the space
  # in "L5B 4P2"); as.character() lets factor columns be passed as well.
  xml.url <- paste0(
    "http://maps.googleapis.com/maps/api/distancematrix/xml?origins=",
    utils::URLencode(as.character(origin)),
    "&destinations=", utils::URLencode(as.character(destination)),
    "&mode=driving&sensor=false"
  )
  xmlfile <- XML::xmlParse(RCurl::getURL(xml.url))

  # Text of the <value> child of the first node matching `path`, or NA when
  # nothing matches, so one failed lookup does not abort a whole batch.
  first_value <- function(path) {
    nodes <- XML::xpathApply(xmlfile, path)
    if (length(nodes) == 0L) {
      return(NA_character_)
    }
    XML::xmlValue(XML::xmlChildren(nodes[[1]])$value)
  }

  dist <- first_value("//distance")
  time <- first_value("//duration")
  distance <- as.numeric(sub(" km", "", dist)) / 1000
  time <- as.numeric(time) / 60
  # this gives you a one-row data frame instead of a list, b/c it's easy to rbind
  data.frame(time = time, distance = distance)
}
# now apply that function rowwise to your data, using lapply, and roll the results
# into a single data frame using Reduce(rbind)
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# seq(0) would yield c(1, 0).
results <- Reduce(rbind, lapply(seq_len(nrow(postcodetest)), function(i) {
  distance2Points(postcodetest$a[i], postcodetest$b[i])
}))
Result when applied to your sample data:
> results
time distance
1 27.06667 27.062
2 1797.80000 2369.311
If you would prefer to do this without creating a new object, you could also write separate functions for computing time and distance -- or a single function with those outputs as options -- and then use sapply or just mutate to create new columns in your original data frame. Here's how that might look using sapply:
# Query the Google Distance Matrix XML endpoint for one origin/destination
# pair and return a single number chosen by `output`:
#   "distance" - raw distance / 1000
#   "time"     - raw duration / 60
# Any other `output` returns NA. NA is also returned when the response
# contains no matching node.
distance2Points <- function(origin, destination, output) {
  # Unsupported selector: answer NA right away instead of making a request
  # whose result would be thrown away.
  if (!output %in% c("distance", "time")) {
    return(NA)
  }

  # Packages are referenced with `::` instead of require(), which only returns
  # FALSE when a package is missing and lets the function fail later.
  # URLencode() escapes characters that are not valid in a URL; as.character()
  # lets factor columns be passed as well.
  xml.url <- paste0(
    "http://maps.googleapis.com/maps/api/distancematrix/xml?origins=",
    utils::URLencode(as.character(origin)),
    "&destinations=", utils::URLencode(as.character(destination)),
    "&mode=driving&sensor=false"
  )
  xmlfile <- XML::xmlParse(RCurl::getURL(xml.url))

  # Text of the <value> child of the first node matching `path`, or NA when
  # nothing matches.
  first_value <- function(path) {
    nodes <- XML::xpathApply(xmlfile, path)
    if (length(nodes) == 0L) {
      return(NA_character_)
    }
    XML::xmlValue(XML::xmlChildren(nodes[[1]])$value)
  }

  if (output == "distance") {
    as.numeric(sub(" km", "", first_value("//distance"))) / 1000
  } else {
    as.numeric(first_value("//duration")) / 60
  }
}
# Add distance and time columns by calling distance2Points() once per row.
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# seq(0) would yield c(1, 0).
row_ids <- seq_len(nrow(postcodetest))
postcodetest$distance <- sapply(row_ids, function(i) {
  distance2Points(postcodetest$a[i], postcodetest$b[i], "distance")
})
postcodetest$time <- sapply(row_ids, function(i) {
  distance2Points(postcodetest$a[i], postcodetest$b[i], "time")
})
And here's how you could do it in a dplyr pipe with mutate:
library(dplyr)
# Add distance and time columns inside a pipe. seq_along(a) indexes the rows
# mutate() is working on, and vapply() guarantees one number per row.
postcodetest <- postcodetest %>%
  mutate(
    distance = vapply(
      seq_along(a),
      function(i) distance2Points(a[i], b[i], "distance"),
      numeric(1)
    ),
    time = vapply(
      seq_along(a),
      function(i) distance2Points(a[i], b[i], "time"),
      numeric(1)
    )
  )