Scrape website with R by navigating doPostBack

I want to extract a table periodically from the site below.
The price list changes when a building block name is clicked (BLOK 16 A, BLOK 16 B, BLOK 16 C, ...). The URL doesn't change; the page changes by triggering
javascript:__doPostBack('ctl00$ContentPlaceHolder1$DataList2$ctl04$lnk_blok','')
I've tried three approaches after searching Google and Stack Overflow.
Attempt 1: this doesn't trigger the doPostBack event.
library(RCurl)  # postForm()
postForm("http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44",
         ctl00_ContentPlaceHolder1_DataList2_ctl03_lnk_blok = "ctl00$ContentPlaceHolder1$DataList2$ctl03$lnk_blok")
Attempt 2: the Selenium server seems to be running at http://localhost:4444/, but remoteDriver doesn't navigate. It returns this error:
Error in checkError(res) :
  Undefined error in httr call. httr output: length(url) == 1 is not TRUE
library(RSelenium)
startServer()
remDr <- remoteDriver(remoteServerAddr = "localhost",
                      port = 4444L, browserName = "firefox")
remDr$open()
remDr$getStatus()
remDr$navigate("http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44")
Attempt 3: another way to trigger the doPostBack event; it doesn't navigate either.
library(RCurl)  # getURL(), postForm()
base.url <- "http://www.kentkonut.com.tr/tr/modul/projeler/"
event.target <- "ctl00$ContentPlaceHolder1$DataList2$ctl03$lnk_blok"
action <- "daire_fiyatlari.aspx?id=44"
ftarget <- paste0(base.url, action)
dum <- getURL(ftarget)
# pull __EVENTVALIDATION and __VIEWSTATE out of the raw page source
event.val <- unlist(strsplit(dum, "__EVENTVALIDATION\" value=\""))[2]
event.val <- unlist(strsplit(event.val, "\" />\r\n\r\n<script"))[1]
view.state <- unlist(strsplit(dum, "id=\"__VIEWSTATE\" value=\""))[2]
view.state <- unlist(strsplit(view.state, "\" />\r\n\r\n\r\n<script"))[1]
web.data <- postForm(ftarget,
                     "form name" = "ctl00_ContentPlaceHolder1_DataList2_ctl03_lnk_blok",
                     "method" = "POST",
                     "action" = action,
                     "id" = "ctl00_ContentPlaceHolder1_DataList2_ctl03_lnk_blok",
                     "__EVENTTARGET" = event.target,
                     "__EVENTVALIDATION" = event.val,
                     "__VIEWSTATE" = view.state)
Thanks for your help.

library(rvest)
url <- "http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44"
pgsession <- html_session(url)
# grab the price table shown for the default block
t <- html_table(html_nodes(read_html(pgsession), css = "#ctl00_ContentPlaceHolder1_DataList1"), fill = TRUE)[[1]]
# the table interleaves label rows; keep every second row, then drop the header row
even_indices <- seq(2, length(t$X1), 2)
t <- t[even_indices, ]
t <- t[2:length(t$X1), ]
EDITED CODE:
library(rvest)
url <- "http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44"
pgsession <- html_session(url)
pgform <- html_form(pgsession)[[1]]
# replay the ASP.NET postback: POST the hidden state fields back to the page
# with __EVENTTARGET set to the link we want to "click"
page <- rvest:::request_POST(pgsession, url,
  body = list(
    `__VIEWSTATE` = pgform$fields$`__VIEWSTATE`$value,
    `__EVENTTARGET` = "ctl00$ContentPlaceHolder1$DataList2$ctl01$lnk_blok",
    `__EVENTARGUMENT` = "",
    `__VIEWSTATEGENERATOR` = pgform$fields$`__VIEWSTATEGENERATOR`$value,
    `__VIEWSTATEENCRYPTED` = pgform$fields$`__VIEWSTATEENCRYPTED`$value,
    `__EVENTVALIDATION` = pgform$fields$`__EVENTVALIDATION`$value
  ),
  encode = "form"
)
# in the example above, change __EVENTTARGET to "ctl00$ContentPlaceHolder1$DataList2$ctl02$lnk_blok" (and so on) to get a different block's table
t <- html_table(html_nodes(read_html(page), css = "#ctl00_ContentPlaceHolder1_DataList1"), fill = TRUE)[[1]]
even_indices <- seq(2, length(t$X1), 2)
t <- t[even_indices, ]
t <- t[2:length(t$X1), ]
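Since the goal is periodic extraction of every block, the postback above generalizes to a loop over the event targets. Below is a minimal sketch under the assumption that the blocks are exposed as ctl01, ctl02, ... and that four exist; check the page's __doPostBack calls for the real indices:
library(rvest)

url <- "http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44"
pgsession <- html_session(url)
pgform <- html_form(pgsession)[[1]]

# assumed block indices; inspect the page's __doPostBack targets for the real range
targets <- sprintf("ctl00$ContentPlaceHolder1$DataList2$ctl%02d$lnk_blok", 1:4)

all_blocks <- lapply(targets, function(tgt) {
  page <- rvest:::request_POST(pgsession, url,
    body = list(
      `__VIEWSTATE` = pgform$fields$`__VIEWSTATE`$value,
      `__EVENTTARGET` = tgt,
      `__EVENTARGUMENT` = "",
      `__VIEWSTATEGENERATOR` = pgform$fields$`__VIEWSTATEGENERATOR`$value,
      `__VIEWSTATEENCRYPTED` = pgform$fields$`__VIEWSTATEENCRYPTED`$value,
      `__EVENTVALIDATION` = pgform$fields$`__EVENTVALIDATION`$value
    ),
    encode = "form"
  )
  # same row cleanup as above, one table per block
  html_table(html_nodes(read_html(page),
                        css = "#ctl00_ContentPlaceHolder1_DataList1"),
             fill = TRUE)[[1]]
})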

Related

RSelenium: the web page kept on loading after clicking the next button

I am new to web scraping and want to scrape data from https://www.forwardpathway.com/us-college-database. I used the following code to extract the data from the table, but the page just kept loading after I clicked the next button. Can anybody point out what is wrong?
library(RSelenium)
library(tidyverse)
library(netstat)
library(xml2)
library(data.table)
library(rvest)
binman::list_versions("chromedriver")
rs_driver_object <- rsDriver(browser = "chrome",
                             chromever = "107.0.5304.62",
                             verbose = F,
                             port = free_port())
## create the client
remDr <- rs_driver_object$client
## open the browser
remDr$open()
remDr$navigate("https://www.forwardpathway.com/us-college-database")
## locate the table that stores the data
data_table <- remDr$findElement(using = "id", "table_1")
## I tried three different methods to click the next button, but the problem persisted.
## next button, method 1
next_button <- remDr$findElement(using = "id", "table_1_next")
next_button$clickElement()
## next button, method 2
remDr$executeScript("document.getElementById('table_1_next').click()")
## next button, method 3
next_button <- remDr$findElement("id", "table_1_next")
next_button$sendKeysToElement(list(key = "enter"))
all_data <- list()
cond <- TRUE
while (cond == TRUE) {
  data_table_html <- data_table$getPageSource()
  page <- read_html(data_table_html %>% unlist())
  df <- html_table(page) %>% .[[1]]
  all_data <- rbindlist(list(all_data, df))
  Sys.sleep(5)
  tryCatch(
    {
      next_button <- remDr$findElement("id", "table_1_next")
      next_button$sendKeysToElement(list(key = "enter"))
    },
    error = function(e) {
      print("script complete")
      cond <<- FALSE
    }
  )
  if (cond == FALSE) {
    break
  }
}
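One thing worth checking: table_1 looks like a DataTables-style widget, and on such widgets the next button stays in the DOM on the last page, so findElement keeps succeeding, the tryCatch never fires, and the loop can spin forever. A sketch of an alternative stop condition, assuming the usual DataTables convention that the button gains a "disabled" class on the last page (verify against the live page):
# minimal sketch: stop when the "next" button carries a "disabled" class
# (the class name is an assumption; inspect the live page to confirm)
all_data <- list()
repeat {
  page <- read_html(unlist(remDr$getPageSource()))
  df <- html_table(page)[[1]]
  all_data <- rbindlist(list(all_data, df))

  next_button <- remDr$findElement("id", "table_1_next")
  btn_class <- unlist(next_button$getElementAttribute("class"))
  if (grepl("disabled", btn_class)) break  # last page reached

  next_button$clickElement()
  Sys.sleep(2)  # give the table time to redraw
}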

Don't understand "RSelenium message: javascript error"

I was running a script for web scraping in RStudio and got the following error:
Selenium message:javascript error: this.each is not a function
(Session info: chrome=81.0.4044.129)
Build info: version: '4.0.0-alpha-2', revision: 'f148142cf8', time: '2019-07-01T21:30:10'
System info: host: 'xxxxxx', ip: 'xxx.xxx.x.xxx', os.name: 'Windows 10', os.arch: 'amd64', os.version: '10.0', java.version: '1.8.0_231'
Driver info: driver.version: unknown
Error: Summary: JavaScriptError
Detail: An error occurred while executing user supplied JavaScript.
class: org.openqa.selenium.JavascriptException
Further Details: run errorDetails method
I don't really understand what the problem is or how I might solve it. Does anyone know? I am still quite new to this, so concrete steps would be very helpful.
Thank you in advance!
Edit: This is the script I'm using. The error seems to occur just before "#end of the main loop".
library(data.table) # Required for rbindlist
library(dplyr) # Required to use the pipes %>% and some table manipulation commands
library(magrittr) # Required to use the pipes %>%
library(rvest) # Required for read_html
library(RSelenium) # Required for webscraping with javascript
library(lubridate) # Required to collect dates
library(stringr)
library(purrr)
options(stringsAsFactors = F) #needed to prevent errors when merging data frames
#Paste the GoodReads Url
url <- "https://www.goodreads.com/book/show/1885.Pride_and_Prejudice?ac=1&from_search=true&qid=VkA2NbcGBa&rank=1"
languageOnly = F #If FALSE, "all languages" is chosen
#Set your browser settings
rD <- rsDriver(port = 4585L, browser = "chrome", chromever = "81.0.4044.69")
remDr <- rD[["client"]]
remDr$setTimeout(type = "implicit", 2000)
remDr$navigate(url)
bookTitle = unlist(remDr$getTitle())
finalData = data.frame()
# Main loop going through the website pages
morePages = T
pageNumber = 1
while(morePages){
  # Select reviews in the correct language.
  # It should also work if you only fill in the numeral language code and leave the first one empty.
  selectLanguage = if(languageOnly){
    remDr$findElement("xpath", "//select[@id='language_code']/option[@value='']")
  } else {
    remDr$findElement("xpath", "//select[@id='language_code']/option[5]")
  }
  selectLanguage$clickElement()
  Sys.sleep(3)
  # Expand all reviews
  expandMore <- remDr$findElements("link text", "...more")
  sapply(expandMore, function(x) x$clickElement())
  # Extract the reviews from the page
  reviews <- remDr$findElements("css selector", "#bookReviews .stacked")
  reviews.html <- lapply(reviews, function(x){x$getElementAttribute("outerHTML")[[1]]})
  reviews.list <- lapply(reviews.html, function(x){read_html(x) %>% html_text()})
  reviews.text <- unlist(reviews.list)
  # Some reviews have only a rating and no text, so we process them separately
  onlyRating = unlist(map(1:length(reviews.text), function(i) str_detect(reviews.text[i], "^\\\n\\\n")))
  # Full reviews
  if(sum(!onlyRating) > 0){
    filterData = reviews.text[!onlyRating]
    fullReviews = purrr::map_df(seq(1, length(filterData), by = 2), function(i){
      review = unlist(strsplit(filterData[i], "\n"))
      data.frame(
        date = mdy(review[2]),          #date
        username = str_trim(review[5]), #user
        rating = str_trim(review[9]),   #overall
        comment = str_trim(review[12])  #comment
      )
    })
    # Add review text to full reviews
    fullReviews$review = unlist(purrr::map(seq(2, length(filterData), by = 2), function(i){
      str_trim(str_remove(filterData[i], "\\s*\\n\\s*\\(less\\)"))
    }))
  } else {
    fullReviews = data.frame()
  }
  # Partial reviews (only rating)
  if(sum(onlyRating) > 0){
    filterData = reviews.text[onlyRating]
    partialReviews = purrr::map_df(1:length(filterData), function(i){
      review = unlist(strsplit(filterData[i], "\n"))
      data.frame(
        date = mdy(review[9]),          #date
        username = str_trim(review[4]), #user
        rating = str_trim(review[8]),   #overall
        comment = "",
        review = ""
      )
    })
  } else {
    partialReviews = data.frame()
  }
  finalData = rbind(finalData, fullReviews, partialReviews)
  # Go to the next page if possible
  nextPage = remDr$findElements("xpath", "//a[@class='next_page']")
  if(length(nextPage) > 0){
    message(paste("PAGE", pageNumber, "Processed - Going to next"))
    nextPage[[1]]$clickElement()
    pageNumber = pageNumber + 1
    Sys.sleep(2)
  } else {
    message(paste("PAGE", pageNumber, "Processed - Last page"))
    morePages = FALSE
  }
}
#end of the main loop
#Replace missing ratings by 'not rated'
finalData$rating = ifelse(finalData$rating == "", "not rated", finalData$rating)
#Stop server
rD[["server"]]$stop()
#set directory to where you wish the file to go
#copy your working directory and exchange all backward slashes with forward slashes
getwd()
setwd("C:/Users/ledgreve/Desktop/GoodReads_TextMining-master/Scripts/New Scripts/Test1")
#Write results
write.csv(finalData, paste0(bookTitle, ".csv"), row.names = F)
message("FINISHED!")
Just my own update: this issue was resolved after I reinstalled Java and installed rJava (https://cimentadaj.github.io/blog/2018-05-25-installing-rjava-on-windows-10/installing-rjava-on-windows-10/).

RSelenium - how to go to the next page by clicking on the next button?

My question is about scraping with RSelenium.
I am trying to scrape data from the following website: https://www.nhtsa.gov/ratings.
My present difficulty lies in moving between pages for a given carmaker.
This is my code so far:
library(RSelenium)
#opens a connection
rD <- rsDriver()
remDr <- rD$client
#goes to the page we want
url <- "https://www.nhtsa.gov/ratings"
remDr$navigate(url)
#clicking to open the manufacturer selection "page"
webElem <- remDr$findElement(using = 'css selector', "#vehicle a")
webElem$clickElement()
#opening the options menu
option.menu <- remDr$findElement(using='css selector', 'select')
option.menu$clickElement()
#selecting one maker, loop over this later
maker.select <- remDr$findElement(using = 'xpath', "//*/option[@value = 'AUDI']")
maker.select$clickElement()
#search our selection
maker.click<-remDr$findElement(using='css selector', '.manufacturer-search-submit')
maker.click$clickElement()
#now we have to go through each car (10 per page), loop later
cars<-remDr$findElement(using='css selector', 'tbody:nth-child(6) a')
individual.link<-cars$getElementAttribute("href")
#going to the next page
next_page<-remDr$findElement(using='css selector', 'button.btn.link-arrow::after')
next_page$clickElement()
But I get the error:
Error: Summary: NoSuchElement
Detail: An element could not be located on the page using the given search parameters.
Further Details: run errorDetails method
As you can probably see, I am new to RSelenium. Any help you can give would be appreciated. Thanks in advance.
Here is another approach that might be of help.
You can access the data simply by sending a GET request to the API behind the website. On the first page, we can see the request URL:
'https://api.nhtsa.gov/vehicles/byManufacturer?offset=0&max=10&sort=overallRating&order=desc&data=crashtestratings,recommendedfeatures&productDetail=all&dateStart=2011-01-01&manufacturerName=AUDI&dateEnd=3000-01-01&name='
This is where we can get the data. The second page will have offset=10, then 20, 30, etc.
If api_url is defined to be the above URL, then we can get the data using httr:
# request the data
request <- httr::GET(api_url)
# retrieve the content
request_content <- httr::content(request)
request_result <- request_content$results
# request results contains the data of interest
# A few glimpses into the data
# The first model
request_result[[1]]$vehicleModel
# [1] "A3"
request_result[[1]]$modelYear
# [1] 2018
request_result[[1]]$manufacturer
# [1] "AUDI OF AMERICA, INC"
Now, by playing around with offset, it is straightforward to build a loop and gather all pages:
out <- list()
k <- 0L
i <- 1L
while (k < 1e+3) {
  req_url <- paste0('https://api.nhtsa.gov/vehicles/byManufacturer?offset=',
                    k,
                    '&max=10&sort=overallRating&order=desc&data=crashtestratings,recommendedfeatures&productDetail=all&dateStart=2011-01-01&manufacturerName=AUDI&dateEnd=3000-01-01&name=')
  req <- httr::content(httr::GET(req_url))$result
  if (length(req) == 0) break
  out[[i]] <- req
  cat(paste0('\nAdded content for offset \t', k))
  i <- i + 1L
  k <- k + 10L
}
lengths(out)
# [1] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
Note that you can also play around with manufacturerName in the URL, and with many more arguments, to get clean and tailored data.
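For instance, a small wrapper could take the manufacturer name as an argument and flatten the results into a data frame. A sketch under the assumptions above (fetch_ratings is a hypothetical helper; only the vehicleModel, modelYear, and manufacturer fields shown earlier are assumed):
library(httr)

# hypothetical helper: one manufacturer in, a data frame of the
# three fields glimpsed above out
fetch_ratings <- function(maker) {
  url_tpl <- paste0("https://api.nhtsa.gov/vehicles/byManufacturer?offset=%d",
                    "&max=10&sort=overallRating&order=desc",
                    "&data=crashtestratings,recommendedfeatures&productDetail=all",
                    "&dateStart=2011-01-01&manufacturerName=", maker,
                    "&dateEnd=3000-01-01&name=")
  rows <- list()
  k <- 0L
  repeat {
    res <- content(GET(sprintf(url_tpl, k)))$results
    if (length(res) == 0) break  # no more pages
    rows <- c(rows, lapply(res, function(r) {
      data.frame(model = r$vehicleModel,
                 year = r$modelYear,
                 manufacturer = r$manufacturer)
    }))
    k <- k + 10L  # next page
  }
  do.call(rbind, rows)
}

audi <- fetch_ratings("AUDI")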

Launch TorBrowser or Tor enabled Firefox browser in RSelenium

How do you launch TorBrowser in RSelenium?
I tried this to no avail:
library(RSelenium)
browserP <- "C:/Users/Administrator/Desktop/Tor Browser/Browser/firefox.exe"
jArg <- paste0("-Dwebdriver.firefox.bin=\"", browserP, "\"")
pLoc <- "C:/Users/Administrator/Desktop/Tor Browser/Browser/TorBrowser/Data/Browser/profile.meek-http-helper/"
jArg <- c(jArg, paste0("-Dwebdriver.firefox.profile=\"", pLoc, "\""))
selServ <- RSelenium::startServer(javaargs = jArg)
Error: startServer is now defunct. Users in future can find the function in
file.path(find.package("RSelenium"), "examples/serverUtils"). The
recommended way to run a selenium server is via Docker. Alternatively
see the RSelenium::rsDriver function.
rsDriver doesn't take a javaargs argument, and I can't figure out how to get this to work either:
fprof <- getFirefoxProfile("C:/Users/Administrator/Desktop/Tor Browser/Browser/TorBrowser/Data/Browser/profile.meek-http-helper/", useBase = T)
remDr <- remoteDriver(extraCapabilities = list(marionette = TRUE))
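A route that may work with current RSelenium is to pass the Tor Browser binary through the standard geckodriver capability moz:firefoxOptions via rsDriver's extraCapabilities argument. A minimal sketch, not a confirmed recipe (whether Tor Browser's bundled Firefox tolerates being driven this way is an assumption to test):
library(RSelenium)

browserP <- "C:/Users/Administrator/Desktop/Tor Browser/Browser/firefox.exe"

# "moz:firefoxOptions" is the standard geckodriver capability for choosing
# a binary; driving Tor Browser itself is untested here
eCaps <- list(`moz:firefoxOptions` = list(binary = browserP))

rD <- rsDriver(browser = "firefox", extraCapabilities = eCaps)
remDr <- rD$client
remDr$navigate("https://check.torproject.org")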

Scraping YouTube comments

I'm trying to scrape several comments from YouTube, but my ID doesn't work and I don't know whether I have to use the API key or the client ID/secret:
id <- "XXX"
api <- "xxx"
client <- "xxx"
secret <- "xxx"
yt_oauth(client, secret)
yt.oauth(client, secret)
yt.ouath(client, secret, token = ' ')
yt_oauth(user, api, token = '')
# A <- yt_oauth(user, api)
install.packages("devtools")
library (devtools)
devtools::install_github("soodoku/tuber", build_vignettes = TRUE)
#quick overview of some important functions in tuber, see the vignette:
vignette("tuber-ex", package="tuber")
#Get All the Comments Including Replies
get_all_comments(video_id = "a-UQz7fqR3w")
yt.oauth(client, secret)
Error in yt.oauth(client, secret) : not found the function "yt.oauth"
yt_oauth(user, api, token = '')
Waiting for authentication in browser...Press Esc/Ctrl + C to abort
But it doesn't recognize my credentials
Thank you so much!!
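On the tuber side, note that the function is yt_oauth (underscore, not dot), and it expects the OAuth client ID and secret of a Google Cloud project with the YouTube Data API v3 enabled, not the plain API key. A minimal sketch with placeholder credentials:
library(tuber)

# placeholders: substitute the OAuth client ID/secret from your own
# Google Cloud project (YouTube Data API v3 enabled)
yt_oauth(app_id = "xxx.apps.googleusercontent.com",
         app_secret = "xxx")

comments <- get_all_comments(video_id = "a-UQz7fqR3w")
head(comments)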
I do not know if this is interesting for you, but I have been able to get the comments of a YouTube video with the following code:
library(rvest)
library(RSelenium)
port <- as.integer(4444)
rd <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = port)
remDr <- rd$client
remDr$open()
url <- 'https://www.youtube.com/watch?v=0WzPPRUTQZ4'
remDr$navigate(url)
# Scroll down the page to load all the comments
for (i in 1:200) {
  print(i)
  java_Script <- paste0("scroll(0,", i * 500, ")")
  remDr$executeScript(java_Script)
}
web_Obj_Comments <- remDr$findElements("xpath", '//*[@id="content"]')
list_Comments <- list()
nb_Comments <- length(web_Obj_Comments) - 2
for (i in 1:nb_Comments) {
  print(i)
  list_Comments[[i]] <- web_Obj_Comments[[i + 1]]$getElementText()[[1]]
}
vector_Comments <- unlist(list_Comments)
vector_Comments <- vector_Comments[vector_Comments != ""]
