Launch Tor Browser or a Tor-enabled Firefox browser in RSelenium

How do you launch TorBrowser in RSelenium?
I tried this to no avail:
library(RSelenium)
browserP <- "C:/Users/Administrator/Desktop/Tor Browser/Browser/firefox.exe"
jArg <- paste0("-Dwebdriver.firefox.bin=\"", browserP, "\"")
pLoc <- "C:/Users/Administrator/Desktop/Tor Browser/Browser/TorBrowser/Data/Browser/profile.meek-http-helper/"
jArg <- c(jArg, paste0("-Dwebdriver.firefox.profile=\"", pLoc, "\""))
selServ <- RSelenium::startServer(javaargs = jArg)
Error: startServer is now defunct. Users in future can find the function in
file.path(find.package("RSelenium"), "examples/serverUtils"). The
recommended way to run a selenium server is via Docker. Alternatively
see the RSelenium::rsDriver function.
rsDriver doesn't take a javaargs argument, and I can't figure out how to get this to work either:
fprof <- getFirefoxProfile("C:/Users/Administrator/Desktop/Tor Browser/Browser/TorBrowser/Data/Browser/profile.meek-http-helper/", useBase = T)
remDr <- remoteDriver(extraCapabilities = list(marionette = TRUE))
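A possible route (a sketch, untested against Tor Browser's bundled Firefox) is to point geckodriver at the Tor Browser binary through the moz:firefoxOptions capability, which rsDriver accepts via extraCapabilities. Note that Tor Browser's firefox.exe expects the bundled Tor process to be running, so traffic may not actually go through Tor unless Tor is started separately:
library(RSelenium)
browserP <- "C:/Users/Administrator/Desktop/Tor Browser/Browser/firefox.exe"
# moz:firefoxOptions lets geckodriver launch a specific Firefox binary
eCaps <- list(`moz:firefoxOptions` = list(binary = browserP))
rd <- rsDriver(browser = "firefox", extraCapabilities = eCaps)
remDr <- rd$client
remDr$navigate("https://check.torproject.org")  # reports whether traffic is routed through Tor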

Related

Scraping YouTube comments

I'm trying to scrape several comments on YouTube, but my ID doesn't work, and I don't know whether I have to use the API key or the client ID/secret:
id <- "XXX"
api <- "xxx"
client <- "xxx"
secret <- "xxx"
yt_oauth(client, secret)
yt.oauth(client, secret)
yt.ouath(client, secret, token = ' ')
yt_oauth(user, api, token = '')
# A <- yt_oauth(user, api)
install.packages("devtools")
library(devtools)
devtools::install_github("soodoku/tuber", build_vignettes = TRUE)
# quick overview of some important functions in tuber; see the vignette:
vignette("tuber-ex", package = "tuber")
#Get All the Comments Including Replies
get_all_comments(video_id = "a-UQz7fqR3w")
yt.oauth(client, secret)
Error in yt.oauth(client, secret) : could not find function "yt.oauth"
yt_oauth(user, api, token = '')
Waiting for authentication in browser...Press Esc/Ctrl + C to abort
But it doesn't recognize my credentials
Thank you so much!!
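For reference, tuber's yt_oauth() takes the OAuth client ID and secret (not the API key). A minimal sketch, assuming a Google Cloud project with the YouTube Data API v3 enabled; the "XXX" credentials are placeholders:
library(tuber)
# app_id / app_secret are the OAuth client ID and secret from the Google Cloud console
yt_oauth(app_id = "XXX", app_secret = "xxx")
# once authenticated, fetch all comments (including replies) for a video
comments <- get_all_comments(video_id = "a-UQz7fqR3w")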
I don't know if this is of interest to you, but I have been able to get the comments of a YouTube video with the following code:
library(rvest)
library(RSelenium)
port <- as.integer(4444)
rd <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = port)
remDr <- rd$client
remDr$open()
url <- 'https://www.youtube.com/watch?v=0WzPPRUTQZ4'
remDr$navigate(url)
# Scroll down the page to load all the comments
for (i in 1:200) {
  print(i)
  java_Script <- paste0("scroll(0,", i * 500, ")")
  remDr$executeScript(java_Script)
}
web_Obj_Comments <- remDr$findElements("xpath", '//*[@id="content"]')
list_Comments <- list()
nb_Comments <- length(web_Obj_Comments) - 2
for (i in 1:nb_Comments) {
  print(i)
  list_Comments[[i]] <- web_Obj_Comments[[i + 1]]$getElementText()[[1]]
}
vector_Comments <- unlist(list_Comments)
vector_Comments <- vector_Comments[vector_Comments != ""]
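When the scrape is done, it's good practice to close the session and stop the server (standard RSelenium cleanup):
remDr$close()
rd$server$stop()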

Download a file with RSelenium & Docker Toolbox

I'm trying to download files with RSelenium, but it seems impossible. I can't manage to download anything, even with a simple example:
1) I installed Docker Toolbox (https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-docker.html)
2) I ran the firefox standalone image 3.1.0, and I'm now testing the older 2.52.0
3) I installed the RSelenium package on my R x64 3.3.2 and read all the related questions and answers on Stack Overflow
4) I tried the following code; by the way, when I look through the Firefox options in about:config, I don't find the "browser.download.dir" option:
require(RSelenium)
fprof <- makeFirefoxProfile(list(browser.download.dir = "C:/temp"
, browser.download.folderList = 2L
, browser.download.manager.showWhenStarting = FALSE
, browser.helperApps.neverAsk.saveToDisk = "application/zip"))
remDr <- remoteDriver(browserName = "firefox",remoteServerAddr = "192.168.99.100",port = 4445L,extraCapabilities = fprof)
remDr$open(silent = TRUE)
remDr$navigate("https://www.chicagofed.org/applications/bhc/bhc-home")
# click year 2012
webElem <- remDr$findElement("name", "SelectedYear")
webElems <- webElem$findChildElements("css selector", "option")
webElems[[which(sapply(webElems, function(x){x$getElementText()}) == "2012" )]]$clickElement()
# click required quarter
webElem <- remDr$findElement("name", "SelectedQuarter")
Sys.sleep(1)
webElems <- webElem$findChildElements("css selector", "option")
webElems[[which(sapply(webElems, function(x){x$getElementText()}) == "4th Quarter" )]]$clickElement()
# click button
webElem <- remDr$findElement("id", "downloadDataFile")
webElem$clickElement()
5) I get no error, but there is no file.
6) In the end, I would like to download the Excel file on this page with RSelenium:
https://app2.msci.com/products/indexes/performance/country_chart.html?asOf=Feb%2028,%202010&size=30&scope=C&style=C&currency=15&priceLevel=0&indexId=83#
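Since that page serves an Excel file rather than a zip, the browser.helperApps.neverAsk.saveToDisk list would also need the Excel MIME type. A hypothetical variant (the exact MIME types the server sends are an assumption; check the response headers):
fprof <- makeFirefoxProfile(list(
  browser.download.dir = "C:/temp",
  browser.download.folderList = 2L,
  browser.download.manager.showWhenStarting = FALSE,
  # MIME types below are assumed, not verified against the MSCI server
  browser.helperApps.neverAsk.saveToDisk = "application/zip,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
))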
If you are using Docker Toolbox on Windows, you may have issues mapping volumes; see Docker: Sharing a volume on Windows with Docker toolbox.
If you are using Docker Machine on Mac or Windows, your Docker daemon has only limited access to your OS X or Windows filesystem. Docker Machine tries to auto-share your /Users (OS X) or C:\Users (Windows) directory.
I did a clean install of Docker Toolbox on a Windows 10 box and ran the following image:
$ docker stop $(docker ps -aq)
$ docker rm $(docker ps -aq)
$ docker run -d -v //c/Users/john/test/://home/seluser/Downloads -p 4445:4444 -p 5901:5900 selenium/standalone-firefox-debug:2.53.1
NOTE: we mapped to a directory under Users/john; user john is the one running Docker Toolbox.
Running the code below
require(RSelenium)
fprof <- makeFirefoxProfile(list(browser.download.dir = "home/seluser/Downloads"
, browser.download.folderList = 2L
, browser.download.manager.showWhenStarting = FALSE
, browser.helperApps.neverAsk.saveToDisk = "application/zip"))
remDr <- remoteDriver(browserName = "firefox",remoteServerAddr = "192.168.99.100",port = 4445L,extraCapabilities = fprof)
remDr$open(silent = TRUE)
remDr$navigate("https://www.chicagofed.org/applications/bhc/bhc-home")
# click year 2012
webElem <- remDr$findElement("name", "SelectedYear")
webElems <- webElem$findChildElements("css selector", "option")
webElems[[which(sapply(webElems, function(x){x$getElementText()}) == "2012" )]]$clickElement()
# click required quarter
webElem <- remDr$findElement("name", "SelectedQuarter")
Sys.sleep(1)
webElems <- webElem$findChildElements("css selector", "option")
webElems[[which(sapply(webElems, function(x){x$getElementText()}) == "4th Quarter" )]]$clickElement()
# click button
webElem <- remDr$findElement("id", "downloadDataFile")
webElem$clickElement()
And checking the mapped download folder
> list.files("C://Users/john/test")
[1] "bhcf1212.zip"
>
Finally, I decided to do a clean install of Docker for Windows (17.03.0 stable).
I needed to decrease the number of available CPUs (to 1) and the available RAM (to 1 GB).
I shared my C drive too (by the way, a password-protected session is mandatory, otherwise you can't share the directory).
After that I restarted my computer.
On the R side, do not forget to remove the
remoteServerAddr = "192.168.99.100"
argument, and I got the file.
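A sketch of the adjusted connection, assuming Docker for Windows publishes the container port on localhost (remoteDriver's default address):
# remoteServerAddr defaults to "localhost", which is where Docker for Windows exposes the port
remDr <- remoteDriver(browserName = "firefox", port = 4445L, extraCapabilities = fprof)
remDr$open(silent = TRUE)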
My fear now is about the stability of Docker: sometimes it runs, sometimes not.
Many thanks John for your help.

Scrape website with R by navigating doPostBack

I want to periodically extract a table from the site below.
The price list changes when the building block names (BLOK 16 A, BLOK 16 B, BLOK 16 C, ...) are clicked. The URL doesn't change; the page changes by triggering
javascript:__doPostBack('ctl00$ContentPlaceHolder1$DataList2$ctl04$lnk_blok','')
I've tried three approaches after searching Google and Stack Overflow.
Attempt 1: this doesn't trigger the doPostBack event.
postForm( "http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44", ctl00_ContentPlaceHolder1_DataList2_ctl03_lnk_blok="ctl00$ContentPlaceHolder1$DataList2$ctl03$lnk_blok")
Attempt 2: the Selenium server seems to run on http://localhost:4444/, but remoteDriver doesn't navigate; it returns this error: (Error in checkError(res) :
Undefined error in httr call. httr output: length(url) == 1 is not TRUE)
library(RSelenium)
startServer()
remDr <- remoteDriver()
remDr <- remoteDriver(remoteServerAddr = "localhost"
, port = 4444L, browserName = "firefox")
remDr$open()
remDr$getStatus()
remDr$navigate("http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44")
Attempt 3: another way to trigger the doPostBack event; it doesn't navigate either.
base.url <- "http://www.kentkonut.com.tr/tr/modul/projeler/"
event.target <- 'ctl00$ContentPlaceHolder1$DataList2$ctl03$lnk_blok'
action <- "daire_fiyatlari.aspx?id=44"
ftarget <- paste0(base.url, action)
dum <- getURL(ftarget)
event.val <- unlist(strsplit(dum,"__EVENTVALIDATION\" value=\""))[2]
event.val <- unlist(strsplit(event.val,"\" />\r\n\r\n<script"))[1]
view.state <- unlist(strsplit(dum,"id=\"__VIEWSTATE\" value=\""))[2]
view.state <- unlist(strsplit(view.state,"\" />\r\n\r\n\r\n<script"))[1]
web.data <- postForm(ftarget, "form name" = "ctl00_ContentPlaceHolder1_DataList2_ctl03_lnk_blok",
"method" = "POST",
"action" = action,
"id" = "ctl00_ContentPlaceHolder1_DataList2_ctl03_lnk_blok",
"__EVENTTARGET"=event.target,
"__EVENTVALIDATION"=event.val,
"__VIEWSTATE"=view.state)
thanks for your help.
library(rvest)
url<-"http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44"
pgsession<-html_session(url)
t<-html_table(html_nodes(read_html(pgsession), css = "#ctl00_ContentPlaceHolder1_DataList1"), fill= TRUE)[[1]]
even_indices<-seq(2,length(t$X1),2)
t<-t[even_indices,]
t<-t[2:(length(t$X1)),]
EDITED CODE:
library(rvest)
url<-"http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44"
pgsession<-html_session(url)
pgform<-html_form(pgsession)[[1]]
page<-rvest:::request_POST(pgsession,"http://www.kentkonut.com.tr/tr/modul/projeler/daire_fiyatlari.aspx?id=44",
body=list(
`__VIEWSTATE`=pgform$fields$`__VIEWSTATE`$value,
`__EVENTTARGET`="ctl00$ContentPlaceHolder1$DataList2$ctl01$lnk_blok",
`__EVENTARGUMENT`="",
`__VIEWSTATEGENERATOR`=pgform$fields$`__VIEWSTATEGENERATOR`$value,
`__VIEWSTATEENCRYPTED`=pgform$fields$`__VIEWSTATEENCRYPTED`$value,
`__EVENTVALIDATION`=pgform$fields$`__EVENTVALIDATION`$value
),
encode="form"
)
# in the above example, change __EVENTTARGET to "ctl00$ContentPlaceHolder1$DataList2$ctl02$lnk_blok" to get a different table
t<-html_table(html_nodes(read_html(page), css = "#ctl00_ContentPlaceHolder1_DataList1"), fill= TRUE)[[1]]
even_indices<-seq(2,length(t$X1),2)
t<-t[even_indices,]
t<-t[2:(length(t$X1)),]
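To collect every block's table, the POST can be repeated over the event targets. A sketch follows, assuming the ctl indices run consecutively (ctl01, ctl02, ...) and that the server accepts the same __VIEWSTATE/__EVENTVALIDATION for each request; both assumptions are worth verifying, since ASP.NET pages sometimes require re-reading those fields from each response:
# hypothetical loop over the block links; the index range 1:4 is an assumption
targets <- sprintf("ctl00$ContentPlaceHolder1$DataList2$ctl%02d$lnk_blok", 1:4)
tables <- lapply(targets, function(tg) {
  page <- rvest:::request_POST(pgsession, url,
    body = list(
      `__VIEWSTATE` = pgform$fields$`__VIEWSTATE`$value,
      `__EVENTTARGET` = tg,
      `__EVENTARGUMENT` = "",
      `__VIEWSTATEGENERATOR` = pgform$fields$`__VIEWSTATEGENERATOR`$value,
      `__VIEWSTATEENCRYPTED` = pgform$fields$`__VIEWSTATEENCRYPTED`$value,
      `__EVENTVALIDATION` = pgform$fields$`__EVENTVALIDATION`$value
    ),
    encode = "form")
  html_table(html_nodes(read_html(page), css = "#ctl00_ContentPlaceHolder1_DataList1"), fill = TRUE)[[1]]
})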

In R, phantomJS run with RSelenium hangs after several iterations

I'm using phantomJS to collect data from different sites. During the scraping process I experience a lot of crashes when parsing sites or site elements. Unfortunately, neither phantomJS nor RSelenium provides any information or bug report in the console. The script just hangs without any warnings: I see that it is executing, but nothing actually happens. The only way to stop it is to manually restart R. After several tests I found that phantomJS usually hangs on executing remDr$findElements() commands. I reran my code using Firefox and RSelenium and it works normally, so the problem seems to be in how phantomJS works. Has anyone experienced anything similar when running phantomJS? Is it possible to fix this misbehavior?
I'm using:
Windows 7
Selenium 2.0
R version 3.1.3
phantomjs-2.0.0-windows
My code:
# starting phantom server driver
phantomjsdir <- paste(mywd, "/phantomjs-2.0.0-windows/bin/phantomjs.exe", sep="" )
phantomjsUserAgent <- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36 OPR/28.0.1750.48"
eCap <- list(phantomjs.binary.path = phantomjsdir, phantomjs.page.settings.userAgent = phantomjsUserAgent )
pJS <- phantom(pjs_cmd = phantomjsdir)
remDr <- remoteDriver(browserName = "phantomjs", extraCapabilities = eCap)
remDr$open(silent = FALSE)
mywords <- c("canon 600d", "sony 58k","nikon","nikon2","nikon 800","nikon 80","nikon 8")
timeout <- 3
#'
#' Executing script
#'
for (word in mywords) {
print(paste0("searching for: ",word))
ss.word <- word
remDr$navigate("http://google.com")
webElem <- remDr$findElement(using = "class", "gsfi")
webElem$sendKeysToElement(list(enc2utf8(ss.word),key = "enter"))
Sys.sleep(1)
print(remDr$executeScript("return document.readyState;")[[1]])
# bounded wait: give the page up to 10 * timeout seconds to finish loading
totalwait <- 0
while (remDr$executeScript("return document.readyState;")[[1]] != "complete" && totalwait < 10) {
  Sys.sleep(timeout)
  totalwait <- totalwait + 1
}
print(paste0("search completed: ",ss.word))
elem.snippet <- remDr$findElements(using="class name",value = "rc")
for (i in 1:length(elem.snippet)) {
print(paste0("element opened: ",ss.word," pos",i))
print(elem.snippet[[i]])
ss.snippet.code <- elem.snippet[[i]]$getElementAttribute('innerHTML')
print(paste0("element element innerHTML ok"))
elemtitle <- elem.snippet[[i]]$findChildElement(using = "class name", value = "r")
print(paste0("element title ok"))
elemcode <- elemtitle$getElementAttribute('innerHTML')
print(paste0("element innerHTML ok"))
elemtext <- elem.snippet[[i]]$findChildElement(using = "class name", value = "st")
ss.text <- elemtext$getElementText()[[1]]
print(paste0("element loaded: ",ss.word," pos",i))
elemloc <- elem.snippet[[i]]$getElementLocation()
elemsize <- elem.snippet[[i]]$getElementSize()
print(paste0("element location parsed: ",ss.word," pos",i))
}
print(paste0("data collected: ",ss.word))
}
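One mitigation worth trying (a sketch, not a guaranteed fix for the phantomJS hangs) is to bound the session timeouts so that a stuck findElements() call fails fast instead of blocking forever:
# cap how long the driver may spend locating elements or loading a page
remDr$setTimeout(type = "implicit", milliseconds = 5000)
remDr$setTimeout(type = "page load", milliseconds = 20000)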

htmlParse errors on accessing Google search. Is there an alternative approach?

I am trying to obtain the number of results obtained from specific google searches.
For example for stackoverflow there are "About 28,200,000 results (0.12 seconds)".
Normally I would use the xpathSApply function from the XML R package, but I am getting errors and am not sure how to solve them, or whether there is an alternative approach.
library(XML)
googleURL <- "https://www.google.ca/search?q=stackoverflow"
googleInfo <- htmlParse(googleURL, isURL = TRUE)
Error: failed to load external entity "https://www.google.ca/search?q=stackoverflow"
#use of RCurl which I am not that familiar with
library(RCurl)
getURL(googleURL)
#Error in function (type, msg, asError = TRUE) :
#SSL certificate problem, verify that the CA cert is OK. Details:
#error:14090086:SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
# final effort
library(httr)
x <- GET(googleURL)
# no error but am not sure how to proceed
# the relevant HTML code to parse is
# <div id=resultStats>About 28,200,000 results<nobr> (0.12 seconds) </nobr></div>
Any help in solving the errors or parsing the httr object would be much appreciated.
You are asking for a secure HTTP connection:
https://www.google.ca/search?q=stackoverflow
XML complains about this, as does RCurl; httr will download the page.
XML: ask for an unsecured connection
library(XML)
googleURL <- "http://www.google.ca/search?q=stackoverflow"
googleInfo <- htmlParse(googleURL, isURL = TRUE)
xpathSApply(googleInfo,'//*/div[@id="resultStats"]')
#[[1]]
#<div id="resultStats">About 28,200,000 results</div>
RCurl: use ssl.verifypeer = FALSE, though it worked without it for me
library(RCurl)
googleURL <- "https://www.google.ca/search?q=stackoverflow"
googleInfo <- getURL(googleURL,ssl.verifypeer = FALSE)
googleInfo <- htmlParse(googleInfo)
# or if you want to use a cert
# system.file("CurlSSL/cacert.pem", package = "RCurl")
# googleInfo <- getURL(googleURL, cainfo = cert)
# googleInfo <- htmlParse(googleInfo)
xpathSApply(googleInfo,'//*/div[#id="resultStats"]')
#[[1]]
#<div id="resultStats">About 28,200,000 results</div>
httr: use content()
library(httr)
x <- GET(googleURL)
googleInfo <- htmlParse(content(x, as = 'text'))
xpathSApply(googleInfo,'//*/div[@id="resultStats"]')
#[[1]]
#<div id="resultStats">About 28,200,000 results</div>
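To get just the count as text rather than a node, the same XPath can be paired with xmlValue (a small follow-up to the httr variant):
# return the div's text instead of the node itself
xpathSApply(googleInfo, '//*/div[@id="resultStats"]', xmlValue)
# [1] "About 28,200,000 results"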
