Open new tab in RSelenium

I have the following code, with which I try to open each new URL loaded from the for loop in a new tab. What I have made until now is this:
library("RSelenium")
startServer()
checkForServer()
remDr <- remoteDriver()
remDr$open()
remDr$navigate("http://www.google.com/")
Sys.sleep(5)
myurllist <- c("https://cran.r-project.org/", "http://edition.cnn.com/", "https://cran.r-project.org/web/packages/")
for (i in 1:length(myurllist)) {
url <- url_list[i]
webElem <- remDr$findElement("css", "urlLink")
webElem$sendKeysToElement(list(key = "t"))
remDr$navigate(url)
Sys.sleep(5)
}
From a related Selenium question I found this answer:

A new tab is opened by pressing CTRL+T, not T:
library("RSelenium")
startServer()
checkForServer()
remDr <- remoteDriver()
remDr$open()
remDr$navigate("http://www.google.com/")
url_list <- c("http://edition.cnn.com/", "https://cran.r-project.org/web/packages/")
for (url in url_list) {
webElem <- remDr$findElement("css", "html")
webElem$sendKeysToElement(list(key="control", "t"))
remDr$navigate(url)
}
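Note that Ctrl+T only opens the tab; the driver keeps talking to the original window, so on most setups you also have to switch window handles before navigating. A hedged sketch, assuming the new tab is the last handle returned:
handles <- remDr$getWindowHandles()
# point the driver at the newly opened tab before navigating
remDr$switchToWindow(handles[[length(handles)]])
remDr$navigate(url)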

Related

Web scraping with R: findElement doesn't recognise drop down menu

I'm trying to scrape a government database with multiple drop-down menus. Using RSelenium, I've managed to click on the button taking me to the interactive database ("Sistema de Catastros de superficie frutícola regional"), and I'm now trying to click on the drop-down menus (e.g. region, year), but I keep getting NoSuchElement errors. I've tried multiple XPaths based on Inspect Element and the SelectorGadget Chrome extension, to no avail. It looks like each of the drop-down menus is a combobox.
If helpful, my end goal is to go through each of the regions, years, and crops, scraping the table generated by each one.
library(RSelenium)
library(tidyverse)
rdriver = rsDriver(browser = "chrome", port = 9515L, chromever = "106.0.5249.61")
obj = rdriver$client
obj$navigate("https://www.odepa.gob.cl/estadisticas-del-sector/catastros-fruticolas")
link = obj$findElement(using = 'xpath', value = '//*[@id="content"]/div/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div/div/div[3]/div/p[2]/a')$clickElement()
When you click on the button, a new tab appears.
You have to switch tabs with the help of
remDr$switchToWindow(remDr$getWindowHandles()[[2]])
Here is an example.
library(rvest)
library(RSelenium)
shell('docker run -d -p 4445:4444 selenium/standalone-firefox')
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()
Sys.sleep(15)
url <- "https://www.odepa.gob.cl/estadisticas-del-sector/catastros-fruticolas"
remDr$navigate(url)
Sys.sleep(15)
web_Obj_Button <- remDr$findElement("xpath", '//*[@id="content"]/div/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div/div/div[3]/div/p[2]/a')
web_Obj_Button$clickElement()
remDr$switchToWindow(remDr$getWindowHandles()[[2]])
web_Obj_Date <- remDr$findElement("css selector", "#mat-select-value-3 > span > span")
web_Obj_Date$clickElement()
remDr$screenshot(TRUE)
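Once the mat-select is open, Angular Material typically renders the choices as mat-option elements in an overlay, so you can enumerate and click them to work through each region, year, and crop. A hedged sketch (the mat-option selector is an assumption about this particular page):
# list the choices the open dropdown exposes, then click one
web_Objs_Options <- remDr$findElements("css selector", "mat-option")
unlist(lapply(web_Objs_Options, function(x) x$getElementText()))
web_Objs_Options[[1]]$clickElement()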

RSelenium message:no such element: Unable to locate element

I intend to download and clean databases using RSelenium. I am able to open the link, but I am having trouble downloading and opening the database. I believe the XPath is right, but when I try to open it I receive the following error:
Selenium message:no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="ESTBAN_AGENCIA"]"}
My code is the following:
library(RSelenium)
library(stringr)   # str_replace_all()
library(magrittr)  # %>%
dir <- getwd()
file_path <- paste0(dir, "\\DataBase") %>% str_replace_all("/", "\\\\")
eCaps <- list(
  chromeOptions =
    list(prefs = list('download.default_directory' = file_path))
)
system("taskkill /im java.exe /f", intern=FALSE, ignore.stdout=FALSE)
#Creating server
rD <- rsDriver(browser = "chrome",
chromever = "101.0.4951.15",
port = 4812L,
extraCapabilities = eCaps)
#Creating the driver to use R
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  browserName = "chrome",
  port = 4812L)
#Open server
remDr$open()
#Navigating to the ESTBAN webpage
remDr$navigate("https://www.bcb.gov.br/acessoinformacao/legado?url=https:%2F%2Fwww4.bcb.gov.br%2Ffis%2Fcosif%2Festban.asp")
##Download
remDr$findElement(using = "xpath", '//*[@id="ESTBAN_AGENCIA"]/option[1]')
The element you are trying to access is inside an iframe, and you need to switch to that iframe first in order to access the element.
remDr$navigate("https://www.bcb.gov.br/acessoinformacao/legado?url=https:%2F%2Fwww4.bcb.gov.br%2Ffis%2Fcosif%2Festban.asp")
#Switch to Iframe
webElem <- remDr$findElement("css", "iframe#framelegado")
remDr$switchToFrame(webElem)
##Download
remDr$findElement(using = "xpath", '//*[@id="ESTBAN_AGENCIA"]/option[1]')
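When you are done inside the iframe, you can hand control back to the top-level document before interacting with anything outside it. A sketch, assuming NULL addresses the default content as in the WebDriver wire protocol:
#Switch back to the main document
remDr$switchToFrame(NULL)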

Access RESTful API token using R

My project involves fetching a token from a RESTful API using R. The API uses OAuth 2.0. The project is confidential, so I cannot provide more information.
Can anyone please help me access the token using R code? For instance:
URL: facebook.com
username: ABC
Password: qwerty
Here is one approach that can be considered for Facebook:
library(RSelenium)
url <- "https://www.facebook.com/"
shell('docker run -d -p 4445:4444 selenium/standalone-firefox')
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()
remDr$navigate(url)
web_Obj_Username <- remDr$findElement("css selector", '#email')
web_Obj_Username$sendKeysToElement(list("ABC"))
web_Obj_Password <- remDr$findElement("css selector", '#pass')
web_Obj_Password$sendKeysToElement(list("qwerty"))
html_Content <- remDr$getPageSource()[[1]]
id_Submit <- unlist(stringr::str_extract_all(html_Content, 'royal_login_button" type="submit" id=".........'))
id_Submit <- unlist(stringr::str_extract_all(id_Submit, 'id=".........'))
id_Submit <- stringr::str_remove_all(id_Submit, 'id|=|\"')
web_Obj_Submit <- remDr$findElement("id", id_Submit)
web_Obj_Submit$clickElement()
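Since the underlying goal is an OAuth 2.0 token, it is also worth noting that httr can run the OAuth 2.0 flow directly, without browser automation. A minimal sketch, assuming you have registered an app with the provider (the app name, key, and secret below are placeholders):
library(httr)
app <- oauth_app("my-app", key = "ABC", secret = "qwerty")  # placeholder credentials
endpoint <- oauth_endpoints("facebook")  # httr ships endpoint definitions for common providers
token <- oauth2.0_token(endpoint, app, scope = "public_profile")
# the token can then be attached to requests, e.g. GET(url, config(token = token))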

Phantomjs returns 404 in R when attempting webscraping

I am trying to collect some data from OTC Markets (within the confines of their robots.txt) and I cannot connect to the webpage.
The first step I tried was just to scrape the HTML right off the page, but the page requires JavaScript to load.
So I downloaded PhantomJS and connected that way. However, this leads to a 404 error page.
I then changed the user agent to something resembling a real user to see if it would let me connect, and still no luck! What is going on here?
Here is a reproducible version of my code; any help would be appreciated. PhantomJS can be downloaded here: http://phantomjs.org/
library(rvest)
library(xml2)
library(V8)
# example website, I have no correlation to this stock
url <- 'https://www.otcmarkets.com/stock/YTROF/profile'
# create javascript file that phantomjs can process
writeLines(sprintf("var page = require('webpage').create();
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
page.open('%s', function () {
console.log(page.content); //page source
phantom.exit();
});", url), con="scrape.js")
html <- system("phantomjs.exe_PATH scrape.js", intern = TRUE)
page_html <- read_html(paste(html, collapse = "\n"))  # system() returns one string per line
I have been able to get the HTML content with the following code, which is based not on PhantomJS but on Selenium:
library(RSelenium)
shell('docker run -d -p 4445:4444 selenium/standalone-firefox')
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()
remDr$navigate('https://www.otcmarkets.com/stock/YTROF/profile')
remDr$executeScript("scroll(0, 5000)")
remDr$executeScript("scroll(0, 10000)")
remDr$executeScript("scroll(0, 15000)")
Sys.sleep(4)
remDr$screenshot(display = TRUE, useViewer = TRUE)
html_Content <- remDr$getPageSource()[[1]]
It is important to give time to the page to load before we extract the html content.
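Rather than guessing with fixed Sys.sleep() pauses, one can poll for an element that only exists once the page has rendered. A hedged helper sketch (the timeout and the selector you poll for are up to you):
wait_for_element <- function(remDr, using, value, timeout = 20) {
  # try once per second until the element appears or the timeout expires
  for (i in seq_len(timeout)) {
    webElem <- tryCatch(remDr$findElement(using, value), error = function(e) NULL)
    if (!is.null(webElem)) return(webElem)
    Sys.sleep(1)
  }
  stop("element not found within timeout: ", value)
}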
Here is another approach based on RDCOMClient:
library(RDCOMClient)
url <- 'https://www.otcmarkets.com/stock/YTROF/profile'
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[['Visible']] <- TRUE
IEApp$Navigate(url)
Sys.sleep(5)
doc <- IEApp$Document()
Sys.sleep(5)
html_Content <- doc$documentElement()$innerText()

Download.File Issue with xls Excel Workbook

I'm trying to download an Excel workbook (.xls) using R's download.file function (Windows 10, R version 3.4.4 (2018-03-15)).
When I download the file manually (using Internet Explorer or Chrome), the file downloads and I can then open it in Excel without any problems.
When I use download.file in R, the file downloads but is smaller than the correct file: it is an HTML file with a notice that my browser is not supported. I tried different modes with no luck.
My code:
download.file(
  url = "https://www.atsenergo.ru/nreport?fid=696C3DB7A3F6019EE053AC103C8C8733",
  destfile = "C:/MyExcel.xls",
  mode = "wb",
  method = "auto"
)
I solved this problem with the RSelenium library. The ATS site rejects any direct download request (it returns an .html file with a "Required javascript enabled" message), so in this case only the Selenium approach works. My code is below (where urlList is a data frame with the files' download links):
library(RSelenium)
library(logging)  # logerror()

rD <- rsDriver(port = 4444L,
               browser = "chrome",
               check = FALSE,
               geckover = NULL,
               iedrver = NULL,
               phantomver = NULL)
remDr <- rD$client
for (i in 1:nrow(urlList)) {
  tryCatch({
    row <- urlList[i, ]
    remDr$navigate(row$url)
    webElem <- remDr$findElement(using = 'link text', row$FileName)
    webElem$clickElement()
  },
  # atsCode, dateFileName and loggerName come from the surrounding script (not shown)
  error = function(e)
    logerror(paste(substr(e, 1, 50),
                   atsCode,
                   dateFileName,
                   sep = "\t"),
             logger = loggerName))
}
remDr$close()
# stop the selenium server
rD[["server"]]$stop()
