Start RSelenium
library(RSelenium)
RSelenium::startServer()
pJS <- phantom()
Sys.sleep(5) # give the binary a moment
remDr <- remoteDriver(browserName = 'phantomjs')
remDr$open()
Go to the site and wait a bit
remDr$navigate("http://ideal-scope.com/online-holloway-cut-adviser/")
Sys.sleep(5)
Now, when I try to find the elements for the text boxes:
depthElem <- remDr$findElements("name","depth_textbox")
tableElem <- remDr$findElements("name","table_textbox")
crownElem <- remDr$findElements("name","crown_textbox")
pavilionElem <- remDr$findElements("name","pavilion_textbox")
...each of these just gives me an empty list().
If I do findElement instead of findElements I get
Error: Summary: NoSuchElement
Detail: An element could not be located on the page using the given search parameters.
How can I select these textboxes? Why can't I select them by searching name?
The findElements method returns an empty list when no elements are present. The page has the content you require in an iframe. You will need to switch to the iframe first before you can search for the elements:
remDr$navigate("http://ideal-scope.com/online-holloway-cut-adviser/")
# get iframes
webElems <- remDr$findElements("css", "iframe")
# there is only one
remDr$switchToFrame(webElems[[1]])
depthElem <- remDr$findElement("name","depth_textbox")
# > depthElem$getElementAttribute("name")
# [[1]]
# [1] "depth_textbox"
tableElem <- remDr$findElement("name","table_textbox")
crownElem <- remDr$findElement("name","crown_textbox")
pavilionElem <- remDr$findElement("name","pavilion_textbox")
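With the element handles in place you can fill the boxes in the usual way. A minimal sketch (the proportion values below are made up; switching back to the top-level document with switchToFrame(NULL) is the usual route, but check it against your RSelenium version):
# type some example proportions into the boxes (values are illustrative only)
depthElem$clearElement()
depthElem$sendKeysToElement(list("61.5"))
tableElem$clearElement()
tableElem$sendKeysToElement(list("56"))
# when done inside the iframe, return to the top-level document
remDr$switchToFrame(NULL)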
I'm trying to scrape only one value from this site, but I cannot get it.
Here is my code
library(RSelenium)
rD <- rsDriver(browser = "chrome", port = 0999L, verbose = F, chromever = "95.0.4638.54")
remDr <- rD[["client"]]
remDr$navigate("https://www.dailyfx.com/eur-usd")
html <- remDr$getPageSource()[[1]]
library(rvest)
page <- read_html(html)
nodes <- html_nodes(page, css = ".mt-2.text-black")
html_text(nodes)
My result is
html_text(nodes)
[1] "\n\nEUR/USD\nMixed\n\n\n\n\n\n\n\n\n\nNet Long\n\n\n\nNet Short\n\n\n\n\n\nDaily change in\n\n\n\nLongs\n5%\n\n\nShorts\n1%\n\n\nOI\n4%\n\n\n\n\n\nWeekly change in\n\n\n\nLongs\n13%\n\n\nShorts\n23%\n\n\nOI\n17%\n\n\n\n"
What do I need to do to get the value of Net Long?
I would use a more targeted CSS selector to match just the node of interest, then extract the data-value attribute from the single matched node to get the percentage:
webElem <- remDr$findElement(using = 'css selector', '.dfx-technicalSentimentCard__netLongContainer [data-type="long-value-info"]')
var <- webElem$getElementAttribute("data-value")[[1]]
Or, using the page you already parsed with rvest:
page %>% html_element('.dfx-technicalSentimentCard__netLongContainer [data-type="long-value-info"]') %>% html_attr('data-value')
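Either way the attribute comes back as a character string, so a small conversion step is usually needed if you want a number (assuming the value could carry a "%" sign):
# strip any "%" sign and convert the attribute value to numeric
net_long <- as.numeric(gsub("%", "", var))
net_long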
I would like to get news headings for a company from Yahoo. I use RSelenium to start a remote browser and accept cookies. I found the surrounding CSS class "StretchedBox", and I can literally see the headline by browser inspection. How can I store these headings? Next, I would like to scroll down with RSelenium and save more of these elements (say, for several days).
library('RSelenium')
# Start Remote Browser
rD <- rsDriver(port = 4840L, browser = c("firefox"))
remDr <- rD[["client"]]
# Navigate to Yahoo Finance News for Specific Company
# This takes an unusually long time
remDr$navigate("https://finance.yahoo.com/quote/AAPL/news?p=AAPL")
# Get "accept all cookies" botton
webElems <- remDr$findElements(using = "xpath", "//button[starts-with(#class, 'btn primary')]")
# We can check if we did get the proper button by checking the text of the element:
unlist(lapply(webElems, function(x) {x$getElementText()}))
# We found the two buttons, and we want to click the first one:
webElems[[1]]$clickElement()
# wait for page loading
Sys.sleep(5)
# I am looking for news headline in or after the StretchedBox
boxes <- remDr$findElements(using = "class", "StretchedBox")
boxes[1] # empty
boxes[[1]]$browserName
Finally, I found an XPath from which I could extract the news article headline text with getElementText().
library('RSelenium')
# Start Browser
rD <- rsDriver(port = 4835L, browser = c("firefox"))
remDr <- rD[["client"]]
# Navigate to Yahoo Financial News
remDr$navigate("https://finance.yahoo.com/quote/AAPL/news?p=AAPL")
# Click Accept Cookies
webElems <- remDr$findElements(using = "xpath", "//button[starts-with(@class, 'btn primary')]")
unlist(lapply(webElems, function(x) {x$getElementText()}))
webElems[[1]]$clickElement()
# extract headlines from html/css by xpath
headlines <- remDr$findElements(using = "xpath", "//h3[@class = 'Mb(5px)']//a")
# extract headline text
headlines <- sapply(headlines, function(x){x$getElementText()})
headlines[1]
[[1]]
[1] "What Kind Of Investors Own Most Of Apple Inc. (NASDAQ:AAPL)?"
I'm using RSelenium to click on a dynamic element after a search on this webpage: http://www.in.gov.br/web/guest/inicio.
Every time I search for a word, I would like to find the words/link 'Ministério Da Educação' (the Portuguese equivalent of Ministry of Education) on the right side of the results page and click on it.
I have used the inspect element feature in Google Chrome, but I am not having any success in finding and clicking that element. I have already tried using xpath, css selector, id ...
I am using the following code:
## search parameters
string_search <- "contrato"
date_search <- format(
  as.Date("17/04/2019", "%d/%m/%Y"),
  "%d/%m/%Y") # Brazilian date format
## start Selenium driver
library(RSelenium)
selCommand <- wdman::selenium(
jvmargs = c("-Dwebdriver.firefox.verboseLogging=true"),
retcommand = TRUE)
shell(selCommand, wait = FALSE, minimized = TRUE) # for windows
# system(selCommand) # for Linux
remDr <- remoteDriver(port = 4567L, browserName = "firefox")
remDr$open()
## navigation & search
remDr$navigate("http://www.in.gov.br/web/guest/inicio")
Sys.sleep(5)
# from date
datefromkey<-remDr$findElement(using = 'css', "#calendario_advanced_from")
datefromkey$clickElement()
datefromkey$sendKeysToElement(list(key = "enter"))
datefromkey$clearElement()
datefromkey$sendKeysToElement(list(date_search))
datefromkey$sendKeysToElement(list(key = "enter"))
# to date
datetokey<-remDr$findElement(using = 'css', "#calendario_advanced_to")
datetokey$clickElement()
datetokey$sendKeysToElement(list(key = "enter"))
datetokey$clearElement()
datetokey$sendKeysToElement(list(date_search))
datetokey$sendKeysToElement(list(key = "enter"))
# string to search
wordkey<-remDr$findElement(using = 'css', "#input-advanced_search")
wordkey$sendKeysToElement(list('"', string_search, '"'))
# click search button
press_button <- remDr$findElement(using = 'class', "btn")
press_button$clickElement()
Here is where I struggle:
1) first attempt: using a broader tag
# using a broader tag
categorykey <- remDr$findElement(using = 'id', '_3_facetNavigation')
categorykey$getElementText()
With getElementText() I see that "Ministério da Educação" is there, but I do not know how to click on the link.
2) second attempt: using the xpath
categorykey <- remDr$findElement('xpath', '//li[@id="yui_patched_v3_11_0_1_1555545676970_404"]/text()')
It returns an error. Selenium can't locate the element.
Found the solution myself after watching this video on YouTube:
How to locate Dynamic Elements in Selenium Webdriver - XPATH Tutorial
The code would be like this:
categorykey <- remDr$findElement('xpath', '//*[contains(@data-value, "ministério da educação")]')
categorykey$getElementText()
# just to see if it's right
categorykey$clickElement()
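One caveat: XPath 1.0's contains() is case-sensitive, so this only matches if the data-value attribute really is lower-case on the page. If the casing varies, translate() can normalise it first; a hedged variant of the same locator:
# lower-case the relevant letters of data-value before matching,
# so e.g. "Ministério Da Educação" would also be found
categorykey <- remDr$findElement(
  'xpath',
  '//*[contains(translate(@data-value, "MINSTÉRODAEUCÇÃ", "minstérodaeucçã"), "ministério da educação")]'
)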
I use RSelenium to scrape the "rent" information from advertisements on the website below. However, it seems that not every advertisement contains rent information, so when my loop reaches one that doesn't, it hits a 'NoSuchElement' error and stops. I want to:
1/ fill in NA values for those cases which don't have rent information; and
2/ continue the loop and keep scraping rent information.
I already tried the tryCatch function, but it doesn't seem to work. R still throws me the error: "Error: Summary: NoSuchElement
Detail: An element could not be located on the page using the given search parameters.
Further Details: run errorDetails method".
My code is in the below. I appreciate your time and help.
#add url
url <- "https://www.toimitilat.fi/toimitilahaku/?size_min=&size_max=&deal_type%5B%5D=1&language=fin&result_type=list&advanced=0&gbl=1&ref=main#searchresult"
rD <- rsDriver()
remDr <- rD$client
remDr$navigate(url)
for(i in 8:13){
  Sys.sleep(0.86)
  rent <- remDr$findElement(using = "css selector", paste("#objectList > div:nth-child(", i, ") > div.infoCont > div.priceCont", sep = ""))$getElementText()
  # checking if there is a rent or not
  if(!is.null(rent)){
    tryCatch({
      rent <- unlist(strsplit(rent[[1]][1], "\n"))
      rent_df <- rbind(rent_df, rent)
    }, error = function(e){
      return("NoSuchElement")
      i = i + 1
    })
  }
}
You can do this much more easily with rvest rather than using the sledgehammer of RSelenium. It also copes much better with missing information.
To get a dataframe with the addresses and rents, you can use html_nodes to create a list of the boxes containing the information, and then html_node to find the relevant information in each one. There will be one entry for each box, and any missing data will just appear as NA.
library(dplyr) #only needed for the pipe operator %>%
library(rvest)
url <- "https://www.toimitilat.fi/toimitilahaku/?size_min=&size_max=&deal_type%5B%5D=1&language=fin&result_type=list&advanced=0&gbl=1&ref=main#searchresult"
boxes <- read_html(url) %>%   # read the page
  html_nodes(".infoCont")     # find the info boxes
address <- boxes %>%
  html_node("h4 > a") %>%     # find the address info in each box
  html_text()
rent <- boxes %>%
  html_node(".priceCont") %>% # find the rent info in each box
  html_text() %>%             # extract the text
  trimws()                    # trim whitespace
# put together in a dataframe
rent_df <- data.frame(address = address,
                      rent = rent,
                      stringsAsFactors = FALSE)
head(rent_df)
address rent
1 Akaa, Airolantie 5 Myyntihinta: \nMyydään tarjousten perusteella...
2 Akaa, Hämeentie 18 <NA>
3 Akaa, Hämeentie 69, Akaa
4 Akaa, Keskuskatu 42 Vuokrahinta: \n300 e/kk + alv
5 Akaa, Kirkkotori 10, Toijala Vuokrahinta: \n450
6 Akaa, Palomäentie 6 Toijala Vuokrahinta: \n3€/m2+alv
You can then easily extract the information you need.
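For example, to pull the numeric part of the rent out of the combined "Vuokrahinta: ..." string, a quick sketch (the regex is an assumption and may need tuning for prices like "3€/m2+alv"):
library(stringr) # for str_extract
# grab the first number in each rent string and convert it to numeric
rent_df$rent_value <- as.numeric(
  gsub(",", ".", str_extract(rent_df$rent, "[0-9]+([.,][0-9]+)?"))
)
head(rent_df)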
The rvest solution should be easier, but if you want or need to use RSelenium, this should work:
# Preparation
library(dplyr) # required for bind_rows
# add url
url <- "https://www.toimitilat.fi/toimitilahaku/?size_min=&size_max=&deal_type%5B%5D=1&language=fin&result_type=list&advanced=0&gbl=1&ref=main#searchresult"
rD <- rsDriver()
remDr <- rD$client
remDr$navigate(url)
# Checking that rD and remDr objects exist and work
## If you get an error here, it means the Selenium objects don't work - usually because the ports are busy, the Selenium server or client was not closed properly, or the browser drivers are out of date (or something else)
class(rD)
class(remDr)
# making separate function retrieving the rent and handling exceptions
giveRent <- function(i) {
  Sys.sleep(0.86)
  tryCatch({
    rent <- remDr$findElement(using = "css selector", paste("#objectList > div:nth-child(", i, ") > div.infoCont > div.priceCont", sep = ""))$getElementText()
    rent <- unlist(strsplit(rent[[1]][1], "\n"))
    rent <- rent[2]
  },
  warning = function(e){rent <<- NA},
  error = function(e){rent <<- NA})
  return(rent)
}
# adding rent to the dataframe in for-loop
rent_df <- c()
for(i in 1:33){rent_df <- bind_rows(rent_df, (data.frame(giveRent(i))))}
print(rent_df)
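Whichever approach you use, close the client and stop the Selenium server when you are done, otherwise the port stays busy for the next run (one of the failure causes noted above):
# shut down the browser session and the Selenium server
remDr$close()
rD$server$stop()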
I need to scrape product information from an e-commerce page, but the page has infinite scrolling. Currently I am able to scrape only the products shown without scrolling down. Below is the code for it.
require(RCurl)
require(XML)
require(dplyr)
require(stringr)
webpage <- getURL("http://www.jabong.com/kids/clothing/girls-clothing/kids-tops-t-shirts/?source=topnav_kids")
linklist <- str_extract_all(webpage, '(?<=href=")[^"]+')[[1]]
linklist <- as.data.frame(linklist)
linklist <- filter(linklist, grepl("\\?pos=", linklist))
linklist <- unique(linklist)
a <- as.data.frame(linklist)
a[2] <- "Jabong.com"
a <- add_rownames(a, "ID")
a$V3 <- gsub(" ", "", paste(a$V2, a$linklist))
a <- a[, -(1:3)]
colnames(a) <- "Links"
Well, if scrolling is truly infinite, then it is impossible to get ALL of the links... If you wanted to settle for a finite number, you can indeed fruitfully use RSelenium here.
library(RSelenium)
#start RSelenium
checkForServer()
startServer()
remDr <- remoteDriver()
remDr$open()
# load your page
remDr$navigate("http://www.jabong.com/kids/clothing/girls-clothing/kids-tops-t-shirts/?source=topnav_kids")
# scroll down 5 times, allowing 3 seconds for the page to load every time
for(i in 1:5){
remDr$executeScript(paste("scroll(0,",i*10000,");"))
Sys.sleep(3)
}
# get the page html
page_source <- remDr$getPageSource()
# get the URL's that you are looking for
pp <- xml2::read_html(page_source[[1]]) %>%
rvest::html_nodes("a") %>%
rvest::html_attr("data-original-href") %>%
{.[!is.na(.)]}
The result is 312 links (in my browser). The more you have RSelenium scroll down, the more links you'll get.
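If you would rather not hard-code the number of scrolls, a variation is to keep scrolling until the page height stops growing; a rough sketch along the same lines (it assumes the site eventually stops loading new items):
last_height <- 0
repeat {
  remDr$executeScript("window.scrollTo(0, document.body.scrollHeight);")
  Sys.sleep(3) # give the page time to append new products
  new_height <- remDr$executeScript("return document.body.scrollHeight;")[[1]]
  if (new_height == last_height) break # nothing new loaded, stop scrolling
  last_height <- new_height
}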