RSelenium scraping loop and list - R

I'm trying to use this code:
require(RSelenium)
require(XML) # needed below for htmlTreeParse()/readHTMLTable()
checkForServer()
startServer()
remDr<-remoteDriver()
remDr$open()
appURL <- 'http://www.mtmis.excise-punjab.gov.pk'
remDr$navigate(appURL)
remDr$findElement("name", "vhlno")$sendKeysToElement(list("ria-07-777"))
# can't figure out the CSS selector
remDr$findElements("class", "ent-button-div")[[1]]$clickElement()
# after the search query has been submitted
elem <- remDr$findElement(using="class", value="result-div")
elemtxt <- elem$getElementAttribute("outerHTML")[[1]]
elemxml <- htmlTreeParse(elemtxt, useInternalNodes=T)
final <- readHTMLTable(elemxml)
remDr$close()
rD[["server"]]$stop()
What I want is an automated for loop that runs this query for each vehicle in a list and merges all results into one final table keyed by the unique identifier, e.g. "ria-07-777".
list <- c("ria-07-776", "ria-07-777", "ria-07-778")

Why do you need Selenium?
library(httr)
library(rvest)
clean_cols <- function(x) {
  x <- tolower(x)
  x <- gsub("[[:punct:][:space:]]+", "_", x) # punctuation/space runs become "_"
  x <- gsub("_+", "_", x)                    # collapse repeated "_"
  x <- gsub("(^_|_$)", "", x)                # trim leading/trailing "_"
  make.unique(x, sep = "_")
}
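# (illustration, not in the original answer) clean_cols() turns scraped field
# labels into tidy, unique column names; made-up example labels:
clean_cols(c("Registration Number", "Owner / Father Name", "Owner / Father Name"))
## "registration_number" "owner_father_name" "owner_father_name_1"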
get_vehicle_info <- function(vhlno) {
  POST(
    url = 'http://www.mtmis.excise-punjab.gov.pk/',
    set_cookies(has_js=1),
    body = list(vhlno=vhlno)
  ) -> res
  stop_for_status(res)
  pg <- content(res)
  # grab the result rows, skipping any cells that span columns
  rows <- html_nodes(pg, xpath=".//div[contains(@class, 'result-div')]/table/tr[td[not(@colspan)]]")
  # first <td> is the field label, second <td> is the value; pivot into one wide row
  cbind.data.frame(
    as.list(
      setNames(
        html_text(html_nodes(rows, xpath=".//td[2]")),
        clean_cols(html_text(html_nodes(rows, xpath=".//td[1]")))
      )
    ),
    stringsAsFactors=FALSE
  )
}
Now use ^^:
vehicles <- c("ria-07-776", "ria-07-777", "ria-07-778")
Reduce(
  rbind.data.frame,
  lapply(vehicles, function(v) {
    Sys.sleep(5) # your desire to steal a bunch of vehicle info to make a sketchy database does not give you the right to hammer the server, and you'll very likely remove this line anyway, but I had to try
    get_vehicle_info(v)
  })
) -> vehicle_df
str(vehicle_df)
## 'data.frame': 3 obs. of 12 variables:
## $ registration_number: chr "ria-07-776" "ria-07-777" "ria-07-778"
## $ chassis_number : chr "KZJ95-0019869" "NFBFD15746R101101" "NZE1206066278"
## $ engine_number : chr "1KZ-0375851" "R18A11981105" "X583994"
## $ make_name : chr "LAND - CRUISER" "HONDA - CIVIC" "TOYOTA - COROLLA"
## $ registration_date : chr "17-Dec-2007 12:00 AM" "01-Aug-2007 12:00 AM" "01-Jan-1970 12:00 AM"
## $ model : chr "1997" "2006" "2007"
## $ vehicle_price : chr "1,396,400" "1,465,500" "0"
## $ color : chr "MULTI" "GRENDA B.P" "SILVER"
## $ token_tax_paid_upto: chr "June 2015" "June 2011" "June 2016"
## $ owner_name : chr "FATEH DIN AWAN" "M BILAL YASIN" "MUHAMMAD ALTAF"
## $ father_name : chr "HAFIZ ABDUL HAKEEM AWAN" "CH M. YASIN" "NAZAR MUHAMMAD"
## $ owner_city : chr "RAWALPINDI" "ISLAMABAD" "SARGODHA"
You'll need to handle network and scraping errors on your own. I can't justify any more time for this likely unethical endeavour (the answer was more to help others with similar q's).
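For instance, here is a minimal error-handling sketch (my addition, assuming the get_vehicle_info() and vehicles objects above): wrap each lookup in tryCatch() so one failure yields NULL instead of aborting the whole run.
# minimal sketch, not part of the original answer: skip vehicles whose lookup
# fails, report the error, and drop the NULLs before row-binding
safe_vehicle_info <- function(v) {
  tryCatch(get_vehicle_info(v), error = function(e) {
    message("lookup failed for ", v, ": ", conditionMessage(e))
    NULL
  })
}

results <- lapply(vehicles, function(v) { Sys.sleep(5); safe_vehicle_info(v) })
vehicle_df <- do.call(rbind.data.frame, Filter(Negate(is.null), results))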

Related

Creating arraylist using for loop

I want to create an arraylist of the price data of certain stocks.
First, I selected my basket of stocks using:
tickers <- c("^GSPC","MSFT","INTC","NVDA","AAPL")
Next, I downloaded the price data using a for loop function:
for (i in 1:length(tickers)) {
  getSymbols(tickers[i],
             from = as.Date("2006-01-01"), to = as.Date("2009-12-31"))
}
Now, I want to add each stock data into an arraylist, so I tried something like this:
s <- list()
for (i in 1:length(tickers)) {
  getSymbols(tickers[i],
             from = as.Date("2006-01-01"), to = as.Date("2009-12-31")) %>%
    {. ->> s[[i]]}
}
But the output seems to only give me a list of the names of the stocks:
[[1]] [1] "GSPC"
[[2]] [1] "MSFT"
[[3]] [1] "INTC"
[[4]] [1] "NVDA"
[[5]] [1] "AAPL"
Is there something wrong with the code I gave after the pipe function?
Just use lapply to create your list object and make sure to set the option auto.assign to FALSE. (With the default auto.assign = TRUE, getSymbols() assigns the downloaded data into your workspace and returns only the symbol name, which is why your list ended up holding just the ticker strings.)
library(quantmod)
tickers <- c("^GSPC","MSFT","INTC","NVDA","AAPL")
# Get the ticker data
s <- lapply(tickers, getSymbols, from = as.Date("2006-01-01"), to = as.Date("2009-12-31"), auto.assign = FALSE)
# name the list objects
names(s) <- tickers
str(s)
List of 5
$ ^GSPC:An ‘xts’ object on 2006-01-03/2009-12-30 containing:
Data: num [1:1006, 1:6] 1248 1269 1273 1273 1285 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:6] "GSPC.Open" "GSPC.High" "GSPC.Low" "GSPC.Close" ...
Indexed by objects of class: [Date] TZ: UTC
xts Attributes:
List of 2
..$ src : chr "yahoo"
..$ updated: POSIXct[1:1], format: "2018-12-07 15:01:48"
$ MSFT :An ‘xts’ object on 2006-01-03/2009-12-30 containing:
.....
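If you would rather keep the explicit for loop from the question, the same auto.assign = FALSE fix applies there too; a minimal sketch using the tickers vector above:
library(quantmod)

s <- list()
for (i in seq_along(tickers)) {
  # with auto.assign = FALSE, getSymbols() returns the xts object itself
  # instead of assigning it to the workspace and returning just the name
  s[[i]] <- getSymbols(tickers[i],
                       from = as.Date("2006-01-01"),
                       to = as.Date("2009-12-31"),
                       auto.assign = FALSE)
}
names(s) <- tickers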

POST to API using httr in R results in error

I'm trying to pull data directly from an API into R using the httr package. The API doesn't require any authentication, and accepts JSON strings of lat, long, elevation, variable sets, and time period to estimate climate variables for any location. This is my first time using an API, but the code below is what I've cobbled together from various Stack Overflow posts.
library(jsonlite)
library(httr)
url = "http://apibc.climatewna.com/api/clmApi"
body <- data.frame(lat = c(48.98,50.2), ##two example locations
lon = c(-115.02, -120),
el = c(1000,100),
prd = c("Normal_1961_1990.nrm","Normal_1961_1990.nrm"),
varYSM = c("Y","SST"))
requestBody <- toJSON(list("output" = body),auto_unbox = TRUE) ##convert to JSON string
result <- POST("http://apibc.climatewna.com/api/clmApi", ##post to API
body = requestBody,
add_headers(`Content-Type`="application/json"))
content(result)
I've tried various versions of this (e.g. writing the JSON string manually, putting the body as a list in POST with encode = "json"), and it always runs, but the content always contains the error message below:
$Message
[1] "An error has occurred."
$ExceptionMessage
[1] "Object reference not set to an instance of an object."
$ExceptionType
[1] "System.NullReferenceException"
If I use GET and specify the variables directly in the URL
url = "http://apibc.climatewna.com/api/clmApi/LatLonEl?lat=48.98&lon=-115.02&el=1000&prd=Normal_1961_1990&varYSM=Y"
result <- GET(url)
content(result)
it produces the correct output, but then I can only obtain information for one location at a time. There isn't currently any public documentation about this API as it's very new, but I've attached a draft of the section explaining it using JS below. I would very much appreciate any help/suggestions on what I'm doing wrong!
Thank you!
The main problem is that jQuery.ajax encodes the data using jQuery.param before sending it to the API, so what it's sending looks something like [0][lat]=48.98&[0][lon]=-115.02.... I don't know of a package in R that does a similar encoding as jQuery.param, so we'll have to hack something together.
Modifying your example slightly:
library(httr)
body <- data.frame(lat = c(48.98,50.2), ##two example locations
lon = c(-115.02, -120),
el = c(1000,100),
prd = c("Normal_1961_1990","Normal_1961_1990"),
varYSM = c("Y","Y"))
Now, we do the encoding, like so:
out <- sapply(1:nrow(body), function(i) {
  paste(c(
    paste0(sprintf("[%d][lat]", i - 1), "=", body$lat[i]),
    paste0(sprintf("[%d][lon]", i - 1), "=", body$lon[i]),
    paste0(sprintf("[%d][el]", i - 1), "=", body$el[i]),
    paste0(sprintf("[%d][prd]", i - 1), "=", body$prd[i]),
    paste0(sprintf("[%d][varYSM]", i - 1), "=", body$varYSM[i])
  ), collapse = "&")
})
out <- paste(out, collapse = "&")
So now out is in a form that the API likes. Finally:
result <- POST(url = "http://apibc.climatewna.com/api/clmApi", ##post to API
body = out,
add_headers(`Content-Type`="application/x-www-form-urlencoded"))
Note the Content-Type. We get:
df <- do.call(rbind, lapply(content(result), as.data.frame, stringsAsFactors = FALSE))
str(df)
# 'data.frame': 2 obs. of 29 variables:
# $ lat : chr "48.98" "50.2"
# $ lon : chr "-115.02" "-120"
# $ elev : chr "1000" "100"
# $ prd : chr "Normal_1961_1990" "Normal_1961_1990"
# $ varYSM : chr "Y" "Y"
# $ MAT : chr "5.2" "8"
# $ MWMT : chr "16.9" "20.2"
# $ MCMT : chr "-6.7" "-5.6"
# $ TD : chr "23.6" "25.7"
# $ MAP : chr "617" "228"
# $ MSP : chr "269" "155"
# $ AHM : chr "24.7" "79.1"
# $ SHM : chr "62.9" "130.3"
# $ DD_0 : chr "690" "519"
# $ DD5 : chr "1505" "2131"
# $ DD_18 : chr "4684" "3818"
# $ DD18 : chr "60" "209"
# $ NFFD : chr "165" "204"
# $ bFFP : chr "150" "134"
# $ eFFP : chr "252" "254"
# $ FFP : chr "101" "120"
# $ PAS : chr "194" "34"
# $ EMT : chr "-36.3" "-32.7"
# $ EXT : chr "37.1" "41.2"
# $ Eref : chr "14.7" "13.6"
# $ CMD : chr "721" "862"
# $ MAR : chr "347" "679"
# $ RH : chr "57" "57"
# $ Version: chr "ClimateBC_API_v5.51" "ClimateBC_API_v5.51"
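As an aside (my own generalization, not part of the original answer), the same jQuery.param-style encoding can be written once for any data frame so each column doesn't need to be spelled out by hand; the values here are already URL-safe, so no escaping is applied (use utils::URLencode(..., reserved = TRUE) if yours are not).
# sketch of a generic jQuery.param-style encoder for a data frame of simple,
# already URL-safe values; column names become the bracketed field names
jquery_param <- function(df) {
  per_row <- lapply(seq_len(nrow(df)), function(i) {
    sprintf("[%d][%s]=%s", i - 1L, names(df), unlist(df[i, ], use.names = FALSE))
  })
  paste(unlist(per_row), collapse = "&")
}

# equivalent to the hand-written out above:
out <- jquery_param(body)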

Scraping table from myneta using R

I am trying to scrape a table from http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary into RStudio.
Here's the code
library(rvest)
url <- 'http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary'
webpage <- read_html(url)
candidate_info <- html_nodes(webpage, xpath='//*[@id="main"]/div/div[2]/div[2]/table')
candidate_info <- html_table(candidate_info)
head(candidate_info)
But I'm getting no output. Can you suggest what I am doing wrong?
That site has some very broken HTML. But, it's workable.
I find it better to target nodes in a slightly less fragile way. The XPath below finds the table by its content.
html_table() croaks on it (or was taking forever and I didn't want to wait), so I ended up building the table "manually".
library(rvest)
# helper to clean column names
mcga <- function(x) {
  x <- tolower(x)
  x <- gsub("[[:punct:][:space:]]+", "_", x)
  x <- gsub("_+", "_", x)
  x <- gsub("(^_|_$)", "", x)
  make.unique(x, sep = "_")
}
pg <- read_html("http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary")
# target the table
tab <- html_node(pg, xpath=".//table[contains(thead, 'Liabilities')]")
# get the rows so we can target columns
rows <- html_nodes(tab, xpath=".//tr[td[not(@colspan)]]")
# make a data frame
do.call(
  cbind.data.frame,
  c(lapply(1:8, function(i) {
    html_text(html_nodes(rows, xpath=sprintf(".//td[%s]", i)), trim=TRUE)
  }), list(stringsAsFactors=FALSE))
) -> xdf
# make nicer names
xdf <- setNames(xdf, mcga(html_text(html_nodes(tab, "th")))) # get the header to get column names
str(xdf)
## 'data.frame': 4823 obs. of 8 variables:
## $ sno : chr "1" "2" "3" "4" ...
## $ candidate : chr "A Hasiv" "A Wahid" "Aan Shikhar Shrivastava" "Aaptab Urf Aftab" ...
## $ constituency : chr "ARYA NAGAR" "GAINSARI" "GOSHAINGANJ" "MUBARAKPUR" ...
## $ party : chr "BSP" "IND" "Satya Shikhar Party" "Islam Party Hind" ...
## $ criminal_case: chr "0" "0" "0" "0" ...
## $ education : chr "12th Pass" "10th Pass" "Graduate" "Illiterate" ...
## $ total_assets : chr "Rs 3,94,24,827 ~ 3 Crore+" "Rs 75,106 ~ 75 Thou+" "Rs 41,000 ~ 41 Thou+" "Rs 20,000 ~ 20 Thou+" ...
## $ liabilities : chr "Rs 58,46,335 ~ 58 Lacs+" "Rs 0 ~" "Rs 0 ~" "Rs 0 ~" ...
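A hedged follow-up (not in the original answer): every column comes back as character, so if you want the asset and liability figures as numbers you can strip the "Rs", the commas, and the "~ 3 Crore+" style annotation, for example:
# sketch: keep only the digits before the "~ ..." annotation and convert
to_rupees <- function(x) as.numeric(gsub("[^0-9]", "", sub("~.*$", "", x)))
xdf$total_assets_inr <- to_rupees(xdf$total_assets)
xdf$liabilities_inr  <- to_rupees(xdf$liabilities)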

Geocode IP addresses in R

I have written this short piece of code to automate geocoding of IP addresses using freegeoip.net (15,000 queries per hour by default; excellent service!):
> library(RCurl)
Loading required package: bitops
> ip.lst =
c("193.198.38.10","91.93.52.105","134.76.194.180","46.183.103.8")
> q = do.call(rbind, lapply(ip.lst, function(x){
try( data.frame(t(strsplit(getURI(paste0("freegeoip.net/csv/", x)), ",")[[1]]), stringsAsFactors = FALSE) )
}))
> names(q) = c("ip","country_code","country_name","region_code","region_name","city","zip_code","time_zone","latitude","longitude","metro_code")
> str(q)
'data.frame': 4 obs. of 11 variables:
$ ip : chr "193.198.38.10" "91.93.52.105" "134.76.194.180" "46.183.103.8"
$ country_code: chr "HR" "TR" "DE" "DE"
$ country_name: chr "Croatia" "Turkey" "Germany" "Germany"
$ region_code : chr "" "06" "NI" ""
$ region_name : chr "" "Ankara" "Lower Saxony" ""
$ city : chr "" "Ankara" "Gottingen" ""
$ zip_code : chr "" "06450" "37079" ""
$ time_zone : chr "Europe/Zagreb" "Europe/Istanbul" "Europe/Berlin" ""
$ latitude : chr "45.1667" "39.9230" "51.5333" "51.2993"
$ longitude : chr "15.5000" "32.8378" "9.9333" "9.4910"
$ metro_code : chr "0\r\n" "0\r\n" "0\r\n" "0\r\n"
In three lines of code you get coordinates for all IPs, including city/country codes. I wonder if this could be parallelized so it runs even faster? Geocoding >10,000 IPs can otherwise take hours.
library(rgeolocate)
ip_lst = c("193.198.38.10", "91.93.52.105", "134.76.194.180", "46.183.103.8")
maxmind(ip_lst, "~/Data/GeoLite2-City.mmdb",
fields=c("country_code", "country_name", "region_name", "city_name",
"timezone", "latitude", "longitude"))
## country_code country_name region_name city_name timezone latitude longitude
## 1 HR Croatia <NA> <NA> Europe/Zagreb 45.1667 15.5000
## 2 TR Turkey Istanbul Istanbul Europe/Istanbul 41.0186 28.9647
## 3 DE Germany Lower Saxony Bilshausen Europe/Berlin 51.6167 10.1667
## 4 DE Germany North Rhine-Westphalia Aachen Europe/Berlin 50.7787 6.1085
There are instructions in the package for obtaining the necessary data files. Some of the fields you're pulling are woefully inaccurate (more so than any geoip vendor would like to admit). If you do need ones that aren't available, file an issue and we'll add them.
I've found multidplyr is a great package for making parallel server calls. This is the best guide I've found, and I highly recommend reading the whole thing to better understand how the package works: http://www.business-science.io/code-tools/2016/12/18/multidplyr.html
library("devtools")
devtools::install_github("hadley/multidplyr")
library(parallel)
library(multidplyr)
library(RCurl)
library(tidyverse)
# Convert your example into a function
get_ip <- function(ip) {
  do.call(rbind, lapply(ip, function(x) {
    try(data.frame(t(strsplit(getURI(
      paste0("freegeoip.net/csv/", x)
    ), ",")[[1]]), stringsAsFactors = FALSE))
  })) %>% nest(X1:X11)
}
# Made ip.lst into a Tibble to make it work better with dplyr
ip.lst =
tibble(
ip = c(
"193.198.38.10",
"91.93.52.105",
"134.76.194.180",
"46.183.103.8",
"193.198.38.10",
"91.93.52.105",
"134.76.194.180",
"46.183.103.8"
)
)
# Create a cluster based on how many cores your machine has
cl <- detectCores()
cluster <- create_cluster(cores = cl)
# Create a partitioned tibble
by_group <- partition(ip.lst, cluster = cluster)
# Send libraries and the function get_ip() to each cluster
by_group %>%
cluster_library("tidyverse") %>%
cluster_library("RCurl") %>%
cluster_assign_value("get_ip", get_ip)
# Send parallel requests to the website and parse the results
q <- by_group %>%
do(get_ip(.$ip)) %>%
collect() %>%
unnest() %>%
tbl_df() %>%
select(-PARTITION_ID)
# Set names of the results
names(q) = c(
"ip",
"country_code",
"country_name",
"region_code",
"region_name",
"city",
"zip_code",
"time_zone",
"latitude",
"longitude",
"metro_code"
)
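A lighter-weight alternative sketch (my own suggestion, not from either answer): if you only need parallel HTTP calls without the dplyr machinery, parallel::mclapply() over a per-IP helper does the same job on Unix-alike systems (on Windows set mc.cores = 1, which runs serially).
library(parallel)
library(RCurl)

# hypothetical helper mirroring the original getURI()/strsplit() parsing
geocode_one <- function(x) {
  try(data.frame(t(strsplit(getURI(paste0("freegeoip.net/csv/", x)), ",")[[1]]),
                 stringsAsFactors = FALSE))
}

q <- do.call(rbind, mclapply(ip.lst$ip, geocode_one, mc.cores = detectCores()))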

wordcloud package: get “Error in strwidth(…) : invalid 'cex' value”

I am using the tm and wordcloud packages in R 2.15.1. I am trying to make a word cloud. Here is the code:
library(twitteR)   # userTimeline()
library(tm)        # removeWords(), stopwords(), Corpus(), TermDocumentMatrix()
library(wordcloud) # comparison.cloud()
maruti_tweets = userTimeline("Maruti_suzuki", n=1000,cainfo="cacert.pem")
hyundai_tweets = userTimeline("HyundaiIndia", n=1000,cainfo="cacert.pem")
tata_tweets = userTimeline("TataMotor", n=1000,cainfo="cacert.pem")
toyota_tweets = userTimeline("Toyota_India", n=1000,cainfo="cacert.pem")
# get text
maruti_txt = sapply(maruti_tweets, function(x) x$getText())
hyundai_txt = sapply(hyundai_tweets, function(x) x$getText())
tata_txt = sapply(tata_tweets, function(x) x$getText())
toyota_txt = sapply(toyota_tweets, function(x) x$getText())
clean.text = function(x)
{
# tolower
x = tolower(x)
# remove rt
x = gsub("rt", "", x)
# remove @mentions
x = gsub("@\\w+", "", x)
# remove punctuation
x = gsub("[[:punct:]]", "", x)
# remove numbers
x = gsub("[[:digit:]]", "", x)
# remove links http
x = gsub("http\\w+", "", x)
# remove tabs
x = gsub("[ |\t]{2,}", "", x)
# remove blank spaces at the beginning
x = gsub("^ ", "", x)
# remove blank spaces at the end
x = gsub(" $", "", x)
return(x)
}
# clean texts
maruti_clean = clean.text(maruti_txt)
hyundai_clean = clean.text(hyundai_txt)
tata_clean = clean.text(tata_txt)
toyota_clean = clean.text(toyota_txt)
maruti = paste(maruti_clean, collapse=" ")
hyundai= paste(hyundai_clean, collapse=" ")
tata= paste(tata_clean, collapse=" ")
toyota= paste(toyota_clean, collapse=" ")
# put everything in a single vector
all = c(maruti, hyundai, tata, toyota)
# remove stop-words
all = removeWords(all,
c(stopwords("english"), "maruti", "tata", "hyundai", "toyota"))
# create corpus
corpus = Corpus(VectorSource(all))
# create term-document matrix
tdm = TermDocumentMatrix(corpus)
# convert as matrix
tdm = as.matrix(tdm)
# add column names
colnames(tdm) = c("MARUTI", "HYUNDAI", "TATA", "TOYOTA")
# comparison cloud
comparison.cloud(tdm, random.order=FALSE, colors=c("#00B2FF", "red", "#FF0099", "#6600CC"), max.words=500)
but getting following error
Error in strwidth(words[i], cex = size[i], ...) : invalid 'cex' value
please help
You have a typo in the TataMotors Twitter account name. It should be spelled 'TataMotors', not 'TataMotor'. As a result, one column in your term matrix is empty, and when cex is calculated it gets assigned NaN.
Fix the typo and the rest of the code works fine. Good luck!
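As a quick diagnostic (my own sketch, not part of the original answer), you can also check the term-document matrix for all-zero document columns before plotting; an empty column is exactly what drives cex to NaN:
# tdm is the matrix built above (terms x documents); a document column that
# sums to zero is what produces the invalid 'cex' error
colSums(tdm)                                 # per-brand term totals
tdm <- tdm[, colSums(tdm) > 0, drop = FALSE] # drop any empty documents
comparison.cloud(tdm, random.order = FALSE, max.words = 500)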
I spotted the empty-column issue in a different application throwing the same error. In my case it was because of the removeSparseTerms command applied to a document term matrix. Using str() helped me identify the bug.
The input variable (slightly edited) had 289 columns:
> str(corpus.dtm)
List of 6
$ i : int [1:443] 3 4 6 8 10 12 15 18 19 21 ...
$ j : int [1:443] 105 98 210 93 287 249 126 223 129 146 ...
$ v : num [1:443] 1 1 1 1 1 1 1 1 1 1 ...
$ nrow : int 1408
$ ncol : int 289
$ dimnames:List of 2
..$ Docs : chr [1:1408] "character(0)" "character(0)" "character(0)" "character(0)" ...
..$ Terms: chr [1:289] "word1" "word2" "word3" "word4" ...
- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
The command was:
removeSparseTerms(corpus.dtm,0.90)->corpus.dtm.frequent
And the result had 0 columns:
> str(corpus.dtm.frequent)
List of 6
$ i : int(0)
$ j : int(0)
$ v : num(0)
$ nrow : int 1408
$ ncol : int 0
$ dimnames:List of 2
..$ Docs : chr [1:1408] "character(0)" "character(0)" "character(0)" "character(0)" ...
..$ Terms: NULL
- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
Raising the sparsity coefficient from 0.90 to 0.95 solved the issue. For a wordier document I went up to 0.999 in order to have a non-empty result after removing the sparse terms.
Empty columns are a good thing to check out when this error occurs.
