I have written this short bit of code to automate geocoding of IP addresses using freegeoip.net (15,000 queries per hour by default; excellent service!):
> library(RCurl)
Loading required package: bitops
> ip.lst =
c("193.198.38.10","91.93.52.105","134.76.194.180","46.183.103.8")
> q = do.call(rbind, lapply(ip.lst, function(x){
try( data.frame(t(strsplit(getURI(paste0("freegeoip.net/csv/", x)), ",")[[1]]), stringsAsFactors = FALSE) )
}))
> names(q) = c("ip","country_code","country_name","region_code","region_name","city","zip_code","time_zone","latitude","longitude","metro_code")
> str(q)
'data.frame': 4 obs. of 11 variables:
$ ip : chr "193.198.38.10" "91.93.52.105" "134.76.194.180" "46.183.103.8"
$ country_code: chr "HR" "TR" "DE" "DE"
$ country_name: chr "Croatia" "Turkey" "Germany" "Germany"
$ region_code : chr "" "06" "NI" ""
$ region_name : chr "" "Ankara" "Lower Saxony" ""
$ city : chr "" "Ankara" "Gottingen" ""
$ zip_code : chr "" "06450" "37079" ""
$ time_zone : chr "Europe/Zagreb" "Europe/Istanbul" "Europe/Berlin" ""
$ latitude : chr "45.1667" "39.9230" "51.5333" "51.2993"
$ longitude : chr "15.5000" "32.8378" "9.9333" "9.4910"
$ metro_code : chr "0\r\n" "0\r\n" "0\r\n" "0\r\n"
In three lines of code you get coordinates for all IPs, including city and country codes. I wonder whether this could be parallelized so it runs even faster? Geocoding more than 10,000 IPs can otherwise take hours.
library(rgeolocate)
ip_lst = c("193.198.38.10", "91.93.52.105", "134.76.194.180", "46.183.103.8")
maxmind(ip_lst, "~/Data/GeoLite2-City.mmdb",
fields=c("country_code", "country_name", "region_name", "city_name",
"timezone", "latitude", "longitude"))
## country_code country_name region_name city_name timezone latitude longitude
## 1 HR Croatia <NA> <NA> Europe/Zagreb 45.1667 15.5000
## 2 TR Turkey Istanbul Istanbul Europe/Istanbul 41.0186 28.9647
## 3 DE Germany Lower Saxony Bilshausen Europe/Berlin 51.6167 10.1667
## 4 DE Germany North Rhine-Westphalia Aachen Europe/Berlin 50.7787 6.1085
There are instructions in the package for obtaining the necessary data files. Some of the fields you're pulling are woefully inaccurate (more so than any geoip vendor would like to admit). If you do need ones that aren't available, file an issue and we'll add them.
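If it helps, fetching and unpacking the database can look roughly like this. This is only a sketch, not the package's documented procedure: the URL below is a placeholder, and MaxMind's distribution and licensing process has changed over time, so follow the package instructions for the current steps.
# Hypothetical sketch -- the URL is a placeholder; see the rgeolocate
# docs and the MaxMind site for the real download location and terms.
db_url <- "https://example.com/GeoLite2-City.mmdb.gz"
dir.create("~/Data", showWarnings = FALSE)
download.file(db_url, "~/Data/GeoLite2-City.mmdb.gz", mode = "wb")
R.utils::gunzip("~/Data/GeoLite2-City.mmdb.gz", overwrite = TRUE)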
I've found multidplyr is a great package for making parallel server calls. This is the best guide I've found, and I highly recommend reading the whole thing to better understand how the package works: http://www.business-science.io/code-tools/2016/12/18/multidplyr.html
library("devtools")
devtools::install_github("hadley/multidplyr")
library(parallel)
library(multidplyr)
library(RCurl)
library(tidyverse)
# Convert your example into a function
get_ip <- function(ip) {
do.call(rbind, lapply(ip, function(x) {
try(data.frame(t(strsplit(getURI(
paste0("freegeoip.net/csv/", x)
), ",")[[1]]), stringsAsFactors = FALSE))
})) %>% nest(X1:X11)
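# nest(X1:X11) folds the 11 parsed columns into a one-row nested tibble
# (one per partition); unnest() after collect() expands them again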
}
# Make ip.lst a tibble so it works better with dplyr
ip.lst =
tibble(
ip = c(
"193.198.38.10",
"91.93.52.105",
"134.76.194.180",
"46.183.103.8",
"193.198.38.10",
"91.93.52.105",
"134.76.194.180",
"46.183.103.8"
)
)
# Create a cluster based on how many cores your machine has
cl <- detectCores()
cluster <- create_cluster(cores = cl)
# Create a partitioned tibble
by_group <- partition(ip.lst, cluster = cluster)
# Send libraries and the function get_ip() to each cluster
by_group %>%
cluster_library("tidyverse") %>%
cluster_library("RCurl") %>%
cluster_assign_value("get_ip", get_ip)
# Send parallel requests to the website and parse the results
q <- by_group %>%
do(get_ip(.$ip)) %>%
collect() %>%
unnest() %>%
tbl_df() %>%
select(-PARTITION_ID)
# Set names of the results
names(q) = c(
"ip",
"country_code",
"country_name",
"region_code",
"region_name",
"city",
"zip_code",
"time_zone",
"latitude",
"longitude",
"metro_code"
)
I'd like to convert an sf object to a data frame and later restore it to its original state. But after converting the geometry with st_as_text(st_sfc(stands_sel$geometry)), it proves very difficult to get it back. In my example:
library(sf)
# Get the AOI shapefile
zip_path <- tempfile(fileext = ".zip")
download.file(
"https://github.com/Leprechault/trash/raw/main/sel_stands_CMPC.zip",
zip_path
)
unzip(zip_path, exdir = tempdir())
# Open the file
setwd(tempdir())
stands_sel <- st_read("sel_stands_CMPC.shp")
st_crs(stands_sel) = 4326
# Extract geometry as text
geom <- st_as_text(st_sfc(stands_sel$geometry))
# Add the features
features <- st_drop_geometry(stands_sel)
str(features)
# Joining feature + geom
geo_df <- cbind(features, geom)
str(geo_df)
# 'data.frame': 2 obs. of 17 variables:
# $ CD_USO_SOL: num 2433 9053
# $ ID_REGIAO : num 11 11
# $ ID_PROJETO: chr "002" "344"
# $ PROJETO : chr "BARBA NEGRA" "CAMPO SECO"
# $ CD_TALHAO : chr "159A" "016A"
# $ CARACTERIS: chr "Plantio Comercial" "Plantio Comercial"
# $ CARACTER_1: chr "Produtivo" "Produtivo"
# $ CICLO : int 2 1
# $ ROTACAO : int 1 1
# $ DATA_PLANT: chr "2008/04/15" "2010/04/15"
# $ LOCALIDADE: chr "BARRA DO RIBEIRO" "DOM FELICIANO"
# $ ESPACAMENT: chr "3.00 x 2.50" "3.5 x 2.14"
# $ ESPECIE : chr "SALIGNA" "DUNNI"
# $ SISTEMA_PR: chr "MACRO ESTACA - EUCALIPTO" "SEMENTE - EUCALIPTO"
# $ VLR_AREA : num 8.53 28.07
# $ ID_UNIQUE : chr "BARBANEGRA159A" "CAMPOSECO016A"
# $ geom : chr "MULTIPOLYGON (((-51.21423 -30.35172, -51.21426 -30.35178, -51.2143 -30.35181, -51.21432 -30.35186, -51.21433 -3"| __truncated__
# Return to original format again
stands_sf <- geo_df %>%
st_geometry(geom) %>%
sf::st_as_sf(crs = 4326)
# Error in UseMethod("st_geometry") :
Please, any help to restore my stands_sf object to its original state?
I think geom isn't in a format st_geometry is expecting. st_as_text converted your geometry into WKT as discussed in the help:
The returned WKT representation of simple feature geometry conforms to the simple features access specification and extensions, known as EWKT, supported by PostGIS and other simple features implementations for addition of SRID to a WKT string.
https://r-spatial.github.io/sf/reference/st_as_text.html
Instead, use st_as_sf(wkt=) to set the new (old) geometry.
st_as_sf(geo_df, wkt = "geom", crs = 4326)
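A quick round-trip check, using the objects from the question:
stands_sf <- st_as_sf(geo_df, wkt = "geom", crs = 4326)
class(stands_sf) # "sf" "data.frame" -- the geometry column is live again
plot(st_geometry(stands_sf)) # the polygons draw as before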
I'd like to create a BigQuery table from geoJSON files. Although geoJSON is an accepted format in BQ (NEWLINE_DELIMITED_JSON), and bq_table_create() from the bigrquery package accepts a bq_fields specification or something coercible to it (like a data frame), the function doesn't work. In my example below the output error is Erro: Unsupported type: list:
library(sf)
library(bigrquery)
library(DBI)
library(googleAuthR)
library(geojsonsf)
library(geojsonR)
# Convert shapefile to geoJSON
stands_sel <- st_read(
"D:/Dropbox/Stinkbug_Ml_detection_CMPC/dashboard/v_08_CMPC/sel_stands_CMPC.shp")
# Open as geoJSON
geo <- sf_geojson(stands_sel)
# Convert geoJSON to data frame
geo_js_df <- as.data.frame(geojson_wkt(geo))
str(geo_js_df)
# 'data.frame': 2 obs. of 17 variables:
# $ SISTEMA_PR: chr "MACRO ESTACA - EUCALIPTO" "SEMENTE - EUCALIPTO"
# $ ESPECIE : chr "SALIGNA" "DUNNI"
# $ ID_UNIQUE : chr "BARBANEGRA159A" "CAMPOSECO016A"
# $ CICLO : num 2 1
# $ LOCALIDADE: chr "BARRA DO RIBEIRO" "DOM FELICIANO"
# $ ROTACAO : num 1 1
# $ CARACTER_1: chr "Produtivo" "Produtivo"
# $ VLR_AREA : num 8.53 28.07
# $ ID_REGIAO : num 11 11
# $ CD_USO_SOL: num 2433 9053
# $ DATA_PLANT: chr "2008/04/15" "2010/04/15"
# $ ID_PROJETO: chr "002" "344"
# $ CARACTERIS: chr "Plantio Comercial" "Plantio Comercial"
# $ PROJETO : chr "BARBA NEGRA" "CAMPO SECO"
# $ ESPACAMENT: chr "3.00 x 2.50" "3.5 x 2.14"
# $ CD_TALHAO : chr "159A" "016A"
# $ geometry :List of 2
# ..$ : 'wkt' chr "MULTIPOLYGON (((-51.2142 -30.3517,-51.2143 -30.3518,-51.2143 -30.3518,-51.2143 -30.3519,-51.2143 -30.3519,-51.2"| __truncated__
# ..$ : 'wkt' chr "MULTIPOLYGON (((-52.3214 -30.4271,-52.3214 -30.4272,-52.3214 -30.4272,-52.3215 -30.4272,-52.3215 -30.4272,-52.3"| __truncated__
# - attr(*, "wkt_column")= chr "geometry"
# Insert information inside BQ
bq_conn <- dbConnect(bigquery(),
project = "my-project",
use_legacy_sql = FALSE
)
# First create the table
players_table = bq_table(project = "my-project", dataset = "stands_ROI_2021", table = "CF_2021")
bq_table_create(x = players_table, fields = as_bq_fields(geo_js_df))
Erro: Unsupported type: list
You can upload a data frame with a list-type column to BigQuery by using bq_table_upload(). Try this in your script instead of bq_table_create():
bq_table_upload(players_table, geo_js_df)
For your reference, I tried this on my end using this sample data with a list-type column:
d <- data.frame(id = 1:2,
name = c("Jon", "Mark"),
children = I(list(c("Mary", "James"),
c("Greta", "Sally")))
)
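then created and uploaded the table the same way (a sketch; the project/dataset/table names are placeholders):
test_table <- bq_table(project = "my-project", dataset = "my-dataset", table = "list_col_test")
bq_table_upload(test_table, d)
bq_table_download(test_table)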
(Screenshots omitted: the R console output and the created BigQuery table.)
EDIT:
As per this documentation, FeatureCollection is not yet supported in BigQuery, however there is an ongoing feature request you can find here. Workaround is to convert the GeoJson file to BigQuery new-line-delimited JSON before converting it to dataframe.
To convert the GeoJSON file to BigQuery newline-delimited JSON, follow these steps:
1. Install node.js.
2. Add the required packages:
npm install fs JSONStream line-input-stream yargs
3. Clone the github repository:
git clone https://github.com/mentin/geoscripts.git
4. Change directory:
cd geoscripts/geojson2bq/
5. Convert the GeoJSON file to newline-delimited JSON:
node geojson2bqjson.js sel_stands.geojson > out.json
Using the newline-delimited JSON file, convert it to a data frame in the R console, then use bq_table_upload() to upload the data to BigQuery.
library(bigrquery)
library(dplyr)
library(tidyverse)
library(jsonlite)
out <- stream_in(file('out.json'))
projectid<-"my-project"
datasetid<-"my-dataset"
bq_conn <- dbConnect(bigquery(),
project = projectid,
dataset = datasetid,
use_legacy_sql = FALSE)
players_table = bq_table(project = "my-project", dataset = "my-dataset", table = "CF_2021_test5")
bq_table_upload(players_table, out)
bq_table_download(players_table)
(Screenshots omitted: the R console output and the resulting BigQuery table.)
I'm trying to pull data directly from an API into R using the httr package. The API doesn't require any authentication, and accepts JSON strings of lat, long, elevation, variable sets, and time period to estimate climate variables for any location. This is my first time using an API, but the code below is what I've cobbled together from various Stack Overflow posts.
library(jsonlite)
library(httr)
url = "http://apibc.climatewna.com/api/clmApi"
body <- data.frame(lat = c(48.98,50.2), ##two example locations
lon = c(-115.02, -120),
el = c(1000,100),
prd = c("Normal_1961_1990.nrm","Normal_1961_1990.nrm"),
varYSM = c("Y","SST"))
requestBody <- toJSON(list("output" = body),auto_unbox = TRUE) ##convert to JSON string
result <- POST("http://apibc.climatewna.com/api/clmApi", ##post to API
body = requestBody,
add_headers(`Content-Type`="application/json"))
content(result)
I've tried several variations of this (e.g. writing the JSON string manually, passing the body as a list to POST with encode = "json"), and it always runs, but the content always contains the error message below:
$Message
[1] "An error has occurred."
$ExceptionMessage
[1] "Object reference not set to an instance of an object."
$ExceptionType
[1] "System.NullReferenceException"
If I use GET and specify the variables directly in the URL
url = "http://apibc.climatewna.com/api/clmApi/LatLonEl?lat=48.98&lon=-115.02&el=1000&prd=Normal_1961_1990&varYSM=Y"
result <- GET(url)
content(result)
it produces the correct output, but then I can only obtain information for one location at a time. There isn't currently any public documentation for this API as it's very new, but I've attached a draft of the section explaining it (using JS) below. I would very much appreciate any help/suggestions on what I'm doing wrong!
Thank you!
The main problem is that jQuery.ajax encodes the data using jQuery.param before sending it to the API, so what it sends looks something like [0][lat]=48.98&[0][lon]=-115.02.... I don't know of an R package that does the same encoding as jQuery.param, so we'll have to hack something together.
Modifying your example slightly:
library(httr)
body <- data.frame(lat = c(48.98,50.2), ##two example locations
lon = c(-115.02, -120),
el = c(1000,100),
prd = c("Normal_1961_1990","Normal_1961_1990"),
varYSM = c("Y","Y"))
Now, we do the encoding, like so:
out <- sapply(1:nrow(body), function(i) {
paste(c(
paste0(sprintf("[%d][lat]", i - 1), "=", body$lat[i]),
paste0(sprintf("[%d][lon]", i - 1), "=", body$lon[i]),
paste0(sprintf("[%d][el]", i - 1), "=", body$el[i]),
paste0(sprintf("[%d][prd]", i - 1), "=", body$prd[i]),
paste0(sprintf("[%d][varYSM]", i - 1), "=", body$varYSM[i])
), collapse = "&")
})
out <- paste(out, collapse = "&")
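# out now looks like "[0][lat]=48.98&[0][lon]=-115.02&[0][el]=1000&...&[1][varYSM]=Y"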
so now out is in a form that the API likes. Finally
result <- POST(url = "http://apibc.climatewna.com/api/clmApi", ##post to API
body = out,
add_headers(`Content-Type`="application/x-www-form-urlencoded"))
noting the Content-Type. We get
df <- do.call(rbind, lapply(content(result), as.data.frame, stringsAsFactors = FALSE))
str(df)
# 'data.frame': 2 obs. of 29 variables:
# $ lat : chr "48.98" "50.2"
# $ lon : chr "-115.02" "-120"
# $ elev : chr "1000" "100"
# $ prd : chr "Normal_1961_1990" "Normal_1961_1990"
# $ varYSM : chr "Y" "Y"
# $ MAT : chr "5.2" "8"
# $ MWMT : chr "16.9" "20.2"
# $ MCMT : chr "-6.7" "-5.6"
# $ TD : chr "23.6" "25.7"
# $ MAP : chr "617" "228"
# $ MSP : chr "269" "155"
# $ AHM : chr "24.7" "79.1"
# $ SHM : chr "62.9" "130.3"
# $ DD_0 : chr "690" "519"
# $ DD5 : chr "1505" "2131"
# $ DD_18 : chr "4684" "3818"
# $ DD18 : chr "60" "209"
# $ NFFD : chr "165" "204"
# $ bFFP : chr "150" "134"
# $ eFFP : chr "252" "254"
# $ FFP : chr "101" "120"
# $ PAS : chr "194" "34"
# $ EMT : chr "-36.3" "-32.7"
# $ EXT : chr "37.1" "41.2"
# $ Eref : chr "14.7" "13.6"
# $ CMD : chr "721" "862"
# $ MAR : chr "347" "679"
# $ RH : chr "57" "57"
# $ Version: chr "ClimateBC_API_v5.51" "ClimateBC_API_v5.51"
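Everything comes back as character; if you want numbers, a quick conversion (sketch):
df[] <- lapply(df, type.convert, as.is = TRUE)
str(df) # lat/lon and the climate variables are now numeric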
I am trying to scrape a table from http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary into RStudio.
Here's the code:
library(rvest)
url <- 'http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary'
webpage <- read_html(url)
candidate_info <- html_nodes(webpage, xpath='//*[@id="main"]/div/div[2]/div[2]/table')
candidate_info <- html_table(candidate_info)
head(candidate_info)
But I'm getting no output. Can you suggest what I'm doing wrong?
That site has some very broken HTML. But, it's workable.
I find it better to target nodes in a slightly less fragile way. The XPath below finds the table by its content.
html_table() croaks (or took forever and I didn't want to wait) so I ended up building the table "manually".
library(rvest)
# helper to clean column names
mcga <- function(x) { make.unique(gsub("(^_|_$)", "", gsub("_+", "_", gsub("[[:punct:][:space:]]+", "_", tolower(x)))), sep = "_") }
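# e.g. mcga(c("Criminal Case", "Total Assets")) -> c("criminal_case", "total_assets")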
pg <- read_html("http://myneta.info/uttarpradesh2017/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary")
# target the table
tab <- html_node(pg, xpath=".//table[contains(thead, 'Liabilities')]")
# get the rows so we can target columns
rows <- html_nodes(tab, xpath=".//tr[td[not(@colspan)]]")
# make a data frame
do.call(
cbind.data.frame,
c(lapply(1:8, function(i) {
html_text(html_nodes(rows, xpath=sprintf(".//td[%s]", i)), trim=TRUE)
}), list(stringsAsFactors=FALSE))
) -> xdf
# make nicer names
xdf <- setNames(xdf, mcga(html_text(html_nodes(tab, "th")))) # get the header to get column names
str(xdf)
## 'data.frame': 4823 obs. of 8 variables:
## $ sno : chr "1" "2" "3" "4" ...
## $ candidate : chr "A Hasiv" "A Wahid" "Aan Shikhar Shrivastava" "Aaptab Urf Aftab" ...
## $ constituency : chr "ARYA NAGAR" "GAINSARI" "GOSHAINGANJ" "MUBARAKPUR" ...
## $ party : chr "BSP" "IND" "Satya Shikhar Party" "Islam Party Hind" ...
## $ criminal_case: chr "0" "0" "0" "0" ...
## $ education : chr "12th Pass" "10th Pass" "Graduate" "Illiterate" ...
## $ total_assets : chr "Rs 3,94,24,827 ~ 3 Crore+" "Rs 75,106 ~ 75 Thou+" "Rs 41,000 ~ 41 Thou+" "Rs 20,000 ~ 20 Thou+" ...
## $ liabilities : chr "Rs 58,46,335 ~ 58 Lacs+" "Rs 0 ~" "Rs 0 ~" "Rs 0 ~" ...
I'm trying to use this code:
require(RSelenium)
library(XML) # for htmlTreeParse() / readHTMLTable() further down
checkForServer()
selServ <- startServer() # keep the handle so the server can be stopped at the end
remDr<-remoteDriver()
remDr$open()
appURL <- 'http://www.mtmis.excise-punjab.gov.pk'
remDr$navigate(appURL)
remDr$findElement("name", "vhlno")$sendKeysToElement(list("ria-07-777"))
# can't figure out the right CSS selector, so target the class directly
remDr$findElements("class", "ent-button-div")[[1]]$clickElement()
# after the search runs, grab the result table
elem <- remDr$findElement(using="class", value="result-div")
elemtxt <- elem$getElementAttribute("outerHTML")[[1]]
elemxml <- htmlTreeParse(elemtxt, useInternalNodes=T)
final <- readHTMLTable(elemxml)
remDr$close()
selServ$stop()
What I want is to create an automated "for loop" that iterates over different vehicles from a list and merges everything into one final table keyed by a unique identifier, e.g., "ria-07-777".
list <- c("ria-07-776", "ria-07-777", "ria-07-778")
Why do you need Selenium?
library(httr)
library(rvest)
clean_cols <- function(x) {
x <- tolower(x)
x <- gsub("[[:punct:][:space:]]+", "_", x)
x <- gsub("_+", "_", x)
x <- gsub("(^_|_$)", "", x)
make.unique(x, sep = "_")
}
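# e.g. clean_cols(c("Registration Number", "Owner Name")) -> c("registration_number", "owner_name")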
get_vehicle_info <- function(vhlno) {
POST(
url = 'http://www.mtmis.excise-punjab.gov.pk/',
set_cookies(has_js=1), # cookie the site expects (presumably set client-side by JavaScript)
body = list(vhlno=vhlno)
) -> res
stop_for_status(res)
pg <- content(res)
# each result row is a <tr> with a label <td> and a value <td>
rows <- html_nodes(pg, xpath=".//div[contains(@class, 'result-div')]/table/tr[td[not(@colspan)]]")
# build a one-row data frame: second column as values, cleaned first column as names
cbind.data.frame(
as.list(
setNames(
html_text(html_nodes(rows, xpath=".//td[2]")),
clean_cols(html_text(html_nodes(rows, xpath=".//td[1]")))
)
),
stringsAsFactors=FALSE
)
}
Now use ^^:
vehicles <- c("ria-07-776", "ria-07-777", "ria-07-778")
Reduce(
rbind.data.frame,
lapply(vehicles, function(v) {
Sys.sleep(5) # your desire to steal a bunch of vehicle info for a sketchy database does not give you the right to hammer the server; you'll very likely remove this line anyway, but I had to try
get_vehicle_info(v)
})
) -> vehicle_df
str(vehicle_df)
## 'data.frame': 3 obs. of 12 variables:
## $ registration_number: chr "ria-07-776" "ria-07-777" "ria-07-778"
## $ chassis_number : chr "KZJ95-0019869" "NFBFD15746R101101" "NZE1206066278"
## $ engine_number : chr "1KZ-0375851" "R18A11981105" "X583994"
## $ make_name : chr "LAND - CRUISER" "HONDA - CIVIC" "TOYOTA - COROLLA"
## $ registration_date : chr "17-Dec-2007 12:00 AM" "01-Aug-2007 12:00 AM" "01-Jan-1970 12:00 AM"
## $ model : chr "1997" "2006" "2007"
## $ vehicle_price : chr "1,396,400" "1,465,500" "0"
## $ color : chr "MULTI" "GRENDA B.P" "SILVER"
## $ token_tax_paid_upto: chr "June 2015" "June 2011" "June 2016"
## $ owner_name : chr "FATEH DIN AWAN" "M BILAL YASIN" "MUHAMMAD ALTAF"
## $ father_name : chr "HAFIZ ABDUL HAKEEM AWAN" "CH M. YASIN" "NAZAR MUHAMMAD"
## $ owner_city : chr "RAWALPINDI" "ISLAMABAD" "SARGODHA"
You'll need to handle network and scraping errors on your own. I can't justify any more time for this likely unethical endeavour (the answer was more to help others with similar q's).