Reading address and lat/long from an xml_node in R (mapsapi package)

I'm trying to get information about an address using the mapsapi package in R.
My code looks as follows:
library(mapsapi)
library(XML)
library(RCurl)
string <- "Pariser Platz 1, 10117 Berlin"
test <- mp_geocode(string)
xml <- xml_child(test[[string]],2)
xml
Now I'm getting this kind of XML node:
{xml_node}
<result>
[1] <type>street_address</type>
[2] <formatted_address>Pariser Platz 1, 10117 Berlin, Germany</formatted_address>
[3] <address_component>\n <long_name>1</long_name>\n <short_name>1</short_name>\n <type>street_number</type>\n</address_component>
[4] <address_component>\n <long_name>Pariser Platz</long_name>\n <short_name>Pariser Platz</short_name>\n <type>route</type>\n</address_component>
[5] <address_component>\n <long_name>Mitte</long_name>\n <short_name>Mitte</short_name>\n <type>political</type>\n <type>sublocality</type>\n <type>sublocality_level_1</type>\n</address_component>
[6] <address_component>\n <long_name>Berlin</long_name>\n <short_name>Berlin</short_name>\n <type>locality</type>\n <type>political</type>\n</address_component>
[7] <address_component>\n <long_name>Berlin</long_name>\n <short_name>Berlin</short_name>\n <type>administrative_area_level_1</type>\n <type>political</type>\n</address_component>
[8] <address_component>\n <long_name>Germany</long_name>\n <short_name>DE</short_name>\n <type>country</type>\n <type>political</type>\n</address_component>
[9] <address_component>\n <long_name>10117</long_name>\n <short_name>10117</short_name>\n <type>postal_code</type>\n</address_component>
[10] <geometry>\n <location>\n <lat>52.5160964</lat>\n <lng>13.3779369</lng>\n </location>\n <location_type>ROOFTOP</location_type>\n <viewport>\n <southwest>\n <lat>52.5147474</lat>\n <lng>13.37658 ...
[11] <place_id>ChIJnYvtVcZRqEcRl6Kftq66b6Y</place_id>
So how can I extract the street number, address, city, zip, lat and long from this XML into separate variables?
Thanks for your help!
Regards

I've made accessing this type of information easy in my googleway package
library(googleway)
## you're using Google's API, and they require you to have an API key
## so you'll need to get one
set_key("GOOGLE_API_KEY")
## perform query
res <- google_geocode("Pariser Platz 1, 10117 Berlin")
With the res result you can use geocode_coordinates() to extract the coordinates, and geocode_address_components() to get the street number and the other address components.
## coordinates
geocode_coordinates(res)
# lat lng
# 1 52.5161 13.37794
geocode_address_components(res)
# long_name short_name types
# 1 1 1 street_number
# 2 Pariser Platz Pariser Platz route
# 3 Mitte Mitte political, sublocality, sublocality_level_1
# 4 Berlin Berlin locality, political
# 5 Berlin Berlin administrative_area_level_1, political
# 6 Germany DE country, political
# 7 10117 10117 postal_code
You can look at str(res) to see the full list of items returned from Google's API
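If you want the specific fields from the question (street number, street, city, zip, lat, lng) in a single row, a minimal sketch along these lines should work; it assumes res is the google_geocode() result from above, and pick() is just an illustrative helper name:
coords <- geocode_coordinates(res)          # data.frame with lat / lng
comps  <- geocode_address_components(res)   # data.frame with long_name / types
## helper: pull the long_name whose types contain a given tag; the exact shape
## of the types column (list vs. collapsed string) can vary, so match either way
pick <- function(tag) {
  hit <- vapply(comps$types,
                function(t) any(grepl(paste0("\\b", tag, "\\b"), unlist(t))),
                logical(1))
  if (any(hit)) comps$long_name[hit][1] else NA_character_
}
data.frame(
  street_number = pick("street_number"),
  street        = pick("route"),
  city          = pick("locality"),
  zip           = pick("postal_code"),
  lat           = coords$lat[1],
  lng           = coords$lng[1]
)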

Alternatively, you can also use ggmap::geocode():
> library(ggmap)
> geocode(location = "Pariser Platz 1, 10117 Berlin", output = 'latlon' )
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Pariser%20Platz%201,%2010117%20Berlin&sensor=false
lon lat
1 13.37794 52.5161
Changing the output parameter can give you a very detailed list output (if required):
> geocode(location = "Pariser Platz 1, 10117 Berlin", output = 'all' )
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Pariser%20Platz%201,%2010117%20Berlin&sensor=false
$results
$results[[1]]
$results[[1]]$address_components
$results[[1]]$address_components[[1]]
$results[[1]]$address_components[[1]]$long_name
[1] "1"
$results[[1]]$address_components[[1]]$short_name
[1] "1"
$results[[1]]$address_components[[1]]$types
[1] "street_number"
$results[[1]]$address_components[[2]]
$results[[1]]$address_components[[2]]$long_name
[1] "Pariser Platz"
$results[[1]]$address_components[[2]]$short_name
[1] "Pariser Platz"
$results[[1]]$address_components[[2]]$types
[1] "route"
$results[[1]]$address_components[[3]]
$results[[1]]$address_components[[3]]$long_name
[1] "Mitte"
$results[[1]]$address_components[[3]]$short_name
[1] "Mitte"
$results[[1]]$address_components[[3]]$types
[1] "political" "sublocality" "sublocality_level_1"
$results[[1]]$address_components[[4]]
$results[[1]]$address_components[[4]]$long_name
[1] "Berlin"
$results[[1]]$address_components[[4]]$short_name
[1] "Berlin"
$results[[1]]$address_components[[4]]$types
[1] "locality" "political"
$results[[1]]$address_components[[5]]
$results[[1]]$address_components[[5]]$long_name
[1] "Berlin"
$results[[1]]$address_components[[5]]$short_name
[1] "Berlin"
$results[[1]]$address_components[[5]]$types
[1] "administrative_area_level_1" "political"
$results[[1]]$address_components[[6]]
$results[[1]]$address_components[[6]]$long_name
[1] "Germany"
$results[[1]]$address_components[[6]]$short_name
[1] "DE"
$results[[1]]$address_components[[6]]$types
[1] "country" "political"
$results[[1]]$address_components[[7]]
$results[[1]]$address_components[[7]]$long_name
[1] "10117"
$results[[1]]$address_components[[7]]$short_name
[1] "10117"
$results[[1]]$address_components[[7]]$types
[1] "postal_code"
$results[[1]]$formatted_address
[1] "Pariser Platz 1, 10117 Berlin, Germany"
$results[[1]]$geometry
$results[[1]]$geometry$location
$results[[1]]$geometry$location$lat
[1] 52.5161
$results[[1]]$geometry$location$lng
[1] 13.37794
$results[[1]]$geometry$location_type
[1] "ROOFTOP"
$results[[1]]$geometry$viewport
$results[[1]]$geometry$viewport$northeast
$results[[1]]$geometry$viewport$northeast$lat
[1] 52.51745
$results[[1]]$geometry$viewport$northeast$lng
[1] 13.37929
$results[[1]]$geometry$viewport$southwest
$results[[1]]$geometry$viewport$southwest$lat
[1] 52.51475
$results[[1]]$geometry$viewport$southwest$lng
[1] 13.37659
$results[[1]]$place_id
[1] "ChIJnYvtVcZRqEcRl6Kftq66b6Y"
$results[[1]]$types
[1] "street_address"
$status
[1] "OK"
You can find more info in the function help section.
Sometimes the call may fail with the following message:
Warning message:
geocode failed with status OVER_QUERY_LIMIT, location = "Pariser Platz 1, 10117 Berlin"
Generally, if you try again after a few seconds it works fine. You can always check the remaining queries left in your quota with geocodeQueryCheck():
> geocodeQueryCheck()
2490 geocoding queries remaining.
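If you hit that limit inside a loop, a small retry wrapper can wait and try again; this is my own sketch (not part of ggmap), and geocode_with_retry is a hypothetical name:
geocode_with_retry <- function(address, tries = 3, wait = 5) {
  for (i in seq_len(tries)) {
    res <- suppressWarnings(geocode(location = address, output = "latlon"))
    if (!any(is.na(res))) return(res)  # success: lon/lat filled in
    Sys.sleep(wait)                    # back off before the next attempt
  }
  res                                  # still failing: return the NA result
}
geocode_with_retry("Pariser Platz 1, 10117 Berlin")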

Related

Appending multiple nested lists to a dataframe in R

I have a list of ambiguous addresses that I need to return full geocode information for.
The only issue is that what I get back is a large list of nested lists (JSON).
I want to be able to get a data frame that contains the key information, i.e.
IDEAL OUTPUT
Original_Address, StreetNum, StreetName, Suburb, town_city, locality, Postcode, geo_xCord, Country, Postcode
I almost wonder if this is just too difficult and if there is an easier method that I haven't considered.
I basically just need to be able to spit out the key address elements for each address I have.
# Stack Overflow Example -------------------------------------------
random_addresses <- c('27 Hall Street, Wellington',
                      '52 Ethan Street, New Zealand',
                      '13 Epsom Street, Auckland',
                      '42 Elden Drive, New Zealand')
register_google(key = "MYAPIKEY")
place_lookup <- geocode(random_addresses, output = "all")
print(place_lookup[1])
>>>
[[1]]$results
[[1]]$results[[1]]
[[1]]$results[[1]]$address_components
[[1]]$results[[1]]$address_components[[1]]
[[1]]$results[[1]]$address_components[[1]]$long_name
[1] "27"
[[1]]$results[[1]]$address_components[[1]]$short_name
[1] "27"
[[1]]$results[[1]]$address_components[[1]]$types
[[1]]$results[[1]]$address_components[[1]]$types[[1]]
[1] "street_number"
[[1]]$results[[1]]$address_components[[2]]
[[1]]$results[[1]]$address_components[[2]]$long_name
[1] "Hall Street"
[[1]]$results[[1]]$address_components[[2]]$short_name
[1] "Hall St"
[[1]]$results[[1]]$address_components[[2]]$types
[[1]]$results[[1]]$address_components[[2]]$types[[1]]
[1] "route"
[[1]]$results[[1]]$address_components[[3]]
[[1]]$results[[1]]$address_components[[3]]$long_name
[1] "Newtown"
[[1]]$results[[1]]$address_components[[3]]$short_name
[1] "Newtown"
[[1]]$results[[1]]$address_components[[3]]$types
[[1]]$results[[1]]$address_components[[3]]$types[[1]]
[1] "political"
[[1]]$results[[1]]$address_components[[3]]$types[[2]]
[1] "sublocality"
[[1]]$results[[1]]$address_components[[3]]$types[[3]]
[1] "sublocality_level_1"
[[1]]$results[[1]]$address_components[[4]]
[[1]]$results[[1]]$address_components[[4]]$long_name
[1] "Wellington"
[[1]]$results[[1]]$address_components[[4]]$short_name
[1] "Wellington"
[[1]]$results[[1]]$address_components[[4]]$types
[[1]]$results[[1]]$address_components[[4]]$types[[1]]
[1] "locality"
[[1]]$results[[1]]$address_components[[4]]$types[[2]]
[1] "political"
[[1]]$results[[1]]$address_components[[5]]
[[1]]$results[[1]]$address_components[[5]]$long_name
[1] "Wellington"
[[1]]$results[[1]]$address_components[[5]]$short_name
[1] "Wellington"
[[1]]$results[[1]]$address_components[[5]]$types
[[1]]$results[[1]]$address_components[[5]]$types[[1]]
[1] "administrative_area_level_1"
[[1]]$results[[1]]$address_components[[5]]$types[[2]]
[1] "political"
[[1]]$results[[1]]$address_components[[6]]
[[1]]$results[[1]]$address_components[[6]]$long_name
[1] "New Zealand"
[[1]]$results[[1]]$address_components[[6]]$short_name
[1] "NZ"
[[1]]$results[[1]]$address_components[[6]]$types
[[1]]$results[[1]]$address_components[[6]]$types[[1]]
[1] "country"
[[1]]$results[[1]]$address_components[[6]]$types[[2]]
[1] "political"
[[1]]$results[[1]]$address_components[[7]]
[[1]]$results[[1]]$address_components[[7]]$long_name
[1] "6021"
[[1]]$results[[1]]$address_components[[7]]$short_name
[1] "6021"
[[1]]$results[[1]]$address_components[[7]]$types
[[1]]$results[[1]]$address_components[[7]]$types[[1]]
[1] "postal_code"
[[1]]$results[[1]]$formatted_address
[1] "27 Hall Street, Newtown, Wellington 6021, New Zealand"
[[1]]$results[[1]]$geometry
[[1]]$results[[1]]$geometry$bounds
[[1]]$results[[1]]$geometry$bounds$northeast
[[1]]$results[[1]]$geometry$bounds$northeast$lat
[1] -41.31066
[[1]]$results[[1]]$geometry$bounds$northeast$lng
[1] 174.7768
[[1]]$results[[1]]$geometry$bounds$southwest
[[1]]$results[[1]]$geometry$bounds$southwest$lat
[1] -41.31081
[[1]]$results[[1]]$geometry$bounds$southwest$lng
[1] 174.7766
[[1]]$results[[1]]$geometry$location
[[1]]$results[[1]]$geometry$location$lat
[1] -41.31074
[[1]]$results[[1]]$geometry$location$lng
[1] 174.7767
[[1]]$results[[1]]$geometry$location_type
[1] "ROOFTOP"
[[1]]$results[[1]]$geometry$viewport
[[1]]$results[[1]]$geometry$viewport$northeast
[[1]]$results[[1]]$geometry$viewport$northeast$lat
[1] -41.30932
[[1]]$results[[1]]$geometry$viewport$northeast$lng
[1] 174.778
[[1]]$results[[1]]$geometry$viewport$southwest
[[1]]$results[[1]]$geometry$viewport$southwest$lat
[1] -41.31202
[[1]]$results[[1]]$geometry$viewport$southwest$lng
[1] 174.7753
[[1]]$results[[1]]$place_id
[1] "ChIJiynBCOOvOG0RMx429ZNDR3A"
[[1]]$results[[1]]$types
[[1]]$results[[1]]$types[[1]]
[1] "premise"
[[1]]$status
[1] "OK"
---
You can explore the nested lists with the Viewer in RStudio or with listviewer::jsonedit, and drill down to the desired information. The basic idea is to use unnest_wider() to spread each list into columns, select the desired columns, and then unnest_longer() to tease out the nested lists so you can iterate through them.
library(tidyverse)
map(random_addresses, ~ geocode(.x, output = "all") %>%
      # results is name of list with desired information, create tibble for unnest
      tibble(output = .$results) %>%
      # Create tibble with address_components as column-list
      unnest_wider(output) %>%
      dplyr::select(address_components) %>%
      # Get address_components as list of lists, each list to df
      unnest_longer(., col = "address_components") %>%
      map_dfr(., ~.x) %>%
      # types is the type of information. It is listed so unlist
      mutate(types = unlist(types)) %>%
      # Choose the information to keep
      filter(types %in% c("street_number", "route")) %>%
      # Choose the format of data
      select(long_name, types) %>%
      # Put in wide form
      pivot_wider(names_from = "types", values_from = "long_name")
) %>%
  bind_rows()  # create master df
This will give you lists with your information (shown here before filtering):
[[4]]
# A tibble: 13 × 3
long_name short_name types
<chr> <chr> <chr>
1 New Zealand NZ country
2 New Zealand NZ political
3 42 42 street_number
4 Elden Drive Elden Dr route
5 Saddle River Saddle River locality
6 Saddle River Saddle River political
7 Bergen County Bergen County administrative_area_level_2
8 Bergen County Bergen County political
9 New Jersey NJ administrative_area_level_1
10 New Jersey NJ political
11 United States US country
12 United States US political
13 07458 07458 postal_code

lapply() with XPath to obtain all text after a specific tag not working

Background:
I am scraping this website to obtain a list of all people named under a respective section of the editorial board.
In total, there are 6 sections, each one beginning with a <b>...</b> part. (It actually should be 5, but the code is a bit messy.)
My goal:
I want to get a list of all people per section (a list of 6 elements called people).
My approach:
I try to fetch all the text, i.e. text(), after each respective <b>...</b> tag.
However, with the following R code and XPath, I fail to get the correct list:
journal_url <- "https://aepi.biomedcentral.com/about/editorial-board"
webpage <- xml2::read_html(url(journal_url))
# get a list of 6 sections
all_sections <- rvest::html_nodes(webpage, css = '#editorialboard p')
# the following does not work properly
people <- lapply(all_sections, function(x) rvest::html_nodes(x, xpath = '//b/following-sibling::text()'))
The mistaken outcome:
Instead of giving me a list of 6 elements comprising the people per section, it gives me a list of 6 elements comprising all people in every element.
The expected outcome:
The expected output would start with:
people
[[1]]
[1] Shichuo Li
[[2]]
[1] Zhen Hong
[2] Hermann Stefan
[3] Dong Zhou
[[3]]
[1] Jie Mu
# etc etc
The double forward slash (//) XPath selects all matching nodes in the whole document, even when the object you search from is a single node. Use the current-node selector . instead:
people <- lapply(all_sections, function(x) {
rvest::html_nodes(x, xpath = './b/following-sibling::text()')
})
Output:
[[1]]
{xml_nodeset (1)}
[1] Shichuo Li,
[[2]]
{xml_nodeset (3)}
[1] Zhen Hong,
[2] Hermann Stefan,
[3] Dong Zhou,
[[3]]
{xml_nodeset (0)}
[[4]]
{xml_nodeset (1)}
[1] Jie Mu,
[[5]]
{xml_nodeset (2)}
[1] Bing Liang,
[2] Weijia Jiang,
[[6]]
{xml_nodeset (35)}
[1] Aye Mye Min Aye,
[2] Sándor Beniczky,
[3] Ingmar Blümcke,
[4] Martin J. Brodie,
[5] Eric Chan,
[6] Yanchun Deng,
[7] Ding Ding,
[8] Yuwu Jiang,
[9] Hennric Jokeit,
[10] Heung Dong Kim,
[11] Patrick Kwan,
[12] Byung In Lee,
[13] Weiping Liao,
[14] Xiaoyan Liu,
[15] Guoming Luan,
[16] Imad M. Najm,
[17] Terence O'Brien,
[18] Jiong Qin,
[19] Markus Reuber,
[20] Ley J.W. Sander,
...
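If you then want plain character vectors rather than nodesets, one small follow-up (my own sketch, not part of the original answer) is to take the text of each nodeset and trim the trailing commas:
people_clean <- lapply(people, function(x) {
  txt <- rvest::html_text(x, trim = TRUE)
  sub(",\\s*$", "", txt)  # drop the trailing comma after each name
})
people_clean[[2]]
# [1] "Zhen Hong"      "Hermann Stefan" "Dong Zhou"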

removing special apostrophes from French article contractions when tokenizing

I am currently running an STM (structural topic model) on a series of articles from the French newspaper Le Monde. The model is working just great, but I have a problem with the pre-processing of the text.
I'm currently using the quanteda package and the tm package for things like removing words, removing numbers, etc.
There's only one thing, though, that doesn't seem to work.
As some of you might know, in French, the masculine definite article le contracts to l' before vowels. I've tried to remove l' (and similar things like d') as words with removeWords:
lmt67 <- removeWords(lmt67, c( "l'","d'","qu'il", "n'", "a", "dans"))
but it only works with words that are separate from the rest of text, not with the articles that are attached to a word, such as in -l'arbre- (the tree).
Frustrated, I've tried a simple gsub:
lmt67 <- gsub("l'","",lmt67)
but that doesn't seem to be working either.
Now, what's a better way to do this, and possibly through a c(...) vector so that I can give it a series of expressions all together?
Just as context, lmt67 is a "large character" with 30,000 elements/articles, obtained by using the texts() function on data imported from txt files.
Thanks to anyone who wants to help me.
I'll outline two ways to do this using quanteda and quanteda-related tools. First, let's define a slightly longer text, with more prefix cases for French. Notice the inclusion of the ’ apostrophe as well as the ASCII 39 simple apostrophe.
txt <- c(doc1 = "M. Trump, lors d’une réunion convoquée d’urgence à la Maison Blanche,
n’en a pas dit mot devant la presse. En réalité, il s’agit d’une
mesure essentiellement commerciale de ce pays qui l'importe.",
doc2 = "Réfugié à Bruxelles, l’indépendantiste catalan a désigné comme
successeur Jordi Sanchez, partisan de l’indépendance catalane,
actuellement en prison pour sédition.")
The first method will use pattern matches for the simple ASCII 39 apostrophe plus a bunch of Unicode variants, matched through the Unicode category "Pf" ("Punctuation, Final quote"). Note, however, that quanteda does its best to normalize the quotes at the tokenization stage - see "l'indépendance" in the second document, for instance.
The second way below uses a French part-of-speech tagger integrated with quanteda that allows a similar selection after recognizing and separating the prefixes, and then removing determiners (among other parts of speech).
1. quanteda tokens
toks <- tokens(txt, remove_punct = TRUE)
# remove stopwords
toks <- tokens_remove(toks, stopwords("french"))
toks
# tokens from 2 documents.
# doc1 :
# [1] "M" "Trump" "lors" "d'une" "réunion"
# [6] "convoquée" "d'urgence" "à" "la" "Maison"
# [11] "Blanche" "n'en" "a" "pas" "dit"
# [16] "mot" "devant" "la" "presse" "En"
# [21] "réalité" "il" "s'agit" "d'une" "mesure"
# [26] "essentiellement" "commerciale" "de" "ce" "pays"
# [31] "qui" "l'importe"
#
# doc2 :
# [1] "Réfugié" "à" "Bruxelles" "l'indépendantiste"
# [5] "catalan" "a" "désigné" "comme"
# [9] "successeur" "Jordi" "Sanchez" "partisan"
# [13] "de" "l'indépendance" "catalane" "actuellement"
# [17] "en" "prison" "pour" "sédition"
Then we apply the pattern to match l', s', or d', using a regular expression replacement on the types (the unique tokens):
toks <- tokens_replace(
  toks,
  types(toks),
  stringi::stri_replace_all_regex(types(toks), "[lsd]['\\p{Pf}]", "")
)
# tokens from 2 documents.
# doc1 :
# [1] "M" "Trump" "lors" "une" "réunion"
# [6] "convoquée" "urgence" "à" "la" "Maison"
# [11] "Blanche" "n'en" "a" "pas" "dit"
# [16] "mot" "devant" "la" "presse" "En"
# [21] "réalité" "il" "agit" "une" "mesure"
# [26] "essentiellement" "commerciale" "de" "ce" "pays"
# [31] "qui" "importe"
#
# doc2 :
# [1] "Réfugié" "à" "Bruxelles" "indépendantiste" "catalan"
# [6] "a" "désigné" "comme" "successeur" "Jordi"
# [11] "Sanchez" "partisan" "de" "indépendance" "catalane"
# [16] "actuellement" "En" "prison" "pour" "sédition"
From the resulting toks object you can form a dfm and then proceed to fit the STM.
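For completeness, a minimal sketch of that last step (my assumption of a typical workflow, with K chosen arbitrarily):
library(quanteda)
library(stm)
dfmat <- dfm(toks)
stm_input <- convert(dfmat, to = "stm")
mod <- stm(documents = stm_input$documents,
           vocab     = stm_input$vocab,
           K         = 10)   # with only two toy documents this is illustrative only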
2. using spacyr
This will involve more sophisticated part-of-speech tagging and then converting the tagged object into quanteda tokens. This requires first that you install Python, spacy, and the French language model. (See https://spacy.io/usage/models.)
library(spacyr)
spacy_initialize(model = "fr", python_executable = "/anaconda/bin/python")
# successfully initialized (spaCy Version: 2.0.1, language model: fr)
toks <- spacy_parse(txt, lemma = FALSE) %>%
as.tokens(include_pos = "pos")
toks
# tokens from 2 documents.
# doc1 :
# [1] "M./NOUN" "Trump/PROPN" ",/PUNCT"
# [4] "lors/ADV" "d’/PUNCT" "une/DET"
# [7] "réunion/NOUN" "convoquée/VERB" "d’/ADP"
# [10] "urgence/NOUN" "à/ADP" "la/DET"
# [13] "Maison/PROPN" "Blanche/PROPN" ",/PUNCT"
# [16] "\n /SPACE" "n’/VERB" "en/PRON"
# [19] "a/AUX" "pas/ADV" "dit/VERB"
# [22] "mot/ADV" "devant/ADP" "la/DET"
# [25] "presse/NOUN" "./PUNCT" "En/ADP"
# [28] "réalité/NOUN" ",/PUNCT" "il/PRON"
# [31] "s’/AUX" "agit/VERB" "d’/ADP"
# [34] "une/DET" "\n /SPACE" "mesure/NOUN"
# [37] "essentiellement/ADV" "commerciale/ADJ" "de/ADP"
# [40] "ce/DET" "pays/NOUN" "qui/PRON"
# [43] "l'/DET" "importe/NOUN" "./PUNCT"
#
# doc2 :
# [1] "Réfugié/VERB" "à/ADP" "Bruxelles/PROPN"
# [4] ",/PUNCT" "l’/PRON" "indépendantiste/ADJ"
# [7] "catalan/VERB" "a/AUX" "désigné/VERB"
# [10] "comme/ADP" "\n /SPACE" "successeur/NOUN"
# [13] "Jordi/PROPN" "Sanchez/PROPN" ",/PUNCT"
# [16] "partisan/VERB" "de/ADP" "l’/DET"
# [19] "indépendance/ADJ" "catalane/ADJ" ",/PUNCT"
# [22] "\n /SPACE" "actuellement/ADV" "en/ADP"
# [25] "prison/NOUN" "pour/ADP" "sédition/NOUN"
# [28] "./PUNCT"
Then we can use the default glob-matching to remove the parts of speech in which we are probably not interested, including the newline:
toks <- tokens_remove(toks, c("*/DET", "*/PUNCT", "\n*", "*/ADP", "*/AUX", "*/PRON"))
toks
# doc1 :
# [1] "M./NOUN" "Trump/PROPN" "lors/ADV" "réunion/NOUN" "convoquée/VERB"
# [6] "urgence/NOUN" "Maison/PROPN" "Blanche/PROPN" "n’/VERB" "pas/ADV"
# [11] "dit/VERB" "mot/ADV" "presse/NOUN" "réalité/NOUN" "agit/VERB"
# [16] "mesure/NOUN" "essentiellement/ADV" "commerciale/ADJ" "pays/NOUN" "importe/NOUN"
#
# doc2 :
# [1] "Réfugié/VERB" "Bruxelles/PROPN" "indépendantiste/ADJ" "catalan/VERB" "désigné/VERB"
# [6] "successeur/NOUN" "Jordi/PROPN" "Sanchez/PROPN" "partisan/VERB" "indépendance/ADJ"
# [11] "catalane/ADJ" "actuellement/ADV" "prison/NOUN" "sédition/NOUN"
Then we can remove the tags, which you probably don't want in your STM - but you could leave them if you prefer.
## remove the tags
toks <- tokens_replace(toks, types(toks),
stringi::stri_replace_all_regex(types(toks), "/[A-Z]+$", ""))
toks
# tokens from 2 documents.
# doc1 :
# [1] "M." "Trump" "lors" "réunion" "convoquée"
# [6] "urgence" "Maison" "Blanche" "n’" "pas"
# [11] "dit" "mot" "presse" "réalité" "agit"
# [16] "mesure" "essentiellement" "commerciale" "pays" "importe"
#
# doc2 :
# [1] "Réfugié" "Bruxelles" "indépendantiste" "catalan" "désigné"
# [6] "successeur" "Jordi" "Sanchez" "partisan" "indépendance"
# [11] "catalane" "actuellement" "prison" "sédition"
From there, you can use the toks object to form your dfm and fit the model.
Here's a scrape from the current page at Le Monde's website. Notice that the apostrophe they use is not the same character as the single-quote here "'":
text <- "Réfugié à Bruxelles, l’indépendantiste catalan a désigné comme successeur Jordi Sanchez, partisan de l’indépendance catalane, actuellement en prison pour sédition."
It has a little angle and is not actually "straight down" when I view it. You need to copy that character into your gsub command:
sub("l’", "", text)
[1] "Réfugié à Bruxelles, indépendantiste catalan a désigné comme successeur Jordi Sanchez, partisan de l’indépendance catalane, actuellement en prison pour sédition."
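For the original lmt67 vector, a hedged one-liner (not from either answer above) is a single gsub() whose character class covers both apostrophe variants:
## strip common elided forms (l', d', s', n', qu', j', m', t', c') whether the text
## uses the straight apostrophe (') or the typographic one (’); the \\b keeps
## mid-word cases like aujourd'hui intact
lmt67 <- gsub("\\b(l|d|s|n|qu|j|m|t|c)['’]", "", lmt67, ignore.case = TRUE)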

Find out POI (within 2km) using latitude and longitude

I have a dataset of zip codes along with their lat and long. I want to find the list of hospitals/banks within 2 km of each latitude and longitude.
How can I do this?
The long/lat data looks like this:
store_zip lon lat
410710 73.8248981 18.5154681
410209 73.0907 19.0218215
400034 72.8148177 18.9724162
400001 72.836334 18.9385352
400102 72.834424 19.1418961
400066 72.8635299 19.2313448
400078 72.9327444 19.1570343
400078 72.9327444 19.1570343
400007 72.8133825 18.9618411
400050 72.8299518 19.0551695
400062 72.8426858 19.1593396
400083 72.9374227 19.1166191
400603 72.9781047 19.1834148
401107 72.8929 19.2762702
401105 72.8663173 19.3053477
400703 72.9992013 19.0793547
401209 NA NA
401203 72.7983705 19.4166761
400612 73.0287209 19.1799265
400612 73.0287209 19.1799265
400612 73.0287209 19.1799265
If your Points of Interest are unknown and you need to find them, you can use Google's API through my googleway package (as you've suggested in the comments). You will need a valid API key for this to work.
As the API can only accept one request at a time, you'll need to iterate over your data one row at a time. For that you can use whatever looping method you're most comfortable with:
library(googleway) ## using v2.4.0 on CRAN
set_key("your_api_key")
lst <- lapply(1:nrow(df), function(x){
  google_places(search_string = "Hospital",
                location = c(df[x, 'lat'], df[x, 'lon']),
                radius = 2000)
})
lst is now a list that contains the results of the queries. For example, the names of the hospitals it has returned for the first row of your data are:
place_name(lst[[1]])
# [1] "Jadhav Hospital"
# [2] "Poona Hospital Medical Store"
# [3] "Sanjeevan Hospital"
# [4] "Suyash Hospital"
# [5] "Mehta Hospital"
# [6] "Deenanath Mangeshkar Hospital"
# [7] "Sushrut Hospital"
# [8] "Deenanath Mangeshkar Hospital and Research Centre"
# [9] "MMF Ratna Memorial Hospital"
# [10] "Maharashtra Medical Foundation's Joshi Multispeciality Hospital"
# [11] "Sahyadri Hospitals"
# [12] "Deendayal Memorial Hospital"
# [13] "Jehangir Specialty Hospital"
# [14] "Global Hospital And Research Institute"
# [15] "Prayag Hospital"
# [16] "Apex Superspeciality Hospital"
# [17] "Deoyani Multi Speciality Hospital"
# [18] "Shashwat Hospital"
# [19] "Deccan Multispeciality Hardikar Hospital"
# [20] "City Hospital"
You can also view them on a map
set_key("map_api_key", api = "map")
## the lat/lon of the returned results are found through `place_location()`
# place_location(lst[[1]])
df_hospitals <- place_location(lst[[1]])
df_hospitals$name <- place_name(lst[[1]])
google_map() %>%
add_circles(data = df[1, ], radius = 2000) %>%
add_markers(data = df_hospitals, info_window = "name")
Note:
Google's API is limited to 2,500 queries per key per day, unless you pay for a premium account.
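If, on the other hand, your points of interest are already known (say, a table of hospitals/banks with their own coordinates), you don't need the API at all; here's a sketch with geosphere, assuming a hypothetical pois data frame with columns name, lon, lat:
library(geosphere)
nearby <- lapply(seq_len(nrow(df)), function(i) {
  if (is.na(df$lon[i]) || is.na(df$lat[i])) return(pois[0, ])  # skip missing coords
  d <- distHaversine(c(df$lon[i], df$lat[i]), pois[, c("lon", "lat")])
  pois[which(d <= 2000), ]  # keep POIs within 2,000 metres
})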

Isolating data from single XML nodeset in R xml2

I am trying to iteratively isolate and manipulate nodesets from an XML document, but I am getting strange behavior from the xml_find_all() function in the xml2 package in R. Can someone please help me understand the scope of functions applied to a nodeset?
Here is an example:
library( xml2 )
library( dplyr )
doc <- read_xml( "<MEMBERS>
<CUSTOMER>
<ID>178</ID>
<FIRST.NAME>Alvaro</FIRST.NAME>
<LAST.NAME>Juarez</LAST.NAME>
<ADDRESS>123 Park Ave</ADDRESS>
<ZIP>57701</ZIP>
</CUSTOMER>
<CUSTOMER>
<ID>934</ID>
<FIRST.NAME>Janette</FIRST.NAME>
<LAST.NAME>Johnson</LAST.NAME>
<ADDRESS>456 Candy Ln</ADDRESS>
<ZIP>57701</ZIP>
</CUSTOMER>
</MEMBERS>" )
doc %>% xml_find_all( '//*') %>% xml_path()
# [1] "/MEMBERS" "/MEMBERS/CUSTOMER[1]"
# [3] "/MEMBERS/CUSTOMER[1]/ID" "/MEMBERS/CUSTOMER[1]/FIRST.NAME"
# [5] "/MEMBERS/CUSTOMER[1]/LAST.NAME" "/MEMBERS/CUSTOMER[1]/ADDRESS"
# [7] "/MEMBERS/CUSTOMER[1]/ZIP" "/MEMBERS/CUSTOMER[2]"
# [9] "/MEMBERS/CUSTOMER[2]/ID" "/MEMBERS/CUSTOMER[2]/FIRST.NAME"
#[11] "/MEMBERS/CUSTOMER[2]/LAST.NAME" "/MEMBERS/CUSTOMER[2]/ADDRESS"
#[13] "/MEMBERS/CUSTOMER[2]/ZIP"
The object customer.01 is a single node that contains data from that customer only.
kids <- xml_children( doc )
customer.01 <- kids[[1]]
customer.01
# {xml_node}
# <CUSTOMER>
# [1] <ID>178</ID>
# [2] <FIRST.NAME>Alvaro</FIRST.NAME>
# [3] <LAST.NAME>Juarez</LAST.NAME>
# [4] <ADDRESS>123 Park Ave</ADDRESS>
# [5] <ZIP>57701</ZIP>
Why does the function, applied to the customer.01 node, return the ID for customer.02 as well?
xml_find_all( customer.01, "//MEMBERS/CUSTOMER/ID" )
# {xml_nodeset (2)}
# [1] <ID>178</ID>
# [2] <ID>934</ID>
How do I return only values from that nodeset?
~~~
OK, so here's a small wrinkle in the solution below, again related to the scope of the xml_find_all() function. The documentation says it can be applied to a document, node, or nodeset. However...
This case works when applied to a nodeset:
library( xml2 )
url <- "https://s3.amazonaws.com/irs-form-990/201501279349300635_public.xml"
doc <- read_xml( url )
xml_ns_strip( doc )
nd <- xml_find_all( doc, "//LiquidationOfAssetsDetail|//LiquidationDetail" )
nodei <- nd[[1]]
nodei
# {xml_node}
# <LiquidationOfAssetsDetail>
# [1] <AssetsDistriOrExpnssPaidDesc>LAND</AssetsDistriOrExpnssPaidDesc>
# [2] <DistributionDt>2014-11-04</DistributionDt>
# [3] <MethodOfFMVDeterminationTxt>SEE ATTACH</MethodOfFMVDeterminationTxt>
# [4] <EIN>abcdefghi</EIN>
# [5] <BusinessName>\n <BusinessNameLine1Txt>GREENSBURG PUBLIC LIBRARY</BusinessNameLine1Txt>\n</BusinessName>
# [6] <USAddress>\n <AddressLine1Txt>1110 E MAIN ST</AddressLine1Txt>\n <CityNm>GREENSBURG</CityNm>\n <StateAbbreviationCd>IN</StateAb ...
# [7] <IRCSectionTxt>501(C)(3)</IRCSectionTxt>
xml_text( xml_find_all( nodei, "AssetsDistriOrExpnssPaidDesc" ) )
# [1] "LAND"
But not this one:
nodei <- xml_children( nd[[1]] )
nodei
# {xml_nodeset (7)}
# [1] <AssetsDistriOrExpnssPaidDesc>LAND</AssetsDistriOrExpnssPaidDesc>
# [2] <DistributionDt>2014-11-04</DistributionDt>
# [3] <MethodOfFMVDeterminationTxt>SEE ATTACH</MethodOfFMVDeterminationTxt>
# [4] <EIN>abcdefghi</EIN>
# [5] <BusinessName>\n <BusinessNameLine1Txt>GREENSBURG PUBLIC LIBRARY</BusinessNameLine1Txt>\n</BusinessName>
# [6] <USAddress>\n <AddressLine1Txt>1110 E MAIN ST</AddressLine1Txt>\n <CityNm>GREENSBURG</CityNm>\n <StateAbbreviationCd>IN</StateAb ...
# [7] <IRCSectionTxt>501(C)(3)</IRCSectionTxt>
xml_text( xml_find_all( nodei, "AssetsDistriOrExpnssPaidDesc" ) )
# character(0)
I'm guessing this is a problem applying xml_find_all() to all elements of a nodeset rather than a scoping issue?
Currently, you are using an absolute path search from the root with XPath's double forward slash, //, which means "find all items in the document that match this path", and that includes both customers' IDs.
For particular child nodes under a specific node, simply use a relative path from the selected node:
xml_find_all(customer.01, "ID")
# {xml_nodeset (1)}
# [1] <ID>178</ID>
xml_find_all(customer.01, "FIRST.NAME|LAST.NAME")
# {xml_nodeset (2)}
# [1] <FIRST.NAME>Alvaro</FIRST.NAME>
# [2] <LAST.NAME>Juarez</LAST.NAME>
xml_find_all(customer.01, "*")
# {xml_nodeset (5)}
# [1] <ID>178</ID>
# [2] <FIRST.NAME>Alvaro</FIRST.NAME>
# [3] <LAST.NAME>Juarez</LAST.NAME>
# [4] <ADDRESS>123 Park Ave</ADDRESS>
# [5] <ZIP>57701</ZIP>
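The same relative-path rule explains the second wrinkle above: xml_find_all() evaluates the path from each node in the set, so searching the children of nd[[1]] for a child called AssetsDistriOrExpnssPaidDesc finds nothing; search from nd[[1]] itself instead. As a small follow-up sketch (my own addition), you can collect every customer's fields into one data frame using the kids nodeset defined earlier in the question (the two CUSTOMER nodes):
do.call(rbind, lapply(kids, function(cust) {
  vals <- xml_children(cust)
  as.data.frame(as.list(setNames(xml_text(vals), xml_name(vals))),
                stringsAsFactors = FALSE)
}))
#    ID FIRST.NAME LAST.NAME      ADDRESS   ZIP
# 1 178     Alvaro    Juarez 123 Park Ave 57701
# 2 934    Janette   Johnson 456 Candy Ln 57701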
