Scraping keywords on PHP page - r

I would like to scrape the keywords inside the dropdown table of this webpage https://www.aeaweb.org/jel/guide/jel.php
The problem is that the drop-down menu of each item prevents me from scraping the table directly because it only takes the heading and not the inner content of each item.
rvest::read_html("https://www.aeaweb.org/jel/guide/jel.php") %>%
rvest::html_table()
I thought of scraping each line that starts with Keywords:, but I do not see how I can do that. It seems like the HTML is not showing the items inside the table.

An RSelenium solution:
# Start a Selenium server and open a Firefox session
library(RSelenium)
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]
# Navigate to the url
remDr$navigate("https://www.aeaweb.org/jel/guide/jel.php")
# Locate the element that holds the table, addressed by its XPath
out <- remDr$findElement(
  using = "xpath",
  value = "/html/body/main/div/section/div[4]"
)
# getElementText() returns a list; keep its first element (the text itself)
out <- out$getElementText()
out <- out[[1]]
Split the text into lines using the stringr package:
library(stringr)
str_split(out, "\n", n = Inf, simplify = FALSE)
[[1]]
[1] "A General Economics and Teaching"
[2] "B History of Economic Thought, Methodology, and Heterodox Approaches"
[3] "C Mathematical and Quantitative Methods"
[4] "D Microeconomics"
[5] "E Macroeconomics and Monetary Economics"
[6] "F International Economics"
[7] "G Financial Economics"
[8] "H Public Economics"
[9] "I Health, Education, and Welfare"
[10] "J Labor and Demographic Economics"
[11] "K Law and Economics"
[12] "L Industrial Organization"
[13] "M Business Administration and Business Economics; Marketing; Accounting; Personnel Economics"
[14] "N Economic History"
[15] "O Economic Development, Innovation, Technological Change, and Growth"
[16] "P Economic Systems"
[17] "Q Agricultural and Natural Resource Economics; Environmental and Ecological Economics"
[18] "R Urban, Rural, Regional, Real Estate, and Transportation Economics"
[19] "Y Miscellaneous Categories"
[20] "Z Other Special Topics"
To get the Keywords for History of Economic Thought, Methodology, and Heterodox Approaches
# Find the element whose id is "cl_B" (XPath tests attributes with @, not #)
# and click it; presumably this expands the B entry -- confirm on the page.
out1 <- remDr$findElement(using = 'xpath', value = '//*[@id="cl_B"]')
out1$clickElement()
# Locate the keywords element by its absolute XPath and read its text
out1 <- remDr$findElement(using = 'xpath', value = '/html/body/main/div/section/div[4]/div[2]/div[2]/div/div/div/div[2]')
out1$getElementText()
[[1]]
[1] "Keywords: History of Economic Thought"

Related

Web scraping from wikipedia tables in R

I'm trying to scrape some data from the following wikipedia table:
Link: https://en.wikipedia.org/wiki/Aire-la-Ville
I am using this code to scrape Area, Elevation, and density using css selectors. I am storing the data in canton_table but only getting the elevation data and not for the other variables.
My code:
# Get labels and data
labels <- current_html %>% html_elements(css = ".infobox-label") %>% html_text()
data <- current_html %>% html_elements(css = ".infobox-data") %>% html_text()
Output for labels and data variables:
> labels
[1] "Country" "Canton" "District"
[4] " • Mayor" " • Total" "Elevation"
[7] " • Total" " • Density" "Time zone"
[10] " • Summer (DST)" "Postal code(s)" "SFOS number"
[13] "Surrounded by" "Website"
>data
[1] "Switzerland"
[2] "Geneva"
[3] "n.a."
[4] "MaireRaymond Gavillet"
[5] "6.50 km2 (2.51 sq mi)"
[6] "428 m (1,404 ft)"
[7] "11,609"
[8] "1,800/km2 (4,600/sq mi)"
[9] "UTC+01:00 (Central European Time)"
[10] "UTC+02:00 (Central European Summer Time)"
[11] "1234,1255"
[12] "6645"
[13] "Bossey (FR-74), Carouge, Chêne-Bougeries, Étrembières (FR-74), Gaillard (FR-74), Geneva (Genève), Plan-les-Ouates, Thônex, Troinex"
[14] "www.veyrier.ch SFSO statistics"
I am able to populate the table with only elevation data and not area and density. Please help. Thanks!
# Clean text and store in data frame
canton_table[canton_table$name == current_name, "area"] <- helper_function(" • Total", labels, data)
canton_table[canton_table$name == current_name, "elevation"] <- helper_function("Elevation", labels, data)
canton_table[canton_table$name == current_name, "density"] <- helper_function(" • Density", labels, data)
My output table:
My output table:
You have to change the label names in the labels vector, e.g. from " • Total" to "Total", etc.
Names like " • Total" are probably causing matching problems.
And then create the table
canton_table[canton_table$name == current_name, "area"] <- helper_function("Total", labels, data)

Splitting string with '<U+FF0E>' in R

Hello I am trying to split a dataframe column test$Name that is in this format.
[1]"Fung Yat Building<U+FF0E>13/F<U+FF0E>Flat A"
[2] "Victoria Centre<U+FF0E>Block 3<U+FF0E>20/F<U+FF0E>Flat B"
[3] "Lei King Wan<U+FF0E>Sites B<U+FF0E>Block 6 Yat Hong Mansion<U+FF0E>3/F<U+FF0E>Flat H"
[4] "Island Place<U+FF0E>Block 3 (Three Island Place)<U+FF0E>9/F<U+FF0E>Flat G"
[5] "7A Comfort Terrace<U+FF0E>5/F<U+FF0E>Flat B"
[6] "Broadview Court<U+FF0E>Block 4<U+FF0E>38/F<U+FF0E>Flat E"
[7] "Chi Fu Fa Yuen<U+FF0E>Fu Ho Yuen (Block H-5)<U+FF0E>16/F<U+FF0E>Flat G"
[8] "City Garden<U+FF0E>Phase 2<U+FF0E>Block 10<U+FF0E>9/F<U+FF0E>Flat B"
[9] "Euston Court<U+FF0E>Tower 1<U+FF0E>12/F<U+FF0E>Flat H"
[10] "Garley Building<U+FF0E>10/F<U+FF0E>Flat C"
The structure of each entry is BuildingName<U+FF0E>FloorNumber<U+FF0E>Unit. I would like to extract the building name like the following example.
Name
Fung Yat Building
Victoria Centre
Lei King Wan
...
I have tested that <U+FF0E> is actually '.' by doing this.
grepl('.',"Fung Yat Building<U+FF0E>13/F<U+FF0E>Flat A")
[1] TRUE
Hence, I have tried the followings but none of them worked...
test %>% separate(Name, c('Name'), sep = '.') %>% head
gsub(".", " ", test$Name[1], fixed=TRUE)
sub("^\\s*<U\\+\\w+>\\s*", " ", test$Name[1])
Any suggestions please? Thanks!
The easiest way is to use < as the split pattern.
library(stringr)
# Take the first piece of the string, using "<" as the separator pattern
word("Fung Yat Building<U+FF0E>13/F<U+FF0E>Flat A", 1, sep = "\\<")
# word("Fung Yat Building<U+FF0E>13/F<U+FF0E>Flat A", 1, sep = "\\<U\\+FF0E\\>") ## building is '1', FloorNumber is '2', Unit is '3'
out:
[1] "Fung Yat Building"

How do you scrape multiple pages from same website on Rstudio

so I want to download data from multiple pages of the same website using RStudio
https://www.irishjobs.ie/ShowResults.aspx?Keywords=Data&autosuggestEndpoint=%2fautosuggest&Location=0&Category=&Recruiter=Company&btnSubmit=Search&Page=2
The only difference between page 2 and page 3 is that the hyperlink ends with a 3 instead of a 2.
I have no problem getting what I need from 25 jobs in 1 page, but I want to get 100 jobs from 4 pages.
I am using the selector gadget chrome extension.
I tried the for loop
for (page_result in seq(from =1, to = 101, by = 25)) {
link = paste0(“ https://www.irishjobs.ie/ShowResults.aspx?Keywords=Data&autosuggestEndpoint=%2fautosuggest&Location=0&Category=&Recruiter=Company&btnSubmit=Search&Page=2)
page = read_html(link)
I can’t figure out how to do it
I think I need to fit in page_result into the link, but I don’t know where.
I welcome any ideas.
i have the rvest package and the dplyr package. But I want the for loop to go through each page. Any idea how best to do this, thanks
The 4 page links can easily be generated in a for loop.
Copy the CSS selector of a job title from the DOM and iterate over child positions 5 to 30 to get all 25 jobs on a page.
# Collect the job titles from result pages 1 to 4 into one character vector.
AllJOBS <- character()
for (i in 1:4) {
  print("s")
  # The result pages differ only in the trailing Page= number.
  url <- paste0("https://www.irishjobs.ie/ShowResults.aspx?Keywords=Data&autosuggestEndpoint=%2fautosuggest&Location=0&Category=&Recruiter=Company&btnSubmit=Search&Page=", i)
  # Download and parse each result page once, not once per job title.
  page <- read_html(url)
  for (k in 5:30) {
    # Child positions 5 to 30 of the results column are probed for a title
    # link; a position without a match comes back as NA.
    jobs <- page %>%
      html_node(css = paste0(
        "#page > div.container > div.column-wrap.order-one-two > ",
        "div.two-thirds > div:nth-child(", k, ") > div > ",
        "div.job-result-logo-title > div.job-result-title > h2 > a"
      )) %>%
      html_text()
    AllJOBS <- append(AllJOBS, jobs)
    print(k)
  }
  # Pause 1-2 seconds between page requests to go easy on the server.
  Sys.sleep(runif(1, 1, 2))
  print(paste0("Page", i))
}
output
> AllJOBS
[1] "Senior Consultant - Fund Static Data"
[2] "Data Warehouse Engineer"
[3] "Senior Software Engineer - Big Data DevOps"
[4] "HR Data Analyst"
[5] "Data Insights Engineer - Dublin - Permanent/Contract - SQL Server"
[6] NA
[7] "Data Engineer - Master Data Services - SQL Server - Permanent/Contract"
[8] "Senior Data Protection Officer (DPO) - Contract"
[9] "QC Data Analyst (Trending)"
[10] "Senior Data Warehouse Developer"
[11] "Senior Data Analyst FTC"
[12] "Compliance Advisory and Data Protection Relationship Manager"
[13] "Contracts Manager-Data Center"
[14] "Payments Product Data Analyst"
[15] "Data Center Product Hardware Platform Engineer"
[16] "People Data Privacy Program Lead"
[17] "Head of Data Science"
[18] "Data Protection Counsel (Product or Compliance)"
[19] "Data Engineer, GMS"
[20] "Data Protection Associate General Counsel"
[21] "Senior Data Engineer"
[22] "Geospatial Data Scientist"
[23] "Data Solutions Manager"
[24] "Data Protection Solicitor"
[25] "Junior Data Scientist"
[26] "Master Data Specialist"
[27] "Temp QC Electronic Data Management Analyst"
[28] "20725 -Data Scientist - Limerick"
[29] "Technical Support Specialist - Data Centre"
[30] "Lead QC Micro Analyst (data review and compliance)"
[31] "Temp QC Data Analyst"
[32] "#Abbvie Compliance Engineer (Data Integrity)"
[33] "People Data Analyst"
[34] "Senior Electrical Design Engineer - Data Centre Ex"
[35] "Laboratory Data Entry Assistant, UCD NVRL"
[36] "Data Migrations Specialist"
[37] "Data Protection Officer"
[38] "Data Center Operations Engineer (Linux)"
[39] "Senior Electrical Engineer | Data Centre LV Design"
[40] "Data Scientist - (Process Sciences)"
[41] "Mgr Supply Logistics Global Materials Data"
[42] "Data Protection / Privacy Delivery Consultant"
[43] "Global Supply Chain Data Analyst"
[44] "QC Data Analyst"
[45] "0582GradeVIIFOIOLOL1120 - Grade VII Data Protection / Freedom of Information & Compliance Officer"
[46] "DPO001 - Deputy Data Protection Officer (General Manager) Office of the Head of Data Protection, HSE"
[47] "Senior Campaign Data Analyst"
[48] "Data & Reporting Analyst II"
[49] "Azure Data Analytics Solution Architect"
[50] "Head of Risk Assurance for IT, Data, Projects and Outsourcing"
[51] "Trainee Data Technician, Ireland"
[52] NA
You can deal with the NAs separately. Does this answer your question, or did I misinterpret it?

Using strsplit results in terms with quotation marks in r

I have a large data set, which I have imported from Excel. I wish to get a term frequency table for the data set. But when I use strsplit, it includes quotation marks and other punctuation, which gives wrong results.
There is a small error in the way I am using strsplit and need help on the same as I am not able to figure it out myself.
df = read_excel("C:/Users/B M Consulting/Documents/Book2.xlsx", col_types=c("text","numeric"), range=cell_cols("A:B"))
vect <- c(df[1])
# Split on runs of whitespace; the backslash must be doubled in an R string
vectsplit <- strsplit(tolower(vect), "\\s+")
vectlev <- unique(unlist(vectsplit))
vecttermf <- sapply(vectsplit, function(x) table(factor(x, levels=vectlev)))
The output vect is something like this:
[1] "3 inch c clamp" "baby vice" "baby vice bench" "baby vise"
[5] "bench" "bench vice" "bench vice clamp" "bench vise"
[9] "bench voice" "bench wise" "bench wise heavy" "bench wise table"
[13] "box for tools" "c clamp" "c clamp set" "c clamps"
[17] "carpenter tools" "carpenter tools low price" "cast iron pipe" "clamp"
[21] "clamp set" "clamps woodworking" "g clamp" "g clamp set 3 inch"
I need to get each word out. When I use strsplit, it includes all the punctuation marks.
Below is a small section of the vectsplit that I get. It includes all the inverted commas, backslashes and commas, which I don't want.
[1] "c(\"3" "inch" "c" "clamp\"," "\"baby" "vice\"," "\"baby" "vice"
[9] "bench\"," "\"baby" "vise\"," "\"bench\"," "\"bench" "vice\"," "\"bench" "vice"
[17] "clamp\"," "\"bench" "vise\"," "\"bench" "voice\"," "\"bench" "wise\"," "\"bench"
[25] "wise" "heavy\"," "\"bench" "wise" "table\"," "\"box" "for" "tools\","
[33] "\"c" "clamp\"," "\"c" "clamp" "set\"," "\"c" "clamps\"," "\"carpenter"
[41] "tools\"," "\"carpenter" "tools" "low" "price\"," "\"cast" "iron" "pipe\","
If you check the class of vect, you'll notice that it's not a character vector, but a list.
vect<-c(df[1])
class(vect)
> "list"
If you define vect as below, the issue disappears:
vect<-df[[1]]
class(vect)
> "character"
If you define vect as such and then use strsplit, it should work just fine. Keep in mind that different kinds of subsetting ([1] vs. [[1]]) will produce different classes of outputs.

Replace all non-alphanumeric with a period

I am trying to rename all of these atrocious column names in a data frame I received from a government agency.
> colnames(thedata)
[1] "Region" "Resource Assessment Site ID"
[3] "Site Name/Facility" "Design Head (feet)"
[5] "Design Flow (cfs)" "Installed Capacity (kW)"
[7] "Annual Production (MWh)" "Plant Factor"
[9] "Total Construction Cost (1,000 $)" "Annual O&M Cost (1,000 $)"
[11] "Cost per Installed Capacity ($/kW)" "Benefit Cost Ratio with Green Incentives"
[13] "IRR with Green Incentives" "Benefit Cost Ratio without Green Incentives"
[15] "IRR without Green Incentives"
The column headers have special non-alphanumeric characters and spaces, so referring to them is impossible so I have to rename them. I would like to replace all non-alphanumeric characters with a period. But I tried:
old.col.names <- colnames(thedata)
new.col.names <- gsub("^a-z0-9", ".", old.col.names)
The ^ is a "not" delineation, so I thought it would replace everything that is not alphanumeric with a period in the old.col.names.
Can anyone help?
Here are three options to consider:
make.names(x)
gsub("[^A-Za-z0-9]", ".", x)
names(janitor::clean_names(setNames(data.frame(matrix(NA, ncol = length(x))), x)))
Here's what each looks like:
make.names(x)
## [1] "Region" "Resource.Assessment.Site.ID"
## [3] "Site.Name.Facility" "Design.Head..feet."
## [5] "Design.Flow..cfs." "Installed.Capacity..kW."
## [7] "Annual.Production..MWh." "Plant.Factor"
## [9] "Total.Construction.Cost..1.000..." "Annual.O.M.Cost..1.000..."
## [11] "Cost.per.Installed.Capacity....kW." "Benefit.Cost.Ratio.with.Green.Incentives"
## [13] "IRR.with.Green.Incentives" "Benefit.Cost.Ratio.without.Green.Incentives"
## [15] "IRR.without.Green.Incentives"
gsub("[^A-Za-z0-9]", ".", x)
## [1] "Region" "Resource.Assessment.Site.ID"
## [3] "Site.Name.Facility" "Design.Head..feet."
## [5] "Design.Flow..cfs." "Installed.Capacity..kW."
## [7] "Annual.Production..MWh." "Plant.Factor"
## [9] "Total.Construction.Cost..1.000..." "Annual.O.M.Cost..1.000..."
## [11] "Cost.per.Installed.Capacity....kW." "Benefit.Cost.Ratio.with.Green.Incentives"
## [13] "IRR.with.Green.Incentives" "Benefit.Cost.Ratio.without.Green.Incentives"
## [15] "IRR.without.Green.Incentives"
library(janitor)
names(clean_names(setNames(data.frame(matrix(NA, ncol = length(x))), x)))
## [1] "region" "resource_assessment_site_id"
## [3] "site_name_facility" "design_head_feet"
## [5] "design_flow_cfs" "installed_capacity_kw"
## [7] "annual_production_mwh" "plant_factor"
## [9] "total_construction_cost_1_000" "annual_o_m_cost_1_000"
## [11] "cost_per_installed_capacity_kw" "benefit_cost_ratio_with_green_incentives"
## [13] "irr_with_green_incentives" "benefit_cost_ratio_without_green_incentives"
## [15] "irr_without_green_incentives"
Sample data:
x <- c("Region", "Resource Assessment Site ID", "Site Name/Facility",
"Design Head (feet)", "Design Flow (cfs)", "Installed Capacity (kW)",
"Annual Production (MWh)", "Plant Factor", "Total Construction Cost (1,000 $)",
"Annual O&M Cost (1,000 $)", "Cost per Installed Capacity ($/kW)",
"Benefit Cost Ratio with Green Incentives", "IRR with Green Incentives",
"Benefit Cost Ratio without Green Incentives", "IRR without Green Incentives")

Resources