Remove HTML from the body of the text file - r

I am currently writing a function to get the review and rating of an album by getting it from Pitchfork and removing HTML. The result should be a list with 2 elements: the review and the score of that album.
So far I have this and I am still figuring out what to return, the regex of the HTML part and the paste0 function. Thank you for your time!
pitchfork = function(url){
save = getURL(url)
cat(save,file = "review.txt")
a1 = '<div class="contents dropcap"><p>'
b1 = str_replace(save, paste0("^.*",a1),"")
a2 = '</div><a class="end-mark-container" href="/">'
b2 = str_replace(b1, paste0(a2,".*$"),"")
}

How about something like this?
library(xml2)
library(rvest)
library(tidyverse)
url <- "http://pitchfork.com/reviews/albums/grimes-miss-anthropocene"
html <- read_html(url)
review <- html %>%
xml_nodes("p") %>%
html_text() %>%
enframe("paragraph_no", "text")
review
## A tibble: 14 x 2
# paragraph_no text
# <int> <chr>
# 1 1 Best new music
# 2 2 Grimes’ first project as a bona fide pop star is more morose th…
# 3 3 In 2011, Grimes was eager to say in an interview that she had “…
# 4 4 Miss Anthropocene is Grimes’ fifth album and her first as that …
# 5 5 The result is a record that’s more morose than her previous wor…
# 6 6 In November 2018, Grimes released “We Appreciate Power,” a coll…
# 7 7 When Grimes veers away from high concept toward examining intim…
# 8 8 Miss Anthropocene thrills when it reveals a refined, linear evo…
# 9 9 So much about the actual music of Miss Anthropocene succeeds th…
#10 10 And that’s the obstacle, the slimy mouthfeel, standing in the w…
#11 11 Correction: An earlier version of this review erroneously state…
#12 12 Listen to our Best New Music playlist on Spotify and Apple Musi…
#13 13 Buy: Rough Trade
#14 14 (Pitchfork may earn a commission from purchases made through af…
review is a tibble and contains the review split by paragraph; it might need some additional cleaning up (like removing the first and last row(s)).
For the score we can use a class attribute selector
score <- html %>% xml_nodes("[class='score']") %>% html_text() %>% as.numeric()
score
#[1] 8.2
Wrapping things up (in a function)
Let's wrap everything in a function that returns a list with the review tibble and numeric score.
get_pitchfork_data <- function(url) {
html <- read_html(url)
list(
review = html %>%
xml_nodes("p") %>%
html_text() %>%
trimws() %>%
enframe("paragraph_no", "text"),
score = html %>%
xml_nodes("[class='score']") %>%
html_text() %>%
as.numeric())
}
Test 1:
Grimes - Miss Anthropocene
get_pitchfork_data("http://pitchfork.com/reviews/albums/grimes-miss-anthropocene")
#$review
## A tibble: 14 x 2
# paragraph_no text
# <int> <chr>
# 1 1 Best new music
# 2 2 Grimes’ first project as a bona fide pop star is more morose th…
# 3 3 In 2011, Grimes was eager to say in an interview that she had “…
# 4 4 Miss Anthropocene is Grimes’ fifth album and her first as that …
# 5 5 The result is a record that’s more morose than her previous wor…
# 6 6 In November 2018, Grimes released “We Appreciate Power,” a coll…
# 7 7 When Grimes veers away from high concept toward examining intim…
# 8 8 Miss Anthropocene thrills when it reveals a refined, linear evo…
# 9 9 So much about the actual music of Miss Anthropocene succeeds th…
#10 10 And that’s the obstacle, the slimy mouthfeel, standing in the w…
#11 11 Correction: An earlier version of this review erroneously state…
#12 12 Listen to our Best New Music playlist on Spotify and Apple Musi…
#13 13 Buy: Rough Trade
#14 14 (Pitchfork may earn a commission from purchases made through af…
#
#$score
#[1] 8.2
Test 2:
Radiohead - OK Computer (reissue)
get_pitchfork_data("https://pitchfork.com/reviews/albums/radiohead-ok-computer-oknotok-1997-2017/")
#$review
## A tibble: 12 x 2
# paragraph_no text
# <int> <chr>
# 1 1 Best new reissue
# 2 2 Twenty years on, Radiohead revisit their 1997 masterpiece with …
# 3 3 As they regrouped to figure out what their third album might be…
# 4 4 It’s still funny to think, two decades later, that Thom Yorke’s…
# 5 5 It’s unclear what happened to that album. OK Computer obviously…
# 6 6 OKNOTOK is something a little more interesting than a remaster …
# 7 7 But “Lift’s” reputation for positivity might be a little confus…
# 8 8 The most fun to be had with OKNOTOK is in these line-blurring m…
# 9 9 This fondness for camp and schlock has always been latent in Ra…
#10 10 The ghost of Bond followed them once they decamped from their s…
#11 11 Radiohead have been at least as brilliant at packaging and posi…
#12 12 Now that they have arrived at an autumnal, valedictory stage in…
#
#$score
#[1] 10

Related

Multithread computation with R: how to get all different random numbers?

Anyone knows how to get all the random numbers different in the following code? E.g. with doRNG package? I don't care about reproducibility.
Edit: Duplicates by pure chance are accepted.
rm(list = ls())
set.seed(666)
cat("\014")
library(plyr)
library(dplyr)
library(doRNG)
# ====== Data Preparation ======
dt = data.frame(id = 1:10,
part = rep("dt",10),
HG = c(1,3,6,NA,NA,2,NA,NA,NA,NA),
random = NA)
# ====== Set Parallel Computing ======
library(foreach)
library(doParallel)
cl = makeCluster(3, outfile = "")
registerDoParallel(cl)
# ====== SIMULATION ======
nsim = 1000 # number of simulations
iterChunk = 100 # split nsim into this many chunks
out = data.frame() # prepare output DF
for(iter in 1:ceiling(nsim/iterChunk)){
strt = Sys.time()
out_iter =
foreach(i = 1:iterChunk, .combine = rbind, .multicombine = TRUE, .maxcombine = 100000, .inorder = FALSE, .verbose = FALSE,
.packages = c("plyr", "dplyr")) %dopar% {
# simulation number
id_sim = iterChunk * (iter - 1) + i
## Generate random numbers
tmp_sim = is.na(dt$HG) # no results yet
dt$random[tmp_sim] = runif(sum(tmp_sim))
dt$HG[tmp_sim] = 3
# Save Results
dt$id_sim = id_sim
dt$iter = iter
dt$i = i
print(Sys.time())
return(dt)
}#i;sim_forcycle
out = rbind.data.frame(out,subset(out_iter, !is.na(random)))
fnsh = Sys.time()
cat(" [",iter,"] ",fnsh - strt, sep = "")
}#iter
# ====== Stop Parallel Computing ======
stopCluster(cl)
# ====== Distinct Random Numbers ======
length(unique(out$random)) # expectation: 6000
I have been strugling with this for 2 days. I asked this question earlier with only general response about random numbers.
Here I would like to ask for a solution (if anybody knows) how to set doRNG package options (or similar package) in a way that all the random numbers are different. Across all the loops.
I have tried tons of doRNG settings and I still can't get it to work. Tried R versions 3.5.3 and 3.6.3 on two different computers.
UPDATE Following discussion with #Limey
Purpose of the code is to simulate football matches. As the simulation is large, I use iterChunk to "split" the simulation into managable parts and after each iter send the data into PostgreSQL database so the simulation doesn't overload RAM. Some matches already have real world results and have HG (home goals) filled in. I want to simulate the rest.
When setting iterChunk to 1 everything is fine. Increasing iterChunk leads to generation of same numbers within iter. For example when I set nsim to 100 and iterChunk to 10. (All matches simulated 100 times, 10 times in 10 loops). I expect 600 random numbers (each match independently simulated accross all the loops). However I only get 180 - following the logic: 3 cores * 6 matches * 10 iterChunks.) Using 2 workers I do get 120 distinct random numbers (2 * 6 * 10)
Furthermore: exluding dt$HG[tmp_sim] = 3 I do get all random numbers different with whatever setting.
To understand the problem, I suggest:
Run the code as is. (possibly setting nsim to 100 and iterChunk to 10) You will get 180 different random numbers. With lower number of nsim & iterChunk things may work as expected.
Comment out dt$HG[tmp_sim] = 3.
You will get 6000 different random numbers (600 if you change nsim and iterChunk)
The code in 2nd step assigns goals scored by home team. It looks like some kind of bug I can't get over. Even information that someone gets the same result and doesn't know why will be helpful - it will lift the weight of my own stupidity out of me.
Thank you, I highly appreciate any effort.
I realised what the problem with OP's code was whilst I was in the shower. It's simple, and obvious in retrospect: all the loops and parallel processes are working on the same object - the dt data frame. So they're constantly overwriting the changes that each makes, and at the end of the outer loop, you just have multiple copies of the changes made by the last loop to complete. The solution is equally simple: work on a copy of the dt data frame.
To minimise the changes, I renamed dt to baseDT
# ====== Data Preparation ======
baseDT = data.frame(id = 1:10,
part = rep("dt",10),
HG = c(1,3,6,NA,NA,2,NA,NA,NA,NA),
random = NA)
and then took a copy of it at the top of the foreach loop
out_iter = foreach(i = 1:iterChunk,
.combine = rbind, .multicombine = TRUE, .maxcombine = 100000,
.inorder = FALSE, .verbose = FALSE,
.packages = c("plyr", "dplyr")) %dopar% {
dt <- baseDT
This gives
> length(unique(out$random)) # expectation: 6000
[1] 6000
as expected.
Modifying the "Hello World" example in the "getting started with doParallel" vignette to generate random numbers, I came up with:
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
myFunc <- function(n) {runif(n)}
foreach(i=1:3) %dopar% myFunc(10)
[[1]]
[1] 0.18492375 0.13388278 0.65455450 0.93093066 0.41157625 0.89479764 0.14736529 0.47935995 0.03062963 0.16110714
[[2]]
[1] 0.89245145 0.20980791 0.83828019 0.04411547 0.38184303 0.48110619 0.51509058 0.93732055 0.40159834 0.81414140
[[3]]
[1] 0.74393129 0.66999730 0.44411989 0.85040773 0.80224527 0.72483644 0.64566262 0.22546420 0.14526819 0.05931329
Suggesting that getting random numbers across threads is straightforward. Indeed, the examples on pages 2 and 3 of the doRNG reference manual say the same thing.
In fact, if I understand you correctly, the purpose of doRNG is to do precisely the opposite of what you want: to make random processes reproducible across threads.
Of course, this doesn't guarantee that all numbers are different across all threads. But it makes duplication very unlikely. A guarantee of no duplicates would mean some degree of determinism in the process: a completely random process might produce duplicates by chance.
Update
Following on from our conversation in the comments...
We've established that the problem is in your program logic, not the parallelisation per se. So we need to refocus the question: what are you trying to do. I'm afraid it's not at all clear to me. So that means we need to simplify.
I set nsim to 5 and iterChunk to 1. I get 5 data frames which look like
id part HG random id_sim iter i
1 1 dt 1 NA 1 1 1
2 2 dt 3 NA 1 1 1
3 3 dt 6 NA 1 1 1
4 4 dt 3 0.6919744 1 1 1
5 5 dt 3 0.5413398 1 1 1
6 6 dt 2 NA 1 1 1
7 7 dt 3 0.3983175 1 1 1
8 8 dt 3 0.3342174 1 1 1
9 9 dt 3 0.6126020 1 1 1
10 10 dt 3 0.4185468 1 1 1
In each, the values of id_sim and iter are always the same, and run from 1 in the first data frame to 5 in the fifth. i is 1 for all rows in all data frames. Values in random do appear to be random, and different between data frames. But the NAs are all in the same positions in every data frame: the 1st, 2nd, 3rd and 6th rows. The values of HG are as shown above for all five data frames.
Is that what you would expect? If not, what do you expect? Given we know the problem is not the paraellisation, you need to give us more information.
Update 2
Do you know Arduan? They posted a related question over the weekend...
I'm not going to tell you what's wrong with your code. I'll show you how I would apprach your problem. I hope you'll agree it's more readable, if nothing else.
So, we're simulating some football matches. I'll assume its a league format and use the english Premier League as an example. Start by generating the fixture list for a single season.
library(tidyverse)
teams <- c("Arsenal", "Aston Villa", "Bournemouth", "Brighton & Hove Albion",
"Burnley", "Chelsea", "Crystal Palace", "Everton", "Leicester City",
"Liverpool", "Manchester City", "Manchester United", "Newcastle United",
"Norwich City", "Sheffield United", "Southampton", "Tottenham Hotspur",
"Watford", "West Ham United", "Wolverhampton Wanderers")
fixtures <- tibble(HomeTeam=teams, AwayTeam=teams) %>%
complete(HomeTeam, AwayTeam) %>%
filter(HomeTeam != AwayTeam) # A team can't play itself
fixtures %>% head(5)
# A tibble: 5 x 2
HomeTeam AwayTeam
<chr> <chr>
1 Arsenal Aston Villa
2 Arsenal Bournemouth
3 Arsenal Brighton & Hove Albion
4 Arsenal Burnley
5 Arsenal Chelsea
Suppose we know some results. I'll use yesterday's matches as an illustration.
knownResults <- tribble(~HomeTeam, ~AwayTeam, ~HomeGoals, ~AwayGoals,
"Burnley", "Sheffield United", 1, 1,
"Newcastle United", "West Ham United", 2, 2,
"Liverpool", "Aston Villa", 2, 0,
"Southampton", "Manchester City", 1, 0)
resultsSoFar <- fixtures %>%
left_join(knownResults, by=c("HomeTeam", "AwayTeam"))
resultsSoFar %>% filter(!is.na(HomeGoals))
# A tibble: 4 x 4
HomeTeam AwayTeam HomeGoals AwayGoals
<chr> <chr> <dbl> <dbl>
1 Burnley Sheffield United 1 1
2 Liverpool Aston Villa 2 0
3 Newcastle United West Ham United 2 2
4 Southampton Manchester City 1 0
Now some utility functions. You could certainly combine them, but I think it's clearer to keep them separate so you can see exactly what each one is doing.
First, a function to simulate the results of all matches whose results are unknown. The details of how you simulate the scores are entirely arbitrary. I've assumed that home teams score an average of 1.5 goals a game, away teams score 1.2 goals per game. Later on, I'm going to use this to simulate many seasons in one go, so I'll add a variable (Iteration) to index the season.
simulateResults <- function(i=NA, data) {
n <- nrow(data)
data %>%
add_column(Iteration=i, .before=1) %>%
mutate(
# Give the home team a slight advantage
HomeGoals=ifelse(is.na(HomeGoals), floor(rexp(n, rate=1/1.5)), HomeGoals),
AwayGoals=ifelse(is.na(AwayGoals), floor(rexp(n, rate=1/1.2)), AwayGoals)
)
}
Use it, and check that we haven't overwritten known results:
simulateResults(1, resultsSoFar) %>% filter(HomeTeam=="Burnley", AwayTeam=="Sheffield United")
# A tibble: 1 x 5
Iteration HomeTeam AwayTeam HomeGoals AwayGoals
<dbl> <chr> <chr> <dbl> <dbl>
1 1 Burnley Sheffield United 1 1
I'm going to parallelise the overall simulation, so now let's have a function to simulate a chunk of simulations. Again, create an index column to identify the chunk.
simulateChunk <- function(chunkID=NA, n) {
bind_rows(lapply(1:n, simulateResults, data=resultsSoFar)) %>%
add_column(Chunk=chunkID, .before=1)
}
simulateChunk(chunkID=1, n=3)
# A tibble: 1,140 x 6
Chunk Iteration HomeTeam AwayTeam HomeGoals AwayGoals
<dbl> <int> <chr> <chr> <dbl> <dbl>
1 1 1 Arsenal Aston Villa 2 0
2 1 1 Arsenal Bournemouth 0 0
3 1 1 Arsenal Brighton & Hove Albion 2 0
4 1 1 Arsenal Burnley 2 0
5 1 1 Arsenal Chelsea 1 0
6 1 1 Arsenal Crystal Palace 0 0
7 1 1 Arsenal Everton 2 3
8 1 1 Arsenal Leicester City 2 0
9 1 1 Arsenal Liverpool 0 1
10 1 1 Arsenal Manchester City 4 0
OK. Now I'm ready to do the main simulation work. I'll run 10 chunks of 100 simulations eash, to give 1000 simulated seasons in total, the same as you had.
library(doParallel)
cl <- makeCluster(3)
registerDoParallel(cl)
chunkSize <- 100
nChunks <- 10
startedAt <- Sys.time()
x <- bind_rows(foreach(i=1:nChunks, .packages=c("tidyverse")) %dopar% simulateChunk(i, n=chunkSize))
finishedAt <- Sys.time()
print(finishedAt - startedAt)
Time difference of 6.772928 secs
stopCluster(cl)
> x
# A tibble: 380,000 x 6
Chunk Iteration HomeTeam AwayTeam HomeGoals AwayGoals
<int> <int> <chr> <chr> <dbl> <dbl>
1 1 1 Arsenal Aston Villa 2 0
2 1 1 Arsenal Bournemouth 3 1
3 1 1 Arsenal Brighton & Hove Albion 0 1
4 1 1 Arsenal Burnley 3 0
5 1 1 Arsenal Chelsea 1 0
6 1 1 Arsenal Crystal Palace 0 0
7 1 1 Arsenal Everton 1 2
8 1 1 Arsenal Leicester City 0 0
9 1 1 Arsenal Liverpool 0 0
10 1 1 Arsenal Manchester City 0 0
Let's check I've got sensible results. As a basic check, I'll look at the results of Arsenal vs Aston Villa:
x %>%
filter(HomeTeam == "Arsenal", AwayTeam=="Aston Villa") %>%
group_by(HomeGoals, AwayGoals) %>%
summarise(N=n(), .groups="drop") %>%
pivot_wider(
values_from="N", names_prefix="AwayGoals",
names_sep="", names_from=AwayGoals
)
# A tibble: 8 x 10
HomeGoals AwayGoals0 AwayGoals1 AwayGoals2 AwayGoals3 AwayGoals4 AwayGoals5 AwayGoals6 AwayGoals8 AwayGoals7
<dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 0 299 129 57 19 12 7 NA NA NA
2 1 135 63 25 6 4 4 1 2 NA
3 2 75 21 12 9 4 1 NA NA 1
4 3 30 13 10 1 NA NA NA NA NA
5 4 21 7 1 1 NA NA NA NA NA
6 5 11 2 1 NA 2 NA NA NA NA
7 6 4 2 2 NA NA NA NA NA NA
8 7 4 1 1 NA NA NA NA NA NA
That looks reasonable. Now confirm that the matches with known results don't vary. For example:
x %>%
filter(HomeTeam == "Liverpool", AwayTeam=="Aston Villa") %>%
group_by(HomeGoals, AwayGoals) %>%
summarise(N=n(), .groups="drop") %>%
pivot_wider(values_from="N", names_prefix="AwayGoals", names_sep="", names_from=AwayGoals)
HomeGoals AwayGoals0
<dbl> <int>
1 2 1000
All good.
So, That's 23 statements to generate the fixtures, take account of known results, simulate the remainder of the matches and do some basic sanity checking. I could easily get that down to under 20 statements if I had to. That's about a third less than you were using just to try to simulate the unknown results. [The actual simulation takes fewer than 10 statements.] I think my approach is easier to understand: by using tidy verbs the code is almost self-documenting.

How to find search words from a table, in another table, and then create new columns of the results?

I'm trying to find specifice words listed in a tibble arbeit in the another tibble rawEng$Text. If a word, or words, were found, I want to create, or mutate, a new data frame iDataArbeit with two new columns, one for the found word/s wArbeit, and one for the sum of there tf-idf iArbeitscores from arbeit$tfidf
My Data:
arbeit:
X1 feature tfidf
<dbl> <chr> <dbl>
1 0 sick 0.338
2 2 contract 0.188
3 3 pay 0.175
4 4 job 0.170
5 5 boss 0.169
6 6 sozialversicherungsnummer 0.169
rawEng:
Gender Gruppe Datum Text
<chr> <chr> <dttm> <chr>
1 F Berlin Expats 2017-07-07 00:00:00 Anyone out there who's had to apply for Führung~
2 F FAB 2018-01-18 00:00:00 Dear FAB, I am in need of a Führungszeugnis no ~
3 M Free Advice ~ 2017-01-30 00:00:00 Dear Friends, i would like to ask you how can I~
4 M FAB 2018-04-12 00:00:00 "Does anyone know why the \"Standesamt Pankow (~
5 F Berlin Expats 2018-11-12 00:00:00 having trouble finding consistent information a~
6 F Toytown Berl~ 2017-06-08 00:00:00 "Hello\r\n\r\nI have a question regarding Airbn~
I've tried with dplyr::mutate, using this code:
idataEnArbeit <- mutate(rawEng, wArbeit = ifelse((str_count(rawEng$Text, arbeit$feature))>=1,
arbeit$feature, NA),
iArbeit = ifelse((str_count(rawEng$Text, arbeit$feature))>=1,
arbeit$tfidf, NA))
but all I get is one Word, and it's tf-idf score, in the new columens iDatatArbeit$wArbeitand iDataArbeit$iArbeit
Gender Gruppe Datum Text wArbeit iArbeit
<chr> <chr> <dttm> <chr> <chr> <dbl>
1 F Berlin | Girl ~ 2018-09-11 13:22:05 "11 septembre, 13:21 GGI ~ sick 0.338
2 F ExpatBabies Be~ 2017-10-19 16:24:23 "16:24 Babysitter needed! B~ sick 0.338
3 F Berlin | Girl ~ 2018-06-22 18:24:19 "gepostet. Leonor Valen~ sick 0.338
4 F 'Neu in Berlin' 2018-09-18 23:19:51 "Hello guys, I am working wit~ sick 0.338
5 M Free Advice Be~ 2018-04-27 08:49:24 "In need of legal advice: Wha~ sick 0.338
6 F Free Advice Be~ 2018-07-04 18:33:03 "Is there somebody I can pay ~ sick 0.338
In summary: I want all words from arbeit$feature which are found in rawEng$Text to be added in iDataArbeit$wArbeit, and the sum of there tf-idf score to be added in iDataArbeit$iArbeit
Since I don't have your data, I'll import the gutenbergr library and play w/ Treasure Island.
library(tidytext)
library(gutenbergr)
## Now get the dataset
Treasure_Island <- gutenberg_works(title == "Treasure Island") %>% pull(gutenberg_id) %>%
gutenberg_download(.)
## and construct a toy arbeit:
arbeit <- data.frame(feature = c("island", "treasure", "to"),
tfidf = c(0.3,0.5,0.6))
## Break up a word into it's components (the head is just to keep the example short... you omit)
tidy_treasure <- unnest_tokens(Treasure_Island, feature, text, drop = FALSE) %>%
head(500)
## now bring the tfidf into tidy_treasure
df <- left_join(tidy_treasure, arbeit, by = "feature")
## and now you can average by sentence normally.
## To get the words we have to throw out the words that don't contribute to our tfidf.
## Two options:
df %>% filter(!is.na(tfidf)) %>% group_by(text) %>% summarize(AveTFIDF = sum(tfidf, na.rm = TRUE),
Words = paste(feature, collapse = ";"))
## Or if you want to keep a row for each found word, we can't use summarize, but we can still add them all up.
df %>% filter(!is.na(tfidf)) %>% group_by(text) %>% mutate(AveTFIDF = sum(tfidf, na.rm = TRUE))

Obtaining data from Spotify Top Charts using spotifyr

I'm trying to obtain the audio features for the top 200 charts of all of 2017 using the spotifyr package on R, I tried:
days<- spotifycharts::chartdaily()
for (i in days) {
spotifycharts::chart_top200_daily(region = "global",days = "days[i]")
}
to obtain the top 200 daily for all of 2017, but I was unable to do it.
Can someone help me? :(
It works, if you turn days from tibble into vector:
days <- unlist(chart_daily())
lapply(days[1:3], function(i) chart_top200_daily("global", days = i))
But it parse data badly, so there will be problems with variable names, etc:
# A tibble: 6 x 5
x1 x2 x3 note.that.these.figures.are.generated.… x5
<int> <chr> <chr> <int> <chr>
1 NA Track Name Artist NA URL
2 1 thank u, next Ariana… 8293841 https://open.spoti…
3 2 Taki Taki (with S… DJ Sna… 5467625 https://open.spoti…
4 3 MIA (feat. Drake) Bad Bu… 3955367 https://open.spoti…
5 4 Happier Marshm… 3357435 https://open.spoti…
6 5 BAD XXXTEN… 3131745 https://open.spoti…

How to read in Data from Messy Excel Books

I've been dealing with patient and financial data from a hospital. The data is stored in .xlsx excel books. There are multiple pages within each sheet stretching horizontally and vertically. Some of the columns have neatly defined names as you would want for R but then others do not or have text in between and not to mention what appear to be randomly. At times
a section has a title which is the result of multiple rows being formatted into one singular row.
Unfortunately, I cannot show the data due to confidentiality. Is there anyway around this when the data is far from being in a tidy format?
So far I have been copying and pasting the data into a new CSV.
While this was effective I felt that it was largely inefficient.Is this the best approach to take?
Help would be much appreciated
Thanks
EDIT
As I cannot show data this is the best I can show
Hi #Paul
So Let me give a rough example
Jan Feb March April
Income X 1 2 3 4
Income Y 2 4 4 6
Expenditure
Jan Feb March April Another table here also
Expense 1 3 5 7
Expense 5 6 7 8
(Excel Bar chart)
Look at the readxl package, the range option might be what you're looking for:
library(readxl)
df1 <- read_xlsx("C:\\Users\\...\\Desktop\\Book1.xlsx", range = "A1:D3")
# # A tibble: 2 x 4
# Jan Feb March April
# <dbl> <dbl> <dbl> <dbl>
# 1 1 3 5 7
# 2 5 6 7 8
df2 <- read_xlsx("C:\\Users\\...\\Desktop\\Book1.xlsx", range = "B6:E8")
# # A tibble: 2 x 4
# Jan Feb March April
# <dbl> <dbl> <dbl> <dbl>
# 1 1 3 5 7
# 2 5 6 7 8

Is rvest the best tool to collect information from this table?

I have used rvest package to extract a list of companies and the a.href elements in each company, which I need to proceed with the data collection process. This is the link of the website: http://www.bursamalaysia.com/market/listed-companies/list-of-companies/main-market.
I have used the following code to extract the table but nothing comes out. I used other approaches as those posted in "Scraping table of NBA stats with rvest" and similar links, but I cannot obtain what I want. Any help would be greatly appreciated.
my code:
link.main <-
"http://www.bursamalaysia.com/market/listed-companies/list-of-companies/main-market/"
web <- read_html(link.main) %>%
html_nodes("table#bm_equities_prices_table")
# it does not work even when I write html_nodes("table")
or ".table" or #bm_equities_prices_table
web <- read_html(link.main)
%>% html_nodes(".bm_center.bm_dataTable")
# no working
web <- link.main %>% read_html() %>% html_table()
# to inspect the position of table in this website
The page generates the table using JavaScript, so you either need to use RSelenium or Python's Beautiful Soup to simulate the browser session and allow javascript to run.
Another alternative is to use awesome package by #hrbrmstr called decapitated, which basically runs headless Chrome browser session in the background.
#devtools::install_github("hrbrmstr/decapitated")
library(decapitated)
library(rvest)
res <- chrome_read_html(link.main)
main_df <- res %>%
rvest::html_table() %>%
.[[1]] %>%
as_tibble()
This outputs the content of the table alright. If you want to get to the elements underlying the table (href attributes behind the table text), you will need to do a bit more of list gymnastics. Some of the elements in the table are actually missing links, extracting by css proved to be difficult.
library(dplyr)
library(purrr)
href_lst <- res %>%
html_nodes("table td") %>%
as_list() %>%
map("a") %>%
map(~attr(.x, "href"))
# we need every third element starting from second element
idx <- seq.int(from=2, by=3, length.out = nrow(main_df))
href_df <- tibble(
market_href=as.character(href_lst[idx]),
company_href=as.character(href_lst[idx+1])
)
bind_cols(main_df, href_df)
#> # A tibble: 800 x 5
#> No `Company Name` `Company Website` market_href company_href
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 7-ELEVEN MALAYS~ http://www.7elev~ /market/list~ http://www.~
#> 2 2 A-RANK BERHAD [~ http://www.arank~ /market/list~ http://www.~
#> 3 3 ABLEGROUP BERHA~ http://www.gefun~ /market/list~ http://www.~
#> 4 4 ABM FUJIYA BERH~ http://www.abmfu~ /market/list~ http://www.~
#> 5 5 ACME HOLDINGS B~ http://www.suppo~ /market/list~ http://www.~
#> 6 6 ACOUSTECH BERHA~ http://www.acous~ /market/list~ http://www.~
#> 7 7 ADVANCE SYNERGY~ http://www.asb.c~ /market/list~ http://www.~
#> 8 8 ADVANCECON HOLD~ http://www.advan~ /market/list~ http://www.~
#> 9 9 ADVANCED PACKAG~ http://www.advan~ /market/list~ http://www.~
#> 10 10 ADVENTA BERHAD ~ http://www.adven~ /market/list~ http://www.~
#> # ... with 790 more rows
Another option without using browser:
library(httr)
library(jsonlite)
library(XML)
r <- httr::GET(paste0(
"http://ws.bursamalaysia.com/market/listed-companies/list-of-companies/list_of_companies_f.html",
"?_=1532479072277",
"&callback=jQuery16206432131784246533_1532479071878",
"&alphabet=",
"&market=main_market",
"&_=1532479072277"))
l <- rawToChar(r$content)
m <- gsub("jQuery16206432131784246533_1532479071878(", "", substring(l, 1, nchar(l)-1), fixed=TRUE)
tbl <- XML::readHTMLTable(jsonlite::fromJSON(m)$html)$bm_equities_prices_table
output:
> head(tbl)
# No Company Name Company Website
#1 1 7-ELEVEN MALAYSIA HOLDINGS BERHAD http://www.7eleven.com.my
#2 2 A-RANK BERHAD [S] http://www.arank.com.my
#3 3 ABLEGROUP BERHAD [S] http://www.gefung.com.my
#4 4 ABM FUJIYA BERHAD [S] http://www.abmfujiya.com.my
#5 5 ACME HOLDINGS BERHAD [S] http://www.supportivetech.com/
#6 6 ACOUSTECH BERHAD [S] http://www.acoustech.com.my/

Resources