quirky behavoir with rvest wikipedia scraping - r

Sorry, this is a weird question, but I just can't seem to figure it out myself. The good news is: I think it's totally reproducible.
I'm trying to build a simple R function to use {rvest} to scrape wikipedia for the hometown of musicians. Basically, the function I wrote works, but for certain artists, it doesn't work (returns NULL). (Randy Newman is one such, so I'll use him as an example.)
When I just run the whole thing (below) and then findHome("randy newman") I get NULL but then when I attempt to debug, I run the tableMusic() function and then artist <- "randy newman" and then run all the guts of the artistData() function line by line, IT WORKS!
AND THEN, once I've done that, I can run findHome("randy newman") and it will work right. What gives?! Do I have something in the wrong order or something? I can't seem to figure it out.
Any help is much appreciated. Here is the code:
library(rvest)
findHome <- function(artist) {
##function to look for the table with the right info
tableMusic <- function(data) {
if(!any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) {
for (i in 2:5) {
data <- try(url %>% html %>% html_nodes(xpath=paste('//*[#id="mw-content-text"]/table[', i, ']', sep="")) %>% html_table(fill=T), silent=T)
if(!class(data)=="try-error" & length(data)>0) {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
if(any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) {
break
}
}
}
}
if(class(data)=="try-error" | length(data)<1) {
data <- NULL
} else if (!any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) {
data <- NULL
}
data
}
#function to pull data and try different pages if the first is wrong
artistData <- function(artist) {
artist <- gsub(" ", "_", artist)
artist <- gsub("'", "%27", artist)
##first try getting the data
url <- paste("https://en.wikipedia.org/wiki/", artist, sep="")
data <- try(url %>% html %>% html_nodes(xpath='//*[#id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T)
##check if it's the right page (deal with disambiguation issues)
if(!class(data)=="try-error" & length(data)>0) {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
data <- tableMusic(data)
}
## if try-error or musicTable==NULL, try _(band)
if(class(data)=="try-error" | is.null(data) | length(data)<1) {
url <- paste("https://en.wikipedia.org/wiki/", artist, "_(band)", sep="")
data <- try(url %>% html %>% html_nodes(xpath='//*[#id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T)
if(class(data)=="try-error"){
data <- NULL
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
data <- tableMusic(data)
}
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
}
## if try-error or musicTable==NULL, try _(musician)
if(class(data)=="try-error" | is.null(data) | length(data)<1) {
url <- paste("https://en.wikipedia.org/wiki/", artist, "_(musician)", sep="")
data <- try(url %>% html %>% html_nodes(xpath='//*[#id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T)
if(class(data)=="try-error"){
data <- NULL
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
data <- tableMusic(data)
}
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
}
data
}
## first try finding data
data <- artistData(artist)
## try finding with and/&
if(is.null(data)){data <- artistData(unlist(strsplit(artist, " and| &"))[1])}
## if no matches return ""
if(class(data)=="try-error" | is.null(data)) {
data <- ""
return()
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
}
## if we have a matching page, pull the relevant data
origin <- data[data[,1]=="Origin",2]
if(length(origin)>0) {
home <- origin
} else {
born <- data[data[,1]=="Born",2]
if (length(born)>0) {
home <- unlist(strsplit(born, "age.[0-9]+)"))[2]
} else {
home <- ""
}
}
home
}
findHome("randy newman")

I figured it out. I had to add a url parameter into the tableMusic() function. As it was, it recycling the url from past searches. Thanks for the suggestion.

Related

Function to abbreviate scientific names

Could you please help me?
I'm trying to modify an R function written by a colleague. This function receives a character vector with scientific names (Latin binomes), just like this one:
Name
Cerradomys scotti
Oligoryzomys sp
Philander frenatus
Byrsonima sp
Campomanesia adamantium
Cecropia pachystachya
Cecropia sp
Erythroxylum sp
Ficus sp
Leandra aurea
Then, it should abbreviate the scientific names, using only the first three letters of the genus (first term) and the epithet (second term) to make a short code. For instance, Cerradomys scotti should become Cersco.
This is the original function:
AbbreviatedNames <- function(vector) {
abbreviations <- character(length = length(vector))
splitnames <- strsplit(vector, " ")
for (i in 1:length(vector)) {
vector[i] <- if(splitnames[[i]][2] == "^sp") {
paste(substr(splitnames[[i]][1],1,3),
splitnames[[i]][2], sep = "")
}
else {
paste(substr(splitnames[[i]][1],1,3),
substr(splitnames[[i]][2],1,3), sep = "")
}
}
vector
}
With a simple list like that one, the function works perfectly. However, when the list has some missing or extra elements, it does not work. The loop stops when it meets the first row that does not match the pattern. Let's take this more complex list as an example:
Name
Cerradomys scotti
Oligoryzomys sp
Philander frenatus
Byrsonima sp
Campomanesia adamantium
Cecropia pachystachya
Cecropia sp
Erythroxylum sp
Ficus sp
Leandra aurea
Morfosp1
Vismia cf brasiliensis
See that Morfosp1 has only 1 term. And Vismia cf brasiliensis has an additional term (cf) in the middle.
I've tried adapting the function, for instance, this way:
AbbreviatedNames <- function(vector) {
abbreviations <- character(length = length(vector))
splitnames <- strsplit(vector, " ")
for (i in 1:length(vector)) {
vector[i] <- if(splitnames[[i]][2] == "^sp" & is.na(splitnames[[i]][2]))) {
paste(substr(splitnames[[i]][1],1,3),
splitnames[[i]][2], sep = "")
}
else {
paste(substr(splitnames[[i]][1],1,3),
substr(splitnames[[i]][2],1,3), sep = "")
}
}
vector
}
Nevertheless, it does not work. I get this error message:
Error in if (splitnames[[i]][2] == "^sp" & is.na(splitnames[[i]][2])) { :
valor ausente onde TRUE/FALSE necessário
How could I make the function:
Deal also with names that have only 1 term?
Expected outcome: Morfosp1 -> Morfosp1 (stays the same)
Deal also with names that have an additional term in the middle?
Expected outcome: Vismia cf brasiliensis -> Visbra (term in the middle is ignored)
Thank you very much!
Something like this is pretty concise:
test <- c("Cerradomys scotti", "Oligoryzomys sp", "Latingstuff", "Latin staff more")
# function to truncate a given name
trunc_str <- function(latin_name) {
# split it on a space
name_split <- unlist(strsplit(latin_name, " ", fixed = TRUE))
# if one name, just return it
if (length(name_split) == 1) return(name_split)
# truncate to first 3 letters
name_trunc <- substr(name_split, 1, 3)
# paste the first and last term together (skipping any middle ones)
paste0(head(name_trunc, 1), tail(name_trunc, 1))
}
# iterate over all
vapply(test, trunc_str, "")
# Cerradomys scotti Oligoryzomys sp Latingstuff Latin staff more
# "Cersco" "Olisp" "Latingstuff" "Latmor"
If you don't want a named vector output, you can use USE.NAMES = FALSE in vapply(). Or feel free to use a loop here.
AbbreviatedNames <- function(vector) {
abbreviations <- character(length = length(vector))
splitnames <- strsplit(vector, " ")
for (i in 1:length(vector)){
# One name
if(length(splitnames[[i]])==1){
vector[i] <- paste(substr(splitnames[[i]][1],1,3),
substr(splitnames[[i]][2],1,3), sep = "")
}
# Two names
else if(length(splitnames[[i]])==2){
vector[i] <- if(splitnames[[i]][2] == "^sp") {
paste(substr(splitnames[[i]][1],1,3),
splitnames[[i]][2], sep = "")
}
else {
paste(substr(splitnames[[i]][1],1,3),
substr(splitnames[[i]][2],1,3), sep = "")
}
}
# Three names
else if(length(splitnames[[i]])==3){
vector[i] <- paste(substr(splitnames[[i]][1],1,3),
substr(splitnames[[i]][3],1,3), sep = "")
# Assuming that the unwanted word is always in the middle
}
}
return(vector)
}
I tested on the list you gave and it seems to work, tell me if you need a more general code
Thank you very much for the help, Ricardo and Adam! I've made the code available on GitHub to other people who work with interaction networks, and need to abbreviate scientific names to be used in graphs.

htmltab "no table found" workaround?

I am trying to scrape some (a lot) of NCAA mens basketball data off of a website called RealGM. My code lies below:
library(htmltab)
tables <- list()
for (i in 0:1548) {
for (j in 0:16) {
for (k in 0:4) {
a <- i+1
b <- 2003+j
c <- k+1
url <- paste("https://basketball.realgm.com/ncaa/conferences/Big-Ten-Conference/2/Michigan/",a,"/individual-games/",b,"/minutes/Season/desc/",c,sep = "")
tables[[paste(i,j,k,sep = "")]] <- htmltab(url,rm_nodata_cols = F,which = 1)
}
}
}
I've used similar methods in the past to pull data off of sites like Sports Reference which keep player data in tables.
In this loop, the variable a controls the team, b controls the year, and c controls the page number for the game log set.
My issue here is that some of the referenced URLs contain no tables, i.e. there is no 4th page of game logs for Michigan's 2003 team, but there are 5 pages for their 2018 team.
Unfortunately, htmltab returns an error when there is not table found, and it aborts my loop. Is there a workaround for this so that it will just skip those urls and/or continue through the rest of the process?
I was able to figure out how to do this by checking first to see if a table existed, and if not, go to the next iteration of the loop:
library(htmltab)
tables <- list()
for (i in 0:1548) {
for (j in 0:16) {
for (k in 0:4) {
a <- i+1
b <- 2003+j
c <- k+1
url <- paste("https://basketball.realgm.com/ncaa/conferences/Big-Ten-Conference/2/Michigan/",a,"/individual-games/",b,"/minutes/Season/desc/",c,sep = "")
test <- html_nodes(read_html(url),"table")
if (length(test) == 0){
next
}
tables[[paste(i,j,k,sep = "")]] <- htmltab(url,rm_nodata_cols = F,which = 1)
}
}
}
One option is to use tryCatch and skip the URL's which give an error.
library(htmltab)
tables <- list()
for (i in 1:1549) {
for (j in 2003:2019) {
for (k in 1:5) {
url <- paste0("https://basketball.realgm.com/ncaa/conferences/Big-Ten-Conference/2/Michigan/",i,"/individual-games/",j,"/minutes/Season/desc/",k)
tables[[paste0(i,j,k)]] <- tryCatch({
htmltab(url,rm_nodata_cols = F,which = 1)
}, error = function(e) {
cat("Wrong URL : ", url, " skipping\n")
})
}
}
}

R: Create a function with optional parameters without if statement

I want to create a function with optional parameters. I am working on a dataset with income, states, and a few flags. The function takes the data, state and flag and filters it. If state and flag is missing, it should return the entire data, and so on. So far I have this code which is working fine. However, I want to know if there is a simpler code than this.
myfun <- function(data, states, flag){
if(missing(states)) {
if(missing(flag)) {
new_data <- data
} else {
new_data <- subset(data, data$Flag == flag)
}
} else if(missing(flag)) {
new_data <- subset(data, data$State == states)
} else {
new_data <- subset(data, (data$State == states & data$Flag == flag))
}
temp_data <- toJSON(new_data)
save(temp_data, file = "Mydata.JSON")
return(new_data)
}
I know we can have optional parameters by having a default parameter like flag = Y. But how do I pass both the parameters in default parameters i.e. flag = Y & N, or all the 50 states. I am new to R and any help would be greatly appreciated.
Update: Got a solution I was looking for thanks to Daniel Winkler
test2 <- function(data,states = unique(data$State),flag = c("Y","N"))
{my_data<-subset(data, (data$State %in% states & data$Flag %in% flag))}
For passing multiple values to the function:
myfun<-function(data,states,flag){
if(missing(states))
{
if(missing(flag))
{
new_data<-data
}
else
{
new_data<-subset(data, data$Flag %in% flag)
}
}
else if(missing(flag))
{
new_data<-subset(data, data$State %in% states)
}
else
{
new_data<-subset(data, (data$State %in% states & data$Flag %in% flag))
}
temp_data <- toJSON(new_data)
save(temp_data, file="Mydata.JSON")
return(new_data)
}
Then call with:
all_states <- unique(mydata$States)
myfun(data = mydata, states = all_states, flag = c('Y', 'N'))

Making a nested for loop run faster in R

I have the following code (nested for loop) in R which is extremely slow. The loop matches values from two columns. Then picks up a corresponding file and iterates through the file to find a match. Then it picks up that row from the file. The iterations could go up to more than 100,000. Please if some one can provide an insight on how to quicken the process.
for(i in 1: length(Jaspar_ids_in_Network)) {
m <- Jaspar_ids_in_Network[i]
gene_ids <- as.character(GeneTFS$GeneIds[i])
gene_names <- as.character(GeneTFS$Genes[i])
print("i")
print(i)
for(j in 1: length(Jaspar_ids_in_Exp)) {
l <- Jaspar_ids_in_Exp[j]
print("j")
print(j)
if (m == l) {
check <- as.matrix(read.csv(file=paste0(dirpath,listoffiles[j]),sep=",",header=FALSE))
data_check <- data.frame(check)
for(k in 1: nrow(data_check)) {
gene_ids_JF <- as.character(data_check[k,3])
genenames_JF <- as.character(data_check[k,4])
if(gene_ids_JF == gene_ids) {
GeneTFS$Source[i] <- as.character(data_check[k,3])
data1 <- rbind(data1, cbind(as.character(data_check[k,3]),
as.character(data_check[k,8]),
as.character(data_check[k,9]),
as.character(data_check[k,6]),
as.character(data_check[k,7]),
as.character(data_check[k,5])))
} else if (toupper(genenames_JF) == toupper(gene_names)) {
GeneTFS$Source[i] <- as.character(data_check[k,4])
data1 <- rbind(data1, cbind(as.character(data_check[k,4]),
as.character(data_check[k,5]),
as.character(data_check[k,6]),
as.character(data_check[k,7]),
as.character(data_check[k,8]),
as.character(data_check[k,2])))
} else {
# GeneTFS[i,4] <- "No Evidence"
}
}
} else {
# GeneTFS[i,4] <- "Record Not Found"
}
}
}
If you pull out the logic for processing one pair into a function, f(m,l), then you could replace the double loop with:
outer(Jaspar_ids_in_Network, Jaspar_ids_in_Exp, Vectorize(f))

Running a fine & working code through a loop doesn't work

From the string:
"((VBD)(((JJ))(CC)((RB)(JJ)))((IN)((DT)(JJ)(NNP)(NNPS))))"
I needed the following:
"JJ", "RBJJ", "DTJJNNPNNPS", "JJCCRBJJ", "INDTJJNNPNNPS" "VBDJJCCRBJJINDTJJNNPNNPS"
(this was my earlier query on SO, which was solved by #Brian Diggs. Please refer to "R : how to differentiate between inner and innermost brackets using regex", if necessary)
so I Used the following code:
library("plotrix")
library("plyr")
strr<-c("((VBD)(((JJ))(CC)((RB)(JJ)))((IN)((DT)(JJ)(NNP)(NNPS))))")
tmp <- gsub("\\(([^\\(\\)]*)\\)", '("\\1")', strr)
tmp <- gsub("\\(", "list(", tmp)
tmp <- gsub("\\)list", "),list", tmp)
tmp <- eval(parse(text=tmp))
atdepth <- function(l, d) {
if (d > 0 & !is.list(l)) {
return(NULL)
}
if (d == 0) {
return(unlist(l))
}
if (is.list(l)) {
llply(l, atdepth, d-1)
}
}
pastelist <- function(l) {paste(unlist(l), collapse="", sep="")}
down <- llply(1:listDepth(tmp), atdepth, l=tmp)
out <- if (length(down) > 2) {
c(unlist(llply(length(down):3, function(i) {
unlist(do.call(llply, c(list(down[[i]]), replicate(i-3, llply), pastelist)))
})), unlist(pastelist(down[[2]])))
} else {
unlist(pastelist(down[[2]]))
}
out <- out[out != ""]
And I got what I wanted, But Is it not possible to use the above code through a loop to process multiple strings (strr etc.) at the same time? I roughly need a bunch of strings to be processed and collected in a file. I'm trying to include a loop, but always end up in having just the last string from the set of strings in the out file. How should I run the code through a loop? The string set below.
strr<-c("(((((NNS))((IN)((NNS)(CC)(NNS))))((VBD)((PRP))((IN)((NN))))))",
"((((NNS))((VBD)((TO)(((NNP))((NNP))))((TO)((DT)(NNP))))))",
"((((IN)(((NNP))((NNP))))((NNP)(NNP)(NNPW)(NNP))((VBD)((IN)((DT)(JJ)(NN)(NN))))))"
)
Possibly it could be modified in a more clever way, but a straightforward one would be to define a new function and use sapply()
### Nothing new
library("plotrix")
library("plyr")
strr<-c("(((((NNS))((IN)((NNS)(CC)(NNS))))((VBD)((PRP))((IN)((NN))))))",
"((((NNS))((VBD)((TO)(((NNP))((NNP))))((TO)((DT)(NNP))))))",
"((((IN)(((NNP))((NNP))))((NNP)(NNP)(NNPW)(NNP))((VBD)((IN)((DT)(JJ)(NN)(NN))))))")
strrr <- strr[rep(1:3,200)]
atdepth <- function(l, d) {
if (d > 0 & !is.list(l)) {
return(NULL)
}
if (d == 0) {
return(unlist(l))
}
if (is.list(l)) {
llply(l, atdepth, d-1)
}
}
pastelist <- function(l) {paste(unlist(l), collapse="", sep="")}
###
### New function here
fun <- function(strr){
tmp <- gsub("\\(([^\\(\\)]*)\\)", '("\\1")', strr)
tmp <- gsub("\\(", "list(", tmp)
tmp <- gsub("\\)list", "),list", tmp)
tmp <- eval(parse(text=tmp))
down <- llply(1:listDepth(tmp), atdepth, l=tmp)
out <- if (length(down) > 2) {
c(unlist(llply(length(down):3, function(i) {
unlist(do.call(llply, c(list(down[[i]]), replicate(i-3, llply), pastelist)))
})), unlist(pastelist(down[[2]])))
} else {
unlist(pastelist(down[[2]]))
}
out <- out[out != ""]
out
}
sapply(strr, fun)

Resources