Error when trying to load DL format using igraph in R

I am trying to load the Kapferer Mine dataset into R using the igraph function read_graph.
The code is very simple; however, it throws an error.
test_g <-read_graph("http://vlado.fmf.uni-lj.si/pub/networks/data/ucinet/kapmine.dat", format = "dl")
Error in read.graph.dl(file, ...) : At foreign.c:3050 : syntax
error, unexpected $end, expecting DL in line 1, Parse error
As can be seen by following the link, the file does begin with DL. The only clue I can find to this is a message from 2015 which basically says to file a bug report.
Can DL files not be loaded by igraph at the moment, or is there some trick to it?

As there doesn't seem to be a clear way to load DL files, I have made a loader that seems to work well for the DL graphs on the Pajek website. The function is a bit scrappy and has not been extensively tested, but it may be useful to those who want to use certain graphs that are not available in a more common format. If there is more up-to-date information on these datasets, then this code can be ignored.
# Requires the tidyverse (tibble, dplyr, tidyr, stringr, purrr) and igraph
library(tidyverse)
library(igraph)

load_dl_graph <- function(file_path, directed){
  # read the raw file into a two-column tibble (line number, line text)
  raw_mat <- readLines(file_path) %>%
    enframe()
  # locate the header lines that mark each block of the DL file
  row_labels_row    <- grep("ROW LABELS:", raw_mat$value)
  column_labels_row <- grep("COLUMN LABELS:", raw_mat$value)
  level_labels_row  <- grep("LEVEL LABELS:", raw_mat$value)
  data_table_row    <- grep("DATA:", raw_mat$value)
  # pull out the row, column and level (matrix) labels
  row_labels <- raw_mat %>%
    slice((row_labels_row + 1):(column_labels_row - 1)) %>%
    select(from = value)
  column_labels <- raw_mat %>%
    slice((column_labels_row + 1):(level_labels_row - 1)) %>%
    pull(value)
  table_levels <- raw_mat %>%
    slice((level_labels_row + 1):(data_table_row - 1)) %>%
    pull(value)
  # parse the stacked adjacency matrices that follow the DATA: marker
  data_df <- raw_mat %>%
    slice((data_table_row + 1):nrow(.)) %>%
    select(value) %>%
    mutate(value = str_squish(value)) %>%
    separate(col = value, into = column_labels, sep = " ") %>%
    mutate(table_id = rep(1:length(table_levels), each = nrow(.) / length(table_levels)))
  # convert each matrix into an edge list and then into an igraph object
  tables_list <- 1:length(table_levels) %>%
    map(~{
      data_df %>%
        filter(table_id == .x) %>%
        select(-table_id) %>%
        bind_cols(row_labels, .) %>%
        pivot_longer(cols = 2:ncol(.), names_to = "to", values_to = "values") %>%
        filter(values == 1) %>%
        select(-values) %>%
        graph_from_data_frame(., directed = directed)
    })
  names(tables_list) <- table_levels
  return(tables_list)
}
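A minimal usage sketch (readLines accepts a URL, so the file can be read directly or from a local copy; the element names of the returned list come from the LEVEL LABELS block):
kapmine_graphs <- load_dl_graph(
  "http://vlado.fmf.uni-lj.si/pub/networks/data/ucinet/kapmine.dat",
  directed = FALSE)
names(kapmine_graphs)  # one igraph object per level label
kapmine_graphs[[1]]    # first relation as an igraph graph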

Related

gt table package in R produces header error

I am using the gt table package in R and so far I love it. However, for some reason, when I publish the code below in Quarto I get an awkward table header that says "?caption" in bold. When I run the table separately, I don't get anything.
Any thoughts?
Ignore the titles and column names; I know they don't make sense with the diamonds dataset.
library(tidyverse)
library(gt)
business_segment_summary <- diamonds %>%
  group_by(cut) %>%
  summarise(n = n(),
            sum = sum(price),
            sum_od = sum(price, na.rm = TRUE),
            prop_25 = quantile(price, .25, na.rm = TRUE),
            prop_50 = quantile(price, .5, na.rm = TRUE),
            prop_75 = quantile(price, .75, na.rm = TRUE),
            mean = mean(price, na.rm = TRUE),
            mean_aging = mean(table),
            mean_rank = mean(depth),
            prop_od = mean(carat),
            sd = sd(price, na.rm = TRUE),
            mad = mad(price, na.rm = TRUE),
            .groups = "drop"
  ) %>%
  mutate(tar_prop = sum/sum(sum),
         n_prop = n/sum(n))
business_segment_summary %>%
  select(1, n, tar_prop, n_prop, prop_od, prop_25, prop_50, prop_75) %>%
  gt::gt() %>%
  gt::cols_label(cut = "Business Segment",
                 n = "Customer #",
                 tar_prop = "% of TAR",
                 n_prop = "% of Customers",
                 prop_od = gt::html("% of Customers<br>with overdue"),
                 prop_25 = "25%",
                 prop_50 = "50%",
                 prop_75 = "75%") %>%
  gt::tab_spanner(label = "Customer Account Percentile ($k)", columns = c(prop_25, prop_50, prop_75)) %>%
  gt::fmt_number(c(prop_25, prop_50, prop_75), decimals = 0, scale_by = 1/1e3) %>%
  gt::fmt_number(n, decimals = 0) %>%
  gt::fmt_percent(c(3:5), decimals = 0) %>%
  gt::opt_stylize(style = 1, color = "red") %>%
  gt::tab_header(title = "Summary of TAR by business segments") %>%
  gt::cols_align(align = "left", columns = 1)

Error in xml_nodeset(NextMethod()) : Expecting an external pointer: [type=NULL] when scraping with rvest

I am having a problem when trying to scrape some data. I have created a function that works properly; the problem occurs when I run this function for many different codes.
require ("rvest")
library("dplyr")
getFin = function(ticker)
{
url= paste0("https://it.finance.yahoo.com/quote/",ticker,
"/key-statistics?p=",ticker)
a <- read_html(url)
tbl= a %>% html_nodes("section") %>% html_nodes("div")%>% html_nodes("table")
misureval = tbl %>% .[1] %>% html_table() %>% as.data.frame()
prezzistorici = tbl %>% .[2] %>% html_table() %>% as.data.frame()
titolistat = tbl %>% .[3] %>% html_table() %>% as.data.frame()
dividendi = tbl %>% .[4] %>% html_table() %>% as.data.frame()
annofiscale = tbl %>% .[5] %>% html_table() %>% as.data.frame()
redditivita = tbl %>% .[6] %>% html_table() %>% as.data.frame()
gestione = tbl %>% .[7] %>% html_table() %>% as.data.frame()
contoeco = tbl %>% .[8] %>% html_table() %>% as.data.frame()
bilancio = tbl %>% .[9] %>% html_table() %>% as.data.frame()
flussi = tbl %>% .[10] %>% html_table() %>% as.data.frame()
info1 = rbind(ticker, misureval, prezzistorici, titolistat, dividendi, annofiscale, redditivita, gestione, contoeco, bilancio, flussi)
}
What I am trying to do is to use
finale <- lapply(codici, getFin)
where codici is a vector of many different tickers, each of which is used in the function to generate one URL at a time and scrape its data.
I have tried with 50 tickers and the function works properly; however, when I increase the number I get this error:
Error in xml_nodeset(NextMethod()) : Expecting an external pointer:
[type=NULL].
I don't know if this may be related to the number of requests or something else. I have also tested a non-existent ticker and the function still works; the problem only arises when the number of tickers is large.
I solved the problem: I just needed to add Sys.sleep in order to reduce the frequency of requests.
The best number in this case is 3, so Sys.sleep(3) at the end of each iteration.
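A minimal sketch of that change, assuming the pause is added as the last statement of getFin so that each scraped ticker is followed by a three-second wait before the next request:
getFin <- function(ticker) {
  url <- paste0("https://it.finance.yahoo.com/quote/", ticker,
                "/key-statistics?p=", ticker)
  a   <- read_html(url)
  tbl <- a %>% html_nodes("section") %>% html_nodes("div") %>% html_nodes("table")
  # ... build info1 by binding the individual tables, as in the function above ...
  Sys.sleep(3)  # pause 3 seconds to reduce the request frequency
  info1
}
finale <- lapply(codici, getFin)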

How to skip a page scrape when table is missing in R

I'm building a scrape that pulls the name of a player and the years he played, for thousands of different players. I have built an otherwise successful function to do this, but unfortunately in some instances the table with the other half of the data I need (years played) does not exist. For these instances, I'd like to add a way to tell the scrape to skip them. Here is the code:
(note: the object "url_final" is the list of active webpage URLs, of which there are many)
library(rvest)
library(curl)
library(tidyverse)
library(httr)
df <- map_dfr(.x = url_final,
              .f = function(x){
                Sys.sleep(.3); cat(1)
                fyr <- read_html(curl(x, handle = curl::new_handle("useragent" = "Mozilla/5.0"))) %>%
                  html_table() %>%
                  .[[1]]
                fyr <- fyr %>%
                  select(1) %>%
                  mutate(name = str_extract(string = x, pattern = "(?<=cbb/players/).*?(?=-\\d\\.html)"))
              })
Here is an example of an active page with which you can recreate the scrape, by replacing "url_final" in the .x argument of map_dfr with:
https://www.sports-reference.com/cbb/players/karl-aaker-1.html
Here is an example of one of the instances in which there is no table, which therefore returns an error and breaks the loop of the scrape.
https://www.sports-reference.com/cbb/players/karl-aaker-1.html
How about adding tryCatch, which will ignore any errors?
library(tidyverse)
library(rvest)
df <- map_dfr(.x = url_final,
              .f = function(x){
                Sys.sleep(.3); cat(1)
                tryCatch({
                  fyr <- read_html(curl::curl(x,
                                              handle = curl::new_handle("useragent" = "Mozilla/5.0"))) %>%
                    html_table() %>%
                    .[[1]]
                  fyr <- fyr %>%
                    select(1) %>%
                    mutate(name = str_extract(string = x,
                                              pattern = "(?<=cbb/players/).*?(?=-\\d\\.html)"))
                }, error = function(e) message('Skipping url', x))
              })
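An equivalent pattern, shown here only as a sketch, is purrr::possibly(), which wraps the scraping step and returns a default value (NULL, which map_dfr silently drops) whenever a page has no table; scrape_player is just a name introduced here for the same body used above:
scrape_player <- function(x) {
  Sys.sleep(.3)
  fyr <- read_html(curl::curl(x, handle = curl::new_handle("useragent" = "Mozilla/5.0"))) %>%
    html_table() %>%
    .[[1]]
  fyr %>%
    select(1) %>%
    mutate(name = str_extract(string = x, pattern = "(?<=cbb/players/).*?(?=-\\d\\.html)"))
}
df <- map_dfr(url_final, possibly(scrape_player, otherwise = NULL))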

running a function over a list of lists in tidy format

I have downloaded data using the edgarWebR package.
library(edgarWebR)
ticker <- c('NVDA', 'GOOG')
years <- 5
company.details <- lapply(ticker, company_details)
I have two lists and I am trying to apply a function to each of the lists:
filing_doc <- function(href) {
  sapply(href, function(x) {
    filing_documents(x) %>%
      filter(type == "10-K") %>%
      select(href)
  }) %>%
    unlist(recursive = TRUE, use.names = FALSE)
}
I apply the function:
company.reports <- company.details$filings %>%
  filter(type == "10-K") %>%
  slice(1:years) %>%
  mutate(doc.href = filing_doc(href),
         mdlink = paste0("[Filing Link](", href, ")"),
         reportLink = paste0("[10-K Link](", doc.href, ")")) %>%
  select(filing_date, accession_number, mdlink, reportLink, href, doc.href)
However, this will not work, since I am trying to apply it over a list of two.
The following works:
company.reports <- company.details[[1]]$filings %>%
  filter(type == "10-K") %>%
  slice(1:years) %>%
  mutate(doc.href = filing_doc(href),
         mdlink = paste0("[Filing Link](", href, ")"),
         reportLink = paste0("[10-K Link](", doc.href, ")")) %>%
  select(filing_date, accession_number, mdlink, reportLink, href, doc.href)
where I have just added [[1]] to the first line. My question is: how can I apply this same code over multiple lists? I run into errors using lapply and pipe functions.
Ultimately I would like to have the same output as the last piece of code but filled with information for all of the companies in ticker.
Check this solution:
company.reports <- company.details %>%
  map(~ .x$filings %>%
        filter(type == "10-K") %>%
        slice(1:years) %>%
        mutate(doc.href = filing_doc(href),
               mdlink = paste0("[Filing Link](", href, ")"),
               reportLink = paste0("[10-K Link](", doc.href, ")")) %>%
        select(filing_date, accession_number, mdlink, reportLink, href, doc.href)
  )
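If one combined table is preferred over a list of data frames, the same map call could instead use map_dfr with an .id column; this is only a sketch, and set_names(ticker) and the "company" column are additions made here to keep track of which ticker each row comes from:
company.reports <- company.details %>%
  set_names(ticker) %>%
  map_dfr(~ .x$filings %>%
            filter(type == "10-K") %>%
            slice(1:years) %>%
            mutate(doc.href = filing_doc(href),
                   mdlink = paste0("[Filing Link](", href, ")"),
                   reportLink = paste0("[10-K Link](", doc.href, ")")) %>%
            select(filing_date, accession_number, mdlink, reportLink, href, doc.href),
          .id = "company")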

How to use tryCatch() with parLapply for the error: cannot open the connection?

I am trying to use parLapply to scrape data from multiple URLs, and I am getting an error. Here are a few of the sites I am scraping:
sites <- c("https://forums.vwvortex.com/showthread.php?5494121-euro-spec-parts", "https://forums.vwvortex.com/showthread.php?5489376-Is-this-normal", "https://forums.vwvortex.com/showthread.php?5490376-rear-hatch-light")
I have around 5000 URLs to loop through.
nCores <- detectCores(logical = FALSE)
cat(nCores, " cores detected.")
# detect threads with parallel()
nThreads <- detectCores(logical = TRUE)
cat(nThreads, " threads detected.")
# Create doSNOW compute cluster (try 64)
# One can increase up to 128 nodes
# Each node requires 44 Mbyte RAM under WINDOWS.
cluster <- makeCluster(nThreads, type = "SOCK")
class(cluster)
# register the cluster
registerDoSNOW(cluster)
# get info
getDoParWorkers(); getDoParName()
strt <- Sys.time()
results <- parLapply(cluster, sites, function(i) {
  library(dplyr)
  library(xml2)
  library(magrittr)
  library(rvest)
  review <- read_html(url(i))
  threads     <- cbind(review %>% html_nodes("blockquote.postcontent.restore") %>% html_text())
  datethreads <- cbind(review %>% html_nodes("span.date") %>% html_text())
  userinfo    <- cbind(review %>% html_nodes("div.username_container") %>% html_text())
  title       <- cbind(review %>% html_nodes("li.navbit.lastnavbit") %>% html_text())
  urls  <- cbind(review %>% html_nodes("span.threadtitle") %>% html_nodes("a") %>%
                   html_attr("href") %>% paste0("https://forums.vwvortex.com/", .))
  links <- sub("&.*", "", urls)
  library(rowr)
  x <- data.frame(rowr::cbind.fill(threads, datethreads, userinfo, title, links, fill = NA),
                  stringsAsFactors = FALSE)
  return(x)
})
print(Sys.time() - strt)
Error in checkForRemoteErrors(val) :
one node produced an error: cannot open the connection
Since the error is vague, I want to use tryCatch() for error handling.
I am unsure, but I feel the error must have occurred at read_html(url(i)). I tried the following code:
results <- parLapply(cluster, sites, function(i) {
  library(dplyr)
  library(xml2)
  library(magrittr)
  library(rvest)
  tryCatch(
    review <- read_html(url(i)),
    error = function(e) {
      message("Here's the original error message:")
      message(e)
      return(NULL)
    })
  threads     <- cbind(review %>% html_nodes("blockquote.postcontent.restore") %>% html_text())
  datethreads <- cbind(review %>% html_nodes("span.date") %>% html_text())
  userinfo    <- cbind(review %>% html_nodes("div.username_container") %>% html_text())
  title       <- cbind(review %>% html_nodes("li.navbit.lastnavbit") %>% html_text())
  urls  <- cbind(review %>% html_nodes("span.threadtitle") %>% html_nodes("a") %>%
                   html_attr("href") %>% paste0("https://forums.vwvortex.com/", .))
  links <- sub("&.*", "", urls)
  library(rowr)
  x <- data.frame(rowr::cbind.fill(threads, datethreads, userinfo, title, links, fill = NA),
                  stringsAsFactors = FALSE)
  return(x)
})
I am not sure how to go about it. Any help with what might have caused the "cannot open the connection" error, or with how to write a tryCatch() for the above code?
Thanks in advance!
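One common pattern, shown here only as a sketch rather than a confirmed fix for the connection error itself, is to wrap the entire worker body in tryCatch so that a failing URL returns NULL instead of stopping the node, and to drop the NULL results afterwards:
results <- parLapply(cluster, sites, function(i) {
  library(dplyr); library(xml2); library(magrittr); library(rvest); library(rowr)
  tryCatch({
    review <- read_html(url(i))
    threads     <- cbind(review %>% html_nodes("blockquote.postcontent.restore") %>% html_text())
    datethreads <- cbind(review %>% html_nodes("span.date") %>% html_text())
    userinfo    <- cbind(review %>% html_nodes("div.username_container") %>% html_text())
    title       <- cbind(review %>% html_nodes("li.navbit.lastnavbit") %>% html_text())
    urls  <- cbind(review %>% html_nodes("span.threadtitle") %>% html_nodes("a") %>%
                     html_attr("href") %>% paste0("https://forums.vwvortex.com/", .))
    links <- sub("&.*", "", urls)
    data.frame(rowr::cbind.fill(threads, datethreads, userinfo, title, links, fill = NA),
               stringsAsFactors = FALSE)
  }, error = function(e) {
    # note: messages from worker nodes may not appear on the master console
    message("Skipping ", i, ": ", conditionMessage(e))
    NULL
  })
})
# drop the URLs that failed before combining the results
results <- Filter(Negate(is.null), results)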

Resources