We are working on a project with a Shiny app that scrapes and downloads data frames from a website. We have the following problem: it works on some computers and not on others.
We have the same package versions everywhere, and we have not been making an excessive number of requests...
It is not tied to Windows versus Mac, as it works on some Windows machines and some Macs but not on others.
Do you have any idea what could cause this? Could it be something in the connection settings?
It is not tied to the Wi-Fi network either; we tried on the same Wi-Fi...
As requested, here are the code and the error messages:
This function is the one we call directly:
scraping_function <- function(search_terms, subreddit,
                              sort_by, time_frame) {
  extracted_link <- reddit_urls_mod(search_terms, subreddit,
                                    sort_by, time_frame)
  extracted_data <- reddit_content(extracted_link[, 5])
  extracted_data[, 13] <- cleaning_text_function(extracted_data[, 13])
  return(extracted_data)
}
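For reference, the kind of call that produces the query visible in the 429 warning at the bottom of this post looks like this:
result <- scraping_function(search_terms = "Greta", subreddit = "news",
                            sort_by = "comments", time_frame = "week")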
These are the functions that it calls in turn:
Extracting the URLs:
reddit_urls_mod<- function (search_terms = "", subreddit = "",
sort_by = "", time_frame= "")
{
if (subreddit == ""){
subreddit <- NA
}
if (search_terms == ""){
search_terms <- NA
}
if (!grepl("^[0-9A-Za-z]*$", subreddit) & !is.na(subreddit) ) {
stop("subreddit must be a sequence of letter and number without special characters and spaces")
}
regex_filter = ""
cn_threshold = 0
page_threshold = 15
wait_time = 1
cached_links = data.frame(date = as.Date(character()),
num_comments = numeric(),
title = character(),
subreddit = character(),
URL = character(),
link = character())
if (sort_by != "front_page"){
if (!grepl("^comments$|^new$|^relevance$|^top$|^front_page$", sort_by)) {
stop("sort_by must be either 'new', 'comments', 'top', 'relevance' or 'front_page'")
}
if (!grepl("^hour$|^day$|^week$|^month$|^year$|^all$", time_frame)) {
stop("time_frame must be either 'hour', 'day', 'week', 'month', 'year' or 'all'")
}
sterms = ifelse(is.na(search_terms), NA, gsub("\\s", "+",search_terms))
subreddit = ifelse(is.na(subreddit), "", paste0("r/", gsub("\\s+","+", subreddit), "/"))
sterms = ifelse(is.na(sterms), "", paste0("q=", sterms, "&restrict_sr=on&"))
sterms_prefix = ifelse(sterms == "", "new", "search")
time_frame_in = ifelse(is.na(search_terms), "", paste0("t=",time_frame,"&"))
search_address = search_query = paste0("https://www.reddit.com/",
subreddit, sterms_prefix,
".json?",
sterms,time_frame_in,
"sort=",
sort_by)
} else {
if (is.na(subreddit)) {
stop("if you choose sort_by = front_page please enter a subreddit")
}
search_address = search_query = paste0("https://www.reddit.com/r/",
subreddit,
".json?")
}
next_page = index = ""
page_counter = 0
comm_filter = 10000
while (is.null(next_page) == FALSE & page_counter < page_threshold &
comm_filter >= cn_threshold & length(index) > 0) {
search_JSON = tryCatch(RJSONIO::fromJSON(readLines(search_query,
warn = FALSE)), error = function(e) NULL)
if (is.null(search_JSON)) {
stop(paste("Unable to connect to reddit website or invalid subreddit entered"))
} else if (length(search_JSON$data$children)==0){
stop(paste("This search term returned no results or invalid subreddit entered"))
} else {
contents = search_JSON[[2]]$children
search_permalink = paste0("http://www.reddit.com",
sapply(seq(contents), function(x) contents[[x]]$data$permalink))
search_num_comments = sapply(seq(contents), function(x) contents[[x]]$data$num_comments)
search_title = sapply(seq(contents), function(x) contents[[x]]$data$title)
search_score = sapply(seq(contents), function(x) contents[[x]]$data$score)
search_subreddit = sapply(seq(contents), function(x) contents[[x]]$data$subreddit)
search_link = sapply(seq(contents), function(x) contents[[x]]$data$url)
index = which(search_num_comments >= cn_threshold &
grepl(regex_filter, search_title, ignore.case = T,
perl = T))
if (length(index) > 0) {
search_date = format(as.Date(as.POSIXct(unlist(lapply(seq(contents), function(x) contents[[x]]$data$created_utc)),
origin = "1970-01-01")), "%d-%m-%y")
temp_dat = data.frame(date = search_date,
num_comments = search_num_comments,
title = search_title,
subreddit = search_subreddit,
URL = search_permalink,
link = search_link,
stringsAsFactors = FALSE)[index,]
cached_links = as.data.frame(rbind(cached_links,
temp_dat))
next_page = search_JSON$data$after
comm_filter = utils::tail(search_num_comments,
1)
search_query = paste0(search_address, "&after=",
next_page)
page_counter = page_counter + 1
}
Sys.sleep(min(2, wait_time))
}
}
final_table = cached_links[!duplicated(cached_links), ]
if (dim(final_table)[1] == 0) {
cat(paste("\nNo results retrieved: likely an invalid subreddit, a server that is down, or simply an unsuccessful search query :("))
}
else {
remove_row = which(final_table[, 1] == "")
if (length(remove_row) > 0) {
final_table = final_table[-remove_row, ]
}
return(final_table)
}
}
Extracting the content:
reddit_content <- function (URL, wait_time = 1) {
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(
paste0(filter, " ", depth),
lapply(1:length(reply.nodes),
function(x)
get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))
))
}
data_extract = data.frame(
id = numeric(),
structure = character(),
post_date = as.Date(character()),
comm_date = as.Date(character()),
num_comments = numeric(),
subreddit = character(),
upvote_prop = numeric(),
post_score = numeric(),
author = character(),
user = character(),
comment_score = numeric(),
controversiality = numeric(),
comment = character(),
title = character(),
post_text = character(),
link = character(),
domain = character(),
URL = character()
)
withProgress(message = 'Work in progress', value = 0, min=0,max=1, {
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e)
NULL
)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X,
warn = FALSE)),
error = function(e)
NULL
)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x)
get.structure(main.node[[x]], x)))
TEMP = data.frame(
id = NA,
structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(
as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")
), "%d-%m-%y"),
comm_date = format(as.Date(
as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")
), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(
is.null(meta.node$subreddit),
"UNKNOWN",
meta.node$subreddit
),
upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score,
author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title,
post_text = meta.node$selftext,
link = meta.node$url,
domain = meta.node$domain,
URL = URL[i],
stringsAsFactors = FALSE
)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else
print(paste("missed", i, ":", URL[i]))
}
}
incProgress(amount = 1/length(URL))
Sys.sleep(min(2, wait_time))
}
# data_extract[,13] <-
# cleaning_text_function(data_extract[,13])
})
return(data_extract)
}
Cleaning the text:
cleaning_text_function <- function(x,stopwords=stopwords_vec) {
stopwords_vec <- c(stopwords::stopwords("en"), "don", "isn", "gt", "i", "re","removed","deleted","m","you re","we ll", "ve", "hasn","they re","id","tl dr", "didn", "wh","oh","tl","dr","shes","hes","aren","edit","ok","ll","wasn","shouldn","t","doesn","youre","going","still","much", "many","also")
if (is.character(x)) {
# declare the encoding as latin1 so HTML-coded accented characters display correctly (only needed for French)
Encoding(x) <- 'latin1'
# transliterate accented characters to plain ASCII (stringi)
x <- stringi::stri_trans_general(x, 'latin-ascii')
x <- unlist(lapply(x, function(x, stopwords = stopwords_vec) {
#separate words
x <- unlist(strsplit(x, " "))
#take out internet links
x <- x[!grepl("\\S+www\\S+|\\S+https://\\S+|https://\\S+", x)]
#remove newlines, punctuation, ASCII control characters and leftover "gt" entity fragments
x <- gsub("\n|[[:punct:]]|[\x01-\x09\x11-\x12\x14-\x1F\x7F]|gt"," ",x)
#remove standalone single digits
x <- gsub("(^[0-9]{1}\\s|^[0-9]{1}$|\\s{1}[0-9]{1}$|\\s{1}[0-9]{1}\\s{1})"," ",x)
#trim leading and trailing whitespace
x <- gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
#lowercase
x <- tolower(x)
#remove standalone single letters
x <- gsub("(^[a-z]{1}\\s+|^[a-z]{1}$|\\s+[a-z]{1}$|\\s+[a-z]{1}\\s+)", "", x)
#remove words in the stopwords list
x <- paste(x[!x %in% stopwords], collapse = " ")
#run the stopword removal again to get rid of stopwords left inside compound strings
x <- unlist(strsplit(x, " "))
x <- gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
x <- paste(x[!x %in% stopwords], collapse = " ")
return(x)
}))
} else{
stop("please enter a character vector")
}
return(x)
}
And the message we get:
Listening on http://127.0.0.1:7745
Warning in file(con, "r") :
cannot open URL 'https://www.reddit.com/r/news/search.json?q=Greta&restrict_sr=on&t=week&sort=comments': HTTP status was '429 Unknown Error'
Warning: Error in reddit_urls_mod: Unable to connect to reddit website or invalid subreddit entered
126: stop
125: reddit_urls_mod
124: scraping_function
123: eventReactiveHandler
79: df1
72: observeEventHandler
1: runApp
I get an error 429 even on computers that have never made a request before...
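For what it's worth, HTTP 429 ("Too Many Requests") is Reddit rate-limiting the request itself; Reddit's API guidelines ask for a descriptive User-Agent and throttle generic clients such as the default one sent by readLines(). One thing worth trying is a small httr wrapper like the sketch below (the helper name, User-Agent string and retry numbers are placeholders, not part of the app above):
library(httr)

# Minimal sketch: send an explicit, descriptive User-Agent and back off whenever
# the server answers 429 before retrying.
fetch_json <- function(url, tries = 5, wait = 5) {
  for (i in seq_len(tries)) {
    resp <- GET(url, user_agent("my-shiny-app/0.1 (contact: you@example.com)"))
    if (status_code(resp) != 429) {
      return(content(resp, as = "parsed", type = "application/json"))
    }
    Sys.sleep(wait * i)  # simple linear backoff before retrying
  }
  stop("Still rate-limited (HTTP 429) after ", tries, " attempts")
}

# fetch_json("https://www.reddit.com/r/news/search.json?q=Greta&restrict_sr=on&t=week&sort=comments")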
Thank you
EDIT WITH MWE BELOW
Below is a snippet of my code, which is part of a larger app. I'm trying to rewrite the app to work with R6 classes and gargoyle, as per this article. However, I cannot figure out why the observe part below never triggers except when it is initialized. To my understanding, it should observe all of the filter inputs referenced through the map call; am I wrong?
output$filters <- renderUI({
gargoyle::watch("first thing")
data <- Data$get_data(unfiltered = TRUE)
data_names <- names(data)
if(nrow(data) > 0){
map(data_names, ~ render_ui_filter(data[[.x]], .x))
}
}
)
observe({
data <- Data$get_data(unfiltered = TRUE)
data_names <- names(data)
if(ncol(data) > 0){
each_var <- map(data_names, ~ filter_var(data[[.x]], input[[paste0("filter",.x)]]))
Transactions <- Data$set_filters(reduce(each_var, `&`))
gargoyle::trigger("second thing")
}
})
I previously had a working version of the second reactive part, written like this:
selectedData <- reactive({
if(nrow(data()) > 0){
each_var <- map(dataFilterNames(), ~ filter_var(data()[[.x]], input[[paste0("filter",.x)]]))
reduce(each_var, `&`)
}
})
where data and dataFilterNames are reactiveVals, and dataFilterNames holds the column names of data.
Here you can find render_ui_filter and filter_var:
render_ui_filter <- function(x, var) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(NULL)
}
id <- paste0("filter",var)
var <- stringr::str_to_title(var)
if (is.numeric(x)) {
if(is.integer(x)){
step = 1
}
else{
step = NULL
}
rng <- range(x, na.rm = TRUE)
sliderInput(id,
var,
min = rng[1],
max = rng[2],
value = rng,
round = TRUE,
width = "90%",
sep = " ",
step = step
)
} else if (is.factor(x)) {
levs <- levels(x)
if(length(levs) < 5){
pickerInput(id, var, choices = levs, selected = levs, multiple = TRUE,
options = list(
title = sprintf("Filter on %s...", var),
#`live-search` = TRUE,
#`actions-box` = TRUE,
size = 10
))
}else {
pickerInput(id, var, choices = levs, selected = levs, multiple = TRUE,
options = list(
title = sprintf("Filter on %s...", var),
`live-search` = TRUE,
`actions-box` = TRUE,
size = 10,
`selected-text-format` = "count > 5"
))
}
} else if (is.Date(x)){
dateRangeInput(id,
var,
start = min(x),
end = max(x),
weekstart = 1,
autoclose = FALSE,
separator = "-")
} else if (is.logical(x)) {
pickerInput(id, var, choices = unique(x), selected = unique(x), multiple = TRUE,
options = list(
title = sprintf("Filter on %s...", var),
`live-search` = TRUE,
#`actions-box` = TRUE,
size = 10
))
} else {
# Not supported
NULL
}
}
filter_var <- function(x, val) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(TRUE)
}
if (is.numeric(x)) {
!is.na(x) & x >= val[1] & x <= val[2]
} else if (is.factor(x)) {
x %in% val
} else if(is.Date(x)){
!is.na(x) & x >= val[1] & x <= val[2]
} else if (is.logical(x)) {
x %in% val
} else {
# No control, so don't filter
TRUE
}
}
Edit: Here is an MWE that can be run in a notebook, for example. It does not currently work, since the gargoyle trigger re-triggers the observe it sits in and we end up in an infinite loop. If you remove the trigger you can see that the plain reactive part works, but the R6 version never creates its table.
if (interactive()){
require("shiny")
require("R6")
require("gargoyle")
require("purrr")
require("stringr")
# R6 DataSet ----
DataSet <- R6Class(
"DataSet",
private = list(
.data = NA,
.data_loaded = FALSE,
.filters = logical(0)
),
public = list(
initialize = function() {
private$.data = data.frame()
},
get_data = function(unfiltered = FALSE) {
if (!unfiltered) {
return(private$.data[private$.filters, ])
}
else{
return(private$.data)
}
},
set_data = function(data) {
stopifnot(is.data.frame(data))
private$.data <- data
private$.data_loaded <- TRUE
private$.filters <- rep(T, nrow(private$.data))
return(invisible(self))
},
set_filters = function(filters) {
stopifnot(is.logical(filters))
private$.filters <- filters
}
)
)
# Filtering ----
render_ui_filter <- function(x, var) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(NULL)
}
id <- paste0("filter",var)
var <- stringr::str_to_title(var)
if (is.numeric(x)) {
if(is.integer(x)){
step = 1
}
else{
step = NULL
}
rng <- range(x, na.rm = TRUE)
sliderInput(id,
var,
min = rng[1],
max = rng[2],
value = rng,
round = TRUE,
width = "90%",
sep = " ",
step = step
)
} else {
# Not supported
NULL
}
}
filter_var <- function(x, val) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(TRUE)
}
if (is.numeric(x)) {
!is.na(x) & x >= val[1] & x <= val[2]
} else {
# No control, so don't filter
TRUE
}
}
# Options ----
options("gargoyle.talkative" = TRUE)
options(shiny.trace = TRUE)
options(shiny.fullstacktrace = TRUE)
ui <- function(request){
tagList(
h4('Filters'),
uiOutput("transactionFilters"),
h4('Reactive'),
tableOutput("table_reactive"),
h4('R6'),
tableOutput("table_r6")
)
}
server <- function(input, output, session){
gargoyle::init("df_r6_filtered")
Name <- c("Jon", "Bill", "Maria", "Ben", "Tina")
Age <- c(23, 41, 32, 58, 26)
df <- reactive(data.frame(Name, Age))
df_r6 <- DataSet$new()
df_r6$set_data(data.frame(Name, Age))
output$transactionFilters <- renderUI(
map(names(df()), ~ render_ui_filter(x = df()[[.x]], var = .x))
)
selected <- reactive({
if(nrow(df()) > 0){
each_var <- map(names(df()), ~ filter_var(df()[[.x]], input[[paste0("filter",.x)]]))
reduce(each_var, `&`)
}
})
observe({
data <- df_r6$get_data(unfiltered = TRUE)
data_names <- names(data)
if(ncol(data) > 0){
each_var <- map(data_names, ~ filter_var(data[[.x]], input[[paste0("filter",.x)]]))
filters_concatted <- reduce(each_var, `&`)
df_r6$set_filters(filters_concatted)
gargoyle::trigger("df_r6_filtered")
}
})
output$table_reactive <- renderTable(df()[selected(),])
gargoyle::on("df_r6_filtered",{
output$table_r6 <- renderTable(df_r6$get_data())
})
}
shinyApp(ui, server)
}
EDIT2: I noticed that gargoyle::trigger("df_r6_filtered") creates an infinite loop by re-triggering the observe component it sits in. I'm not sure how to get out of it, and that's what I am looking for help with.
The answer was simpler than expected, of course. Just change the observe to an observeEvent on all of the input elements belonging to the filters, i.e. like this:
observeEvent(
eventExpr = {
data <- df_r6$get_data(unfiltered = TRUE)
data_names <- names(data)
map(data_names, ~ input[[paste0("filter",.x)]])
},
{
...
}
)
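Spelled out against the MWE above (same handler body as the original observe; ignoreInit is optional), that looks roughly like this sketch:
observeEvent(
  eventExpr = {
    data <- df_r6$get_data(unfiltered = TRUE)
    map(names(data), ~ input[[paste0("filter", .x)]])
  },
  handlerExpr = {
    data <- df_r6$get_data(unfiltered = TRUE)
    data_names <- names(data)
    if (ncol(data) > 0) {
      each_var <- map(data_names, ~ filter_var(data[[.x]], input[[paste0("filter", .x)]]))
      df_r6$set_filters(reduce(each_var, `&`))
      gargoyle::trigger("df_r6_filtered")
    }
  },
  ignoreInit = TRUE
)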
I have some functionality which works fine outside of a package, but when I put it into a package, run devtools::load_all(), and try to run one of the functions (DTL_similarity_search_results_fast), another function (DTL_similarity_search) that should be loaded by the package is not found when it is called inside DTL_similarity_search_results_fast.
The code is:
messagef <- function(...) message(sprintf(...))
printf <- function(...) print(sprintf(...))
pattern_to_vec <- function(pattern, as_int = F, keep_list = FALSE) {
ret <- strsplit(pattern, ",")
if(length(pattern) == 1 && !keep_list)
ret <- ret[[1]]
if(as_int){
ret <- lapply(ret, as.integer)
}
ret
}
DTL_similarity_search <- function(search_pattern = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0) {
url <- suppressWarnings(httr::modify_url("https://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/similar"))
if(is.na(max_edit_distance)){
max_edit_distance <- purrr::map_int(pattern_to_vec(search_pattern, keep_list = T), length) %>% min()
}
messagef("[DTL API] Starting search for %s", search_pattern)
resp <- suppressWarnings(httr::POST(url, body = list( n_gram = search_pattern,
transformation = transformation,
database_names = database_names,
metadata_filters = metadata_filters,
filter_category = filter_category,
minimum_similarity = minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference, filter_category = 0),
encode = "form"))
#browser()
#print(httr::content(resp, "text"))
if (httr::http_error(resp)) {
messagef(
"[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
httr::status_code(resp),
"",#parsed$message,
""#parsed$documentation_url
)
return(NULL)
}
parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
messagef("[DTL API] Retrieved search ID %s of for pattern %s", parsed$search_id, search_pattern)
parsed$search_id
}
DTL_get_results <- function(search_id) {
url <- suppressWarnings(httr::modify_url("http://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/get"))
#messagef("[DTL API] Retrieving results for search_id %s", search_id)
resp <- suppressWarnings(httr::GET(url, query = list(search_id = search_id)))
if (httr::http_error(resp)) {
messagef(
"[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
httr::status_code(resp),
"",#parsed$message,
""#parsed$documentation_url
)
return(NULL)
}
print(httr::content(resp, "text"))
#browser()
parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
messagef("[DTL API] Retrieved %s lines for search_id %s", length(parsed), search_id)
purrr::map_dfr(parsed, function(x){
if(is.null(x$within_single_phrase)){
x$within_single_phrase <- FALSE
}
#browser()
tibble::as_tibble(x) %>% dplyr::mutate(melid = as.character(melid))
})
}
DTL_similarity_search_results <- function(search_patterns = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0) {
results <- tibble::tibble()
if(is.na(max_edit_distance)){
max_edit_distance <- purrr::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
}
for(pattern in search_patterns){
print('DTL_similarity_search')
print(DTL_similarity_search)
search_id <- DTL_similarity_search(pattern,
transformation,
database_names,
metadata_filters,
filter_category,
minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference)
if(is.null(search_id)){
next
}
ret <- DTL_get_results(search_id)
if(!is.null(ret) && nrow(ret) > 0){
ret$search_pattern <- pattern
}
results <- dplyr::bind_rows(results, ret)
}
#browser()
if(nrow(results))
results %>% dplyr::distinct(melid, start, length, .keep_all = T)
}
DTL_similarity_search_results_fast <- function(search_patterns = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0){
if(is.na(max_edit_distance)){
max_edit_distance <- purrr::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
}
future::plan(future::multisession)
results <- furrr::future_map_dfr(search_patterns, function(pattern){
print('DTL_similarity_search2')
search_id <- DTL_similarity_search(pattern,
transformation,
database_names,
metadata_filters,
filter_category,
minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference)
if(is.null(search_id)){
return(tibble::tibble())
}
ret <- DTL_get_results(search_id)
if(!is.null(ret) && nrow(ret) > 0 )ret$search_pattern <- pattern
ret
})
#browser()
results %>% dplyr::distinct(melid, start, length, .keep_all = TRUE)
}
Then after load_all() when I try to run:
res <- DTL_similarity_search_results_fast()
I get:
Error in DTL_similarity_search(pattern, transformation,
database_names, : could not find function "DTL_similarity_search
but running a similar, different function works using the same procedure:
res <- DTL_similarity_search_results()
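A likely culprit: future::plan(future::multisession) spawns fresh R sessions, and functions that only exist via devtools::load_all() (i.e. a package that is not actually installed) are not reliably available inside those workers, whereas the sequential DTL_similarity_search_results() never leaves the current session. One workaround sketch, using only names from the code above, is to replace the future_map_dfr call inside DTL_similarity_search_results_fast with a version that ships the helpers to the workers explicitly as globals (installing the package properly, so the workers can load it themselves, also avoids the problem):
results <- furrr::future_map_dfr(
  search_patterns,
  function(pattern) {
    search_id <- DTL_similarity_search(pattern,
                                       transformation,
                                       database_names,
                                       metadata_filters,
                                       filter_category,
                                       minimum_similarity,
                                       max_edit_distance = max_edit_distance,
                                       max_length_difference = max_length_difference)
    if (is.null(search_id)) {
      return(tibble::tibble())
    }
    ret <- DTL_get_results(search_id)
    if (!is.null(ret) && nrow(ret) > 0) ret$search_pattern <- pattern
    ret
  },
  # giving globals as a character vector turns off automatic detection,
  # so every free variable used in the worker function is listed here
  .options = furrr::furrr_options(
    globals = c("DTL_similarity_search", "DTL_get_results",
                "pattern_to_vec", "messagef",
                "transformation", "database_names", "metadata_filters",
                "filter_category", "minimum_similarity",
                "max_edit_distance", "max_length_difference")
  )
)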
I would like to get the output of a POST request made with httr to the following site:
http://www.e-grunt.ba
You can see the submit form when you click "ZK Ulošci".
There I would like to send a POST request and get the output. For example, you can select anything from the drop-down menus, enter 1 in the field "Broj Uloška", and then click "Traži".
Here is my try:
library(httr)
library(tidyverse)
library(rvest)
output <- httr::POST(
"http://www.e-grunt.ba/home.jsf",
body = list(
"form:court_focus" = "440",
"form:cuTransferLast" = "17.07.2019",
"form:municipality_input" = "4400000001",
"form:mpart_focus" = "44000087",
"form:folder" = 1,
`recaptcha-token` = "some token",
submit = "form:j_idt61"
),
add_headers(Referer = "http://www.e-grunt.ba/"),
encode = "form",
verbose()
)
But this just returns the content of the home page.
I know it is easier with (R)Selenium, but I would like to do it with httr and POST if it is possible.
I have found a way to scrape this ASP.NET site. I am providing the code in case somebody needs something similar:
start_session <- function() {
p <- html_session(
"http://www.e-grunt.ba",
user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
)
viewState <- p %>% html_nodes("input") %>% .[[2]] %>% html_attr("value")
p <- rvest:::request_POST(
p,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
"javax.faces.partial.ajax" = "true",
"javax.faces.source" = "j_idt8:j_idt15",
"javax.faces.partial.execute" = "#all",
"javax.faces.partial.render" = "content",
"j_idt8:j_idt15" = "j_idt8:j_idt15",
"j_idt8" = 'j_idt8',
'javax.faces.ViewState' = viewState
),
encode = "form"
)
attr(p, "viewState") <- viewState
p
}
# EXTRACT METADATA --------------------------------------------------------
p <- start_session()
name_value_pairs <- function(html, css, cnames) {
x <- read_html(html) %>%
html_nodes(css) %>%
html_children() %>%
html_attr("value")
y <- read_html(html) %>%
html_nodes(css) %>%
html_children() %>%
html_text()
df <- cbind.data.frame(x, y, stringsAsFactors = FALSE)
df <- df[df[, 1] != -1, ]
colnames(df) <- cnames
df
}
courts <- name_value_pairs(p$response$content, css = '[id="form:court_input"]', cnames = c("court_id", "court"))
metadata_post <- function(session_zk, view_state, id) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'javax.faces.partial.ajax' = 'true',
'javax.faces.source' = 'form:court',
'javax.faces.partial.execute' = 'form:court',
'javax.faces.partial.render' = 'msgs msgsBottom form:municipality form:mpart form:cuTransferLast',
'javax.faces.behavior.event' = 'change',
'javax.faces.partial.event' = 'change',
'form' = 'form',
'g-recaptcha-response' = '',
'form:court_focus' = '',
'form:court_input' = id,
'form:cuTransferLast' = '',
'form:municipality_focus' = '',
'form:mpart_focus' = '',
'form:folder' = '',
'form:parcel' = '',
'form:parcelSub' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
muni_post <- function(session_zk, view_state, id, muni_id) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'javax.faces.partial.ajax' = 'true',
'javax.faces.source' = 'form:municipality',
'javax.faces.partial.execute' = 'form:municipality',
'javax.faces.partial.render' = 'msgs msgsBottom form:mpart',
'javax.faces.behavior.event' = 'change',
'javax.faces.partial.event' = 'change',
'form' = 'form',
'g-recaptcha-response' = '',
'form:court_focus' = '',
'form:court_input' = id,
'form:cuTransferLast' = '',
'form:municipality_focus' = '',
'form:municipality_input' = muni_id,
'form:mpart_focus' = '',
'form:folder' = '',
'form:parcel' = '',
'form:parcelSub' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
metadata_i <- list()
for (i in seq_along(courts$court_id)) {
print(i)
p <- metadata_post(p, attributes(p)$viewState, courts$court_id[i])
muni <- name_value_pairs(p$response$content, css = '[id="form:municipality_input"]', cnames = c("muni_id", "muni"))
if (nrow(muni) > 1) {
muni_ko <- list()
for (j in seq_along(muni$muni_id)) {
# print(j)
p <- muni_post(p, attributes(p)$viewState, courts$court_id[i], muni$muni_id[j])
ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
if (nrow(ko) == 0) {
ko <- data.frame(ko_id = NA, ko = NA, stringsAsFactors = FALSE)
}
muni_ko[[j]] <- cbind.data.frame(muni[j, ], ko, stringsAsFactors = FALSE)
}
metadata_i[[i]] <- cbind.data.frame(courts[i, ], do.call(rbind, muni_ko), stringsAsFactors = FALSE)
} else {
ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
meta <- cbind.data.frame(courts[i, ], muni, stringsAsFactors = FALSE)
metadata_i[[i]] <- cbind.data.frame(meta, ko, stringsAsFactors = FALSE)
}
}
metadata <- do.call(rbind, metadata_i)
metadata_post <- function(session_zk, view_state, recaptcha, court,
date = as.character(format.Date(Sys.Date() - 4, "%d.%m.%Y")),
muni, ko, zk
) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'form' = 'form',
'g-recaptcha-response' = recaptcha,
'form:court_focus' = '',
'form:court_input' = court,
'form:cuTransferLast' = date,
'form:municipality_focus' = '',
'form:municipality_input' = muni,
'form:mpart_focus' = '',
'form:mpart_input' = ko,
'form:folder' = zk,
'form:parcel' = '',
'form:parcelSub' = '',
'form:j_idt61' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
# example
result <- break_captcha()
p <- metadata_post(session_zk = p, view_state = attributes(p)$viewState,
recaptcha = result, court = metadata$court_id[i],
muni = metadata$muni_id[i], ko = metadata$ko_id[i], zk = j)
I'm trying to scrape Reddit data (I'm pretty new to web scraping and half decent at R). The RedditExtractor package has a nice function that does 90% of what I need, but it doesn't grab the "flair" associated with users who make comments. I'm trying to play around with the package's function, but I'm in a bit over my head.
There are examples of Reddit threads with flairs here. I think I'm looking for the text in these bits of XML:
<span class="flair flair-orthodox" title="Eastern Orthodox">Eastern Orthodox</span>
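(If you were working from the rendered old-style HTML rather than the JSON, a small rvest sketch like the one below would pull those spans out; the fix further down ends up reading author_flair_text from the .json endpoint instead. The thread URL here is a placeholder.)
library(rvest)

# thread_url is a placeholder; the working fix below uses the JSON route instead
thread_url <- "https://old.reddit.com/r/<subreddit>/comments/<thread_id>/"
page <- read_html(thread_url)
flair_nodes <- html_nodes(page, "span.flair")
flair_text  <- html_text(flair_nodes, trim = TRUE)   # e.g. "Eastern Orthodox"
flair_title <- html_attr(flair_nodes, "title")       # the title attribute of the span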
I've pasted the code from the reddit_content() function along with comments where I think the extra code should go, but I'm not quite sure where to go from here. At the moment the function returns a data frame with columns for the comment, time stamp, user, etc. I need it to also produce a column with user flairs if they exist. Thanks in advance!
redd_content_flair <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(),
#flair = character(),
URL = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE)), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"),
comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
#flair = unlist(lapply(main.node, function(x) {GetAttribute(x, "flair")})),
URL = URL[i], stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}
Edit: I'd also like to grab the URL for the "parent" comment, which looks like it's in tags like
<p class="parent"><a name="d3t1p1r"></a></p>
I managed to come up with an ad hoc solution; I'll post it here for posterity. The issue is that the function as written wasn't set up to handle NULL JSON values, and it was a quick fix.
About midway down there are two raw_data = lines. You need to add the nullValue = 'your null text' argument to the fromJSON function. Then you can add whatever metadata you wanted to both the empty data frame and the TEMP data frame, using the same construction as elsewhere. In the function below I've added both the user's flair text and the ID of the parent comment.
(Note, the wonky indenting is from the original function...I've left it as is to prevent accidentally changing something.)
reddit.fixed <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(), URL = character(), flair = character(), parent = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE), nullValue = "none"),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE), nullValue = "none"), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"), comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments, subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})), comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})), controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})), comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})), title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
URL = URL[i],
flair = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author_flair_text")
})),
parent = unlist(lapply(main.node, function(x) {GetAttribute(x, "parent_id")})),
stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}
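A quick usage sketch of the fixed function (the thread URL is a placeholder, not from the original post):
thread_urls <- "https://www.reddit.com/r/<subreddit>/comments/<thread_id>/<slug>/"
out <- reddit.fixed(thread_urls)
str(out[, c("user", "flair", "parent", "comment")])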
I'm making an R package that interfaces with the API from opendata.socrata.com.
I've run into a problem that I've tracked down to the build of the RCurl package.
On Windows, where RCurl is built against OpenSSL, I have no problems, but on Linux, where it is built against GnuTLS, it doesn't work.
You can check the build using curlVersion()$ssl_version.
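For completeness, this is what that check looks like, plus one thing worth trying on the GnuTLS build (the certificate-bundle path is an assumption based on the usual Debian/Ubuntu location):
library(RCurl)

# Which TLS backend is this RCurl build linked against? Prints "OpenSSL/..." or "GnuTLS/..."
curlVersion()$ssl_version

# On the GnuTLS build, pointing cainfo at the system certificate bundle is worth a try
getURL("https://opendata.socrata.com/api/views.json?limit=1",
       cainfo = "/etc/ssl/certs/ca-certificates.crt")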
Here is the function:
search.Socrata.Views <- function(search = NULL, ## full
topic = NULL, ## description
name = NULL, ## title field search
tags = NULL,
category = NULL,
count = FALSE,
limit = 10, ## max 200
page = 1,
type = "json" ## can also be xml
){
require('RCurl')
require('XML')
require('rjson')
## setting curl options
capath = system.file("CurlSSL",package = "RCurl")
cainfo = system.file("CurlSSL", "ca-bundle.crt", package = "RCurl")
cookie = 'cookiefile.txt'
curl = getCurlHandle ( cookiefile = cookie,
cookiejar = cookie,
useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en - US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
header = FALSE,
verbose = TRUE,
netrc = FALSE,
maxredirs = as.integer(20),
followlocation = TRUE,
ssl.verifypeer = TRUE,
cainfo = cainfo,
timeout = 100
)
## capath doesn't work:: NEED cainfo!
## test for existing cainfo:
if (!file.exists(cainfo)){
download.file('http://curl.haxx.se/ca/cacert.pem', cainfo )
}
## test for age of cainfo, if older than 2 weeks get new.
if (file.exists(cainfo)){
file.inf.cainfo <- file.info(cainfo)
age.cainfo <- Sys.time() - file.inf.cainfo[["mtime"]]
if(as.numeric(age.cainfo, units="days") > 14 ){
download.file('http://curl.haxx.se/ca/cacert.pem', cainfo )
}
}
### Make URL
baseSocrataUrl <- 'https://opendata.socrata.com/api/views.'
if(!is.null(category)){
category <- match.arg( category, c('Business', 'Fun', 'Personal', 'Education', 'Government'))
}
type <- match.arg( type, c('json', 'xml'))
## Tag
if(is.null(tags)){
tags <- NULL
} else {
tags <- URLencode( paste('&tags=', tags, sep = ''))
}
## Category
if(is.null(category)){
category <- NULL
} else {
category <- URLencode( paste('&category=', category, sep = ''))
}
## Limit
if(limit > 200){
limit <- '&limit=200'
} else {
limit <- paste('&limit=', limit, sep = '')
}
## search
if(is.null(search)){
search <- NULL
} else {
search <- URLencode( paste('&full=', search, sep = ''))
}
## page
page <- paste('&page=', page, sep = '')
## topic
if(is.null(topic)){
topic <- NULL
} else {
topic <- URLencode( paste('&description=', topic, sep = ''))
}
## name
if(is.null(name)){
name <- NULL
} else {
name <- URLencode( paste('&name=', name, sep = ''))
}
## count
if(count){
count <- '&count=TRUE'
} else {
count <- NULL
}
### Retrieving html
SocrataUrl <- paste( baseSocrataUrl, type, '?', page, tags, category, limit, search, name, topic, count, sep = '')
SocrataHtml <- getURL(SocrataUrl, curl = curl)
assign('search.Socrata.Call', SocrataUrl, envir=.GlobalEnv)
if(type == 'json'){
SocrataTable <- fromJSON(SocrataHtml)
SocrataTable <- lapply( SocrataTable, function(x){data.frame( x, stringsAsFactors = FALSE) } )
SocrataTable.df <- data.frame( matrix( nrow = length( SocrataTable), ncol = max(unlist(lapply(SocrataTable, length) ) ) ) )
names(SocrataTable.df) <- names( SocrataTable [lapply( SocrataTable, length ) == max( unlist( lapply( SocrataTable, length) ) ) ] [[1]] )
for( i in 1: length( SocrataTable ) ){
for( j in 1: length( names( SocrataTable[[i]] ) ) ){
SocrataTable.df[i, names( SocrataTable[[i]] )[j]] <- SocrataTable[[i]][i, names( SocrataTable[[i]] ) [j] ]
}
}
rm(curl)
gc()
return(SocrataTable.df)
} else {
rm(curl)
gc()
return(SocrataHtml)
}
}
Run the function with:
socrata.views <- search.Socrata.Views(topic = 'airplane')
print(socrata.views)
I haven't tested your code under Linux, but I can say that you're constructing URLs the hard way, which may be causing bugs. Using getForm, you can simplify your code considerably.
params <- list(
category = category,
tags = tags,
limit = min(limit, 200)
#etc.
)
params <- Filter(Negate(is.null), params)
getForm(baseSocrataUrl, .params = params, curl = curl)
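Putting that together with the parameter names your function already sends (full, description, name, tags, category, limit, page, count), the whole URL-building section could collapse to a sketch like this; getForm takes care of the URL encoding for you:
baseSocrataUrl <- paste0("https://opendata.socrata.com/api/views.", type)
params <- list(
  full        = search,
  description = topic,
  name        = name,
  tags        = tags,
  category    = category,
  limit       = min(limit, 200),
  page        = page,
  count       = if (isTRUE(count)) "TRUE" else NULL
)
params <- Filter(Negate(is.null), params)   # drop anything the caller left NULL
SocrataHtml <- getForm(baseSocrataUrl, .params = params, curl = curl)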