I have some functionality which works fine outside of a package, but when I put it into a package, devtools::load_all, and try to run one of the functions (DTL_similarity_search_results_fast) , another function (DTL_similarity_search) which should be loaded by the package is not found when it gets run inside of DTL_similarity_search_results_fast.
The code is:
messagef <- function(...) message(sprintf(...))
printf <- function(...) print(sprintf(...))
pattern_to_vec <- function(pattern, as_int = F, keep_list = FALSE) {
ret <- strsplit(pattern, ",")
if(length(pattern) == 1 && !keep_list)
ret <- ret[[1]]
if(as_int){
ret <- lapply(ret, as.integer)
}
ret
}
DTL_similarity_search <- function(search_pattern = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0) {
url <- suppressWarnings(httr::modify_url("https://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/similar"))
if(is.na(max_edit_distance)){
max_edit_distance <- purrr::map_int(pattern_to_vec(search_pattern, keep_list = T), length) %>% min()
}
messagef("[DTL API] Starting search for %s", search_pattern)
resp <- suppressWarnings(httr::POST(url, body = list( n_gram = search_pattern,
transformation = transformation,
database_names = database_names,
metadata_filters = metadata_filters,
filter_category = filter_category,
minimum_similarity = minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference, filter_category = 0),
encode = "form"))
#browser()
#print(httr::content(resp, "text"))
if (httr::http_error(resp)) {
messagef(
"[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
httr::status_code(resp),
"",#parsed$message,
""#parsed$documentation_url
)
return(NULL)
}
parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
messagef("[DTL API] Retrieved search ID %s of for pattern %s", parsed$search_id, search_pattern)
parsed$search_id
}
DTL_get_results <- function(search_id) {
url <- suppressWarnings(httr::modify_url("http://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/get"))
#messagef("[DTL API] Retrieving results for search_id %s", search_id)
resp <- suppressWarnings(httr::GET(url, query = list(search_id = search_id)))
if (httr::http_error(resp)) {
messagef(
"[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
httr::status_code(resp),
"",#parsed$message,
""#parsed$documentation_url
)
return(NULL)
}
print(httr::content(resp, "text"))
#browser()
parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
messagef("[DTL API] Retrieved %s lines for search_id %s", length(parsed), search_id)
purrr::map_dfr(parsed, function(x){
if(is.null(x$within_single_phrase)){
x$within_single_phrase <- FALSE
}
#browser()
tibble::as_tibble(x) %>% dplyr::mutate(melid = as.character(melid))
})
}
DTL_similarity_search_results <- function(search_patterns = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0) {
results <- tibble::tibble()
if(is.na(max_edit_distance)){
max_edit_distance <- purrr:::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
}
for(pattern in search_patterns){
print('DTL_similarity_search')
print(DTL_similarity_search)
search_id <- DTL_similarity_search(pattern,
transformation,
database_names,
metadata_filters,
filter_category,
minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference)
if(is.null(search_id)){
next
}
ret <- DTL_get_results(search_id)
if(!is.null(ret) && nrow(ret) > 0){
ret$search_pattern <- pattern
}
results <- dplyr::bind_rows(results, ret)
}
#browser()
if(nrow(results))
results %>% dplyr::distinct(melid, start, length, .keep_all = T)
}
DTL_similarity_search_results_fast <- function(search_patterns = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0){
if(is.na(max_edit_distance)){
max_edit_distance <- purrr::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
}
future::plan(future::multisession)
results <- furrr:::future_map_dfr(search_patterns, function(pattern){
print('DTL_similarity_search2')
search_id <- DTL_similarity_search(pattern,
transformation,
database_names,
metadata_filters,
filter_category,
minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference)
if(is.null(search_id)){
return(tibble::tibble())
}
ret <- DTL_get_results(search_id)
if(!is.null(ret) && nrow(ret) > 0 )ret$search_pattern <- pattern
ret
})
#browser()
results %>% dplyr::distinct(melid, start, length, .keep_all = TRUE)
}
Then after load_all() when I try to run:
res <- DTL_similarity_search_results_fast()
I get:
Error in DTL_similarity_search(pattern, transformation,
database_names, : could not find function "DTL_similarity_search
but running a similar, different function works using the same procedure:
res <- DTL_similarity_search_results()
Related
EDIT WITH MWE BELOW
I have below a snippet of my code which is part of a larger app. I'm trying to rewrite the app to work with R6 classes and gargoyle as per this article. However, I cannot figure out why the observe part of the data below does not trigger except when it's initialized. To my understanding should if observe all the filters that are in input based on the map function, am I wrong?
output$filters <- renderUI({
gargoyle::watch("first thing")
data <- Data$get_data(unfiltered = TRUE)
data_names <- names(data)
if(nrow(data) > 0){
map(data_names, ~ render_ui_filter(data[[.x]], .x))
}
}
)
observe({
data <- Data$get_data(unfiltered = TRUE)
data_names <- names(data)
if(ncol(data) > 0){
each_var <- map(data_names, ~ filter_var(data[[.x]], input[[paste0("filter",.x)]]))
Transactions <- Data$set_filters(reduce(each_var, `&`))
gargoyle::trigger("second thing")
}
})
I've had a working case of the second reactive element like this:
selectedData <- reactive({
if(nrow(data()) > 0){
each_var <- map(dataFilterNames(), ~ filter_var(data()[[.x]], input[[paste0("filter",.x)]]))
reduce(each_var, `&`)
}
})
where data and dataFilterNames are reactiveVal and dataFilterNames is the column names of data.
Here you can find render_ui_filter and filter_var:
render_ui_filter <- function(x, var) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(NULL)
}
id <- paste0("filter",var)
var <- stringr::str_to_title(var)
if (is.numeric(x)) {
if(is.integer(x)){
step = 1
}
else{
step = NULL
}
rng <- range(x, na.rm = TRUE)
sliderInput(id,
var,
min = rng[1],
max = rng[2],
value = rng,
round = TRUE,
width = "90%",
sep = " ",
step = step
)
} else if (is.factor(x)) {
levs <- levels(x)
if(length(levs) < 5){
pickerInput(id, var, choices = levs, selected = levs, multiple = TRUE,
options = list(
title = sprintf("Filter on %s...", var),
#`live-search` = TRUE,
#`actions-box` = TRUE,
size = 10
))
}else {
pickerInput(id, var, choices = levs, selected = levs, multiple = TRUE,
options = list(
title = sprintf("Filter on %s...", var),
`live-search` = TRUE,
`actions-box` = TRUE,
size = 10,
`selected-text-format` = "count > 5"
))
}
} else if (is.Date(x)){
dateRangeInput(id,
var,
start = min(x),
end = max(x),
weekstart = 1,
autoclose = FALSE,
separator = "-")
} else if (is.logical(x)) {
pickerInput(id, var, choices = unique(x), selected = unique(x), multiple = TRUE,
options = list(
title = sprintf("Filter on %s...", var),
`live-search` = TRUE,
#`actions-box` = TRUE,
size = 10
))
} else {
# Not supported
NULL
}
}
filter_var <- function(x, val) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(TRUE)
}
if (is.numeric(x)) {
!is.na(x) & x >= val[1] & x <= val[2]
} else if (is.factor(x)) {
x %in% val
} else if(is.Date(x)){
!is.na(x) & x >= val[1] & x <= val[2]
} else if (is.logical(x)) {
x %in% val
} else {
# No control, so don't filter
TRUE
}
}
Edit: Here is a MWE that can be run in a notebook for example. It does not currently work since the gargoyle trigger triggers the observe it is in and we end up in a infinity loop. If you remove that you can see that the normal reactive part works, but the R6 version does not create the table ever.
if (interactive()){
require("shiny")
require("R6")
require("gargoyle")
require("purrr")
require("stringr")
# R6 DataSet ----
DataSet <- R6Class(
"DataSet",
private = list(
.data = NA,
.data_loaded = FALSE,
.filters = logical(0)
),
public = list(
initialize = function() {
private$.data = data.frame()
},
get_data = function(unfiltered = FALSE) {
if (!unfiltered) {
return(private$.data[private$.filters, ])
}
else{
return(private$.data)
}
},
set_data = function(data) {
stopifnot(is.data.frame(data))
private$.data <- data
private$.data_loaded <- TRUE
private$.filters <- rep(T, nrow(private$.data))
return(invisible(self))
},
set_filters = function(filters) {
stopifnot(is.logical(filters))
private$.filters <- filters
}
)
)
# Filtering ----
render_ui_filter <- function(x, var) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(NULL)
}
id <- paste0("filter",var)
var <- stringr::str_to_title(var)
if (is.numeric(x)) {
if(is.integer(x)){
step = 1
}
else{
step = NULL
}
rng <- range(x, na.rm = TRUE)
sliderInput(id,
var,
min = rng[1],
max = rng[2],
value = rng,
round = TRUE,
width = "90%",
sep = " ",
step = step
)
} else {
# Not supported
NULL
}
}
filter_var <- function(x, val) {
if(all(is.null(x) | is.na(x))){
#If all data is null, don't create a filter from it
return(TRUE)
}
if (is.numeric(x)) {
!is.na(x) & x >= val[1] & x <= val[2]
} else {
# No control, so don't filter
TRUE
}
}
# Options ----
options("gargoyle.talkative" = TRUE)
options(shiny.trace = TRUE)
options(shiny.fullstacktrace = TRUE)
ui <- function(request){
tagList(
h4('Filters'),
uiOutput("transactionFilters"),
h4('Reactive'),
tableOutput("table_reactive"),
h4('R6'),
tableOutput("table_r6")
)
}
server <- function(input, output, session){
gargoyle::init("df_r6_filtered")
Name <- c("Jon", "Bill", "Maria", "Ben", "Tina")
Age <- c(23, 41, 32, 58, 26)
df <- reactive(data.frame(Name, Age))
df_r6 <- DataSet$new()
df_r6$set_data(data.frame(Name, Age))
output$transactionFilters <- renderUI(
map(names(df()), ~ render_ui_filter(x = df()[[.x]], var = .x))
)
selected <- reactive({
if(nrow(df()) > 0){
each_var <- map(names(df()), ~ filter_var(df()[[.x]], input[[paste0("filter",.x)]]))
reduce(each_var, `&`)
}
})
observe({
data <- df_r6$get_data(unfiltered = TRUE)
data_names <- names(data)
if(ncol(data) > 0){
each_var <- map(data_names, ~ filter_var(data[[.x]], input[[paste0("filter",.x)]]))
filters_concatted <- reduce(each_var, `&`)
df_r6$set_filters(filters_concatted)
gargoyle::trigger("df_r6_filtered")
}
})
output$table_reactive <- renderTable(df()[selected(),])
gargoyle::on("df_r6_filtered",{
output$table_r6 <- renderTable(df_r6$get_data())
})
}
shinyApp(ui, server)
}
EDIT2: I noticed that the gargoyle::trigger("df_r6_filtered") creates a infinity loop of triggering the observe component. I'm not sure how to get out of it and that's what I am looking for help with.
The answer was simpler then expected of course. Just change the observe to a observeEvent on all of the input elements regarding the filter, i.e. like this:
observeEvent(
eventExpr = {
data <- df_r6$get_data(unfiltered = TRUE)
data_names <- names(data)
map(data_names, ~ input[[paste0("filter",.x)]])
},
{
...
}
})
I use the UN Comtrade data API with R.
library(rjson)
get.Comtrade <- function(url="http://comtrade.un.org/api/get?"
,maxrec=50000
,type="C"
,freq="A"
,px="HS"
,ps="now"
,r
,p
,rg="all"
,cc="TOTAL"
,fmt="json"
)
{
string<- paste(url
,"max=",maxrec,"&" #maximum no. of records returned
,"type=",type,"&" #type of trade (c=commodities)
,"freq=",freq,"&" #frequency
,"px=",px,"&" #classification
,"ps=",ps,"&" #time period
,"r=",r,"&" #reporting area
,"p=",p,"&" #partner country
,"rg=",rg,"&" #trade flow
,"cc=",cc,"&" #classification code
,"fmt=",fmt #Format
,sep = ""
)
if(fmt == "csv") {
raw.data<- read.csv(string,header=TRUE)
return(list(validation=NULL, data=raw.data))
} else {
if(fmt == "json" ) {
raw.data<- fromJSON(file=string)
data<- raw.data$dataset
validation<- unlist(raw.data$validation, recursive=TRUE)
ndata<- NULL
if(length(data)> 0) {
var.names<- names(data[[1]])
data<- as.data.frame(t( sapply(data,rbind)))
ndata<- NULL
for(i in 1:ncol(data)){
data[sapply(data[,i],is.null),i]<- NA
ndata<- cbind(ndata, unlist(data[,i]))
}
ndata<- as.data.frame(ndata)
colnames(ndata)<- var.names
}
return(list(validation=validation,data =ndata))
}
}
}
However, sometimes it fails to connect server and I need to run the code several times to start working. Solution given here, to use Retry() function, which retries a request until it succeeds, seems attractive.
However, I have some difficulties implementing this function in the code given above. has anybody used it before and knows how to recode it?
An API call using httr::RETRY could look like the following:
library(httr)
library(jsonlite)
res <- RETRY(
verb = "GET",
url = "http://comtrade.un.org/",
path = "api/get",
encode = "json",
times = 3,
query = list(
max = 50000,
type = "C",
freq = "A",
px = "HS",
ps = "now",
r = 842,
p = "124,484",
rg = "all",
cc = "TOTAL",
fmt = "json"
)
)
# alternativ: returns dataset as a `list`:
# parsed_content <- content(res, as = "parsed")
# returns dataset as a `data.frame`:
json_content <- content(res, as = "text")
parsed_content <- parse_json(json_content, simplifyVector = TRUE)
parsed_content$validation
parsed_content$dataset
I'd suggest rewriting the get.Comtrade function using httr:
get.Comtrade <- function(verb = "GET",
url = "http://comtrade.un.org/",
path = "api/get",
encode = "json",
times = 3,
max = 50000,
type = "C",
freq = "A",
px = "HS",
ps = "now",
r,
p,
rg = "all",
cc = "TOTAL",
fmt = "json") {
res <- httr::RETRY(
verb = verb,
url = url,
path = path,
encode = encode,
times = times,
query = list(
max = max,
type = type,
freq = freq,
px = px,
ps = ps,
r = r,
p = p,
rg = rg,
cc = cc,
fmt = fmt
)
)
jsonlite::parse_json(content(res, as = "text"), simplifyVector = TRUE)
}
s1 <- get.Comtrade(r = "842", p = "124,484", times = 5)
print(s1)
Please see this and this for more information on library(httr).
I am trying to modify an existing function by copy and pasting it to an R script, and assigning it to a new function object in my local environment. However the new function cannot find functions that are called to within the original function. How can I fix this without looking up and finding each function individually? I am guessing that the original function is somehow linked to the package or its dependencies and 'knows where to look' for the missing function, but I cannot figure out how to do this with my new copy-and-pasted function.
library("camtrapR")
Print the function name
activityDensity
The output here is the code for this function. I have omitted it here because it is long (and I have pasted it below), but I copy and paste the output of the function code exactly (see below where I assign this exact code to a new function), except for the last two lines of output, which I think are important:
<bytecode: 0x000000002a2d1e20>
<environment: namespace:camtrapR>
So now I assign the copy and pasted code from the output above to a new function with New <-
New <- function (recordTable, species, allSpecies = FALSE, speciesCol = "Species",
recordDateTimeCol = "DateTimeOriginal", recordDateTimeFormat = "%Y-%m-%d %H:%M:%S",
plotR = TRUE, writePNG = FALSE, plotDirectory, createDir = FALSE,
pngMaxPix = 1000, add.rug = TRUE, ...)
{
wd0 <- getwd()
mar0 <- par()$mar
on.exit(setwd(wd0))
on.exit(par(mar = mar0), add = TRUE)
recordTable <- dataFrameTibbleCheck(df = recordTable)
timeZone <- "UTC"
checkForSpacesInColumnNames(speciesCol = speciesCol, recordDateTimeCol = recordDateTimeCol)
if (!is.data.frame(recordTable))
stop("recordTable must be a data frame", call. = FALSE)
if (!speciesCol %in% colnames(recordTable))
stop(paste("speciesCol = \"", speciesCol, "\" is not a column name in recordTable",
sep = ""), call. = FALSE)
if (!recordDateTimeCol %in% colnames(recordTable))
stop(paste("recordDateTimeCol = \"", recordDateTimeCol,
"\" is not a column name in recordTable", sep = ""),
call. = FALSE)
stopifnot(is.logical(c(allSpecies, writePNG, plotR, createDir)))
if (allSpecies == FALSE) {
stopifnot(species %in% recordTable[, speciesCol])
stopifnot(hasArg(species))
}
recordTable$DateTime2 <- parseDateTimeObject(inputColumn = recordTable[,
recordDateTimeCol], dateTimeFormat = recordDateTimeFormat,
timeZone = timeZone)
recordTable$Time2 <- format(recordTable$DateTime2, format = "%H:%M:%S",
usetz = FALSE)
recordTable$Time.rad <- (as.numeric(as.POSIXct(strptime(recordTable$Time2,
format = "%H:%M:%S", tz = timeZone))) - as.numeric(as.POSIXct(strptime("0",
format = "%S", tz = timeZone))))/3600 * (pi/12)
if (isTRUE(writePNG)) {
if (hasArg(plotDirectory)) {
if (isTRUE(createDir)) {
dir.create(plotDirectory, recursive = TRUE, showWarnings = FALSE)
setwd(plotDirectory)
}
else {
stopifnot(file.exists(plotDirectory))
setwd(plotDirectory)
}
}
else {
stop("writePNG is TRUE. Please set plotDirectory",
call. = FALSE)
}
}
pngWidth <- pngMaxPix
pngHeight <- round(pngMaxPix * 0.8)
if (allSpecies == FALSE) {
subset_species <- subset(recordTable, recordTable[, speciesCol] ==
species)
if (nrow(subset_species) == 1)
stop(paste(species, "had only 1 record. Cannot estimate density."),
call. = FALSE)
try_error_tmp <- try({
if (isTRUE(writePNG))
png(filename = paste("activity_density_",
species, "_", Sys.Date(), ".png",
sep = ""), width = pngWidth, height = pngHeight,
units = "px", res = 96, type = "cairo")
if (isTRUE(writePNG) | isTRUE(plotR)) {
densityPlot(subset_species$Time.rad, main = paste("Activity of",
species), rug = add.rug, ...)
mtext(paste("number of records:", nrow(subset_species)),
side = 3, line = 0)
}
if (isTRUE(writePNG))
dev.off()
}, silent = TRUE)
if (class(try_error_tmp) == "try-error")
warning(paste(toupper(species), ": ", try_error_tmp[1],
" - SKIPPED", sep = ""), call. = FALSE)
}
else {
subset_species_list <- list()
for (i in 1:length(unique(recordTable[, speciesCol]))) {
spec.tmp <- unique(recordTable[, speciesCol])[i]
subset_species <- subset(recordTable, recordTable[,
speciesCol] == spec.tmp)
plot_main_title <- paste("Activity of", spec.tmp)
if (nrow(subset_species) == 1) {
warning(paste(toupper(spec.tmp), ": It had only 1 record. Cannot estimate density. - SKIPPED",
sep = ""), call. = FALSE)
next
}
else {
try_error_tmp <- try({
if (isTRUE(writePNG))
png(filename = paste("activity_density_",
spec.tmp, "_", Sys.Date(), ".png",
sep = ""), width = pngWidth, height = pngHeight,
units = "px", res = 96, type = "cairo")
if (isTRUE(writePNG) | isTRUE(plotR)) {
densityPlot(subset_species$Time.rad, main = plot_main_title,
rug = add.rug, ...)
mtext(paste("number of records:", nrow(subset_species)),
side = 3, line = 0)
}
if (isTRUE(writePNG))
dev.off()
}, silent = TRUE)
if (class(try_error_tmp) == "try-error")
warning(paste(toupper(spec.tmp), ": ",
try_error_tmp[1], " - SKIPPED",
sep = ""), call. = FALSE)
}
subset_species_list[[i]] <- subset_species$Time.rad
names(subset_species_list)[i] <- spec.tmp
}
}
if (allSpecies == FALSE) {
return(invisible(subset_species$Time.rad))
}
else {
return(invisible(subset_species_list))
}
}
Yet, when I try to run this new function (arguments omitted here for clarity), it can't find a function embedded within.
How can I somehow assign this function to look within the original package camtrapR for any dependencies, etc.? and why does the code output from the function not already do this?
New()
Error in dataFrameTibbleCheck(df = recordTable) :
could not find function "dataFrameTibbleCheck"
This answer here: https://stackoverflow.com/a/49277036/9096420 allows one to manually edit and save a function's code for each R session, but it is non-reproducible (not code) that can be shared or re-used.
If New is the new function copied from camtrapR then use
environment(New) <- asNamespace("camtrapR")
to ensure that the function calls in its body are looked up in the correct places.
We are doing a project with a shiny app that involves scraping and downloading dataframes from a website. We have the following problem: It works on some computers and not others.
We have the same packages versions, and we did not do too many requests...
It is not linked with whether it is on windows of mac, as it works on some windows and some macs but not others.
Do you have any idea ? Could it be in the connexion settings ?
It is not linked with the wifi network, we tried on the same wifi...
Upon request here is the code and the error messages :
This function is the one we call directly:
scraping_function <- function(search_terms, subreddit,
sort_by , time_frame){
exctracted_link <- reddit_urls_mod(search_terms, subreddit,
sort_by , time_frame)
exctracted_data <- reddit_content(exctracted_link[,5])
exctracted_data[,13] <- cleaning_text_function(exctracted_data[,13])
return(exctracted_data)
}
These are the functions that this function calls :
extracting the URLS:
reddit_urls_mod<- function (search_terms = "", subreddit = "",
sort_by = "", time_frame= "")
{
if (subreddit == ""){
subreddit <- NA
}
if (search_terms == ""){
search_terms <- NA
}
if (!grepl("^[0-9A-Za-z]*$", subreddit) & !is.na(subreddit) ) {
stop("subreddit must be a sequence of letter and number without special characters and spaces")
}
regex_filter = ""
cn_threshold = 0
page_threshold = 15
wait_time = 1
cached_links = data.frame(date = as.Date(character()),
num_comments = numeric(),
title = character(),
subreddit = character(),
URL = character(),
link = character())
if (sort_by != "front_page"){
if (!grepl("^comments$|^new$|^relevance$|^top$|^front_page$", sort_by)) {
stop("sort_by must be either 'new', 'comments', 'top', 'relevance' or 'front_page'")
}
if (!grepl("^hour$|^day$|^week$|^month$|^year$|^all$", time_frame)) {
stop("time_frame must be either 'hour', 'day', 'week', 'month', 'year or 'all'")
}
sterms = ifelse(is.na(search_terms), NA, gsub("\\s", "+",search_terms))
subreddit = ifelse(is.na(subreddit), "", paste0("r/", gsub("\\s+","+", subreddit), "/"))
sterms = ifelse(is.na(sterms), "", paste0("q=", sterms, "&restrict_sr=on&"))
sterms_prefix = ifelse(sterms == "", "new", "search")
time_frame_in = ifelse(is.na(search_terms), "", paste0("t=",time_frame,"&"))
search_address = search_query = paste0("https://www.reddit.com/",
subreddit, sterms_prefix,
".json?",
sterms,time_frame_in,
"sort=",
sort_by)
} else {
if (is.na(subreddit)) {
stop("if you choose sort_by = front_page please enter a subreddit")
}
search_address = search_query = paste0("https://www.reddit.com/r/",
subreddit,
".json?")
}
next_page = index = ""
page_counter = 0
comm_filter = 10000
while (is.null(next_page) == FALSE & page_counter < page_threshold &
comm_filter >= cn_threshold & length(index) > 0) {
search_JSON = tryCatch(RJSONIO::fromJSON(readLines(search_query,
warn = FALSE)), error = function(e) NULL)
if (is.null(search_JSON)) {
stop(paste("Unable to connect to reddit website or invalid subreddit entered"))
} else if (length(search_JSON$data$children)==0){
stop(paste("This search term returned no results or invalid subreddit entered"))
} else {
contents = search_JSON[[2]]$children
search_permalink = paste0("http://www.reddit.com",
sapply(seq(contents), function(x) contents[[x]]$data$permalink))
search_num_comments = sapply(seq(contents), function(x) contents[[x]]$data$num_comments)
search_title = sapply(seq(contents), function(x) contents[[x]]$data$title)
search_score = sapply(seq(contents), function(x) contents[[x]]$data$score)
search_subreddit = sapply(seq(contents), function(x) contents[[x]]$data$subreddit)
search_link = sapply(seq(contents), function(x) contents[[x]]$data$url)
index = which(search_num_comments >= cn_threshold &
grepl(regex_filter, search_title, ignore.case = T,
perl = T))
if (length(index) > 0) {
search_date = format(as.Date(as.POSIXct(unlist(lapply(seq(contents), function(x) contents[[x]]$data$created_utc)),
origin = "1970-01-01")), "%d-%m-%y")
temp_dat = data.frame(date = search_date,
num_comments = search_num_comments,
title = search_title,
subreddit = search_subreddit,
URL = search_permalink,
link = search_link,
stringsAsFactors = FALSE)[index,]
cached_links = as.data.frame(rbind(cached_links,
temp_dat))
next_page = search_JSON$data$after
comm_filter = utils::tail(search_num_comments,
1)
search_query = paste0(search_address, "&after=",
next_page)
page_counter = page_counter + 1
}
Sys.sleep(min(2, wait_time))
}
}
final_table = cached_links[!duplicated(cached_links), ]
if (dim(final_table)[1] == 0) {
cat(paste("\nNo results retrieved, should be invalid subreddit entered, down server or simply unsuccessful search query :("))
}
else {
remove_row = which(final_table[, 1] == "")
if (length(remove_row) > 0) {
final_table = final_table[-remove_row, ]
}
return(final_table)
}
}
extracting the content :
reddit_content <- function (URL, wait_time = 1) {
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(
paste0(filter, " ", depth),
lapply(1:length(reply.nodes),
function(x)
get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))
))
}
data_extract = data.frame(
id = numeric(),
structure = character(),
post_date = as.Date(character()),
comm_date = as.Date(character()),
num_comments = numeric(),
subreddit = character(),
upvote_prop = numeric(),
post_score = numeric(),
author = character(),
user = character(),
comment_score = numeric(),
controversiality = numeric(),
comment = character(),
title = character(),
post_text = character(),
link = character(),
domain = character(),
URL = character()
)
withProgress(message = 'Work in progress', value = 0, min=0,max=1, {
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e)
NULL
)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X,
warn = FALSE)),
error = function(e)
NULL
)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x)
get.structure(main.node[[x]], x)))
TEMP = data.frame(
id = NA,
structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(
as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")
), "%d-%m-%y"),
comm_date = format(as.Date(
as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")
), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(
is.null(meta.node$subreddit),
"UNKNOWN",
meta.node$subreddit
),
upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score,
author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title,
post_text = meta.node$selftext,
link = meta.node$url,
domain = meta.node$domain,
URL = URL[i],
stringsAsFactors = FALSE
)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else
print(paste("missed", i, ":", URL[i]))
}
}
incProgress(amount = 1/length(URL))
Sys.sleep(min(2, wait_time))
}
# data_extract[,13] <-
# cleaning_text_function(data_extract[,13])
})
return(data_extract)
}
Cleaning the text:
cleaning_text_function <- function(x,stopwords=stopwords_vec) {
stopwords_vec <- c(stopwords::stopwords("en"), "don", "isn", "gt", "i", "re","removed","deleted","m","you re","we ll", "ve", "hasn","they re","id","tl dr", "didn", "wh","oh","tl","dr","shes","hes","aren","edit","ok","ll","wasn","shouldn","t","doesn","youre","going","still","much", "many","also")
if (is.character(x)) {
#Put accents instead of code html (only for french)
Encoding(x) <- 'latin1'
#take out accent
x <- stri_trans_general(x, 'latin-ascii')
x <- unlist(lapply(x, function(x, stopwords = stopwords_vec) {
#separate words
x <- unlist(strsplit(x, " "))
#take out internet links
x <- x[!grepl("\\S+www\\S+|\\S+https://\\S+|https://\\S+", x)]
#take out codes ASCII and ponctuation
x <-gsub("\n|[[:punct:]]|[\x01-\x09\x11-\x12\x14-\x1F\x7F]|gt"," ",x)
#take out simple alone numbers
x <-gsub("(^[0-9]{1}\\s|^[0-9]{1}$|\\s{1}[0-9]{1}$|\\s{1}[0-9]{1}\\s{1})"," ",x)
#take out space in the beginning and end of stringg
x <-gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
#lowercase
x <- tolower(x)
#take out alone letters
x <-gsub("(^[a-z]{1}\\s+|^[a-z]{1}$|\\s+[a-z]{1}$|\\s+[a-z]{1}\\s+)", "", x)
#take out words in stopwords list
x <-paste(x[!x %in% stopwords], collapse = " ")
#rerun stopwords again to get ride of stopword in composed string
x <- unlist(strsplit(x, " "))
x <-gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
x <-paste(x[!x %in% stopwords], collapse = " ")
return(x)
}))
} else{
stop("please enter a character vector")
}
return(x)
}
And the message we get :
Listening on http://127.0.0.1:7745
Warning in file(con, "r") :
cannot open URL 'https://www.reddit.com/r/news/search.json?q=Greta&restrict_sr=on&t=week&sort=comments': HTTP status was '429 Unknown Error'
Warning: Error in reddit_urls_mod: Unable to connect to reddit website or invalid subreddit entered
126: stop
125: reddit_urls_mod
124: scraping_function
123: eventReactiveHandler
79: df1
72: observeEventHandler
1: runApp
I get an error 429 even on computers that have never made a request before...
Thank you
I'm trying to scrape Reddit data (I'm pretty new to web scraping and half decent at R). The RedditExtractor package has a nice function that does 90% of what I need, but it doesn't grab the "flair" associated with users who make comments. I'm trying to play around with the package's function but I'm a bit over my head.
There are examples of Reddit threads with flairs here. I think I'm looking for the text in these bits of XML:
<span class="flair flair-orthodox" title="Eastern Orthodox">Eastern Orthodox</span>
I've pasted the code from the reddit_content() function along with comments where I think the extra code should go, but I'm not quite sure where to go from here. At the moment the function returns a data frame with columns for the comment, time stamp, user, etc. I need it to also produce a comment with user flairs if they exist. Thanks in advance!
redd_content_flair <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(),
#flair = character(),
URL = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE)), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"),
comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
#flair = unlist(lapply(main.node, function(x) {GetAttribute(x, "flair")})),
URL = URL[i], stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}
Edit: I'd also like to grab the URL for the "parent" comment, which looks like its in tags like
<p class="parent"><a name="d3t1p1r"></a></p>
I managed to come up with an ad hoc solution. I'll post it here for posterity. The issue is the function as-is wasn't set up to handle NULL JSON values. It was a quick fix.
About midway down there are two raw_data = lines. You need to add the nullValue = 'your null text' argument to the fromJSON function. Then you can add whatever metadata you wanted to both the empty data frame and the TEMP data frame, using the same construction as elsewhere. In the function below I've added both the user's flair text and the ID of the parent comment.
(Note, the wonky indenting is from the original function...I've left it as is to prevent accidentally changing something.)
reddit.fixed <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(), URL = character(), flair = character(), parent = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE), nullValue = "none"),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE), nullValue = "none"), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"), comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments, subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})), comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})), controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})), comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})), title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
URL = URL[i],
flair = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author_flair_text")
})),
parent = unlist(lapply(main.node, function(x) {GetAttribute(x, "parent_id")})),
stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}