I'm trying to scrape Reddit data (I'm pretty new to web scraping and half decent at R). The RedditExtractor package has a nice function that does 90% of what I need, but it doesn't grab the "flair" associated with users who make comments. I'm trying to play around with the package's function but I'm a bit over my head.
There are examples of Reddit threads with flairs here. I think I'm looking for the text in these bits of HTML:
<span class="flair flair-orthodox" title="Eastern Orthodox">Eastern Orthodox</span>
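To be concrete, the flair text I'm after is the title attribute of that span. If I were parsing the rendered HTML directly, a minimal rvest sketch (the thread URL is hypothetical, and this assumes the old-reddit markup above) might look like the following, though ideally I'd get it from within the package's JSON-based workflow instead:
library(rvest)
# Hypothetical old-reddit thread URL; any comment page with flair spans would do.
page <- read_html("https://old.reddit.com/r/Christianity/comments/xxxxxx/some_thread/")
# Each flair span carries the readable flair in its title attribute.
flairs <- page %>% html_nodes("span.flair") %>% html_attr("title")
head(flairs)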
I've pasted the code from the reddit_content() function below, along with comments where I think the extra code should go, but I'm not quite sure where to go from here. At the moment the function returns a data frame with columns for the comment, time stamp, user, etc. I need it to also produce a column with user flairs, if they exist. Thanks in advance!
redd_content_flair <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(),
#flair = character(),
URL = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE)), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"),
comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
#flair = unlist(lapply(main.node, function(x) {GetAttribute(x, "flair")})),
URL = URL[i], stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}
Edit: I'd also like to grab the ID of the "parent" comment, which looks like it's in tags like this:
<p class="parent"><a name="d3t1p1r"></a></p>
I managed to come up with an ad hoc solution, which I'll post here for posterity. The issue is that the function as written wasn't set up to handle NULL JSON values, so it was a quick fix.
About midway down there are two raw_data = lines. You need to add the nullValue = 'your null text' argument to the fromJSON() calls. Then you can add whatever metadata you want to both the empty data frame and the TEMP data frame, using the same construction as elsewhere. In the function below I've added both the user's flair text and the ID of the parent comment.
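To see what nullValue does on its own, here's a standalone illustration (not part of the function, and the JSON snippet is made up):
library(RJSONIO)
json <- '{"author": "someone", "author_flair_text": null, "parent_id": "t1_xxxxxx"}'
str(fromJSON(json))                      # author_flair_text comes back as NULL and vanishes when unlist()ed
str(fromJSON(json, nullValue = "none"))  # the NULL becomes "none", so every comment keeps a flair slot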
(Note, the wonky indenting is from the original function...I've left it as is to prevent accidentally changing something.)
reddit.fixed <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(), URL = character(), flair = character(), parent = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE), nullValue = "none"),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE), nullValue = "none"), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"), comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments, subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})), comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})), controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})), comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})), title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
URL = URL[i],
flair = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author_flair_text")
})),
parent = unlist(lapply(main.node, function(x) {GetAttribute(x, "parent_id")})),
stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}
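Quick usage sketch (the thread URL is hypothetical); the result now has flair and parent columns alongside the originals:
# Scrape one thread and inspect the new columns.
df <- reddit.fixed("https://www.reddit.com/r/Christianity/comments/xxxxxx/some_thread/")
head(df[, c("user", "flair", "parent", "comment")])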
Related
I have some functionality which works fine outside of a package, but when I put it into a package, run devtools::load_all(), and try to run one of the functions (DTL_similarity_search_results_fast), another function (DTL_similarity_search), which should be loaded by the package, is not found when it is called inside DTL_similarity_search_results_fast.
The code is:
messagef <- function(...) message(sprintf(...))
printf <- function(...) print(sprintf(...))
pattern_to_vec <- function(pattern, as_int = F, keep_list = FALSE) {
ret <- strsplit(pattern, ",")
if(length(pattern) == 1 && !keep_list)
ret <- ret[[1]]
if(as_int){
ret <- lapply(ret, as.integer)
}
ret
}
DTL_similarity_search <- function(search_pattern = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0) {
url <- suppressWarnings(httr::modify_url("https://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/similar"))
if(is.na(max_edit_distance)){
max_edit_distance <- purrr::map_int(pattern_to_vec(search_pattern, keep_list = T), length) %>% min()
}
messagef("[DTL API] Starting search for %s", search_pattern)
resp <- suppressWarnings(httr::POST(url, body = list( n_gram = search_pattern,
transformation = transformation,
database_names = database_names,
metadata_filters = metadata_filters,
filter_category = filter_category,
minimum_similarity = minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference, filter_category = 0),
encode = "form"))
#browser()
#print(httr::content(resp, "text"))
if (httr::http_error(resp)) {
messagef(
"[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
httr::status_code(resp),
"",#parsed$message,
""#parsed$documentation_url
)
return(NULL)
}
parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
messagef("[DTL API] Retrieved search ID %s of for pattern %s", parsed$search_id, search_pattern)
parsed$search_id
}
DTL_get_results <- function(search_id) {
url <- suppressWarnings(httr::modify_url("http://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/get"))
#messagef("[DTL API] Retrieving results for search_id %s", search_id)
resp <- suppressWarnings(httr::GET(url, query = list(search_id = search_id)))
if (httr::http_error(resp)) {
messagef(
"[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
httr::status_code(resp),
"",#parsed$message,
""#parsed$documentation_url
)
return(NULL)
}
print(httr::content(resp, "text"))
#browser()
parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
messagef("[DTL API] Retrieved %s lines for search_id %s", length(parsed), search_id)
purrr::map_dfr(parsed, function(x){
if(is.null(x$within_single_phrase)){
x$within_single_phrase <- FALSE
}
#browser()
tibble::as_tibble(x) %>% dplyr::mutate(melid = as.character(melid))
})
}
DTL_similarity_search_results <- function(search_patterns = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0) {
results <- tibble::tibble()
if(is.na(max_edit_distance)){
max_edit_distance <- purrr:::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
}
for(pattern in search_patterns){
print('DTL_similarity_search')
print(DTL_similarity_search)
search_id <- DTL_similarity_search(pattern,
transformation,
database_names,
metadata_filters,
filter_category,
minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference)
if(is.null(search_id)){
next
}
ret <- DTL_get_results(search_id)
if(!is.null(ret) && nrow(ret) > 0){
ret$search_pattern <- pattern
}
results <- dplyr::bind_rows(results, ret)
}
#browser()
if(nrow(results))
results %>% dplyr::distinct(melid, start, length, .keep_all = T)
}
DTL_similarity_search_results_fast <- function(search_patterns = "1,2,1,2,1,2,1,2",
transformation = "interval",
database_names = "dtl,wjazzd,omnibook",
metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
filter_category = "0",
minimum_similarity = 1.0,
max_edit_distance = NA,
max_length_difference = 0){
if(is.na(max_edit_distance)){
max_edit_distance <- purrr::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
}
future::plan(future::multisession)
results <- furrr:::future_map_dfr(search_patterns, function(pattern){
print('DTL_similarity_search2')
search_id <- DTL_similarity_search(pattern,
transformation,
database_names,
metadata_filters,
filter_category,
minimum_similarity,
max_edit_distance = max_edit_distance,
max_length_difference = max_length_difference)
if(is.null(search_id)){
return(tibble::tibble())
}
ret <- DTL_get_results(search_id)
if(!is.null(ret) && nrow(ret) > 0 )ret$search_pattern <- pattern
ret
})
#browser()
results %>% dplyr::distinct(melid, start, length, .keep_all = TRUE)
}
Then after load_all() when I try to run:
res <- DTL_similarity_search_results_fast()
I get:
Error in DTL_similarity_search(pattern, transformation,
database_names, : could not find function "DTL_similarity_search
but running a similar, different function works using the same procedure:
res <- DTL_similarity_search_results()
I have a nested list structure with some elements (not all) having attributes that I want to keep (I've converted some xml output to a list). I'm trying to flatten it into a data.frame. The structure is something like this:
myList <- structure(list(address = structure(list(Address = list(Line = list("xxxxxxx"),
Line = list("xxxxxxx"), Line = list("xxxxxxx"), PostCode = list(
"XXX XXX"))), type = "Residential", verified = "Unverified"),
amount = structure(list(paymentAmount = list(maxAmount = list(
amountPart = structure(list(Amount = list("0.00")), component = "Standard"),
amountPart = structure(list(Amount = list("0.00")), component = "Thing1"),
amountPart = structure(list(Amount = list("0.00")), component = "Thing2"),
amountPart = structure(list(Amount = list("0.00")), component = "Thing3"),
amountPart = structure(list(Amount = list("0.00")), component = "Thing4"),
amountPart = structure(list(Amount = list("100.00")), component = "Thing5"),
amountPart = structure(list(Amount = list("0.00")), component = "Thing6")),
otherAmount = list(Amount = list("0.00")),
discount = list("0.00"),
transition = list(
"0.00"), discounts = list(), regularPayment = list(
"200.00")),
paymentInfo = list(income = structure(list(
net = list("0")), refNumber = "xxxxxxx"))),
paymentDate = "2021-03-22", startDate = "2021-02-16", endDate = "2021-03-15")),
type = "Normal")
I've tried rapply(myList, attributes), but that just seems to return NULL (presumably because rapply only visits the leaf elements, whose attributes are NULL, rather than the intermediate lists that carry the attributes I want).
I've also tried using a loop in a recursive function:
get_attributes <- function(myList, attribute_list = NULL) {
if (is.null(attribute_list)) attribute_list <- list()
for (i in seq_along(myList)) {
if (is.list(myList[[i]])) {
attribute_list <- c(attribute_list, sapply(myList[[i]], attributes))
attribute_list <- get_attributes(myList[[i]], attribute_list)
} else {
attribute_list <- c(attribute_list, attributes(myList[[i]]))
}
}
attribute_list
}
Once I've got the list of attributes, I then want to put them in a one-row data.frame - something like data.frame(address.type = "Residential", address.verified = "Unverified", component.1 = "Standard", component.2 = "Thing1", ...)
The function with a loop is a bit messy and not very 'R', and it also seems to spit out lots of repeated elements that I don't want. Does anyone have any idea how to implement this more elegantly?
UPDATE
I've refined the loop implementation to this, which seems to work, but I just couldn't figure out how to use either purrr or one of the *apply functions in place of the loop:
get_attributes <- function(myList, attribute_list = NULL, prefix = NULL) {
if (is.null(attribute_list)) {
attribute_list <- list()
}
if (is.null(prefix)) {
prefix <- ""
}
for (i in seq_along(myList)) {
name <- names(myList)[i]
attrs <- attributes(myList[[i]])
if (!is.null(attrs)) {
names(attrs) <- paste0(prefix, name, ".", names(attrs))
attrs <- attrs[!grepl("\\.names$", names(attrs))]
attribute_list <- c(attribute_list, attrs)
}
if (is.list(myList[[i]])) {
attribute_list <- get_attributes(myList[[i]],
attribute_list,
paste0(prefix, name, "."))
}
}
attribute_list
}
do.call(data.frame, get_attributes(myList))
You can gather all the attributes available and just keep the ones you are interested in.
library(purrr)
map_df(myList, ~map_chr(attributes(.x), toString))
# names type verified paymentDate startDate endDate
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 Address Residential Unverified NA NA NA
#2 paymentAmount, paymentInfo NA NA 2021-03-22 2021-02-16 2021-03-15
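From there you can filter and rename just the columns you are interested in, for example with dplyr (a sketch that keeps only the address attributes):
library(dplyr)
map_df(myList, ~map_chr(attributes(.x), toString)) %>%
  filter(!is.na(type)) %>%                                   # keep the row(s) that actually carry a type attribute
  select(address.type = type, address.verified = verified)   # rename to the desired column names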
I am trying to modify an existing function by copying and pasting it into an R script and assigning it to a new function object in my local environment. However, the new function cannot find functions that are called within the original function. How can I fix this without looking up and finding each function individually? I am guessing that the original function is somehow linked to the package or its dependencies and 'knows where to look' for the missing function, but I cannot figure out how to do this with my new copy-and-pasted function.
library("camtrapR")
Print the function name:
activityDensity
The output is the code for this function. I have omitted it here because it is long (I have pasted it below), but I copied the output of the function code exactly (see below, where I assign this exact code to a new function), except for the last two lines of the output, which I think are important:
<bytecode: 0x000000002a2d1e20>
<environment: namespace:camtrapR>
So now I assign the copy and pasted code from the output above to a new function with New <-
New <- function (recordTable, species, allSpecies = FALSE, speciesCol = "Species",
recordDateTimeCol = "DateTimeOriginal", recordDateTimeFormat = "%Y-%m-%d %H:%M:%S",
plotR = TRUE, writePNG = FALSE, plotDirectory, createDir = FALSE,
pngMaxPix = 1000, add.rug = TRUE, ...)
{
wd0 <- getwd()
mar0 <- par()$mar
on.exit(setwd(wd0))
on.exit(par(mar = mar0), add = TRUE)
recordTable <- dataFrameTibbleCheck(df = recordTable)
timeZone <- "UTC"
checkForSpacesInColumnNames(speciesCol = speciesCol, recordDateTimeCol = recordDateTimeCol)
if (!is.data.frame(recordTable))
stop("recordTable must be a data frame", call. = FALSE)
if (!speciesCol %in% colnames(recordTable))
stop(paste("speciesCol = \"", speciesCol, "\" is not a column name in recordTable",
sep = ""), call. = FALSE)
if (!recordDateTimeCol %in% colnames(recordTable))
stop(paste("recordDateTimeCol = \"", recordDateTimeCol,
"\" is not a column name in recordTable", sep = ""),
call. = FALSE)
stopifnot(is.logical(c(allSpecies, writePNG, plotR, createDir)))
if (allSpecies == FALSE) {
stopifnot(species %in% recordTable[, speciesCol])
stopifnot(hasArg(species))
}
recordTable$DateTime2 <- parseDateTimeObject(inputColumn = recordTable[,
recordDateTimeCol], dateTimeFormat = recordDateTimeFormat,
timeZone = timeZone)
recordTable$Time2 <- format(recordTable$DateTime2, format = "%H:%M:%S",
usetz = FALSE)
recordTable$Time.rad <- (as.numeric(as.POSIXct(strptime(recordTable$Time2,
format = "%H:%M:%S", tz = timeZone))) - as.numeric(as.POSIXct(strptime("0",
format = "%S", tz = timeZone))))/3600 * (pi/12)
if (isTRUE(writePNG)) {
if (hasArg(plotDirectory)) {
if (isTRUE(createDir)) {
dir.create(plotDirectory, recursive = TRUE, showWarnings = FALSE)
setwd(plotDirectory)
}
else {
stopifnot(file.exists(plotDirectory))
setwd(plotDirectory)
}
}
else {
stop("writePNG is TRUE. Please set plotDirectory",
call. = FALSE)
}
}
pngWidth <- pngMaxPix
pngHeight <- round(pngMaxPix * 0.8)
if (allSpecies == FALSE) {
subset_species <- subset(recordTable, recordTable[, speciesCol] ==
species)
if (nrow(subset_species) == 1)
stop(paste(species, "had only 1 record. Cannot estimate density."),
call. = FALSE)
try_error_tmp <- try({
if (isTRUE(writePNG))
png(filename = paste("activity_density_",
species, "_", Sys.Date(), ".png",
sep = ""), width = pngWidth, height = pngHeight,
units = "px", res = 96, type = "cairo")
if (isTRUE(writePNG) | isTRUE(plotR)) {
densityPlot(subset_species$Time.rad, main = paste("Activity of",
species), rug = add.rug, ...)
mtext(paste("number of records:", nrow(subset_species)),
side = 3, line = 0)
}
if (isTRUE(writePNG))
dev.off()
}, silent = TRUE)
if (class(try_error_tmp) == "try-error")
warning(paste(toupper(species), ": ", try_error_tmp[1],
" - SKIPPED", sep = ""), call. = FALSE)
}
else {
subset_species_list <- list()
for (i in 1:length(unique(recordTable[, speciesCol]))) {
spec.tmp <- unique(recordTable[, speciesCol])[i]
subset_species <- subset(recordTable, recordTable[,
speciesCol] == spec.tmp)
plot_main_title <- paste("Activity of", spec.tmp)
if (nrow(subset_species) == 1) {
warning(paste(toupper(spec.tmp), ": It had only 1 record. Cannot estimate density. - SKIPPED",
sep = ""), call. = FALSE)
next
}
else {
try_error_tmp <- try({
if (isTRUE(writePNG))
png(filename = paste("activity_density_",
spec.tmp, "_", Sys.Date(), ".png",
sep = ""), width = pngWidth, height = pngHeight,
units = "px", res = 96, type = "cairo")
if (isTRUE(writePNG) | isTRUE(plotR)) {
densityPlot(subset_species$Time.rad, main = plot_main_title,
rug = add.rug, ...)
mtext(paste("number of records:", nrow(subset_species)),
side = 3, line = 0)
}
if (isTRUE(writePNG))
dev.off()
}, silent = TRUE)
if (class(try_error_tmp) == "try-error")
warning(paste(toupper(spec.tmp), ": ",
try_error_tmp[1], " - SKIPPED",
sep = ""), call. = FALSE)
}
subset_species_list[[i]] <- subset_species$Time.rad
names(subset_species_list)[i] <- spec.tmp
}
}
if (allSpecies == FALSE) {
return(invisible(subset_species$Time.rad))
}
else {
return(invisible(subset_species_list))
}
}
Yet, when I try to run this new function (arguments omitted here for clarity), it can't find a function embedded within.
How can I somehow assign this function to look within the original package camtrapR for any dependencies, etc.? And why does the code output from the function not already do this?
New()
Error in dataFrameTibbleCheck(df = recordTable) :
could not find function "dataFrameTibbleCheck"
This answer here: https://stackoverflow.com/a/49277036/9096420 allows one to manually edit and save a function's code for each R session, but that is not reproducible code that can be shared or re-used.
If New is the new function copied from camtrapR then use
environment(New) <- asNamespace("camtrapR")
to ensure that the function calls in its body are looked up in the correct places.
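For example (a sketch, assuming camtrapR is installed and New is the pasted definition from the question):
# Point the copied function's enclosing environment at the camtrapR namespace,
# so unexported helpers such as dataFrameTibbleCheck() are found at call time.
environment(New) <- asNamespace("camtrapR")
# An alternative that skips the copy-paste step: fetch the original straight
# from the namespace (it already looks names up there) and edit that copy.
New2 <- utils::getFromNamespace("activityDensity", "camtrapR")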
We are doing a project with a shiny app that involves scraping and downloading dataframes from a website. We have the following problem: It works on some computers and not others.
We have the same package versions, and we did not make too many requests...
It is not linked to whether it is Windows or Mac, as it works on some Windows and some Mac machines but not others.
Do you have any idea? Could it be in the connection settings?
It is not linked to the Wi-Fi network; we tried on the same Wi-Fi...
Upon request, here is the code and the error messages:
This function is the one we call directly:
scraping_function <- function(search_terms, subreddit,
sort_by , time_frame){
exctracted_link <- reddit_urls_mod(search_terms, subreddit,
sort_by , time_frame)
exctracted_data <- reddit_content(exctracted_link[,5])
exctracted_data[,13] <- cleaning_text_function(exctracted_data[,13])
return(exctracted_data)
}
These are the functions that this function calls:
Extracting the URLs:
reddit_urls_mod<- function (search_terms = "", subreddit = "",
sort_by = "", time_frame= "")
{
if (subreddit == ""){
subreddit <- NA
}
if (search_terms == ""){
search_terms <- NA
}
if (!grepl("^[0-9A-Za-z]*$", subreddit) & !is.na(subreddit) ) {
stop("subreddit must be a sequence of letter and number without special characters and spaces")
}
regex_filter = ""
cn_threshold = 0
page_threshold = 15
wait_time = 1
cached_links = data.frame(date = as.Date(character()),
num_comments = numeric(),
title = character(),
subreddit = character(),
URL = character(),
link = character())
if (sort_by != "front_page"){
if (!grepl("^comments$|^new$|^relevance$|^top$|^front_page$", sort_by)) {
stop("sort_by must be either 'new', 'comments', 'top', 'relevance' or 'front_page'")
}
if (!grepl("^hour$|^day$|^week$|^month$|^year$|^all$", time_frame)) {
stop("time_frame must be either 'hour', 'day', 'week', 'month', 'year or 'all'")
}
sterms = ifelse(is.na(search_terms), NA, gsub("\\s", "+",search_terms))
subreddit = ifelse(is.na(subreddit), "", paste0("r/", gsub("\\s+","+", subreddit), "/"))
sterms = ifelse(is.na(sterms), "", paste0("q=", sterms, "&restrict_sr=on&"))
sterms_prefix = ifelse(sterms == "", "new", "search")
time_frame_in = ifelse(is.na(search_terms), "", paste0("t=",time_frame,"&"))
search_address = search_query = paste0("https://www.reddit.com/",
subreddit, sterms_prefix,
".json?",
sterms,time_frame_in,
"sort=",
sort_by)
} else {
if (is.na(subreddit)) {
stop("if you choose sort_by = front_page please enter a subreddit")
}
search_address = search_query = paste0("https://www.reddit.com/r/",
subreddit,
".json?")
}
next_page = index = ""
page_counter = 0
comm_filter = 10000
while (is.null(next_page) == FALSE & page_counter < page_threshold &
comm_filter >= cn_threshold & length(index) > 0) {
search_JSON = tryCatch(RJSONIO::fromJSON(readLines(search_query,
warn = FALSE)), error = function(e) NULL)
if (is.null(search_JSON)) {
stop(paste("Unable to connect to reddit website or invalid subreddit entered"))
} else if (length(search_JSON$data$children)==0){
stop(paste("This search term returned no results or invalid subreddit entered"))
} else {
contents = search_JSON[[2]]$children
search_permalink = paste0("http://www.reddit.com",
sapply(seq(contents), function(x) contents[[x]]$data$permalink))
search_num_comments = sapply(seq(contents), function(x) contents[[x]]$data$num_comments)
search_title = sapply(seq(contents), function(x) contents[[x]]$data$title)
search_score = sapply(seq(contents), function(x) contents[[x]]$data$score)
search_subreddit = sapply(seq(contents), function(x) contents[[x]]$data$subreddit)
search_link = sapply(seq(contents), function(x) contents[[x]]$data$url)
index = which(search_num_comments >= cn_threshold &
grepl(regex_filter, search_title, ignore.case = T,
perl = T))
if (length(index) > 0) {
search_date = format(as.Date(as.POSIXct(unlist(lapply(seq(contents), function(x) contents[[x]]$data$created_utc)),
origin = "1970-01-01")), "%d-%m-%y")
temp_dat = data.frame(date = search_date,
num_comments = search_num_comments,
title = search_title,
subreddit = search_subreddit,
URL = search_permalink,
link = search_link,
stringsAsFactors = FALSE)[index,]
cached_links = as.data.frame(rbind(cached_links,
temp_dat))
next_page = search_JSON$data$after
comm_filter = utils::tail(search_num_comments,
1)
search_query = paste0(search_address, "&after=",
next_page)
page_counter = page_counter + 1
}
Sys.sleep(min(2, wait_time))
}
}
final_table = cached_links[!duplicated(cached_links), ]
if (dim(final_table)[1] == 0) {
cat(paste("\nNo results retrieved, should be invalid subreddit entered, down server or simply unsuccessful search query :("))
}
else {
remove_row = which(final_table[, 1] == "")
if (length(remove_row) > 0) {
final_table = final_table[-remove_row, ]
}
return(final_table)
}
}
Extracting the content:
reddit_content <- function (URL, wait_time = 1) {
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else
NULL
return(list(
paste0(filter, " ", depth),
lapply(1:length(reply.nodes),
function(x)
get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))
))
}
data_extract = data.frame(
id = numeric(),
structure = character(),
post_date = as.Date(character()),
comm_date = as.Date(character()),
num_comments = numeric(),
subreddit = character(),
upvote_prop = numeric(),
post_score = numeric(),
author = character(),
user = character(),
comment_score = numeric(),
controversiality = numeric(),
comment = character(),
title = character(),
post_text = character(),
link = character(),
domain = character(),
URL = character()
)
withProgress(message = 'Work in progress', value = 0, min=0,max=1, {
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e)
NULL
)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(
RJSONIO::fromJSON(readLines(X,
warn = FALSE)),
error = function(e)
NULL
)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x)
get.structure(main.node[[x]], x)))
TEMP = data.frame(
id = NA,
structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(
as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")
), "%d-%m-%y"),
comm_date = format(as.Date(
as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")
), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(
is.null(meta.node$subreddit),
"UNKNOWN",
meta.node$subreddit
),
upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score,
author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title,
post_text = meta.node$selftext,
link = meta.node$url,
domain = meta.node$domain,
URL = URL[i],
stringsAsFactors = FALSE
)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else
print(paste("missed", i, ":", URL[i]))
}
}
incProgress(amount = 1/length(URL))
Sys.sleep(min(2, wait_time))
}
# data_extract[,13] <-
# cleaning_text_function(data_extract[,13])
})
return(data_extract)
}
Cleaning the text:
cleaning_text_function <- function(x,stopwords=stopwords_vec) {
stopwords_vec <- c(stopwords::stopwords("en"), "don", "isn", "gt", "i", "re","removed","deleted","m","you re","we ll", "ve", "hasn","they re","id","tl dr", "didn", "wh","oh","tl","dr","shes","hes","aren","edit","ok","ll","wasn","shouldn","t","doesn","youre","going","still","much", "many","also")
if (is.character(x)) {
#Put accents instead of code html (only for french)
Encoding(x) <- 'latin1'
#take out accent
x <- stringi::stri_trans_general(x, 'latin-ascii')
x <- unlist(lapply(x, function(x, stopwords = stopwords_vec) {
#separate words
x <- unlist(strsplit(x, " "))
#take out internet links
x <- x[!grepl("\\S+www\\S+|\\S+https://\\S+|https://\\S+", x)]
#take out codes ASCII and ponctuation
x <-gsub("\n|[[:punct:]]|[\x01-\x09\x11-\x12\x14-\x1F\x7F]|gt"," ",x)
#take out simple alone numbers
x <-gsub("(^[0-9]{1}\\s|^[0-9]{1}$|\\s{1}[0-9]{1}$|\\s{1}[0-9]{1}\\s{1})"," ",x)
#take out space in the beginning and end of stringg
x <-gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
#lowercase
x <- tolower(x)
#take out alone letters
x <-gsub("(^[a-z]{1}\\s+|^[a-z]{1}$|\\s+[a-z]{1}$|\\s+[a-z]{1}\\s+)", "", x)
#take out words in stopwords list
x <-paste(x[!x %in% stopwords], collapse = " ")
#rerun stopwords again to get ride of stopword in composed string
x <- unlist(strsplit(x, " "))
x <-gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
x <-paste(x[!x %in% stopwords], collapse = " ")
return(x)
}))
} else{
stop("please enter a character vector")
}
return(x)
}
And the message we get:
Listening on http://127.0.0.1:7745
Warning in file(con, "r") :
cannot open URL 'https://www.reddit.com/r/news/search.json?q=Greta&restrict_sr=on&t=week&sort=comments': HTTP status was '429 Unknown Error'
Warning: Error in reddit_urls_mod: Unable to connect to reddit website or invalid subreddit entered
126: stop
125: reddit_urls_mod
124: scraping_function
123: eventReactiveHandler
79: df1
72: observeEventHandler
1: runApp
I get an error 429 even on computers that have never made a request before...
Thank you
I would like to get the output from a POST request using httr for the following site:
http://www.e-grunt.ba
You can see the submit form when you click "ZK Ulošci".
There I would like to send a POST request and get the output. For example, you can select anything from the drop-down menus, enter 1 in the field "Broj Uloška", and then click "Traži".
Here is my try:
library(httr)
library(tidyverse)
library(rvest)
output <- httr::POST(
"http://www.e-grunt.ba/home.jsf",
body = list(
"form:court_focus" = "440",
"form:cuTransferLast" = "17.07.2019",
"form:municipality_input" = "4400000001",
"form:mpart_focus" = "44000087",
"form:folder" = 1,
`recaptcha-token` = "some token",
submit = "form:j_idt61"
),
add_headers(Referer = "http://www.e-grunt.ba/"),
encode = "form",
verbose()
)
But this just returns the content of the home page.
I know it is easier with (R)Selenium, but I would like to do it with httr and POST if it is possible.
I have found a way to scrape this JSF (JavaServer Faces) site. I am providing the code in case somebody needs something similar:
start_session <- function() {
p <- html_session(
"http://www.e-grunt.ba",
user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
)
# The JSF ViewState is carried in a hidden <input>; it has to be echoed back with every later postback.
viewState <- p %>% html_nodes("input") %>% .[[2]] %>% html_attr("value")
p <- rvest:::request_POST(
p,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
"javax.faces.partial.ajax" = "true",
"javax.faces.source" = "j_idt8:j_idt15",
"javax.faces.partial.execute" = "#all",
"javax.faces.partial.render" = "content",
"j_idt8:j_idt15" = "j_idt8:j_idt15",
"j_idt8" = 'j_idt8',
'javax.faces.ViewState' = viewState
),
encode = "form"
)
attr(p, "viewState") <- viewState
p
}
# EXTRACT METADATA --------------------------------------------------------
p <- start_session()
# Helper: extract the value/label pairs from a <select> element's <option> children.
name_value_pairs <- function(html, css, cnames) {
x <- read_html(html) %>%
html_nodes(css) %>%
html_children() %>%
html_attr("value")
y <- read_html(html) %>%
html_nodes(css) %>%
html_children() %>%
html_text()
df <- cbind.data.frame(x, y, stringsAsFactors = FALSE)
df <- df[df[, 1] != -1, ]
colnames(df) <- cnames
df
}
courts <- name_value_pairs(p$response$content, css = '[id="form:court_input"]', cnames = c("court_id", "court"))
metadata_post <- function(session_zk, view_state, id) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'javax.faces.partial.ajax' = 'true',
'javax.faces.source' = 'form:court',
'javax.faces.partial.execute' = 'form:court',
'javax.faces.partial.render' = 'msgs msgsBottom form:municipality form:mpart form:cuTransferLast',
'javax.faces.behavior.event' = 'change',
'javax.faces.partial.event' = 'change',
'form' = 'form',
'g-recaptcha-response' = '',
'form:court_focus' = '',
'form:court_input' = id,
'form:cuTransferLast' = '',
'form:municipality_focus' = '',
'form:mpart_focus' = '',
'form:folder' = '',
'form:parcel' = '',
'form:parcelSub' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
muni_post <- function(session_zk, view_state, id, muni_id) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'javax.faces.partial.ajax' = 'true',
'javax.faces.source' = 'form:municipality',
'javax.faces.partial.execute' = 'form:municipality',
'javax.faces.partial.render' = 'msgs msgsBottom form:mpart',
'javax.faces.behavior.event' = 'change',
'javax.faces.partial.event' = 'change',
'form' = 'form',
'g-recaptcha-response' = '',
'form:court_focus' = '',
'form:court_input' = id,
'form:cuTransferLast' = '',
'form:municipality_focus' = '',
'form:municipality_input' = muni_id,
'form:mpart_focus' = '',
'form:folder' = '',
'form:parcel' = '',
'form:parcelSub' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
metadata_i <- list()
for (i in seq_along(courts$court_id)) {
print(i)
p <- metadata_post(p, attributes(p)$viewState, courts$court_id[i])
muni <- name_value_pairs(p$response$content, css = '[id="form:municipality_input"]', cnames = c("muni_id", "muni"))
if (nrow(muni) > 1) {
muni_ko <- list()
for (j in seq_along(muni$muni_id)) {
# print(j)
p <- muni_post(p, attributes(p)$viewState, courts$court_id[i], muni$muni_id[j])
ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
if (nrow(ko) == 0) {
ko <- data.frame(ko_id = NA, ko = NA, stringsAsFactors = FALSE)
}
muni_ko[[j]] <- cbind.data.frame(muni[j, ], ko, stringsAsFactors = FALSE)
}
metadata_i[[i]] <- cbind.data.frame(courts[i, ], do.call(rbind, muni_ko), stringsAsFactors = FALSE)
} else {
ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
meta <- cbind.data.frame(courts[i, ], muni, stringsAsFactors = FALSE)
metadata_i[[i]] <- cbind.data.frame(meta, ko, stringsAsFactors = FALSE)
}
}
metadata <- do.call(rbind, metadata_i)
metadata_post <- function(session_zk, view_state, recaptcha, court,
date = as.character(format.Date(Sys.Date() - 4, "%d.%m.%Y")),
muni, ko, zk
) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'form' = 'form',
'g-recaptcha-response' = recaptcha,
'form:court_focus' = '',
'form:court_input' = court,
'form:cuTransferLast' = date,
'form:municipality_focus' = '',
'form:municipality_input' = muni,
'form:mpart_focus' = '',
'form:mpart_input' = ko,
'form:folder' = zk,
'form:parcel' = '',
'form:parcelSub' = '',
'form:j_idt61' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
# example
result <- break_captcha()
p <- metadata_post(session_zk = p, view_state = attributes(p)$viewState,
recaptcha = result, court = metadata$court_id[i],
muni = metadata$muni_id[i], ko = metadata$ko_id[i], zk = j)