POST request with httr package using R

I would like to get the output from a POST request using httr from the following site:
http://www.e-grunt.ba
You can see the submit form when you click "ZK Ulošci".
There I would like to send a POST request and get the output. For example, you can select anything from the drop-down window, enter 1 in the field "Broj Uloška", and then click "Traži".
Here is my attempt:
library(httr)
library(tidyverse)
library(rvest)
output <- httr::POST(
  "http://www.e-grunt.ba/home.jsf",
  body = list(
    "form:court_focus" = "440",
    "form:cuTransferLast" = "17.07.2019",
    "form:municipality_input" = "4400000001",
    "form:mpart_focus" = "44000087",
    "form:folder" = 1,
    `recaptcha-token` = "some token",
    submit = "form:j_idt61"
  ),
  add_headers(Referer = "http://www.e-grunt.ba/"),
  encode = "form",
  verbose()
)
But this just returns the content of the home page.
I know it is easier with (R)Selenium, but I would like to do it with httr and POST if possible.

I have found a way to scrape this JSF (JavaServer Faces) site. I am providing the code in case somebody needs something similar:
start_session <- function() {
  p <- html_session(
    "http://www.e-grunt.ba",
    user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
  )
  viewState <- p %>% html_nodes("input") %>% .[[2]] %>% html_attr("value")
  p <- rvest:::request_POST(
    p,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(
      'Referer' = 'http://www.e-grunt.ba'
    ),
    body = list(
      "javax.faces.partial.ajax" = "true",
      "javax.faces.source" = "j_idt8:j_idt15",
      "javax.faces.partial.execute" = "#all",
      "javax.faces.partial.render" = "content",
      "j_idt8:j_idt15" = "j_idt8:j_idt15",
      "j_idt8" = 'j_idt8',
      'javax.faces.ViewState' = viewState
    ),
    encode = "form"
  )
  attr(p, "viewState") <- viewState
  p
}
# EXTRACT METADATA --------------------------------------------------------
p <- start_session()
name_value_pairs <- function(html, css, cnames) {
  x <- read_html(html) %>%
    html_nodes(css) %>%
    html_children() %>%
    html_attr("value")
  y <- read_html(html) %>%
    html_nodes(css) %>%
    html_children() %>%
    html_text()
  df <- cbind.data.frame(x, y, stringsAsFactors = FALSE)
  df <- df[df[, 1] != -1, ]
  colnames(df) <- cnames
  df
}
courts <- name_value_pairs(p$response$content, css = '[id="form:court_input"]', cnames = c("court_id", "court"))
metadata_post <- function(session_zk, view_state, id) {
  p <- rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(
      'Referer' = 'http://www.e-grunt.ba'
    ),
    body = list(
      'javax.faces.partial.ajax' = 'true',
      'javax.faces.source' = 'form:court',
      'javax.faces.partial.execute' = 'form:court',
      'javax.faces.partial.render' = 'msgs msgsBottom form:municipality form:mpart form:cuTransferLast',
      'javax.faces.behavior.event' = 'change',
      'javax.faces.partial.event' = 'change',
      'form' = 'form',
      'g-recaptcha-response' = '',
      'form:court_focus' = '',
      'form:court_input' = id,
      'form:cuTransferLast' = '',
      'form:municipality_focus' = '',
      'form:mpart_focus' = '',
      'form:folder' = '',
      'form:parcel' = '',
      'form:parcelSub' = '',
      'javax.faces.ViewState' = view_state
    ),
    encode = "form"
  )
  return(p)
}
muni_post <- function(session_zk, view_state, id, muni_id) {
  p <- rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(
      'Referer' = 'http://www.e-grunt.ba'
    ),
    body = list(
      'javax.faces.partial.ajax' = 'true',
      'javax.faces.source' = 'form:municipality',
      'javax.faces.partial.execute' = 'form:municipality',
      'javax.faces.partial.render' = 'msgs msgsBottom form:mpart',
      'javax.faces.behavior.event' = 'change',
      'javax.faces.partial.event' = 'change',
      'form' = 'form',
      'g-recaptcha-response' = '',
      'form:court_focus' = '',
      'form:court_input' = id,
      'form:cuTransferLast' = '',
      'form:municipality_focus' = '',
      'form:municipality_input' = muni_id,
      'form:mpart_focus' = '',
      'form:folder' = '',
      'form:parcel' = '',
      'form:parcelSub' = '',
      'javax.faces.ViewState' = view_state
    ),
    encode = "form"
  )
  return(p)
}
metadata_i <- list()
for (i in seq_along(courts$court_id)) {
  print(i)
  p <- metadata_post(p, attributes(p)$viewState, courts$court_id[i])
  muni <- name_value_pairs(p$response$content, css = '[id="form:municipality_input"]', cnames = c("muni_id", "muni"))
  if (nrow(muni) > 1) {
    muni_ko <- list()
    for (j in seq_along(muni$muni_id)) {
      # print(j)
      p <- muni_post(p, attributes(p)$viewState, courts$court_id[i], muni$muni_id[j])
      ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
      if (nrow(ko) == 0) {
        ko <- data.frame(ko_id = NA, ko = NA, stringsAsFactors = FALSE)
      }
      muni_ko[[j]] <- cbind.data.frame(muni[j, ], ko, stringsAsFactors = FALSE)
    }
    metadata_i[[i]] <- cbind.data.frame(courts[i, ], do.call(rbind, muni_ko), stringsAsFactors = FALSE)
  } else {
    ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
    meta <- cbind.data.frame(courts[i, ], muni, stringsAsFactors = FALSE)
    metadata_i[[i]] <- cbind.data.frame(meta, ko, stringsAsFactors = FALSE)
  }
}
metadata <- do.call(rbind, metadata_i)
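# Note: the next definition reuses the name metadata_post and replaces the helper
# used in the loop above; it performs the final search POST (with the reCAPTCHA response).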
metadata_post <- function(session_zk, view_state, recaptcha, court,
                          date = as.character(format.Date(Sys.Date() - 4, "%d.%m.%Y")),
                          muni, ko, zk) {
  p <- rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(
      'Referer' = 'http://www.e-grunt.ba'
    ),
    body = list(
      'form' = 'form',
      'g-recaptcha-response' = recaptcha,
      'form:court_focus' = '',
      'form:court_input' = court,
      'form:cuTransferLast' = date,
      'form:municipality_focus' = '',
      'form:municipality_input' = muni,
      'form:mpart_focus' = '',
      'form:mpart_input' = ko,
      'form:folder' = zk,
      'form:parcel' = '',
      'form:parcelSub' = '',
      'form:j_idt61' = '',
      'javax.faces.ViewState' = view_state
    ),
    encode = "form"
  )
  return(p)
}
# example
result <- break_captcha()
p <- metadata_post(session_zk = p, view_state = attributes(p)$viewState,
                   recaptcha = result, court = metadata$court_id[i],
                   muni = metadata$muni_id[i], ko = metadata$ko_id[i], zk = j)

Implement call retries with httr::RETRY() function in API call (R)

I use the UN Comtrade data API with R.
library(rjson)
get.Comtrade <- function(url = "http://comtrade.un.org/api/get?",
                         maxrec = 50000,
                         type = "C",
                         freq = "A",
                         px = "HS",
                         ps = "now",
                         r,
                         p,
                         rg = "all",
                         cc = "TOTAL",
                         fmt = "json") {
  string <- paste(url,
                  "max=", maxrec, "&",   # maximum no. of records returned
                  "type=", type, "&",    # type of trade (c = commodities)
                  "freq=", freq, "&",    # frequency
                  "px=", px, "&",        # classification
                  "ps=", ps, "&",        # time period
                  "r=", r, "&",          # reporting area
                  "p=", p, "&",          # partner country
                  "rg=", rg, "&",        # trade flow
                  "cc=", cc, "&",        # classification code
                  "fmt=", fmt,           # format
                  sep = "")
  if (fmt == "csv") {
    raw.data <- read.csv(string, header = TRUE)
    return(list(validation = NULL, data = raw.data))
  } else {
    if (fmt == "json") {
      raw.data <- fromJSON(file = string)
      data <- raw.data$dataset
      validation <- unlist(raw.data$validation, recursive = TRUE)
      ndata <- NULL
      if (length(data) > 0) {
        var.names <- names(data[[1]])
        data <- as.data.frame(t(sapply(data, rbind)))
        ndata <- NULL
        for (i in 1:ncol(data)) {
          data[sapply(data[, i], is.null), i] <- NA
          ndata <- cbind(ndata, unlist(data[, i]))
        }
        ndata <- as.data.frame(ndata)
        colnames(ndata) <- var.names
      }
      return(list(validation = validation, data = ndata))
    }
  }
}
However, sometimes it fails to connect to the server and I need to run the code several times before it starts working. The solution given here, using the RETRY() function, which retries a request until it succeeds, seems attractive.
However, I have some difficulties implementing this function in the code given above. Has anybody used it before and knows how to recode it?
An API call using httr::RETRY could look like the following:
library(httr)
library(jsonlite)
res <- RETRY(
  verb = "GET",
  url = "http://comtrade.un.org/",
  path = "api/get",
  encode = "json",
  times = 3,
  query = list(
    max = 50000,
    type = "C",
    freq = "A",
    px = "HS",
    ps = "now",
    r = 842,
    p = "124,484",
    rg = "all",
    cc = "TOTAL",
    fmt = "json"
  )
)
# alternative: returns the dataset as a `list`:
# parsed_content <- content(res, as = "parsed")
# returns the dataset as a `data.frame`:
json_content <- content(res, as = "text")
parsed_content <- parse_json(json_content, simplifyVector = TRUE)
parsed_content$validation
parsed_content$dataset
I'd suggest rewriting the get.Comtrade function using httr:
get.Comtrade <- function(verb = "GET",
                         url = "http://comtrade.un.org/",
                         path = "api/get",
                         encode = "json",
                         times = 3,
                         max = 50000,
                         type = "C",
                         freq = "A",
                         px = "HS",
                         ps = "now",
                         r,
                         p,
                         rg = "all",
                         cc = "TOTAL",
                         fmt = "json") {
  res <- httr::RETRY(
    verb = verb,
    url = url,
    path = path,
    encode = encode,
    times = times,
    query = list(
      max = max,
      type = type,
      freq = freq,
      px = px,
      ps = ps,
      r = r,
      p = p,
      rg = rg,
      cc = cc,
      fmt = fmt
    )
  )
  jsonlite::parse_json(content(res, as = "text"), simplifyVector = TRUE)
}
s1 <- get.Comtrade(r = "842", p = "124,484", times = 5)
print(s1)
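If the connection is flaky rather than fully down, RETRY() also exposes controls over how long it waits between attempts and which status codes should stop the retrying early. A minimal sketch with illustrative (not tuned) values:
library(httr)

# Retry up to 5 times, with randomized exponential backoff (roughly up to
# pause_base * 2^attempt seconds, capped at pause_cap), and give up
# immediately on a 404 since retrying a missing resource will not help.
res <- RETRY(
  verb = "GET",
  url = "http://comtrade.un.org/",
  path = "api/get",
  query = list(ps = "now", r = 842, p = "124,484", fmt = "json"),
  times = 5,
  pause_base = 1,
  pause_cap = 30,
  terminate_on = 404
)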
Please see this and this for more information on library(httr).

Strange R package behaviour: "could not find function"

I have some functionality which works fine outside of a package. But when I put it into a package, run devtools::load_all(), and try to run one of the functions (DTL_similarity_search_results_fast), another function (DTL_similarity_search) that should be loaded by the package is not found when it is called inside DTL_similarity_search_results_fast.
The code is:
messagef <- function(...) message(sprintf(...))
printf <- function(...) print(sprintf(...))

pattern_to_vec <- function(pattern, as_int = F, keep_list = FALSE) {
  ret <- strsplit(pattern, ",")
  if (length(pattern) == 1 && !keep_list)
    ret <- ret[[1]]
  if (as_int) {
    ret <- lapply(ret, as.integer)
  }
  ret
}

DTL_similarity_search <- function(search_pattern = "1,2,1,2,1,2,1,2",
                                  transformation = "interval",
                                  database_names = "dtl,wjazzd,omnibook",
                                  metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
                                  filter_category = "0",
                                  minimum_similarity = 1.0,
                                  max_edit_distance = NA,
                                  max_length_difference = 0) {
  url <- suppressWarnings(httr::modify_url("https://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/similar"))
  if (is.na(max_edit_distance)) {
    max_edit_distance <- purrr::map_int(pattern_to_vec(search_pattern, keep_list = T), length) %>% min()
  }
  messagef("[DTL API] Starting search for %s", search_pattern)
  resp <- suppressWarnings(httr::POST(url, body = list(n_gram = search_pattern,
                                                       transformation = transformation,
                                                       database_names = database_names,
                                                       metadata_filters = metadata_filters,
                                                       filter_category = filter_category,
                                                       minimum_similarity = minimum_similarity,
                                                       max_edit_distance = max_edit_distance,
                                                       max_length_difference = max_length_difference,
                                                       filter_category = 0),
                                      encode = "form"))
  #browser()
  #print(httr::content(resp, "text"))
  if (httr::http_error(resp)) {
    messagef(
      "[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
      httr::status_code(resp),
      "",  # parsed$message,
      ""   # parsed$documentation_url
    )
    return(NULL)
  }
  parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
  messagef("[DTL API] Retrieved search ID %s of for pattern %s", parsed$search_id, search_pattern)
  parsed$search_id
}

DTL_get_results <- function(search_id) {
  url <- suppressWarnings(httr::modify_url("http://staging-dtl-pattern-api.hfm-weimar.de/", path = "/patterns/get"))
  #messagef("[DTL API] Retrieving results for search_id %s", search_id)
  resp <- suppressWarnings(httr::GET(url, query = list(search_id = search_id)))
  if (httr::http_error(resp)) {
    messagef(
      "[DTL API] Similarity Search request failed [%s]\n%s\n<%s>",
      httr::status_code(resp),
      "",  # parsed$message,
      ""   # parsed$documentation_url
    )
    return(NULL)
  }
  print(httr::content(resp, "text"))
  #browser()
  parsed <- jsonlite::fromJSON(httr::content(resp, "text"), simplifyVector = FALSE)
  messagef("[DTL API] Retrieved %s lines for search_id %s", length(parsed), search_id)
  purrr::map_dfr(parsed, function(x) {
    if (is.null(x$within_single_phrase)) {
      x$within_single_phrase <- FALSE
    }
    #browser()
    tibble::as_tibble(x) %>% dplyr::mutate(melid = as.character(melid))
  })
}

DTL_similarity_search_results <- function(search_patterns = "1,2,1,2,1,2,1,2",
                                          transformation = "interval",
                                          database_names = "dtl,wjazzd,omnibook",
                                          metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
                                          filter_category = "0",
                                          minimum_similarity = 1.0,
                                          max_edit_distance = NA,
                                          max_length_difference = 0) {
  results <- tibble::tibble()
  if (is.na(max_edit_distance)) {
    max_edit_distance <- purrr:::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
  }
  for (pattern in search_patterns) {
    print('DTL_similarity_search')
    print(DTL_similarity_search)
    search_id <- DTL_similarity_search(pattern,
                                       transformation,
                                       database_names,
                                       metadata_filters,
                                       filter_category,
                                       minimum_similarity,
                                       max_edit_distance = max_edit_distance,
                                       max_length_difference = max_length_difference)
    if (is.null(search_id)) {
      next
    }
    ret <- DTL_get_results(search_id)
    if (!is.null(ret) && nrow(ret) > 0) {
      ret$search_pattern <- pattern
    }
    results <- dplyr::bind_rows(results, ret)
  }
  #browser()
  if (nrow(results))
    results %>% dplyr::distinct(melid, start, length, .keep_all = T)
}

DTL_similarity_search_results_fast <- function(search_patterns = "1,2,1,2,1,2,1,2",
                                               transformation = "interval",
                                               database_names = "dtl,wjazzd,omnibook",
                                               metadata_filters = '{"dtl": {}, "wjazzd": {}, "esac": {}, "omnibook": {}}',
                                               filter_category = "0",
                                               minimum_similarity = 1.0,
                                               max_edit_distance = NA,
                                               max_length_difference = 0) {
  if (is.na(max_edit_distance)) {
    max_edit_distance <- purrr::map_int(pattern_to_vec(search_patterns, keep_list = T), length) %>% min()
  }
  future::plan(future::multisession)
  results <- furrr:::future_map_dfr(search_patterns, function(pattern) {
    print('DTL_similarity_search2')
    search_id <- DTL_similarity_search(pattern,
                                       transformation,
                                       database_names,
                                       metadata_filters,
                                       filter_category,
                                       minimum_similarity,
                                       max_edit_distance = max_edit_distance,
                                       max_length_difference = max_length_difference)
    if (is.null(search_id)) {
      return(tibble::tibble())
    }
    ret <- DTL_get_results(search_id)
    if (!is.null(ret) && nrow(ret) > 0) ret$search_pattern <- pattern
    ret
  })
  #browser()
  results %>% dplyr::distinct(melid, start, length, .keep_all = TRUE)
}
Then after load_all() when I try to run:
res <- DTL_similarity_search_results_fast()
I get:
Error in DTL_similarity_search(pattern, transformation,
database_names, : could not find function "DTL_similarity_search
but running a similar, different function works using the same procedure:
res <- DTL_similarity_search_results()

R/Rshiny adding item to list of factors

Why is this so difficult?! I have what I believe is a factor vector, and I want to add an item to it so I can use it further down the road.
I want to add "memo.txt" to the factor vector filenames.
I have figured out how to add a factor level to the vector, but not the item itself.
levels(filenames) <- c(levels(filenames), "memo.txt")
The specific section I am working in is here:
observeEvent(input$download, {
  filenames <- na.omit(data[input$tbl1_rows_selected, "file_name"])
  # I need to add items to "filenames" here. I then display "test" to make sure
  # those items exist in "filenames" - i.e., I want to add "memo.txt" to filenames.
  output$test <- renderTable(filenames)
  files <- file.path(".", "www", filenames)
  URIs <- lapply(seq_along(files), function(i){
    URI <- dataURI(file = files[i])
    list(filename = filenames[i], uri = substr(URI, 14, nchar(URI)))
  })
  table <- fromJSON(toJSON(input$appts_data), simplifyDataFrame = FALSE)
  session$sendCustomMessage(
    "download",
    list(table = table, URIs = URIs)
  )
})
The entire code:
library(shiny)
library(timevis)
library(lubridate)
library(dplyr)
library(jsonlite)
library(base64enc)

starthour <- 8
today <- as.character(Sys.Date())
todayzero <- paste(today, "00:00:00")
todayAM <- paste(today, "07:00:00")
todayPM <- paste(today, "18:00:00")

items <- data.frame(
  category = c("Room", "IceBreaker", "Activity", "Break"),
  group = c(1, 2, 3, 4),
  className = c("red_point", "blue_point", "green_point", "purple_point"),
  content = c("Big Room", "Introductions", "Red Rover", "Lunch"),
  length = c(480, 60, 120, 90),
  file_name = c("Toolkit_placeholder.pdf", NA, "Placeholder.txt", "Toolkit_placeholder.pdf")
)

groups <- data.frame(id = items$group, content = items$category)

data <- items %>% mutate(
  id = 1:4,
  start = as.POSIXct(todayzero) + hours(starthour),
  end = as.POSIXct(todayzero) + hours(starthour) + minutes(items$length)
)

js <- "
function downloadZIP(x){
  var csv = Papa.unparse(x.table);
  var URIs = x.URIs;
  domtoimage.toPng(document.getElementById('appts'), {bgcolor: 'white'})
    .then(function (dataUrl) {
      var zip = new JSZip();
      var idx = dataUrl.indexOf('base64,') + 'base64,'.length;
      var content = dataUrl.substring(idx);
      zip.file('timeline.png', content, {base64: true})
         .file('timeline.csv', btoa(csv), {base64: true});
      for(let i=0; i < URIs.length; ++i){
        zip.file(URIs[i].filename, URIs[i].uri, {base64: true});
      }
      zip.generateAsync({type:'base64'}).then(function (b64) {
        var link = document.createElement('a');
        link.download = 'mytimeline.zip';
        link.href = 'data:application/zip;base64,' + b64;
        link.click();
      });
    });
}
$(document).on('shiny:connected', function(){
  Shiny.addCustomMessageHandler('download', downloadZIP);
});"

ui <- fluidPage(
  tags$head(
    tags$script(src = "https://cdnjs.cloudflare.com/ajax/libs/dom-to-image/2.6.0/dom-to-image.min.js"),
    tags$script(src = "https://cdnjs.cloudflare.com/ajax/libs/jszip/3.5.0/jszip.min.js"),
    tags$script(src = "https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.2.0/papaparse.min.js"),
    tags$script(HTML(js)),
    tags$style(
      HTML(
        "
        .red_point { border-color: red; border-width: 2px; }
        .blue_point { border-color: blue; border-width: 2px; }
        .green_point { border-color: green; border-width: 2px; }
        .purple_point { border-color: purple; border-width: 2px; }
        "
      )
    )
  ),
  DT::dataTableOutput("tbl1"),
  conditionalPanel(
    condition = "typeof input.tbl1_rows_selected !== 'undefined' && input.tbl1_rows_selected.length > 1",
    actionButton(class = "btn-success",
                 "button2",
                 "GENERATE TIMELINE")
  ),
  conditionalPanel(
    condition = "input.button2 > 0",
    timevisOutput("appts"),
    actionButton("download", "Download timeline", class = "btn-success"),
    conditionalPanel(
      condition = "input.download > 0",
      tableOutput("test")
    )
  )
)

server <- function(input, output, session) {
  output$tbl1 <- DT::renderDataTable({
    data
  },
  caption = 'Select desired options and scroll down to continue.',
  selection = 'multiple',
  class = "display nowrap compact",
  extensions = 'Scroller',
  options = list(
    dom = 'Bfrtip',
    paging = FALSE,
    columnDefs = list(list(visible = FALSE))
  ))

  observeEvent(input$button2, {
    row_data <- data[input$tbl1_rows_selected, ]
    output$appts <- renderTimevis(timevis(
      data = row_data,
      groups = groups,
      fit = TRUE,
      options = list(
        editable = TRUE,
        multiselect = TRUE,
        align = "center",
        stack = TRUE,
        start = todayAM,
        end = todayPM,
        showCurrentTime = FALSE,
        showMajorLabels = FALSE
      )
    ))
    file_list <- as.data.frame(row_data$file_name)
  })

  observeEvent(input$download, {
    filenames <- na.omit(data[input$tbl1_rows_selected, "file_name"])
    # levels(filenames) <- c(levels(filenames), "memo.txt")
    # test <- "memo.txt"
    # browser()
    # filenames <- append(filenames, test)
    # levels(filenames) <- c(levels(filenames), "memo.txt")
    output$test <- renderTable(filenames)
    files <- file.path(".", "www", filenames)
    URIs <- lapply(seq_along(files), function(i){
      URI <- dataURI(file = files[i])
      list(filename = filenames[i], uri = substr(URI, 14, nchar(URI)))
    })
    table <- fromJSON(toJSON(input$appts_data), simplifyDataFrame = FALSE)
    session$sendCustomMessage(
      "download",
      list(table = table, URIs = URIs)
    )
  })
}

shinyApp(ui, server)
EDIT w/answer(ish)
After trying (and failing) like so many others to deal with the factors, I wised up and set my original "items" data frame to stringsAsFactors = FALSE.
This is by far the easiest solution. From there the following works:
items <- data.frame(
  category = c("Room", "IceBreaker", "Activity", "Break"),
  group = c(1, 2, 3, 4),
  className = c("red_point", "blue_point", "green_point", "purple_point"),
  content = c("Big Room", "Introductions", "Red Rover", "Lunch"),
  length = c(480, 60, 120, 90),
  file_name = c("Toolkit_placeholder.pdf", NA, "Placeholder.txt", "Toolkit_placeholder.pdf"),
  stringsAsFactors = FALSE
)
observeEvent(input$download, {
  filenames <- na.omit(data[input$tbl1_rows_selected, "file_name"])
  static_files <- "memo.txt"
  filenames <- append(filenames, static_files)
  output$test <- renderTable(filenames)
  files <- file.path(".", "www", filenames)
  URIs <- lapply(seq_along(files), function(i){
    URI <- dataURI(file = files[i])
    list(filename = filenames[i], uri = substr(URI, 14, nchar(URI)))
  })
  table <- fromJSON(toJSON(input$appts_data), simplifyDataFrame = FALSE)
  session$sendCustomMessage(
    "download",
    list(table = table, URIs = URIs)
  )
})
Instead of trying to manipulate the factors, the simplest answer was to set stringsAsFactors = FALSE. I am using R version 3.6; in R 4.0 that is now the default behavior.
I have updated the original question to include the answer.
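For anyone who does need to keep the factor, the safe pattern is to convert to character before appending; a minimal illustration with hypothetical file names (not the app code):
# filenames arrives as a factor when stringsAsFactors = TRUE (the pre-R 4.0 default)
filenames <- factor(c("Placeholder.txt", "Toolkit_placeholder.pdf"))

# converting to character first sidesteps the level bookkeeping entirely
filenames <- c(as.character(filenames), "memo.txt")
filenames
#> [1] "Placeholder.txt"         "Toolkit_placeholder.pdf" "memo.txt"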
Try this code:
observeEvent(input$download, {
  filenames <- na.omit(data[input$tbl1_rows_selected, "file_name"])
  ## added the next seven lines; no other modifications to your code.
  filez <- file.path(".", "www", filenames)
  fnamez <- lapply(seq_along(filez), function(i){
    list(filename = filenames[i])
  })
  f2namez <- list(fnamez, "memo.txt")
  filenamez <- unlist(f2namez)
  filenamez2 <- data.frame(filenamez)
  output$test <- renderTable(filenamez2)
  files <- file.path(".", "www", filenames)
  URIs <- lapply(seq_along(files), function(i){
    URI <- dataURI(file = files[i])
    list(filename = filenames[i], uri = substr(URI, 14, nchar(URI)))
  })
  table <- fromJSON(toJSON(input$appts_data), simplifyDataFrame = FALSE)
  session$sendCustomMessage(
    "download",
    list(table = table, URIs = URIs)
  )
})
No other modification to the remaining part of your code is needed; the rendered test table then lists the selected files plus memo.txt.

Scraping function works only on some computers (R)

We are doing a project with a Shiny app that involves scraping and downloading data frames from a website. We have the following problem: it works on some computers and not others.
We have the same package versions, and we did not make too many requests...
It is not linked to whether it is Windows or Mac, as it works on some Windows and some Mac machines but not others.
Do you have any idea? Could it be in the connection settings?
It is not linked to the Wi-Fi network; we tried on the same Wi-Fi...
Upon request, here is the code and the error messages:
This function is the one we call directly:
scraping_function <- function(search_terms, subreddit,
                              sort_by, time_frame) {
  exctracted_link <- reddit_urls_mod(search_terms, subreddit,
                                     sort_by, time_frame)
  exctracted_data <- reddit_content(exctracted_link[, 5])
  exctracted_data[, 13] <- cleaning_text_function(exctracted_data[, 13])
  return(exctracted_data)
}
These are the functions that this function calls:
Extracting the URLs:
reddit_urls_mod <- function(search_terms = "", subreddit = "",
                            sort_by = "", time_frame = "") {
  if (subreddit == "") {
    subreddit <- NA
  }
  if (search_terms == "") {
    search_terms <- NA
  }
  if (!grepl("^[0-9A-Za-z]*$", subreddit) & !is.na(subreddit)) {
    stop("subreddit must be a sequence of letter and number without special characters and spaces")
  }
  regex_filter = ""
  cn_threshold = 0
  page_threshold = 15
  wait_time = 1
  cached_links = data.frame(date = as.Date(character()),
                            num_comments = numeric(),
                            title = character(),
                            subreddit = character(),
                            URL = character(),
                            link = character())
  if (sort_by != "front_page") {
    if (!grepl("^comments$|^new$|^relevance$|^top$|^front_page$", sort_by)) {
      stop("sort_by must be either 'new', 'comments', 'top', 'relevance' or 'front_page'")
    }
    if (!grepl("^hour$|^day$|^week$|^month$|^year$|^all$", time_frame)) {
      stop("time_frame must be either 'hour', 'day', 'week', 'month', 'year or 'all'")
    }
    sterms = ifelse(is.na(search_terms), NA, gsub("\\s", "+", search_terms))
    subreddit = ifelse(is.na(subreddit), "", paste0("r/", gsub("\\s+", "+", subreddit), "/"))
    sterms = ifelse(is.na(sterms), "", paste0("q=", sterms, "&restrict_sr=on&"))
    sterms_prefix = ifelse(sterms == "", "new", "search")
    time_frame_in = ifelse(is.na(search_terms), "", paste0("t=", time_frame, "&"))
    search_address = search_query = paste0("https://www.reddit.com/",
                                           subreddit, sterms_prefix,
                                           ".json?",
                                           sterms, time_frame_in,
                                           "sort=",
                                           sort_by)
  } else {
    if (is.na(subreddit)) {
      stop("if you choose sort_by = front_page please enter a subreddit")
    }
    search_address = search_query = paste0("https://www.reddit.com/r/",
                                           subreddit,
                                           ".json?")
  }
  next_page = index = ""
  page_counter = 0
  comm_filter = 10000
  while (is.null(next_page) == FALSE & page_counter < page_threshold &
         comm_filter >= cn_threshold & length(index) > 0) {
    search_JSON = tryCatch(RJSONIO::fromJSON(readLines(search_query,
                                                       warn = FALSE)), error = function(e) NULL)
    if (is.null(search_JSON)) {
      stop(paste("Unable to connect to reddit website or invalid subreddit entered"))
    } else if (length(search_JSON$data$children) == 0) {
      stop(paste("This search term returned no results or invalid subreddit entered"))
    } else {
      contents = search_JSON[[2]]$children
      search_permalink = paste0("http://www.reddit.com",
                                sapply(seq(contents), function(x) contents[[x]]$data$permalink))
      search_num_comments = sapply(seq(contents), function(x) contents[[x]]$data$num_comments)
      search_title = sapply(seq(contents), function(x) contents[[x]]$data$title)
      search_score = sapply(seq(contents), function(x) contents[[x]]$data$score)
      search_subreddit = sapply(seq(contents), function(x) contents[[x]]$data$subreddit)
      search_link = sapply(seq(contents), function(x) contents[[x]]$data$url)
      index = which(search_num_comments >= cn_threshold &
                      grepl(regex_filter, search_title, ignore.case = T,
                            perl = T))
      if (length(index) > 0) {
        search_date = format(as.Date(as.POSIXct(unlist(lapply(seq(contents), function(x) contents[[x]]$data$created_utc)),
                                                origin = "1970-01-01")), "%d-%m-%y")
        temp_dat = data.frame(date = search_date,
                              num_comments = search_num_comments,
                              title = search_title,
                              subreddit = search_subreddit,
                              URL = search_permalink,
                              link = search_link,
                              stringsAsFactors = FALSE)[index, ]
        cached_links = as.data.frame(rbind(cached_links,
                                           temp_dat))
        next_page = search_JSON$data$after
        comm_filter = utils::tail(search_num_comments,
                                  1)
        search_query = paste0(search_address, "&after=",
                              next_page)
        page_counter = page_counter + 1
      }
      Sys.sleep(min(2, wait_time))
    }
  }
  final_table = cached_links[!duplicated(cached_links), ]
  if (dim(final_table)[1] == 0) {
    cat(paste("\nNo results retrieved, should be invalid subreddit entered, down server or simply unsuccessful search query :("))
  }
  else {
    remove_row = which(final_table[, 1] == "")
    if (length(remove_row) > 0) {
      final_table = final_table[-remove_row, ]
    }
    return(final_table)
  }
}
Extracting the content:
reddit_content <- function(URL, wait_time = 1) {
  if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
    stop("invalid URL parameter")
  }
  GetAttribute = function(node, feature) {
    Attribute = node$data[[feature]]
    replies = node$data$replies
    reply.nodes = if (is.list(replies))
      replies$data$children
    else
      NULL
    return(list(Attribute, lapply(reply.nodes, function(x) {
      GetAttribute(x, feature)
    })))
  }
  get.structure = function(node, depth = 0) {
    if (is.null(node)) {
      return(list())
    }
    filter = is.null(node$data$author)
    replies = node$data$replies
    reply.nodes = if (is.list(replies))
      replies$data$children
    else
      NULL
    return(list(
      paste0(filter, " ", depth),
      lapply(1:length(reply.nodes),
             function(x)
               get.structure(reply.nodes[[x]], paste0(depth,
                                                      "_", x)))
    ))
  }
  data_extract = data.frame(
    id = numeric(),
    structure = character(),
    post_date = as.Date(character()),
    comm_date = as.Date(character()),
    num_comments = numeric(),
    subreddit = character(),
    upvote_prop = numeric(),
    post_score = numeric(),
    author = character(),
    user = character(),
    comment_score = numeric(),
    controversiality = numeric(),
    comment = character(),
    title = character(),
    post_text = character(),
    link = character(),
    domain = character(),
    URL = character()
  )
  withProgress(message = 'Work in progress', value = 0, min = 0, max = 1, {
    for (i in seq(URL)) {
      if (!grepl("^https?://(.*)", URL[i]))
        URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
                                             "\\1", URL[i]))
      if (!grepl("\\?ref=search_posts$", URL[i]))
        URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
      X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
                 ".json?limit=500")
      raw_data = tryCatch(
        RJSONIO::fromJSON(readLines(X, warn = FALSE)),
        error = function(e)
          NULL
      )
      if (is.null(raw_data)) {
        Sys.sleep(min(1, wait_time))
        raw_data = tryCatch(
          RJSONIO::fromJSON(readLines(X,
                                      warn = FALSE)),
          error = function(e)
            NULL
        )
      }
      if (is.null(raw_data) == FALSE) {
        meta.node = raw_data[[1]]$data$children[[1]]$data
        main.node = raw_data[[2]]$data$children
        if (min(length(meta.node), length(main.node)) > 0) {
          structure = unlist(lapply(1:length(main.node),
                                    function(x)
                                      get.structure(main.node[[x]], x)))
          TEMP = data.frame(
            id = NA,
            structure = gsub("FALSE ",
                             "", structure[!grepl("TRUE", structure)]),
            post_date = format(as.Date(
              as.POSIXct(meta.node$created_utc,
                         origin = "1970-01-01")
            ), "%d-%m-%y"),
            comm_date = format(as.Date(
              as.POSIXct(unlist(lapply(main.node,
                                       function(x) {
                                         GetAttribute(x, "created_utc")
                                       })), origin = "1970-01-01")
            ), "%d-%m-%y"),
            num_comments = meta.node$num_comments,
            subreddit = ifelse(
              is.null(meta.node$subreddit),
              "UNKNOWN",
              meta.node$subreddit
            ),
            upvote_prop = meta.node$upvote_ratio,
            post_score = meta.node$score,
            author = meta.node$author,
            user = unlist(lapply(main.node, function(x) {
              GetAttribute(x, "author")
            })),
            comment_score = unlist(lapply(main.node,
                                          function(x) {
                                            GetAttribute(x, "score")
                                          })),
            controversiality = unlist(lapply(main.node,
                                             function(x) {
                                               GetAttribute(x, "controversiality")
                                             })),
            comment = unlist(lapply(main.node, function(x) {
              GetAttribute(x, "body")
            })),
            title = meta.node$title,
            post_text = meta.node$selftext,
            link = meta.node$url,
            domain = meta.node$domain,
            URL = URL[i],
            stringsAsFactors = FALSE
          )
          TEMP$id = 1:nrow(TEMP)
          if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
            data_extract = rbind(TEMP, data_extract)
          else
            print(paste("missed", i, ":", URL[i]))
        }
      }
      incProgress(amount = 1 / length(URL))
      Sys.sleep(min(2, wait_time))
    }
    # data_extract[,13] <-
    #   cleaning_text_function(data_extract[,13])
  })
  return(data_extract)
}
Cleaning the text:
# uses stri_trans_general() from the stringi package and stopwords::stopwords()
cleaning_text_function <- function(x, stopwords = stopwords_vec) {
  stopwords_vec <- c(stopwords::stopwords("en"), "don", "isn", "gt", "i", "re", "removed", "deleted", "m", "you re", "we ll", "ve", "hasn", "they re", "id", "tl dr", "didn", "wh", "oh", "tl", "dr", "shes", "hes", "aren", "edit", "ok", "ll", "wasn", "shouldn", "t", "doesn", "youre", "going", "still", "much", "many", "also")
  if (is.character(x)) {
    # put accents instead of html codes (only for French)
    Encoding(x) <- 'latin1'
    # take out accents
    x <- stri_trans_general(x, 'latin-ascii')
    x <- unlist(lapply(x, function(x, stopwords = stopwords_vec) {
      # separate words
      x <- unlist(strsplit(x, " "))
      # take out internet links
      x <- x[!grepl("\\S+www\\S+|\\S+https://\\S+|https://\\S+", x)]
      # take out ASCII codes and punctuation
      x <- gsub("\n|[[:punct:]]|[\x01-\x09\x11-\x12\x14-\x1F\x7F]|gt", " ", x)
      # take out standalone single digits
      x <- gsub("(^[0-9]{1}\\s|^[0-9]{1}$|\\s{1}[0-9]{1}$|\\s{1}[0-9]{1}\\s{1})", " ", x)
      # take out spaces at the beginning and end of the string
      x <- gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
      # lowercase
      x <- tolower(x)
      # take out standalone letters
      x <- gsub("(^[a-z]{1}\\s+|^[a-z]{1}$|\\s+[a-z]{1}$|\\s+[a-z]{1}\\s+)", "", x)
      # take out words in the stopwords list
      x <- paste(x[!x %in% stopwords], collapse = " ")
      # rerun the stopwords filter to get rid of stopwords in composed strings
      x <- unlist(strsplit(x, " "))
      x <- gsub("(^[[:blank:]]+|[[:blank:]]+$)", "", x)
      x <- paste(x[!x %in% stopwords], collapse = " ")
      return(x)
    }))
  } else {
    stop("please enter a character vector")
  }
  return(x)
}
And the message we get:
Listening on http://127.0.0.1:7745
Warning in file(con, "r") :
cannot open URL 'https://www.reddit.com/r/news/search.json?q=Greta&restrict_sr=on&t=week&sort=comments': HTTP status was '429 Unknown Error'
Warning: Error in reddit_urls_mod: Unable to connect to reddit website or invalid subreddit entered
126: stop
125: reddit_urls_mod
124: scraping_function
123: eventReactiveHandler
79: df1
72: observeEventHandler
1: runApp
I get an error 429 even on computers that have never made a request before...
Thank you

How to get SSL connection of this function to work with GnuTLS build of RCurl

I'm making an R package that interfaces with the API from opendata.socrata.com.
I've run into a problem that I've tracked down to the build of the RCurl package.
On Windows, with RCurl built against OpenSSL, I have no problems, but on Linux, with GnuTLS, it doesn't work.
You can check the build using curlVersion()$ssl_version.
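For reference, a quick way to check this on each machine:
library(RCurl)

# reports the SSL library this build of libcurl links against,
# typically something like "OpenSSL/..." on Windows and "GnuTLS/..." on Linux
curlVersion()$ssl_version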
Here is the function:
search.Socrata.Views <- function(search = NULL,   ## full
                                 topic = NULL,    ## description
                                 name = NULL,     ## title field search
                                 tags = NULL,
                                 category = NULL,
                                 count = FALSE,
                                 limit = 10,      ## max 200
                                 page = 1,
                                 type = "json"    ## can also be xml
                                 ) {
  require('RCurl')
  require('XML')
  require('rjson')

  ## setting curl options
  capath = system.file("CurlSSL", package = "RCurl")
  cainfo = system.file("CurlSSL", "ca-bundle.crt", package = "RCurl")
  cookie = 'cookiefile.txt'
  curl = getCurlHandle(cookiefile = cookie,
                       cookiejar = cookie,
                       useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en - US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
                       header = FALSE,
                       verbose = TRUE,
                       netrc = FALSE,
                       maxredirs = as.integer(20),
                       followlocation = TRUE,
                       ssl.verifypeer = TRUE,
                       cainfo = cainfo,
                       timeout = 100)

  ## capath doesn't work:: NEED cainfo!
  ## test for existing cainfo:
  if (!file.exists(cainfo)) {
    download.file('http://curl.haxx.se/ca/cacert.pem', cainfo)
  }
  ## test for age of cainfo, if older than 2 weeks get new.
  if (file.exists(cainfo)) {
    file.inf.cainfo <- file.info(cainfo)
    age.cainfo <- Sys.time() - file.inf.cainfo[["mtime"]]
    if (as.numeric(age.cainfo, units = "days") > 14) {
      download.file('http://curl.haxx.se/ca/cacert.pem', cainfo)
    }
  }

  ### Make URL
  baseSocrataUrl <- 'https://opendata.socrata.com/api/views.'
  if (!is.null(category)) {
    category <- match.arg(category, c('Business', 'Fun', 'Personal', 'Education', 'Government'))
  }
  type <- match.arg(type, c('json', 'xml'))

  ## Tag
  if (is.null(tags)) {
    tags <- NULL
  } else {
    tags <- URLencode(paste('&tags=', tags, sep = ''))
  }
  ## Category
  if (is.null(category)) {
    category <- NULL
  } else {
    category <- URLencode(paste('&category=', category, sep = ''))
  }
  ## Limit
  if (limit > 200) {
    limit <- '&limit=200'
  } else {
    limit <- paste('&limit=', limit, sep = '')
  }
  ## search
  if (is.null(search)) {
    search <- NULL
  } else {
    search <- URLencode(paste('&full=', search, sep = ''))
  }
  ## page
  page <- paste('&page=', page, sep = '')
  ## topic
  if (is.null(topic)) {
    topic <- NULL
  } else {
    topic <- URLencode(paste('&description=', topic, sep = ''))
  }
  ## name
  if (is.null(name)) {
    name <- NULL
  } else {
    name <- URLencode(paste('&name=', name, sep = ''))
  }
  ## count
  if (count) {
    count <- '&count=TRUE'
  } else {
    count <- NULL
  }

  ### Retrieving html
  SocrataUrl <- paste(baseSocrataUrl, type, '?', page, tags, category, limit, search, name, topic, count, sep = '')
  SocrataHtml <- getURL(SocrataUrl, curl = curl)
  assign('search.Socrata.Call', SocrataUrl, envir = .GlobalEnv)
  if (type == 'json') {
    SocrataTable <- fromJSON(SocrataHtml)
    SocrataTable <- lapply(SocrataTable, function(x){ data.frame(x, stringsAsFactors = FALSE) })
    SocrataTable.df <- data.frame(matrix(nrow = length(SocrataTable), ncol = max(unlist(lapply(SocrataTable, length)))))
    names(SocrataTable.df) <- names(SocrataTable[lapply(SocrataTable, length) == max(unlist(lapply(SocrataTable, length)))][[1]])
    for (i in 1:length(SocrataTable)) {
      for (j in 1:length(names(SocrataTable[[i]]))) {
        SocrataTable.df[i, names(SocrataTable[[i]])[j]] <- SocrataTable[[i]][i, names(SocrataTable[[i]])[j]]
      }
    }
    rm(curl)
    gc()
    return(SocrataTable.df)
  } else {
    rm(curl)
    gc()
    return(SocrataHtml)
  }
}
Run the function with:
socrata.views <- search.Socrata.Views(topic = 'airplane')
print(socrata.views)
I haven't tested your code under Linux, but I can say that you're constructing URLs the hard way, which may be causing bugs. Using getForm, you can simplify your code considerably.
params <- list(
  category = category,
  tags = tags,
  limit = min(limit, 200)
  # etc.
)
params <- Filter(Negate(is.null), params)
getForm(baseSocrataUrl, .params = params, curl = curl)
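The Filter(Negate(is.null), params) step is what lets you build the list unconditionally and still omit the unused parameters; a quick illustration with hypothetical values:
params <- list(category = "Fun", tags = NULL, search = "airplane", limit = 10)

# drop every NULL entry so only the supplied parameters end up in the request
Filter(Negate(is.null), params)
#> $category
#> [1] "Fun"
#>
#> $search
#> [1] "airplane"
#>
#> $limit
#> [1] 10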
