Faster way to download multiple files in R - r

I write a small downloader in R, in order to download some log files from remote server in one run:
file_remote <- fun_to_list_URLs()
file_local <- fun_to_gen_local_paths()
credentials <- "usr/pwd"
downloader <- function(file_remote, file_local, credentials) {
data_bin <- RCurl::getBinaryURL(
file_remote,
userpwd = credentials,
ftp.use.epsv = FALSE,
forbid.reuse = TRUE
)
writeBin(data_bin, file_local)
}
purrr::walk2(
file_remote,
file_local,
~ downloader(
file_remote = .x,
file_local = .y,
credentials = credentials
)
)
This works, but slowly, especially compare it to some FTP clients like WinSCP, downloading 64 log files, each 2kb, takes minutes.
Is there a faster way to download a lot of files in R?

The curl package has a way to perform async requests, which means that downloads are performed simultaneously instead of one after another. Especially with smaller files this should give you a large boost in performance. Here is a barebone function that does that
# total_con: max total concurrent connections.
# host_con: max concurrent connections per host.
# print: print status of requests at the end.
multi_download <- function(file_remote,
file_local,
total_con = 1000L,
host_con = 1000L,
print = TRUE) {
# check for duplication (deactivated for testing)
# dups <- duplicated(file_remote) | duplicated(file_local)
# file_remote <- file_remote[!dups]
# file_local <- file_local[!dups]
# create pool
pool <- curl::new_pool(total_con = total_con,
host_con = host_con)
# function performed on successful request
save_download <- function(req) {
writeBin(req$content, file_local[file_remote == req$url])
}
# setup async calls
invisible(
lapply(
file_remote, function(f)
curl::curl_fetch_multi(f, done = save_download, pool = pool)
)
)
# all created requests are performed here
out <- curl::multi_run(pool = pool)
if (print) print(out)
}
Now we need some test files to compare it to your baseline approach. I use covid data from the Johns Hopkins University GitHub page as it contains many small csv files which should be similar to your files.
file_remote <- paste0(
"https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/",
format(seq(as.Date("2020-03-03"), as.Date("2022-06-01"), by = "day"), "%d-%m-%Y"),
".csv"
)
file_local <- paste0("/home/johannes/Downloads/test/", seq_along(file_remote), ".bin")
We could also infer the file names from the URLs but I assume that is not what you want. So now lets compare the approaches for these 821 files:
res <- bench::mark(
baseline(),
multi_download(file_remote,
file_local,
print = FALSE),
check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
summary(res)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec`
#> <bch:expr> <bch:> <bch:> <dbl>
#> 1 baseline() 2.8m 2.8m 0.00595
#> 2 multi_download(file_remote, file_local, print = FALSE) 12.7s 12.7s 0.0789
#> # … with 2 more variables: mem_alloc <bch:byt>, `gc/sec` <dbl>
summary(res, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec`
#> <bch:expr> <dbl> <dbl> <dbl>
#> 1 baseline() 13.3 13.3 1
#> 2 multi_download(file_remote, file_local, print = FALSE) 1 1 13.3
#> # … with 2 more variables: mem_alloc <dbl>, `gc/sec` <dbl>
The new approach is 13.3 times faster than the original one. I would assume that the difference will be bigger the more files you have. Note though, that this benchmark is not perfect as my internet speed fluctuates quite a bit.
The function should also be improved in terms of handling errors (currently you get a message how many requests have been successful and how many errored, but no indication which files exist). My understanding is also that multi_run writes files to the memory before save_download writes them to disk. With small files this is fine, but it might be an issue with larger ones.
baseline function
baseline <- function() {
credentials <- "usr/pwd"
downloader <- function(file_remote, file_local, credentials) {
data_bin <- RCurl::getBinaryURL(
file_remote,
userpwd = credentials,
ftp.use.epsv = FALSE,
forbid.reuse = TRUE
)
writeBin(data_bin, file_local)
}
purrr::walk2(
file_remote,
file_local,
~ downloader(
file_remote = .x,
file_local = .y,
credentials = credentials
)
)
}
Created on 2022-06-05 by the reprex package (v2.0.1)

Related

How to close ssh (tunel) connection in R?

I have ssh + tunel connection executed like:
target <- paste0("host:3306")
ProxySev <- "name#1.1.1.1"
keyfile <- "path/to/key"
port <- 3307
cmd <- paste0('ssh::ssh_tunnel(ssh::ssh_connect(host = "',
ProxySev, '", keyfile ="', keyfile, '"), port = ', port, ', target = "', target, '")')
pid <- sys::r_background(
args = c("-e", cmd),
std_out = TRUE,
std_err = TRUE
)
To close it I use tools::pskill(pid), but how to do that not knowing the pid? Eg without proper closing previous connection and trying to run again
pid <- sys::r_background(
args = c("-e", cmd),
std_out = TRUE,
std_err = TRUE
)
I get the message:
Error: System failure for: bind() (Only one usage of each socket address (protocol/network address/port) is normally permitted)
Execution halted
Warning message:
Disconnecting from unused ssh session. Please use ssh_disconnect()
How to use ssh_disconnect() in such case? Or how to get pid of this background process?
Thanks!
I think you may be able to infer which process by using tools in the ps package. I'll demonstrate with Sys.sleep instead of an ssh connection, but the steps should be translatable.
pid <- sys::r_background(args=c("-e", "Sys.sleep(120)"), std_out = FALSE, std_err = FALSE)
procs <- ps::ps()
subset(procs, name == "Rterm.exe")
# # A tibble: 3 x 11
# pid ppid name username status user system rss vms created ps_handle
# <int> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dttm> <I<list>>
# 1 22284 4224 Rterm.exe "myhost\\r2" running 0.0938 0.0156 47214592 68915200 2022-09-21 12:43:32.263 <ps_handl>
# 2 8472 14464 Rterm.exe "myhost\\r2" running 6.5 0.719 267501568 281235456 2022-09-20 21:31:30.642 <ps_handl>
# 3 4224 14464 Rterm.exe "myhost\\r2" running 4768. 272. 1433354240 31551180800 2022-09-17 22:46:15.056 <ps_handl>
pids <- subset(procs, name == "Rterm.exe")$pid
lapply(setNames(nm = pids), function(pid) ps::ps_cmdline(ps::ps_handle(pid)))
# $`22284`
# [1] "c:\\R\\R-4.1.2\\bin\\x64\\Rterm" "-e" "Sys.sleep(120)"
# $`8472`
# [1] "c:\\R\\R-4.1.2\\bin\\x64\\Rterm.exe" "--ess" "--no-save"
# $`4224`
# [1] "c:\\R\\R-4.1.2\\bin\\x64\\Rterm.exe" "--ess" "--no-save"
(tools::pskill(22284))
# [1] TRUE
I wrapped pskill in parens since its return value (logical indicating if the process was found and killed) is invisible, the parens cause it to be printed.

sparklyr connecting to kafka streams/topics

I'm having difficulty connecting to and retrieving data from a kafka instance. Using python's kafka-python module, I can connect (using the same connection parameters), see the topic, and retrieve data, so the network is viable, there is no authentication problem, the topic exists, and data exists in the topic.
On R-4.0.5 using sparklyr-1.7.2, connecting to kafka-2.8
library(sparklyr)
spark_installed_versions()
# spark hadoop dir
# 1 2.4.7 2.7 /home/r2/spark/spark-2.4.7-bin-hadoop2.7
# 2 3.1.1 3.2 /home/r2/spark/spark-3.1.1-bin-hadoop3.2
sc <- spark_connect(master = "local", version = "2.4",
config = list(
sparklyr.shell.packages = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0"
))
system.time({
Z <- stream_read_kafka(
sc,
options = list(
kafka.bootstrap.servers="11.22.33.44:5555",
subscribe = "mytopic"))
})
# user system elapsed
# 0.080 0.000 10.349
system.time(collect(Z))
# user system elapsed
# 1.336 0.136 8.537
Z
# # Source: spark<?> [inf x 7]
# # … with 7 variables: key <lgl>, value <lgl>, topic <chr>, partition <int>, offset <dbl>, timestamp <dbl>, timestampType <int>
My first concern is that I'm not seeing data from the topic, I appear to be getting a frame suggesting (meta)data about topics in general, and there is nothing found. With this topic, there are 800 strings (json), modest-to-small sizes. My second concern is that it takes almost 20 seconds to realize this problem (though I suspect that's a symptom of the larger connection problem).
For confirmation, this works:
cons = import("kafka")$KafkaConsumer(bootstrap_servers="11.22.33.44:5555", auto_offset_reset="earliest", max_partition_fetch_bytes=10240000L)
cons$subscribe("mytopic")
msg <- cons$poll(timeout_ms=30000L, max_records=99999L)
length(msg)
# [1] 1
length(msg[[1]])
# [1] 801
as.character( msg[[1]][[1]]$value )
# [1] "{\"TrackId\":\"c839dcb5-...\",...}"
(and those commands complete almost instantly, nothing like the 8-10sec lag above).
The kafka instance to which I'm connecting is using ksqlDB, though I don't think that's a requirement in order to need to use the "org.apache.spark:spark-sql-kafka-.." java package.
(Ultimately I'll be using stateless/stateful procedures on streaming data, including joins and window ops, so I'd like to not have to re-implement that from scratch on the simple kafka connection.)

Using code_to_plan and target(..., format = "fst") in drake

I really like using the code_to_plan function when constructing drake plans. I also really using target(..., format = "fst") for big files. However I am struggling to combine these two workflows. For example if I have this _drake.R file:
# Data --------------------------------------------------------------------
data_plan = code_to_plan("code/01-data/data.R")
join_plan = code_to_plan("code/01-data/merging.R")
# Cleaning ----------------------------------------------------------------
cleaning_plan = code_to_plan("code/02-cleaning/remove_na.R")
# Model -------------------------------------------------------------------
model_plan = code_to_plan("code/03-model/model.R")
# Combine Plans
dplan = bind_plans(
data_plan,
join_plan,
cleaning_plan,
model_plan
)
config <- drake_config(dplan)
This works fine when called with r_make(r_args = list(show = TRUE))
As I understand it though target can only be used within a drake_plan. If I try something like this:
dplan2 <- drake_plan(full_plan = target(dplan, format = "fst"))
config <- drake_config(dplan2)
I get an r_make error like this:
target full_plan
Error in fst::write_fst(x = value$value, path = tmp) :
Unknown type found in column.
In addition: Warning message:
You selected fst format for target full_plan, so drake will convert it from class c("drake_plan", "tbl_df", "tbl", "data.frame") to a plain data frame.
Error:
-->
in process 18712
See .Last.error.trace for a stack trace.
So ultimately my question is where does one specify special data formats for targets when you are using code_to_plan?
Edit
Using #landau helpful suggestion, I defined this function:
add_target_format <- function(plan) {
# Get a list of named commands.
commands <- plan$command
names(commands) <- plan$target
# Turn it into a good plan.
do.call(drake_plan, commands)
}
So that this would work:
dplan = bind_plans(
data_plan,
join_plan,
cleaning_plan,
model_plan
) %>%
add_target_format()
It is possible, but not convenient. Here is a workaround.
writeLines(
c(
"x <- small_data()",
"y <- target(large_data(), format = \"fst\")"
),
"script.R"
)
cat(readLines("script.R"), sep = "\n")
#> x <- small_data()
#> y <- target(large_data(), format = "fst")
library(drake)
# Produces a plan, but does not process target().
bad_plan <- code_to_plan("script.R")
bad_plan
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr>
#> 1 x small_data()
#> 2 y target(large_data(), format = "fst")
# Get a list of named commands.
commands <- bad_plan$command
names(commands) <- bad_plan$target
# Turn it into a good plan.
good_plan <- do.call(drake_plan, commands)
good_plan
#> # A tibble: 2 x 3
#> target command format
#> <chr> <expr> <chr>
#> 1 x small_data() <NA>
#> 2 y large_data() fst
Created on 2019-12-18 by the reprex package (v0.3.0)

Accessing an API with R

I am trying to retrieve information from an API, which gives the name of the product from the barcode, through the API.
I am using the httr::GET().
The URL needed for the API contains the barcode itself, but I do not know how to automate the system so it can read the barcode contained in every entry, and plugging it into the url without me copying and pasting the code manually in the script.
one_code <- GET("api.upcdatabase.org/json/aebfed7a26f24a05efd7f77‌​749dc2fcc/…")
result <- content(one_code)
result$description
A couple extra things to consider.
First, the site provides https for the API so it should be used since you're exposing your API key on any network you make requests from otherwise.
Test the core HTTP status code and halt on major HTTP errors (not API errors).
You should also put your API key in something like an environment variable so it never ends up in scripts or GitHub repo commits. Use ~/.Renviron (make a single line entry for UPCDATABASE_API_KEY=your_key and then restart R).
You should handle error and success conditions and consider returning a data frame so you can have all the fields in a tidy, accessible fashion.
Finally, do some basic type conversion prior to returning the values to make return field values easier to use.
library(httr)
library(jsonlite)
library(purrr)
get_upc_code_info <- function(code, api_key=Sys.getenv("UPCDATABASE_API_KEY")) {
URL <- sprintf("https://api.upcdatabase.org/json/%s/%s", api_key, code)
res <- GET(URL)
stop_for_status(res)
res <- content(res, as="text", encoding="UTF-8")
res <- fromJSON(res, flatten=TRUE)
if (res$valid == "true") {
res <- flatten_df(res)
res$valid <- TRUE
res$avg_price <- as.numeric(res$avg_price)
res$rate_up <- as.numeric(res$rate_up)
res$rate_down <- as.numeric(res$rate_down)
return(res)
} else {
message(res$reason)
return(data.frame(number = code, valid = FALSE, stringsAsFactors=FALSE))
}
}
xdf <- get_upc_code_info("0111222333446")
dplyr::glimpse(xdf)
## Observations: 1
## Variables: 8
## $ valid <lgl> TRUE
## $ number <chr> "0111222333446"
## $ itemname <chr> "UPC Database Testing Code"
## $ alias <chr> "Testing Code"
## $ description <chr> "http://upcdatabase.org/code/0111222333446"
## $ avg_price <dbl> 123.45
## $ rate_up <dbl> 14
## $ rate_down <dbl> 3
Similar to what Aurèle suggested, you can use the function to make it easier to get multiple codes. Since this function returns a data frame, you can easily get a larger, complete data frame from individual lookups with purrr::map_df():
codes <- c("0057000006976", "3228881010711", "0817346023170", "44xx4444444")
xdf <- map_df(codes, get_upc_code_info)
dplyr::glimpse(xdf)
## Observations: 4
## Variables: 8
## $ valid <lgl> TRUE, TRUE, TRUE, FALSE
## $ number <chr> "0057000006976", "3228881010711", "0817346023170",...
## $ itemname <chr> "Heinz Original Beans (Pork & Molasses)", "Lip...
## $ alias <chr> "", "", "", NA
## $ description <chr> "", "Boîte de 20 sachets", "", NA
## $ avg_price <dbl> NA, NA, 39.99, NA
## $ rate_up <dbl> 0, 0, 1, NA
## $ rate_down <dbl> 0, 0, 0, NA
Consider putting finishing touches on this, adding a function to POST to the API, possibly make a Shiny app so folks can submit new entries through R and turning it into a package. You might even get extra free credits on the site if you do so.
Store your barcodes in one data structure (list or vector):
barcodes <- c(
"aebfed7a26f24a05efd7f77749dc2fcc",
"xyz1234567f24a05efd7f77749dc2fcc",
"pqr9876543f24a05efd7f77749dc2fcc"
)
Write a function:
scrape <- function(barcode) {
sample=GET(paste0("api.upcdatabase.org/json/", barcode, "/rest/of/the/url"))
result=content(sample)
result$description
}
And apply:
res <- lapply(barcodes, scrape)
The results are stored in a single list, so that they're easier to manipulate.

How to compute large object's hash value in R?

I have large objects in R, that barely fits in my 16GB memory (a data.table database of >4M records, >400 variables).
I'd like to have a hash function that will be used to confirm, that the database loaded into R is not modified.
One fast way to do that is to calculate the database's hash with the previously stored hash.
The problem is that digest::digest function copies (serializes) the data, and only after all data are serialized it will calculate the hash. Which is too late on my hardware... :-(
Does anyone know about a way around this problem?
There is a poor's man solution: save the object into the file, and calculate the hash of the file. But it introduces large, unnecessary overhead (I have to make sure there is a spare on HDD for yet another copy, and need to keep track of all the files that may not be automatically deleted)
Similar problem has been described in our issue tracker here:
https://github.com/eddelbuettel/digest/issues/33
The current version of digest can read a file to compute the hash.
Therefore, at least on Linux, we can use a named pipe which will be read by the digest package (in one thread) and from the other side data will be written by another thread.
The following code snippet shows how we can compute a MD5 hash from 10 number by feeding the digester first with 1:5 and then 6:10.
library(parallel)
library(digest)
x <- as.character(1:10) # input
fname <- "mystream.fifo" # choose name for your named pipe
close(fifo(fname, "w")) # creates your pipe if does not exist
producer <- mcparallel({
mystream <- file(fname, "w")
writeLines(x[1:5], mystream)
writeLines(x[6:10], mystream)
close(mystream) # sends signal to the consumer (digester)
})
digester <- mcparallel({
digest(fname, file = TRUE, algo = "md5") # just reads the stream till signalled
})
# runs both processes in parallel
mccollect(list(producer, digester))
unlink(fname) # named pipe removed
UPDATE: Henrik Bengtsson provided a modified example based on futures:
library("future")
plan(multiprocess)
x <- as.character(1:10) # input
fname <- "mystream.fifo" # choose name for your named pipe
close(fifo(fname, open="wb")) # creates your pipe if does not exists
producer %<-% {
mystream <- file(fname, open="wb")
writeBin(x[1:5], endian="little", con=mystream)
writeBin(x[6:10], endian="little", con=mystream)
close(mystream) # sends signal to the consumer (digester)
}
# just reads the stream till signalled
md5 <- digest::digest(fname, file = TRUE, algo = "md5")
print(md5)
## [1] "25867862802a623c16928216e2501a39"
# Note: Identical on Linux and Windows
Following up on nicola's comment, here's a benchmark of the column-wise idea. It seems it doesn't help much, at least not for these at this size. iris is 150 rows, long_iris is 3M (3,000,000).
library(microbenchmark)
#iris
nrow(iris)
microbenchmark(
whole = digest::digest(iris),
cols = digest::digest(lapply(iris, digest::digest))
)
#long iris
long_iris = do.call(bind_rows, replicate(20e3, iris, simplify = F))
nrow(long_iris)
microbenchmark(
whole = digest::digest(long_iris),
cols = digest::digest(lapply(long_iris, digest::digest))
)
Results:
#normal
Unit: milliseconds
expr min lq mean median uq max neval cld
whole 12.6 13.6 14.4 14.0 14.6 24.9 100 b
cols 12.5 12.8 13.3 13.1 13.5 23.0 100 a
#long
Unit: milliseconds
expr min lq mean median uq max neval cld
whole 296 306 317 311 316 470 100 b
cols 261 276 290 282 291 429 100 a

Resources