I have an SSH tunnel connection executed like this:
target <- paste0("host:3306")
ProxySev <- "name#1.1.1.1"
keyfile <- "path/to/key"
port <- 3307
cmd <- paste0('ssh::ssh_tunnel(ssh::ssh_connect(host = "',
              ProxySev, '", keyfile = "', keyfile, '"), port = ', port, ', target = "', target, '")')
pid <- sys::r_background(
  args = c("-e", cmd),
  std_out = TRUE,
  std_err = TRUE
)
To close it I use tools::pskill(pid), but how can I do that when I don't know the pid? For example, if I don't properly close the previous connection and run again
pid <- sys::r_background(
  args = c("-e", cmd),
  std_out = TRUE,
  std_err = TRUE
)
I get the message:
Error: System failure for: bind() (Only one usage of each socket address (protocol/network address/port) is normally permitted)
Execution halted
Warning message:
Disconnecting from unused ssh session. Please use ssh_disconnect()
How can I use ssh_disconnect() in such a case? Or how can I get the pid of this background process?
Thanks!
I think you may be able to infer which process it is by using tools from the ps package. I'll demonstrate with Sys.sleep() instead of an ssh connection, but the steps should translate.
pid <- sys::r_background(args=c("-e", "Sys.sleep(120)"), std_out = FALSE, std_err = FALSE)
procs <- ps::ps()
subset(procs, name == "Rterm.exe")
# # A tibble: 3 x 11
# pid ppid name username status user system rss vms created ps_handle
# <int> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dttm> <I<list>>
# 1 22284 4224 Rterm.exe "myhost\\r2" running 0.0938 0.0156 47214592 68915200 2022-09-21 12:43:32.263 <ps_handl>
# 2 8472 14464 Rterm.exe "myhost\\r2" running 6.5 0.719 267501568 281235456 2022-09-20 21:31:30.642 <ps_handl>
# 3 4224 14464 Rterm.exe "myhost\\r2" running 4768. 272. 1433354240 31551180800 2022-09-17 22:46:15.056 <ps_handl>
pids <- subset(procs, name == "Rterm.exe")$pid
lapply(setNames(nm = pids), function(pid) ps::ps_cmdline(ps::ps_handle(pid)))
# $`22284`
# [1] "c:\\R\\R-4.1.2\\bin\\x64\\Rterm" "-e" "Sys.sleep(120)"
# $`8472`
# [1] "c:\\R\\R-4.1.2\\bin\\x64\\Rterm.exe" "--ess" "--no-save"
# $`4224`
# [1] "c:\\R\\R-4.1.2\\bin\\x64\\Rterm.exe" "--ess" "--no-save"
(tools::pskill(22284))
# [1] TRUE
I wrapped pskill in parens because its return value (a logical indicating whether the process was found and killed) is invisible; the parens cause it to be printed.
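Putting those pieces together, here is a minimal sketch of a helper that looks up background R processes by their command line and kills any that match a pattern. The function name and the "ssh_tunnel" default are just illustrative assumptions, not something from your setup:
kill_r_background <- function(pattern = "ssh_tunnel") {
  # Sketch: kill background R processes whose command line matches `pattern`.
  # Assumes the tunnel was started with sys::r_background(), so it shows up as
  # an R / Rterm.exe process with the ssh_tunnel() call in its arguments.
  procs <- ps::ps()
  pids <- subset(procs, name %in% c("R", "Rterm.exe"))$pid
  for (pid in pids) {
    h <- ps::ps_handle(pid)
    cmd <- tryCatch(ps::ps_cmdline(h), error = function(e) character())
    if (any(grepl(pattern, cmd, fixed = TRUE))) {
      message("Killing pid ", pid)
      tools::pskill(pid)
    }
  }
  invisible(NULL)
}
# kill_r_background("ssh_tunnel")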
I can run DQD using the connection parameters shown below, but I get an SSL error if I try to use connect(), and I get the same error when trying to use the Synthea ETL tool.
Full script is below.
Any help would be greatly appreciated.
Connection Parameters
dbms <- "sql server"
user <- "synthea_user"
password <- "sneeker"
server <- "localhost"
port <- "1433"
pathToDriver <- pathToDriver
extraSettings <- ";databaseName=master;integratedSecurity=true;encrypt=false"
Error
Error in rJava::.jcall(jdbcDriver, "Ljava/sql/Connection;", "connect", :
com.microsoft.sqlserver.jdbc.SQLServerException: The driver could not establish a secure connection to SQL Server by using Secure Sockets Layer (SSL) encryption. Error: "PKIX path building failed: sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target". ClientConnectionId:1799f03c-5ddf-4498-9670-6fcf87f557ac
Full Script
# local files
pathToDriver <- "D:\\NACHC\\SYNTHEA\\DQD\\resources\\jar\\sqlserver-jar" # location of the mssql-jdbc-10.2.0.jre8.jar
outputFolder <- "D:\\NACHC\\SYNTHEA\\DQD\\output" # location where output file will be written
outputFile <- "results.json" # file for results json
# database connectivity
dbms <- "sql server"
user <- "synthea_user"
password <- "sneeker"
server <- "localhost"
port <- "1433"
pathToDriver <- pathToDriver
extraSettings <- ";databaseName=master;integratedSecurity=true;encrypt=false"
# database schemas
cdmDatabaseSchema <- "synthea_micro.dbo" # your omop instance
resultsDatabaseSchema <- "synthea_micro_dqd_results.dbo" # instance where results will be written
cdmSourceName <- "SYNTHEA_MICRO_Test_Database" # a human readable name for your CDM source
# config parameters
numThreads <- 1 # number of threads to run, 3 seems to work well on Redshift
sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries
verboseMode <- TRUE # set to TRUE if you want to see activity written to the console
writeToTable <- TRUE # set to FALSE if you want to skip writing to a SQL table in the results schema
# dqd parameters
checkLevels <- c("TABLE", "FIELD", "CONCEPT") # which DQ check levels to run
checkNames <- c() # which DQ checks to run, names can be found in inst/csv/OMOP_CDM_v5.3.1_Check_Desciptions.csv
tablesToExclude <- c() # which CDM tables to exclude?
# ---
#
# run-dqd.r
#
# Run the init-parameters.r script before running this script.
#
# Script to run the Data Quality Dashboard (DQD)
#
# The output of this script is a single JSON file that can then be rendered with the run-dqd.r script
# to view the results as a web page in a browser.
#
# Prior to running this script you will need to download and install the JDBC driver for MS SQL Server.
# This is entered as the pathToDriver variable below.
#
# ---
# ---
#
# CREATE THE CONNECTION OBJECT AND RUN THE JOB
#
# ---
# create connection details object
connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = dbms,
  user = user,
  password = password,
  server = server,
  port = port,
  pathToDriver = pathToDriver,
  extraSettings = extraSettings
)
# conn <- connect(cd)
# disconnect(conn)
# (OPTIONAL): if writing to table and using Redshift, bulk loading can be initialized
# Sys.setenv (
# "AWS_ACCESS_KEY_ID" = "",
# "AWS_SECRET_ACCESS_KEY" = "",
# "AWS_DEFAULT_REGION" = "",
# "AWS_BUCKET_NAME" = "",
# "AWS_OBJECT_KEY" = "",
# "AWS_SSE_TYPE" = "AES256",
# "USE_MPP_BULK_LOAD" = TRUE
# )
# ---
#
# run the job
#
# ---
DataQualityDashboard::executeDqChecks(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = cdmDatabaseSchema,
  resultsDatabaseSchema = resultsDatabaseSchema,
  cdmSourceName = cdmSourceName,
  numThreads = numThreads,
  sqlOnly = sqlOnly,
  outputFolder = outputFolder,
  outputFile = outputFile,
  verboseMode = verboseMode,
  writeToTable = writeToTable,
  checkLevels = checkLevels,
  tablesToExclude = tablesToExclude,
  checkNames = checkNames,
  cdmVersion = "5.4"
)
# (OPTIONAL) inspect logs
# ParallelLogger::launchLogViewer(
# logFileName = file.path(outputFolder, cdmSourceName,
# sprintf("log_DqDashboard_%s.txt", cdmSourceName))
# )
# (OPTIONAL) if you want to write the JSON file to the results table separately
# jsonFilePath <- ""
# DataQualityDashboard::writeJsonResultsToTable(
# connectionDetails = connectionDetails,
# resultsDatabaseSchema = resultsDatabaseSchema,
# jsonFilePath = jsonFilePath
# )
print("TRYING TO CONNECT USING connect(cd)")
conn <- connect(cd)
disconnect(conn)
These are the connection settings that were needed to get this to work; in particular, trustServerCertificate=true tells the JDBC driver to accept the server's self-signed certificate without validating it, which avoids the PKIX path-building error.
extraSettings <- ";databaseName=synthea_etl;integratedSecurity=false;encrypt=false;trustServerCertificate=true;sslProtocol=TLSv1"
This is the full script.
# run the following once
devtools::install_github("OHDSI/ETL-Synthea", INSTALL_opts = "--no-multiarch")
library(ETLSyntheaBuilder)
library(DatabaseConnector) # connect(), disconnect()
library(DBI)               # dbExecute()
# database connectivity
dbms <- "sql server"
user <- "synthea_etl"
password <- "Sneaker01"
server <- "localhost"
port <- "1433"
pathToDriver <- "D:\\NACHC\\SYNTHEA\\DQD\\resources\\jar\\sqlserver-jar"
extraSettings <- ";databaseName=synthea_etl;integratedSecurity=false;encrypt=false;trustServerCertificate=true;sslProtocol=TLSv1"
cd <- DatabaseConnector::createConnectionDetails(
  dbms = dbms,
  user = user,
  password = password,
  server = server,
  port = port,
  pathToDriver = pathToDriver,
  extraSettings = extraSettings
)
# test the connection
conn <- connect(cd)
disconnect(conn)
cdmSchema <- "synthea_etl.dbo"
cdmVersion <- "5.4"
syntheaVersion <- "2.7.0"
syntheaSchema <- "synthea_etl.dbo"
syntheaFileLoc <- "C:\\Users\\gresh\\Downloads\\synthea-etl\\synthea_sample_data_csv_apr2020\\csv"
vocabFileLoc <- "C:\\fhir-to-omop\\terminology\\vocabulary_download_v5_{8c94604f-71b9-47ea-aef2-04be2a7d52b1}_1646823567698"
# ETLSyntheaBuilder::CreateCDMTables(connectionDetails = cd, cdmSchema = cdmSchema, cdmVersion = cdmVersion)
ETLSyntheaBuilder::CreateSyntheaTables(connectionDetails = cd, syntheaSchema = syntheaSchema, syntheaVersion = syntheaVersion)
ETLSyntheaBuilder::LoadSyntheaTables(connectionDetails = cd, syntheaSchema = syntheaSchema, syntheaFileLoc = syntheaFileLoc)
# ETLSyntheaBuilder::LoadVocabFromCsv(connectionDetails = cd, cdmSchema = cdmSchema, vocabFileLoc = vocabFileLoc)
conn <- connect(cd)
dbExecute(conn, paste("delete from ", cdmSchema, ".provider", sep=""))
dbExecute(conn, paste("delete from ", cdmSchema, ".person", sep=""))
disconnect(conn)
ETLSyntheaBuilder::LoadEventTables(connectionDetails = cd, cdmSchema = cdmSchema, syntheaSchema = syntheaSchema, cdmVersion = cdmVersion, syntheaVersion = syntheaVersion)
I write a small downloader in R, in order to download some log files from remote server in one run:
file_remote <- fun_to_list_URLs()
file_local <- fun_to_gen_local_paths()
credentials <- "usr/pwd"
downloader <- function(file_remote, file_local, credentials) {
  data_bin <- RCurl::getBinaryURL(
    file_remote,
    userpwd = credentials,
    ftp.use.epsv = FALSE,
    forbid.reuse = TRUE
  )
  writeBin(data_bin, file_local)
}
purrr::walk2(
  file_remote,
  file_local,
  ~ downloader(
    file_remote = .x,
    file_local = .y,
    credentials = credentials
  )
)
This works, but it is slow, especially compared to an FTP client like WinSCP: downloading 64 log files of about 2 kB each takes minutes.
Is there a faster way to download a lot of files in R?
The curl package has a way to perform async requests, which means that downloads are performed simultaneously instead of one after another. Especially with smaller files this should give you a large boost in performance. Here is a barebones function that does that:
# total_con: max total concurrent connections.
# host_con: max concurrent connections per host.
# print: print status of requests at the end.
multi_download <- function(file_remote,
                           file_local,
                           total_con = 1000L,
                           host_con = 1000L,
                           print = TRUE) {
  # check for duplication (deactivated for testing)
  # dups <- duplicated(file_remote) | duplicated(file_local)
  # file_remote <- file_remote[!dups]
  # file_local <- file_local[!dups]
  # create pool
  pool <- curl::new_pool(total_con = total_con,
                         host_con = host_con)
  # function performed on successful request
  save_download <- function(req) {
    writeBin(req$content, file_local[file_remote == req$url])
  }
  # setup async calls
  invisible(
    lapply(
      file_remote, function(f)
        curl::curl_fetch_multi(f, done = save_download, pool = pool)
    )
  )
  # all created requests are performed here
  out <- curl::multi_run(pool = pool)
  if (print) print(out)
}
Now we need some test files to compare it to your baseline approach. I use covid data from the Johns Hopkins University GitHub page as it contains many small csv files which should be similar to your files.
file_remote <- paste0(
"https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/",
format(seq(as.Date("2020-03-03"), as.Date("2022-06-01"), by = "day"), "%d-%m-%Y"),
".csv"
)
file_local <- paste0("/home/johannes/Downloads/test/", seq_along(file_remote), ".bin")
We could also infer the file names from the URLs, but I assume that is not what you want. So now let's compare the approaches for these 821 files:
res <- bench::mark(
  baseline(),
  multi_download(file_remote,
                 file_local,
                 print = FALSE),
  check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
summary(res)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec`
#> <bch:expr> <bch:> <bch:> <dbl>
#> 1 baseline() 2.8m 2.8m 0.00595
#> 2 multi_download(file_remote, file_local, print = FALSE) 12.7s 12.7s 0.0789
#> # … with 2 more variables: mem_alloc <bch:byt>, `gc/sec` <dbl>
summary(res, relative = TRUE)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec`
#> <bch:expr> <dbl> <dbl> <dbl>
#> 1 baseline() 13.3 13.3 1
#> 2 multi_download(file_remote, file_local, print = FALSE) 1 1 13.3
#> # … with 2 more variables: mem_alloc <dbl>, `gc/sec` <dbl>
The new approach is 13.3 times faster than the original one. I would assume the difference gets bigger the more files you have. Note, though, that this benchmark is not perfect, as my internet speed fluctuates quite a bit.
The function could also be improved in terms of error handling (currently you get a message saying how many requests succeeded and how many errored, but no indication of which files those were). My understanding is also that multi_run keeps the downloaded content in memory until save_download writes it to disk. With small files this is fine, but it might be an issue with larger ones.
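One possible starting point for the error handling: curl::curl_fetch_multi() also accepts a fail callback, which is called for requests that fail at the connection level (HTTP errors such as 404 still arrive via done with a non-200 status_code). A sketch of how the request setup inside multi_download() could collect failures, untested against your server:
# inside multi_download(), replacing the "setup async calls" block
failures <- character()
invisible(
  lapply(
    file_remote, function(f)
      curl::curl_fetch_multi(
        f,
        done = function(req) {
          if (req$status_code >= 400) {
            # HTTP-level error: record it instead of writing the body
            failures <<- c(failures, paste0(f, " (HTTP ", req$status_code, ")"))
          } else {
            save_download(req)
          }
        },
        # connection-level error: curl passes the error message as a string
        fail = function(msg) failures <<- c(failures, paste0(f, " (", msg, ")")),
        pool = pool
      )
  )
)
out <- curl::multi_run(pool = pool)
if (length(failures) > 0 && print) {
  message(length(failures), " files failed:\n", paste(failures, collapse = "\n"))
}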
baseline function
baseline <- function() {
  credentials <- "usr/pwd"
  downloader <- function(file_remote, file_local, credentials) {
    data_bin <- RCurl::getBinaryURL(
      file_remote,
      userpwd = credentials,
      ftp.use.epsv = FALSE,
      forbid.reuse = TRUE
    )
    writeBin(data_bin, file_local)
  }
  purrr::walk2(
    file_remote,
    file_local,
    ~ downloader(
      file_remote = .x,
      file_local = .y,
      credentials = credentials
    )
  )
}
Created on 2022-06-05 by the reprex package (v2.0.1)
I'm having difficulty connecting to and retrieving data from a Kafka instance. Using Python's kafka-python module, I can connect (with the same connection parameters), see the topic, and retrieve data, so the network is viable, there is no authentication problem, the topic exists, and data exists in the topic.
On R-4.0.5 using sparklyr-1.7.2, connecting to kafka-2.8
library(sparklyr)
spark_installed_versions()
# spark hadoop dir
# 1 2.4.7 2.7 /home/r2/spark/spark-2.4.7-bin-hadoop2.7
# 2 3.1.1 3.2 /home/r2/spark/spark-3.1.1-bin-hadoop3.2
sc <- spark_connect(master = "local", version = "2.4",
                    config = list(
                      sparklyr.shell.packages = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0"
                    ))
system.time({
  Z <- stream_read_kafka(
    sc,
    options = list(
      kafka.bootstrap.servers = "11.22.33.44:5555",
      subscribe = "mytopic"))
})
# user system elapsed
# 0.080 0.000 10.349
system.time(collect(Z))
# user system elapsed
# 1.336 0.136 8.537
Z
# # Source: spark<?> [inf x 7]
# # … with 7 variables: key <lgl>, value <lgl>, topic <chr>, partition <int>, offset <dbl>, timestamp <dbl>, timestampType <int>
My first concern is that I'm not seeing data from the topic: I appear to be getting a frame of (meta)data about topics in general, and nothing is found. With this topic, there are 800 strings (json) of modest-to-small size. My second concern is that it takes almost 20 seconds to discover this (though I suspect that's a symptom of the larger connection problem).
For confirmation, this works:
cons = import("kafka")$KafkaConsumer(bootstrap_servers="11.22.33.44:5555", auto_offset_reset="earliest", max_partition_fetch_bytes=10240000L)
cons$subscribe("mytopic")
msg <- cons$poll(timeout_ms=30000L, max_records=99999L)
length(msg)
# [1] 1
length(msg[[1]])
# [1] 801
as.character( msg[[1]][[1]]$value )
# [1] "{\"TrackId\":\"c839dcb5-...\",...}"
(and those commands complete almost instantly, nothing like the 8-10sec lag above).
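One configuration difference I can see between the two: the kafka-python consumer above reads from the earliest offset, whereas, if I'm reading the Spark Kafka integration guide correctly, the Spark source defaults to the latest offsets for streams, which by itself could explain an empty read of an existing topic. A sketch of passing the equivalent option through stream_read_kafka(), assuming the standard startingOffsets option is honored here (untested):
# sketch: mirror the Python consumer's auto_offset_reset = "earliest"
Z <- stream_read_kafka(
  sc,
  options = list(
    kafka.bootstrap.servers = "11.22.33.44:5555",
    subscribe = "mytopic",
    startingOffsets = "earliest"  # assumed Spark Kafka source option, not verified
  )
)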
The Kafka instance I'm connecting to is using ksqlDB, though I don't think that changes the need for the "org.apache.spark:spark-sql-kafka-.." Java package.
(Ultimately I'll be using stateless/stateful procedures on streaming data, including joins and window ops, so I'd like to not have to re-implement that from scratch on the simple kafka connection.)
I really like using the code_to_plan function when constructing drake plans. I also really like using target(..., format = "fst") for big files. However, I am struggling to combine these two workflows. For example, if I have this _drake.R file:
# Data --------------------------------------------------------------------
data_plan = code_to_plan("code/01-data/data.R")
join_plan = code_to_plan("code/01-data/merging.R")
# Cleaning ----------------------------------------------------------------
cleaning_plan = code_to_plan("code/02-cleaning/remove_na.R")
# Model -------------------------------------------------------------------
model_plan = code_to_plan("code/03-model/model.R")
# Combine Plans
dplan = bind_plans(
data_plan,
join_plan,
cleaning_plan,
model_plan
)
config <- drake_config(dplan)
This works fine when called with r_make(r_args = list(show = TRUE))
As I understand it, though, target() can only be used within drake_plan(). If I try something like this:
dplan2 <- drake_plan(full_plan = target(dplan, format = "fst"))
config <- drake_config(dplan2)
I get an r_make error like this:
target full_plan
Error in fst::write_fst(x = value$value, path = tmp) :
Unknown type found in column.
In addition: Warning message:
You selected fst format for target full_plan, so drake will convert it from class c("drake_plan", "tbl_df", "tbl", "data.frame") to a plain data frame.
Error:
-->
in process 18712
See .Last.error.trace for a stack trace.
So ultimately my question is where does one specify special data formats for targets when you are using code_to_plan?
Edit
Using @landau's helpful suggestion, I defined this function:
add_target_format <- function(plan) {
  # Get a list of named commands.
  commands <- plan$command
  names(commands) <- plan$target
  # Turn it into a good plan.
  do.call(drake_plan, commands)
}
So that this would work:
dplan = bind_plans(
data_plan,
join_plan,
cleaning_plan,
model_plan
) %>%
add_target_format()
It is possible, but not convenient. Here is a workaround.
writeLines(
c(
"x <- small_data()",
"y <- target(large_data(), format = \"fst\")"
),
"script.R"
)
cat(readLines("script.R"), sep = "\n")
#> x <- small_data()
#> y <- target(large_data(), format = "fst")
library(drake)
# Produces a plan, but does not process target().
bad_plan <- code_to_plan("script.R")
bad_plan
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr>
#> 1 x small_data()
#> 2 y target(large_data(), format = "fst")
# Get a list of named commands.
commands <- bad_plan$command
names(commands) <- bad_plan$target
# Turn it into a good plan.
good_plan <- do.call(drake_plan, commands)
good_plan
#> # A tibble: 2 x 3
#> target command format
#> <chr> <expr> <chr>
#> 1 x small_data() <NA>
#> 2 y large_data() fst
Created on 2019-12-18 by the reprex package (v0.3.0)
I'm relatively new to R (and brand spanking new to scraping with R), so apologies in advance if I'm overlooking something obvious here!
I've been trying to learn how to scrape with RSelenium by following this tutorial: https://rawgit.com/petrkeil/Blog/master/2017_08_15_Web_scraping/web_scraping.html#advanced-scraping-with-rselenium
After running the following in Terminal (docker run -d -p 4445:4444 selenium/standalone-firefox), I tried to run the R code below, pulled with only slight modifications from the tutorial hyperlinked above:
library(RSelenium)

get.tree <- function(genus, species) {
  # navigate to the page
  browser <- remoteDriver(port = 4445L)
  browser$open(silent = TRUE)
  browser$navigate("http://www.bgci.org/global_tree_search.php?sec=globaltreesearch")
  browser$refresh()
  # create R objects from the web search input and button elements
  genusElem <- browser$findElement(using = 'id', value = "genus-field")
  specElem <- browser$findElement(using = 'id', value = "species-field")
  buttonElem <- browser$findElement(using = 'class', value = "btn_ohoDO")
  # tell R to fill in the fields
  genusElem$sendKeysToElement(list(genus))
  specElem$sendKeysToElement(list(species))
  # tell R to click the search button
  buttonElem$clickElement()
  # get output
  out <- browser$findElement(using = "css", value = "td.cell_1O3UaG:nth-child(4)") # the country of origin
  out <- out$getElementText()[[1]]        # extract actual text string
  out <- strsplit(out, split = "; ")[[1]] # turn into character vector
  # close browser
  browser$close()
  return(out)
}
# Now let's try it:
get.tree("Abies", "alba")
But after doing all that, I get the following error:
Selenium message:Failed to decode response from marionette Build info:
version: '3.6.0', revision: '6fbf3ec767', time:
'2017-09-27T16:15:40.131Z' System info: host: 'd260fa60d69b', ip:
'172.17.0.2', os.name: 'Linux', os.arch: 'amd64', os.version:
'4.9.49-moby', java.version: '1.8.0_131' Driver info: driver.version:
unknown
Error: Summary: UnknownError Detail: An unknown server-side error
occurred while processing the command. class:
org.openqa.selenium.WebDriverException Further Details: run
errorDetails method
Anyone have any idea what this means and where I went wrong?
Thanks very much for your help!
Just take advantage of the XHR request it makes to retrieve the in-line results and toss RSelenium:
library(httr)
library(tidyverse)
get_tree <- function(genus, species) {
  GET(
    url = sprintf("https://data.bgci.org/treesearch/genus/%s/species/%s", genus, species),
    add_headers(
      Origin = "http://www.bgci.org",
      Referer = "http://www.bgci.org/global_tree_search.php?sec=globaltreesearch"
    )
  ) -> res
  stop_for_status(res)
  matches <- content(res, flatten = TRUE)$results[[1]]
  flatten_df(matches[c("id", "taxon", "family", "author", "source", "problems", "distributionDone", "note", "wcsp")]) %>%
    mutate(geo = list(map_chr(matches$TSGeolinks, "country"))) %>%
    mutate(taxas = list(map_chr(matches$TSTaxas, "checkTaxon")))
}
xdf <- get_tree("Abies", "alba")
xdf
## # A tibble: 1 x 8
## id taxon family author source distributionDone geo taxas
## <int> <chr> <chr> <chr> <chr> <chr> <list> <list>
## 1 58373 Abies alba Pinaceae Mill. WCSP Phans yes <chr [21]> <chr [45]>
glimpse(xdf)
## Observations: 1
## Variables: 8
## $ id <int> 58373
## $ taxon <chr> "Abies alba"
## $ family <chr> "Pinaceae"
## $ author <chr> "Mill."
## $ source <chr> "WCSP Phans"
## $ distributionDone <chr> "yes"
## $ geo <list> [<"Albania", "Andorra", "Austria", "Bulgaria", "Croatia", "Czech Republic", "Fr...
## $ taxas <list> [<"Abies abies", "Abies alba f. columnaris", "Abies alba f. compacta", "Abies a...
It's highly likely you'll need to modify get_tree() at some point but it's better than having Selenium or Splash or phantomjs or Headless Chrome as a dependency.
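For example, a small guard for the no-match case (assuming the endpoint simply returns an empty results list when nothing is found, which I have not verified) could replace the single matches line inside get_tree():
# sketch: handle an empty result set instead of erroring on results[[1]]
matches <- content(res, flatten = TRUE)$results
if (length(matches) == 0) {
  warning("No TreeSearch results for ", genus, " ", species)
  return(tibble())
}
matches <- matches[[1]]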