I would like to download a particular file with Covid patient metadata from the public GISAID website (Login button, https://www.epicov.org/epi3/start) and I would like to automate this so I can do this from R & automatically produce daily updates etc of my analyses.
I thought I would be able to do this using a web browser automation tool like RSelenium, but I am stumbling at a problem where I have to click a checkbox and then press a Download button, and that last part of the code doesn't work.
What I have so far, and what is working so far is
library(RSelenium)
chr = wdman::chrome(port = 4572L, version="102.0.5005.61", check=FALSE)
# run one time with check=TRUE and then
# delete ..//Users/XXXX/AppData/Local/binman/binman_chromedriver/win32/103.0.5060.24
# and download and install chrome 102.0.5005.63 from https://google-chrome.en.uptodown.com/windows/versions
# and re-run with check=FALSE
# to avoid version mismatches between chrome and chromedriver
# TO DO: elegant fix for this welcome, see
# https://github.com/ropensci/RSelenium/issues/221
eCaps = list(chromeOptions = list(
args = c(# '--headless', # unmark for headless operation
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
))
remDr = remoteDriver(port = 4572L,
version="102.0.5005.61",
browserName = "chrome",
extraCapabilities = eCaps)
remDr$open()
# TO DO: set download directory, see
# https://github.com/ropensci/RSelenium/issues/138
# otherwise default download directory would be used
# advice here also welcome
# clicking Login on https://www.gisaid.org/
# gets one to https://www.epicov.org/epi3/start
# so we start there
remDr$navigate("https://www.epicov.org/epi3/start")
remDr$setImplicitWaitTimeout(milliseconds = 10)
remDr$getTitle()
# enter credentials
username = remDr$findElement(using = "xpath", "//input[#id='elogin']")
username$sendKeysToElement(list("USERNAME"))
# PUT GISAID USERNAME HERE (EVERYONE CAN REGISTER)
password = remDr$findElement(using = "xpath", "//input[#id='epassword']")
password$sendKeysToElement(list("PASSWORD"))
# PUT GISAID PASSWORD HERE (EVERYONE CAN REGISTER)
# click Login buttom
login_button = remDr$findElement(using = "xpath", "//input[#value='Login']")
login_button$clickElement()
remDr$getCurrentUrl()
epicov_tab = remDr$findElement("xpath", "//a[contains(text(),'EpiCoV™')]")
epicov_tab$click()
downloads_tab = remDr$findElements("class", "sys-actionbar-action-ni")[[3]]
downloads_tab$clickElement()
remDr$getCurrentUrl()
# switch to right frame
frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
# available download buttons
download_buttons = remDr$findElements("class", "kachel75")
length(download_buttons) # 26 downloads available in total
# download patient metadata
metadata_button = download_buttons[[12]] # patient metadata
metadata_button$click()
At this point a window with a checkbox pops up.
All the steps above all work (except for the installation problems where there is a chrome & chromedriver version mismatch, which is a known issue in RSelenium, https://github.com/ropensci/RSelenium/issues/221, which I had to solve by manually deleting one of the installed versions, and the fact that I still don't know how to set a custom download directory rather than just use the default download directory), but I don't manage to get past this window with this checkbox. What I tried was
remDr$getCurrentUrl() # https://www.epicov.org/epi3/frontend#2f475c
# switch to right frame
frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
checkbox = remDr$findElements("xpath", "//input[#type='checkbox']")
checkbox$click()
Sys.sleep(5)
download = remDr$findElements("xpath",
"//button[contains(.,'Download')]")
download$click()
# then we would still have to move and unzip resulting *.tar.xz file
# and quit
remDr$close()
chr$stop()
remDr$quit()
but the checkbox$click() gives me an error "Error: attempt to apply non-function" and then download$click() also doesn't work...
The HTML code of this last page with this checkbox is
<html><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n<meta content=\"IE=EmulateIE8\" http-equiv=\"X-UA-Compatible\">\n<meta content=\"GISAID actively promotes the sharing of virus sequences, related clinical and epidemiological data associated with human isolates, and geographic and species-specific data associated with avian and other animal isolates.\" name=\"description\">\n<script type=\"text/javascript\">\nvar CKEDITOR_BASEPATH = 'entities/files/js/ckeditor/';\n</script>\n<meta content=\"GISAID, GISAID Initiative, COVID-19, EpiCoV, EpiFlu, EpiRSV, Global Health Security, Real-Time Disease Surveillance\" name=\"keywords\">\n<meta content=\"noindex\" name=\"robots\">\n<style type=\"text/css\">\nbody {\n margin: 0;\n}\n#c_rdj376_1bg {\n position: absolute;\n width: 100%;\n height: 100%;\n margin: auto;\n}\n#c_rdj376_1bh {\n position: absolute;\n top: 0px;\n right: 0px;\n bottom: 60px;\n left: 0px;\n overflow: auto;\n padding: 0px 5px 0px 5px;\n}\n#c_rdj376_1bj {\n position: absolute;\n bottom: 0;\n left: 0;\n right: 0;\n height: 60px;\n}\n</style>\n<title>GISAID Initiative</title>\n<link type=\"text/css\" rel=\"stylesheet\" href=\"/epi3/entities/tmp//static/_rdj376_q3/combined_std.css\">\n<link type=\"text/css\" rel=\"stylesheet\" href=\"/epi3/entities/tmp//static/_rdj376_q4/combined_std.css\">\n<link type=\"text/css\" rel=\"stylesheet\" href=\"/epi3/entities/tmp//static/_rdj376_26g/combined_std.css\">\n<script language=\"JavaScript\" type=\"text/javascript\">\nfunction loadScript(u,c) {\n//alert(\"Loading \" + u)\n\tvar s=document.createElement(\"script\");\n\ts.type=\"text/javascript\";\n\tif (s.readyState) {\n\t\ts.onreadystatechange = function() {\n\t\t\tif (s.readyState == \"loaded\" || s.readyState == \"complete\") {\n\t\t\t\ts.onreadystatechange=null;\n\t\t\t\tc();\n\t\t\t}\n\t\t};\n\t} else {\n\t\ts.onload = function() {\n\t\t\tc()\n\t\t}\n\t};\n\ts.src = u;\n\tdocument.getElementsByTagName(\"head\")[0].appendChild(s)\n};\n\nvar full_page_load_responses = [];\n\nloadScript(\"/epi3/entities/files/js/sys.js?no_cache=1655312378.13\",function(){sys[\"SID\"] = \"B485F3DEYKSGHX7ZWG8MP234FLP5FFLI\";sys[\"WID\"] = \"wid_rdj376_9b3y\";sys[\"PID\"] = \"pid_rdj376_9b3z\";sys[\"UID\"] = \"291260\";sys[\"is_development_mode\"] = false;sys[\"testrunner_active\"] = false;sys[\"page_name\"] = \"\";sys[\"msgs\"] = new Object({'txt_disable_popupblocker':' Please disable popup-blocker!'});loadScript(\"/epi3/entities/tmp//static/_rdj376_q1/combined_std.js\",function(){loadScript(\"/epi3/entities/tmp//static/_rdj376_1os/combined_std.js\",function(){loadScript(\"/epi3/entities/tmp//static/_rdj376_1z7/combined_std.js\",function(){if(document.readyState=='loaded'||document.readyState=='complete'){sys.handleDomReady();}else{window.onload=sys.handleDomReady;}});});});sys.onPageInit(function(){sys.createComponent('c_rdj376_1bk','Corona2020DownloadReminderButtonsComponent',new Object({'buttons':'c_rdj376_1bj-c_rdj376_1bk','__main__':'c_rdj376_1bk-c_rdj376_1bk'}),\"FormComponent\",\"\",function(){this.getForm().createFI('ce_rdj376_16f','LinesetWidget','checkit',function(){this.getForm().createFI('ce_rdj376_16g','CheckboxWidget','agreed',function(){this.setParams(new Object({'on_change':'Agreed','has_help':false}));}); }); this.getForm().createFI('ce_rdj376_16h','ButtonsWidget','ce_rdj376_16h',function(){this.getForm().createFI('ce_rdj376_16i','ButtonWidget','back',function(){this.setParams(new Object({'on_change':null,'has_help':false,'cmd':'Back'}));\n }); this.getForm().createFI('ce_rdj376_16j','ButtonWidget','downbtn',function(){this.setParams(new Object({'on_change':null,'has_help':false,'cmd':'Download'}));\n }); }); });});sys.onPageInit(function(){sys.createComponent('c_rdj376_1bi','Corona2020DownloadReminderComponent',new Object({'__main__':'c_rdj376_1bi-c_rdj376_1bi'}),\"CanvasComponent\",\"\",function(){function ccc() {\n var els = document.getElementsByTagName(\"INPUT\");\n for (idx in els) {\n if (els[idx].type == \"checkbox\") {\n if (!els[idx].checked) {\n alert(\"You must first agree to the terms and conditions\")\n return false;\n }\n }\n }\n return;\n}\n\n });});sys.onPageInit(function() {new sys.Packer(\"c_rdj376_1bg\", \"c_rdj376_1bh\");new LegacyHandler();\n var resizeTimer;\n $(window).resize(function() {\n clearTimeout(resizeTimer);\n resizeTimer = setTimeout(sys.handleAfterWindowResize, 100);\n });\n sys.showInitiallyHiddens();\n \n setTimeout(function() { sys.executeFullPageLoadResponses(); }, 500);\n\t});if(sys[\"testrunner_active\"]){sys.testrunner_activatePage();};\n});\n</script><script type=\"text/javascript\" src=\"/epi3/entities/files/js/sys.js?no_cache=1655312378.13\"></script>\n<script type=\"text/javascript\" src=\"/epi3/entities/tmp//static/_rdj376_q1/combined_std.js\"></script><script type=\"text/javascript\" src=\"/epi3/entities/tmp//static/_rdj376_1os/combined_std.js\"></script><script type=\"text/javascript\" src=\"/epi3/entities/tmp//static/_rdj376_1z7/combined_std.js\"></script></head>\n<body marginheight=\"0\" marginwidth=\"0\" topmargin=\"0\" leftmargin=\"0\" style=\"text-align:left;\" onbeforeunload=\"sys.handleUnloadPage()\" class=\"yui-skin-sam\">\n<form class=\"sys-form\">\n<input style=\"display:none\" name=\"dummy\" type=\"entry\"><button id=\"sys_focus_button\" type=\"button\" style=\"display: none;\"></button>\n<div id=\"sys_devmenu\">\n<img title=\"Clear caches and reload page\" onclick=\"sys.devmenu_reloadApplication(false, null, true)\" style=\"cursor:pointer\" align=\"absmiddle\" src=\"/epi3/entities/files/icons//sys_recache.gif\"><img title=\"Reload page\" onclick=\"sys.devmenu_reloadApplication(false)\" style=\"cursor:pointer\" align=\"absmiddle\" src=\"/epi3/entities/files/icons//sys_reload.gif\"><img title=\"Reload page and reset work setup\" onclick=\"sys.devmenu_reloadApplication(true)\" style=\"cursor:pointer\" align=\"absmiddle\" src=\"/epi3/entities/files/icons//sys_reset.png\"><span id=\"sys_devmenu_info\"></span><img title=\"Run tests\" onclick=\"sys.devmenu_startTestRunner()\" style=\"cursor:pointer;margin-left:5px\" align=\"absmiddle\" src=\"/epi3/entities/files/icons//testrunner_24.png\"></div>\n<div style=\"display:none\" id=\"testrunner_indicator\">\n<img style=\"position:absolute;top:30px;left:30px\" src=\"/epi3/entities/files/icons//testrunner_large.png\"></div>\n<div id=\"sys_curtain\" class=\"sys_curtain\" onclick=\"sys.handleCurtainClick()\" style=\"display: none; opacity: 0;\"></div>\n<div id=\"sys_timer\" class=\"sys_timer\" style=\"display: none;\">\n<div class=\"sys_timer_inner\">\n<img class=\"sys_timer_img\" src=\"/epi3/entities/files/icons//sys_timer.gif\"><div id=\"sys_timer_message\" class=\"sys_timer_message\"></div>\n<div style=\"margin-top: 20px; display: none;\" id=\"sys_timer_gauge\" class=\"sys_timer_gauge\"></div>\n</div>\n</div>\n<div style=\"width: 100%; margin-left: auto; margin-right: auto\" class=\"page\">\n \n<div id=\"c_rdj376_1bg\" class=\"packer\">\n \n<div style=\"top: 0px; right: 0px; bottom: 60px; left: 0px;\" scrollpositions=\",\" id=\"c_rdj376_1bh\" class=\"main container-slot\">\n \n<div cid=\"c_rdj376_1bi\" class=\"sys-component-slot\" id=\"c_rdj376_1bi-c_rdj376_1bi\">\n<div style=\"padding: 0px 0px 0px 10px; margin-top:15px\">\n\n\n<b>NOTICE AND REMINDER of TERMS OF USE:</b>\n<br clear=\"none\">\n\n<p>The Terms of Use you agreed to when requesting access credentials to GISAID include the following:\n</p>\n<p>1) You will not distribute, redistribute, share, or otherwise make available Data, to any third party or the public, unless the individual is an Authorized User of GISAID;\n<br clear=\"none\">2) You will not display Data, in whole or in part, on any website, media material, or as part of a service, without GISAID’s express written permission;\n<br clear=\"none\">3) You will treat all Data contained in these files consistent with other Data in GISAID and in accordance with the GISAID Database Access Agreement (“DAA”);\n<br clear=\"none\">4) You will provide proper attributions, acknowledgements, and make best efforts to collaborate consistent with the DAA when using Data in any publication, including preprints, manuscripts, articles, and any other analyses.\n<br clear=\"none\">\n\n</p>\n<p>\nBy checking this box you reaffirm your understanding, and assent to, the Terms of Use\n</p>\n\n\n</div>\n</div>\n \n</div>\n \n<div style=\"width: 100%; height: 60px;\" id=\"c_rdj376_1bj\" class=\"buttons container-slot\">\n \n<div cid=\"c_rdj376_1bk\" class=\"sys-component-slot\" id=\"c_rdj376_1bk-c_rdj376_1bk\">\n<div id=\"ce_rdj376_16f\">\n<table class=\"sys-form-firow\" style=\"width:98%;\">\n<tbody><tr>\n<td class=\"sys-form-cfilabel\" colspan=\"1\" rowspan=\"1\" style=\"width:150px\">\n\t\t\n<div class=\"sys-form-filabel sys-form-filabel\"></div>\n\t\n</td><td colspan=\"1\" rowspan=\"1\" style=\"float:right\">\n\t\t\n<table class=\"sys-form-filine\">\n<tbody><tr>\n<td class=\"sys-form-filine-td\" colspan=\"1\" rowspan=\"1\" style=\"\">\n\t\t\n<div id=\"ce_rdj376_16g\">\n<div class=\"sys-form-fi-cb sys-fi-mark\">\n \n<div style=\"float:left;;margin-right:3px;white-space: nowrap;\">\n \n<input class=\"sys-event-hook\" name=\"ce_rdj376_16g_name\" style=\"vertical-align: middle;\" type=\"checkbox\" value=\"agreed\"><span class=\"\" style=\"vertical-align: middle;\">I agree to the terms and conditions</span>\n \n</div>\n\t\n</div>\n\n<div class=\"sys-form-fi-message-cnt\">\n<div class=\"sys-form-fi-message\" id=\"ce_rdj376_16g_msg\"></div>\n</div>\n</div>\t\n</td>\n</tr>\n</tbody></table>\t\n</td>\n</tr>\n</tbody></table>\n</div>\n</div>\n \n<div cid=\"c_rdj376_1bk\" class=\"sys-component-slot\" id=\"c_rdj376_1bj-c_rdj376_1bk\">\n<div id=\"ce_rdj376_16h\">\n<div id=\"ce_rdj376_16i\">\n<div style=\"float: left\">\n\t\n<button class=\"sys-event-hook sys-form-button\" style=\"\" title=\"\" type=\"button\"><img align=\"absmiddle\" class=\"sys-form-button-icon\" src=\"/epi3/app_entities/entities/icons/24x24/navigate_left.png\">Back</button>\n\n</div>\n</div>\n<div id=\"ce_rdj376_16j\">\n<div style=\"float: right\">\n\t\n<button class=\"sys-event-hook sys-form-button\" disabled=\"\" style=\"\" title=\"\" type=\"button\"><img align=\"absmiddle\" class=\"sys-form-button-icon\" src=\"/epi3/app_entities/entities/icons/24x24/data_down.png\">Download</button>\n\n</div>\n</div>\n</div>\n</div>\n</div>\n \n</div>\n \n</div>\n</form>\n\n\n<div id=\"cboxOverlay\" style=\"display: none;\"></div><div id=\"colorbox\" class=\"\" style=\"display: none;\"><div id=\"cboxWrapper\"><div><div id=\"cboxTopLeft\" style=\"float: left;\"></div><div id=\"cboxTopCenter\" style=\"float: left;\"></div><div id=\"cboxTopRight\" style=\"float: left;\"></div></div><div style=\"clear: left;\"><div id=\"cboxMiddleLeft\" style=\"float: left;\"></div><div id=\"cboxContent\" style=\"float: left;\"><div id=\"cboxLoadedContent\" style=\"width: 0px; height: 0px; overflow: hidden; float: left;\"></div><div id=\"cboxLoadingOverlay\" style=\"float: left;\"></div><div id=\"cboxLoadingGraphic\" style=\"float: left;\"></div><div id=\"cboxTitle\" style=\"float: left;\"></div><div id=\"cboxCurrent\" style=\"float: left;\"></div><div id=\"cboxNext\" style=\"float: left;\"></div><div id=\"cboxPrevious\" style=\"float: left;\"></div><div id=\"cboxSlideshow\" style=\"float: left;\"></div><div id=\"cboxClose\" style=\"float: left;\"></div></div><div id=\"cboxMiddleRight\" style=\"float: left;\"></div></div><div style=\"clear: left;\"><div id=\"cboxBottomLeft\" style=\"float: left;\"></div><div id=\"cboxBottomCenter\" style=\"float: left;\"></div><div id=\"cboxBottomRight\" style=\"float: left;\"></div></div></div><div style=\"position: absolute; width: 9999px; visibility: hidden; display: none;\"></div></div></body></html>
Any thoughts?
Was also wondering what could be the reason that using wdman::phantomjs instead of wdman::chrome doesn't appear to work (links not followed etc). Is that common, and could that be fixed? As phantomjs is easier to install than chrome within RSelenium (no annoying version conflicts as with chrome)...
> sessionInfo()
R version 4.2.0 (2022-04-22 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 22000)
Matrix products: default
locale:
[1] LC_COLLATE=English_Belgium.utf8 LC_CTYPE=English_Belgium.utf8
[3] LC_MONETARY=English_Belgium.utf8 LC_NUMERIC=C
[5] LC_TIME=English_Belgium.utf8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] RSelenium_1.7.7
loaded via a namespace (and not attached):
[1] Rcpp_1.0.8.3 XML_3.99-0.10 binman_0.1.2 ps_1.7.0
[5] assertthat_0.2.1 bitops_1.0-7 rappdirs_0.3.3 R6_2.5.1
[9] jsonlite_1.8.0 semver_0.2.0 httr_1.4.3 curl_4.3.2
[13] tools_4.2.0 wdman_0.2.5 yaml_2.3.5 compiler_4.2.0
[17] processx_3.6.0 askpass_1.1 caTools_1.18.2 openssl_2.0.2
Found the mistake in the end, the following code below works, either headless or not (obtained by unmarking # '--headless', below). Using the phantomjs browser instead didn't work, but apparently that's common, and found a good way to avoid the chromedriver version conflicts.
# note: set GISAID access credentials first using ####
# Sys.setenv(GISAIDR_USERNAME = "XXX") # GISAID username
# Sys.setenv(GISAIDR_PASSWORD = "XXX") # GISAID password
# put these commands in a file set_GISAID_credentials.R
# and then source that one first
# source(".//set_GISAID_credentials.R")
library(locatexec)
download_GISAD_meta = function(target_dir = getwd(),
clean_up = FALSE,
headless = FALSE,
genom_epidem = FALSE, # if TRUE use Genomic Epidemiology Metadata package download, else use regular Metadata package download
chromedriver_version = as.character(unlist(binman::list_versions("chromedriver")))[grepl(as.character(locatexec::exec_version("chrome")[[1, 1]]), as.character(unlist(binman::list_versions("chromedriver"))))][[1]], # or specify correct version, e.g. "104.0.5112.79"
usr = Sys.getenv("GISAIDR_USERNAME"),
psw = Sys.getenv("GISAIDR_PASSWORD")) {
# TO DO: also implement arguments clean_up=TRUE to delete downloaded file (default best set to FALSE though)
# and get_sequence=TRUE to also download FASTA with sequences & add those to outputted dataframe
require(RSelenium)
require(readr)
require(archive)
require(data.table)
require(dplyr)
if (!dir.exists(target_dir)) dir.create(target_dir)
arg = c('--no-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
if (headless) { arg = c('--headless', arg) } # for headless operation
eCaps = list(chromeOptions = list(
args = arg,
prefs = list(
"profile.default_content_settings.popups" = 0L,
"profile.default_content_setting_values.automatic_downloads" = 1L,
"download.prompt_for_download" = FALSE,
"download.directory_upgrade" = TRUE,
"safebrowsing.enabled" = TRUE,
"safebrowsing.disable_download_protection" = TRUE,
"useAutomationExtension" = FALSE,
"default_directory" = normalizePath(target_dir)
)
))
browser = wdman::chrome(port = 4570L, version = chromedriver_version, check = TRUE)
remDr = remoteDriver(port = 4570L,
version=chromedriver_version,
browserName = "chrome",
extraCapabilities = eCaps)
remDr$open()
# note: if you get a message This version of ChromeDriver only supports Chrome version xxx
# then make sure that you have specified the right chromedriver version
# (check if installed chrome version matches some version available under binman/binman_chromedriver/XXX which ones get installed)
# it may be necessary to downgrade your Chrome browser to version xxx using instructions at
# https://browserhow.com/how-to-downgrade-and-install-older-version-of-chrome/#download-the-older-chrome-version
# download browser install from https://filehippo.com/download_google-chrome/history/
# and disable chrome updates
# code below needed in headless mode
# https://stackoverflow.com/questions/35504731/specify-download-folder-in-rselenium
remDr$queryRD(
ipAddr = paste0(remDr$serverURL, "/session/", remDr$sessionInfo[["id"]], "/chromium/send_command"),
method = "POST",
qdata = list(
cmd = "Page.setDownloadBehavior",
params = list(
behavior = "allow",
downloadPath = normalizePath(target_dir)
)
)
)
if (genom_epidem) { message("Downloading GISAID genomic epidemiology metadata...") } else { message("Downloading GISAID metadata...") }
remDr$navigate("https://www.epicov.org/epi3/start")
remDr$setImplicitWaitTimeout(milliseconds = 100)
# enter credentials
username = NULL
while (length(username)==0) username = remDr$findElement(using = "xpath", "//input[#id='elogin']")
username$sendKeysToElement(list(usr))
password = remDr$findElement(using = "xpath", "//input[#id='epassword']")
password$sendKeysToElement(list(psw))
# click Login buttom
login_button = remDr$findElement(using = "xpath", "//input[#value='Login']")
login_button$clickElement()
epicov_tab = NULL
while (length(epicov_tab)==0) {
suppressMessages(tryCatch({
epicov_tab = remDr$findElement("xpath", "//a[contains(text(),'EpiCoV™')]")
}, error = function( err ) { epicov_tab = NULL })) }
epicov_tab$click()
downloads_tab = NULL
while (length(downloads_tab)==0) downloads_tab = remDr$findElements("class", "sys-actionbar-action-ni")[[3]]
downloads_tab$clickElement()
# switch to right frame
frames = NULL
while (length(frames)==0) frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
# available download buttons
download_buttons = NULL
while (length(download_buttons)==0) download_buttons = remDr$findElements("class", "kachel75")
# length(download_buttons) # 26 downloads available in total
# sapply(download_buttons, function(d) d$findChildElement("class", "downicon"))
downicons = remDr$findElements("class", "downicon")
downicons_titles = unlist(sapply(downicons, function (d) d$getElementAttribute('title')))
linkicons = remDr$findElements("class", "linkicon")
linkicons_titles = unlist(sapply(linkicons, function (d) d$getElementAttribute('title')))
# TO DO: figure out all titles, in the order in which they appear on the Downloads page
# all_titles = XXX
download_nr_gisaid_meta = which(grepl("TSV-File", downicons_titles))+
sum(linkicons_titles %in% c("Audacity files archive","LANL"))
download_nr_gisaid_genom_epidem_meta = which(grepl("metadata_", downicons_titles)) +
sum(linkicons_titles %in% c("Audacity files archive",
"LANL",
"FASTA and Metadata per clade",
"FASTA and Metadata per lineage",
"Global Phylogeny",
"Select input for the Augur pipeline"))
# TO DO : get titles of <div class="downicon" onclick="sys.call('c_rg1kzy_13e','DownloadFile',new Object({'id':'gisaid:metadata_tsv.tar.xz'}));" title="TSV-File (2022-08-01)">
# <img src="/epi3/app_entities/entities/corona2020/download_other2.png"><div>metadata</div>
# </div>
# TO DO: check version available for download & if already downloaded don't download it again
# allow downloading FASTA or other available download packages
# DOWNLOAD PATIENT METADATA (EITHER REGULAR GISAID METADATA OR GENOMIC EPIDEMIOLOGY METADATA)
download_nr = ifelse(genom_epidem, download_nr_gisaid_genom_epidem_meta, download_nr_gisaid_meta)
metadata_button = download_buttons[[download_nr]] # patient metadata
# TO DO: check available version & if that file is already present in target_dir don't bother downloading it again
metadata_button$clickElement()
# CLICK CHECKBOX NOTICE AND REMINDER OF TERMS OF USE & PRESS DOWNLOAD (THIS ONE DOES NOT ALWAYS SHOW UP)
frames = NULL
while (length(frames)==0) frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
checkbox_iagree=NULL
while (length(checkbox_iagree)==0) checkbox_iagree = remDr$findElements("class", "sys-event-hook")[[1]]
checkbox_iagree$clickElement()
Sys.sleep(3)
download_button = NULL
while (length(download_button)==0) download_button = remDr$findElements("class", "sys-form-button")[[2]]
suppressMessages(tryCatch({
download_button$clickElement()
}, error = function( err ) { message("") }))
# wait until download finishes
while (length(list.files(target_dir, pattern="crdownload", full.names=T))>=1) {
Sys.sleep(1)
}
remDr$close()
browser$stop()
remDr$quit()
if (genom_epidem) pat = ".tsv.gz" else pat = ".tar.xz"
df = file.info(list.files(target_dir, pattern=pat, full.names = T))
download = gsub(paste0(target_dir,"/"), "", rownames(df)[which.max(df$mtime)])
message(paste0("Downloaded GISAID metadata file version ", download))
message(paste0("Reading GISAID metadata file version ", download))
if (!genom_epidem) { output = read_tsv( # we directly read from archive
archive_read(file.path(target_dir,download), file=2),
col_types = cols(.default = "c"))
# PS fread is slightly faster (multicore), but requires file to be unzipped first
# system.time(archive_extract(archive=file.path(target_dir,download),
# dir=target_dir)) # 33s
# system.time(GISAID <- fread(file.path(target_dir,"metadata.tsv")))
colnames(output) = gsub("-", "_", gsub("?", "", gsub(" ", "_", tolower(colnames(output))), fixed=T), fixed=T)
output$pango_lineage = gsub(" (marker override based on Emerging Variants AA substitutions)", "", output$pango_lineage, fixed=T)
# for me the regular GISAID metadata package download then has the following column names:
# colnames(output)
# [1] "virus_name" "type"
# [3] "accession_id" "collection_date"
# [5] "location" "additional_location_information"
# [7] "sequence_length" "host"
# [9] "patient_age" "gender"
# [11] "clade" "pango_lineage"
# [13] "pangolin_version" "variant"
# [15] "aa_substitutions" "submission_date"
# [17] "is_reference" "is_complete"
# [19] "is_high_coverage" "is_low_coverage"
# [21] "n_content" "gc_content"
} else {
# system.time(output <- read_tsv( # we directly read from archive
# gzfile(file.path(target_dir,download)),
# col_types = cols(.default = "c"))) # 61s
system.time(output <- tibble(fread( # we directly read from archive, fread 2x faster than read_tsv
file.path(target_dir,download),
colClasses = c("character")))) # 28s
colnames(output) = tolower(colnames(output))
# the GISAID genomic epidemiology metadata package download has the following column names:
# TO DO: make some names consistent across metadata files?
# colnames(output)
# [1] "strain" "virus" "gisaid_epi_isl" "genbank_accession"
# [5] "date" "region" "country" "division"
# [9] "location" "region_exposure" "country_exposure" "division_exposure"
# [13] "segment" "length" "host" "age"
# [17] "sex" "nextstrain_clade" "pango_lineage" "gisaid_clade"
# [21] "originating_lab" "submitting_lab" "authors" "url"
# [25] "title" "paper_url" "date_submitted" "purpose_of_sequencing"
}
if (clean_up) unlink(file.path(target_dir,download))
return(output)
}
# example
# GISAID = download_GISAD_meta(target_dir = "C:/TEMP")
This download package will not contain the records uploaded over the last few days.
You can check the maximum submission date in the download package using
GISAID_max_submdate = as.Date(max(GISAID$submission_date, na.rm=T))
GISAID_max_submdate
and get the GISAID access IDs of the recently uploaded records using
library(devtools)
devtools::install_github("Wytamma/GISAIDR")
library(GISAIDR)
credentials = login(username = Sys.getenv("GISAIDR_USERNAME"),
password = Sys.getenv("GISAIDR_PASSWORD"),
database = "EpiCoV")
# dataframe with recently uploaded records
# (not included in metadata download package)
recent_records = as.vector(query(
credentials = credentials,
from_subm = as.character(GISAID_max_submdate),
to_subm = as.character(as.Date(Sys.time())+1),
fast = TRUE
))$accession_id
length(recent_records)
recent_records = recent_records[!recent_records %in% GISAID$accession_id] # do not keep record IDs already present in download package
length(recent_records)
These extra records you can then download using
d_extra = download_GISAID_records(accession_ids = recent_records,
get_sequence=FALSE,
clean_up=FALSE, target_dir=getwd(),
max_batch_size=10000, # maximum batch size, usually either 10000 or 5000
headless = FALSE,
usr=Sys.getenv("GISAIDR_USERNAME"),
psw=Sys.getenv("GISAIDR_PASSWORD"))
And you can merge the GISAID download package & recently submitted records using :
library(dplyr)
GISAID = dplyr::bind_rows(GISAID, d_extra)
This is using these couple of functions:
library(locatexec)
# function to download given records, after having logged in first
# (max batch of 10 000 records at a time)
downl_records = function (accession_ids,
get_sequence=FALSE,
clean_up=FALSE,
target_dir,
remDr=remDr) {
require(readr)
require(dplyr)
require(stringr)
require(dplyr)
remDr$refresh()
# click Select tab at the bottom
select_tab = NULL
while(length(select_tab)==0) select_tab = remDr$findElements("class", "sys-form-button")[[2]]
select_tab$clickElement()
# enter desired GISAID access nrs in input field
frames=NULL
while (length(frames)==0) frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
# write accession IDs to file & select them via Choose file...
IDfile = file.path(target_dir,"IDs.csv")
write.csv(accession_ids, IDfile, row.names=F) # make temporary file with IDs
# clear input field
input_field = remDr$findElements("class", "sys-form-fi-multiline")[[2]]
input_field$clickElement()
input_field$clearElement()
remDr$refresh()
frames=NULL
while (length(frames)==0) frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
frames=NULL
while (length(frames)==0) frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
file_field = remDr$findElements("name", "data")[[1]]
file_field$sendKeysToElement(list(IDfile))
unlink(IDfile) # remove temporary file with IDs again
Sys.sleep(1)
# click OK
remDr$refresh()
frames=NULL
while (length(frames)==0) frames = remDr$findElements("tag name", "iframe")
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
OK_button = remDr$findElements("class", "sys-form-button")[[2]]
OK_button$clickElement()
Sys.sleep(1) # TO DO change to while loop
# click OK again to Message XXX entries selected
buttonid = ""
while (buttonid=="") {
html = remDr$getPageSource()[[1]]
buttonid = str_extract(html, "(?<=button id=\")[0-9]*") # button id appears dynamic
# print(buttonid)
}
OK_button = remDr$findElement("id", buttonid)
OK_button$clickElement()
# click Download at the bottom
remDr$refresh()
download_tab = remDr$findElements("class", "sys-form-button")[[4]]
download_tab$clickElement()
# Sys.sleep(20) # TO DO change to while loop
# select Patient status metadata checkbox
frames=NULL
while (length(frames)==0) {
suppressMessages(tryCatch({
frames = remDr$findElements("tag name", "iframe")
}, error = function( err ) { frames = NULL }))
}
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
Sys.sleep(1)
checkbox_metadata = NULL
while (length(checkbox_metadata)==0) {
suppressMessages(tryCatch({
checkbox_metadata = remDr$findElements("class", "sys-event-hook")[[4]]
}, error = function( err ) { checkbox_metadata = NULL }))
}
checkbox_metadata$clickElement()
# click Download
mostrecenttsv = suppressWarnings(max(file.info(list.files(target_dir, pattern=".tsv", full.names=T))$mtime))
Sys.sleep(1) # TO DO change to while loop
mostrecenttsv_new = mostrecenttsv
download_button = NULL
while (length(download_button)==0) {
suppressMessages(tryCatch({
download_button = remDr$findElements("class", "sys-form-button")[[2]]
download_button$clickElement()
}, error = function( err ) { download_button = NULL }))
}
# CLICK CHECKBOX NOTICE AND REMINDER OF TERMS OF USE & PRESS DOWNLOAD (THIS ONE DOES NOT ALWAYS SHOW UP)
frames=NULL
while (length(frames)==0) {
suppressMessages(tryCatch({
frames = remDr$findElements("tag name", "iframe")
}, error = function( err ) { frames = NULL }))
}
remDr$switchToFrame(frames[[1]])
remDr$setImplicitWaitTimeout(milliseconds = 10)
checkbox_iagree=NULL
while (length(checkbox_iagree)==0) checkbox_iagree = remDr$findElements("class", "sys-event-hook")[[1]]
checkbox_iagree$clickElement()
Sys.sleep(3)
download_button = NULL
while (length(download_button)==0) download_button = remDr$findElements("class", "sys-form-button")[[2]]
suppressMessages(tryCatch({
download_button$clickElement()
}, error = function( err ) { message("") }))
Sys.sleep(1)
# wait until download finishes
while (mostrecenttsv_new==mostrecenttsv) {
mostrecenttsv_new = suppressWarnings(max(file.info(list.files(target_dir, pattern=".tsv", full.names=T))$mtime))
Sys.sleep(1)
}
df = file.info(list.files(target_dir, pattern=".tsv", full.names = T))
download = rownames(df)[which.max(df$mtime)]
message(paste0("Downloaded GISAID metadata file ", download))
# read in .tsv file download
output = read_tsv(download,
col_types = cols(.default = "c"))
colnames(output) = gsub(" ", "_", tolower(colnames(output)))
colnames(output)[which(colnames(output) %in% c("lineage"))] = "pango_lineage" # code lineage as pango_lineage as in GISAID metadata download package
output$pango_lineage = gsub(" (marker override based on Emerging Variants AA substitutions)", "", output$pango_lineage, fixed=T)
if (clean_up) unlink(download)
return(output)
}
# function to download given records
# (will be split in batches of max 10 000 records each)
download_GISAID_records = function(
accession_ids,
get_sequence=FALSE,
clean_up=FALSE,
target_dir=getwd(),
max_batch_size=10000, # maximum batch size
headless = FALSE,
chromedriver_version = as.character(unlist(binman::list_versions("chromedriver")))[grepl(as.character(locatexec::exec_version("chrome")[[1, 1]]), as.character(unlist(binman::list_versions("chromedriver"))))][[1]], # or specify correct version e.g. "104.0.5112.79"
usr=Sys.getenv("GISAIDR_USERNAME"),
psw=Sys.getenv("GISAIDR_PASSWORD")) {
require(RSelenium)
if (!dir.exists(target_dir)) dir.create(target_dir)
arg = c('--no-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)
if (headless) { arg = c('--headless', arg) } # for headless operation
eCaps = list(chromeOptions = list(
args = c(#'--headless', # for headless operation
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
),
prefs = list(
"profile.default_content_settings.popups" = 0L,
"profile.default_content_setting_values.automatic_downloads" = 1L,
"download.prompt_for_download" = FALSE,
"download.directory_upgrade" = TRUE,
"safebrowsing.enabled" = TRUE,
"safebrowsing.disable_download_protection" = TRUE,
"useAutomationExtension" = FALSE,
"default_directory" = normalizePath(target_dir)
)
))
browser = wdman::chrome(port = 4570L, version = chromedriver_version, check = TRUE)
remDr = remoteDriver(port = 4570L,
version = chromedriver_version,
browserName = "chrome",
extraCapabilities = eCaps)
remDr$open()
# code below needed for headless operation
# https://stackoverflow.com/questions/35504731/specify-download-folder-in-rselenium
remDr$queryRD(
ipAddr = paste0(remDr$serverURL, "/session/", remDr$sessionInfo[["id"]], "/chromium/send_command"),
method = "POST",
qdata = list(
cmd = "Page.setDownloadBehavior",
params = list(
behavior = "allow",
downloadPath = normalizePath(target_dir)
)
)
)
message("Downloading GISAID records...")
remDr$navigate("https://www.epicov.org/epi3/start")
remDr$setImplicitWaitTimeout(milliseconds = 100)
# enter credentials
username = NULL
while (length(username)==0) { username = remDr$findElement(using = "xpath", "//input[#id='elogin']") }
username$sendKeysToElement(list(usr))
password = remDr$findElement(using = "xpath", "//input[#id='epassword']")
password$sendKeysToElement(list(psw))
# click Login buttom
login_button = remDr$findElement(using = "xpath", "//input[#value='Login']")
login_button$clickElement()
# click EpiCov tab
epicov_tab = NULL
while (length(epicov_tab)==0) {
suppressMessages(tryCatch({
epicov_tab = remDr$findElement("xpath", "//a[contains(text(),'EpiCoV™')]")
}, error = function( err ) { epicov_tab = NULL })) }
epicov_tab$click()
# click Search tab
search_tab = NULL
while (length(search_tab)==0) search_tab = remDr$findElements("class", "sys-actionbar-action-ni")[[2]]
search_tab$clickElement()
# Sys.sleep(10) # TO DO add while loop in downl_records
# download records in batches of maximum size max_batch_size
# function to split vector in chunks of max size chunk_length
chunk = function(x, chunk_length=max_batch_size) split(x, ceiling(seq_along(x)/chunk_length))
batches = chunk(accession_ids)
downloads = do.call(bind_rows, lapply(1:length(batches),
function (batchnr) {
message(paste0("Downloading batch ", batchnr, " out of ", length(batches)))
Sys.sleep(1)
output = downl_records(accession_ids=batches[[batchnr]],
get_sequence,
clean_up,
target_dir,
remDr)
return(output) } ))
remDr$close()
browser$stop()
remDr$quit()
return(downloads)
}