Parallelizing a loop with updating during each iteration - r

I have some R code that puts together demographic data from the Census for all of the states in the US into a list object. The block-level code can take a week to run as a sequential loop since there are ~11M blocks, so I am trying to parallelize the loop over states to make it faster. I have accomplished this with the following:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
plan(multiprocess)
ptm <- proc.time()
CensusObj_block_age_sex = list()
CensusObj_block_age_sex[states] <- future_lapply(states, function(s){
  county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
  tract <- census_geo_api(key = "XXX", state = s, geo = "tract", age = TRUE, sex = TRUE)
  block <- census_geo_api(key = "XXX", state = s, geo = "block", age = TRUE, sex = TRUE)
  censusObj[[s]] <- list(state = s, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
})
However, I need to make it more robust. Sometimes there are problems with the Census API, so I would like CensusObj to be updated at each state iteration so that I don't lose my completed data if something goes wrong. That way I can restart the loop over the remaining states if something does go wrong (like if I spell "WY" as "WU").
Would it be possible to accomplish this somehow? I am open to other methods of parallelization.
The code above runs, but it seems to run into memory issues:
Error: Failed to retrieve the value of MultisessionFuture (future_lapply-3) from cluster RichSOCKnode #3 (PID 80363 on localhost ‘localhost’). The reason reported was ‘vector memory exhausted (limit reached?)’. Post-mortem diagnostic: A process with this PID exists, which suggests that the localhost worker is still alive.
I have R_MAX_VSIZE = 8Gb in my .Renviron, but I am not sure how that would get divided between the 8 cores on my machine. This all suggests that I need to store the results of each iteration rather than try to keep it all in memory, and then append the objects together at the end.
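For reference, this is roughly the checkpointing pattern I have in mind, continuing from the setup above (just a sketch: census_geo_api() and the key are as before, and the checkpoint directory name is only an example):
dir.create("census_checkpoints", showWarnings = FALSE)
# states that already have a saved result can be skipped on a re-run
done <- sub("\\.rds$", "", list.files("census_checkpoints", pattern = "\\.rds$"))
pending <- setdiff(states, done)
invisible(future_lapply(pending, function(s) {
  county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
  tract <- census_geo_api(key = "XXX", state = s, geo = "tract", age = TRUE, sex = TRUE)
  block <- census_geo_api(key = "XXX", state = s, geo = "block", age = TRUE, sex = TRUE)
  saveRDS(list(state = s, age = TRUE, sex = TRUE, block = block, tract = tract, county = county),
          file.path("census_checkpoints", paste0(s, ".rds")))
  NULL  # return nothing so the main session does not accumulate large objects
}))
# once everything has run, rebuild the full list from the saved files
CensusObj_block_age_sex <- setNames(
  lapply(states, function(s) readRDS(file.path("census_checkpoints", paste0(s, ".rds")))),
  states)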

Here is a solution that uses doParallel (set up here with the options for UNIX systems, but you can also use it on Windows; see here) and foreach. It stores the results for every state separately, then reads the individual files back in and combines them into a list.
library(doParallel)
library(foreach)
path_results <- "my_path"
ncpus = 8L
registerDoParallel(cores = ncpus)
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
results <- foreach(state = states) %dopar% {
  county <- census_geo_api(key = "XXX", state = state, geo = "county", age = TRUE, sex = TRUE)
  tract <- census_geo_api(key = "XXX", state = state, geo = "tract", age = TRUE, sex = TRUE)
  block <- census_geo_api(key = "XXX", state = state, geo = "block", age = TRUE, sex = TRUE)
  results <- list(state = state, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  # store the results as rds
  saveRDS(results,
          file = paste0(path_results, "/", state, ".Rds"))
  # remove the results from worker memory
  rm(county, tract, block, results)
  gc()
  # just return a string
  paste0("done with ", state)
}
library(purrr)
# combine the results into a named list; build each file name from the state
# code so the names stay matched to the right files
CensusObj_block_age_sex <- set_names(states) %>%
  map(~ readRDS(file = paste0(path_results, "/", .x, ".Rds")))
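Because every state is written to its own .Rds file, a re-run after a crash or an API failure can skip the states that already finished. A small sketch, assuming the same path_results:
# states that already have a result file on disk
done_states <- sub("\\.Rds$", "", list.files(path_results, pattern = "\\.Rds$"))
# states still left to process
pending_states <- setdiff(states, done_states)
Then run the same foreach loop over pending_states instead of states.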

You could use a tryCatch inside future_lapply to relaunch the calculation in case of an API error, up to a maximum of maxtrials attempts.
In the resulting list, you get for each calculation the number of trials and the final status, OK or Error:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
#> Loading required package: future
plan(multiprocess)
ptm <- proc.time()
maxtrials <- 3
# stub version of census_geo_api used to make this example reproducible
census_geo_api <- function(key = "XXX",
                           state = s,
                           geo = "county",
                           age = TRUE,
                           sex = TRUE) {
  paste(state, '-', geo)
}
CensusObj_block_age_sex <- future_lapply(states, function(s) {
  ntrials <- 1
  while (ntrials <= maxtrials) {
    hasError <- tryCatch({
      # simulate a random API error
      if (runif(1) > 0.3) { stop("API failed") }
      county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
      tract <- census_geo_api(key = "XXX", state = s, geo = "tract", age = TRUE, sex = TRUE)
      block <- census_geo_api(key = "XXX", state = s, geo = "block", age = TRUE, sex = TRUE)
    },
    error = function(e) e)
    if (inherits(hasError, "error")) {
      ntrials <- ntrials + 1
    } else { break }
  }
  if (ntrials > maxtrials) {
    res <- list(state = s, status = 'Error', ntrials = ntrials - 1, age = NA, sex = NA, block = NA, tract = NA, county = NA)
  } else {
    res <- list(state = s, status = 'OK', ntrials = ntrials, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  }
  res
})
CensusObj_block_age_sex[[1]]
#> $state
#> [1] "AL"
#>
#> $status
#> [1] "OK"
#>
#> $ntrials
#> [1] 3
#>
#> $age
#> [1] TRUE
#>
#> $sex
#> [1] TRUE
#>
#> $block
#> [1] "AL - block"
#>
#> $tract
#> [1] "AL - tract"
#>
#> $county
#> [1] "AL - county"
Created on 2020-08-19 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)
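With the status field in place, you can also pull out the states that still failed after maxtrials and feed only those back into the same future_lapply call (a short sketch using the result list above):
# name the list by state, then keep only the entries whose status is 'Error'
names(CensusObj_block_age_sex) <- vapply(CensusObj_block_age_sex, `[[`, character(1), "state")
failed_states <- names(Filter(function(x) x$status == "Error", CensusObj_block_age_sex))
failed_states
Re-running future_lapply over failed_states and replacing those elements of CensusObj_block_age_sex then completes the list without redoing the states that succeeded.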

One possible solution is to log the value of CensusObj to a text file, i.e. print CensusObj in each iteration. The doSNOW package can be used for logging, for example:
library(doSNOW)
cl <- makeCluster(1, outfile="abc.out")
registerDoSNOW(cl)
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
foreach(i = 1:length(states), .combine = rbind, .inorder = TRUE) %dopar% {
  county <- "A"
  tract <- "B"
  block <- "C"
  censusObj <- data.frame(state = states[i], age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  # edit: print json objects to easily extract from the file
  cat(sprintf("%s\n", rjson::toJSON(censusObj)))
}
stopCluster(cl)
This logs the value of censusObj to abc.out, and it also logs the error if the program crashes, so you still have the latest value of censusObj recorded in abc.out.
Here is the output of the last iteration from the log file:
Type: EXEC {"state":"PR","age":true,"sex":true,"block":"C","tract":"B","county":"A"} Type: DONE
Type:EXEC means that the iteration has started and Type:DONE means execution is completed. The result of cat will be present between these two statements of each iteration. Now, the value of CensusObj can be extracted from the log file as shown below:
Lines = readLines("abc.out")
results = list()
for (i in Lines) {
  # skip processing logs created by doSNOW
  if (!startsWith(i, "starting") && !startsWith(i, "Type:")) {
    results = rlist::list.append(results, jsonlite::fromJSON(i))
  }
}
results will contain all the values printed to abc.out.
> head(results, 1)
[[1]]
[[1]]$state
[1] "AL"
[[1]]$age
[1] TRUE
[[1]]$sex
[1] TRUE
[[1]]$block
[1] "C"
[[1]]$tract
[1] "B"
[[1]]$county
[1] "A"
It is not a very clean solution, but it works.
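If a single data frame is more convenient than a list, the parsed records can be bound together afterwards (assuming every logged line parsed to a flat one-row record, as above):
census_df <- do.call(rbind, lapply(results, as.data.frame))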

Related

See unmatched countries for joinCountryData2Map in rworldmap?

I'm using the joinCountryData2Map function in rworldmap to match my data to the countries in the world map.
I get this result:
230 codes from your data successfully matched countries in the map
11 codes from your data failed to match with a country code in the map
11 codes from the map weren't represented in your data
I cannot figure out how to view those two lists of 11 countries. I am guessing that those 11 countries have issues with their ISO2 codes that I need to correct, but am not sure which ones to check without being able to view those two lists.
I'm guessing there's a solution along the lines of just View(SomeObject$Countries) but I haven't been able to find anything that works.
Set joinCountryData2Map(...,verbose=TRUE) to print the names of the countries that failed to match in the console.
From the FAQ: "You can see that a summary of how many countries are successfully joined is output to the console. You can specify verbose=TRUE to get a full list of countries"
library(rworldmap)
data(countryExData)
# Set Angola to fail
countryExData[countryExData$ISO3V10 == "AGO", "ISO3V10"] <- "AGO_FAIL"
# Attempt to join
# With verbose=TRUE, failed joins (ie Angola) are printed in the console
sPDF <- joinCountryData2Map(
countryExData[,c("ISO3V10", "Country")],
joinCode = "ISO3",
nameJoinColumn = "ISO3V10",
verbose = TRUE)
# > 148 codes from your data successfully matched countries in the map
# > 1 codes from your data failed to match with a country code in the map
# > failedCodes failedCountries
# > [1,] "AGO_FAIL" "Angola"
# > 95 codes from the map weren't represented in your data
But what if you want to get the information on failed joins programmatically? I may have missed something, but I don't see an option for that (i.e., str(sPDF) or function arguments). However, looking at the internals of joinCountryData2Map(), the object failedCountries contains the info you want, so it should be easy enough to include it in the returned object.
Here's how you could modify joinCountryData2Map() to return a list with two elements: the first element is the default object, and the second element is failedCountries.
# Modify the function to return the failed joins in the environment
joinCountryData2Map_wfails <- function(
dF, joinCode = "ISO3", nameJoinColumn = "ISO3V10",
nameCountryColumn = "Country", suggestForFailedCodes = FALSE,
mapResolution = "coarse", projection = NA, verbose = FALSE) {
# Retain successful join as first element and failed join as second element
ll <- list() # MODIFIED
mapWithData <- getMap(resolution = mapResolution)
if (!is.na(projection))
warning("the projection argument has been deprecated, returning Lat Lon, use spTransform from package rgdal as shown in help details or the FAQ")
listJoinCodesNew <- c("ISO_A2", "ISO_A3", "FIPS_10_",
"ADMIN", "ISO_N3")
listJoinCodesOld <- c("ISO2", "ISO3", "FIPS",
"NAME", "UN")
listJoinCodes <- c(listJoinCodesOld, listJoinCodesNew)
if (joinCode %in% listJoinCodes == FALSE) {
stop("your joinCode (", joinCode, ") in joinCountryData2Map() is not one of those supported. Options are :",
paste(listJoinCodes, ""), "\n")
return(FALSE)
}
joinCodeOld <- joinCode
if (joinCode %in% listJoinCodesOld) {
joinCode <- listJoinCodesNew[match(joinCode, listJoinCodesOld)]
}
if (is.na(match(nameJoinColumn, names(dF)))) {
stop("your chosen nameJoinColumn :'", nameJoinColumn,
"' seems not to exist in your data, columns = ",
paste(names(dF), ""))
return(FALSE)
}
dF[[joinCode]] <- as.character(dF[[nameJoinColumn]])
dF[[joinCode]] <- gsub("[[:space:]]*$", "", dF[[joinCode]])
if (joinCode == "ADMIN") {
dF$ISO3 <- NA
for (i in 1:nrow(dF)) dF$ISO3[i] = rwmGetISO3(dF[[joinCode]][i])
joinCode = "ISO3"
nameCountryColumn = nameJoinColumn
}
matchPosnsInLookup <- match(as.character(dF[[joinCode]]),
                            as.character(mapWithData@data[[joinCode]]))
failedCodes <- dF[[joinCode]][is.na(matchPosnsInLookup)]
numFailedCodes <- length(failedCodes)
numMatchedCountries <- nrow(dF) - numFailedCodes
cat(numMatchedCountries, "codes from your data successfully matched countries in the map\n")
failedCountries <- dF[[nameCountryColumn]][is.na(matchPosnsInLookup)]
failedCountries <- cbind(failedCodes, failedCountries = as.character(failedCountries))
cat(numFailedCodes, "codes from your data failed to match with a country code in the map\n")
if (verbose)
print(failedCountries)
matchPosnsInUserData <- match(as.character(mapWithData@data[[joinCode]]),
                              as.character(dF[[joinCode]]))
codesMissingFromUserData <- as.character(mapWithData@data[[joinCode]][is.na(matchPosnsInUserData)])
countriesMissingFromUserData <- as.character(mapWithData@data[["NAME"]][is.na(matchPosnsInUserData)])
numMissingCodes <- length(codesMissingFromUserData)
cat(numMissingCodes, "codes from the map weren't represented in your data\n")
mapWithData@data <- cbind(mapWithData@data, dF[matchPosnsInUserData, ])
invisible(mapWithData)
ll[[1]] <- mapWithData # MODIFIED
ll[[2]] <- failedCountries # MODIFIED
return(ll) # MODIFIED
}
Usage:
sPDF_wfails <- joinCountryData2Map_wfails(
countryExData[,c("ISO3V10", "Country")],
joinCode = "ISO3",
nameJoinColumn = "ISO3V10",
verbose = TRUE)
# This is the result of the original function
# sPDF_wfails[[1]]
# This is info on the failed joins
sPDF_wfails[[2]]
# > failedCodes failedCountries
# > [1,] "AGO_FAIL" "Angola"
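If you would rather not maintain a modified copy of the function, you can compute the same information directly by comparing your codes against the map's attribute table, which is what the function does internally. A sketch for an ISO3 join:
library(rworldmap)
m <- getMap(resolution = "coarse")
# codes in your data with no counterpart in the map
failed <- countryExData[!countryExData$ISO3V10 %in% m@data$ISO_A3, c("ISO3V10", "Country")]
# map countries not represented in your data
missing_from_data <- as.character(m@data$NAME[!m@data$ISO_A3 %in% countryExData$ISO3V10])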

Error in R code for Reddit sentiment analysis

I am trying to do a Reddit sentiment analysis in R. I cannot get past an error and I am trying to troubleshoot the problem.
# Getting Reddit Data
links<- find_thread_urls(
keywords = "Ghostbusters",
sort_by = "top",
subreddit = NA,
period = "all"
)
# function to iterate through all posts
funct = function(i){
content = get_thread_content()(links$URL[i])
com = iconv(content$comment, to = 'utf-8')
clov = get_nrc_sentiment(com)
x1 = 100*colSums(clov)/sum(clov)
return(cbind(links[i,], t(x1) ))
}
# list of all the links
ls = 1:nrow(links)
# loop through all the links and bind to a data frame
res = do.call("rbind", lapply(ls, funct))
When I run this code, I get this error:
Error in lapply(urls, parse_thread_url) :
argument "urls" is missing, with no default
What am I missing here?
There were a few typos in the code:
library(RedditExtractoR)  # find_thread_urls(), get_thread_content()
library(syuzhet)          # get_nrc_sentiment()
links = find_thread_urls(
  keywords = "Ghostbusters",
  sort_by = "top",
  subreddit = NA,
  period = "all"
)
x0 = vector()
funct = function(i){
  content = get_thread_content(links$url[i])
  com = iconv(content$comment, to = 'utf-8')
  clov = get_nrc_sentiment(com)
  x1 = 100*colSums(clov)/sum(clov)
  x1 = cbind(x0, t(x1))
  return(x1)
}
ls = 1:nrow(links)
res = lapply(ls, funct)
res = do.call(rbind, res)
anger anticipation disgust fear joy sadness surprise trust negative positive
[1,] 7.276119 8.955224 6.156716 9.514925 6.156716 9.328358 3.731343 9.701493 18.28358 20.89552
[2,] 5.586592 11.731844 4.469274 6.145251 12.290503 8.379888 5.586592 14.525140 10.61453 20.67039
[3,] 7.333333 9.238095 5.238095 8.952381 8.095238 6.571429 4.952381 11.523810 16.95238 21.14286
[4,] 8.641975 6.790123 8.024691 7.098765 8.333333 8.024691 5.555556 11.111111 16.35802 20.06173
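If you also want the post metadata next to the scores, as the cbind(links[i,], t(x1)) in the original funct intended, the corrected function can return that combined row instead (a sketch that makes the same assumption as above, namely that get_thread_content() exposes the comments via content$comment):
funct_meta = function(i){
  content = get_thread_content(links$url[i])
  com = iconv(content$comment, to = 'utf-8')
  clov = get_nrc_sentiment(com)
  x1 = 100*colSums(clov)/sum(clov)
  cbind(links[i, ], t(x1))   # post metadata plus sentiment percentages
}
res_meta = do.call(rbind, lapply(1:nrow(links), funct_meta))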

%dopar%: a safe way to write to csv inside a foreach loop

[EDITED]
It is a general question: I have seen some posts saying that it is not a good idea to use write.csv inside a foreach loop, because different cores may try to write to the file at the same time, resulting in missing results. Still, I need to write to an external file inside the parallel loop to get my output (500000+ rows and 10+ columns); otherwise it crashes due to memory issues. So I would like to know if there is a safer way to write a results file within a foreach loop.
I appreciate any help on this
I am adding some more info and a much simpler code example and dataset than what I actually have.
Description: I have two different polygon layers (sf, POLYGON), each with 500000+ features. I need to calculate the area of different raster classes (1 raster layer with 3 classes) within each one of the polygons. This is the most time-consuming part of the script, specifically because I need to use sf::st_intersection multiple times. Then I use many different combinations of if-else rules to populate a df with values.
This is the original code, which runs into memory issues with the original data:
require(sf)
require(raster)
require(rgdal)
require(rgeos)
require(dplyr)
require(stars)
## Sample data
set.seed(131)
sample_raster = raster(nrows = 1, ncols = 1, res = 0.5, xmn = 0, xmx = 11, ymn = 0, ymx = 11)
values(sample_raster) = rep(1:3, length.out = ncell(sample_raster))
crs(sample_raster) = CRS('+init=EPSG:4326')
plot(sample_raster, axes=T)
sample_raster
##
m = rbind(c(0,0), c(1,0), c(1,1), c(0,1), c(0,0))
p = st_polygon(list(m))
n = 100
l = vector("list", n)
for (i in 1:n)
l[[i]] = p + 10 * runif(2)
sample_poly = st_sfc(l)
data = data.frame(PR_ID = seq(1:100),
COND1 = rep(1:10, length.out = 100))
sample_poly = st_sf(cbind(data, sample_poly))
plot(sample_poly, col = sf.colors(categorical = TRUE, alpha = .5), add=T)
sample_poly = sample_poly %>% st_set_crs(4326)
sample_poly
##
## Code
require(parallel)
require(foreach)
require(doParallel)
idall = as.character(sample_poly$PR_ID)
area = as.numeric(st_area(sample_poly))/10000
# i=1
# listID = idall
# mainpoly = sample_poly
# mainras = sample_raster
# mainpolyarea = area
per.imovel.paralallel = function (listID, mainpoly, mainras, mainpolyarea) { # Starting the function
## Setting the parallel work up into your computer
UseCores = detectCores()-1
cl = parallel::makeCluster(UseCores, output="")
doParallel::registerDoParallel(cl)
writeLines(c(""), "log.txt") # Creates a LOG FILE in the folder to follow processing
FOREACH.RESULT = foreach(i = 1:length(listID), .packages=c('raster', 'rgdal', 'rgeos', 'dplyr', 'parallel',
'doParallel', 'sf', 'stars'), .inorder = T , .combine ='rbind') %dopar%
{ # Stating the paral-loop
sink("log.txt", append=TRUE) # LOG FILE in the home folder
cat(paste(i, "of", length(listID), as.character(Sys.time()),"\n")) # Write to LOG FILE
sink() # end diversion of output
########################
### Pick one poly
px = sf::st_buffer(mainpoly[mainpoly$PR_ID == listID[i],], # Conditional to select the geometry PR_ID in position i
dist = 0.1) # buffer = 0 w/ byid, selects the geometry
########################
### Intersect with raster and get area
px2 = sf::st_buffer(px, dist = 0.1) # Buffer because raster::mask() masks out partially covered cells since it call rasterize() first
desm_prop = raster::crop(mainras, as_Spatial(px2))
desm_prop_shp = if(all(is.na(values(desm_prop)))){NULL
} else {sf::st_intersection(st_cast(sf::st_as_sf(stars::st_as_stars(desm_prop)), "POLYGON"), px)}
names(desm_prop_shp)[1] = if(any(names(desm_prop_shp) == "layer")){"values"
} else {NULL}
desm_prop_bet0108 = if(is.null(desm_prop_shp)){NULL
} else {desm_prop_shp[desm_prop_shp$values == 1, ]}
desm_prop_bet0108 = if(is.null(desm_prop_bet0108) | length(desm_prop_bet0108) == 0){NULL
} else if(length(desm_prop_bet0108$values) == 0){NULL
} else {desm_prop_bet0108}
desm_prop_after08 = if(is.null(desm_prop_shp)){NULL
} else {desm_prop_shp[desm_prop_shp$values == 2, ]}
desm_prop_after08 = if(is.null(desm_prop_after08) | length(desm_prop_after08) == 0){NULL
} else if(length(desm_prop_after08$values) == 0){NULL
} else {desm_prop_after08}
desm_prop_upto00 = if(is.null(desm_prop_shp)){NULL
} else {desm_prop_shp[desm_prop_shp$values == 3, ]}
desm_prop_upto00 = if(is.null(desm_prop_upto00) | length(desm_prop_upto00) == 0){NULL
} else if(length(desm_prop_upto00$values) == 0){NULL
} else {desm_prop_upto00}
area_desm_prop_bet0108 <- if(is.null(desm_prop_bet0108)){0
} else { sum(as.numeric(sf::st_area(desm_prop_bet0108)/10000))} # Deforestation area in PX 2001 - 2008
area_desm_prop_after08 <- if(is.null(desm_prop_after08)){0
} else { sum(as.numeric(sf::st_area(desm_prop_after08)/10000))} # Deforestation area in PX after 2008
area_desm_prop_upto00 <- if(is.null(desm_prop_upto00)){0
} else { sum(as.numeric(sf::st_area(desm_prop_upto00)/10000))} # Deforestation area in PX upto 2000
########################
# RESULTS
TEMP.RESULTS = data.frame(PR_ID = as.character(listID[i]),
PR_AREA_HA = mainpolyarea[i],
PR_D09 = area_desm_prop_after08,
PR_D0108 = area_desm_prop_bet0108,
PR_D00 = area_desm_prop_upto00)
return (TEMP.RESULTS)
} # Ending the loop
return (FOREACH.RESULT)
parallel::stopCluster(cl) # stop cluster
stopImplicitCluster() # stop cluster
gc()
} # Ending the function
#####################################################################################################
results_feach = per.imovel.paralallel (listID = idall, mainpoly = sample_poly, mainras = sample_raster, mainpolyarea = area)
warnings()
I have also tried @mischva11's suggestion (modified) by adding this:
length_of_chunk = round(length(idall)/(length(idall)/10)) # generate chunks of 10 lines
lchunks = split(idall, sort(rep_len(1:length_of_chunk, length(idall))))
for (z in 1:length_of_chunk){
# split up the data in chunks
idall_chunk = as.vector(unlist(lchunks[z]))
results_chunk = per.imovel.paralallel (listID = idall_chunk, mainpoly = sample_poly, mainras = sample_raster, mainpolyarea = area)
# save your foreach results for each chunk, append after the first one
if (z == 1) {write.table(results_chunk, file = "TESTDATAresults1.csv")
}else {write.table(results_chunk, file = "TESTDATAresults1.csv", append = TRUE, col.names = FALSE)}
print(NULL) # print(results_chunk)
}
It works like a charm for this example.
BUT, I hit a setback when running it with the real script/data: it takes ages for the foreach to close. I am watching my machine's performance and the log file: after processing all lines of my sf object, CPU usage goes down as expected, but it still takes more than 30 min (I did not wait for it to completely finish) to close the foreach function.
Because of this, I thought about writing the output on the fly inside the foreach loop, but clearly that is not a good idea, as explained here. I have seen some posts about the package 'flock', which locks the output file while writing. I have not tested it, but it sounds promising.
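For completeness, this is roughly how I imagine the flock idea would look inside the loop (an untested sketch: the placeholder TEMP.RESULTS row and the lock-file name stand in for the real ones):
library(flock)
out_csv <- "TESTDATAresults1.csv"
lock_path <- paste0(out_csv, ".lock")
FOREACH.RESULT <- foreach(i = 1:length(idall), .packages = c('sf', 'flock')) %dopar% {
  # ... build TEMP.RESULTS for polygon i exactly as in the function above ...
  TEMP.RESULTS <- data.frame(PR_ID = as.character(idall[i]))  # placeholder row
  lk <- flock::lock(lock_path)   # blocks until no other worker holds the lock
  write.table(TEMP.RESULTS, file = out_csv,
              append = file.exists(out_csv),
              col.names = !file.exists(out_csv),
              row.names = FALSE, sep = ",")
  flock::unlock(lk)
  NULL  # keep nothing in memory on the master process
}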
The problem here is that you need communication between the cores: one core has to wait for the next one until it has finished writing to the csv. That is not easily done, and as far as I know it is not possible with foreach (foreach only lets you control the ordering of the results via the .inorder argument, which is TRUE by default). You are telling us you have memory issues, so one solution is to chunk up your output if that is possible. I do not have a good dataset for this example, so I use mtcars.
library(foreach)
library(parallel)
library(doParallel)
registerDoParallel(4)
# split your output here; I use 5 chunks, my example data is mtcars
n_chunks <- 5
chunk_size <- ceiling(nrow(mtcars) / n_chunks)
for (z in 1:n_chunks) {
  # here the data gets split up
  rows <- ((z - 1) * chunk_size + 1):min(z * chunk_size, nrow(mtcars))
  data <- mtcars[rows, ]
  # foreach over the rows of this chunk
  results <- foreach(i = 1:nrow(data), .combine = rbind) %dopar% {
    # ***your code***
    y <- data[i, ]
    y
  }
  print(results)
  # save your foreach results for this chunk, then begin again with the next one
  if (z == 1) {
    write.table(results, file = "test.csv")
  } else {
    write.table(results, file = "test.csv", append = TRUE, col.names = FALSE)
  }
}

Difficulty in downloading TCGA data

I am trying to download the TCGA data but I am getting this error:
Error in summarizeMaf(maf = maf, anno = clinicalData, chatty =
verbose): Tumor_Sample_Barcode column not found in provided clinical
data. Rename column containing sample names to Tumor_Sample_Barcode if
necessary.
This is my code:
library("TCGAbiolinks")
library("tidyverse")
library(maftools)
query <- GDCquery( project = "TCGA-LIHC",
data.category = "Clinical",
file.type = "xml",
legacy = FALSE)
GDCdownload(query,directory = ".")
clinical <- GDCprepare_clinic(query, clinical.info = "patient",directory = ".")
#getting the survival time of event data
survival_data <- as_tibble(clinical[,c("days_to_last_followup","days_to_death","vital_status","bcr_patient_barcode","patient_id")])
survival_data <- filter(survival_data,!is.na(days_to_last_followup)|!is.na(days_to_death)) #not both NA
survival_data <- filter(survival_data,!is.na(days_to_last_followup)|days_to_last_followup>0 &is.na(days_to_death)|days_to_death > 0 ) #ensuring positive values
survival_data <- survival_data[!duplicated(survival_data$patient_id),] #ensuring no duplicates
dim(survival_data) #should be 371
maf <- GDCquery_Maf("LIHC", pipelines = "muse")
#maf <- GDCquery_Maf("LIHC", pipelines = "somaticsniper")
#clin <- GDCquery_clinic("TCGA-LIHC","clinical")
#print(clin )
laml = read.maf(
maf,
clinicalData = clinical,
removeDuplicatedVariants = TRUE,
useAll = TRUE,
gisticAllLesionsFile = NULL,
gisticAmpGenesFile = NULL,
gisticDelGenesFile = NULL,
gisticScoresFile = NULL,
cnLevel = "all",
cnTable = NULL,
isTCGA = TRUE,
vc_nonSyn = NULL,
verbose = TRUE
)
You should have: a) loaded maftools with library(maftools), and b) included what was printed out before that error message:
-Validating
-Silent variants: 18306
-Summarizing
--Possible FLAGS among top ten genes:
TTN
MUC16
OBSCN
FLG
-Processing clinical data
Available fields in provided annotations..
[1] "bcr_patient_barcode" "additional_studies"
[3] "tissue_source_site" "patient_id"
# snipped remaining 78 column names
Notice that the first column is not named "Tumor_Sample_Barcode", so you need to follow the helpful error message's directions and rename the appropriate column, which appears to be the first one:
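For example, a minimal way to do the rename, assuming the barcode really is in the first column of clinical as the field list above suggests:
names(clinical)[1] <- "Tumor_Sample_Barcode"
laml <- read.maf(maf, clinicalData = clinical, isTCGA = TRUE, verbose = TRUE)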
After doing so I get:
-Validating
-Silent variants: 18306
-Summarizing
--Possible FLAGS among top ten genes:
TTN
MUC16
OBSCN
FLG
-Processing clinical data
-Finished in 1.911s elapsed (2.470s cpu)

Mapdist: Error is.character(from)

My dataset includes a column "pickup" with the starting coordinates of a trip and a "dropoff" column with the ending coordinates, like:
pickup dropoff
40.77419,-73.872608 40.78055,-73.955042
40.7737,-73.870721 40.757007,-73.971953
I want to calculate the shortest route suggested by Google Maps, and saved the calculations in a new column. This is what I'm doing:
X$GoogleDist <- mapdist(from= list(X$pickup),
to = list(X$dropoff),
mode = "driving" ,
output = "simple", messaging = FALSE, sensor = FALSE,
language = "en-EN", override_limit = FALSE)
Which gives me the following error:
Error: is.character(from) is not TRUE
You could do
library(ggmap)
X <- read.table(header=TRUE, text="pickup dropoff
40.77419,-73.872608 40.78055,-73.955042
40.7737,-73.870721 40.757007,-73.971953")
X <- as.data.frame(lapply(X, function(x) sapply(as.character(x), function(y) URLencode(y, T) ) ), stringsAsFactors = F)
rownames(X) <- NULL
res <- mapdist(from= X$pickup,
to = X$dropoff,
mode = "driving" ,
output = "simple", messaging = FALSE, sensor = FALSE,
language = "en-EN", override_limit = FALSE)
cbind(X, res)
# pickup dropoff from to m km miles seconds minutes hours
# 1 40.77419%2C-73.872608 40.78055%2C-73.955042 40.77419%2C-73.872608 40.78055%2C-73.955042 12805 12.805 7.957027 1212 20.20 0.3366667
# 2 40.7737%2C-73.870721 40.757007%2C-73.971953 40.7737%2C-73.870721 40.757007%2C-73.971953 14038 14.038 8.723213 1437 23.95 0.3991667
Your columns are probably of type factor (check with str(X)). mapdist needs character vectors (check ?mapdist), so you have to convert the columns using as.character beforehand. Also, when using geo coordinates, I think you have to URL-encode them, i.e. the comma , becomes %2C. Otherwise it didn't work for me...
