How to submit a job in Spark with a netCDF data source?

I have this code:
install.packages("ncdf4")
library(ncdf4)
install.packages("tidync")
library(tidync)
pp <- tidync("~/climate_data/pp_ens_mean_0.1deg_reg_v25.0e.nc")
print(pp)
## Daily averaged sea level pressure - PP
# set path and filename
ncpath <- "~/climate_data/"
ncname <- "pp_ens_mean_0.1deg_reg_v25.0e"
ncfname <- paste(ncpath, ncname, ".nc", sep="")
dname <- "pp"
# open a netCDF file
ncin <- nc_open(ncfname)
print(ncin)
# get longitude and latitude
lon <- ncvar_get(ncin,"longitude")
nlon <- dim(lon)
head(lon)
lat <- ncvar_get(ncin,"latitude")
nlat <- dim(lat)
head(lat)
print(c(nlon,nlat))
# get time
time <- ncvar_get(ncin,"time")
time
tunits <- ncatt_get(ncin,"time","units")
nt <- dim(time)
nt
# get pressure
pp_array <- ncvar_get(ncin,dname)
dlname <- ncatt_get(ncin,dname,"long_name")
dunits <- ncatt_get(ncin,dname,"units")
fillvalue <- ncatt_get(ncin,dname,"_FillValue")
dim(pp_array)
I have a RAM issue while running pp_array <- ncvar_get(ncin, dname). I would like to submit a job to Spark so I can run it on a cluster.
I have followed the installation procedure from here: https://therinspark.com/starting.html
#installing Spark
library(sparklyr)
#spark_install()
sc <- spark_connect(master = "local")
spark_web(sc)
# Retrieve the Spark installation directory
spark_home <- spark_home_dir()
# Build paths and classes
spark_path <- file.path(spark_home, "bin", "spark-class")
# Start cluster manager master node
system2(spark_path, "org.apache.spark.deploy.master.Master", wait = FALSE)
# Start worker node, find master URL at http://localhost:8080/
system2(spark_path, c("org.apache.spark.deploy.worker.Worker",
                      "spark://192.168.1.32:7077"), wait = FALSE)
but I am not familiar with Spark at all, and I get confused about what to do next, as their main example uses a .csv file while my source is .nc files.
How can I run this previous piece of code with Spark?
Thank you very much for the precious help.
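One way to work around the RAM issue without Spark is to read the variable in slices using the start and count arguments of ncvar_get, processing one time step at a time. A minimal sketch, assuming the variable's dimensions are ordered longitude x latitude x time and the time dimension is named "time" (print(ncin) shows the actual order and names):
# open the file and find the number of time steps
ncin <- nc_open(ncfname)
nt <- ncin$dim$time$len
# read one 2-D slice per time step instead of the whole 3-D array
for (t in 1:nt) {
  slice <- ncvar_get(ncin, dname,
                     start = c(1, 1, t),   # begin at lon 1, lat 1, time t
                     count = c(-1, -1, 1)) # -1 reads an entire dimension
  # process or aggregate the slice here, then discard it
}
nc_close(ncin)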

Related

R Subset area NetCDF

I'm having trouble trying to read a 2+ GB NetCDF file in R.
If I try to read the whole file, R returns a message that I don't have enough memory,
'Error: cannot allocate vector of size 31.3 Gb'
as reported in the linked post.
Following that post's advice, I decided to read only the part of the file covering the geographic coordinates of a given area. Here is my code:
# load packages
library(ncdf4)
# set path and filename
ncpath <- "C:/Users/Me/Documents/Science/GIS/Global Land Cover/"
ncname <- "C3S-LC-L4-LCCS-Map-300m-P1Y-2018-v2.1.1"
ncfname <- paste(ncpath, ncname, ".nc", sep="")
dname <- "lccs_class"
# open a netCDF file
ncin <- nc_open(ncfname)
print(ncin)
# get longitude and latitude
lon <- ncvar_get(ncin,"lon")
nlon <- dim(lon)
head(lon)
lat <- ncvar_get(ncin,"lat")
nlat <- dim(lat)
head(lat)
print(c(nlon,nlat))
# create a bounding box to work with a subset
LonIdx <- c(841, 842, 844, 845, 846, 847, 848)
LatIdx <- c(93, 94, 95, 96)
However, when I try to execute the code for the Subset:
Subset <- ncvar_get(ncin, dname,
                    start = c(LatIdx[1], LonIdx[1]),
                    count = c(length(LatIdx), length(LonIdx)))
I get the error:
> Error in ncvar_get_inner(ncid2use, varid2use, nc$var[[li]]$missval, addOffset, :
> Error: variable has 3 dims, but start has 2 entries. They must match!
Can anyone help me? Much appreciated.
It seems like there is still a third dimension to specify before you can proceed.
Let's suppose it is a time variable:
Subset <- ncvar_get(ncin, dname,
                    start = c(LatIdx[1], LonIdx[1], DesiredTimeIdx),
                    count = c(length(LatIdx), length(LonIdx), 1))
This should read all the selected lats and lons at that specific time.
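To be sure about the order in which start and count must be given, it helps to inspect the variable's dimension names first. A minimal sketch using the ncdf4 object structure (the printed order "lon" "lat" "time" below is only an example):
# list the dimensions of the variable in storage order
dim_names <- sapply(ncin$var[[dname]]$dim, function(d) d$name)
print(dim_names)  # e.g. "lon" "lat" "time" -- build start/count in this order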

Split Raster Iteration or loop in R

I am new to the R programming language.
I am attempting to loop through a number of tiled rasters that have been output by splitRaster. During the loop I want to carry out some processing on each raster, but the following code throws an error.
library(ForestTools)
library(raster)
library(sp)
library(rgdal)
library(SpaDES)
rm(list = ls())
tmpdir <- file.path(tempdir(), "splitRaster")
lin <- function(x){x * 0.1 + 0.6}
inCHM <- raster("input raster path and name.tif")
split <- splitRaster(inCHM, 5, 5, c(0.05, 0.05), tmpdir)
files <- list.files(path=tmpdir, pattern="*.grd", full.names=FALSE, recursive=FALSE)
file.names <- dir(tmpdir, pattern ="*.grd")
for (file.names in files) {
  name <- file.names
  ttops <- vwf(name, winFun = lin, minHeight = 5)
  writeOGR(ttops, "output folder", name, driver = "ESRI Shapefile")
}
and this is the error
[1] "Xrastername_tile1.grd"
Error in CRS(x) :
PROJ4 argument-value pairs must begin with +: Xrastername_tile1.grd
More on the problem (24/7/2020):
I have removed the loop for troubleshooting, instead just choosing one of the splitRaster outputs that would be used in the loop, i.e. files[[3]].
When I run the following code, the error is the same:
library(ForestTools)
library(raster)
library(sp)
library(rgdal)
library(SpaDES)
rm(list = ls())
# set temp directory
tmpdir <- "C:\\R-Test\\Temp_Output"
# get raster
r <- raster("C:\\Lidar\\grid_treeheight_max_1m_nofill.tif")
# define projection
projection(r) <- "+proj=utm +zone=50 +south +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs"
# split raster brick
y <- splitRaster(r, 8, 8, c(0.05, 0.05), tmpdir)
# Get the complete file locations with full.names = T
files <- list.files(path=tmpdir, pattern="*.grd", full.names=FALSE, recursive=FALSE)
tmpfile <- paste(tmpdir, "\\", files[[3]], sep="")
lin <- function(x){x * 0.06 + 0.6}
ttops <- vwf(tmpfile, winFun = lin, minHeight = 5)
This is the error
Error in CRS(x) :
PROJ4 argument-value pairs must begin with +: D:\R-Test\Temp_Output\Xgrid_treeheight_max_1m_nofill_tile11.grd
When I run the following code using one of the splitRaster outputs (files[[3]]) from the above code, it runs error-free and I am able to plot ttops.
rm(list = ls())
# set temp directory
tmpdir <- "D:\\R-Test\\Temp_Output"
# get raster
r <- raster("D:\\R-Test\\Temp_Output\\Xgrid_treeheight_max_1m_nofill_tile11.grd")
lin <- function(x){x * 0.06 + 0.6}
ttops <- vwf(r, winFun = lin, minHeight = 5)
Why is the PROJ4 error occurring? It seems to be the error that is causing the loop to fail.
I think the problem is that you are feeding the vwf function a file name instead of a raster object. I would also recommend using lapply instead of for for the loop. Here is code that should work:
library(raster)
library(ForestTools)
library(rgdal)
# Get the complete file locations with full.names = T
files <- list.files(path=tmpdir, pattern="*.grd", full.names=T, recursive=FALSE)
# Loop over each item of the list, i.e., each raster
lapply(files, function(x){
  # Load the image as a raster
  image <- raster(x)
  # Calculate vwf (I added a dummy function for winFun)
  ttops <- vwf(image, winFun = function(x){x * 0.06 + 0.5}, minHeight = 5)
  # Write the shapefile, naming the layer after each raster file
  # (names() on a file path returns NULL, so build the name from the path)
  writeOGR(ttops, "output_dir", tools::file_path_sans_ext(basename(x)),
           driver = "ESRI Shapefile")
})

CHIRPS data for variable in a loop using R

This program should download the CHIRPS data according to the details entered, then assign each year of data to a variable 'nc' and export it as '.csv'. However, the code does not work: I get an "R Session Failed" error in RStudio.
### Importing libraries
library(parallel)
library(sp)
library(raster)
library(heavyRain)
### Working directory
setwd("D:/data-science_hydrology/g8/")
### Download CHIRPS monthly
data_CHIRPS <- getCHIRPS(region = "global",
                         format = "netcdf",
                         tres = "daily",
                         sres = 0.25,
                         begin = as.Date("1981-01-01"),
                         end = as.Date("2020-12-31"),
                         dsn = "data/chirps_daily",
                         overwrite = T,
                         cores = 1)
### THIS CRASHES RSTUDIO
for (i in 1981:1982) {
  nc <- paste("nc", i, sep = "")
  assign(nc, brick(paste("data/chirps_daily/chirps-v2.0.", i, ".days_p25.nc", sep = "")))
}
First, you should never use assign; rather, store the objects you create in a list. You can normally do that like this:
library(raster)
y <- 1981:1982
ff <- paste0("data/chirps_daily/chirps-v2.0.", y, ".days_p25.nc")
nc <- lapply(ff, brick)
names(nc) <- y
That will probably give the same error, most likely because of a corrupted file. To find out which, use a loop to see where it fails.
nc <- list()
for (i in 1:length(ff)) {
  print(ff[i]); flush.console()
  nc[[i]] <- brick(ff[i])
}
And then re-download the bad file and try again.
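To then export each year to '.csv', as the question intends, each brick in the named list from above can be converted to a data frame with cell coordinates and written out. This is only a sketch: a global daily brick is huge as a data frame, so the crop to a small extent below is a hypothetical example you would replace with your own region.
for (i in seq_along(nc)) {
  # crop to a small area first; a full global daily brick would be enormous as csv
  r <- crop(nc[[i]], extent(-76, -74, -12, -10))
  df <- as.data.frame(r, xy = TRUE)
  write.csv(df, paste0("chirps_", names(nc)[i], ".csv"), row.names = FALSE)
}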

How to extract NetCDF data frame by region using a polygon shapefile

I'm trying to extract the variable "swh_ku" from multiple NetCDF files, together with their corresponding latitude and longitude values, into csv files, using a polygon shapefile or its extent. I'm working with Jason-1 global altimetry swath data but I only need the data for the domain represented by the shapefile. I just need help with some lines of code that would complete the working code below so I can extract only the data for the region I'm interested in.
I've tried several software applications such as QGIS, ESA SNAP and the Broadview Radar Altimetry Toolbox (BRAT) with no success, unfortunately, because I couldn't find a way to automate the extraction process for the hundreds of NetCDF files. So I resorted to code, which I'm fairly new to, but managed to get it working after reading other posts. I've tried opening the files as a raster or brick so I could use the extract or mask functions, because they seem more straightforward, but I couldn't manage to get them to work.
Link to data: https://drive.google.com/drive/folders/1d_XVYFe__-ynxbJNUwlyl74SPJi8GybR?usp=sharing
library(ncdf4)
library(rgdal)
library(raster)
my_read_function <- function(ncname) {
  setwd("D:/Jason-1/cycle_030")
  bs_shp = readOGR("D:/Black_Sea.shp")
  e <- extent(bs_shp)
  ncfname = ncname
  dname = "swh_ku"
  ncin = nc_open(ncfname)
  print(ncin)
  vars <- names(ncin[['var']])
  vars
  lon <- ncvar_get(ncin, "lon")
  nlon <- dim(lon)
  head(lon)
  lat <- ncvar_get(ncin, "lat", verbose = F)
  nlat <- dim(lat)
  head(lat)
  print(c(nlon, nlat))
  sm_array <- ncvar_get(ncin, dname)
  dlname <- ncatt_get(ncin, dname, "long_name")
  dunits <- ncatt_get(ncin, dname, "units")
  fillvalue <- ncatt_get(ncin, dname, "_FillValue")
  dim(sm_array)
  sm.slice <- sm_array[]
  sm.vec <- as.vector(sm.slice)
  length(sm.vec)
  lonlat <- expand.grid(lon, lat)
  sm.df01 <- data.frame(cbind(lonlat, sm.vec))
  names(sm.df01) <- c("lon", "lat", dname)
  head(na.omit(sm.df01), 20)
  csvfile <- paste0(ncname, ".csv")
  write.table(na.omit(sm.df01), csvfile, row.names = FALSE, sep = ",")
}
my_files <- list.files("D:/Jason-1/cycle_030/")
lapply(my_files, my_read_function)
Looks like your data is not gridded.
library(ncdf4)
library(raster)
bs <- shapefile("Black_Sea.shp")
# simplify so that the data will look better later
bs <- as(bs, "SpatialPolygons")
f <- list.files("cycle_022", pattern="nc$", full=TRUE)
Loop would start here
ncfname = f[1]
dname = "swh_ku"
ncin = nc_open(ncfname)
lon <- ncvar_get(ncin, "lon")
lat <- ncvar_get(ncin, "lat", verbose = F)
sm_array <- ncvar_get(ncin, dname)
xyz <- na.omit(cbind(lon, lat, sm_array))
p <- SpatialPoints(xyz[,1:2], proj4string=crs(bs))
p <- SpatialPointsDataFrame(p, data.frame(xyz))
x <- intersect(p, bs)
x has the points that intersect with the Black Sea
plot(bs)
points(x)
head(x@data)
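To complete the original goal of one csv per NetCDF file, the intersected points can then be written out directly; a minimal sketch (the output file name is just an example):
# write the lon/lat/swh_ku values that fall inside the polygon to csv
write.csv(x@data, paste0(basename(ncfname), ".csv"), row.names = FALSE)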

How to geocode a table of invalid/incorrect locations in R?

I have collected data on different users' locations from Twitter. I am trying to plot those data on a map in R. The problem is that users have given invalid/incorrect addresses, which causes the geocode function to fail. How can I avoid this failure? Is there any way to check for this error case and not proceed? For example, the user location data in a file geocode9.csv looks something like this:
available locations,
Buffalo,
New York,
thsjf,
Washington, USA
Michigan,
nkjnt,
basketball,
ejhrbvw
library(ggmap)
fileToLoad <- file.choose(new = TRUE)
origAddress <- read.csv(fileToLoad, stringsAsFactors = FALSE)
geocoded <- data.frame(stringsAsFactors = FALSE)
for (i in 1:nrow(origAddress)) {
  result <- geocode(origAddress$available_locations[i], output = "latlona", source = "google")
  origAddress$lon[i] <- as.numeric(result[1])
  origAddress$lat[i] <- as.numeric(result[2])
  origAddress$geoAddress[i] <- as.character(result[3])
}
write.csv(origAddress, "geocoded.csv", row.names=FALSE)
When the code runs through "thsjf" in the locations list, it throws an error. How can I get past this error? I want something like:
if (false) { # do not run the geocode function }
I'm not sure how to geocode those addresses if they are actually wrong. How would the machine even figure out that an address is wrong? I think you need to get the addresses corrected, and THEN geocode everything. Here is some sample code.
#load ggmap
library(ggmap)
startTime <- Sys.time()
# Select the file from the file chooser
fileToLoad <- file.choose(new = TRUE)
# Read in the CSV data and store it in a variable
origAddress <- read.csv(fileToLoad, stringsAsFactors = FALSE)
# Initialize the data frame
geocoded <- data.frame(stringsAsFactors = FALSE)
# Loop through the addresses to get the latitude and longitude of each address and add it to the
# origAddress data frame in new columns lat and lon
for (i in 1:nrow(origAddress)) {
  # print("Working...")
  result <- geocode(origAddress$addresses[i], output = "latlona", source = "google")
  origAddress$lon[i] <- as.numeric(result[1])
  origAddress$lat[i] <- as.numeric(result[2])
  origAddress$geoAddress[i] <- as.character(result[3])
}
# Write a CSV file containing origAddress to the working directory
write.csv(origAddress, "geocoded.csv", row.names=FALSE)
endTime <- Sys.time()
processingTime <- endTime - startTime
processingTime
Check this for more info.
http://www.storybench.org/geocode-csv-addresses-r/
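That said, for the "check for this error case and not proceed" part of the question, the geocode call can be wrapped so that a failed lookup yields NA instead of stopping the loop. A minimal sketch, assuming the same origAddress data frame as in the question (geocode tends to signal failed lookups as warnings rather than errors, so both are caught here):
# wrap geocode so a failure fills NAs instead of stopping the loop
safe_geocode <- function(loc) {
  tryCatch(
    geocode(loc, output = "latlona", source = "google"),
    error   = function(e) data.frame(lon = NA, lat = NA, address = NA),
    warning = function(w) data.frame(lon = NA, lat = NA, address = NA)
  )
}
for (i in 1:nrow(origAddress)) {
  result <- safe_geocode(origAddress$available_locations[i])
  origAddress$lon[i] <- as.numeric(result[1])
  origAddress$lat[i] <- as.numeric(result[2])
  origAddress$geoAddress[i] <- as.character(result[3])
}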
