R Subset area NetCDF - r

I'm having trouble trying to read a plus 2Gb NetCDF file from links in R
If I try to read the whole file R returns me a message that I don't have enough memory,
'Error: cannot allocate vector of size 31.3 Gb'
as it was reported by the post link
Following the this post advice, I decided to read only the part of the file based on the geographic coordinates of the a given area. Here is my code:
#load any packages
library(ncdf4)
set path and filename
ncpath <- "C:\Users\Me\Documents\Science\GIS\Global Land Cover\"
ncname <- "C3S-LC-L4-LCCS-Map-300m-P1Y-2018-v2.1.1"
ncfname <- paste(ncpath, ncname, ".nc", sep="")
dname <-"lccs_class"
open a netCDF file
ncin <- nc_open(ncfname)
print(ncin)
get longitude and latitude
lon <- ncvar_get(ncin,"lon")
nlon <- dim(lon)
head(lon)
lat <- ncvar_get(ncin,"lat")
nlat <- dim(lat) head(lat)
print(c(nlon,nlat))
#' create a bounding box to work with a subset
LonIdx <- c(841, 842, 844,845,846,847,848)
LatIdx <- c(93,94,95,96)
However, when I try to execute the code for the Subset:
Susbset <- ncvar_get(ncin, dname,
start = c(LatIdx[1], LonIdx[1]),
count = c(length(LatIdx),length(LonIdx)))
I get the error:
> Error in ncvar_get_inner(ncid2use, varid2use, nc$var[[li]]$missval,
> addOffset, : Error: variable has 3 dims, but start has 2 entries. They
> must match!
Can anyone help me? Much appreciated.

It seems like there is still a third dimension to specify before you can procceed.
Let's suppose it is a time variable:
start = c(LatIdx[1], LonIdx[1], DesiredTimeIdx),
count = c(length(LatIdx),length(LonIdx), 1))
This should read all lats an all lons in that specific time.

Related

How to submit a job in Spark with a netCDF data source?

I have this code:
install.packages("ncdf4")
library(ncdf4)
install.packages("tidync")
library(tidync)
pp <- tidync("~/climate_data/pp_ens_mean_0.1deg_reg_v25.0e.nc")
print(pp)
## Daily averaged sea level pressure - PP
# set path and filename
ncpath <- "~/climate_data/"
ncname <- "pp_ens_mean_0.1deg_reg_v25.0e"
ncfname <- paste(ncpath, ncname, ".nc", sep="")
dname <- "pp"
# open a netCDF file
ncin <- nc_open(ncfname)
print(ncin)
# get longitude and latitude
lon <- ncvar_get(ncin,"longitude")
nlon <- dim(lon)
head(lon)
lat <- ncvar_get(ncin,"latitude")
nlat <- dim(lat)
head(lat)
print(c(nlon,nlat))
# get time
time <- ncvar_get(ncin,"time")
time
tunits <- ncatt_get(ncin,"time","units")
nt <- dim(time)
nt
# get pressure
pp_array <- ncvar_get(ncin,dname)
dlname <- ncatt_get(ncin,dname,"long_name")
dunits <- ncatt_get(ncin,dname,"units")
fillvalue <- ncatt_get(ncin,dname,"_FillValue")
dim(pp_array)
I have a RAM issue while running pp_array <- ncvar_get(ncin,dname). I would like to maybe submit a job in Spark to maybe run it on a cluster.
I have followed the installation procedure from here: https://therinspark.com/starting.html
#installing Spark
library(sparklyr)
#spark_install()
sc <- spark_connect(master = "local")
spark_web(sc)
# Retrieve the Spark installation directory
spark_home <- spark_home_dir()
# Build paths and classes
spark_path <- file.path(spark_home, "bin", "spark-class")
# Start cluster manager master node
system2(spark_path, "org.apache.spark.deploy.master.Master", wait = FALSE)
# Start worker node, find master URL at http://localhost:8080/
system2(spark_path, c("org.apache.spark.deploy.worker.Worker",
"spark://192.168.1.32:7077"), wait = FALSE)
but I am not familiar at all with Spark and I get confused on what to do next, as their main example is using a .csv file and I have .nc files as a source.
How can I run this previous piece of code with Stark?
Thank you very much for the precious help.

Split Raster Iteration or loop in R

R Programming Language (New to this)
I am attempting to loop through a number of tiled rasters that have been output by splitRaster. During the loop I want to carry out some processes on each raster.
But the following code throws an error.
library(ForestTools)
library(raster)
library(sp)
library(rgdal)
library(SpaDES)
rm(list = ls())
tmpdir <- file.path(tempdir(), "splitRaster")
lin <- function(x){x * 0.1 + 0.6}
inCHM <- raster("input raster path and name.tif")
split <- splitRaster(inCHM, 5, 5, c(0.05, 0.05), tmpdir)
files <- list.files(path=tmpdir, pattern="*.grd", full.names=FALSE, recursive=FALSE)
file.names <- dir(tmpdir, pattern ="*.grd")
for(file.names in files ){
name <- file.names
ttops <- vwf(name, winFun = lin, minHeight = 5)
writeOGR(ttops, "output folder", name, driver = "ESRI Shapefile")
}
and this is the error
[1] "Xrastername_tile1.grd"
Error in CRS(x) :
PROJ4 argument-value pairs must begin with +: Xrastername_tile1.grd
More to the problem (24/7/2020),
I have removed the loop for trouble shooting instead just choosing one of the splitRasters outputs that would be used in the loop ie files[[3]]
When I run the following code the error is the same;
library(ForestTools)
library(raster)
library(sp)
library(rgdal)
library(SpaDES)
rm(list = ls())
# set temp directory
tmpdir <- "C:\\R-Test\\Temp_Output"
# get raster
r <- raster("C:\\Lidar\\grid_treeheight_max_1m_nofill.tif")
# define projection
projection(r) <- "+proj=utm +zone=50 +south +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs"
# split raster brick
y <- splitRaster(r, 8, 8, c(0.05, 0.05), tmpdir)
# Get the complete file locations with full.names = T
files <- list.files(path=tmpdir, pattern="*.grd", full.names=FALSE, recursive=FALSE)
tmpfile <- paste(tmpdir, "\\", files[[3]], sep="")
lin <- function(x){x * 0.06 + 0.6}
ttops <- vwf(tmpfile, winFun = lin, minHeight = 5)
This is the error
Error in CRS(x) :
PROJ4 argument-value pairs must begin with +: D:\R-Test\Temp_Output\Xgrid_treeheight_max_1m_nofill_tile11.grd
When I run the following code using one of the splitRaster outputs (files[[3]]) from the above code it runs error free and I am able to plot ttops.
rm(list = ls())
# set temp directory
tmpdir <- "D:\\R-Test\\Temp_Output"
# get raster
r <- raster("D:\\R-Test\\Temp_Output\\Xgrid_treeheight_max_1m_nofill_tile11.grd")
lin <- function(x){x * 0.06 + 0.6}
ttops <- vwf(r, winFun = lin, minHeight = 5)
Why is the PROJ4 error occurring?
This seems to be the error that is causing the loop to fail?
I think the problem is that you are trying to feed the vwf function with the file name instead of a raster object. I would also recommend using lapply instead of for for the loop. Here is a code that should work
library(raster)
library(ForestTools)
library(rgdal)
# Get the complete file locations with full.names = T
files <- list.files(path=tmpdir, pattern="*.grd", full.names=T, recursive=FALSE)
# Loop over each item of the list, i.e., each raster
lapply(files, function(x){
# Load the image as raster
image <- raster(x)
# Calculate vwf (I added a dummy function for winFun)
ttops <- vwf(image, winFun = function(x){x * 0.06 + 0.5}, minHeight = 5)
# Write the file with the name of each raster
writeOGR(ttops, "output_dir", names(x), driver = "ESRI Shapefile")
})

How to extract NetCDF data frame by region using a polygon shapefile

I'am trying to extract the variable "swh_ku" from multiple NetCDF files together with their corresponding latitude and longitude values into csv files using a polygon shapefile or it's extent. I'm working with Jason-1 global altimetry swath data but I only need the data for the domain represented by the shapefile. I just need help with some lines of code that would complete the working code bellow so I can extract only the data for the region I'm interested in.
I've tried several software applications such as QGIS, ESA SNAP, Broadview Radar Altimetry Toolbox (BRAT) with no success unfortunately because I couldn't find a way automate the extraction process for the hundreds of NetCDF files. So I resorted to code with which I'm fairly new but managed to get it working after reading other posts. I've tried opening the files as raster or brick to use the #extract or #mask functions because they seem more straightforward but I couldn't manage to work them out.
Link to data: https://drive.google.com/drive/folders/1d_XVYFe__-ynxbJNUwlyl74SPJi8GybR?usp=sharing
library(ncdf4)
library(rgdal)
library(raster)
my_read_function <- function(ncname) {
setwd("D:/Jason-1/cycle_030")
bs_shp=readOGR("D:/Black_Sea.shp")
e<-extent(bs_shp)
ncfname = ncname
names(ncin[['var']])
dname = "swh_ku"
ncin = nc_open(ncfname)
print(ncin)
vars<-(names(ncin[['var']]))
vars
lon <- ncvar_get(ncin, "lon")
nlon <- dim(lon)
head(lon)
lat <- ncvar_get(ncin, "lat", verbose = F)
nlat <- dim(lat)
head(lat)
print(c(nlon, nlat))
sm_array <- ncvar_get(ncin,dname)
dlname <- ncatt_get(ncin,dname,"long_name")
dunits <- ncatt_get(ncin,dname,"units")
fillvalue <- ncatt_get(ncin,dname,"_FillValue")
dim(sm_array)
ls()
sm.slice <- sm_array[]
sm.vec <- as.vector(sm.slice)
length(sm.vec)
lonlat <- expand.grid(lon, lat)
sm.df01 <- data.frame(cbind(lonlat, sm.vec))
names(sm.df01) <- c("lon", "lat", paste(dname, sep = "_"))
head(na.omit(sm.df01), 20)
csvfile <- paste0(ncname,".csv")
write.table(na.omit(sm.df01), csvfile, row.names = FALSE, sep = ",")
}
my_files <- list.files("D:/Jason-1/cycle_030/")
lapply(my_files, my_read_function)
Looks like your data is not gridded.
library(ncdf4)
library(raster)
bs <- shapefile("Black_Sea.shp")
# simplify so that the data will look better later
bs <- as(bs, "SpatialPolygons")
f <- list.files("cycle_022", pattern="nc$", full=TRUE)
Loop would start here
ncfname = f[1]
dname = "swh_ku"
ncin = nc_open(ncfname)
lon <- ncvar_get(ncin, "lon")
lat <- ncvar_get(ncin, "lat", verbose = F)
sm_array <- ncvar_get(ncin, dname)
xyz <- na.omit(cbind(lon, lat, sm_array))
p <- SpatialPoints(xyz[,1:2], proj4string=crs(bs))
p <- SpatialPointsDataFrame(p, data.frame(xyz))
x <- intersect(p, bs)
x has the points that intersect with the Black Sea
plot(bs)
points(x)
head(x#data)

Error writing raster in for loop

-
I am beginner in R and I have made a script where I convert a .grd file into .tif file in a for loop (I have quite a lot of .grd files that should be converted). The script is as you can see below.
# Set your work directory
setwd("xxx")
library(rgdal)
library(sp)
library(raster)
# Data: .grd files in order to reproduce the code below
g1 <- raster(ncol=10, nrow=10)
vals <- 1:100
g1 <- setValues(g1, vals)
writeRaster(g1, filename="TEST_G1.grd", overwrite=TRUE)
g2 <- raster(ncol=50, nrow=50)
vals <- 1
g2 <- setValues(g2, vals)
writeRaster(g2, filename="TEST_G2.grd", overwrite=TRUE)
# Convert .grd to geotif in a for loop
rlist <- list.files(pattern=".grd$")
for (i in rlist)
{
#Read raster
x <- raster(rlist[i])
#make new file name
filename <- rlist[i]
n <- unlist(strsplit(filename, split='.', fixed=TRUE))[1]
name <- paste(n, ".tif", sep="")
#write the raster as GTiff
writeRaster(x, filename=name, format="GTiff", overwrite=TRUE)
}
I have run all sentences in the script separately and managed to get geotif file. However, when I run the lobe I get the following error:
Error in .local(.Object, ...) :
Error in .rasterObjectFromFile(x, band = band, objecttype = "RasterLayer", :
Cannot create a RasterLayer object from this file. (file does not exist)
The file does exist, and when I just run the last line separately the tif file is created... so I don't understand what is wrong with the loop.
If I e.g. write "for (i in 1:2)" instead of "for (i in rlist)" it works. but I would prefer not to count the number of files in every directory, which is the reason I was trying to use "for (i in rlist)"
Thanks a lot for you help!

Creating a list of raster bricks from a multivariate netCDF file

I've been working with the RCP (Representative Concentration Pathway) spatial data. It's a nice gridded dataset in netCDF format. How can I get a list of bricks where each element represents one variable from a multivariate netCDF file (by variable I don't mean lat,lon,time,depth...etc). This is what Iv'e tried to do. I can't post an example of the data, but I've set up the script below to be reproducible if you want to look in to it. Obviously questions welcome... I might not have expressed the language associated with the code smoothly. Cheers.
A: Package requirements
library(sp)
library(maptools)
library(raster)
library(ncdf)
library(rgdal)
library(rasterVis)
library(latticeExtra)
B: Gather data and look at the netCDF file structure
td <- tempdir()
tf <- tempfile(pattern = "fileZ")
download.file("http://tntcat.iiasa.ac.at:8787/RcpDb/download/R85_NOX.zip", tf , mode = 'wb' )
nc <- unzip( tf , exdir = td )
list.files(td)
## Take a look at the netCDF file structure, beyond this I don't use the ncdf package directly
ncFile <- open.ncdf(nc)
print(ncFile)
vars <- names(ncFile$var)[1:12] # I'll try to use these variable names later to make a list of bricks
C: Create a raster brick for one variable. Levels correspond to years
r85NOXene <- brick(nc, lvar = 3, varname = "emiss_ene")
NAvalue(r85NOXene) <- 0
dim(r85NOXene) # [1] 360 720 12
D: Names to faces
data(wrld_simpl) # in maptools
worldPolys <- SpatialPolygons(wrld_simpl#polygons)
cTheme <- rasterTheme(region = rev(heat.colors(20)))
levelplot(r85NOXene,layers = 4,zscaleLog = 10,main = "2020 NOx Emissions From Power Plants",
margin = FALSE, par.settings = cTheme) + layer(sp.polygons(worldPolys))
E: Summarize all grid cells for each year one variable "emis_ene", I want to do this for each variable of the netCDF file I'm working with.
gVals <- getValues(r85NOXene)
dim(gVals)
r85NOXeneA <- sapply(1:12,function(x){ mat <- matrix(gVals[,x],nrow=360)
matfun <- sum(mat, na.rm = TRUE) # Other conversions are needed, but not for the question
return(matfun)
})
F: Another meet and greet. Check out how E looks
library(ggplot2) # loaded here because of masking issues with latticeExtra
years <- c(2000,2005,seq(2010,2100,by=10))
usNOxDat <- data.frame(years=years,NOx=r85NOXeneA)
ggplot(data=usNOxDat,aes(x=years,y=(NOx))) + geom_line() # names to faces again
detach(package:ggplot2, unload=TRUE)
G: Attempt to create a list of bricks. A list of objects created in part C
brickLst <- lapply(1:12,function(x){ tmpBrk <- brick(nc, lvar = 3, varname = vars[x])
NAvalue(tmpBrk) <- 0
return(tmpBrk)
# I thought a list of bricks would be a good structure to do (E) for each netCDF variable.
# This doesn't break but, returns all variables in each element of the list.
# I want one variable in each element of the list.
# with brick() you can ask for one variable from a netCDF file as I did in (C)
# Why can't I loop through the variable names and return on variable for each list element.
})
H: Get rid of the junk you might have downloaded... Sorry
file.remove(dir(td, pattern = "^fileZ",full.names = TRUE))
file.remove(dir(td, pattern = "^R85",full.names = TRUE))
close(ncFile)
Your (E) step can be simplified using cellStats.
foo <- function(x){
b <- brick(nc, lvar = 3, varname = x)
NAvalue(b) <- 0
cellStats(b, 'sum')
}
sumLayers <- sapply(vars, foo)
sumLayers is the result you are looking for, if I understood correctly your question.
Moreover, you may use the zoo package because you are dealing with time series.
library(zoo)
tt <- getZ(r85NOXene)
z <- zoo(sumLayers, tt)
xyplot(z)

Resources