How can I create an R loop with the code provided below?

Please, I need help creating a loop that would run the computation shown in the code below over an HDF file list containing 483 files in R. I have added a link to two .hdf files and the shapefiles for a trial run. The code works just fine for a single .hdf file, but I'm still struggling with the looping. Thank you
Download the files from here:
https://beardatashare.bham.ac.uk/getlink/fi2gNzWbuv5H8Gp7Qg2aemdM/
# packages used below (gdalUtils for get_subdatasets/gdalwarp, rgdal for readOGR)
library(gdalUtils)
library(raster)
library(rgdal)
# import .hdf file into R using get_subdatasets to access the subsets in the file
sub <- get_subdatasets("MOD13Q1.A2020353.h18v08.006.2021003223721.hdf")
# convert red and NIR subsets and save them as rasters
gdalwarp(sub[4], 'red_c.tif')
gdalwarp(sub[5], 'NIR_c.tif')
# import red and NIR rasters back into R
# scale the rasters while at it
r_r <- raster('red_c.tif') * 0.0001
r_N <- raster('NIR_c.tif') * 0.0001
# calculate sigma using (0.5*(NIR+red))
sigma <- 0.5 * (r_N + r_r)
# calculate knr using exp((-(NIR-red)^2)/(2*sigma^2))
knr <- exp(-(r_N - r_r)^2 / (2 * sigma^2))
# calculate kndvi using (1 - knr) / (1 + knr)
kndvi <- (1 - knr) / (1 + knr)
# import shapefile into R
shp <- readOGR(".", "National_Parks")
options(stringsAsFactors = FALSE)
# change crs of shapefile to crs of one of the rasters
shp2 <- spTransform(shp, crs(kndvi))
# use extent to crop/clip raster
## set extent
e <- extent(910000, 980000, 530000, 650000)
## clip using crop function
crop_kndvi <- crop(kndvi, e)
# mask raster using the shapefile
kndvi_mask <- mask(crop_kndvi, shp2)
And then I need to save kndvi_mask as a raster, for each of the 483 files.

Here is how you can do that with terra. terra is the replacement for raster; it is much faster and more versatile. For example, with terra you can skip the gdalwarp step.
You can write one big for-loop, but I prefer to use functions and then call these in a loop or lapply.
Also, instead of your raster-algebra approach, it could be more efficient to wrap the kndvi computation into its own function and use it with lapp. I think that is a better approach as the code is clearer and it allows you to re-use the kndvi function.
library(terra)
parks <- vect("National_Parks.shp")
parks <- project(parks, "+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +R=6371007.181 +units=m")
e <- ext(910000,980000, 530000, 650000)
The kndvi function to be used by lapp:
kndvi <- function(red, NIR) {
    # apply the MODIS reflectance scale factor first
    red <- red * 0.0001
    NIR <- NIR * 0.0001
    sigma <- 0.5 * (NIR + red)
    knr <- exp(-(NIR - red)^2 / (2 * sigma^2))
    (1 - knr) / (1 + knr)
}
Main function. Note that I use crop before the other functions; that saves a lot of unnecessary processing.
fun <- function(f) {
    outf <- gsub("\\.hdf$", "_processed.tif", f)
    # if (file.exists(outf)) return(rast(outf))
    r <- rast(f)[[4:5]]
    # or r <- sds(f)[4:5]
    r <- crop(r, e)
    kn <- lapp(r, kndvi)
    name <- substr(basename(f), 9, 16)
    mask(kn, parks, filename=outf, overwrite=TRUE, names=name)
}
Get the filenames and use the function with a loop or with lapply as shown by Elia.
ff <- list.files(pattern="hdf$", full=TRUE)
x <- list()
for (i in 1:length(ff)) {
    print(ff[i]); flush.console()
    x[[i]] <- fun(ff[i])
}
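Or, equivalently, with lapply (producing the same list x):
x <- lapply(ff, fun)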
z <- rast(x)
z
#class : SpatRaster
#dimensions : 518, 302, 2 (nrow, ncol, nlyr)
#resolution : 231.6564, 231.6564 (x, y)
#extent : 909946.2, 979906.4, 530029.7, 650027.7 (xmin, xmax, ymin, ymax)
#coord. ref. : +proj=sinu +lon_0=0 +x_0=0 +y_0=0 +R=6371007.181 +units=m +no_defs
#sources : MOD13Q1.A2020337.h18v08.006.2020358165204_processed.tif
# MOD13Q1.A2020353.h18v08.006.2021003223721_processed.tif
#names : A2020337, A2020353
#min values : 0.0007564131, 0.0028829363
#max values : 0.7608207, 0.7303495
This takes about 1 second per file on my computer.
Or as a for-loop that you asked for:
ff <- list.files(pattern="hdf$", full=TRUE)
for (f in ff) {
    print(f); flush.console()
    outf <- gsub("\\.hdf$", "_processed.tif", f)
    r <- rast(f)[[4:5]]
    r <- crop(r, e)
    kn <- lapp(r, kndvi)
    name <- substr(basename(f), 9, 16)
    mask(kn, parks, filename=outf, overwrite=TRUE, names=name)
}
outf <- list.files(pattern="_processed.tif$", full=TRUE)
x <- rast(outf)

You can wrap your code in a function and then lapply over the hdf paths. This way, if your loop is too slow, it will be easy to parallelize.
You could try this:
library(gdalUtils)
library(raster)
library(rgdal)
#set the directory where you have .hdf files. In my case I downloaded your data in "D:/download"
setwd("D:/download")
#function to save the masked index in your current working directory
#the final files name will depend on the name of the input hdf files
myfun <- function(path){
    name <- basename(tools::file_path_sans_ext(path))
    sub <- get_subdatasets(path)
    gdalwarp(sub[4], paste0(name, '_red_c.tif'))
    gdalwarp(sub[5], paste0(name, '_NIR_c.tif'))
    r_r <- raster(paste0(name, '_red_c.tif')) * 0.0001
    r_N <- raster(paste0(name, '_NIR_c.tif')) * 0.0001
    sigma <- 0.5 * (r_N + r_r)
    knr <- exp(-(r_N - r_r)^2 / (2 * sigma^2))
    kndvi <- (1 - knr) / (1 + knr)
    crop_kndvi <- crop(kndvi, e)
    kndvi_mask <- mask(crop_kndvi, shp2, filename=paste0(name, "_kndvi_mask.tif"))
}
#list the hdf file in your current working directory. Thanks to setwd("D:/download") there is no need to specify the path argument of list.files().
# however, for peace of mind:
hdf <- list.files(path=getwd(),pattern = "hdf",full.names = T)
# since your shapefile is always the same, you can keep this part out of the function
shp <- readOGR(".", "National_Parks")
options(stringsAsFactors = FALSE)
shp2 <- spTransform(shp, CRS("+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs"))
e <- extent(910000, 980000, 530000, 650000)
#now run your function across the hdf files path
lapply(hdf, myfun)
In your working directory you will now find all the saved tif files:
list.files(pattern = "tif")
[1] "MOD13Q1.A2020337.h18v08.006.2020358165204_kndvi_mask.tif"
[2] "MOD13Q1.A2020337.h18v08.006.2020358165204_red_c.tif"
[3] "MOD13Q1.A2020337.h18v08.006.2020358165204NIR_c.tif"
[4] "MOD13Q1.A2020353.h18v08.006.2021003223721_kndvi_mask.tif"
[5] "MOD13Q1.A2020353.h18v08.006.2021003223721_red_c.tif"
[6] "MOD13Q1.A2020353.h18v08.006.2021003223721NIR_c.tif"
With lapply, the function ran in 45 seconds on my PC.
You could easily parallelize lapply by replacing it with sfLapply from the snowfall package, for example. For just 2 files it's not worth it, but if you have hundreds of files you can speed up the process a lot:
library(snowfall)
# open a cluster with as many nodes as hdf files
sfInit(parallel=TRUE, cpus=length(hdf))
# Load the required packages inside the cluster
sfLibrary(raster)
sfLibrary(rgdal)
sfLibrary(gdalUtils)
sfExportAll()
system.time(sfLapply(hdf, myfun))
sfStop()
With sfLapply this function took 20 seconds to run; a good improvement.
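If you prefer base R, the parallel package gives a similar pattern. This is a sketch, assuming myfun and hdf as defined above; since myfun reads e and shp2 from the global environment, they must be exported to the workers:
library(parallel)
# open a cluster; capping at the number of available cores is usually saner than one node per file
cl <- makeCluster(min(length(hdf), detectCores() - 1))
# load the required packages inside the cluster
clusterEvalQ(cl, { library(raster); library(rgdal); library(gdalUtils) })
# export the objects myfun reads from the global environment
clusterExport(cl, c("e", "shp2"))
res <- parLapply(cl, hdf, myfun)
stopCluster(cl)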

hdf_files <- list.files("foldername", pattern = "\\.hdf$", full.names = TRUE)
for(f in hdf_files) { ... }
For the save code, you can use the string f to create a name for the output file, so the results don't save on top of each other.
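A minimal sketch of that idea, assuming the body of the loop computes kndvi_mask for file f as in the question:
for (f in hdf_files) {
    # ... compute kndvi_mask for file f as shown in the question ...
    out_name <- gsub("\\.hdf$", "_kndvi_mask.tif", basename(f))
    writeRaster(kndvi_mask, filename = out_name, overwrite = TRUE)
}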

Related

R Error: attempt to apply non-function (Only first time I run the script!)

I defined the following function to create masks from a vector file (e.g. ESRI Shapefile) and a raster image (e.g. GeoTiff).
if (!require("raster")) install.packages("raster") # https://cran.r-project.org/web/packages/raster/index.html
if (!require("rgdal")) install.packages("rgdal") # https://cran.r-project.org/web/packages/rgdal/index.html
if (!require("sf")) install.packages("sf") # https://cran.r-project.org/web/packages/sf/index.html
suppressPackageStartupMessages({
library(raster)
library(rgdal)
library(sf)
})
shape2Mask <- function(x, referenceRaster, labelField, noDataValue=0){
    # shape2Mask creates a raster mask (GeoTiff) from a vector file (ESRI Shapefile) using a RasterLayer
    # or RasterStack as reference. If the extent of the vector file exceeds the extent of the reference
    # raster, the vector is clipped.
    # The mask contains the following values:
    #   0: unlabeled pixels
    #   1,2,3,...,n: one value per class, where n is the total number of classes.
    # Arguments:
    #   x: SpatialPolygonsDataFrame containing the labeled polygons
    #   referenceRaster: RasterLayer or RasterStack used as reference
    #   labelField: field (integer) containing the labels
    #   noDataValue: value used to identify non-classified pixels (default 0)
    # Returns:
    #   (RasterLayer) raster mask.
    #   If x and the referenceRaster do not overlap, shape2Mask returns NULL.
    #t0 <- system.time()
    vectorFile <- x
    if (st_crs(vectorFile) != st_crs(referenceRaster)){
        errorMessage <- 'Warning! The CRS are not matched ...'
        message(errorMessage)
    } else {
        # Adapt vectorFile to the referenceRaster extent
        vectorFile <- raster::crop(vectorFile, extent(referenceRaster[[1]]))
        if(is.null(vectorFile)){
            return(NULL)
        }
        runMessage <- 'Processing! Creating raster mask...'
        message(runMessage)
        # Convert to sf object
        vectorFile <- sf::st_as_sf(vectorFile)
        # Create mask
        r <- raster::rasterize(vectorFile, referenceRaster[[1]], field=labelField)
        r <- raster::extend(r, referenceRaster[[1]])
        r[is.na(r)] <- noDataValue
        #t1 <- system.time()
        #message(t1-t0)
        return(r)
    }
}
Now I am trying to apply the function to a list of GeoTiff files to create their respective masks as follows:
# Read ground truth data
LSF_21_path <- 'F:/data/labeled_polygons.shp'
vectorFile <- rgdal::readOGR(LSF_21_path)
vectorFile@data$label <- as.factor(vectorFile@data$class)
# Read reference images
IMG_21_path <- 'F:/data/LSF_21'
tifFiles <- list.files(path=IMG_21_path, pattern='.tif', full.names=TRUE, recursive=FALSE)
tifNames <- list.files(path=IMG_21_path, pattern='.tif', full.names=FALSE, recursive=FALSE)
rasList <- list()
for(i in 1:length(tifFiles)){
    rasList <- append(rasList, raster::stack(tifFiles[[i]]))
}
# Create raster masks
outPath <- paste0(IMG_21_path, '/masks/')
dir.create(outPath)
for(i in 1:length(rasList)){
    print(tifFiles[[i]])
    maskFile <- shape2Mask(x = vectorFile, referenceRaster = rasList[[i]], labelField = 'label', noDataValue = 0)
    if(is.null(maskFile)){ next }
    raster::writeRaster(maskFile, paste0(outPath, tifNames[[i]]))
}
First time I run the code I get the following error:
R Error: attempt to apply non-function
But if I run the script again without closing the R session, it works. What am I missing?

R - gdalUtils - gdal_grid example data giving zero values...?

I have been trying to use gdal_grid in R, and while running the example data set I receive a raster which has only zero values. I have tried experimenting with my own data and have searched the forums with no luck. Can others get the example to work?
I have tried to explicitly set the path to my GDAL library, and have updated my version of GDAL. I am running RStudio with R version 3.3.1.
library(raster)
library(rgeos)
library(gdalUtils)
# We'll pre-check to make sure there is a valid GDAL install
# and that raster and rgdal are also installed.
# Note this isn't strictly neccessary, as executing the function will
# force a search for a valid GDAL install.
gdal_setInstallation()
valid_install <- !is.null(getOption("gdalUtils_gdalPath"))
if(require(raster) && valid_install)
{
    # Create a properly formatted CSV:
    temporary_dir <- tempdir()
    tempfname_base <- file.path(temporary_dir, "dem")
    tempfname_csv <- paste(tempfname_base, ".csv", sep="")
    pts <- data.frame(
        Easting=c(86943.4, 87124.3, 86962.4, 87077.6),
        Northing=c(891957, 892075, 892321, 891995),
        Elevation=c(139.13, 135.01, 182.04, 135.01)
    )
    write.csv(pts, file=tempfname_csv, row.names=FALSE)
    # Now make a matching VRT file
    tempfname_vrt <- paste(tempfname_base, ".vrt", sep="")
    vrt_header <- c(
        '<OGRVRTDataSource>',
        '\t<OGRVRTLayer name="dem">',
        '\t<SrcDataSource>dem.csv</SrcDataSource>',
        '\t<GeometryType>wkbPoint</GeometryType>',
        '\t<GeometryField encoding="PointFromColumns" x="Easting" y="Northing" z="Elevation"/>',
        '\t</OGRVRTLayer>',
        '\t</OGRVRTDataSource>'
    )
    vrt_filecon <- file(tempfname_vrt, "w")
    writeLines(vrt_header, con=vrt_filecon)
    close(vrt_filecon)
    tempfname_tif <- paste(tempfname_base, ".tiff", sep="")
    # Now run gdal_grid:
    setMinMax(gdal_grid(src_datasource=tempfname_vrt,
        dst_filename=tempfname_tif, a="invdist:power=2.0:smoothing=1.0",
        txe=c(85000, 89000), tye=c(894000, 890000), outsize=c(400, 400),
        of="GTiff", ot="Float64", l="dem", output_Raster=TRUE))
}
r<-raster(tempfname_tif)
r
#class : RasterLayer
#dimensions : 400, 400, 160000 (nrow, ncol, ncell)
#resolution : 10, 10 (x, y)
#extent : 85000, 89000, 890000, 894000 (xmin, xmax, ymin, ymax)
#coord. ref. : NA
#data source : C:\Users\m.modeler\AppData\Local\Temp\RtmpW6HvOc\dem.tiff
#names : dem
#min values : 0
#max values : 0
plot(r)
Raster results plot with zero values:
Thanks much,
I got the code to run by changing the path from the temp directory to a folder on my hard drive. Example below.
# change to a path on your computer
setwd("C:\\Users\\m.modeler\\Documents\\R\\gdal_Examples")
#######################################################
#create XYZ csv
pts <- data.frame(
    Easting=c(86943.4, 87124.3, 86962.4, 87077.6),
    Northing=c(891957, 892075, 892321, 891995),
    Elevation=c(139.13, 135.01, 182.04, 135.01))
write.csv(pts,file="dem.csv",row.names=FALSE)
#######################################################
#create VRT
fn_vrt<-"dem.vrt"
# Now make a matching VRT file
vrt_header <- c(
    '<OGRVRTDataSource>',
    '\t<OGRVRTLayer name="dem">',
    '\t<SrcDataSource>dem.csv</SrcDataSource>',
    '\t<GeometryType>wkbPoint</GeometryType>',
    '\t<GeometryField encoding="PointFromColumns" x="Easting" y="Northing" z="Elevation"/>',
    '\t</OGRVRTLayer>',
    '\t</OGRVRTDataSource>')
vrt_filecon <- file(fn_vrt,"w")
writeLines(vrt_header,con=vrt_filecon)
close(vrt_filecon)
#######################################################
#create interpolated DEM
fn_tif <- "dem.tif"
# Now run gdal_grid:
r.dem <- setMinMax(gdal_grid(src_datasource=fn_vrt,
    dst_filename=fn_tif, a="invdist:power=2.0:smoothing=1.0",
    txe=c(85000, 89000), tye=c(894000, 890000), outsize=c(400, 400),
    of="GTiff", ot="Float64", l="dem", output_Raster=TRUE, verbose=TRUE))
plot(r.dem)

Finding the nearest distance between two SpatialPointsDataframes using gDistance rgeos?

I have two separate but related questions.
First, I would like to determine the distance to the nearest construction site (construction_layer.csv) for every data point within the subset_original_data.csv file. I am trying to use the gDistance() function to calculate the nearest neighbor, but I am open to other ideas as well.
I want to append my subset_original_data.csv dataframe with this new vector of nearest neighbor distances from the construction_layer.csv. That is, for every row of my subset_original_data.csv dataframe, I want the minimum distance to the nearest construction site.
The second goal is to determine the nearest distance from each subset_original_data.csv row to a freeway shapefile (fwy.shp). I would also like to append this new vector back onto the subset_original.csv dataframe.
I have successfully converted the construction_layer.csv and subset_original_data.csv into SpatialPointsDataFrame. I have also converted the fwy.shp file into a SpatialLinesDataFrame by reading in the shape file with the readOGR() function. I am not sure where to go next. Your input is greatly appreciated!
~ $ spacedSparking
Here's my data:
construction_layer.csv, fwy.shp, subset_original_data.csv
Here's my code:
#requiring necessary packages:
library(rgeos)
library(sp)
library(rgdal)
#reading in the files:
mydata <- read.csv("subset_original_data.csv", header = T)
con <- read.csv("construction_layer.csv", header = T)
fwy <- readOGR(dsn = "fwy.shp")
#for those who prefer not to download any files:
data.lat <- c(45.53244, 45.53244, 45.53244, 45.53244, 45.53245, 45.53246)
data.lon <- c(-122.7034, -122.7034, -122.7034, -122.7033, -122.7033, -122.7032)
data.black.carbon <- c(187, 980, 466, 826, 637, 758)
mydata <- data.frame(data.lat, data.lon, data.black.carbon)
con.lat <- c(45.53287, 45.53293, 45.53299, 45.53259, 45.53263, 45.53263)
con.lon <- c(-122.6972, -122.6963, -122.6952, -122.6929, -122.6918, -122.6918)
con <- data.frame(con.lat, con.lon)
#I am not sure how to include the `fwy.shp` in a similar way,
#so don't worry about trying to solve that problem if you would prefer not to download the file.
#convert each file to SpatialPoints or SpatialLines Dataframes:
mydata.coords <- data.frame(lon = mydata[,2], lat = mydata[,1], data = mydata)
mydata.sp <- sp::SpatialPointsDataFrame(mydata.coords, data = data.frame(BlackCarbon = mydata[,3])) #appending a vector containing air pollution data
con.coords <- data.frame(lon = con[,2], lat = con[,1])
con.sp <- sp::SpatialPointsDataFrame(con.coords, data = con)
str(fwy) #already a SpatialLinesDataFrame
#Calculate the minimum distance (in meters) between each observation between mydata.sp and con.sp and between mydata.sp and fwy objects.
#Create a new dataframe appending these two nearest distance vectors back to the original mydata file.
#Desired output:
head(mydata.appended)
LATITUDE LONGITUDE BC6. NEAREST_CON (m) NEAREST_FWY (m)
1 45.53244 -122.7034 187 ??? ???
2 45.53244 -122.7034 980 ??? ???
3 45.53244 -122.7034 466 ??? ???
4 45.53244 -122.7033 826 ??? ???
5 45.53245 -122.7033 637 ??? ???
6 45.53246 -122.7032 758 ??? ???
EDIT:
SOLUTION:
When in doubt, ask a friend who is an R wizard! He even made a map.
library(rgeos)
library(rgdal)
library(leaflet)
library(magrittr)
#Define Projections
wgs84<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0")
utm10n<-CRS("+proj=utm +zone=10 +ellps=GRS80 +datum=NAD83 +units=m +no_defs +towgs84=0,0,0")
#creating example black carbon data by hand:
lat <- c(45.5324, 45.5325, 45.53159, 45.5321, 45.53103, 45.53123)
lon <- c(-122.6972, -122.6963, -122.6951, -122.6919, -122.6878, -122.6908)
BlackCarbon <- c(187, 980, 466, 826, 637, 758)
bc.coords <- data.frame(lat, lon, BlackCarbon)
bc<-SpatialPointsDataFrame(data.frame(x=lon,y =lat),data=data.frame(BlackCarbon),proj4string = wgs84)
# Project into something - Decimal degrees are no fun to work with when measuring distance!
bcProj<-spTransform(bc,utm10n)
#creating example construction data layer:
con.lat <- c(45.53287, 45.53293, 45.53299, 45.53259, 45.53263, 45.53263)
con.lon <- c(-122.6972, -122.6963, -122.6952, -122.6929, -122.6918, -122.6910)
con.coords <- data.frame(con.lat, con.lon)
con<-SpatialPointsDataFrame(data.frame(x=con.lon,y =con.lat),data=data.frame(ID=1:6),proj4string = wgs84)
conProj<-spTransform(con,utm10n)
#All at once (black carbon points on top, construction on the y-axis)
dist<-gDistance(bcProj,conProj,byid=T)
min_constructionDistance<-apply(dist, 2, min)
# make a new column in the WGS84 data, set it to the distance
# The distance vector will stay in order, so just stick it on!
bc@data$Nearest_Con <- min_constructionDistance
bc@data$Near_ID <- as.vector(apply(dist, 2, function(x) which(x == min(x))))
#Map the original WGS84 data
pop1<-paste0("<b>Distance</b>: ",round(bc$Nearest_Con,2),"<br><b>Near ID</b>: ",bc$Near_ID)
pop2<-paste0("<b>ID</b>: ",con$ID)
m <- leaflet() %>%
    addTiles() %>%
    addCircleMarkers(data=bc, radius=8, fillColor='red', fillOpacity=0.8, weight=1, color='black', popup=pop1) %>%
    addCircleMarkers(data=con, radius=8, fillColor='blue', fillOpacity=0.8, weight=1, color='black', popup=pop2)
m
You can use a haversine distance function and functional programming to achieve the desired result.
library(geosphere)
find_min_dist <- function(site, sites) {
    min(distHaversine(site, sites))
}
#X is the data id, split into a list so you can iterate through each site point
data <- split(mydata[ , 3:2], mydata$X)
sapply(data, find_min_dist, sites = con.coords)
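To append the per-row minimum distances (in metres) onto mydata, here is a sketch using the mydata.coords and con.coords objects built above; the NEAREST_CON column name mirrors the desired output:
# distHaversine expects (lon, lat) order; min() takes the nearest construction site per row
mydata$NEAREST_CON <- apply(mydata.coords[, c("lon", "lat")], 1,
    function(p) min(distHaversine(p, con.coords)))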

How to add average rasters within the for loop that creates the rasters in R?

I have several directories with 700+ binary-encoded rasters each, and I average the output rasters per directory. Currently, however, I create the rasters one by one in a for loop, then load the newly created rasters back into R to take the sum and obtain the monthly rainfall total.
Since I don't need the individual rasters, only the average raster, I have a hunch that I could do this all within one loop and save only the output average raster, but I am coming up short on how to program this in R.
setwd("~/Desktop/CMORPH/Levant-Clip/200001")
dir.output <- '~/Desktop/CMORPH/Levant-Clip/200001' ### change as needed to give output location
path <- list.files("~/Desktop/CMORPH/MonthlyCMORPH/200001",pattern="*.bz2", full.names=T, recursive=T)
for (i in 1:length(path)) {
    files <- bzfile(path[i], "rb")
    data <- readBin(files, what="double", endian="little", n=4948*1649, size=4) # mode of the vector to be read
    data[data == -999] <- NA # convert missing data from -999 (CMORPH notation) to NA
    y <- matrix(data, ncol=1649, nrow=4948)
    r <- raster(y)
    e <- extent(-180, 180, -90, 83.6236) ### choose the extent based on the netcdf file info
    tr <- t(r) # transpose
    re <- setExtent(tr, e) ### set the extent of the raster
    ry <- flip(re, direction='y')
    projection(ry) <- "+proj=longlat +datum=WGS84 +ellps=WGS84"
    C_Lev <- crop(ry, Levant) ### clip to Levant
    M_C_Lev <- mask(C_Lev, Levant)
    writeRaster(M_C_Lev, paste(dir.output, basename(path[i]), sep=''), format='GTiff', overwrite=T) ### the basename allows the file to be named the same as the original
}
#
raspath <- list.files ('~/Desktop/CMORPH/Levant-Clip/200001',pattern="*.tif", full.names=T, recursive=T)
rasstk <- stack(raspath)
sum200001 <- sum(rasstk)
writeRaster(sum200001, paste(dir.output, basename(path[i]), sep=''), format='GTiff', overwrite=T) ### the basename allows the file to be named the same as the original
Currently, this code takes about 75 minutes to execute, and I have about 120 more directories to go, so I am looking for faster solutions.
Thank you for any and all comments and input. Best, Evan
Elaborating on my previous comment, you could try:
setwd("~/Desktop/CMORPH/Levant-Clip/200001")
dir.output <- '~/Desktop/CMORPH/Levant-Clip/200001' ### change as needed to give output location
path <- list.files("~/Desktop/CMORPH/MonthlyCMORPH/200001",pattern="*.bz2", full.names=T, recursive=T)
raster_list = list()
for (i in 1:length(path)) {
    files <- bzfile(path[i], "rb")
    data <- readBin(files, what="double", endian="little", n=4948*1649, size=4) # mode of the vector to be read
    data[data == -999] <- NA # convert missing data from -999 (CMORPH notation) to NA
    y <- matrix(data, ncol=1649, nrow=4948)
    r <- raster(y)
    if (i == 1) {
        e <- extent(-180, 180, -90, 83.6236) ### choose the extent based on the netcdf file info
    }
    tr <- t(r) # transpose
    re <- setExtent(tr, e) ### set the extent of the raster
    ry <- flip(re, direction='y')
    projection(ry) <- "+proj=longlat +datum=WGS84 +ellps=WGS84"
    C_Lev <- crop(ry, Levant) ### clip to Levant
    M_C_Lev <- mask(C_Lev, Levant)
    raster_list[[i]] <- M_C_Lev
}
#
rasstk <- stack(raster_list, quick = TRUE) # OR rasstk <- brick(raster_list, quick = TRUE)
avg200001<-mean(rasstk)
writeRaster(avg200001, paste(dir.output, basename(path[i]), sep = ''), format = 'GTiff', overwrite = T) ###the basename allows the file to be named the same as the original
Using the "quick" options in stack should definitely speed-up things, in particular if you have many rasters.
Another possibility is to first compute the average, and then perform the "spatial proceesing". For example:
for (i in 1:length(path)) {
    files <- bzfile(path[i], "rb")
    data <- readBin(files, what="double", endian="little", n=4948*1649, size=4) # mode of the vector to be read
    data[data == -999] <- NA # convert missing data from -999 (CMORPH notation) to NA
    if (i == 1) {
        totdata <- data
        num_nonNA <- as.numeric(!is.na(data))
    } else {
        totdata <- rowSums(cbind(totdata, data), na.rm = TRUE)
        # We have to count the number of "valid" entries so that the average is correct!
        num_nonNA <- rowSums(cbind(num_nonNA, as.numeric(!is.na(data))), na.rm = TRUE)
    }
}
avg_data = totdata/num_nonNA # Compute the average
# Now do the "spatial" processing
y<-matrix(avg_data, ncol=1649, nrow=4948)
r <- raster(y)
e <- extent(-180, 180, -90, 83.6236) ### choose the extent based on the netcdf file info
tr <- t(r) #transpose
re <- setExtent(tr,extent(e)) ### set the extent to the raster
ry <- flip(re, direction = 'y')
projection(ry) <- "+proj=longlat +datum=WGS84 +ellps=WGS84"
C_Lev <- crop(ry, Levant) ### Clip to Levant
M_C_Lev<-mask(C_Lev, Levant)
writeRaster(M_C_Lev, paste(dir.output, basename(path[i]), sep = ''), format = 'GTiff', overwrite = T) ###the basename allows the file to be named the same as the original
This could be faster or slower, depending on how much you are cropping the original data.
HTH,
Lorenzo
I'm adding another answer to clarify and simplify things a bit, also in relation to the comments in chat. The code below should do what you ask: that is, cycle over the files, read the "data", compute the sum over all files, and convert it to a raster with the specified dimensions.
Note that for testing purposes I substituted your cycle over file names with a simple 1-to-720 cycle, and file reading with the creation of arrays of the same length as yours, filled with values from 1 to 4 plus some NAs!
totdata <- array(dim = 4948*1649) # define dummy array
for (i in 1:720) {
    message("Working on file: ", i)
    data <- array(rep(c(1,2,3,4), 4948*1649/4), dim = 4948*1649) # create a "fake" 4948*1649 array each time to simulate data reading
    data[1:1000] <- -999 # set some values to the missing-data flag
    data[data == -999] <- NA # convert missing data from -999 to NA
    totdata <- rowSums(cbind(totdata, data), na.rm = TRUE) # sum the current array with the cumulative sum so far
}
# Now reshape to matrix and convert to raster, etc.
y <- matrix(totdata, ncol=1649, nrow=4948)
r <- raster(y)
e <- extent(-180, 180, -90, 83.6236) ### choose the extent based on the netcdf file info
tr <- t(r) #transpose
re <- setExtent(tr,e) ### set the extent to the raster
ry <- flip(re, direction = 'y')
projection(ry) <- "+proj=longlat +datum=WGS84 +ellps=WGS84"
This generates a "proper" raster:
> ry
class : RasterLayer
dimensions : 1649, 4948, 8159252 (nrow, ncol, ncell)
resolution : 0.07275667, 0.1052902 (x, y)
extent : -180, 180, -90, 83.6236 (xmin, xmax, ymin, ymax)
coord. ref. : +proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0
data source : in memory
names : layer
values : 0, 2880 (min, max)
containing the sum of the different arrays. You can see that the max value is 720 * 4 = 2880. (Only caveat: if you have cells which are always NA, you will get 0 instead of NA.)
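If that caveat matters, here is a sketch of a fix, assuming you also accumulate the num_nonNA counter shown in the earlier snippet:
# restore NA for cells that never received a valid value
totdata[num_nonNA == 0] <- NA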
On my laptop, this runs in about 5 minutes !
In practice:
to avoid memory problems, I am not reading all the data into memory. Each of your arrays is more or less 64 MB, so I cannot load them all and then do the sum (unless I had 50 GB of RAM to throw away, and even in that case it would be slow). Instead, I make use of the associative property of summation by computing a "cumulative" sum at each cycle. In this way you are only working with two 8-million-element arrays at a time: the one you read from file "i", and the one that contains the current sum.
to avoid unnecessary computations, I am directly summing the 1-dimensional arrays I get from reading the binary files. You don't need to reshape the arrays to a matrix inside the cycle, because you can do that once on the final "summed" array, which you can then convert to a raster.
I hope this will work for you and that I am not missing something obvious!
As far as I can understand, if this approach is still slow, you have problems elsewhere (for example in data reading: over 720 files, 3 seconds spent reading each file means roughly 35 minutes of processing).
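To check whether reading really dominates, you could time the read step alone for one file (a sketch reusing the readBin call from the question):
# time just the binary read to see if I/O is the bottleneck
files <- bzfile(path[1], "rb")
system.time(
    data <- readBin(files, what="double", endian="little", n=4948*1649, size=4)
)
close(files)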
HTH,
Lorenzo

index not working on for loop created to read rasters from a directory, summarize data into single raster, and output new raster

I have several directories full of rasters that are daily climate data. I need to combine the daily rasters into weekly rasters, some by the sum of values and some by the mean of values. So far, I've created a vector of the file names within the directories (which contain the daily raster files) and written a for loop that imports the first 7 rasters, puts them into a stack, uses calc to reduce the stack to a single new raster (mean or sum), and writes that raster to a file in the working directory. I then try to remove the first 7 names from the vector and repeat the loop on the first 7 remaining file names. The problem I am having is that the first indexed file name is not being removed from the vector. Here's the code.
#file names for rasters are in a column of data frame
fname <- (repDf$fname)
#get rid of first 5 values to start on Sunday
fname <- fname[-c(1:5)]
#look at beginning of fname vector
head(fname)
[1] "1980_6.asc" "1980_7.asc" "1980_8.asc" "1980_9.asc" "1980_10.asc" "1980_11.asc"
for (i in seq_along(fname)){
    f1 <- fname[[i]]
    f2 <- fname[[i+1]]
    f3 <- fname[[i+2]]
    f4 <- fname[[i+3]]
    f5 <- fname[[i+4]]
    f6 <- fname[[i+5]]
    f7 <- fname[[i+6]]
    r1 <- raster(f1)
    r2 <- raster(f2)
    r3 <- raster(f3)
    r4 <- raster(f4)
    r5 <- raster(f5)
    r6 <- raster(f6)
    r7 <- raster(f7)
    s <- stack(r1, r2, r3, r4, r5, r6, r7)
    r <- calc(s, fun=sum)
    r <- r * 0.0393701
    r <- round(r, 2)
    writeRaster(x=r, filename=paste0("week_", i), format="ascii", overwrite=TRUE)
    remove <- c(f1, f2, f3, f4, f5, f6, f7)
    fname <- fname[! fname %in% remove]
}
#Example RasterLayer (after loop has run twice)
print(r1)
class : RasterLayer
dimensions : 227, 199, 45173 (nrow, ncol, ncell)
resolution : 994.9749, 994.9749 (x, y)
extent : 367500, 565500, -1325500, -1099641 (xmin, xmax, ymin, ymax)
coord. ref. : NA
data source : G:\dailyPrism\10843\prcp\1980_30.asc
names : X1980_30
#Example indexed file names (after loop has run twice)
f1
[1] "1980_30.asc"
#Example output raster (after loop has run twice)
print(r)
class : RasterLayer
dimensions : 227, 199, 45173 (nrow, ncol, ncell)
resolution : 994.9749, 994.9749 (x, y)
extent : 367500, 565500, -1325500, -1099641 (xmin, xmax, ymin, ymax)
coord. ref. : NA
data source : in memory
names : layer
values : 1.02, 3.54 (min, max)
The problem can be seen here:
head(fname)
[1] "1980_13.asc" "1980_21.asc" "1980_29.asc" "1980_30.asc" "1980_31.asc" "1980_32.asc"
For some reason, the file name that would have been used as the first day of the week is retained in the fname vector and the files that go into the weekly calculations don't represent the days that they need to. Any help is greatly appreciated. I would try and provide some example files to work with, but the files are quite large.
Add some debug output to your code and you can see what happens (I have disabled the processing part since I do not have the files or the calculation logic):
fname <- c(paste0("1980_", 6:40, ".asc"))
fname[1]
seq_along(fname) # vector from 1 to length of vector!!!
for (i in seq_along(fname)){
    print(i)
    print(paste("Size of fname:", length(fname)))
    print(head(fname))
    print(fname[i])
    f1 <- fname[[i]]
    f2 <- fname[[i+1]]
    f3 <- fname[[i+2]]
    f4 <- fname[[i+3]]
    f5 <- fname[[i+4]]
    f6 <- fname[[i+5]]
    f7 <- fname[[i+6]]
    # r1 <- raster(f1)
    # r2 <- raster(f2)
    # r3 <- raster(f3)
    # r4 <- raster(f4)
    # r5 <- raster(f5)
    # r6 <- raster(f6)
    # r7 <- raster(f7)
    # s <- stack(r1,r2,r3,r4,r5,r6,r7)
    # r <- calc(s, fun=sum)
    # r <- r * 0.0393701
    # r <- round(r, 2)
    # writeRaster(x=r, filename=paste0("week_", i), format="ascii", overwrite=TRUE)
    remove <- c(f1, f2, f3, f4, f5, f6, f7)
    fname <- fname[! fname %in% remove]
}
This will result in:
[1] 1
[1] "Size of fname: 35"
[1] "1980_6.asc" "1980_7.asc" "1980_8.asc" "1980_9.asc" "1980_10.asc" "1980_11.asc"
[1] 2
[1] "Size of fname: 28"
[1] "1980_13.asc" "1980_14.asc" "1980_15.asc" "1980_16.asc" "1980_17.asc" "1980_18.asc"
[1] 3
[1] "Size of fname: 21"
[1] "1980_13.asc" "1980_21.asc" "1980_22.asc" "1980_23.asc" "1980_24.asc" "1980_25.asc"
[1] 4
[1] "Size of fname: 14"
[1] "1980_13.asc" "1980_21.asc" "1980_29.asc" "1980_30.asc" "1980_31.asc" "1980_32.asc"
[1] 5
[1] "Size of fname: 7"
[1] "1980_13.asc" "1980_21.asc" "1980_29.asc" "1980_37.asc" "1980_38.asc" "1980_39.asc"
Error in fname[[i + 3]] : subscript out of bounds
The reason is that you loop over a predefined number of fname vector items ("seq_along" = 35 in my example).
You delete the processed items, but i is NOT reset to one; it is incremented on every pass through the loop, so it skips over the start of the shrinking vector.
And phew, there is much room to improve the code (e.g. no need to delete the elements, just loop over the vector elements; how do you deal with a wrong vector size in the last loop; why do you use double square brackets to access the fnames...).
Easy modifications to solve the problem (without optimizing the code):
Modify the loop:
for (i in seq(1,length(fname), by=7)) {
Remove the two lines:
remove <- c(f1,f2,f3,f4,f5,f6,f7)
fname <- fname[! fname %in% remove]
and add a debug output instead:
print( paste("processing files", f1, "to", f7))
Here is how you can make the code more succinct by creating a RasterStack directly from a vector of filenames:
fname <- repDf$fname[-c(1:5)]
for (i in seq(1, length(fname), by=7)){
    s <- stack(fname[i:(i+6)])
    r <- sum(s) * 0.0393701
    r <- round(r, 2)
    writeRaster(r, filename=paste0("week_", i), format="ascii", overwrite=TRUE)
}
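If the number of files is not a multiple of 7 (the "wrong vector size in the last loop" caveat above), here is a sketch that processes only complete weeks:
n_weeks <- length(fname) %/% 7 # complete weeks only; a trailing partial week is dropped
for (w in seq_len(n_weeks)) {
    i <- (w - 1) * 7 + 1
    s <- stack(fname[i:(i+6)])
    r <- round(sum(s) * 0.0393701, 2)
    writeRaster(r, filename=paste0("week_", w), format="ascii", overwrite=TRUE)
}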
