I use machine learning algorithms for species distribution modeling. This involves fitting a model in R and applying the model to a raster stack of environmental predictors using the predict() function. This is relatively straightforward for functions such as randomForest, glm, and maxent. However, I would like to use the best model selected using the h2o.automl function in the package h2o to predict occurrence probabilities across a stack of environmental predictors. Now, it is possible to convert raster stacks into data frames and then use these for h2o prediction. However, this tends to eat up a lot of RAM for large raster stacks. Below is an example using some publicly available low-resolution data.
library(sp)
library(raster)
library(maptools)
library(rgdal)
library(dismo)
library(h2o)
bioclim.data <- getData(name = "worldclim",
var = "bio",
res = 2.5)
obs.data <- read.csv("https://raw.githubusercontent.com/jcoliver/learn-r/gh-pages/data/Carnegiea-gigantea-GBIF.csv")
obs.data <- obs.data[!is.na(obs.data$latitude), ]
obs.data <- obs.data[, c("longitude", "latitude")]
obs.sp<-SpatialPoints(obs.data, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs"))
# Determine geographic extent of our data
max.lat <- ceiling(max(obs.data$latitude))
min.lat <- floor(min(obs.data$latitude))
max.lon <- ceiling(max(obs.data$longitude))
min.lon <- floor(min(obs.data$longitude))
geographic.extent <- extent(x = c(min.lon, max.lon, min.lat, max.lat))
bioclim.data <- crop(x = bioclim.data, y = geographic.extent)
background<-randomPoints(mask = bioclim.data,
n = nrow(obs.data),
ext = geographic.extent,
extf = 1.25)
colnames(background)<-c("longitude", "latitude")
background.sp<-SpatialPoints(background, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs"))
presence<-data.frame(Presence = rep(1, times = nrow(obs.data)), extract(bioclim.data, obs.sp))
absence<-data.frame(Presence = rep(0, times = nrow(obs.data)), extract(bioclim.data, background.sp))
df<-rbind(presence, absence)
df$Presence<-as.factor(df$Presence)
h2o.init(max_mem_size = "4g")
set.seed(1234)
df_h2o <- as.h2o(df)
splits <- h2o.splitFrame(df_h2o, c(0.7, 0.15), seed = 1234)
train <- h2o.assign(splits[[1]], "train")
valid <- h2o.assign(splits[[2]], "valid")
test <- h2o.assign(splits[[3]], "test")
y <- "Presence"
x <- setdiff(names(df_h2o), y)
aml <- h2o.automl(x = x,
y= y,
training_frame = train,
leaderboard_frame = valid,
max_runtime_secs = 60) # default is 3600 secs (1 hour); more runtime generally gives a better leader model
pred<-as.data.frame(predict(aml, test)) # this works. It reports predicted probabilities of presence vs. absence.
pred_rstr<-predict(aml, bioclim.data) # this doesn't work: newdata must be an H2OFrame, not a raster stack
biodf<-as.data.frame(bioclim.data) # this forces the raster stack into a dataframe. This works, but only for low resolution rasters.
bio_h2o<-as.h2o(biodf)
pred2<-as.data.frame(predict(aml, bio_h2o))
pred_rstr<-bioclim.data[[1]] # pull one of the rasters from the stack to serve as a template
values(pred_rstr)<-as.vector(pred2$p1) # set raster values to predicted probabilities
plot(pred_rstr)
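Below is a rough, untested sketch (my own workaround idea, not part of the original workflow) of predicting block by block with raster's writeStart()/writeValues(), so that only one block of the stack is converted to an H2OFrame at a time. It reuses the aml and bioclim.data objects from above.
# untested sketch: block-wise prediction to limit RAM use
pred_rstr <- raster(bioclim.data)                      # empty template with the same geometry
pred_rstr <- writeStart(pred_rstr, filename = rasterTmpFile(), overwrite = TRUE)
bs <- blockSize(bioclim.data)
for (i in seq_len(bs$n)) {
  vals <- as.data.frame(getValues(bioclim.data, row = bs$row[i], nrows = bs$nrows[i]))
  p <- as.data.frame(predict(aml, as.h2o(vals)))       # leader-model predictions for this block
  pred_rstr <- writeValues(pred_rstr, p$p1, bs$row[i]) # p1 = predicted probability of presence
}
pred_rstr <- writeStop(pred_rstr)
plot(pred_rstr)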
I want to assess whether the observations in my data are randomly distributed in space over the sampling area (Sweden). I wanted to reproduce the example given in this answer: Spatial Autocorrelation Analysis (Global Moran's I) in R
Here is a small subset of my data and the spatial polygon I used. Note that the coordinates are in SWEREF99 TM (EPSG:3006).
## spatial polygon of Sweden
library(rworldmap)
library(sp)
worldmap <- getMap(resolution = "high")
sweden <- worldmap[which(worldmap$SOVEREIGNT == "Sweden"),]
plot(sweden)
sweden
## conversion to EPSG: 3006 (SWEREF99 TM) (https://spatialreference.org/ref/epsg/3006/)
crs.laea <- CRS("+proj=utm +zone=33 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs")
sweden_proj <- spTransform(sweden, crs.laea)
## Data subset
x <- c(669894, 669894, 669894, 671088, 671117, 671117, 671117, 670513, 670513, 670513, 669921, 669310, 669310, 669310, 669303, 629720, 630318, 630925, 630925, 630925)
y <- c(7116684, 7116684, 7116684, 7116706, 7114900, 7114900, 7114900, 7114896, 7114896, 7114896, 7114888, 7115473, 7115473, 7115473, 7116075, 7131172, 7131180, 7131190, 7131190, 7131190)
library(spatstat)
coords.ppp_1 <- ppp( x , y , xrange = c(280227, 911417) , yrange = c(6142436, 7605020) )
coords.ppp <- unique(coords.ppp_1)
### plot data and Sweden map for check
plot(coords.ppp_1)
plot(sweden_proj, add=T)
So far it seems ok. Then I convert the spatial polygon to an owin object, simulate random data for comparison, and do the analysis.
library(maptools)
sw <- as.owin.SpatialPolygons(sweden_proj)
# Generate completely spatially random point patterns to compare against the observed
n <- coords.ppp_1$n
ex <- expression(runifpoint( n , sw))
# Compute a simulation envelope using Gest, which estimates the nearest neighbour distance distribution function G(r)
set.seed(1)
res <- envelope( coords.ppp , Gest , nsim = 99, simulate = ex ,verbose = FALSE, savefuns = TRUE )
plot(res)
With the envelope() call I get the following error message:
"In envelopeEngine(X = X, fun = fun, simul = simrecipe, nsim = nsim, :
Window containing simulated patterns is not a subset of data window"
I suspect that there is a problem with the conversion between sp and owin, but I couldn't figure out what the issue really is.
Any advice?
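One possibility worth checking (an untested sketch, a guess only): the observed pattern is built on the large rectangular window while the simulations use the Sweden polygon, so the two windows disagree. Building the data pattern directly on the polygon might make them consistent:
# untested sketch: use the Sweden polygon as the window of the observed pattern
coords.ppp <- unique(ppp(x, y, window = sw))
res <- envelope(coords.ppp, Gest, nsim = 99, simulate = ex, verbose = FALSE, savefuns = TRUE)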
I am trying to generate points with the rLGCP function. I assume that the presence of these points in the window is governed by two covariates, ras1 and ras2. Hence I need to compute log-lambda.
rm(list= ls(all=T))
#Libraries
library(spatstat)
library(raster)
library(maptools)
library(fields)
Creating the domain D and two rasters
D <- c(300, 300) # square domain D of side 300
Win <- owin(xrange =c(0, D[1]), yrange =c(0,D[2]))
spatstat.options(npixel=c(D[1],D[2]))
ext <- extent(Win$xrange, Win$yrange) # Extent of the rasters
# First raster ras1
par(mfrow=c(1,1))
ras1 <- raster()
extent(ras1) <- ext
res(ras1) <- 10
names(ras1) <- 'Radiation sim'
crs(ras1) <- "+proj=lcc +lat_1=48 +lat_2=33 +lon_0=-100 +datum=WGS84"
values(ras1) <- matrix(c(seq(from =0, to =50, length.out=200), seq(from=50, to=100, length.out = 100), seq(from=100, to=150, length.out = 200), seq(from=150, to=200, length.out = 200), seq(from=200, to=290, length.out = 200)), nrow = 30, ncol = 30)
ras1
plot(ras1, asp=1)
# Second Raster ras2
ras2 <- raster()
extent(ras2) <- ext
res(ras2) <- 10
names(ras2) <- 'Precipitation sim'
crs(ras2) <- "+proj=lcc +lat_1=48 +lat_2=33 +lon_0=-100 +datum=WGS84"
values(ras2) <- matrix(c(seq(from =-0, to =200, length.out=500), seq(from=400, to=893, length.out = 20), seq(from=200, to=300, length.out = 300),seq(from=300, to = 400, length.out=80)))
ras2
plot(ras2, asp=1)
Rasters.group <- stack(ras1, ras2)
plot(Rasters.group)
graphics.off()
From rasters to im objects
im.ras1 <- as.im.RasterLayer(ras1); summary(im.ras1)
im.ras2 <- as.im.RasterLayer(ras2); summary(im.ras2)
covar.list <- list(Radiation.sim=im.ras1, Precipitation.sim=im.ras2)
# plot .im object
par(mfrow=c(1,2))
image.plot(list(x=im.ras1$xcol, y=im.ras1$yrow, z=t(im.ras1$v)), main= "Radiation sim", asp=1)
image.plot(list(x=im.ras2$xcol, y=im.ras2$yrow, z=t(im.ras2$v)), main= "Precipitation sim", asp=1)
Now I can compute log-Lambda
#normalization
norm.im.ras1 <- (im.ras1- summary(im.ras1)$mean)/sd(im.ras1) ; summary(norm.im.ras1)
norm.im.ras2 <- (im.ras2- summary(im.ras2)$mean)/sd(im.ras2) ; summary(norm.im.ras2)
#Compute log-lambda
log.lambda <- norm.im.ras1 + 2*norm.im.ras2
summary(log.lambda)
The result displays very weak values:
Pixel values
range = [-4.657923, 10.94624]
integral = -9.678445e-12
mean = -1.075383e-16
When I try to simulate from rLGCP:
gen.lgcp <- rLGCP("matern", mu=log.lambda, var=0.5, scale=0.05, nu=1)
Error: could not allocate a vector of size 181.9 MB
I tried to get around that with:
log.lambda0 <- as.im(solutionset(log.lambda>0))
gen.lgcp <- rLGCP("matern", mu=log.lambda0, var=0.5, scale=0.05, nu=1)
summary(gen.lgcp)
This lets me move forward, but further on I did not get relevant results.
#Thinning
image.plot(list(x=log.lambda$xcol, y=log.lambda$yrow, z=t(log.lambda$v)), main= "log.lambda", asp=1)
samp.lgcp <- rthin(gen.lgcp, P=seq(from=0.02, to=0.2, length.out = gen.lgcp$n)); points(samp.lgcp$x, samp.lgcp$y, type = 'p', cex=0.2, lwd=1, col='white')
#point pattern
pts.locations <- as.data.frame(cbind(longitude=samp.lgcp$x, latitude=samp.lgcp$y))
ppp.lgcp <- ppp(pts.locations$longitude, pts.locations$latitude, window = owin(xrange=c(min(pts.locations [,1]),max(pts.locations [,1])), yrange = c(min(pts.locations[,2]),max(pts.locations[,2]))))
plot(ppp.lgcp)
#Extract value of each sampled point covariate
cov.value <- extract(Rasters.group, pts.locations)
cov.value <- as.data.frame(cov.value )
presence.data <- data.frame(pts.locations, cov.value, presence=rep(1, nrow(cov.value)))
### Choosing absence point pattern
abs.region <- crop(Virtual.species.domaine, extent(25.28486 , 162.2897 ,181.7417 , 280.7651 ))
im.abs.region <- as.im.RasterLayer(abs.region)
abs.points <- rasterToPoints(abs.region)
ppp.abs.points <- ppp(abs.points[,1], abs.points[,2], window = owin(xrange = c(min(abs.points[,1]), max(abs.points[,1])), yrange =c(min(abs.points[,2]), max(abs.points[,2]))))
plot(ppp.abs.points)
cov.value.abs <- extract(Rasters.group, abs.points[,1:2])
absence.data <- data.frame(abs.points[,1:2], cov.value.abs, presence=rep(0, nrow(abs.points)))
colnames(absence.data)[1:2] <- c("longitude", "latitude")
head(absence.data)
# Get database for LGCP
LGCP.Data.Set <- rbind(presence.data, absence.data)
#' Model
#' we will use non-stationary formula
covar.formula <- as.formula(paste("~", paste(names(LGCP.Data.Set[,3:4]), collapse = "+")))
#Quadrature scheme
Q.lgcp <- quadscheme(ppp.lgcp, ppp.abs.points, method = 'grid')
plot(Q.lgcp)
Warning message:
In countingweights(id, areas) :
some tiles with positive area do not contain any quadrature points: relative error = 94.2%
# Inhomogenous poisson process Model
fit.ipp <- ppm(Q.lgcp, trend = covar.formula, covariates = LGCP.Data.Set[,3:4])
summary(fit.ipp)
Warning message:
glm.fit: algorithm did not converge
What is going wrong?
My goal is to evaluate the model and then predict with:
prediction.ipp <- predict.ppm(fit.ipp, log.lambda, type = 'intensity')
This is a very long and unfocused question, but I will try to help.
After constructing the image log.lambda, you say "result shows very weak values". What do you mean? The image values were assigned as a sequence of values ranging from 0 to 200, and then standardised to have mean zero and standard deviation 1. How is this "weak"?
You then call rLGCP using this image as the mean log intensity. The values of log.lambda range from about -4 to +10. This means that the desired intensity will range from exp(-4) to exp(+10), that is, about 0.01 to 20 000 points per square unit. The image is 30 by 30 pixels covering a 300 by 300 unit window. Thus, a very large number of random points must be generated, and this fails because of memory limits. (The expected number of points is integral(exp(log.lambda)).)
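That integral can be checked directly in spatstat (a quick sketch, not part of the original answer):
# expected number of points implied by log.lambda
integral(exp(log.lambda))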
You then change log.lambda to another image which takes only the values 0 and 1.
The next body of code appears to take a raster image (of "absence" pixels) and tries to construct a quadrature scheme using the "absence" pixels as dummy points. This is not the purpose for which quadscheme is designed (for quadscheme the dummy points should be sparse).
You don't need to construct a quadrature scheme to use ppm. You could just do something like
D <- solist(A=im.ras1, B=im.ras2)
ppm(ppp.lgcp ~ A + B, data=D)
If you really want to construct a quadrature scheme, I suggest you use the function pixelquad instead. Just do pixelquad(ppp.lgcp, im.abs.region) or similar. Then use ppm.
Since the data were generated by a Cox process, it would be more appropriate to use kppm rather than ppm.
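For instance, a minimal sketch reusing the covariate list D defined above (untested):
# sketch only: LGCP fit with the same trend covariates
fit.lgcp <- kppm(ppp.lgcp ~ A + B, clusters = "LGCP", data = D)
summary(fit.lgcp)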
See the spatstat book for further information.
I have been working on an exercise using RStudio. I want to do a kriging interpolation, but I get an error that I have not been able to solve.
I'd appreciate it if some of you could tell me how to fix it.
Here is my code:
#importing libraries
library(raster)
library(sf)
library(sp)
library(rgdal)
library(gstat)
library(tmap)
library(tidyverse)
#reading csv data
id <- "1u18s9g15USnzNhj-t2EEFYHsW6GZEIgd" # google file ID
silver_data = read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", id))
#creating spatial dataframe
silver_2 <- st_as_sf(silver_data, coords = c("long", "lat"))
st_crs(silver_2) <- 4326
#coordinates transformation
silver_utm2 <- st_transform(silver_2, crs=9155)
# empirical (experimental) variogram
ve <- variogram(silver ~ 1, silver_utm2, cutoff = 5000,width = 200)
plot(ve, plot.numbers = T, asp=1)
# variogram model fitted by eye
vt <- vgm(psill = 350, model = "Sph", range = 2000,nugget = 0)
plot(ve, pl = T, model = vt)
#automatic adjustment
va <- fit.variogram(ve, vt)
plot(ve, pl = T, model = va)
#empty raster in order to create grid for kriging interpolation
z <- raster(as(silver_utm2, "Spatial"), ncols = 200, nrows = 80)
# using raster to create grid
grilla2 <- rasterToPoints(z, spatial = TRUE)
gridded(grilla2) <- TRUE
grilla2 <- as(grilla2, "SpatialPixels")
# using kriging function
ok <- krige(silver~1, locations=silver_utm2, newdata=grilla2, model=va)
The error that appears:
> ok <- krige(silver~1, locations=silver_utm2, newdata=grilla2, model=va)
Error in UseMethod("st_crs<-") :
no applicable method for 'st_crs<-' applied to an object of class "c('SpatialPixels', 'SpatialPoints', 'Spatial', 'SpatialVector')"
The tutorial I'm following is https://rstudio-pubs-static.s3.amazonaws.com/416462_1463d00750c54fce89955a925eaa4957.html, but the data are not the same.
st_crs() is a function from the package sf and is designed to operate on sf objects only. As your code stands, grilla2 is an sp object. Convert it to sf before using krige().
grilla2 <- sf::st_as_sf(grilla2)
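If krige() then still complains about mismatched coordinate reference systems, copying the CRS from the data onto the grid may help (an untested sketch):
st_crs(grilla2) <- st_crs(silver_utm2)  # give the grid the same CRS as the data
ok <- krige(silver ~ 1, locations = silver_utm2, newdata = grilla2, model = va)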
I have a Digital Elevation Model for the Alps (containing x, y, z). I would like to extract polygons representing two elevation levels (<=1000 m, >1000 m). The polygons then need to be applied to a more detailed dataset of the Alps with different x, y points (x, y, snowdepth), and that second dataset must be filtered to keep only points above 1000 m. So it would be nice if I could store the polygon and apply it to different spatial data frames. My question is: how can I capture the polygon representing the elevation level above 1000 m and use it to filter the spatial data frame in R?
#packages
packages <- c("RCurl", "RColorBrewer", "ggmap", "rgeos", "fields", "dismo", "rgdal", "deldir", "dplyr", "tidyr", "ggplot2", "contoureR",
              "maptools", "raster", "gstat", "magick", "lubridate", "viridis", "scales", "inlmisc")
has_available <- packages %in% rownames(installed.packages())
if(any(!has_available)) install.packages(packages[!has_available])
lapply(packages, library, character.only = TRUE)
geo_proj = "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
alps <- shapefile("Alpine_Convention_Perimeter_2018.shp")
proj4string(alps)
bbalps <- bbox(alps)
alps <- spTransform(alps,geo_proj)
r <- raster(alps,ncols=990,nrows=990)
srtm <- getData('SRTM', lon=15, lat=47)
srtm2 <- getData('SRTM', lon=12, lat=47)
srtm3 <- getData('SRTM', lon=9, lat=47)
srtm4 <- getData('SRTM', lon=6, lat=45)
#Mosaic/merge srtm tiles
srtmmosaic <- mosaic(srtm, srtm2, srtm3, srtm4, fun=mean)
# the SRTM tiles are already in lon/lat WGS84; reprojection onto r is handled by projectRaster below
rm(srtm, srtm2, srtm3, srtm4)
rst0 <- projectRaster(srtmmosaic, r, crs=geo_proj)
#set the z-factor, which increases contrast
mosaic <- rst0 * 10
#then create the hillshade
slope <- terrain(mosaic, opt="slope", unit='radians')
aspect <- terrain(mosaic, opt="aspect", unit='radians')
hillshade <- hillShade(slope, aspect, angle=45, direction=315)
hillshade <- crop(hillshade, extent(alps))
hillshade <- mask(hillshade, alps)
slope <- crop(slope, extent(alps))
slope <- mask(slope, alps)
rst0 <- crop(rst0, extent(alps))
rst0 <- mask(rst0,alps)
#elevation
elev.df <- rasterToPoints(rst0)
elev.df <- data.frame(elev.df)
colnames(elev.df) <- c("lon", "lat", "elevation")
#ggplot makes me turn the raster into points
hills.df <- rasterToPoints(hillshade)
hills.df <- data.frame(hills.df)
colnames(hills.df) <- c("lon", "lat", "hills")
#merging the slope shade with the hillshade
slope.df <- rasterToPoints(slope)
slope.df <- data.frame(slope.df)
colnames(slope.df) <- c("lon", "lat", "slope")
slope.df$slope <- 1- slope.df$slope #invert the scale so that more slope is darker
#elevation normalised
elev.df$val_norm <- (elev.df[,3]-min(elev.df[,3]))/diff(range(elev.df[,3]))
mnt.df<-hills.df %>%
left_join(slope.df) %>%
full_join(elev.df)
df = getContourLines(mnt.df, binwidth=1000)
ggplot(df,aes(x,y,group=Group,colour=z)) + geom_path()
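A minimal, untested sketch of one possible approach: reclassify the DEM into the two elevation classes, convert the >1000 m class to polygons, and use those polygons to filter any other point dataset. Here snow.df (with columns lon, lat, snowdepth) is a hypothetical stand-in for the second dataset.
# untested sketch: derive the >1000 m polygon from the DEM and filter another point set with it
levels1000 <- reclassify(rst0, c(-Inf, 1000, 0, 1000, Inf, 1))
poly1000 <- rasterToPolygons(levels1000, fun = function(x) x == 1, dissolve = TRUE)
snow.sp <- SpatialPointsDataFrame(snow.df[, c("lon", "lat")], data = snow.df,   # snow.df is hypothetical
                                  proj4string = CRS(geo_proj))
above1000 <- snow.sp[!is.na(over(snow.sp, poly1000)[, 1]), ]   # keep only points above 1000 m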
I have 608 observations of satellite-tagged turtles. I want to model these with environmental data, which includes sea surface temperature, current speed, wind speed, etc. Of course, both the tagging and environmental data vary spatially and temporally. I have generated pseudo-absence data using the code below, which I adapted from here. However, it has now occurred to me that the points I have generated are only spatial samples. Is there some way I can edit this code to sample temporally as well, so that the resulting csv has a date/time for each point that I can match to my environmental data? Alternatively, is there a different package I could try that would allow me to do this?
dir.create(path = "data")
library("sp")
library("raster")
library("maptools")
library("rgdal")
library("dismo")
bioclim.data <- getData(name = "worldclim",
var = "bio",
res = 2.5,
path = "data/")
# Read in observations
obs.data <- read.csv(file = "data/Presence.csv")
# Determine geographic extent of data
max.lat <- ceiling(max(obs.data$Latitude))
min.lat <- floor(min(obs.data$Latitude))
max.lon <- ceiling(max(obs.data$Longitude))
min.lon <- floor(min(obs.data$Longitude))
geographic.extent <- extent(x = c(min.lon, max.lon, min.lat, max.lat))
# Use the bioclim data files for sampling resolution
bil.files <- list.files(path = "data/wc2-5",
pattern = "*.bil$",
full.names = TRUE)
# only need one file, so use the first one in the list of .bil files
mask <- raster(bil.files[1])
# Randomly sample points (same number as our observed points)
background <- randomPoints(mask = mask, # Provides resolution of sampling points
n = nrow(obs.data), # Number of random points
ext = geographic.extent, # Spatially restricts sampling
extf = 1.25) # Expands sampling a little bit
write.csv(background, "pseudo-absence.csv")
I solved this issue by simply generating random times with the code below and merging the resulting .csv with the one above.
#ADD TIMES
time.start <- as.POSIXct('2014-12-01T01:00:00z', format = "%Y-%m-%dT%H:%M:%S")
time.end <- as.POSIXct('2015-04-30T01:00:00z', format = "%Y-%m-%dT%H:%M:%S")
seconds <- difftime(time.end, time.start, units = "secs")
# Option with runif()
v <- round(runif(6000, 0, seconds))
# Option with sample()
v <- sample(1:seconds, 6000, replace = T)
time.uniform <- time.start + v
write.csv(time.uniform, "time.csv")
tag<-read.csv("pseudo-absence.csv")
time<- read.csv("time.csv")
myfulldata = merge(tag,time)
write.csv(myfulldata, "pseudo-absence_with_time.csv")