NAs in rasters and randomForest::predict() - r
New here, please let me know if you need more info.
My goal: I am using Rehfeldt climate data and eBird presence/absence data to produce niche models with Random Forests.
My problem: I want to predict niche models for the entirety of North America. The Rehfeldt climate rasters have data values for every cell on the continent, but these are surrounded by NAs in the "ocean cells". See the plot here, where I have colored the NAs dark green. randomForest::predict() does not run if the independent dataset contains NAs. Thus, I want to crop my climate rasters (or set a working extent?) so that the predict() function only operates over the cells which contain data.
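For what it's worth, the failure is easy to reproduce on toy data (a minimal sketch using the built-in iris data, not my actual dataset):
require(randomForest)
fit <- randomForest(Species ~ ., data = iris, ntree = 100)
newdat <- iris[1:5, -5]
newdat[1, 1] <- NA    # a single NA among the predictors...
predict(fit, newdat)  # ...and predict() stops with an error about missing values in newdata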
Troubleshooting:
I've run the Random Forest model over a smaller extent that does not include the rasters' "NA oceans", and the model runs just fine. So I know the NAs are the problem. However, I don't want to predict my niche models for just a rectangular chunk of North America.
I used flowla's approach here for cropping and masking rasters using a polygon shapefile for North America. I hoped that this would remove the NAs but it doesn't. Is there something similar I can do to remove the NAs?
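For reference, the crop-and-mask step I tried looks roughly like this (a sketch; "NorthAmerica" stands in for my actual polygon shapefile):
na.poly <- readOGR(".", "NorthAmerica")           # continent outline polygon
rehfeldt.crop <- crop(rehfeldt, extent(na.poly))  # shrink to the polygon's bounding box
rehfeldt.mask <- mask(rehfeldt.crop, na.poly)     # NA out cells falling outside the polygon
As far as I understand, mask() sets cells outside the polygon to NA rather than removing them (rasters stay rectangular), which would explain why the NAs are still there.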
I've done some reading but can't figure out a way to adjust the Random Forest code itself so that predict() ignores NAs. This post looks relevant but I'm not sure whether it helps in my case.
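The behaviour I'm after, inside the block-prediction loop further down, would be something like this untested sketch (predict only the complete rows and keep NA everywhere else):
v <- as.data.frame(getValuesBlock(xvars, row=tr$row[i], nrows=tr$nrows[i]))
ok <- complete.cases(v)        # cells with data in every layer
out <- rep(NA_real_, nrow(v))  # ocean cells stay NA in the output
if (any(ok)) out[ok] <- predict(rf.final, v[ok, , drop=FALSE], type="prob")[,2]
writeValues(s, out, tr$row[i])
Is this roughly the right idea?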
Data
My rasters, the input presence/absence text file, and code for additional functions are here. Use with the main code below for a reproducible example.
Code
require(sp)
require(rgdal)
require(raster)
library(maptools)
library(mapproj)
library(dismo)
library(maps)
library(proj4)
data(stateMapEnv)
# This source code has all of the functions necessary for running the Random Forest models, as well as the code for the function detecting multicollinearity
source("Functions.R")
# Read in Rehfeldt climate rasters
# these rasters were converted to .img and given WGS 84 projection in ArcGIS
d100 <- raster("d100.img")
dd0 <- raster("dd0.img")
dd5 <- raster("dd5.img")
fday <- raster("fday.img")
ffp <- raster("ffp.img")
gsdd5 <- raster("gsdd5.img")
gsp <- raster("gsp.img")
map <- raster("map.img")
mat <- raster("mat_tenths.img")
mmax <- raster("mmax_tenths.img")
mmin <- raster("mmin_tenths.img")
mmindd0 <- raster("mmindd0.img")
mtcm <- raster("mtcm_tenths.img")
mtwm <- raster("mtwm_tenths.img")
sday <- raster("sday.img")
smrpb <- raster("smrpb.img")
# add separate raster files into one big raster, with each file being a different layer.
rehfeldt <- addLayer(d100, dd0, dd5, fday, ffp, gsdd5, gsp, map, mat, mmax, mmin, mmindd0, mtcm, mtwm, sday, smrpb)
# plot some rasters to make sure everything worked
plot(d100)
plot(rehfeldt)
# read in presence/absence data
LAZB.INBUtemp <- read.table("LAZB.INBU.txt", header=T, sep = "\t")
colnames(LAZB.INBUtemp) <- c("Lat", "Long", "LAZB", "INBU")
# reorder the columns so longitude comes before latitude
LAZB.INBUtemp <- LAZB.INBUtemp[c(2, 1, 3, 4)]
LAZB.INBU <- LAZB.INBUtemp
latpr <- (LAZB.INBU$Lat)
lonpr <- (LAZB.INBU$Long)
sites <- SpatialPoints(cbind(lonpr, latpr))
LAZB.INBU.spatial <- SpatialPointsDataFrame(sites, LAZB.INBU, match.ID=TRUE)
# Extract raster values from each layer at each eBird location
pred <- raster::extract(rehfeldt, LAZB.INBU.spatial)
LAZB.INBU.spatial@data <- data.frame(LAZB.INBU.spatial@data, pred)
LAZB.INBU.spatial@data <- na.omit(LAZB.INBU.spatial@data)
# ITERATIVE TEST FOR MULTICOLLINEARITY
# Determines which variables show multicollinearity
cl <- MultiColinear(LAZB.INBU.spatial@data[, 7:ncol(LAZB.INBU.spatial@data)], p=0.05)
xdata <- LAZB.INBU.spatial@data[, 7:ncol(LAZB.INBU.spatial@data)]
for(l in cl) {
  cl.test <- xdata[, -which(names(xdata) == l)]
  print(paste("REMOVE VARIABLE", l, sep=": "))
  MultiColinear(cl.test, p=0.05)
}
# REMOVE MULTICOLLINEAR VARIABLES
for(l in cl) { LAZB.INBU.spatial@data <- LAZB.INBU.spatial@data[, -which(names(LAZB.INBU.spatial@data) == l)] }
################################################################################################
# FOR LAZB
# RANDOM FOREST MODEL AND RASTER PREDICTION
require(randomForest)
# NUMBER OF BOOTSTRAP REPLICATES
b=1001
# CREATE X,Y DATA
# use column 3 for LAZB and 4 for INBU
ydata <- as.factor(LAZB.INBU.spatial@data[,3])
xdata <- LAZB.INBU.spatial@data[, 7:ncol(LAZB.INBU.spatial@data)]
# PERCENT OF PRESENCE OBSERVATIONS
( dim(LAZB.INBU.spatial[LAZB.INBU.spatial$LAZB == 1, ])[1] / dim(LAZB.INBU.spatial)[1] ) * 100
# RUN RANDOM FORESTS MODEL SELECTION FUNCTION
# This model is using the model improvement ratio to select a final model.
pdf(file = "LAZB Random Forest Model Rehfeldt.pdf")
( rf.model <- rf.modelSel(x=xdata, y=ydata, imp.scale="mir", ntree=b) )
dev.off()
# RUN RANDOM FORESTS CLASS BALANCE BASED ON SELECTED VARIABLES
# This helps in the case of an imbalanced sample
mdata <- data.frame(y=ydata, xdata[,rf.model$SELVARS])
rf.BalModel <- rfClassBalance(mdata[,1], mdata[,2:ncol(mdata)], "y", ntree=b)
# CREATE NEW XDATA BASED ON SELECTED MODEL AND RUN FINAL RF MODEL
sel.vars <- rf.model$PARAMETERS[[3]]
rf.data <- data.frame(y=ydata, xdata[,sel.vars])
write.table(rf.data, "rf.data.txt", sep = ",", row.names = F)
# This is the code given to me; it takes forever to run on my dataset (I haven't let it finish)
# ( rf.final <- randomForest(y ~ ., data=rf.data, ntree=b, importance=TRUE, norm.votes=TRUE, proximity=TRUE) )
# I use the form below because it's a lot faster (the formula interface adds overhead, and proximity=TRUE builds an n-by-n matrix)
( rf.final <- randomForest(x = rf.data[2:6], y = rf.data$y, ntree=1000, importance=TRUE, norm.votes=TRUE, proximity=F) )
################################################################################################
# MODEL VALIDATION
# PREDICT TO VALIDATION DATA
# Determines the percent correctly classified
rf.pred <- predict(rf.final, rf.data[,2:ncol(rf.data)], type="response")
rf.prob <- as.data.frame(predict(rf.final, rf.data[,2:ncol(rf.data)], type="prob"))
ObsPred <- data.frame(cbind(Observed=as.numeric(as.character(ydata)),
PRED=as.numeric(as.character(rf.pred)), Prob1=rf.prob[,2],
Prob0=rf.prob[,1]) )
op <- (ObsPred$Observed == ObsPred$PRED)
( pcc <- (length(op[op == "TRUE"]) / length(op))*100 )
# PREDICT MODEL PROBABILITIES RASTER
# The first line below sets the directory holding the raster files used to predict the range
# The second line stacks the rasters for the variables retained in my final Random Forest model
rpath=paste('~/YOURPATH', "example", sep="/")
xvars <- stack(paste(rpath, paste(rownames(rf.final$importance), "img", sep="."), sep="/"))
tr <- blockSize(xvars)
s <- writeStart(xvars[[1]], filename=paste('~/YOURPATH', "prob_LAZB_Rehfeldt.img", sep="/"), overwrite=TRUE)
for (i in 1:tr$n) {
  v <- getValuesBlock(xvars, row=tr$row[i], nrows=tr$nrows[i])
  v <- as.data.frame(v)
  # this is the call that fails whenever a block contains NA ("ocean") cells
  rf.pred <- predict(rf.final, v, type="prob")[,2]
  writeValues(s, rf.pred, tr$row[i])
}
s <- writeStop(s)
prob_LAZB <- raster("prob_LAZB_Rehfeldt.img")
# Write range prediction raster to .pdf
pdf(file="LAZB_range_pred.pdf")
plot(prob_LAZB)
map("state", add = TRUE)
dev.off()
Thanks!!
Did you try setting `na.action` in your call to randomForest? The option is clearly labelled in the randomForest R manual. Your call would look like this:
rf.final <- randomForest(x = rf.data[2:6], y = rf.data$y, ntree=1000, importance=TRUE, norm.votes=TRUE, proximity=F, na.action = na.omit)
This tells randomForest to omit rows where an NA exists, thereby throwing out those observations. This is not necessarily the best approach, but it might be handy in your situation.
Option 2: rfImpute or na.roughfix. These fill in your NAs so that you can go ahead with your prediction. Watch out, as this can give you spurious predictions wherever the NAs are being imputed/"fixed".
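Applied inside the block loop from the question, Option 2 might look roughly like this (a sketch; na.roughfix ships with randomForest):
v <- as.data.frame(getValuesBlock(xvars, row=tr$row[i], nrows=tr$nrows[i]))
v <- na.roughfix(v)   # numeric NAs become column medians, so former NA cells get spurious but valid inputs
rf.pred <- predict(rf.final, v, type="prob")[,2]
(A block that is entirely NA would still fail, since the column medians are themselves NA.)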
Option 3: Start with Option 2, and after you get your prediction, bring your raster into the GIS/image-processing software of your choice and mask out the areas you don't want. In your case, masking out water bodies would be pretty simple.
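If you'd rather stay in R for that masking step, a minimal sketch with the raster package (assuming a North America polygon such as the one already used for cropping):
prob_LAZB <- raster("prob_LAZB_Rehfeldt.img")
prob_LAZB_land <- mask(prob_LAZB, na.poly)   # NA out the spurious predictions over water
plot(prob_LAZB_land)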
Related
R: Random forest with raster as response and explanatory variable
I want to create a fire occurrence probability map with the random forest method. My response variable is a raster with the mean annual burned area per grid cell. My explanatory variables are multiple rasters (temperature, elevation, land use and population density). Is it possible to use a raster as the response variable, and what would a basic code line look like? I couldn't find any information on that.
files <- list.files(path="C:/Users/fsorb/OneDrive/Desktop/test/fire_prob", pattern="grd", all.files=FALSE, full.names=TRUE, recursive=TRUE)
predictors <- stack(files)
fire <- raster("C:/Users/fsorb/OneDrive/Desktop/test/env_data/fire.tif")
fire_occ_prob <- randomForest(fire ~ ., data = predictors, ntree=500)
This is the code I have so far, but I get the error:
Error in as.data.frame.default(data) : can not transform ‘structure("RasterStack", package = "raster")’ into data.frame
I tried to save the fire raster with as.data.frame, but all grid cells only get NA values.
I would try to: convert the response (fire) raster to points, extract the values of the predictors at those points, and train a random forest model on the resulting data frame.
require(raster)
require(sf)
require(dplyr)
require(randomForest)

files <- list.files(path="C:/Users/fsorb/OneDrive/Desktop/test/fire_prob", pattern="grd", all.files=FALSE, full.names=TRUE, recursive=TRUE)
predictors <- stack(files)
fire <- raster("C:/Users/fsorb/OneDrive/Desktop/test/env_data/fire.tif")

# convert the response raster to points
response <- rasterToPoints(fire, spatial = TRUE) %>% st_as_sf()
response$ID <- c(1:nrow(response))
colnames(response)[1] <- "response"

# combine predictor values with the response
# (the original snippet extracted from an undefined object "r2"; the predictor stack is meant here)
rs_preds <- full_join(raster::extract(x = predictors, y = as(response, "Spatial"), df = TRUE),
                      st_drop_geometry(response), by = "ID")

# train random forest
fire_occ_prob <- randomForest(response ~ ., data = rs_preds[, !names(rs_preds) %in% "ID"],
                              ntree = 500, importance = TRUE)

# plot variable importance
varImpPlot(fire_occ_prob)

# make spatial predictions
sp_pred <- raster::predict(predictors, model = fire_occ_prob)
If your aim is to make spatial (temporal) predictions, make sure to use a spatial (temporal) (cross-)validation strategy. For further information take a look at e.g. Roberts et al. (2016): https://doi.org/10.1111/ecog.02881
Greetings, Jan
Unable to make a scaled heatmap for differential gene analysis
I'm new to R so be easy on me. I'm having trouble generating a heatmap for my genes. I performed differential gene expression analysis using the DESeq2 package and found the 30 most downregulated genes with FDR < 0.05 for my cell lines. I was trying to create a heatmap using the pheatmap package, but I wasn't able to generate it the way I want: a heatmap of my top 30 genes for each cell line (there are 8). Here's my code:
dds <- DESeqDataSetFromMatrix(countData = GSM_subset, colData = subset, design = ~ Condition)
d_analysis <- DESeq(dds)
res <- results(d_analysis)
res
nrow(dds)
dds <- dds[rowSums(counts(dds)) > 1,]
nrow(dds)
mcols(res, use.names = TRUE)
summary(res)
resLFC1 <- results(d_analysis, lfcThreshold=3)
table(resLFC1$padj<0.05)
resLFC1 <- resLFC1[complete.cases(resLFC1),]
resLFC1
# keep strongly downregulated genes that also pass the FDR cutoff
# (the original had subset(resLFC1, log2FoldChange=-3), which does not filter as intended)
resSig <- subset(resLFC1, log2FoldChange <= -3)
resSig <- subset(resSig, padj < 0.05)
top30 <- head(resSig[order(resSig$log2FoldChange), ], 30)
top30 <- as.data.frame(top30)
library(pheatmap)
pheatmap(top30)
Heatmaps in the genomics context usually use the scaled (that is, Z-transformed) normalized counts on the log2 scale, or a similar transformation such as vst or rlog from the DESeq2 package. Given you already use DESeq2, you can do the following, with dds being your DESeqDataSet:
vsd <- assay(vst(dds))                   # log-normalized and variance-stabilized counts
Z <- t(scale(t(vsd)))                    # z-transformation, gene-wise
Z.select <- Z[your.genes.of.interest,]   # subset to genes of interest
...and from there use the heatmap package of your choice.
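With the question's pheatmap, for instance, the final call on that matrix might simply be (options left at their defaults):
library(pheatmap)
pheatmap(Z.select)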
Is there an R loop function (data.table) to run over 100s of `gam` results without exceeding the memory limit?
Spatial interpolation using gam
Statement: I am hoping to produce many spatial interpolation outputs using generalised additive models (GAM). There is no problem predicting a single pollution map, but I need more than 100 maps. If possible I would like to automate the implementation and get the results without exceeding the memory limit.
Spatial interpolation process with GAM (mgcv package): just to let you know, here are the essential steps to get an interpolated map:
1. Get the X, Y coordinates of the pollution monitoring stations
2. Get the pollution data for each station
3. Add the pollution data to the data frame that contains the X, Y coordinates
4. Run gam(pollution ~ s(X, Y, k=20)) for each pollution column
5. Create an empty data frame with min and max X, Y coordinates as the spatial extent
6. Predict over the spatial extent using predict() and the gam result
7. Run the same job over all pollution fields
I will show a hands-on example of how I approached it.
Sample data: to give an example, I created the dataset shown below. From df you will see that I have X, Y, and 3 pollution variables.
library(data.table)
library(mgcv)
X <- c(197745.8,200443.8,200427.6,208213.4,203691.1,208303.0,202546.4,202407.9,202564.8,194095.5,194508.0,195183.8,185432.5,
       190249.0,190927.0,197490.1,193551.5,204204.4,199508.4,210201.4,212088.3,191886.5,201045.2,187321.7,205987.0)
Y <- c(451633.1,452496.8,448949.5,449753.3,449282.2,453928.5,452923.2,456347.9,461614.8,456729.3,453019.7,450039.7,449472.0,
       444348.1,447274.4,442390.0,443101.2,446446.5,445008.5,446765.2,449508.5,439225.3,460915.6,447392.0,461985.3)
poll1 <- c(34,29,29,33,33,38,35,30,41,43,35,34,41,41,40,36,35,27,53,40,37,32,28,36,33)
poll2 <- c(27,27,34,30,38,36,36,35,37,39,35,33,41,42,40,34,38,31,43,46,38,32,29,33,34)
poll3 <- c(26,30,27,30,37,41,36,36,35,35,35,33,41,36,38,35,34,24,40,43,36,33,30,32,36)
df <- data.table(X, Y, poll1, poll2, poll3)
How I worked on it
1. Hard code. If you look at the code below, you will realise I copied and pasted the same job for every variable. This will be extremely hard to implement for a lot of variables.
# Run gam
gam1 <- gam(poll1 ~ s(X,Y, k=20), data = df)
gam2 <- gam(poll2 ~ s(X,Y, k=20), data = df)
gam3 <- gam(poll3 ~ s(X,Y, k=20), data = df)
# there are over 5000 variables that need looping

# Create an empty surface for prediction
GAM_poll <- data.frame(expand.grid(X = seq(min(df$X), max(df$X), length=200),
                                   Y = seq(min(df$Y), max(df$Y), length=200)))

# Predict gam results onto the empty surface
GAM_poll$gam1 <- predict(gam1, GAM_poll, type = "response")
GAM_poll$gam2 <- predict(gam2, GAM_poll, type = "response")
GAM_poll$gam3 <- predict(gam3, GAM_poll, type = "response")
2. Using a for loop. Instead, I made a list and attempted to loop over all the variables to get the results. It certainly has no problem per se, but iterating over many variables takes up all the memory (this is what I experienced).
# Run gam using a list and a for loop
myList <- list()
for(i in 3:length(df)){
  myList[[i-2]] <- gam(df[[i]] ~ s(X,Y, k=20), data = df)
}

# Create an empty surface for prediction
GAM_poll <- data.frame(expand.grid(X = seq(min(df$X), max(df$X), length=200),
                                   Y = seq(min(df$Y), max(df$Y), length=200)))

# Predict gam results onto the empty surface
myResult <- list()
for(j in 1:length(myList)){
  myResult[[j]] <- predict(myList[[j]], GAM_poll, type = "response")
}
Asking for help: Is there a better way to get the gam results over multiple variables? Is there a way to not exceed the memory limit during the implementation?
Can you help, data.table and purrr users?
The solution I created keeps only the latest prediction in memory, saving each one to disk before overwriting it with the next. The files are named after the model's column name, in a folder called results. I also melted the data.table, mostly because I think the code is a little clearer that way.
library(data.table)
library(mgcv)
X <- c(197745.8,200443.8,200427.6,208213.4,203691.1,208303.0,202546.4,202407.9,202564.8,194095.5,194508.0,195183.8,185432.5,
       190249.0,190927.0,197490.1,193551.5,204204.4,199508.4,210201.4,212088.3,191886.5,201045.2,187321.7,205987.0)
Y <- c(451633.1,452496.8,448949.5,449753.3,449282.2,453928.5,452923.2,456347.9,461614.8,456729.3,453019.7,450039.7,449472.0,
       444348.1,447274.4,442390.0,443101.2,446446.5,445008.5,446765.2,449508.5,439225.3,460915.6,447392.0,461985.3)
poll1 <- c(34,29,29,33,33,38,35,30,41,43,35,34,41,41,40,36,35,27,53,40,37,32,28,36,33)
poll2 <- c(27,27,34,30,38,36,36,35,37,39,35,33,41,42,40,34,38,31,43,46,38,32,29,33,34)
poll3 <- c(26,30,27,30,37,41,36,36,35,35,35,33,41,36,38,35,34,24,40,43,36,33,30,32,36)
df <- data.table(X, Y, poll1, poll2, poll3)

# melt the data.table
df <- melt.data.table(df, id.vars = c('X', 'Y'))

dir.create('results')
gam1 <- list()
for(i in unique(df$variable)){
  gam1[[i]] <- gam(value ~ s(X,Y, k=20), data = df[variable == i])
  GAM_poll <- data.table(expand.grid(X = seq(min(df$X), max(df$X), length=200),
                                     Y = seq(min(df$Y), max(df$Y), length=200)))
  GAM_poll[, 'prediction' := predict(gam1[[i]], GAM_poll, type = "response")]
  # paste0 avoids the spaces that paste() would insert into the file name
  write.csv(GAM_poll$prediction, paste0('results/model_', i, '.csv'), row.names = FALSE)
}
Problems with raster prediction from a linear model in R
I'm having problems with predicting a raster using a linear model. First I create my model from the data found in my polygons.
# create model
poly <- st_read("polygon.shp")
df <- na.omit(poly)
df <- df[df$gdp > 0 & df$ntl2 > 0 & df$pop2 > 0,]
x <- log(df$ntl2)
y <- log(df$gdp*df$pop2)
c <- df$iso
d <- data.frame(x,y,c)
m <- lm(y~x+c,data=d)
Then I want to use raster::predict to estimate an output raster.
# raster data
iso <- raster("iso.tif")
viirs <- raster("viirs.tif")
x <- log(viirs)
c <- iso
## predict with models
s <- stack(x,c)
predicted <- raster::predict(x,model=m)
However, I get the following response:
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : object is not a matrix
I don't know what the problem is or how to fix it. My current thoughts are that it's something to do with the factors/country codes: my model includes country codes, as I would like to include some country fixed effects, and maybe there is a problem with including these. However, even when excluding the country codes from the model and the entire data frame, I still get the same error message. Furthermore, my model is based on regional values from the whole world, and the prediction datasets only cover the extent of Turkey. Maybe this is the problem?
And here is the data: https://drive.google.com/open?id=16cy7CJFrxQCTLhx-hXDNHJz8ej3vTEED
Perhaps it works if you do it like this:
iso <- raster("iso.tif")
viirs <- raster("viirs.tif")
s <- stack(log(viirs), iso)
names(s) <- c("x", "c")
predicted <- raster::predict(s, model=m)
It won't work if the values in df$iso and iso.tif don't match (is one a factor and the other numeric?).
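A quick way to check, and to declare the categorical layer explicitly, might be (a sketch using the objects above; raster::predict has a factors argument for this):
is.factor(d$c)               # how the country code enters the model
head(unique(values(iso)))    # the codes actually stored in the raster
# if iso stores numeric codes that correspond to the factor levels, declare them:
predicted <- raster::predict(s, model=m, factors=list(c=levels(as.factor(d$c))))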
SVM is not generating a forecast in R
I have sales data for 5 different products, along with weather information. To describe the data: we have daily sales at a particular store and daily weather information, such as the temperature and the average wind speed of the area where the store is located. I am using a Support Vector Machine for prediction. It works well for all the products except one; for that one, the prediction comes back empty:
tunedModelLOG
named numeric(0)
Below is the code:
# load the packages
library(zoo)
library(MASS)
library(e1071)
library(rpart)
library(caret)

normalize <- function(x) {
  a <- min(x, na.rm=TRUE)
  b <- max(x, na.rm=TRUE)
  (x - a)/(b - a)
}

# Define the train and test data
test_data <- train[1:23,]
train_data <- train[24:nrow(train),]

# Define the factors for the categorical data
names <- c("year","month","dom","holiday","blackfriday","after1","back1","after2","back2","after3","back3","is_weekend","weeday")
train_data[,names] <- lapply(train_data[,names], factor)
test_data[,names] <- lapply(test_data[,names], factor)

# Normalize the continuous data
normalized <- c("snowfall","depart","cool","preciptotal","sealevel","stnpressure","resultspeed","resultdir")
train_data[,normalized] <- data.frame(lapply(train_data[,normalized], normalize))
test_data[,normalized] <- data.frame(lapply(test_data[,normalized], normalize))

# Give the test data the same factor levels as the train data
levels(test_data$month) <- levels(train_data$month)
levels(test_data$dom) <- levels(train_data$dom)
levels(test_data$year) <- levels(train_data$year)
levels(test_data$after1) <- levels(train_data$after1)
levels(test_data$after2) <- levels(train_data$after2)
levels(test_data$after3) <- levels(train_data$after3)
levels(test_data$back1) <- levels(train_data$back1)
levels(test_data$back2) <- levels(train_data$back2)
levels(test_data$back3) <- levels(train_data$back3)
levels(test_data$holiday) <- levels(train_data$holiday)
levels(test_data$is_weekend) <- levels(train_data$is_weekend)
levels(test_data$blackfriday) <- levels(train_data$blackfriday)
levels(test_data$weeday) <- levels(train_data$weeday)

# Fit the SVM model and tune the parameters
svmReFitLOG <- tune(svm, logunits ~ year+month+dom+holiday+blackfriday+after1+after2+after3+back1+back2+back3+is_weekend+depart+cool+preciptotal+sealevel+stnpressure+resultspeed+resultdir,
                    data = train_data,
                    ranges = list(epsilon = c(0,0.1,0.01,0.001), cost = 2^(2:9)))
retunedModeLOG <- svmReFitLOG$best.model
tunedModelLOG <- predict(retunedModeLOG, test_data)
The working file is available at the link below:
https://drive.google.com/file/d/0BzCJ8ytbECPMVVJ1UUg2RHhQNFk/view?usp=sharing
What am I doing wrong? I would appreciate any kind of help. Thanks in advance.