Predict xgboost model onto raster stack yields error - r

I am using an xgboost model to predict onto a raster stack. I have successfully used the same approach with CART and Random Forest models:
library(raster)
library(xgboost)
# create a RasterStack or RasterBrick with a set of predictor layers
logo <- brick(system.file("external/rlogo.grd", package="raster"))
names(logo)
names(logo)
# known presence and absence points
p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85,
66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31,
22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2)
a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9,
99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21,
37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2)
# extract values for points
xy <- rbind(cbind(1, p), cbind(0, a))
v <- data.frame(cbind(pa=xy[,1], extract(logo, xy[,2:3])))
xgb <- xgboost(data = data.matrix(subset(v, select = -c(pa))), label = v$pa,
nrounds = 5)
raster::predict(model = xgb, logo)
But with xgboost I get the following error:
Error in xgb.DMatrix(newdata, missing = missing) :
xgb.DMatrix does not support construction from list

The problem is that predict.xgb.Booster does not accept a data.frame for its newdata argument (see ?predict.xgb.Booster). That is unexpected, since virtually all common predict.* methods take a data.frame, but we can work around it. I show how to do that below, using the "terra" package instead of the obsolete "raster" package (the solution is exactly the same for either package).
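You can see the root cause without any rasters; a minimal sketch, assuming the xgb model and data.frame v from the question:
# a data.frame is passed through as a list, which xgb.DMatrix rejects
predict(xgb, subset(v, select = -c(pa)))               # should reproduce the error
# the same data as a matrix works
predict(xgb, data.matrix(subset(v, select = -c(pa))))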
The example data
library(terra)
library(xgboost)
logo <- rast(system.file("ex/logo.tif", package="terra"))
p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85,
66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31,
22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2)
a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9,
99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21,
37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2)
xy <- rbind(cbind(1, p), cbind(0, a))
v <- extract(logo, xy[,2:3])
xgb <- xgboost(data = data.matrix(v), label=xy[,1], nrounds = 5)
The workaround is to write a prediction function that first coerces the data.frame with the new data to a matrix. We can then pass that function to predict<SpatRaster>:
# coerce the data.frame that predict() passes in to a matrix
# before handing it to predict.xgb.Booster
xgbpred <- function(model, data, ...) {
  predict(model, newdata = as.matrix(data), ...)
}
p <- predict(logo, model=xgb, fun=xgbpred)
plot(p)
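With the "raster" package the same workaround applies; a sketch, assuming the xgb model from the question and the xgbpred function above:
library(raster)
logo <- brick(system.file("external/rlogo.grd", package="raster"))
p <- raster::predict(logo, model = xgb, fun = xgbpred)
plot(p)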

Related

Why will glm model converge but caret train version of the same model will not?

As far as I can tell, I have specified this simple GLM the same way using a basic glm function and the caret train function. However, the caret version will not converge. Is there something missing in how I am specifying the train model?
library(caret)
library(terra)
logo <- rast(system.file("ex/logo.tif", package="terra"))
names(logo) <- c("red", "green", "blue")
p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85,
66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31,
22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2)
a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9,
99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21,
37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2)
xy <- rbind(cbind(1, p), cbind(0, a))
# extract predictor values for points
e <- terra::extract(logo, xy[,2:3])
# combine predictor values with the response
v <- data.frame(cbind(pa=xy[,1], e))
v$pa <- as.factor(v$pa)
#GLM model
model <- glm(formula = as.numeric(pa) ~ red + blue + green, data = v)
#Train model
model2 <- train(pa ~ red + green + blue,
data=v,
method = "glm")
> Warning messages:
> 1: glm.fit: algorithm did not converge
> 2: glm.fit: fitted probabilities numerically 0 or 1 occurred
Courtesy of @Ben Bolker in the comments:
The train model was using family="binomial" as the default because the response variable is a factor with levels 0 and 1, whereas the glm call, with its numeric response, defaulted to family="gaussian". The train model works with family="gaussian".
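To make the two calls equivalent, you can give train a numeric 0/1 response so that it also fits a gaussian model; a sketch, assuming v from the question (pa_num is a helper column introduced here):
v$pa_num <- as.numeric(as.character(v$pa))   # numeric 0/1 response
model2 <- train(pa_num ~ red + green + blue, data = v, method = "glm")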

Random Forest class probabilities in separate raster layers

I'm using the randomForest package to classify a raster stack of different predictors. Classification works fine, but I also want to retrieve the class probabilities. With my code I only get a RasterLayer with the probability of the first class, but I'd like a RasterStack with the probabilities for each class in its own layer.
PRED_train$response <- as.factor(PRED_train$response)
rf <- randomForest(response~., data = PRED_train, na.action = na.omit, confusion = T)
pred_RF <- raster::predict(PRED, rf)
beginCluster()
pred_RF <- clusterR(PRED, predict, args = list(rf, type = "prob"))
endCluster()
The first place to look should be ?raster::predict, which has an example that shows how to do that. Here it is:
library(raster)
logo <- brick(system.file("external/rlogo.grd", package="raster"))
p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85,
66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31,
22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2)
a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9,
99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21,
37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2)
xy <- rbind(cbind(1, p), cbind(0, a))
v <- data.frame(cbind(pa=xy[,1], extract(logo, xy[,2:3])))
v$pa <- as.factor(v$pa)
library(randomForest)
rfmod <- randomForest(pa ~., data=v)
rp <- predict(logo, rfmod, type='prob', index=1:2)
spplot(rp)
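To get the same result with the cluster approach from the question, the index argument can go into the args list; a sketch, assuming PRED and rf from the question:
beginCluster()
pred_RF <- clusterR(PRED, raster::predict,
args = list(model = rf, type = "prob", index = 1:2))
endCluster()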

Discretize the columns first - Apriori in R

I extracted this data from a file:
forests<-read.table("~/Desktop/f.txt", header = FALSE, sep = " ", fill = TRUE)
library(arules)  # after installing the package
I get an error when I type
forests<- apriori(forests, parameter = list(support=0.3))
Error in asMethod(object) : column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
78, 79, 80, 81, 82, 83, 84, 85 not logical or a factor. Discretize the
columns first.
I tried discretize(forests). It still doesn't work.
itemFrequencyPlot(forests) also gives an error saying unable to find inherited method.
Have I imported the data incorrectly?
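The error means apriori needs logical or factor columns (data that can be coerced to transactions), and discretize() works on a single numeric vector, not a whole data.frame. A sketch using arules::discretizeDF, assuming all columns of forests are numeric (the binning settings here are arbitrary):
library(arules)
# discretize every numeric column into 3 equal-width bins
forests_disc <- discretizeDF(forests, default = list(method = "interval", breaks = 3))
# coerce to transactions, then mine
trans <- as(forests_disc, "transactions")
rules <- apriori(trans, parameter = list(support = 0.3))
itemFrequencyPlot(trans)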

Decomposition of time series data

Can anybody help me decipher the output of ucm? My main objective is to check whether the ts data is seasonal or not, but I cannot plot and look at it every time. I need to automate the entire process and provide an indicator for seasonality.
I want to understand the following output:
ucmxmodel$s.season
# Time Series:
# Start = c(1, 1)
# End = c(4, 20)
# Frequency = 52
# [1] -2.391635076 -2.127871717 -0.864021134 0.149851212 -0.586660213 -0.697838635 -0.933982269 0.954491859 -1.531715424 -1.267769820 -0.504165631
# [12] -1.990792301 1.273673437 1.786860414 0.050859315 -0.685677002 -0.921831488 -1.283081922 -1.144376739 -0.964042949 -1.510837956 1.391991657
# [23] -0.261175626 5.419494363 0.543898305 0.002548125 1.126895943 1.474427901 2.154721023 2.501352782 0.515453691 -0.470886132 1.209419689
ucmxmodel$vs.season
# [1] 1.375832 1.373459 1.371358 1.369520 1.367945 1.366632 1.365582 1.364795 1.364270 1.364007 1.364007 1.364270 1.364795 1.365582 1.366632 1.367945
# [17] 1.369520 1.371358 1.373459 1.375816 1.784574 1.784910 1.785223 1.785514 1.785784 1.786032 1.786258 1.786461 1.786643 1.786802 1.786938 1.787052
# [33] 1.787143 1.787212 1.787257 1.787280 1.787280 1.787257 1.787212 1.787143 1.787052 1.786938 1.786802 1.786643 1.786461 1.786258 1.786032 1.785784
# [49] 1.785514 1.785223 1.784910 1.784578 1.375641 1.373276 1.371175 1.369337 1.367762 1.366449 1.365399 1.364612 1.364087 1.363824 1.363824 1.364087
# [65] 1.364612 1.365399 1.366449 1.367762 1.369337 1.371175 1.373276 1.375636 1.784453 1.784788 1.785101 1.785392 1.785662 1.785910 1.786136 1.786339
ucmxmodel$est.var.season
# Season_Variance
# 0.0001831373
How can I use the above info, without looking at the plots, to determine the seasonality and at what level (weekly, monthly, quarterly, or yearly)?
In addition, I am getting NULL in est:
ucmxmodel$est
# NULL
Data
The data for a test is:
test1 <- structure(c(44, 81, 99, 25, 69, 42, 6, 25, 75, 90, 73, 65, 55,
9, 53, 43, 19, 28, 48, 71, 36, 1, 66, 46, 55, 56, 100, 89, 29,
93, 55, 56, 35, 87, 77, 88, 18, 32, 6, 2, 15, 36, 48, 80, 48,
2, 22, 2, 97, 14, 31, 54, 98, 43, 62, 94, 53, 17, 45, 92, 98,
7, 19, 84, 74, 28, 11, 65, 26, 97, 67, 4, 25, 62, 9, 5, 76, 96,
2, 55, 46, 84, 11, 62, 54, 99, 84, 7, 13, 26, 18, 42, 72, 1,
83, 10, 6, 32, 3, 21, 100, 100, 98, 91, 89, 18, 88, 90, 54, 49,
5, 95, 22), .Tsp = c(1, 3.15384615384615, 52), class = "ts")
and
test2 <- structure(c(40, 68, 50, 64, 26, 44, 108, 90, 62, 60, 90, 64, 120, 82, 68, 60,
26, 32, 60, 74, 34, 16, 22, 44, 50, 16, 34, 26, 42, 14, 36, 24, 14, 16, 6, 6,
12, 20, 10, 34, 12, 24, 46, 30, 30, 46, 54, 42, 44, 42, 12, 52, 42, 66, 40,
60, 42, 44, 64, 96, 70, 52, 66, 44, 64, 62, 42, 86, 40, 56, 50, 50, 62, 22,
24, 14, 14, 18, 18, 10, 20, 10, 4, 18, 10, 10, 14, 20, 10, 32, 12, 22, 20, 20,
26, 30, 36, 28, 56, 34, 14, 54, 40, 30, 42, 36, 52, 30, 32, 52, 42, 62, 46,
64, 70, 48, 40, 64, 40, 120, 58, 36, 40, 34, 36, 26, 18, 28, 16, 32, 18, 12,
20), .Tsp = c(1, 4.36, 52), class = "ts")
I think the most straightforward option would be to follow Rob Hyndman's approach (he is the author of many time series packages in R). For your data it would work as follows:
library(forecast)  # provides stlf()
# Create a model with multiplicative errors (see https://www.otexts.org/fpp/7/7).
fit1 <- stlf(test2)
# Create a model with additive errors.
fit2 <- stlf(test2, etsmodel = "ANN")
deviance <- 2 * c(logLik(fit1$model) - logLik(fit2$model))
df <- attributes(logLik(fit1$model))$df - attributes(logLik(fit2$model))$df
# P-value
1 - pchisq(deviance, df)
# [1] 1
Based on this analysis we find a p-value of 1, which leads us to conclude there is no significant seasonality.
I quite like the stl() function provided in R. Try this minimal example:
# some random data
x <- rnorm(200)
# as a time series object
xt <- ts(x, frequency = 10)
# do the decomposition
xts <- stl(xt, s.window = "periodic")
# plot the results
plot(xts)
Now you can get an estimate of the 'seasonality' by comparing the variances.
vars <- apply(xts$time.series, 2, var)
vars['seasonal'] / sum(vars)
You now have the seasonal variance as a proportion of the sum of variances after decomposition.
I highly recommend reading the original paper so that you understand what's happening under the hood here. It's very accessible, and I like this method because it is quite intuitive.
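To turn this into the automated indicator the asker wants, one could threshold that proportion; a sketch, where the 0.2 cutoff is an arbitrary assumption to tune for your data:
vars <- apply(xts$time.series, 2, var)  # as above
seasonal_share <- unname(vars['seasonal'] / sum(vars))
is_seasonal <- seasonal_share > 0.2     # hypothetical cutoff
is_seasonal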

How to predict new raster using model generated by cforest

I use a randomForest model to predict class memberships. 'x' consists of 10 classes that I use to train on 'training_predictors', values extracted from a large RasterStack/brick. The specific line of code is:
r_tree <- randomForest(x ~ ., data = training_predictors, ...)
Then I run 'predict' using the model 'r_tree', which I apply to the RasterStack 'predictor_data', as follows:
predictions <- predict(predictor_data, r_tree, filename=outraster, fun=predict, na.rm=TRUE, format="PCDISK", overwrite=TRUE, progress="text", type="response")
The output is a raster that I use as thematic map.
I would like to use the conditional inference tree model 'cforest' instead of randomForest to achieve the same goal.
I understand that 'predict' can be used with cforest, yet I have not been able to generate raster files like those produced with randomForest as illustrated above.
It should run fine, but you may need to add the argument OOB=TRUE, and identify factors if there are any.
Example data
library(raster)
logo <- brick(system.file("external/rlogo.grd", package="raster"))
p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85,
66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31,
22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2)
a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9,
99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21,
37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2)
# extract values for points
xy <- rbind(cbind(1, p), cbind(0, a))
v <- data.frame(cbind(xy[,1], extract(logo, xy[,2:3])))
colnames(v)[1] <- 'pa'
Basic model
library(party)
m1 <- cforest(pa~., control=cforest_unbiased(mtry=3), data=v)
pc1 <- predict(logo, m1, OOB=TRUE)
plot(pc1)
Model with factors
v$red <- as.factor(round(v$red/100))
logo$red <- round(logo[[1]]/100)
m2 <- cforest(pa~., control=cforest_unbiased(mtry=3), data=v)
f <- list(levels(v$red))
names(f) <- 'red'
pc2 <- predict(logo, m2, OOB=TRUE, factors=f)
plot(pc2)
By the way, this comes almost straight out of the help file of raster::predict.
