Plot Confidence Ellipses for Multiple Factor Analysis - r

I'm trying to plot ellipses with the function http://dx.doi.org/10.1016/j.foodqual.2012.04.010 for the results of a Multiple Factor Analysis however I can't get the individual ellipses. For this I am trying to use the example of the FactoMineR package. If anyone can help me identify the error I would be very grateful.
library(FactoMineR)
data(wine)
res <- MFA(wine, group=c(2,5,3,10,9,2), type=c("n",rep("s",5)),
ncp=5, name.group=c("orig","olf","vis","olfag","gust","ens"),
num.group.sup=c(1,6))
Article function:
MFAconf = function(MFAresob, axes = c(1,2)){
if (!require("FactoMineR")) install.packages("FactoMineR");
library("FactoMineR")
# The number of samples (n)
n = dim(MFAresob$ind$coord)[1]
# The number of groups of variables (m)
m = dim(MFAresob$group$coord)[1]
# Creating a new data frame with one row for each sample’s assiociated MFA group of variables.
CATnames <- vector(mode="character", length = n*m)
for (j in 1:n){CATnames[(((j-1)*m)+1):(j*m)] <- dimnames(MFAresob$ind$coord[order(row.names(MFAresob$ind$coord)),])[[1]][j]}
PartielDim <- cbind.data.frame(names = CATnames,MFAresob$ind$coord.partiel)
PartielDim$names = as.factor(PartielDim$names)
# Bootstrapping the new data frame
Boot <- simule(PartielDim, nb.simul = 500)
# Creating ellipses around 95% of the bootstrapped means
EllipCoord <- coord.ellipse(Boot$simul, level.conf = 0.95, bary = FALSE, axes=axes, npoint = 100)
#Plotting the ellipses
plot.MFA(MFAresob, choix = "ind",title ="", axes = axes, ellipse = EllipCoord,ellipse.par = NULL)
}
#Applying the function to the results for MFA:
MFAconf(MFAresob = res)

Related

Getting a list of points (in x, y form) in a Julia graph

If I have a Julia plot with any number of points, would it be possible for me to get a list of all of the data points within the graph (using the Plots library)?
EDIT: I am working with GeoStats.jl to create temporal variograms, and I just wanted to calculate the error (using RMSE and MAE) of the model's fit. To do this, I thought I had to compare the points within model's curve with the original semivariogram. The current code I have running is:
using GeoStats, Plots, DataFrames, CSV, Dates, MLJ
data_frame = CSV.read("C:/Users/VSCode/MINTS-Variograms/data/MINTS_001e06373996_IPS7100_2022_01_02.csv", DataFrame)
ms = [parse(Float64,x[20:26]) for x in data_frame[!,:dateTime]]
ms = string.(round.(ms,digits = 3)*1000)
ms = chop.(ms,tail= 2)
data_frame.dateTime = chop.(data_frame.dateTime,tail= 6)
data_frame.dateTime = data_frame.dateTime.* ms
data_frame.dateTime = DateTime.(data_frame.dateTime,"yyyy-mm-dd HH:MM:SS.sss")
ls_index = findall(x-> Millisecond(500)<x<Millisecond(1500), diff(data_frame.dateTime))
df = data_frame[ls_index, :]
#include calculation for average lag
#initialize georef data
𝒟 = georef((Z=df.pm2_5, ))
#empirical variogram - same thing as semivariogram
g = EmpiricalVariogram(𝒟, :Z, maxlag=300.)
plot(g, label = "")
γ = fit(Variogram, g)
plot!(γ, label = "")
hline!([γ.nugget], label = "")
hline!([γ.sill], label = "")
println("nugget: " * string(γ.nugget))
println("sill: " * string(γ.sill))

Visualizing pcAlgo object in ggdag?

There's documentation here (https://www.rdocumentation.org/packages/pcalg/versions/2.7-3/topics/pc) on how to run the pc algorithm for binary data.
library(pcalg)
##################################################
## Using discrete data
##################################################
## Load data
data(gmD)
V <- colnames(gmD$x)
## define sufficient statistics
suffStat <- list(dm = gmD$x, nlev = c(3,2,3,4,2), adaptDF = FALSE)
## estimate CPDAG
pc.D <- pc(suffStat,
## independence test: G^2 statistic
indepTest = disCItest, alpha = 0.01, labels = V, verbose = TRUE)
if (require(Rgraphviz)) {
## show estimated CPDAG
par(mfrow = c(1,2))
plot(pc.D, main = "Estimated CPDAG")
plot(gmD$g, main = "True DAG")
}
I'm wondering if there's anyway I can use the pc.D object to visualize this in ggdag (https://cran.r-project.org/web/packages/ggdag/vignettes/intro-to-ggdag.html).
Is there a way?

sampling from posterior predictive distribution (stan vs inla)

I'm trying to implement functions from bayesplot package on a INLA object and a little unsure of how to draw from the posterior predictive distribution. I think I almost have it but rstan draws are more variable than the INLA ones.
In rstan, using the simplified example from bayesplot vignette I can:
library(bayesplot)
library(ggplot2)
library(rstanarm)
library(ggpubr)
library(tidyverse)
#rstan model set up
roaches$roach100 <- roaches$roach1 / 100 # pre-treatment number of roaches (in 100s)
fit_poisson <- stan_glm(y ~ roach100 + treatment + senior, offset = log(exposure2), family = poisson(link = "log"), data = roaches, seed = 1111, refresh = 0)
#In order to use the PPC functions from the bayesplot package we need a vector y of outcome values:
y <- roaches$y
#and a matrix yrep of draws from the posterior predictive distribution,
yrep_poisson <- posterior_predict(fit_poisson, draws = 500)
#then plot:
p1 <- bayesplot::ppc_dens_overlay(y, yrep_poisson[1:50, ])
p1
I want to replicate that plot on a INLA object. According to the bayesplot vignette you can do this as they have provided code to define a simple pp_check method that creates fitted model objects of class e.g. foo:
pp_check.foo <- function(object, type = c("multiple", "overlaid"), ...) {
type <- match.arg(type)
y <- object[["y"]]
yrep <- object[["yrep"]]
stopifnot(nrow(yrep) >= 50)
samp <- sample(nrow(yrep), size = ifelse(type == "overlaid", 50, 5))
yrep <- yrep[samp, ]
if (type == "overlaid") {
ppc_dens_overlay(y, yrep, ...)
} else {
ppc_hist(y, yrep, ...)
}
}
To use pp_check.foo we can just make a list with y and yrep components and give it class foo:
x <- list(y = rnorm(200), yrep = matrix(rnorm(1e5), nrow = 500, ncol = 200))
class(x) <- "foo"
#create plot above:
pp_check(x, type = "overlaid")
INLA
#create same model but in inla:
library(INLA)
fit_poisson_inla <- inla(y ~ roach100 + treatment + senior, offset = log(exposure2), data = roaches,
control.predictor = list(compute = T),
family = "poisson")
inla_object_name$marginals.fitted.values returns a posterior predictive distribution for each y:
fit_poisson_inla$marginals.fitted.values
#so to get distribution for first oberservation:
fitted.Predictor.1 <- fit_poisson_inla$marginals.fitted.values[[1]]
I think repeatedly sampling from this would give me what I need but there are only 75 values (dim(fitted.Predictor.1) per observation used to create this distribution when in reality I would want to be sampling from a full range of values. I think we can do this (section 4.3 here) by using inla.tmarginal using linear predictor:
fitted_dist <- fit_poisson_inla$marginals.linear.predictor
#should i have used "inla.rmarginal(n, marginal)"?
marginal_dist <- lapply(fitted_dist, function(y) inla.tmarginal(function(x) {exp(x)}, y)) %>% map(~ as.data.frame(.) %>% rename(., xx = x))
#resample 500 times
yrep_poisson_inla <- as.matrix(bind_rows(rerun(500, lapply(marginal_dist, function(x) sample(x$xx, 1)) %>% as.data.frame())))
#convert to class foo for pp_check
x <- list(y = y, yrep = yrep_poisson_inla[1:50, ])
class(x) <- "foo"
p2 <- pp_check(x, type = "overlaid")
#plot
ggarrange(p1, p2, ncol = 1, nrow = 2, labels = c("rstan", "inla sample"))
My question is how do I correctly get a matrix of draws from the posterior predictive distribution from this inla (fit_poisson_inla) object to pass into pp_check? yrep_poisson produces discrete values while yrep_poisson_inla produces continuous values. There is a lot more variation in the rstan draws than INLA (second plot). Is what I have done correct and this is just some sampling issue or is it an artifact of the different methods? In more complicated examples the differences could be substantial.
Thanks

Error using PCA outlier dectection in HyperSpec

I have been finding HyperSpec to very useful, however, I continue to receive errors when running through "Outlier Removal by Principal Component Analysis (PCA)" following the steps in the Chondro tutorial . The code I'm running is below:
pca <- prcomp (spc_N2, center = TRUE)
scores <- decomposition (spc_N2, pca$x, label.wavelength="PC",label.spc_N2="score/a.u.")
loadings <- decomposition (spc_N2, t(pca$rotation), scores = FALSE,label.spc_N2="laoding I/a.u.")
pairs (scores [[,,1:20]], pch = 19, cex = 0.5)
This results in a plot of the first 20 score pairs as expected. When I try to identify spectra:
out <- map.identify (scores [,,5])
I receive the following error:
Error in eval(modelRHS[[2]], data, env) : object 'x' not found
Any suggestions would be greatly appreciated.
Thank you
Haley
**edit
I've added an example file. Here is the code I used to import and pre-process the file:
library(hyperSpec)
#import file
file <- read.table ("t0_CA_bln_adj.csv", header = TRUE, dec = ".", sep = ",")
spc <- new ("hyperSpec", wavelength = file [,1], spc = t (file [, -1]), data = data.frame (sample = colnames (file [, -1])), labels = list ((.wavelength = "cm-1"), spc = "I"))
#initial plot
plot (spc)
#intensity standardize to mean of N2 peak
factors_N2 <- 1/apply(spc[, , 2200~2400],1,mean)
spc_N2<-sweep(spc,1,factors_N2,"*")
plot(spc_N2)
#PCA
pca <- prcomp (spc_N2, center = TRUE)
scores <- decomposition (spc_N2, pca$x, label.wavelength="PC",label.spc_N2="score/a.u.")
loadings <- decomposition (spc_N2, t(pca$rotation), scores = FALSE,label.spc_N2="laoding I/a.u.")
#plot score plots of the first 20 and first 5 PCs
pairs (scores [[,,1:20]], pch = 19, cex = 0.5)
pairs (scores [[,,1:5]], pch = 19, cex = 0.5)
#attempt to identify outliers
out <- map.identify (scores [,,5])
file example: t0_CA_bln_adj

binning continuous variables by IV value in R

I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that its IV (information value) is maximized
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woebinning which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoint, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns
Variable_name.binned which is the labels
Variable_name.woe.binned which is the replaced values that you can then parse into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantiles( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
s
etwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.f
rame(actual = test$Y, predicted = predict(rf,test))

Resources