I have been finding hyperSpec very useful; however, I keep running into errors when working through "Outlier Removal by Principal Component Analysis (PCA)" in the chondro tutorial. The code I'm running is below:
pca <- prcomp (spc_N2, center = TRUE)
scores <- decomposition (spc_N2, pca$x, label.wavelength = "PC", label.spc = "score/a.u.")
loadings <- decomposition (spc_N2, t(pca$rotation), scores = FALSE, label.spc = "loading I/a.u.")
pairs (scores [[,,1:20]], pch = 19, cex = 0.5)
This results in a plot of the first 20 score pairs as expected. When I try to identify spectra:
out <- map.identify (scores [,,5])
I receive the following error:
Error in eval(modelRHS[[2]], data, env) : object 'x' not found
Any suggestions would be greatly appreciated.
Thank you
Haley
**Edit:** I've added an example file. Here is the code I used to import and pre-process it:
library(hyperSpec)
#import file
file <- read.table ("t0_CA_bln_adj.csv", header = TRUE, dec = ".", sep = ",")
spc <- new ("hyperSpec", wavelength = file [, 1], spc = t (file [, -1]),
            data = data.frame (sample = colnames (file [, -1])),
            labels = list (.wavelength = "cm-1", spc = "I"))
#initial plot
plot (spc)
#intensity standardize to mean of N2 peak
factors_N2 <- 1/apply(spc[, , 2200~2400],1,mean)
spc_N2<-sweep(spc,1,factors_N2,"*")
plot(spc_N2)
#PCA
pca <- prcomp (spc_N2, center = TRUE)
scores <- decomposition (spc_N2, pca$x, label.wavelength = "PC", label.spc = "score/a.u.")
loadings <- decomposition (spc_N2, t(pca$rotation), scores = FALSE, label.spc = "loading I/a.u.")
#plot score plots of the first 20 and first 5 PCs
pairs (scores [[,,1:20]], pch = 19, cex = 0.5)
pairs (scores [[,,1:5]], pch = 19, cex = 0.5)
#attempt to identify outliers
out <- map.identify (scores [,,5])
Example file: t0_CA_bln_adj.csv
I'm trying to plot confidence ellipses for the results of a Multiple Factor Analysis, using the function from this article: http://dx.doi.org/10.1016/j.foodqual.2012.04.010. However, I can't get the individual ellipses. I am using the wine example from the FactoMineR package. If anyone can help me identify the error I would be very grateful.
library(FactoMineR)
data(wine)
res <- MFA(wine, group=c(2,5,3,10,9,2), type=c("n",rep("s",5)),
ncp=5, name.group=c("orig","olf","vis","olfag","gust","ens"),
num.group.sup=c(1,6))
The function from the article:
MFAconf = function(MFAresob, axes = c(1,2)){
  if (!require("FactoMineR")) install.packages("FactoMineR")
  library("FactoMineR")
  # The number of samples (n)
  n = dim(MFAresob$ind$coord)[1]
  # The number of groups of variables (m)
  m = dim(MFAresob$group$coord)[1]
  # Create a new data frame with one row for each sample's associated MFA group of variables
  CATnames <- vector(mode = "character", length = n*m)
  for (j in 1:n){
    CATnames[(((j-1)*m)+1):(j*m)] <- dimnames(MFAresob$ind$coord[order(row.names(MFAresob$ind$coord)), ])[[1]][j]
  }
  PartielDim <- cbind.data.frame(names = CATnames, MFAresob$ind$coord.partiel)
  PartielDim$names = as.factor(PartielDim$names)
  # Bootstrap the new data frame
  Boot <- simule(PartielDim, nb.simul = 500)
  # Create ellipses around 95% of the bootstrapped means
  EllipCoord <- coord.ellipse(Boot$simul, level.conf = 0.95, bary = FALSE, axes = axes, npoint = 100)
  # Plot the ellipses
  plot.MFA(MFAresob, choix = "ind", title = "", axes = axes, ellipse = EllipCoord, ellipse.par = NULL)
}
#Applying the function to the results for MFA:
MFAconf(MFAresob = res)
I'm trying to use functions from the bayesplot package on an INLA object and am a little unsure of how to draw from the posterior predictive distribution. I think I almost have it, but the rstan draws are more variable than the INLA ones.
In rstan, using the simplified example from the bayesplot vignette, I can:
library(bayesplot)
library(ggplot2)
library(rstanarm)
library(ggpubr)
library(tidyverse)
#rstan model set up
roaches$roach100 <- roaches$roach1 / 100 # pre-treatment number of roaches (in 100s)
fit_poisson <- stan_glm(y ~ roach100 + treatment + senior, offset = log(exposure2), family = poisson(link = "log"), data = roaches, seed = 1111, refresh = 0)
#In order to use the PPC functions from the bayesplot package we need a vector y of outcome values:
y <- roaches$y
#and a matrix yrep of draws from the posterior predictive distribution,
yrep_poisson <- posterior_predict(fit_poisson, draws = 500)
#then plot:
p1 <- bayesplot::ppc_dens_overlay(y, yrep_poisson[1:50, ])
p1
I want to replicate that plot with an INLA object. According to the bayesplot vignette you can do this, as they provide code to define a simple pp_check method that works on fitted model objects of class e.g. foo:
pp_check.foo <- function(object, type = c("multiple", "overlaid"), ...) {
  type <- match.arg(type)
  y <- object[["y"]]
  yrep <- object[["yrep"]]
  stopifnot(nrow(yrep) >= 50)
  samp <- sample(nrow(yrep), size = ifelse(type == "overlaid", 50, 5))
  yrep <- yrep[samp, ]
  if (type == "overlaid") {
    ppc_dens_overlay(y, yrep, ...)
  } else {
    ppc_hist(y, yrep, ...)
  }
}
To use pp_check.foo we can just make a list with y and yrep components and give it class foo:
x <- list(y = rnorm(200), yrep = matrix(rnorm(1e5), nrow = 500, ncol = 200))
class(x) <- "foo"
#create plot above:
pp_check(x, type = "overlaid")
INLA
#create same model but in inla:
library(INLA)
fit_poisson_inla <- inla(y ~ roach100 + treatment + senior, offset = log(exposure2), data = roaches,
control.predictor = list(compute = T),
family = "poisson")
inla_object_name$marginals.fitted.values returns a posterior predictive distribution for each y:
fit_poisson_inla$marginals.fitted.values
#so to get the distribution for the first observation:
fitted.Predictor.1 <- fit_poisson_inla$marginals.fitted.values[[1]]
I think repeatedly sampling from this would give me what I need, but there are only 75 values per observation (dim(fitted.Predictor.1)) used to create this distribution, when in reality I would want to sample from a full range of values. I think we can do this (section 4.3 here) by using inla.tmarginal on the linear predictor:
fitted_dist <- fit_poisson_inla$marginals.linear.predictor
#should i have used "inla.rmarginal(n, marginal)"?
marginal_dist <- lapply(fitted_dist, function(y) inla.tmarginal(function(x) exp(x), y)) %>%
  map(~ as.data.frame(.) %>% rename(., xx = x))
#resample 500 times
yrep_poisson_inla <- as.matrix(bind_rows(rerun(500,
  lapply(marginal_dist, function(x) sample(x$xx, 1)) %>% as.data.frame())))
#convert to class foo for pp_check
x <- list(y = y, yrep = yrep_poisson_inla[1:50, ])
class(x) <- "foo"
p2 <- pp_check(x, type = "overlaid")
#plot
ggarrange(p1, p2, ncol = 1, nrow = 2, labels = c("rstan", "inla sample"))
My question is: how do I correctly get a matrix of draws from the posterior predictive distribution from this INLA object (fit_poisson_inla) to pass into pp_check? yrep_poisson produces discrete values while yrep_poisson_inla produces continuous values, and there is a lot more variation in the rstan draws than in the INLA ones (second plot). Is what I have done correct and this is just a sampling issue, or is it an artifact of the different methods? In more complicated examples the differences could be substantial.
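For reference, here is the alternative I was considering with inla.rmarginal (an untested sketch; whether the extra rpois step is appropriate is exactly what I'm unsure about):
# draw from each fitted-value marginal directly, instead of resampling its 75 support points
yrep_mu <- sapply(fit_poisson_inla$marginals.fitted.values,
                  function(m) inla.rmarginal(500, m))  # 500 draws x n observations
# these would be draws of the Poisson mean only; a draw of new observations
# would also add observation noise, giving discrete values like rstan's yrep
yrep_ppd <- matrix(rpois(length(yrep_mu), lambda = yrep_mu), nrow = nrow(yrep_mu))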
Thanks
I'm currently using the textmineR package to run LDA topicmodels on news articles from 2016-2019.
However, I am quite new to R and I don't know how to display the results from the model.
I want to show the prevalence of the 8 topics my model finds over the time period for which I have collected data. The data is structured in a dataframe and is defined at the day-to-day level as %y-%m-%d.
My LDA model is made like this:
## get textmineR dtm
dtm <- CreateDtm(doc_vec = dat$fulltext, # character vector of documents
                 ngram_window = c(1, 2),
                 doc_names = dat$names,
                 stopword_vec = c(stopwords::stopwords("da"), custom_stopwords),
                 lower = TRUE, # lowercase - this is the default value
                 remove_punctuation = TRUE, # punctuation - this is the default
                 remove_numbers = TRUE, # numbers - this is the default
                 verbose = TRUE,
                 cpus = 4)
dtm <- dtm[, colSums(dtm) > 3]
dtm <- dtm[, str_length(colnames(dtm)) > 3]
############################################################
## RUN & EXAMINE TOPIC MODEL
############################################################
# set seed for reproducibility
set.seed(34838)
model <- FitLdaModel(dtm = dtm,
                     k = 8,
                     iterations = 500,
                     burnin = 200,
                     alpha = 0.1,
                     beta = 0.05,
                     optimize_alpha = TRUE,
                     calc_likelihood = TRUE,
                     calc_coherence = TRUE,
                     calc_r2 = TRUE,
                     cpus = 4)
# model log-likelihood
plot(model$log_likelihood, type = "l")
# topic coherence
summary(model$coherence)
hist(model$coherence,
     col = "blue",
     main = "Histogram of probabilistic coherence")
# top terms by topic
model$top_terms1 <- GetTopTerms(phi = model$phi, M = 10)
t(model$top_terms1)
# topic prevalence
model$prevalence <- colSums(model$theta) / sum(model$theta) * 100
# prevalence should be proportional to alpha
plot(model$prevalence, model$alpha, xlab = "prevalence", ylab = "alpha")
Can anyone tell me how to plot the most prevalent topics the model finds over time?
Do I need to tokenize the text or something like that?
I hope this makes sense.
Best,
Tokenization happens in the CreateDtm function. So, it doesn't sound like that's your issue.
You can get the prevalence of topics over a set of documents by taking a mean over the columns of theta, a matrix that's part of the resulting model.
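With your objects that is a one-liner (a minimal sketch, assuming the fitted model from your post is named model):
colMeans(model$theta)  # average proportion of each topic across all documents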
I can't give you an exact answer with your data, but I can show you a similar example with the nih_sample data that ships with textmineR
# load the NIH sample data
data(nih_sample)
# create a dtm and topic model
dtm <- CreateDtm(doc_vec = nih_sample$ABSTRACT_TEXT,
                 doc_names = nih_sample$APPLICATION_ID)
m <- FitLdaModel(dtm = dtm, k = 20, iterations = 100, burnin = 75)
# aggregate theta by the year of the PROJECT_END variable
end_year <- stringr::str_split(string = nih_sample$PROJECT_END, pattern = "/")
end_year <- sapply(end_year, function(x) x[length(x)])
end_year <- as.numeric(end_year)
topic_by_year <- by(data = m$theta, INDICES = end_year, FUN = function(x){
  if (is.null(nrow(x))) {
    # if only one row, it gets converted to a vector; just return that vector
    return(x)
  } else {
    # if multiple rows, aggregate by taking column means
    return(colMeans(x))
  }
})
topic_by_year <- as.data.frame(do.call(rbind, topic_by_year))
# the years end up as row names; make them a column for plotting
topic_by_year$year <- as.numeric(rownames(topic_by_year))
# plot topic 10's prevalence by year
plot(topic_by_year$year, topic_by_year$t_10, type = "l")
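For your day-level data the same idea should work. Here is a rough, untested sketch that assumes a date column (I'm calling it dat$date) stored in your %y-%m-%d format, and that the rows of model$theta line up with the rows of dat:
doc_date <- as.Date(dat$date, format = "%y-%m-%d")
doc_month <- format(doc_date, "%Y-%m")  # aggregate to months to smooth out daily noise
topic_by_month <- aggregate(as.data.frame(model$theta), by = list(month = doc_month), FUN = mean)
# plot topic 1's prevalence over time
plot(as.Date(paste0(topic_by_month$month, "-01")), topic_by_month$t_1,
     type = "l", xlab = "month", ylab = "topic 1 prevalence")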
I have predicted values and confidence intervals that I want to add to my 3D plot using trans3d, but I get an error on the line that uses seq. I already tried length(z.bin) and read other possible solutions, but it's still not working.
Error in seq.default(lowerCI, upperCI, length.out = 25) :
'from' must be of length 1
I hope you can help me to fix my code. Here are the predicted values (z.bin), upper CI (UCI) and lower CI (LCI):
z.bin= c(0.0293498087331418, 0.090245714112389, 0.184180408140189, 0.288479689911685,
0.380290727519617, 0.447221380019439, 0.486749948207999, 0.515460732539617,
0.524544278048373, 0.517863012982977, 0.499015552138662, 0.471040830332284,
0.436384769878271, 0.39696995466237, 0.354295721949241, 0.309542936297033,
0.263681366413638, 0.217589473510825, 0.172201272125033, 0.128688774135519,
0.0886552840745102, 0.0542241604227149, 0.0277504883386967, 0.0108213094005216,
0.00277584412160996)
UCI=c(0.0366603230533126, 0.0902131425743432, 0.190710608825939,
0.329281535177887, 0.37359325824382, 0.49083302601992, 0.502923852215148,
0.532414036794941, 0.542594424500199, 0.544876477822669, 0.513975201348124,
0.500360540087923, 0.460641689148807, 0.415363280410005, 0.358399020245284,
0.321189810843667, 0.285678220416678, 0.234306786216362, 0.185151688725085,
0.141800528101782, 0.0848830167493455, 0.0596895934068413, 0.034797331186028,
0.0136423698337293, 0.00416130620917585)
LCI=c(0.0203880237502624, 0.0639803379126716, 0.15252099326726, 0.279883133515488,
0.321969495145084, 0.433138773211774, 0.445700330934391, 0.474863237969827,
0.485779389412345, 0.489219946727086, 0.461012139808171, 0.449297954511444,
0.412682077834953, 0.370799794091489, 0.317884618001687, 0.283779930784182,
0.251320227770169, 0.20400383106003, 0.158982316141284, 0.119627373509671,
0.0683623411169277, 0.0464255905587446, 0.0252020843583765, 0.00810835262770212,
0.0014811836711362)
Code (please don't run the points() and trans3d() lines; res2 is not included, but they're there to show you the loop I want to use to create the CI bars):
y.bin <- rep(1,25)
x.bin <- seq(-10,10,length.out = 25)
# points(trans3d(x.bin, y.bin, z.bin, pmat = res2), col = 1, pch = 16)
for (i in 1:length(z.bin)) {
  lowerCI <- LCI
  upperCI <- UCI
  CI.bar <- seq(lowerCI, upperCI, length.out = 25)
  # lines (trans3d(x.bin[i], y.bin[i], z = CI.bar, pmat = res2), col = 1, lwd = 2)
}
Looks like you just need to index the LCI and UCI values; otherwise you're feeding the seq function the entire vectors:
for (i in 1:length(z.bin)) {
  lowerCI <- LCI[i]
  upperCI <- UCI[i]
  CI.bar <- seq(lowerCI, upperCI, length.out = 25)
  # lines (trans3d(x.bin[i], y.bin[i], z = CI.bar, pmat = res2), col = 1, lwd = 2)
}
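As a side note, the same thing can be done without the loop; a small untested sketch:
# each column of the resulting 25 x 25 matrix is one CI bar's z-values
CI.bars <- mapply(function(lo, hi) seq(lo, hi, length.out = 25), LCI, UCI)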
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that their IV (information value) is maximized
maximize the chi-square in the two-way contingency table -- the target has two values, 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WoE) with the package woeBinning, which optimizes the number of bins for the IV:
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the replaced WoE values that you can then pass into your regression instead of Variable_name (see the sketch below)
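A minimal sketch of that last step, reusing the hypothetical names from above (untested):
fit <- glm(target_name ~ Variable_name.woe.binned, data = dataset_woe, family = binomial)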
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
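An untested sketch with its ChiMerge implementation, again with the hypothetical column names from above; chiM() expects a data frame whose last column is the class, and alpha is the chi-square significance threshold:
library(discretization)
disc <- chiM(data.frame(x = dataset$Variable_name, target = dataset$target_name), alpha = 0.05)
disc$cutp             # cut points found for each continuous column
head(disc$Disc.data)  # the data with x replaced by bin indices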
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
# quantile-based breaks for one column (lapply passes each column in as a vector)
breaks <- function(x, n) quantile(x, probs = seq(0, 1, length.out = n), na.rm = TRUE)
lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
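A possible follow-up to actually apply the breaks, with x1 as a hypothetical column name (untested):
brks <- breaks(dfrm$x1, n = 5)
dfrm$x1_binned <- cut(dfrm$x1, breaks = unique(brks), include.lowest = TRUE)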
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
  if(is.numeric(mydata[,i])){
    # cap each numeric column at 1.5 * its 75th percentile + 1
    val.quant <- unname(quantile(mydata[,i], probs = 0.75))
    mydata[,i] <- sapply(mydata[,i], function(x){ if(x > (1.5*val.quant+1)) 1.5*val.quant+1 else x })
  }
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
  # refit, rank variables by sensitivity importance, and drop one each iteration
  rf <- fit(Y~., data = mydata2, model = "randomForest", mtry = 10, ntree = 100)
  varImportance <- Importance(rf, mydata2, method = "sensg")
  Z <- order(varImportance$imp, decreasing = FALSE)
  IND <- Z[2]
  var_to_remove <- names(mydata2[IND])
  mydata2[IND] <- NULL
  print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(target~., data = train, family = binomial)
summary(fit1)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf, test))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("tomasgreif/riv")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds <- createFolds(mydata2$Y, cv_fold_count, list = TRUE)
# loop over the folds; each iteration fits on the other folds and predicts the held-out one
for (i in 1:cv_fold_count) {
  smpl <- folds[[i]]
  g_train <- mydata2[-smpl, !names(mydata2) %in% c("Y")]
  g_test <- mydata2[smpl, !names(mydata2) %in% c("Y")]
  cost_train <- mydata2[-smpl, "Y"]
  cost_test <- mydata2[smpl, "Y"]
  rf <- randomForest(g_train, cost_train)
  logit.data <- cbind(cost_train, g_train)
  logit.fit <- glm(cost_train~., data = logit.data, family = binomial)
  prediction <- data.frame(actual = cost_test, predicted = predict(rf, g_test))
}