Fit Gamma distribution and plot by factor in R - r

I have data.frame object with a numeric column amount and categorical column fraud:
amount <- [60.00, 336.38, 119.00, 115.37, 220.01, 60.00, 611.88, 189.78 ...]
fraud <- [1,0,0,0,0,0,1,0, ...]
I want to fit a gamma distribution to amount but to plot it by factor(fraud).
I want a graph that will show me 2 curves with 2 different colors that will distinguish between the 2 sets (fraud/non fraud groups).
Here is what I have done so far:
fit.gamma1 <- fitdist(df$amount[df$fraud == 1], distr = "gamma", method = "mle")
plot(fit.gamma1)
fit.gamma0 <- fitdist(df$amount[df$fraud == 0], distr = "gamma", method = "mle")
plot(fit.gamma0)
I have used this reference:
How would you fit a gamma distribution to a data in R?

Perhaps what you want is
curve(dgamma(x, shape = fit.gamma0$estimate[1], rate = fit.gamma0$estimate[2]),
from = min(amount), to = max(amount), ylab = "")
curve(dgamma(x, shape = fit.gamma1$estimate[1], rate = fit.gamma1$estimate[2]),
from = min(amount), to = max(amount), col = "red", add = TRUE)
or with ggplot2
ggplot(data.frame(x = range(amount)), aes(x)) +
stat_function(fun = dgamma, aes(color = "Non fraud"),
args = list(shape = fit.gamma0$estimate[1], rate = fit.gamma0$estimate[2])) +
stat_function(fun = dgamma, aes(color = "Fraud"),
args = list(shape = fit.gamma1$estimate[1], rate = fit.gamma1$estimate[2])) +
theme_bw() + scale_color_discrete(name = NULL)

Related

Plotting classification prediction (K-nearest neighbor)

Im trying to plot my predictions using the k-nearest neighbor method but am unable to do do, I get an error message as seen below. Im sure it's something to do with how ive set up my plot but unsure as to how i need to change it. Dataset is here; https://drive.google.com/file/d/1GYnlsXgT2GS9ubeXq8Pm7iNUWDRGogU_/view?usp=sharing
set.seed(20220719)
#splitting training and testing data
ii = createDataPartition(classification[,3], p = .75, list = F)
#split the data using the indices returned by
createDataPartition
xTrain = classification[ii, 1:2] #predictors for training
yTrain = classification[ii, 3] #class label for training
xTest = classification[-ii, 1:2] #predictors for testing
yTest = classification[-ii, 3] #class label for testing
#set training options
#repeat 10 fold cross-validation, 5 times
opts = trainControl(method = 'repeatedcv', number = 10, repeats = 5)
#find optimal k (model)
kmeans_mod = train(x = xTrain, y = as.factor(yTrain),
method ='knn',
trControl = opts,
tuneGrid = data.frame(k = seq(3, 10)))
#test model on testing data
yTestPred = predict(kmeans_mod, newdata = xTest)
confusionMatrix(as.factor(yTestPred), as.factor(yTest))
#plot
plot(kmeans_mod, xTrain)
Gives the error message
Error in if (!(plotType %in% c("level", "scatter", "line"))) stop("plotType must be either level, scatter or line") :
the condition has length > 1
Im looking for an output like this;
To get a plot similar to the one in the question, you can create a grid of prediction points to produce the background classification map, then plot the test data on top using ggplot.
# Create prediction data frame for test data
preds <- data.frame(X1 = xTest[,1], X2 = xTest[,2], Group = yTestPred)
# Create classification grid
gr <- expand.grid(X1 = seq(min(classification[,1]), max(classification[,1]),
length.out = 100),
X2 = seq(min(classification[,2]), max(classification[,2]),
length.out = 100))
gr$Group <- predict(kmeans_mod, newdata = gr)
# Plot the result
library(ggplot2)
ggplot(gr, aes(X1, X2, col = Group)) +
geom_point(size = 0.6) +
geom_point(data = preds, shape = 21, aes(fill = Group),
col = "black", size = 3) +
theme_minimal(base_size = 16)
Though you may prefer a raster:
library(ggplot2)
ggplot(gr, aes(X1, X2, fill = Group)) +
geom_raster(alpha = 0.3) +
geom_point(data = preds, shape = 21, col = "black", size = 3) +
theme_minimal(base_size = 16)
And you may wish to color the test data points with their actual level rather than their predicted level to get a visual impression of the model accuracy:
library(ggplot2)
ggplot(gr, aes(X1, X2, fill = Group)) +
geom_raster(alpha = 0.3) +
geom_point(data = within(preds, Group <- factor(yTest)),
col = "black", size = 3, shape = 21) +
theme_minimal(base_size = 16)

Given the probability find y for gamma distribution

Looking for some assistance using r. I know that there is invgamma but I am not sure if that will work/how to use it correctly. If X has a Gamma distribution with shape parameter r = 3 and scale parameter ρ = 6 is there a way to calculate y such that Prob(X < y) = .95? thanks!
In R you have 4 types of functions for distribution:
r[name of the distribution]- Random number generator
q[name of the distribution]- Quantile function
d[name of the distribution]- Density function
p[name of the distribution]- Distribution function
So since you have the probability, you need yo use qgamma.
P(X < x) = 0.95
shape <- 3
rate <- 6
x_95 <- qgamma(p = 0.95,shape = shape, rate = rate)
[1] 1.049299
Plot code
df <-
tibble(
x = seq(0,3,l = 1000)
) %>%
mutate(y = dgamma(x = x,shape = shape,rate = rate))
ggplot(df,aes(x,y)) +
geom_function(fun = dgamma, args = list(shape = shape, rate = rate))+
geom_vline(xintercept = x_95, linetype = "dashed")+
theme_bw()+
scale_x_continuous(breaks = x_95)+
geom_area(data = df %>% filter(x <= x_95),
alpha = .7, fill = "chocolate2")+
scale_y_continuous(expand = c(0,0))+
annotate(geom = "text",x = .5,y = .7,label = "95%",size = 12)

How to fit negative binomial function to data that is facet_wrapped by a factor in R ggplot?

I'm trying to fit a negative binomial distribution to counts data but scaled back to counts like in this example In my data, I have to separate out the binomial function plotting for two species. However, there is not easy way to specify this within the function and getting the line legends with parameter values in the key for both species.
set.seed(111)
count <- rbinom(500,100,0.1)
species <- rep(c("A","B"),time = 250)
df <- data.frame(count,species)
#Specifying negative binomial function
negbinom.params <- fitdistr(df$count,"negative binomial", method = "SANN")$estimate
dist.params <- map(list(`Negative Binomial` = negbinom.params),~ map2(names(.),.,~ paste0(.x," = ",round(.y,2))) %>% unlist %>% paste0(.,collapse = ", ")) %>% map2_chr(names(.),., ~ paste(.x,.y,sep=":\n"))
#Plotting
mybinwidth = 2
ggplot(df, aes(x = count, colour = species, fill = species)) +
facet_grid(.~species) +
geom_histogram(aes(y=..count..),alpha = 0.5, binwidth = mybinwidth) +
stat_function(aes(color = "orange"),
fun = function(x,size, mu) {
mybinwidth * nrow(df) * dnbinom(x,size = size, mu = mu)
},
args=fitdistr(df$count, "negative binomial", method="SANN")$estimate,
xlim=c(0,50),n=20)
You are right, this is a bit of a pain to get right. I've adapted your example a little bit to show two different distribution more clearly. Here is my attempt to make your approach work:
library(ggplot2)
library(MASS)
#> Warning: package 'MASS' was built under R version 3.6.2
set.seed(111)
df <- data.frame(
count = rnbinom(500, rep(c(5, 10), each = 250), 0.5),
species = rep(c("A", 'B'), each = 250)
)
# Not the prettiest formatting, but it'll show the point
ests <- sapply(split(df$count, df$species), function(x) {
est <- fitdistr(x, "negative binomial", method = "SANN")$estimate
formatted <- paste0(names(est)[1], " = ", format(est, digits = 2)[1], ",",
names(est)[2], " = ", format(est, digits = 2)[2])
formatted
})
mybinwidth <- 1
spec_A = df[df$species == "A",]
spec_B = df[df$species == "B",]
ggplot(df, aes(count)) +
geom_histogram(binwidth = mybinwidth,
aes(fill = species), alpha = 0.5,
position = "identity") +
stat_function(aes(color = "A"),
data = data.frame(species = "A"),
fun = function(x, size, mu) {
mybinwidth * nrow(spec_A) * dnbinom(x,size = size, mu = mu)
},
args = fitdistr(spec_A$count, "negative binomial", method="SANN")$estimate,
xlim = c(0, max(df$count)), n= max(df$count) + 1, inherit.aes = FALSE) +
stat_function(aes(color = "B"),
data = data.frame(species = "B"),
fun = function(x, size, mu) {
mybinwidth * nrow(spec_B) * dnbinom(x,size = size, mu = mu)
},
args = fitdistr(spec_B$count, "negative binomial", method="SANN")$estimate,
xlim = c(0, max(df$count)), n= max(df$count) + 1, inherit.aes = FALSE) +
scale_colour_discrete(labels = unname(ests), name = "fit") +
facet_wrap(~ species)
#> Warning: `mapping` is not used by stat_function()
#> Warning: `data` is not used by stat_function()
#> Warning: `mapping` is not used by stat_function()
#> Warning: `data` is not used by stat_function()
Created on 2020-04-15 by the reprex package (v0.3.0)
There are also packages that do the majority of this work for you. Disclaimer: I wrote ggh4x, so I'm not unbiased. You can also replace the ggplot code with the following (assuming similar preprocessing)
library(ggh4x)
ggplot(df, aes(count)) +
geom_histogram(binwidth = mybinwidth,
aes(fill = species), alpha = 0.5,
position = "identity") +
stat_theodensity(aes(colour = species,
y = after_stat(count * mybinwidth)),
distri = "nbinom") +
scale_colour_discrete(labels = unname(ests), name = "fit") +
facet_wrap(~ species)
Hope that helped!

Weighting using predict function

I have used 'predict' find a fit line for a linear model(lm) I have created. Because the lm was built on only 2 data points and needs to have a positive slope, I have forced it to go thru the origin (0,0). I have also weighted the function by the number of observations underlying each data point.
Question 1: (SOLVED -see comment by #Gregor)
Why does the predicted line lie so much closer to my second data point (B) than my first data point (A), when B has fewer underlying observations? Did I code something wrong here when weighting the model?
Question 2:
Plotting GLM (link=logit) now, but how can still I force this through 0,0? I've tried adding formula = y~0+x in several places, none of which seem to work.
M <- data.frame("rate" = c(0.4643,0.2143), "conc" = c(300,6000), "nr_dead" = c(13,3), "nr_surv" = c(15,11), "region" = c("A","B"))
M$tot_obsv <- (M$nr_dead+M$nr_surv)
M_conc <- M$conc
M_rate <- M$rate
M_tot_obsv <- M$tot_obsv
#**linear model of data, force 0,0 intercept, weighted by nr. of observations of each data point.**
M_lm <- lm(data = M, rate~0+conc, weights = tot_obsv)
#**plot line using "predict" function**
x_conc <-c(600, 6700)
y_rate <- predict(M_lm, list(conc = x_conc), weights = tot_obsv, type = 'response')
plot(x = M$conc, y = M$rate, pch = 16, ylim = c(0, 0.5), xlim = c(0,7000), xlab = "conc", ylab = "death rate")
lines(x_conc, y_rate, col = "red", lwd = 2)
#**EDIT 1:**
M_glm <- glm(cbind(nr_dead, nr_surv) ~ (0+conc), data = M, family = "binomial")
#*plot using 'predict' function*
binomial_smooth <- function(formula = (y ~ 0+x),...) {
geom_smooth(method = "glm", method.args = list(family = "binomial"), formula = (y ~ 0+x), ...)
}
tibble(x_conc = c(seq(300, 7000, 1), M$conc), y_rate = predict.glm(M_glm, list(conc = x_conc), type = "response")) %>% left_join(M, by = c('x_conc' = 'conc')) %>%
ggplot(aes(x = x_conc, y = y_rate)) + xlab("concentration") + ylab("death rate") +
geom_point(aes(y = rate, size = tot_obsv)) + binomial_smooth(formula = (y ~ 0+x)) + theme_bw()

Plotting lift curve in MLR

I would like to know how to plot lift curves in MLR especially for a Benchmark experiment with multiple algorithms and tasks. Help with ROC curve plotting will also be appreciated.
Thanks.
I am not a mlr user but here is a general way.
First some data:
Two class problem
iris2 = iris[iris$Species!="setosa",]
iris2$Species = factor(iris2$Species)
1st model:
log_model = glm(Species~., data = iris2, family = "binomial")
prob = predict(log_model, iris2, type = "response") #get the logistic regression prob
2nd model:
library(e1071)
svm_model = svm(Species~., data = iris2, probability = TRUE)
prob_svm = predict(svm_model, iris2, probability = TRUE)
prob_svm = attr(prob_svm , "probabilities")[,2] #get the probability for svm model
make a data frame from classes (1/0 coding) and additional columns for predicted probabilities for each model
for_lift = data.frame(Class = as.factor(ifelse(iris2$Species == "versicolor", 1, 0)), glm = prob, svm = prob_svm)
make a lift object
library(caret)
lift_obj = lift(Class ~ glm+svm, data = for_lift)
xyplot(lift_obj, auto.key = list(columns = 2,
lines = TRUE,
points = FALSE))
You can use the same data frame to plot ROC curves
library(pROC)
plot(pROC::roc(response = for_lift$Class,
predictor = for_lift$glm,
levels=c(0, 1)),
lwd=1.5)
plot(
pROC::roc(response = for_lift$Class,
predictor = for_lift$svm ,
levels=c(0, 1)),
add=T, lty=2, lwd=1.5)
legend(0.9, 0.9, c("logistic", "svm"), lty = c(1,2))
You can also check the ROCR package: https://cran.r-project.org/web/packages/ROCR/ROCR.pdf it has methods to plot both types of plots
Additionally if you are a ggplot2 user you can use the lift_obj to plot lift and ROC curves with it also.
library(ggplot2)
p1 = ggplot(lift_obj$data)+
geom_line(aes(CumTestedPct, CumEventPct, color = liftModelVar))+
xlab("% Samples tested")+
ylab("% Samples found")+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_polygon(data = data.frame(x = c(0, lift_obj$pct, 100, 0),
y = c(0, 100, 100, 0)),
aes(x = x, y = y), alpha = 0.1)
p2 = ggplot(lift_obj$data)+
geom_line(aes(1-Sp , Sn, color = liftModelVar))+
scale_color_discrete(guide = guide_legend(title = "method"))
library(cowplot)
plot_grid(p1, p2, labels=c("lift", "ROC"))

Resources