Why is geom_density plotting my data differently from the expected image? - r

I have the following task statement:
In this task we want to simulate random variables with density f(x) = exp(-|x|) / 2 (the standard Laplace density).
To do this, write a function r_density(n) that simulates n of such random variables.
Then use this function to simulate N = 1000 of such random variables. Using geom_density() you can now estimate the density from the simulated random variables. We can compare this estimate with the real density. To do this, create a graph that looks like this:
Problem is, however, that I don't understand why my output looks like this:
Why is the estimated density plotted in such a weird way? Can someone explain why it looks like that and how to get the estimated density shown in the expected image?
This is the corresponding code I wrote for the above plot:
library(tidyverse)
N <- 1000
r_density <- function(n){
  exp(-abs(n))/2
}
x <- runif(N)
tb <- tibble(
  x = x,
  density_fkt = r_density(x)
)
ggplot() +
  geom_density(
    data = tb,
    mapping = aes(
      x = density_fkt,
      y = ..scaled..
    )
  ) +
  geom_function(
    fun = r_density,
    xlim = c(-6, 6),
    color = "red",
    size = 1
  ) +
  theme_minimal() +
  labs(
    x = "x",
    y = "Density function f(x)",
    title = "Estimated (black) vs. true (red) density"
  )

The problem is that your r_density() doesn't simulate anything: it only evaluates the density f(x) = exp(-|x|)/2 at its argument. You then hand geom_density() the density values of 1000 uniform numbers, so it estimates the distribution of those values rather than the distribution you were asked to simulate. You need to actually draw samples from f, for example via inverse transform sampling or rejection sampling. I chose rejection sampling.
library(tidyverse)
N <- 1000
r_density <- function(n){
  exp(-abs(n))/2
}
# Rejection sampling: propose y from N(0, 2^2), restricted to [-6, 6].
# On that interval 3 * dnorm(y, sd = 2) >= exp(-|y|)/2, so M = 3 is a valid
# envelope constant (a standard normal proposal would violate this bound for |y| > ~2.4).
x <- c()
while (length(x) < N) {
  y <- rnorm(1, sd = 2)
  while (y > 6 || y < -6) {
    y <- rnorm(1, sd = 2)
  }
  u <- runif(1)
  if (u < r_density(y) / (dnorm(y, sd = 2) * 3)) {
    x <- append(x, y)
  }
}
tb <- tibble(
  x = x,
  density_fkt = r_density(x)
)
ggplot() +
  geom_density(
    data = tb,
    mapping = aes(
      x = x,
      y = ..density..
    )
  ) +
  geom_function(
    fun = r_density,
    xlim = c(-6, 6),
    color = "red",
    size = 1
  ) +
  theme_minimal() +
  labs(
    x = "x",
    y = "Density function f(x)",
    title = "Estimated (black) vs. true (red) density"
  )
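As a quick sanity check on the simulated sample (my addition, not part of the original answer): for the standard Laplace density exp(-|x|)/2 we have E|X| = 1 and Var(X) = 2, so these should land close to those values:
mean(abs(x))  # should be near 1
var(x)        # should be near 2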

Here's the inverse transform sampling method (this involves some difficult integration, so perhaps not what your teacher intended)
r_density <- function(n) {
  # CDF of f(x) = exp(-|x|)/2: equal to exp(x)/2 for x < 0 and 1 - exp(-x)/2 for x >= 0,
  # written here as a single expression using sign(x)
  cdf <- function(x) {
    1/4 * exp(-x) * (-1 + 2 * exp(x) + exp(2*x) - (-1 + exp(x))^2 * sign(x))
  }
  # invert the CDF numerically for each uniform draw
  sapply(runif(n), function(i) {
    uniroot(function(x) cdf(x) - i, c(-30, 20))$root
  })
}
Plotting gives:
ggplot() +
  geom_density(aes(r_density(1000))) +
  geom_function(
    fun = function(x) exp(-abs(x))/2,
    xlim = c(-6, 6),
    color = "red",
    size = 1
  ) +
  theme_minimal() +
  labs(
    x = "x",
    y = "Density function f(x)",
    title = "Estimated (black) vs. true (red) density"
  )
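As an aside (my addition), the inverse CDF of this particular density also has a closed form, so the numerical root finding can be skipped entirely. A quick sketch, based on the piecewise CDF F(x) = exp(x)/2 for x < 0 and 1 - exp(-x)/2 for x >= 0:
r_density <- function(n) {
  u <- runif(n)
  # invert the CDF piecewise: F^-1(u) = log(2u) for u < 1/2 and -log(2(1 - u)) otherwise
  ifelse(u < 0.5, log(2 * u), -log(2 * (1 - u)))
}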

Related

Monte Carlo Sim in R plots STRAIGHTS

So I am getting started with Monte Carlo sims and went with this basic code to simulate returns for a given portfolio. Somehow a portion of the simulated returns always comes out as straight lines, which are easy to see on the plotted graph. First I decreased the number of sims so you can see it more clearly, and I also played around with some other factors, but they keep showing up. The rest of the output looks promising and "random".
Added the link to the image as my account is new and also the code, appreciate any help!:
library(quantmod)
library(ggplot2)
maxDate <- "2000-01-01"
tickers <- c("MSFT", "AAPL", "BRK-B")
getSymbols(tickers, from = maxDate)
Port.p <- na.omit(merge(Cl(AAPL), Cl(MSFT), Cl(`BRK-B`)))
Port.r <- ROC(Port.p, type = "discrete")[-1,]
stock_Price <- as.matrix(Port.p[, 1:3])
stock_Returns <- as.matrix(Port.r[, 1:3])
mc_rep = 50 # Number of Sims
training_days = 200
portfolio_Weights = c(0.5, 0.3, 0.2)
coVarMat = cov(stock_Returns)
miu = colMeans(stock_Returns)
Miu = matrix(rep(miu, training_days), nrow = 3)
portfolio_Returns_m = matrix(0, training_days, mc_rep)
set.seed(2000)
for (i in 1:mc_rep) {
  Z = matrix(rnorm(dim(stock_Returns)[2] * training_days), ncol = training_days)
  L = t(chol(coVarMat))
  daily_Returns = Miu + L %*% Z
  portfolio_Returns_200 = cumprod(portfolio_Weights %*% daily_Returns + 1)
  portfolio_Returns_m[, i] = portfolio_Returns_200
}
x_axis = rep(1:training_days, mc_rep)
y_axis = as.vector(portfolio_Returns_m - 1)
plot_data = data.frame(x_axis, y_axis)
ggplot(data = plot_data, aes(x = x_axis, y = y_axis)) +
  geom_path(col = 'red', size = 0.1) +
  xlab('Days') + ylab('Portfolio Returns') +
  ggtitle('Simulated Portfolio Returns in 200 days') +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
The lines are the 'return' from the end of each series to the beginning of the next. You can keep the lines separate by adding a grouping variable to your plotting data and using the group aesthetic to tell ggplot about it:
g <- rep(1:mc_rep, each = training_days)   # one group label per simulation run (columns of the matrix)
plot_data = data.frame(x_axis, y_axis, g)
ggplot(data = plot_data, aes(x = x_axis, y = y_axis, group = g)) + ...
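Putting it together with the plot layers from the question, the full call would look roughly like this:
g <- rep(1:mc_rep, each = training_days)
plot_data <- data.frame(x_axis, y_axis, g)
ggplot(data = plot_data, aes(x = x_axis, y = y_axis, group = g)) +
  geom_path(col = 'red', size = 0.1) +
  xlab('Days') + ylab('Portfolio Returns') +
  ggtitle('Simulated Portfolio Returns in 200 days') +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))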

saving ggplot in a list gives me the same graph

I am trying to plot 12 different plots on a 3 by 4 grid, but it only plots the last one 12 times. Can anyone help me? I am so fed up with it. Thanks
library(ggplot2)
library(gridExtra)
pmax = 0.85
K_min = 0.0017
T = seq(100, 1200, by = 100) ## ISIs
lambda = 1/T
p = list()
for(i in (1:length(lambda))){
  p[[i]] <- ggplot(data.frame(x = c(0, 1)), aes(x = x)) +
    stat_function(fun = function (x) (lambda[i]*(1-(1-pmax))/K_min)*(1-x)^((lambda[i]/K_min)-1)*
                    (1-(1-pmax)*x)^-((lambda[i]/K_min)+1), colour = "dodgerblue3") +
    scale_x_continuous(name = "Probability") +
    scale_y_continuous(name = "Frequency") + theme_bw()
  main <- grid.arrange(grobs = p, ncol = 4)
}
This code produces the correct picture but I need to use ggplot since my other figures are in ggplot.
par(mfrow = c(3, 4))
for (i in (1:length(lambda))){
  f <- function (x) ((lambda[i]*(1-(1-pmax))/K_min)*(1-x)^((lambda[i]/K_min)-1)*
                       (1-(1-pmax)*x)^-((lambda[i]/K_min)+1))
  curve(f, from = 0, to = 1, col = "violet", lwd = 2,
        sub = paste0("ISI = ", round(1/lambda[i], 3), ""), ylab = "PDF", xlab = "R")
}
Correct plot using curve:
ggplot objects created in a loop are only evaluated when the plots are actually drawn, which effectively happens at the end of the loop. Since all the ggplot objects here use data calculated with lambda[i] through a closure over the loop variable, they all get the same result based on the last value of i (12). Here are two possible workarounds:
Workaround 1. Convert each ggplot object into a grob within the loop, & save that to the list:
for(i in (1:length(lambda))){
  # code for generating each plot is unchanged
  g <- ggplot(data.frame(x = c(0, 1)), aes(x = x)) +
    stat_function(fun = function (x) (lambda[i]*(1-(1-pmax))/K_min)*(1-x)^((lambda[i]/K_min)-1)*
                    (1-(1-pmax)*x)^-((lambda[i]/K_min)+1), colour = "dodgerblue3") +
    scale_x_continuous(name = "Probability") +
    scale_y_continuous(name = "Frequency") + theme_bw()
  p[[i]] <- ggplotGrob(g)
}
main <- grid.arrange(grobs = p, ncol = 4)
Workaround 2. Put all the data in a data frame, & create a single ggplot with a facet for each ISI:
library(dplyr)
pmax = 0.85
K_min = 0.0017
ISI = seq(100, 1200, by = 100) # I changed this; using `T` as a name clashes with T from TRUE/FALSE
lambda = 1/ISI
df <- data.frame(
  x = rep(seq(0, 1, length.out = 101), length(ISI)),
  ISI = rep(ISI, each = 101),
  l = rep(lambda, each = 101)
) %>%
  mutate(y = (l * pmax / K_min) * (1-x) ^ ((l / K_min) - 1) *
           (1 - (1 - pmax) * x)^-((l / K_min) + 1))
ggplot(df,
       aes(x = x, y = y, group = 1)) +
  geom_line(colour = "dodgerblue3") +
  facet_wrap(~ISI, nrow = 3, scales = "free_y") +
  labs(x = "Probability", y = "Frequency") +
  theme_bw()
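A third option (my sketch, not part of the original answer): build the list of plots with lapply() instead of a for loop. Each call of the anonymous function gets its own environment, so the function passed to stat_function() captures its own value of i instead of the shared loop variable:
p <- lapply(seq_along(lambda), function(i) {
  ggplot(data.frame(x = c(0, 1)), aes(x = x)) +
    stat_function(fun = function(x) (lambda[i]*(1-(1-pmax))/K_min)*(1-x)^((lambda[i]/K_min)-1)*
                    (1-(1-pmax)*x)^-((lambda[i]/K_min)+1),
                  colour = "dodgerblue3") +
    scale_x_continuous(name = "Probability") +
    scale_y_continuous(name = "Frequency") +
    theme_bw()
})
main <- grid.arrange(grobs = p, ncol = 4)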

How to display different levels in a multilevel analysis with different colors

I am a beginner at multilevel analysis and am trying to understand how I can do graphs with the plot functions from base R. I understand the output of fit below but I am struggling with the visualization. df is just some simple test data:
t <- seq(0, 10, 1)
df <- data.frame(t = t,
                 y = 1.5 + 0.5*(-1)^t + (1.5 + 0.5*(-1)^t) * t,
                 p1 = as.factor(rep(c("p1", "p2"), 10)[1:11]))
fit <- lm(y ~ t * p1, data = df)
# I am looking for an automated version of that:
plot(df$t, df$y)
lines(df$t[df$p1 == "p1"],
      fit$coefficients[1] + fit$coefficients[2] * df$t[df$p1 == "p1"], col = "blue")
lines(df$t[df$p1 == "p2"],
      fit$coefficients[1] + fit$coefficients[2] * df$t[df$p1 == "p2"] +
        fit$coefficients[3] + fit$coefficients[4] * df$t[df$p1 == "p2"], col = "red")
It should know that it has to include p1 and that there are two lines.
The result should look like this:
Edit: est <- predict(fit, newx = t) gives the same result as fit, but I still don't know "how to cluster".
Edit 2 @Keith: The formula y ~ t * p1 reads y = (a + c * p1) + (b + d * p1) * t. For the "first blue line", c and d are both zero.
This is how I would do it. I'm including a ggplot2 version of plot as well because I find it better fitted for the way I think about plots.
This version will account for the number of levels in p1. If you want to include additional model terms, you just have to adjust the way you construct xy so it contains all the relevant variables. I should point out that if you omit the newdata argument, predict() simply returns the fitted values for the dataset originally provided to lm.
t <- seq(0, 10, 1)
df <- data.frame(t = t,
                 y = 1.5 + 0.5*(-1)^t + (1.5 + 0.5*(-1)^t) * t,
                 p1 = as.factor(rep(c("p1", "p2"), 10)[1:11]))
fit <- lm(y ~ t * p1, data = df)
xy <- data.frame(t = t, p1 = rep(levels(df$p1), each = length(t)))
xy$fitted <- predict(fit, newdata = xy)
library(RColorBrewer) # for colors, you can define your own
cols <- brewer.pal(n = length(levels(df$p1)), name = "Set1") # feel free to ignore the warning
plot(x = df$t, y = df$y)
for (i in 1:length(levels(xy$p1))) {
  tmp <- xy[xy$p1 == levels(xy$p1)[i], ]
  lines(x = tmp$t, y = tmp$fitted, col = cols[i])
}
library(ggplot2)
ggplot(xy, aes(x = t, y = fitted, color = p1)) +
  theme_bw() +
  geom_point(data = df, aes(x = t, y = y)) +
  geom_line()
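As an aside (not part of the original answer): for a plain lm fit like this you can also let ggplot2 estimate and draw one line per level of p1 directly, since geom_smooth(method = "lm") fits the regression separately within each colour group, which for this example gives the same lines as the interaction model:
library(ggplot2)
ggplot(df, aes(x = t, y = y, color = p1)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # one straight line per p1 level
  theme_bw()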

How to plot the intersection of a hyperplane and a plane in R

I have a set of (2-dimensional) data points that I run through a classifier that uses higher order polynomial transformations. I want to visualize the results as a 2-dimensional scatterplot of the points with the classifier superimposed on top, preferably using ggplot2, as all other visualizations are made with it. Pretty much like this one that was used in the CaltechX online course on machine learning (the background color is optional).
I can display the points with colors and symbols and all, that's easy, but I can't figure out how to draw anything like the classifiers (the intersection of the classifying hyperplane with the plane representing my threshold). The only thing I found was stat_function, and that only takes a function with a single argument.
Edit:
The example that was asked for in the comments:
sample data:
"","x","y","x","x","y","value"
"1",4.17338115745224,0.303530843229964,1.26674990184152,17.4171102853774,0.0921309727918932,-1
"2",4.85514814266935,3.452660451876,16.7631779801937,23.5724634872656,11.9208641959486,1
"3",3.51938610081561,3.41200957307592,12.0081790673332,12.3860785266141,11.6418093267617,1
"4",3.18545089452527,0.933340128976852,2.97310914874565,10.1470974014319,0.87112379635852,-16
"5",2.77556006214581,2.49701633118093,6.93061880335166,7.70373365857888,6.23509055818427,-1
"6",2.45974169578403,4.56341833807528,11.2248303614692,6.05032920997851,20.8247869282818,1
"7",2.73947941488586,3.35344674880616,9.18669833727041,7.50474746458339,11.2456050970786,-1
"8",2.01721803518012,3.55453519499861,7.17027250203368,4.06916860145595,12.6347204524838,-1
"9",3.52376445778646,1.47073399974033,5.1825201951431,12.4169159539591,2.1630584979922,-1
"10",3.77387718763202,0.509284208528697,1.92197605658768,14.2421490273294,0.259370405056702,-1
"11",4.15821685106494,1.03675272315741,4.31104264382058,17.2907673804804,1.0748562089743,-1
"12",2.57985028671101,3.88512040604837,10.0230289934507,6.65562750184287,15.0941605694935,1
"13",3.99800728890114,2.39457673509605,9.5735352407471,15.9840622821066,5.73399774026327,1
"14",2.10979392635636,4.58358959294856,9.67042948411309,4.45123041169019,21.0092935565863,1
"15",2.26988795562647,2.96687697409652,6.73447830932721,5.15239133109813,8.80235897942413,-1
"16",1.11802248633467,0.114183261757717,0.127659454208164,1.24997427994995,0.0130378172656312,-1
"17",0.310411276295781,2.09426849964075,0.650084557879535,0.0963551604515758,4.38596054858751,-1
"18",1.93197490065359,1.72926536411978,3.340897280049,3.73252701675543,2.99035869954433,-1
"19",3.45879891654477,1.13636834081262,3.93046958599847,11.9632899450912,1.29133300600123,-1
"20",0.310697768582031,0.730971727753058,0.227111284709427,0.0965331034018534,0.534319666774291,-1
"21",3.88408110360615,0.915658151498064,3.55649052359657,15.0860860193904,0.838429850404852,-1
"22",0.287852146429941,2.16121324687265,0.622109872005114,0.0828588582043242,4.67084269845782,-1
"23",2.80277011333965,1.22467750683427,3.4324895146344,7.85552030822994,1.4998349957458,-1
"24",0.579150241101161,0.57801398797892,0.334756940497835,0.335415001767533,0.334100170299295-,1
"25",2.37193428212777,1.58276639413089,3.7542178708388,5.62607223873297,2.50514945839009,-1
"26",0.372461311053485,2.51207412336953,0.935650421453748,0.138727428231681,6.31051640130279,-1
"27",3.56567220995203,1.03982002707198,3.70765737388213,12.7140183088242,1.08122568869998,-1
"28",0.634770628530532,2.26303249713965,1.43650656059435,0.402933750845047,5.12131608311011,-1
"29",2.43812176748179,1.91849716124125,4.67752968967431,5.94443775306852,3.68063135769073,-1
"30",1.08741064323112,3.01656032912433,3.28023980783858,1.18246190701233,9.0996362192467,-1
"31",0.98,2.74,2.6852,0.9604,7.5076,1
"32",3.16,1.78,5.6248,9.9856,3.1684,1
"33",4.26,4.28,18.2328,18.1476,18.3184,-1
The code to generate a classifier:
perceptron_train <- function(data, maxIter = 10000) {
  set.seed(839)
  X <- as.matrix(data[1:5])
  Y <- data["value"]
  d <- dim(X)
  X <- cbind(rep(1, d[1]), X)
  W <- rep(0, d[2] + 1)
  count <- 0
  while (count < maxIter){
    H <- sign(X %*% W)
    indexs <- which(H != Y)
    if (length(indexs) == 0){
      break
    } else {
      i <- sample(indexs, 1)
      W <- W + 0.1 * (X[i,] * Y[i,])
    }
    count <- count + 1
    point <- as.data.frame(data[i,])
    plot_it(data, point, W, paste("plot", sprintf("%05d", count), ".png", sep=""))
  }
  W
}
The code to generate the plot:
plot_it <- function(data, point, weights, name = "plot.png") {
  line <- weights_to_line(weights)
  point <- point
  png(name)
  p <- ggplot() + geom_point(data = data, aes(x, y, color = value, size = 2)) + theme(legend.position = "none")
  p <- p + geom_abline(intercept = line[2], slope = line[1])
  print(p)
  dev.off()
}
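Note that the helper weights_to_line() used in plot_it() is not defined anywhere in the question. If only the linear part of the boundary w0 + w1*x + w2*y = 0 is meant, it would presumably look something like this (a hypothetical reconstruction that ignores the higher-order terms):
weights_to_line <- function(weights) {
  # solve w0 + w1*x + w2*y = 0 for y:  y = -(w1/w2) * x - (w0/w2)
  # returned as c(slope, intercept), matching how plot_it() uses line[1] and line[2]
  c(-weights[2] / weights[3], -weights[1] / weights[3])
}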
This was solved using material from the question and answers from Issues plotting a fitted SVM model's decision boundary using ggplot2's stat_contour(). I skipped the call to geom_point for the grid entries and some of the aesthetic definitions like scale_fill_manual and scale_colour_manual. Removing the dots for the grid entries solved the problem with the vanishing contour line in my case.
library(e1071)   # for svm()
library(ggplot2)
train_and_plot_svm <- function(train, kernel = "sigmoid", type = "C", cost, gamma) {
  fit <- svm(as.factor(value) ~ x + y, data = train, kernel = kernel, type = type, cost = cost)
  grid <- expand.grid(x = seq(from = -0.1, to = 15, length = 100), y = seq(from = -0.1, to = 15, length = 100))
  decisionValues <- as.vector(attributes(predict(fit, grid, decision.values = TRUE))$decision)
  p <- predict(fit, grid)
  grid$value <- p
  grid$z <- decisionValues
  p <- ggplot() + stat_contour(data = grid, aes(x = x, y = y, z = z), breaks = c(0))
  p <- p + geom_point(data = train, aes(x, y, colour = as.factor(value)), alpha = 0.7)
  p <- p + xlim(0, 15) + ylim(0, 15) + theme(legend.position = "none")
}
Note that this function doesn't return the result of the svm training but the ggplot2 object.
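For reference, a minimal call might look like this (a sketch with assumed parameter values, supposing the sample data above has been read into a data frame named train with columns x, y and value, and with e1071 and ggplot2 loaded as above):
p <- train_and_plot_svm(train, kernel = "sigmoid", type = "C", cost = 1, gamma = 0.1)
print(p)  # the function returns the ggplot object, so print it to draw the plot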
This is what I got:

How to get something like Matplotlib's symlog scale in ggplot or lattice?

For very heavy-tailed data of both positive and negative sign, I sometimes like to see all the data on a plot without hiding structure in the unit interval.
When plotting with Matplotlib in Python, I can achieve this by selecting a symlog scale, which uses a logarithmic transform outside some interval, and linear plotting inside it.
Previously in R I have constructed similar behavior by transforming the data with an arcsinh on a one-off basis. However, tick labels and the like are very tricky to do right (see below).
Now, I am faced with a bunch of data where the subsetting in lattice or ggplot would be highly convenient. I don't want to use Matplotlib because of the subsetting, but I sure am missing symlog!
Edit:
I see that ggplot uses a package called scales, which solves a lot of this problem (if it works). Automatically choosing tick mark and label placing still looks pretty hard to do nicely though. Some combination of log_breaks and cbreaks perhaps?
Edit 2:
The following code is not too bad
library(scales)  # for log_breaks() and trans_new()

sinh.scaled <- function(x, scale = 1){ sinh(x) * scale }
asinh.scaled <- function(x, scale = 1) { asinh(x / scale) }

asinh_breaks <- function (n = 5, scale = 1, base = 10)
{
  function(x) {
    log_breaks.callable <- log_breaks(n = n, base = base)
    rng <- range(x, na.rm = TRUE)
    minx <- floor(rng[1])
    maxx <- ceiling(rng[2])
    if (maxx == minx)
      return(sinh.scaled(minx, scale = scale))
    big.vals <- 0
    if (minx < (-scale)) {
      big.vals = big.vals + 1
    }
    if (maxx > scale) {
      big.vals = big.vals + 1
    }
    brk <- c()
    if (minx < (-scale)) {
      rbrk <- log_breaks.callable( c(-min(maxx, -scale), -minx ) )
      rbrk <- -rev(rbrk)
      brk <- c(brk, rbrk)
    }
    if ( !(minx > scale | maxx < (-scale)) ) {
      rng <- c(max(minx, -scale), min(maxx, scale))
      minc <- floor(rng[1])
      maxc <- ceiling(rng[2])
      by <- floor((maxc - minc)/(n - big.vals)) + 1
      cb <- seq(minc, maxc, by = by)
      brk <- c(brk, cb)
    }
    if (maxx > scale) {
      brk <- c(brk, log_breaks.callable( c(max(minx, scale), maxx )))
    }
    brk
  }
}

asinh_trans <- function(scale = 1) {
  trans <- function(x) asinh.scaled(x, scale)
  inv <- function(x) sinh.scaled(x, scale)
  trans_new(paste0("asinh-", format(scale)), trans, inv,
            asinh_breaks(scale = scale),
            domain = c(-Inf, Inf))
}
A solution based on the package scales and inspired by Brian Diggs' post mentioned by @Dennis:
library(scales)  # for pretty_breaks(), log_breaks() and trans_new()
symlog_trans <- function(base = 10, thr = 1, scale = 1){
  trans <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             (thr + scale * suppressWarnings(log(sign(x) * x / thr, base))))
  inv <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             base^((sign(x) * x - thr) / scale) * thr)
  breaks <- function(x){
    sgn <- sign(x[which.max(abs(x))])
    if(all(abs(x) < thr))
      pretty_breaks()(x)
    else if(prod(x) >= 0){
      if(min(abs(x)) < thr)
        sgn * unique(c(pretty_breaks()(c(min(abs(x)), thr)),
                       log_breaks(base)(c(max(abs(x)), thr))))
      else
        sgn * log_breaks(base)(sgn * x)
    } else {
      if(min(abs(x)) < thr)
        unique(c(sgn * log_breaks()(c(max(abs(x)), thr)),
                 pretty_breaks()(c(sgn * thr, x[which.min(abs(x))]))))
      else
        unique(c(-log_breaks(base)(c(thr, -x[1])),
                 pretty_breaks()(c(-thr, thr)),
                 log_breaks(base)(c(thr, x[2]))))
    }
  }
  trans_new(paste("symlog", thr, base, scale, sep = "-"), trans, inv, breaks)
}
I am not sure whether the impact of a parameter scale is the same as in Python, but here are a couple of comparisons (see Python version here):
data <- data.frame(x = seq(-50, 50, 0.01), y = seq(0, 100, 0.01))
data$y2 <- sin(data$x / 3)
# symlog x
ggplot(data, aes(x, y)) + geom_line() + theme_bw() +
  scale_x_continuous(trans = symlog_trans())
# symlog y
ggplot(data, aes(y, x)) + geom_line() + theme_bw() +
  scale_y_continuous(trans = "symlog")
# symlog both, threshold = 0.015 for y
# not too pretty because of too many breaks in a short interval
ggplot(data, aes(x, y2)) + geom_line() + theme_bw() +
  scale_y_continuous(trans = symlog_trans(thr = 0.015)) +
  scale_x_continuous(trans = "symlog")
# Again symlog both, threshold = 0.15 for y
ggplot(data, aes(x, y2)) + geom_line() + theme_bw() +
  scale_y_continuous(trans = symlog_trans(thr = 0.15)) +
  scale_x_continuous(trans = "symlog")
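If the automatic break placement gets too crowded (as in the thr = 0.015 case above), one option (my addition, not from the original answer) is to override it by passing explicit breaks to the scale alongside the transform:
ggplot(data, aes(x, y2)) + geom_line() + theme_bw() +
  scale_y_continuous(trans = symlog_trans(thr = 0.015),
                     breaks = c(-1, -0.1, -0.01, 0, 0.01, 0.1, 1)) +
  scale_x_continuous(trans = "symlog")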
