I've plotted a confusion matrix (predicting 5 outcomes) in R using ggplot and scales for geom_text labeling.
The way geom_text(aes(label = percent(Freq/sum(Freq))) is written in code, it's showing Frequency of each box divided by sum of all observations, but what I want to do is get Frequency of each box divided by sum Frequency for each Reference.
In other words, instead of A,A = 15.8%,
it should be A,A = 15.8%/(0.0%+0.0%+0.0%+0.0%+15.8%%) = 100.0%
library(ggplot2)
library(scales)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
p <-
ggplot(data = as.data.frame(m$table) ,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percent(Freq/sum(Freq)))) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
The problem is that, as far as I know, ggplot is not able to do group calculation. See this recent post for similar question.
To solve your problem you should take advantage of the dplyrpackage.
This should work
library(ggplot2)
library(scales)
library(caret)
library(dplyr)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
data_c <- mutate(group_by(as.data.frame(m$table), Reference ), percentage =
percent(Freq/sum(Freq)))
p <-
ggplot(data = data_c,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percentage)) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
And the result:
Related
I have the following graph and code:
Graph
ggplot(long2, aes(x = DATA, y = value, fill = variable)) + geom_area(position="fill", alpha=0.75) +
scale_y_continuous(labels = scales::comma,n.breaks = 5,breaks = waiver()) +
scale_fill_viridis_d() +
scale_x_date(date_labels = "%b/%Y",date_breaks = "6 months") +
ggtitle("Proporcions de les visites, només 9T i 9C") +
xlab("Data") + ylab("% visites") +
theme_minimal() + theme(legend.position="bottom") + guides(fill=guide_legend(title=NULL)) +
annotate("rect", fill = "white", alpha = 0.3,
xmin = as.Date.character("2020-03-16"), xmax = as.Date.character("2020-06-22"),
ymin = 0, ymax = 1)
But it has some sawtooth, how am I supposed to smooth it out?
I believe your situation is roughly analogous to the following, wherein we have missing x-positions for one group, but not the other at the same position. This causes spikes if you set position = "fill".
library(ggplot2)
x <- seq_len(100)
df <- data.frame(
x = c(x[-c(25, 75)], x[-50]),
y = c(cos(x[-c(25, 75)]), sin(x[-50])) + 5,
group = rep(c("A", "B"), c(98, 99))
)
ggplot(df, aes(x, y, fill = group)) +
geom_area(position = "fill")
To smooth out these spikes, it has been suggested to linearly interpolate the data at the missing positions.
# Find all used x-positions
ux <- unique(df$x)
# Split data by group, interpolate data groupwise
df <- lapply(split(df, df$group), function(xy) {
approxed <- approx(xy$x, xy$y, xout = ux)
data.frame(x = ux, y = approxed$y, group = xy$group[1])
})
# Recombine data
df <- do.call(rbind, df)
# Now without spikes :)
ggplot(df, aes(x, y, fill = group)) +
geom_area(position = "fill")
Created on 2022-06-17 by the reprex package (v2.0.1)
P.S. I would also have expected a red spike at x=50, but for some reason this didn't happen.
This is really basic. Still hope I can get your help. I need to superimpose two density plots. The first is a generated normal density plot given mean and sd of AAPL. >
x <- seq(-20, 20, length.out = 5113)
normAAPL<-data.frame(x, f = dnorm(x,mean = meanAAPL, sd = sdAAPL)) %>%
ggplot(aes(x, f)) +
geom_line() +
stat_function(fun=dnorm, geom="line", col=2, lty=2)+
ylim(0,0.2)
> meanAAPL
[1] 0.101133
> sdAAPL
[1] 2.461525
The next is the actual distribution
dAAPL <-density(oldandnew$AAPL)
Where the 20 first AAPL data is
c(-8.810021, 1.45281, -9.051401, 4.628075, -1.774445, -5.25055,
-6.181806, 10.40407, 3.74302, 3.425328, 2.48944, 6.309463, -1.948374,
-4.652429, 5.493372, -1.852238, -0.1725783, -7.924, 2.074379,
-3.431709)
Do I need to combine the data in one data frame to plot them in the same ggplot?
Hope you can help me out.
df <- data.frame(x = seq(-20, 20, length.out = 5113),
f = dnorm(x))
df2 <- data.frame(x = c(-8.810021, 1.45281, -9.051401, 4.628075, -1.774445, -5.25055,
-6.181806, 10.40407, 3.74302, 3.425328, 2.48944, 6.309463, -1.948374,
-4.652429, 5.493372, -1.852238, -0.1725783, -7.924, 2.074379,
-3.431709))
ggplot() +
geom_line(data = df, aes(x, f, colour = "Normal")) +
geom_density(data = df2, aes(x, colour = "Actual")) +
ylim(0,0.2) +
scale_color_manual(name = "Distribution", values = c("Normal" = "Blue", "Actual" = "Red")) +
theme_minimal() + theme(legend.position = "top", aspect.ratio = 1)
Produces:
I have the following to plot a boxplot of some data "Samples" and add points of the "Baseline" and "Theoretical" data.
library(reshape2)
library(ggplot2)
meltshear <- melt(Shear)
samples <- rep(c("Samples"), each = 10)
baseline <- c("Baseline",samples)
method <- rep(baseline, 4)
xlab <- rep(c("EXT.Single","EXT.Multi","INT.Single","INT.Multi"), each = 11)
plotshear <- data.frame(Source = c(method,"theoretical","theoretical","theoretical"),
Shear = c(xlab,"EXT.Multi","INT.Single","INT.Multi"),
LLDF = c(meltshear[,2],0.825,0.720,0.884))
data <- subset(plotshear, Source %in% c("Samples"))
baseline <- subset(plotshear, Source %in% c("Baseline"))
theoretical <- subset(plotshear, Source %in% c("theoretical"))
ggplot(data = data, aes(x = Shear, y = LLDF)) + geom_boxplot(outlier.shape = NA) +
stat_summary(fun = mean, geom="point", shape=23, size=3) +
stat_boxplot(geom='errorbar', linetype=1, width=0.5) +
geom_jitter(data = baseline, colour = "green4") +
geom_jitter(data = theoretical, colour = "red")
I get the following plot but I cannot add the legend to the plot. I want to have the legend showing labels = c("Samples","Baseline","Theoretical") for the boxplot shape, green dot, and red dot respectively.
You could try to add fill into aes.
ggplot(data = data, aes(x = Shear, y = LLDF, fill = Shear))
Or you can see this resource, maybe it is useful http://www.cookbook-r.com/Graphs/
I'm plotting 11 curves and the program bellow works well. BUT I'm not able two change the wild colors to plot 11 black curves:
library(ggplot2)
#library(latex2exp)
library(reshape)
fn <- "img/plot.eps"
fct1 <- function(x0 ){
return(1/sin(x0)+1/tan(x0))
}
fct2 <- function(beta, t ){
return(2*atan(exp(t)/beta))
}
t<-seq(from=0,to=10,by=0.01)
s1<-cbind(t, fct2(fct1(-pi+0.0001),t),
fct2(fct1(-1.5),t),
fct2(fct1(-0.5),t),
fct2(fct1(-0.05),t),
fct2(fct1(-0.01),t),
fct2(fct1(0),t),
fct2(fct1(0.01),t),
fct2(fct1(0.05),t),
fct2(fct1(0.5),t),
fct2(fct1(1.5),t),
fct2(fct1(pi),t))
colnames(s1)<-c("time","y1","y2","y3","y4","y5","y6","y7","y8","y9","y10","y11")
s2 <- melt(as.data.frame(s1), id = "time")
q <- ggplot(s2, aes(x = time, y = value, color = variable))
q <- q + geom_line() + ylab("y") + xlab("t")+ ylab("x(t)")+
theme_bw(base_size = 7) + guides(colour = FALSE)
ggsave(file = fn, width = 2, height = 1)
q
EDIT Now the code should be reproducible
You need to map the variable to the grouping, and it will produce black lines by default.
q <- ggplot() +
geom_line(data = s2, aes(x = time, y = value,
group = variable)) +
xlab("t")+ ylab("x(t)") +
theme_bw(base_size = 7) + guides(colour = FALSE)
q
To be perfectly clear, it is possible to map the color to the variable, which can produce black lines, but not without changing the legend. Here is how you would amend the colors after the fact, if you wanted to, having already mapped the color to the variable.
q <- ggplot() +
geom_line(data = s2, aes(x = time, y = value,
color = variable)) +
xlab("t")+ ylab("x(t)") +
theme_bw(base_size = 7) + guides(colour = FALSE) +
scale_color_manual(values = rep("black",11))
q
I am trying to plot a point histogram (a histogram that shows the values with a point instead of bars) that is log-scaled. The result should look like this:
MWE:
Lets simulate some Data:
set.seed(123)
d <- data.frame(x = rnorm(1000))
To get the point histogram I need to calculate the histogram data (hdata) first
hdata <- hist(d$x, plot = FALSE)
tmp <- data.frame(mids = hdata$mids,
density = hdata$density,
counts = hdata$counts)
which we can plot like this
p <- ggplot(tmp, aes(x = mids, y = density)) + geom_point() +
stat_function(fun = dnorm, col = "red")
p
to get this graph:
In theory we should be able to apply the log scales (and set the y-limits to be above 0) and we should have a similar picture to the target graph.
However, if I apply it I get the following graph:
p + scale_y_log10(limits = c(0.001, 10))
The stat_function clearly shows non-scaled values instead of producing a figure closer to the solid line in the first picture.
Any ideas?
Bonus
Are there any ways to graph the histogram with dots without using the hist(..., plot = FALSE) function?
EDIT Workaround
One possible solution is to calculate the dnorm-data outside of ggplot and then insert it as a line. For example
tmp2 <- data.frame(mids = seq(from = min(tmp$mids), to = max(tmp$mids),
by = (max(tmp$mids) - min(tmp$mids))/10000))
tmp2$dnorm <- dnorm(tmp2$mids)
# Plot it
ggplot() +
geom_point(data = tmp, aes(x = mids, y = density)) +
geom_line(data = tmp2, aes(x = mids, y = dnorm), col = "red") +
scale_y_log10()
This returns a graph like the following. This is basically the graph, but it doesn't resolve the stat_function issue.
library(ggplot2)
set.seed(123)
d <- data.frame(x = rnorm(1000))
ggplot(d, aes(x)) +
stat_bin(geom = "point",
aes(y = ..density..),
#same breaks as function hist's default:
breaks = pretty(range(d$x), n = nclass.Sturges(d$x), min.n = 1),
position = "identity") +
stat_function(fun = dnorm, col = "red") +
scale_y_log10(limits = c(0.001, 10))
Another possible solution that I found while revisiting this issue is to apply the log10 to the stat_function-call.
library(ggplot2)
set.seed(123)
d <- data.frame(x = rnorm(1000))
hdata <- hist(d$x, plot = FALSE)
tmp <- data.frame(mids = hdata$mids,
density = hdata$density,
counts = hdata$counts)
ggplot(tmp, aes(x = mids, y = density)) + geom_point() +
stat_function(fun = function(x) log10(dnorm(x)), col = "red") +
scale_y_log10()
Created on 2018-07-25 by the reprex package (v0.2.0).