Plotting Cumulative Gains Curve Plot R - r

I am trying to generate a cumulative gain plot using ggplot2 in R. Basically I want to replicate following using ggplot2.
My Data is this
df
# A tibble: 10 x 6
Decile resp Cumresp Gain Cumlift
<int> <dbl> <dbl> <dbl> <dbl>
1 8301 8301 57.7 5.77
2 2449 10750 74.8 3.74
3 1337 12087 84.0 2.80
4 751 12838 89.3 2.23
5 462 13300 92.5 1.85
6 374 13674 95.1 1.58
7 252 13926 96.8 1.38
8 195 14121 98.2 1.23
9 136 14257 99.1 1.10
10 124 14381 100 1
## Cumulative Gains Plot
# Asker's attempt: Gain against Decile, drawn as points joined by a line,
# plus a reference line from geom_abline().
ggplot(df, aes(Decile, Gain)) +
geom_point() +
geom_line() +
# NOTE(review): the geom_abline() line below has no trailing `+`, so the plot
# expression ends there and the scale_*/labs() calls that follow are not added
# to this plot.
geom_abline(intercept = 52.3 , slope = 4.77)
scale_y_continuous(breaks = seq(0, 100, by = 20)) +
scale_x_continuous(breaks = c(1:10)) +
labs(title = "Cumulative Gains Plot",
y = "Cumulative Gain %")
However, I am not able to get the diagonal line, even though I tried geom_abline, and my y-axis is not right either: I could not get it to run from 0 to 100.
I would really appreciate if someone can get me the plot as in picture using ggplot2.
Thanks in advance

library(dplyr)
library(ggplot2)

# Add an origin row (Decile 0, Gain 0) and sort by Decile so the curve
# starts at (0, 0).
df2 <- df %>%
  add_row(Decile = 0, Gain = 0) %>%
  arrange(Decile)

ggplot(data = df2, mapping = aes(x = Decile, y = Gain)) +
  geom_point() +
  geom_line() +
  # Diagonal reference: a second line layer fed only the first and last rows.
  geom_line(data = slice(df2, 1, n())) +
  scale_x_continuous(breaks = 1:10) +
  scale_y_continuous(breaks = seq(0, 100, by = 20), limits = c(0, 100)) +
  labs(
    title = "Cumulative Gains Plot",
    y = "Cumulative Gain %"
  )

Related

Extract ggplot2 color code information generated from `scale_fill_gradient`

I don't think this question has ever actually been answered on Stack Overflow (this answer provides a workaround, but that is all I could find, and the question is now over 10 years old).
I want to extract the color code information generated from the scale_fill_gradient(low = "skyblue", high = "dodgerblue4") code used in the ggplot2 code shown below.
# Scatter plot of mtcars: wt * 1000 on x, mpg on y, points coloured by hp.
ggplot(mtcars, aes(wt*1000, mpg)) +
geom_point(size = 4, aes(colour = hp)) +
xlab("Weight (pounds)") + ylab("Miles per gallon (MPG)") + labs(color='Horse power') +
# x axis limited to 1000-6000 with comma-formatted labels every 1000.
scale_x_continuous(limits = c(1000, 6000),
breaks = c(seq(1000,6000,1000)),
labels = c("1,000", "2,000", "3,000", "4,000", "5,000", "6,000")) +
# NOTE(review): hp is mapped to `colour` above, while this scale targets the
# `fill` aesthetic - confirm whether scale_colour_gradient() was intended.
scale_fill_gradient(low = "skyblue", high = "dodgerblue4") +
theme_classic()
Ideally, I would like to generate a dataset with a row for each point in the plot and columns with all the standard information (wt, mpg, hp) but then also the color code of the individual points that can be seen on the plot. This would correspond with the hp ("Horse power") variable, as seen below, and would (presumably) be generated by the scale_fill_gradient function. This would allow me to reproduce the color of a point on the plot with an hp score of, say, 150.
Thanks in advance for any input.
You can extract any data, including colour codes, that the geom layer uses from the plot using layer_data():
library(ggplot2)
# Same plot as in the question: wt * 1000 against mpg, points coloured by hp.
ggplot(mtcars, aes(wt*1000, mpg)) +
geom_point(size = 4, aes(colour = hp)) +
xlab("Weight (pounds)") + ylab("Miles per gallon (MPG)") + labs(color='Horse power') +
scale_x_continuous(limits = c(1000, 6000),
breaks = c(seq(1000,6000,1000)),
labels = c("1,000", "2,000", "3,000", "4,000", "5,000", "6,000")) +
# NOTE(review): hp is mapped to `colour`, but this scale targets `fill`; the
# printed layer data below shows fill = NA. Confirm whether
# scale_colour_gradient() was intended before relying on the colour codes.
scale_fill_gradient(low = "skyblue", high = "dodgerblue4") +
theme_classic()
# Pull the data the geom layer uses from the most recent plot; it includes a
# `colour` column holding the hex code of every point.
ld <- layer_data(last_plot())
# Show the first rows of the extracted layer data.
(head(ld))
#> colour x y PANEL group shape size fill alpha stroke
#> 1 #204464 2620 21.0 1 -1 19 4 NA NA 0.5
#> 2 #204464 2875 21.0 1 -1 19 4 NA NA 0.5
#> 3 #1C3C5A 2320 22.8 1 -1 19 4 NA NA 0.5
#> 4 #204464 3215 21.4 1 -1 19 4 NA NA 0.5
#> 5 #2E618C 3440 18.7 1 -1 19 4 NA NA 0.5
#> 6 #1E4261 3460 18.1 1 -1 19 4 NA NA 0.5
Created on 2021-07-13 by the reprex package (v1.0.0)

ggplot boxplot with mean and confidence interval by group

I'd like to make a boxplot with the mean instead of the median. Moreover, I would like the line to stop at the 5% (lower) and 95% (upper) quantile. Here is the code:
# Asker's boxplot: Mean_Gap per Cement, one dodged box per Material.
ggplot(data, aes(x=Cement, y=Mean_Gap, fill=Material)) +
# NOTE(review): fatten = NULL is presumably meant to hide the default median
# line - confirm against the ggplot2 version in use.
geom_boxplot(fatten = NULL,aes(fill=Material), position=position_dodge(.9)) +
xlab("Cement") + ylab("Mean cement layer thickness") +
# Overlay the mean of each Material group as a black point, dodged by the same
# width as the boxes.
stat_summary(fun=mean, geom="point", aes(group=Material), position=position_dodge(.9),color="black")
I'd like to change geom to errorbar, but this doesn't work. I tried middle = mean(Mean_Gap), but this doesn't work either. I tried ymin = quantile(y,0.05), but nothing was changing. Can anyone help me?
The standard boxplot using ggplot. fill is Material:
Here is how you can create the boxplot using custom parameters for the box and whiskers. It's the solution shown by @lukeA in stackoverflow.com/a/34529614/6288065, but this one will also show you how to make several boxes by groups.
The R built-in data set called "ToothGrowth" is similar to your data structure so I will use that as an example. We will plot the length of tooth growth (len) for each vitamin C supplement group (supp), separated/filled by dosage level (dose).
# "ToothGrowth" at a glance
head(ToothGrowth)
# len supp dose
#1 4.2 VC 0.5
#2 11.5 VC 0.5
#3 7.3 VC 0.5
#4 5.8 VC 0.5
#5 6.4 VC 0.5
#6 10.0 VC 0.5
library(dplyr)

# Reduce ToothGrowth to one row per supp/dose combination holding the five
# values a boxplot needs. Note that y50 is the mean of len (not the median)
# and that y0 / y100 are the 5% and 95% quantiles rather than the extremes.
df <- ToothGrowth %>%
  group_by(supp, dose) %>%
  summarise(
    y0   = quantile(len, probs = 0.05),
    y25  = quantile(len, probs = 0.25),
    y50  = mean(len),
    y75  = quantile(len, probs = 0.75),
    y100 = quantile(len, probs = 0.95)
  )

# Print the summary table.
df
## A tibble: 6 x 7
## Groups: supp [2]
# supp dose y0 y25 y50 y75 y100
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 OJ 0.5 8.74 9.7 13.2 16.2 19.7
#2 OJ 1 16.8 20.3 22.7 25.6 26.9
#3 OJ 2 22.7 24.6 26.1 27.1 30.2
#4 VC 0.5 4.65 5.95 7.98 10.9 11.4
#5 VC 1 14.0 15.3 16.8 17.3 20.8
#6 VC 2 19.8 23.4 26.1 28.8 33.3
# Draw the boxes directly from the precomputed columns (stat = "identity"):
# the mean is the middle line and the 5% / 95% quantiles are the whisker ends.
ggplot(data = df, mapping = aes(x = supp, fill = as.factor(dose))) +
  geom_boxplot(
    mapping = aes(
      ymin = y0,
      lower = y25,
      middle = y50,
      upper = y75,
      ymax = y100
    ),
    stat = "identity"
  ) +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Boxplot with Mean Middle Line", y = "len")
In the figure above, the boxplot on the left is the standard boxplot with regular median line and regular min/max whiskers. The boxplot on the right uses the mean middle line and 5%/95% quantile whiskers.

Visualize multiple box plot selecting differents rows of a dataframe

I am developing an EDA (Estimation of Distribution Algorithm). I'm collecting all the measures of the Pareto front's solutions with distinct configurations.
I have a structure with all values:
> metrics20
# A tibble: 320 x 6
File Hypervolume `Modified Hypervolume` Spread Spacing Time
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 001-unif-0.csv 25771 26294. 391. 30.1 16.8
2 002-unif-0.csv 27481 28416. 534. 41.1 16.5
3 003-unif-0.csv 26394 26842. 356. 29.6 16.5
4 004-unif-0.csv 30828 31696 418. 38.0 16.5
5 005-unif-0.csv 28146 28727 444. 34.2 16.6
6 006-unif-0.csv 30176 31006 451. 50.1 16.6
7 007-unif-0.csv 29374 30216 537. 35.8 16.5
8 008-unif-0.csv 27434 28156. 439. 31.4 16.5
9 009-unif-0.csv 28944 29426 471. 33.7 16.4
10 010-unif-0.csv 28339 29302. 576. 44.3 16.4
I want to visualize the values this way. Taking the Hypervolume column as an example, I split the data by the File column value into the -unif-, -sat-, -eff- and -prod- distributions and show the values for -0.csv, -0.25.csv, -0.5.csv and -0.75.csv on the x axis for each distribution.
Reproducible example:
library(readr)
metrics20 <- read_csv("./metrics20.csv")
Data: Link
Hopefully this is a step towards what you're looking for:
library(readr)
library(dplyr)
library(ggplot2)

metrics20 <- read_csv("metrics20.csv")

# File values look like "001-unif-0.csv": the middle part is taken as the
# distribution tag (one facet each, in a fixed order) and the part between the
# tag and ".csv" as the level shown on the x axis.
metrics20 %>%
  mutate(
    tag = factor(
      gsub("(^\\d+-)(\\w+)(-.*$)", "\\2", File),
      levels = c("unif", "sat", "eff", "prod")
    ),
    level = gsub("(^\\d+-\\w+-)(.*)(\\.csv$)", "\\2", File)
  ) %>%
  ggplot(mapping = aes(x = level, y = Hypervolume)) +
  geom_boxplot() +
  facet_wrap(~tag, nrow = 1) +
  theme_minimal() +
  # Black frame around every panel, no grid lines.
  theme(
    panel.grid = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA)
  )
From here there may be other things you want to tweak if you need to adjust it to be more like the example plot. You should be able to find all next steps in the help for the functions used.

ggplot2 geom_smooth didn't work

I'm plotting two different variables on the same plot.
sex_female is chr, including 0 and 1.
epoch_36:epoch_144 are num, time variables.
Here is my code:
# Reshape the epoch columns to long form, attach the mean of ac for each
# sex_female/time group to every row, then plot that mean against time.
total %>%
select(sex_female, epoch_36:epoch_144)%>%
# Stack epoch_36:epoch_144 into a `time` key column and an `ac` value column.
gather(key = time, value = ac, epoch_36:epoch_144) %>%
group_by(sex_female,time) %>%
# mutate() rather than summarise(): every original row is kept and carries its
# group's mean.
mutate(mean = mean(ac)) %>%
ggplot(aes(x = time, y = mean,color = sex_female)) +
geom_point(alpha = .3)+
# NOTE(review): `time` is a character column in the printed tibble, so the
# x axis is discrete at this point.
geom_smooth(method = "lm")+
theme(axis.text.x = element_text(angle = 90,hjust = 1))
After the mutation, I got the tibble:
A tibble: 45,780 x 4
# Groups: sex_female, time [218]
sex_female time ac mean
<chr> <chr> <dbl> <dbl>
1 1 epoch_36 49.8 54.96406
2 0 epoch_36 34.7 55.43448
3 0 epoch_36 70.9 55.43448
4 0 epoch_36 12.3 55.43448
5 1 epoch_36 102.7 54.96406
6 1 epoch_36 77.9 54.96406
7 0 epoch_36 1.1 55.43448
8 1 epoch_36 140.0 54.96406
9 1 epoch_36 51.3 54.96406
10 0 epoch_36 0.0 55.43448
# ... with 45,770 more rows
I've tried using the solution suggested in a similar question: Plot dashed regression line with geom_smooth in ggplot2, but no lines showed up. How do I fix my code to produce lines?
Your time column is categorical and you should transform it into numerical.
# The time values are labels such as "epoch_36"; as.numeric() on them directly
# would return NA for every row. Strip the "epoch_" prefix first and convert
# the remaining digits.
mutTibble$time <- as.numeric(sub("^epoch_", "", mutTibble$time))
And for plotting you can use this:
library(ggplot2)

# Plot mean against time with one colour per sex_female level, and add a
# linear fit via geom_smooth(method = "lm"). time is expected to be numeric
# by the time this runs.
ggplot(mutTibble, aes(time, mean, color = factor(sex_female))) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm") +
  # Rotate the x-axis tick labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Time",
    y = "Mean", # the comma here was missing, which made the call a syntax error
    color = "Gender (female)"
  )

How can I add a line plot in a BOX plot using secondary y axis but same x axis

I am trying to add a line plot to my box plot, on a secondary y axis, but I am not able to. What should I do? Please help.
The code for my box plot is:
library(ggplot2)
# Read the data from boxplot2.csv (path is relative to the working directory).
mydata<-read.csv("boxplot2.csv")
# Turn Class into a factor labelled "1" to "6".
mydata$Class <- factor(mydata$Class,labels = c("1", "2", "3", "4", "5", "6"))
# First version: boxplot of log(erosion) for each Class.
# NOTE(review): the aesthetics reference mydata$... columns directly instead of
# bare column names.
p10 <- ggplot(mydata, aes(x = mydata$Class, y = log(mydata$erosion))) +
geom_boxplot()
p10
# Add axis titles to the first version.
p10 <- p10 +
scale_x_discrete(name = "Mean Annual Precipitation(mm/yr)") +
scale_y_continuous(name = "Log Average Erosion Rate(m/My)")
# Second version: p10 is rebuilt from scratch with varwidth = TRUE, replacing
# the plot built above, and then given the same axis titles.
p10 <- ggplot(mydata, aes(x = mydata$Class, y = log(mydata$erosion))) +
geom_boxplot(varwidth=TRUE)
p10 <- p10 +
scale_x_discrete(name = "Mean Annual Precipitation(mm/yr)") +
scale_y_continuous(name = "Log Average Erosion Rate(m/My)")
I want a similar figure, but instead of histograms, I will have box plots.
add sample data
% Vegetation erosion Class
0 0.43 1
0 0.81 1
2 0.26 1
3 1.05 1
3 0.97 1
12.76 15.97 2
12.84 17.69 2
11.01 14.76 2
13.44 17.94 2
10.76 10.65 2
7.28 67.47 2
23 120.4 3
21 298.63 3
52 21.4 3
9 64.94 3
50 291.88 3
16 493.98 3
11 183.45 3
You just have to specify different aesthetics for the geom_line, something like this:
# Boxplot of Sepal.Length per Species, with a line through the per-Species
# mean of Petal.Length drawn on top.
ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
  geom_boxplot() +
  # The line layer gets its own y aesthetic; group = 1 joins all species into
  # a single line. `fun` replaces the deprecated `fun.y` argument.
  geom_line(
    aes(x = Species, y = Petal.Length, group = 1),
    stat = "summary",
    fun = "mean"
  ) +
  # Secondary y axis; ~ . is the identity transform, so it mirrors the
  # primary axis.
  scale_y_continuous(sec.axis = sec_axis(~.))

Resources