How to plot two Lorenz curve on same plot? - r

I want to plot two lorenz curves on one graph (data1 and data2), so please help me with the code?
I have one code which work perfectly with one curve (data1), but I want to make comparison with tho curves (data1 and data2) on same plot.
#Packages
library(ineq)
library(ggplot2)
library(scales)
library(grid)
#DATA SETS
set.seed(1)
data1<-sample(1000)
data2<-c(1000:2000) # I want to put this data set into second lorenz curve
# compute lorenz curve
lcolc <- Lc(data1)
# bring lorenz curve in another format easily readable by ggplot2
# namely reverse the L column so that lorenz curve is mirrored on diagonal
# p stays p (the diagonal)
# Uprob contains the indices of the L's, but we need percentiles
lcdf <- data.frame(L = rev(1-lcolc$L), p = lcolc$p, Uprob = c(1:length(lcolc$L)/length(lcolc$L)))
# basic plot with the diagonal line and the L line
p <- ggplot(lcdf, aes(y = Uprob, x = L)) + geom_line(colour = hcl(h=15, l=65, c=100)) + geom_line(aes(y = p, x = p))
# compute annotation lines at 50 percent L (uses a heuristic)
index <- which(lcdf$L >= 0.499 & lcdf$L <= 0.501)[1]
ypos <- lcdf$L[index]
yposs <- c(0,ypos)
xpos <- index/length(lcdf$L)
xposs <- c(0,xpos)
ypositions <- data.frame(y = xposs, x = c(ypos,ypos))
xpositions <- data.frame(y = c(xpos,xpos), x = yposs)
# add annotation line
p <- p + geom_line(data = ypositions, aes(x = x, y = y),
linetype="dashed") + geom_line(data = xpositions, aes(x = x, y = y),
linetype="dashed")
# set axes and labels (namely insert custom breaks in scales)
p <- p + scale_x_continuous(breaks=c(0, xpos,0.25,0.5,0.75,1),
labels = percent_format()) + scale_y_continuous(
labels = percent_format())
# add minimal theme
p <- p + theme_minimal() + xlab("Percent of Population") + ylab("Percent of Income")
# customize theme
p <- p + theme(plot.margin = unit(c(0.5,1,1,1), "cm"),
axis.title.x = element_text(vjust=-1),
axis.title.y = element_text(angle=90, vjust=0),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = rgb(0.99,0.99,0.99), linetype=0))
# print plot
p

Here is a minimal, reproducible example of how to display multiple curves on the same plot using ggplot2. The key point here is to pass data in "long-form" to the ggplot() function.
library(ggplot2)
# Construct example data.frame with separate columns for each curve.
x = seq(from=0, to=3, by=1)
dat = data.frame(x=x,
y1=9 * x,
y2=3 * x^2,
y3=x^3)
dat
# x y1 y2 y3
# 1 0 0 0 0
# 2 1 9 3 1
# 3 2 18 12 8
# 4 3 27 27 27
# Convert data to long form, putting all y-values in a single column.
mdat = reshape2::melt(dat, id.vars="x", measure.vars=c("y1", "y2", "y3"))
mdat
# x variable value
# 1 0 y1 0
# 2 1 y1 9
# 3 2 y1 18
# 4 3 y1 27
# 5 0 y2 0
# 6 1 y2 3
# 7 2 y2 12
# 8 3 y2 27
# 9 0 y3 0
# 10 1 y3 1
# 11 2 y3 8
# 12 3 y3 27
p = ggplot(data=mdat, aes(x=x, y=value, colour=variable, group=variable)) +
geom_point() +
geom_line()

Related

Creating stimuli in R with ggplot

I am trying to generate my own stimuli for an experiment using R. Below is the code that creates my (x,y) coordinates using the rnorm() with different a sample size of 100, different means and sd. I also create another variable to represent the size of the circles, which are determined by the runif().
dt <- data.frame(x = NA,
y = NA,
size = NA,
M = NA,
sd = NA,
col = NA,
iter = NA)
sa<-0
mySD<-c(5, 15)
myMeans<-c(35, 45)
colors<-c("Blues", "Reds")
for(i in 1:10){
for(s in mySD){
for(m in myMeans){
x = abs(rnorm(n=1, mean=m, sd=s))
y = abs(rnorm(n=1, mean=m, sd=s))
size = runif(1, 1, 25) #select a random x speed between [25,35]
sa<-sa+1
dt[sa,] <- NA
dt$x[sa]<-x
dt$y[sa]<-y
dt$M[sa]<-m
dt$sd[sa]<-s
dt$size[sa]<-size
dt$iter[sa]<-i
}
}
}
}
Next, I want to use ggplot(dt, aes(x, y, size=size) to plot. I want to randomly select 4 (x,y) values to plot for one graph, then 8 for another, then 16 for another, etc. Basically, I want to plot different graphs with a different number of data points. For example, some graphs that you would see would have 4 data points that vary by size and color, others would have 32 data points that vary in size and color. I m not sure how to select a set of unique data points from the data frame that I created. Any help would be great. I'm pretty new to R.
Here are two ways - depending if you wanted each group to not contain points from any other group.
I'll just use a dummy data frame that just has columns x, y, and size.
library(tidyverse)
dt <- tibble(x = runif(100), y = runif(100), size = runif(100))
Allowing groups to share the same points
Create a vector for the size of each group.
sample_sizes <- 2^(seq_len(4) + 1)
sample_sizes
#> [1] 4 8 16 32
Randomly sample the data frame and add a group column.
sampled <- map_dfr(
sample_sizes,
~sample_n(dt, .),
.id = "group"
)
Plot using facets.
ggplot(sampled, aes(x, y, size = size)) +
geom_point() +
facet_wrap(~group)
Requiring groups to have different points
First, we need a way to generate four 1s, eight 2s etc. This can be done using log2 and some tricks.
groups <- floor(log2(seq_len(nrow(dt)) + 3)) - 1
groups
#> [1] 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4
#> [36] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5
#> [71] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
Shuffle this vector and add it as a column.
dt$group <- sample(groups)
Facet using this new column to generate the desired plots.
ggplot(dt, aes(x, y, size = size)) +
geom_point() +
facet_wrap(~group)
First of all, the question's data creation code can be greatly simplified, rewritten with no loops at all. R is a vectorized language and the following will create a data frame with the same structure.
Don't forget to set the RNG seed, in order to make the results reproducible.
library(ggplot2)
set.seed(2020) # make the results reproducible
sd <- rep(rep(mySD, each = 2), 10)
M <- rep(myMeans, 2*10)
x <- abs(rnorm(n = 40, mean = M, sd = sd))
y <- abs(rnorm(n = 40, mean = M, sd = sd))
size <- runif(40, 1, 25)
iter <- seq_along(x)
dt2 <- data.frame(x, y, size, M, sd, iter)
dt2$col <- c("blue", "red")
Now the plots. The following function accepts a data frame X as its first argument and a number of points to draw as the second one. Then plots n points chosen at random with color col and size (a continuous variable) size.
plot_fun <- function(X, n){
Colors <- unique(X[["col"]])
Colors <- setNames(Colors, Colors)
i <- sample(nrow(X), n)
g <- ggplot(X[i,], aes(x, y, size = size, color = col)) +
geom_point() +
scale_color_manual(values = Colors) +
theme_bw()
g
}
plot_fun(dt2, 8)
To plot several values for n, produce the plots with lapply then use grid.arrange from package gridExtra.
plot_list <- lapply(c(4,8,16,32), function(n) plot_fun(dt2, n))
gridExtra::grid.arrange(grobs = plot_list)
Individual plots are still possible with
plot_list[[1]]
plot_list[[2]]
and so on.
Another way is to use faceting. Write another function, plot_fun_facets assigning the number of points to a new variable in the sample data frames, n, and use that variable as a faceting variable.
plot_fun_facets <- function(X, n){
Colors <- unique(X[["col"]])
Colors <- setNames(Colors, Colors)
X_list <- lapply(n, function(.n){
i <- sample(nrow(X), .n)
Y <- X[i,]
Y$n <- .n
Y
})
X <- do.call(rbind, X_list)
g <- ggplot(X, aes(x, y, size = size, color = col)) +
geom_point() +
scale_color_manual(values = Colors) +
facet_wrap(~ n) +
theme_bw()
g
}
plot_fun_facets(dt2, c(4,8,16,32))

Considereing the number of the coordinate in the heatmap

I'm trying to make a heatmap considering the value of the point (variable 'x'). But when I run my code I only have the heatmap considering the points, and not its values.
Here is my code:
head(dengue)
lat long x
1 7791000 598157.0 156
2 7790677 598520.0 307
3 7790795 598520.0 153
4 7790153 598808.0 135
5 7790935 598813.0 1888
6 7790765 598881.7 1169
library(ggplot2)
library(ggsn)
hmap <- ggplot(dengue, aes(x=long, y=lat)) +
stat_density2d(aes(fill = ..level..), alpha=0.8, geom="polygon") +
geom_point(colour="red") +
geom_path(data=map.df,aes(x=long, y=lat,group=group), colour="grey50") +
scale_fill_gradientn(colours=rev(brewer.pal(5,"Spectral"))) +
coord_fixed() +
scalebar(location="bottomright",y.min=7781600.0, y.max=7812898.0,
x.min=597998.4, x.max=619721.2,
dist=2, transform = F,
st.dist=.04,dist_unit="km") +
blank() +
guides(fill=guide_legend(title=""))
north2(hmap, x=.7, y=.9, symbol=16)
And here is the map that I got:
Any hint on how can I make a heatmap considering the values of the points (variable 'x'), and not just its coordinates?
There was a post here that describes the adaptation of the MASS package's kde2d function to take into account the weights of points.
library(MASS)
kde2d.weighted <- function (x, y, w, h, n = 25, lims = c(range(x), range(y))) {
nx <- length(x)
if (length(y) != nx)
stop("data vectors must be the same length")
gx <- seq(lims[1], lims[2], length = n) # gridpoints x
gy <- seq(lims[3], lims[4], length = n) # gridpoints y
if (missing(h))
h <- c(bandwidth.nrd(x), bandwidth.nrd(y));
if (missing(w))
w <- numeric(nx)+1;
h <- h/4
ax <- outer(gx, x, "-")/h[1] # distance of each point to each grid point in x-direction
ay <- outer(gy, y, "-")/h[2] # distance of each point to each grid point in y-direction
z <- (matrix(rep(w,n), nrow=n, ncol=nx, byrow=TRUE)*matrix(dnorm(ax), n, nx)) %*% t(matrix(dnorm(ay), n, nx))/(sum(w) * h[1] * h[2]) # z is the density
return(list(x = gx, y = gy, z = z))
}
This is not natively embedded in ggplot2 as far as I'm aware, but you could preprocess your data outside ggplot to get the data you can put into stat_contour:
# Reading in your example data
zz <- " lat long x
1 7791000 598157.0 156
2 7790677 598520.0 307
3 7790795 598520.0 153
4 7790153 598808.0 135
5 7790935 598813.0 1888
6 7790765 598881.7 1169"
df <- read.table(text = zz)
# Doing the weighted 2d kde
wdf <- kde2d.weighted(df$lat, df$long, df$x)
wdf <- data.frame(lat = wdf$x[row(wdf$z)],
long = wdf$y[col(wdf$z)],
value = wdf$z[T])
# Plotting the result:
ggplot(df, aes(lat, long)) +
stat_contour(data = wdf, aes(z = value, fill = stat(level)), geom = "polygon") +
geom_text(aes(label = x)) # to show the weights
As you can see, the contours are a bit cut off at ugly points, but I suppose this could be amended by playing around with the lims argument of the kde2d.weighted().

Incorrect colour gradient when using cowplot to patch together plots

Say I have a data set with x and y values that are grouped according to two variables: grp is a, b, or c, while subgrp is E, F, or G.
a has y values in [0, 1]
b has y values in [10, 11]
c has y values in [100, 101].
I'd like to plot y against x with the colour of the point defined by y for all grp and subgrp combinations. Since each grp has very different y values, I can't just use facet_grid alone, as the colour scales would be useless. So, I plot each grp with its own scale then patch them together with plot_grid from cowplot. I also want to use a three-point gradient specified by scale_colour_gradient2. My code looks like this:
# Set RNG seed
set.seed(42)
# Toy data frame
df <- data.frame(x = runif(270), y = runif(270) + rep(c(0, 10, 100), each = 90),
grp = rep(letters[1:3], each = 90), subgrp = rep(LETTERS[4:6], 90))
head(df)
#> x y grp subgrp
#> 1 0.9148060 0.1362958 a D
#> 2 0.9370754 0.7853494 a E
#> 3 0.2861395 0.4533034 a F
#> 4 0.8304476 0.1357424 a D
#> 5 0.6417455 0.8852210 a E
#> 6 0.5190959 0.3367135 a F
# Load libraries
library(cowplot)
library(ggplot2)
library(dplyr)
# Plotting list
g_list <- list()
# Loop through groups 'grp'
for(i in levels(df$grp)){
# Subset the data
df_subset <- df %>% filter(grp == i)
# Calculate the midpoint
mp <- mean(df_subset$y)
# Print midpoint
message("Midpoint: ", mp)
g <- ggplot(df_subset) + geom_point(aes(x = x, y = y, colour = y))
g <- g + facet_grid(. ~ subgrp) + ggtitle(i)
g <- g + scale_colour_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = mp)
g_list[[i]] <- g
}
#> Midpoint: 0.460748857570191
#> Midpoint: 10.4696476330981
#> Midpoint: 100.471083269571
plot_grid(plotlist = g_list, ncol = 1)
Created on 2019-04-17 by the reprex package (v0.2.1)
In this code, I specify the midpoint of the colour gradient as the mean of y for each grp. I print this and verify that it is correct. It is.
My question: why are my colour scales incorrect for the first two plots?
It appears the same range is applied to each grp despite subsetting the data. If I replace for(i in levels(df$grp)){ with for(i in levels(df$grp)[1]){, the colour scale is correct for the single plot that is produced.
Update
Okay, this is weird. Inserting ggplot_build(g)$data[[1]]$colour immediately before g_list[[i]] <- g solves the problem. But, why?
Long story short, you're creating unevaluated promises and then evaluate them at a time when the original data is gone. This problem is generally avoided if you use proper functional programming style rather than procedural code. I.e., define a function that does the work and then use an apply function for the loop.
set.seed(42)
# Toy data frame
df <- data.frame(x = runif(270), y = runif(270) + rep(c(0, 10, 100), each = 90),
grp = rep(letters[1:3], each = 90), subgrp = rep(LETTERS[4:6], 90))
library(cowplot)
library(ggplot2)
library(dplyr)
# Loop through groups 'grp'
g_list <- lapply(
levels(df$grp),
function(i) {
# Subset the data
df_subset <- df %>% filter(grp == i)
# Calculate the midpoint
mp <- mean(df_subset$y)
# Print midpoint
message("Midpoint: ", mp)
g <- ggplot(df_subset) + geom_point(aes(x = x, y = y, colour = y))
g <- g + facet_grid(. ~ subgrp) + ggtitle(i)
g <- g + scale_colour_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = mp)
g
}
)
#> Midpoint: 0.460748857570191
#> Midpoint: 10.4696476330981
#> Midpoint: 100.471083269571
plot_grid(plotlist = g_list, ncol = 1)
Created on 2019-04-17 by the reprex package (v0.2.1)

R loess prediction does not match ggplot geom_smooth(). Error in my prediction formula?

I am trying to predict, on my own, the loess values provided by ggplot geom_smooth(). I have attached links to my data and the output plot of the predictions.
Data can be found here. I followed an example provided from this post about loess prediction to reproduce the values from ggplot, so I think I am on the right track, but I am missing something.
library("ggplot2")
load(file="data5a.RData")
lsmod = loess(Flux~DA_SQ_KM, data=data5a, control=loess.control(surface="direct"))
xrange <- max(data5a$DA_SQ_KM,na.rm=TRUE)
xseq <- c(0.01,0.05,0.1,0.2,0.3,0.5,seq(from=1, to=xrange, length=100))
pred = predict(lsmod,newdata=data.frame(DA_SQ_KM = xseq), se=TRUE)
y = pred$fit
ci <- pred$se.fit * qt(0.95 / 2 + .5, pred$df)
ymin = y - ci
ymax = y + ci
loess.DF <- data.frame(x = xseq, y, ymin, ymax, se = pred$se.fit)
ggplot(data5a, aes(DA_SQ_KM, Flux)) +
geom_point()+
geom_smooth(method="loess")+
geom_smooth(aes_auto(loess.DF), data=loess.DF, stat="identity",col="red")+
geom_smooth(method="lm",se=FALSE,col="green")+
theme(legend.position = "bottom")+
scale_y_log10()+
scale_x_log10()
Where is the error in my code for reproducing the data in the blue curve that is predicted by geom_smooth()?
Here is an image of the output within ggplot:
UPDATE 1:
I have included updated code here based on input provided by Roland. I have modified my code to use the mgcv::gam function since my data contains greater than 1000 points. The issue still remains that I cannot reproduce the model created by geom_smooth within ggplot. A new issue has also emerged with the confidence intervals.
library("ggplot2")
library("mgcv")
load(file="data5a.RData")
#Attempt to re-create the gam model myself
gammod = mgcv::gam(Flux~s(DA_SQ_KM, bs = "cs"),data=data5a)
xrange <- max(data5a$DA_SQ_KM,na.rm=TRUE)
xseq <- c(0.001,0.01,0.05,0.1,0.2,0.3,0.5,seq(from=1, to=xrange, length=100))
pred = predict(gammod ,newdata=data.frame(DA_SQ_KM = xseq), se=TRUE)
y = pred$fit
ci <- pred$se.fit * qt(0.95 / 2 + .5, pred$df)
ymin = y - ci
ymax = y + ci
gam.DF <- data.frame(x = xseq, y, ymin, ymax, se = pred$se.fit)
ggplot(data5a, aes(DA_SQ_KM, Flux)) +
geom_point()+
geom_smooth(aes_auto(gam.DF), data=gam.DF, stat="identity",col="red")+
stat_smooth(method=mgcv::gam,formula = y ~ s(x, bs = "cs"),se=TRUE,col="purple")+
theme(legend.position = "bottom")+
scale_y_log10()+
scale_x_log10()
Here is the gam output within ggplot:
ggplot2 fits the model to the transformed variables if you use scale_* transformations:
DF <- data.frame(x = 1:3, y = c(10, 100, 1e3))
library(ggplot2)
p <- ggplot(DF, aes(x, y)) +
geom_point() +
scale_y_log10() +
stat_smooth(method = "lm", n = 3)
g <- ggplot_build(p)
g[["data"]][[2]]
# x y ymin ymax se PANEL group colour fill size linetype weight alpha
#1 1 1 1 1 0 1 -1 #3366FF grey60 1 1 1 0.4
#2 2 2 2 2 0 1 -1 #3366FF grey60 1 1 1 0.4
#3 3 3 3 3 0 1 -1 #3366FF grey60 1 1 1 0.4
Note the zero SEs, which indicate a perfect fit.
log10(predict(lm(y ~ x, data = DF)))
# 1 2 3
#NaN 2.568202 2.937016
predict(lm(log10(y) ~ x, data = DF))
#1 2 3
#1 2 3

How to add fixed-slope line in multipanel plot using different intercepts in each panel

I generate a multipanel plot using ggplot2 (requires ggplot2 package). Each of the ten panels represents a different plant growth form. I need to plot a line in each panel that has a slope of 0.704. However, the intercept should be different for each panel, for example -7.9 in the case of 'fern', -7.31 for 'fern ally', etc. Currently, I use the code below which generates a line where both slope and intercept are the same in each panel:
ggplot(veg, aes(x=ord1, y=log(lai+0.000019))) +
scale_x_discrete(limits=c("1","2","3","4","5","6","7","8","9")) +
scale_y_continuous(limits=c(-12,3)) +
geom_point(shape=1) +
geom_segment(aes(x = 1, xend = 9, y = -8.32 + .704, yend = -8.32 + .704*9),
col = "black", size = 1, lty="longdash", lwd=1) +
facet_wrap( ~ plant_growth_form, ncol=5)
How could I modify this code in ggplot2 to specify a different intercept for each growth form?
A reproducible data subset generated with dput() can be found at: How to compute standard errors for predicted data
First, create a separate data frame housing intercepts and slope. e.g. "a" is intercepts, "b" is the slope. We call this data frame "df3".
df3 <- data.frame(plant_growth_form = unique(veg[,3]),
a = c(-10,-9,-8,-7,-6,-5,-4,-3,-2,-1), # assigned arbitrarily
b= 0.704)
# plant_growth_form a b
# 1 herb -10 0.704
# 2 moss -9 0.704
# 3 woody climber -8 0.704
# 4 tree sapling -7 0.704
# 5 fern -6 0.704
# 6 herbaceous climber -5 0.704
# 7 undet -4 0.704
# 8 herbaceous shrub -3 0.704
# 9 grass -2 0.704
# 10 woody shrub -1 0.704
to plot
ggplot(veg, aes(x=ord1, y=log(lai+0.000019))) +
scale_x_discrete(limits=c("1","2","3","4","5","6","7","8","9")) +
scale_y_continuous(limits=c(-12,3)) +
geom_point(shape=1) +
geom_abline(aes(intercept=a, slope=b), data=df3) +
facet_wrap( ~ plant_growth_form, ncol=5)
update the tweak per #aosmith's comment.
# you could separately modify the dataframe of intercepts & slope, or
# adjust directly in the code below for small data (see <--)
df3 <- data.frame(plant_growth_form = unique(veg[,3]),
a = c(-10,-9,-8,-7,-6,-5,-4,-3,-2,-1), # <-- here
b= 0.704) # <-- here
# to add the code of extra segment (see <--), in red.
ggplot(veg, aes(x=ord1, y=log(lai+0.000019))) +
scale_x_discrete(limits=c("1","2","3","4","5","6","7","8","9")) +
scale_y_continuous(limits=c(-12,3)) +
geom_point(shape=1) +
#geom_abline(aes(intercept=a, slope=b), data=df3) +
geom_segment(data = df3, aes(x = 1, xend = 9, y = a + b, yend = a + b*9)) +
geom_segment(aes(x = 1, xend = 9, y = -8.32 + .704, yend = -8.32 + .704*9), # <-- here
col = "red", size = 1, lty="longdash", lwd=1) +
facet_wrap( ~ plant_growth_form, ncol=5)

Resources