fill area between two lines with different x-values - r

I have a data frame with two columns x's and y's. Each row represents a line and in each cell is a list with 51 consecutive observations (so 2 lists in each row for x and y).
I want to fill the space between the lines in the data frame.
The problem is that there's a randomness in x and y, so I can't just take the ymin and ymax for each data point on x.
This code would create sample data (with only 2 lines of 10 observations each) that is similar to my actual dataset:
library(data.table)
genData <- function() {
set.seed(42)
genOneLine <- function(m_x, m_y) {
xs = seq(0,1,by=0.1)
x_ran <- rnorm(8, m_x, 0.1)
xs[2:9] = xs[2:9] + x_ran
ys = seq(0,1,by=0.1)
y_ran <- rnorm(8, m_y, 0.1)
ys[2:9] = ys[2:9] + y_ran
return (data.table(x = list(xs), y = list(ys)))
}
return (rbind(genOneLine(-0.1, -0.1), genOneLine(0.1, 0.1)))
}

See if this is what you have in mind?
library(dplyr)
library(ggplot2)
library(data.table)
df <- genData()
df %>%
# add identifier variable to each line
mutate(id = seq(1, n())) %>%
# make each element of the list its own row
tidyr::unnest() %>%
# add sequence identifier, from start of line 1 to its end, then
# from the end of line 2 to its start
group_by(id) %>%
mutate(order = ifelse(id == 1, seq(1, n()), n() + seq(n(), 1))) %>%
ungroup() %>%
# sort data frame
arrange(order) %>%
# plot
ggplot(aes(x = x, y = y)) +
geom_polygon(aes(group = 1), fill = "grey20", alpha = 0.5) +
geom_path(aes(color = factor(id)), size = 2)

Related

How to plot line graph of normalized differences from binned data with ggplot?

I have several sets of data that I calculate binned normalized differences for. The results I want to plot within a single line plot using ggplot. The lines representing different combinations of the paired differences are supposed to be distinguished by colors and line types.
I am stuck on taking the computed values from the bins (would be y-axis values now), and plotting these onto an x-axis.
Below is the code I use for importing the data and calculating the normalized differences.
# Read data from column 3 as data table for different number of rows
# you could use replicate here for test
# dat1 <- data.frame(replicate(1,sample(25:50,10000,rep=TRUE)))
# dat2 <- data.frame(replicate(1,sample(25:50,9500,rep=TRUE)))
dat1 <- fread("/dir01/a/dat01.txt", header = FALSE, data.table=FALSE, select=c(3))
dat2 <- fread("/dir02/c/dat02.txt", header = FALSE, data.table=FALSE, select=c(3))
# Change column names
colnames(dat1) <- c("Dat1")
colnames(dat2) <- c("Dat2")
# Perhaps there is a better way to compute the following as all-in-one? I have broken these down step by step.
# 1) Sum for each bin
bin1 = cut(dat1$Dat1, breaks = seq(25, 50, by = 2))
sum1 = tapply(dat1$Dat1, bin1, sum)
bin2 = cut(dat2$Dat2, breaks = seq(25, 50, by = 2))
sum2 = tapply(dat2$Dat2, bin2, sum)
# 2) Total sum of all bins
sumt1 = sum(sum1)
sumt2 = sum(sum2)
# 3) Divide each bin by total sum of all bins
sumn1 = lapply(sum1, `/`, sumt1)
sumn2 = lapply(sum2, `/`, sumt2)
# 4) Convert to data frame as I'm not sure how to difference otherwise
df_sumn1 = data.frame(sumn1)
df_sumn2 = data.frame(sumn2)
# 5) Difference between the two as percentage
dbin = (df_sumn1 - df_sumn2)*100
How can I plot those results using ggplot() and geom_line()?
I want
dbin values on the x-axis ranging from 25-50
different colors and line types for the lines
Here is what I tried:
p1 <- ggplot(dbin, aes(x = ?, color=Data, linetype=Data)) +
geom_line() +
scale_linetype_manual(values=c("solid")) +
scale_x_continuous(limits = c(25, 50)) +
scale_color_manual(values = c("#000000"))
dput(dbin) outputs:
structure(list(X.25.27. = -0.0729132928804117, X.27.29. = -0.119044772581772,
X.29.31. = 0.316016473225017, X.31.33. = -0.292812782147632,
X.33.35. = 0.0776336591308158, X.35.37. = 0.0205584754637611,
X.37.39. = -0.300768421159599, X.39.41. = -0.403235174844081,
X.41.43. = 0.392510458816457, X.43.45. = 0.686758883448307,
X.45.47. = -0.25387105113263, X.47.49. = -0.0508324553382303), class = "data.frame", row.names = c(NA,
-1L))
Edit
The final piece of code that works, using only the dbin and plots multiple dbins:
dat1 <- data.frame(a = replicate(1,sample(25:50,10000,rep=TRUE, prob = 25:0/100)))
dat2 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 0:25/100)))
dat3 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 12:37/100)))
dat4 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 37:12/100)))
calc_bin_props <- function(data) {
as_tibble(data) %>%
mutate(bin = cut(a, breaks = seq(25, 50, by = 2))) %>%
group_by(bin) %>%
summarise(sum = sum(a), .groups = "drop") %>%
filter(!is.na(bin)) %>%
ungroup() %>%
mutate(sum = sum / sum(sum))
}
diff_data <-
full_join(
calc_bin_props(data = dat1),
calc_bin_props(dat2),
by = "bin") %>%
separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
mutate(dbinA = (sum.x - sum.y * 100)) %>%
select(-starts_with("trsh"))
diff_data2 <-
full_join(
calc_bin_props(data = dat3),
calc_bin_props(dat4),
by = "bin") %>%
separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
mutate(dbinB = (sum.x - sum.y * 100)) %>%
select(-starts_with("trsh"))
# Combine two differences, and remove sum.x and sum.y
full_data <- cbind(diff_data, diff_data2[,4])
full_data <- full_data[,-c(2:3)]
# Melt the data to plot more than 1 variable on a plot
m <- melt(full_data, id.vars="bin")
theme_update(plot.title = element_text(hjust = 0.5))
ggplot(m, aes(as.numeric(bin), value, col=variable, linetype = variable)) +
geom_line() +
scale_linetype_manual(values=c("solid", "longdash")) +
scale_color_manual(values = c("black", "black"))
dev.off()
library(tidyverse)
Creating example data as shown in question, but adding different probabilities to the two sample() calls, to create so visible difference
between the two sets of randomized data.
dat1 <- data.frame(a = replicate(1,sample(25:50,10000,rep=TRUE, prob = 25:0/100))) %>% as_tibble()
dat2 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 0:25/100))) %>% as_tibble()
Using dplyr we can handle this within data.frames (tibbles) without
the need to switch to other datatypes.
Let’s define a function that can be applied to both datasets to get
the preprocessing done.
We use base::cut() to create
a new column that pairs each value with its bin. We then group the data
by bin, calculate the sum for each bin and finally divide the bin sums
by the total sum.
calc_bin_props <- function(data) {
as_tibble(data) %>%
mutate(bin = cut(a, breaks = seq(25, 50, by = 2), labels = seq(25, 48, by = 2))) %>%
group_by(bin) %>%
summarise(sum = sum(a), .groups = "drop") %>%
filter(!is.na(bin)) %>%
ungroup() %>%
mutate(sum = sum / sum(sum))
}
Now we call calc_bin_props() on both datasets and join them by bin.
This gives us a dataframe with the columns bin, sum.x and sum.y.
The latter two are correspond to the bin sums derived from dat1 and
dat2. With the mutate() line we calculate the differences between the
two columns.
diff_data <-
full_join(
calc_bin_props(data = dat1),
calc_bin_props(dat2),
by = "bin") %>%
mutate(dbin = (sum.x - sum.y),
bin = as.numeric(as.character(bin))) %>%
select(-starts_with("trsh"))
Before we feed the data into ggplot() we convert it to the long
format using pivot_longer() this allows us to instruct ggplot() to
plot the results for sum.x, sum.y and dbin as separate lines.
diff_data %>%
pivot_longer(-bin) %>%
ggplot(aes(as.numeric(bin), value, color = name, linetype = name)) +
geom_line() +
scale_linetype_manual(values=c("longdash", "solid", "solid")) +
scale_color_manual(values = c("black", "purple", "green"))

Use scale_x_continuous with labeller function that also takes a data frame as an argument as well as default breaks

Here's a code block:
# scale the log of price per group (cut)
my_diamonds <- diamonds %>%
mutate(log_price = log(price)) %>%
group_by(cut) %>%
mutate(scaled_log_price = scale(log_price) %>% as.numeric) %>% # scale within each group as opposed to overall
nest() %>%
mutate(mean_log_price = map_dbl(data, ~ .x$log_price %>% mean)) %>%
mutate(sd_log_price = map_dbl(data, ~ .x$log_price %>% sd)) %>%
unnest %>%
select(cut, price, price_scaled:sd_log_price) %>%
ungroup
# for each cut, find the back transformed actual values (exp) of each unit of zscore between -3:3
for (i in -3:3) {
my_diamonds <- my_diamonds %>%
mutate(!! paste0('mean_', ifelse(i < 0 , 'less_', 'plus_'), abs(i), 'z') := map2(.x = mean_log_price, .y = sd_log_price, ~ (.x + (i * .y)) %>% exp) %>% unlist)
}
my_diamonds_split <- my_diamonds %>% group_split(cut)
split_names <- my_diamonds %>% mutate(cut = as.character(cut)) %>% group_keys(cut) %>% pull(cut)
names(my_diamonds_split) <- split_names
I now have a variable my_diamonds_split that is a list of data frames. I would like to loop over these data frames and each time create a new ggplot.
I can use a custom labeller function with a single df, but I don't know how to do this within a loop:
labeller <- function(x) {
paste0(x,"\n", scales::dollar(sd(ex_df$price) * x + mean(ex_df$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, limits = c(-3, 3))
This creates a plot for the 'Ideal' cut of diamonds. I also get two data points on the x axis, the zscore values at -2, 0 and 2 as well as the raw dollar values of 3.8K, 3.9K and 11.8K.
When I define the labeller function, I must specify the df to scale with. Tried instead with placing the dot instead of my_df, hoping that on each iteration ggplot would get the value of the df on any iteration:
labeller <- function(x) {
paste0(x,"\n", scales::dollar(sd(.$price) * x + mean(.$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, limits = c(-3, 3))
Returns:
Error in is.data.frame(x) : object '.' not found
I then tried writing the function to accept an argument for the df to scale with:
labeller <- function(x, df) {
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller(df = ex_df), limits = c(-3, 3)) # because when it comes to running in real life, I will try something like labeller(df = my_diamonds_split[[i]])
Error in paste0(x, "\n", scales::dollar(sd(df$price) * x + mean(df$price))) :
argument "x" is missing, with no default
Bearing in mind that the scaling must be done per iteration, how could I loop over my_diamonds_split, and on each iteration generate a ggplot per above?
labeller <- function(x) {
# how can I make df variable
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
for (i in split_names) {
my_diamonds_split[[i]] %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, # <--- here, labeller must be defined with df$price except that will difer on each iteration
limits = c(-3, 3))
}
There's a hacky way to get this result in facets. Basically, after converting to z scores, you add different amounts (say, multiples of 1000) to each group's z scores. Then you set all the breaks to this collection of points and label them with pre-calculated labels.
library(ggplot2)
library(dplyr)
f <- function(x) {
y <- diamonds$price[diamonds$cut == x]
paste(seq(-3, 3), scales::dollar(round(mean(y) + seq(-3, 3) * sd(y))), sep = "\n")
}
breaks <- as.vector(sapply(levels(diamonds$cut), f))
diamonds %>%
group_by(cut) %>%
mutate(z = scale(price) + 3 + 1000 * as.numeric(cut)) %>%
ggplot(aes(z)) +
geom_point(aes(x = z - 2, y = 1), alpha = 0) +
geom_density() +
scale_x_continuous(breaks = as.vector(sapply(1:5 * 1000, "+", 0:6)),
labels = breaks) +
facet_wrap(vars(cut), scales = "free_x") +
theme(text = element_text(size = 16),
axis.text.x = element_text(size = 6))
You would have to increase the plot size to make the dollar values more visible of course.
Created on 2020-08-04 by the reprex package (v0.3.0)

Apply Geom Layer Conditionally - Separate Points & Lines

I have a data set similar to the one below where I have a lot of data for certain groups and then only single observations for other groups. I would like my single observations to show up as points but the other groups with multiple observations to show up as lines (no points). My code is below:
EDIT: I'm attempting to find a way to do this without using multiple datasets in the geom_* calls because of the issues it causes with the legend. There was an answer that has since been deleted that was able to handle the legend but didn't get rid of the points on the lines. I would potentially like a single legend with points only showing up if they are a single observation.
library(tidyverse)
dat <- tibble(x = runif(10, 0, 5),
y = runif(10, 0, 20),
group = c(rep("Group1", 4),
rep("Group2", 4),
"Single Point 1",
"Single Point 2")
)
dat %>%
ggplot(aes(x = x, y = y, color = group)) +
geom_point() +
geom_line()
Created on 2019-04-02 by the reprex package (v0.2.1)
Only plot the data with 1 point in geom_point() and the data with >1 point in geom_line(). These can be precalculated in mutate().
dat = dat %>%
group_by(group) %>%
mutate(n = n() )
dat %>%
ggplot(aes(x = x, y = y, color = group)) +
geom_point(data = filter(dat, n == 1) ) +
geom_line(data = filter(dat, n > 1) )
Having the legend match this is trickier. This is the sort of thing that that override.aes argument in guide_legend() can be useful for.
In your case I would separately calculate the number of observations in each group first, since that is what the line vs point is based on.
sumdat = dat %>%
group_by(group) %>%
summarise(n = n() )
The result is in the same order as the factor levels in the legend, which is why this works.
Now we need to remove lines and keep points whenever the group has only a single observation. 0 stands for a blank line and NA stands for now shape. I use an ifelse() statement for linetype and shape for override.aes, based on the number of observations per group.
dat %>%
ggplot(aes(x = x, y = y, color = group)) +
geom_point(data = filter(dat, n == 1) ) +
geom_line(data = filter(dat, n > 1) ) +
guides(color = guide_legend(override.aes = list(linetype = ifelse(sumdat$n == 1, 0, 1),
shape = ifelse(sumdat$n == 1, 19, NA) ) ) )

Plot data from list using ggplot2

I have a list of 4 different matrix length. I wish to plot them as set of time series like in the example below just that x-axis is a running number (e.g. 1:75) and y-axis is the matrix value (e.g. sin(1:75)).
(https://homepage.divms.uiowa.edu/~luke/classes/STAT4580/timeseries_files/figure-html/unnamed-chunk-39-2.png).
I know that that ggplot2 does not handle lists so any idea how to advance?
Script:
mat1 <- matrix(cos(1:50), nrow = 50, ncol = 1)
mat2 <- matrix(sin(1:75), nrow = 75, ncol = 1)
mat3 <- matrix(tan(1:50), nrow = 50, ncol = 1)
mat4 <- matrix(1:100, nrow = 100, ncol = 1)
myList <- list(mat1, mat2, mat3, mat4)
names(myList)[1] <- "mat1"
names(myList)[2] <- "mat2"
names(myList)[3] <- "mat3"
names(myList)[4] <- "mat4"
Something like this?
library(tidyverse)
map_dfr(myList, ~as.data.frame(.x), .id = "id") %>%
group_by(id) %>%
mutate(n = 1:n()) %>%
ungroup() %>%
mutate(id = as.factor(id)) %>%
ggplot(aes(n, V1, colour = id)) +
geom_line() +
facet_wrap(~ id, scales = "free")
Explanation: We first convert all matrices to data.frames and bind all rows together into a single data.frame including an id which derives from the list names; we can then number rows by id and then plot the row number vs. the single column.
Here is the same code "un-piped" and "uglified"
library(tidyverse)
# Convert from list of matrices to long data.frame
df.long <- map_dfr(myList, ~as.data.frame(.x), .id = "id")
# Group by id
df.long <- group_by(df.long, id)
# Add row number (per group)
df.long <- mutate(df.long, n = 1:n())
# ungroup
df.long <- ungroup(df.long)
# Make sure id is a factor
df.long <- mutate(df.long, id = as.factor(id))
# (gg)plot
ggplot(df.long, aes(n, V1, colour = id)) +
geom_line() +
facet_wrap(~ id, scales = "free")
It's easy to see how %>% takes the left object and uses it as the first argument of the function on the right; so f(x) would become x %>% f().
library(tidyverse)
enframe(myList) %>%
unnest() %>%
group_by(name) %>%
rowid_to_column() %>%
ungroup() %>%
ggplot(aes(rowid, value)) +
geom_line() +
facet_wrap(~name, scales = "free")

R - ggplot2 contour plot

I am trying to replicate the code from Andrew Ng's Machine Learning course on Coursera in R (as the course is in Octave).
Basically I have to plot a non linear decision boundary (at p = 0.5) for a polynomial regularized logistic regression.
I can easily replicate the plot with the base library:
contour(u, v, z, levels = 0)
points(x = data$Test1, y = data$Test2)
where:
u <- v <- seq(-1, 1.5, length.out = 100)
and z is a matrix 100x100 with the values of z for every point of the grid.
Dimension of data is 118x3.
I cannot do it in ggplot2. Does somebody know how to replicate the same in ggplot2? I tried with:
z = as.vector(t(z))
ggplot(data, aes(x = Test1, y = Test2) + geom_contour(aes(x = u, y =
v, z = z))
But I get the error: Aesthetics must be either length 1 or the same as the data (118): colour, x, y, shape
Thanks.
EDIT (Adding plot created from code of missuse):
What you need is to convert the coordinates into long format. Here is an example using volcano data set:
data(volcano)
in base R:
contour(volcano)
with ggplot2:
library(tidyverse)
as.data.frame(volcano) %>% #convert the matrix to data frame
rownames_to_column() %>% #get row coordinates
gather(key, value, -rowname) %>% #convert to long format
mutate(key = as.numeric(gsub("V", "", key)), #convert the column names to numbers
rowname = as.numeric(rowname)) %>%
ggplot() +
geom_contour(aes(x = rowname, y = key, z = value))
if you would like to label it directly as in base R plot you can use library directlabels:
First map the color/fill to a variable:
as.data.frame(volcano) %>%
rownames_to_column() %>%
gather(key, value, -rowname) %>%
mutate(key = as.numeric(gsub("V", "", key)),
rowname = as.numeric(rowname)) %>%
ggplot() +
geom_contour(aes(x = rowname,
y = key,
z = value,
colour = ..level..)) -> some_plot
and then
library(directlabels)
direct.label(some_plot, list("far.from.others.borders", "calc.boxes", "enlarge.box",
box.color = NA, fill = "transparent", "draw.rects"))
to add markers at specific coordinates you just need to add another layer with appropriate data:
the previous plot
as.data.frame(volcano) %>%
rownames_to_column() %>%
gather(key, value, -rowname) %>%
mutate(key = as.numeric(gsub("V", "", key)),
rowname = as.numeric(rowname)) %>%
ggplot() +
geom_contour(aes(x = rowname, y = key, z = value)) -> plot_cont
add layer with points for instance:
plot_cont +
geom_point(data = data.frame(x = c(35, 47, 61),
y = c(22, 37, 15)),
aes(x = x, y = y), color = "red")
you can add any type of layer this way: geom_line, geom_text to name a few.
EDIT2: to change the scale of the axis there are several options, one is to assign appropriate rownames and colnames to the matrix:
I will assign a sequence from 0 - 2 for the x axis and 0 - 5 to the y axis:
rownames(volcano) <- seq(from = 0,
to = 2,
length.out = nrow(volcano)) #or some vector like u
colnames(volcano) <- seq(from = 0,
to = 5,
length.out = ncol(volcano)) #or soem vector like v
as.data.frame(volcano) %>%
rownames_to_column() %>%
gather(key, value, -rowname) %>%
mutate(key = as.numeric(key),
rowname = as.numeric(rowname)) %>%
ggplot() +
geom_contour(aes(x = rowname, y = key, z = value))
ggplot2 works most efficiently with data in long format. Here's an example with fake data:
library(tidyverse)
u <- v <- seq(-1, 1.5, length.out = 100)
# Generate fake data
z = outer(u, v, function(a, b) sin(2*a^3)*cos(5*b^2))
rownames(z) = u
colnames(z) = v
# Convert data to long format and plot
as.data.frame(z) %>%
rownames_to_column(var="row") %>%
gather(col, value, -row) %>%
mutate(row=as.numeric(row),
col=as.numeric(col)) %>%
ggplot(aes(col, row, z=value)) +
geom_contour(bins=20) +
theme_classic()

Resources