geom_density plots with nested vectors - r

I have a data frame with a nested vector in one column. Any ideas how to ggplot a geom_density using the values from the nested vector?
If I use pivot_longer on the entire data frame, I get 25 million rows, so I'd prefer to avoid that if possible.
library(ggplot2)
# Toy data: 100 rows, two grouping columns, and a list-column `c` that holds
# a vector of 100 random draws per row.
df <- data.frame(
  a = rep(letters[1:5], length.out = 100),
  b = sample(LETTERS, 100, replace = TRUE)
)
df[["c"]] <- purrr::map(1:100, function(x) rnorm(100))
# works but too heavy for the actual implementation
ggplot(tidyr::unnest(df, c), aes(c, group = a)) +
  geom_density() +
  facet_wrap(vars(b))
# doesn't work
ggplot(df, aes(c, group = a)) +
  geom_density() +
  facet_wrap(vars(b))

Different solution: Prepare each plot separately and rearrange your plots afterwards using gridExtra package.
library(ggplot2)
df <- data.frame(
  a = rep(letters[1:5], length.out = 100),
  b = sample(LETTERS, 100, replace = TRUE)
)
df[["c"]] <- purrr::map(1:100, function(x) rnorm(100))
# Build one density plot per distinct value of `b`. Only the rows belonging to
# that panel are unnested, so the full data is never expanded in one go.
lst_plot <- lapply(sort(unique(df$b)), function(x) {
  panel_rows <- df[df$b == x, ]
  # Expand each row's nested vector to long form: one output row per value,
  # tagged with that row's `a` group.
  panel_long <- purrr::map_dfr(
    seq_len(nrow(panel_rows)),
    function(i) data.frame(a = panel_rows$a[i], c = panel_rows$c[[i]])
  )
  ggplot(panel_long) +
    geom_density(aes(c, group = a)) +
    ylab(NULL)
})
gridExtra::grid.arrange(grobs = lst_plot, ncol = 6, left = "density")
To be honest, I'm not sure how well this works with your massive dataset...

Related

How can I create multiple plots from same dataset in R?

Let me first share a dummy data, from which I want to prepare ggplot graphs.
library(tidyverse)
set.seed(1)
sample_size <- 1200
dates <- sample(seq(1,31),sample_size,replace = TRUE)
Monthss <- sample(seq(1,12),sample_size,replace = TRUE)
hrs <- sample(seq(1,23),sample_size,replace = TRUE)
minutes <- sample(seq(1,59),sample_size,replace = TRUE)
date_time_vector <- paste0(dates,"-",Monthss,"-",2022," ",hrs,":",minutes) |> lubridate::parse_date_time("dmy HM")
Conversion <- sample(c(TRUE,FALSE),sample_size, prob = c(0.25,0.75), replace = TRUE)
df <- data.frame(Date = date_time_vector, Conversion_Status = Conversion)
df <- df |> mutate(Leads = round(runif(sample_size, min = 0,max = 10),digits = 0))
df <- df[complete.cases(df), ]
The code above gives me a data.frame with columns Date, Leads and Conversion_Status. I want to prepare Monthly column chart of total leads per day. (For example, daily leads in January, daily leads in February, etc.) So, basically, I will need to split the data on the basis of Month, and prepare one chart for each month. How can I prepare such charts?
I have tried following way:
#' Horizontal column chart of one column against another.
#'
#' Every ggplot2 function is namespace-qualified, so the helper works whether
#' or not ggplot2 (or a pipe-providing package) is attached.
#'
#' @param df Data frame to plot.
#' @param col1 Unquoted column mapped to `x` (shown on the vertical axis
#'   because the coordinates are flipped).
#' @param col2 Unquoted column mapped to `y` (the bar length).
#' @param title Plot title.
#' @return A ggplot object.
bar_function <- function(df, col1, col2, title) {
  ggplot2::ggplot(df, ggplot2::aes(x = {{ col1 }}, y = {{ col2 }})) +
    ggplot2::geom_col(fill = "steelblue") +
    ggplot2::theme(
      plot.background = ggplot2::element_rect(fill = "white"),
      plot.title = ggplot2::element_text(hjust = 0.5)
    ) +
    ggplot2::coord_flip() +
    ggplot2::labs(title = title)
}
mycharts <- df |> dplyr::nest_by(Month) |> dplyr::mutate(plot = bar_function(df,Date,Leads,"Daily Leads by Month"))
But it is giving me errors.
You can split according to month(year) and plot that.
library(ggplot2)
library(lubridate)
set.seed(1)
sample_size <- 1200
dates <- sample(seq(1,31),sample_size,replace = TRUE)
Monthss <- sample(seq(1,12),sample_size,replace = TRUE)
hrs <- sample(seq(1,23),sample_size,replace = TRUE)
minutes <- sample(seq(1,59),sample_size,replace = TRUE)
date_time_vector <- paste0(dates,"-",Monthss,"-",2022," ",hrs,":",minutes) |> lubridate::parse_date_time("dmy HM")
Conversion <- sample(c(TRUE,FALSE),sample_size, prob = c(0.25,0.75), replace = TRUE)
df <- data.frame(Date = date_time_vector, Conversion_Status = Conversion)
df$Leads <- round(runif(sample_size, min = 0,max = 10),digits = 0)
df <- df[complete.cases(df), ]
# Key every row by its month and year (formatted "%m-%Y") and split the data
# into one data frame per key.
df$month_year <- strftime(df$Date, format = "%m-%Y")
df.split <- split(df, f = df$month_year)
# Preallocate the result list and carry the month-year keys over as names so
# individual plots can be looked up by name as well as by position.
out <- vector("list", length(df.split))
names(out) <- names(df.split)
# One flipped column chart of Leads against Date per month-year subset.
for (i in seq_along(df.split)) {
out[[i]] <- ggplot(data = df.split[[i]], mapping = aes(x = Date, y = Leads)) +
geom_col(fill = "steelblue") +
theme(plot.background = element_rect(fill = "white")) +
theme(plot.title = element_text(hjust = 0.5))+
coord_flip() +
labs(title = "Daily leads by month")
}
To plot you can just print e.g. out[[1]].
If you want to change the desired columns dynamically, you can use aes_string for mapping. This can naturally be wrapped into sapply and there are probably other ways of approaching the problem. The for loop is pretty agnostic and I find that it's readable even by people who do not dabble in R (compared to say sapply).
There are some issues with your code. First, your dataset has no Month column, i.e. you have to add it, for which I use lubridate::month. Second, you are passing the whole dataset df to your bar function instead of the per-group data column from your nested df. Third, in the mutate step you have to wrap the result in list():
library(ggplot2)
library(dplyr, warn.conflicts = FALSE)
# Nest the rows by calendar month, then build one chart per month from that
# month's own rows (`data` is the list-column created by nest_by()). The plot
# is wrapped in list() so it can be stored in a data frame column.
mycharts <- df |>
  nest_by(Month = lubridate::month(Date)) |>
  mutate(plot = list(bar_function(data, Date, Leads, "Daily Leads by Month")))
mycharts$plot[[1]]
mycharts$plot[[5]]
I finally found an answer. I used following code:
# Split on the calendar month derived from Date (the sample data has no Month
# column of its own) and draw one chart per month, titled with that month.
leads_by_month <- split(df, lubridate::month(df$Date))
Map(
  function(month_rows, month_label) {
    ggplot(month_rows, aes(x = Date, y = Leads)) +
      geom_col(fill = "steelblue") +
      coord_flip() +
      ggtitle(month_label)
  },
  leads_by_month,
  names(leads_by_month)
)
Thank you all for your support.

How to plot a(n unknown) number of data series as geom_line in same chart

My first Q here, so please go lightly if I'm out of step anywhere.
I'm trying to code R to produce a single chart to contain a number of data series lines. The number of data series may vary but will be provided in the data frame. I have tried to rearrange another thread's content to print the geom_line , but not successfully.
The logic is:
#desire to replace loop of 1:5 with ncol(df)
print(ggplot(df,aes(x=time))
for (i in 1:5) {
print (+ geom_line(aes(y=df[,i]))
}
#functioning geom point loops ggplot production:
for (i in 1:5) {
print(ggplot(df,aes(x=time,y=df[,i]))+geom_point())
}
#functioning multi-line ggplot where n is explicit:
ggplot(data=df, aes(x=time), group=1) +
geom_line(aes(y=df$`3`))+
geom_line(aes(y=df$`4`))
The functioning example code produces n number of point charts, 5 in this case. I would like just one chart to contain n line series.
This may be similar to How to plot n dimensional matrix? for which there are currently no relevant answers
Any contributions much appreciated, thanks
You can use gather from tidyverse "world" to do that.
As you didn't supply any sample data, I used mtcars.
I created two data.frames one with 3 columns one with 9. In each one of them I plotted all of the variables against the variable mpg.
library(tidyverse)
df3Columns <- mtcars[, 1:4]
df9Columns <- mtcars[, 1:10]

# Reshape every column except `mpg` to long form (one row per original
# column/value pair) and draw one line per original column.
# pivot_longer() is the current tidyr replacement for the superseded gather().
plot_against_mpg <- function(data) {
  data %>%
    pivot_longer(-mpg, names_to = "var", values_to = "value") %>%
    ggplot(aes(mpg, value, group = var, color = var)) +
    geom_line()
}

plot_against_mpg(df3Columns)
plot_against_mpg(df9Columns)
Edit - using the sample data in comments.
library(tidyverse)
# Turn the row names into a `time` column, reshape the remaining columns to
# long form, and draw one line per original column.
# pivot_longer() is the current tidyr replacement for the superseded gather().
df %>%
  rownames_to_column("time") %>%
  pivot_longer(-time, names_to = "var", values_to = "value") %>%
  ggplot(aes(time, value, group = var, color = var)) +
  geom_line()
Sample data:
df <- structure(list("39083" = c(96, 100, 100), "39090" = c(99, 100, 100), "39097" = c(99, 100, 100)), row.names = 3:5, class = "data.frame")
To strictly answer your question, you can simply store your ggplot in a variable and add the geom_line one by one:
df <- structure(
  list("39083" = c(96, 100, 100), "39090" = c(99, 100, 100), "39097" = c(99, 100, 100)),
  row.names = 3:5,
  class = "data.frame"
)
# seq_len() is used instead of 1:nrow(df) so an empty data frame yields an
# empty index rather than c(1, 0).
g <- ggplot(df, aes(x = seq_len(nrow(df))))
# `y` is passed as a fixed per-layer value rather than inside aes(): aes()
# stores its expressions unevaluated, so aes(y = df[, i]) would only be
# resolved when the plot is drawn, using whatever `i` holds at that point.
for (i in colnames(df)) {
  g <- g + geom_line(y = df[, i])
}
# No y aesthetic is mapped, so the axis limits have to be set by hand.
g <- g + scale_y_continuous(limits = c(min(df), max(df)))
print(g)
However, this is not a very convenient solution. I would highly recommend to refactor your data frame to be more ggplot style.
# Long-format version of `df`: one row per (time, column) pair.
# The pieces are built per column and stacked in a single rbind() call rather
# than growing the data frame inside a loop; the empty template keeps the
# column layout well-defined even when `df` has no columns.
empty_long <- data.frame(time = numeric(), value = numeric(), group = character())
long_pieces <- lapply(colnames(df), function(col) {
  data.frame(time = seq_len(nrow(df)), value = df[, col], group = col)
})
df.ultimate <- do.call(rbind, c(list(empty_long), long_pieces))
g <- ggplot(df.ultimate, aes(x = time, y = value, color = group))
g <- g + geom_line()
print(g)
A one-line solution:
# Same long-format reshaping done inline: the row index is repeated once per
# column, the values are flattened column by column, and each column name is
# repeated once per row. seq_len() keeps the index safe for zero rows.
ggplot(data.frame(time = rep(seq_len(nrow(df)), ncol(df)),
                  value = as.vector(as.matrix(df)),
                  group = rep(colnames(df), each = nrow(df))),
       aes(x = time, y = value, color = group)) + geom_line()

Plot lines in ggplot from a list of dataframes

I have a list of data.frames:
# Three data frames covering different x ranges; each y vector has exactly as
# many values as its x range so data.frame() can build the columns.
samplelist <- list(a = data.frame(x = 1:10, y = rnorm(10)),
                   b = data.frame(x = 5:10, y = rnorm(6)),
                   c = data.frame(x = 2:12, y = rnorm(11)))
I'd like to structure a ggplot of the following format:
ggplot()+
geom_line(data=samplelist[[1]], aes(x,y))+
geom_line(data=samplelist[[2]], aes(x,y))+
geom_line(data=samplelist[[3]], aes(x,y))
But that isn't super automated. Does anyone have a suggestion for how to address this?
Thanks!
ggplot works most efficiently with data in "long" format. In this case, that means stacking your three data frames into a single data frame with an extra column added to identify the source data frame. In that format, you need only one call to geom_line, while the new column identifying the source data frame can be used as a colour aesthetic, resulting in a different line for each source data frame. The dplyr function bind_rows allows you to stack the data frames on the fly, within the call to ggplot.
library(dplyr)
library(ggplot2)
samplelist <- list(
  a = data.frame(x = 1:10, y = rnorm(10)),
  b = data.frame(x = 5:10, y = rnorm(6)),
  c = data.frame(x = 2:12, y = rnorm(11))
)
# bind_rows() stacks the list into a single data frame inside the ggplot()
# call; `.id = "df"` adds a column holding each row's source list name, which
# is then mapped to colour to give one line per source.
ggplot(
  bind_rows(samplelist, .id = "df"),
  aes(x, y, colour = df)
) +
  geom_line()
I assumed above that you would want each line to be a different color and for there to be a legend showing the color mapping. However, if, for some reason, you just want three black lines and no legend, just change colour=df to group=df.
Or you could use lapply.
library(ggplot2)
samplelist <- list(a = data.frame(x = 1:10, y = rnorm(10)),
                   b = data.frame(x = 5:10, y = rnorm(6)),
                   c = data.frame(x = 2:12, y = rnorm(11)))
# lapply() builds one geom_line layer per data frame. Adding the resulting
# list to a plot adds every layer in turn, so no global assignment (`<<-`) is
# needed, base::plot is not shadowed, and no intermediate plots are printed.
line_layers <- lapply(samplelist, function(d) geom_line(data = d, aes(x, y)))
p <- ggplot() + line_layers
p
This will work -
library(ggplot2)
samplelist <- list(a = data.frame(x = 1:10, y = rnorm(10)),
                   b = data.frame(x = 5:10, y = rnorm(6)),
                   c = data.frame(x = 2:12, y = rnorm(11)))
p <- ggplot()
# Add one line layer per list element; seq_along() follows the list's actual
# length instead of a hard-coded 1:3.
for (i in seq_along(samplelist)) {
  p <- p + geom_line(data = samplelist[[i]], aes(x, y))
}
p
Reduce is another option to add things iteratively,
library(ggplot2)
samplelist = list(a = data.frame(x = c(1:10), y=rnorm(10)),
b= data.frame(x=c(5:10), y = rnorm(6)),
c = data.frame(x=c(2:12), y=rnorm(11)))
# Reduce() folds the list into the plot, adding one geom_line layer per data
# frame. With accumulate = TRUE it returns every intermediate result, starting
# with `init`, so `pl` is a list of plots: the empty ggplot(), then the plot
# with one line, with two lines, and with all three.
pl <- Reduce(f = function(p, d) p + geom_line(data=d, aes(x,y)),
x = samplelist, init = ggplot(), accumulate = TRUE)
# Draw all of the accumulated plots together in one grid.
gridExtra::grid.arrange(grobs = pl)

How to make one bar-chart from different data-frames with same format?

I have three different data-frames that have same format and I can not combine them because each one represent different data source. I would like to show percentage of one variable for different data frames in one bar chart.
I can get bar-chart for column1 of one dataframe by using:
# Bar chart of the share of rows in each category of `c1`, labelled with the
# percentage. after_stat() refers to the `count` variable computed by the
# count stat; it replaces the deprecated `..count..` notation.
ggplot(baseline, aes(x = c1)) +
  geom_bar(aes(y = after_stat(count / sum(count))), fill = "blue", colour = "blue") +
  geom_text(
    aes(
      y = after_stat(count / sum(count)),
      label = after_stat(scales::percent(count / sum(count)))
    ),
    stat = "count"
  )
I want output similar to this plot(except that I am showing percentage of each category) while race will be name of different data-frames and factor is values of column 1 of data frames.
I do not use ggplot2 but here is an illustration of how to accomplish what you want. It will be easiest to add a column to your data.frames indicating the source of each data.frame. Then calculate whatever metric you want, by source, then plot. Alternatively, you could calculate the metrics first, then combine the data.frames.
library(RColorBrewer)
library(data.table)
set.seed(1234)
#' Simulate one data source.
#'
#' Draws a random group size between 5 and 10, then generates that many normal
#' values (sd 1) for each of three groups with means 5, 10 and 15.
#'
#' @return A data frame with columns `id` ("A", "B" or "C") and `vals`.
make_data <- function() {
  group_size <- sample(5:10, 1)
  group_means <- c(5, 10, 15)
  # Draw the groups one after another so the random stream is consumed in
  # group order (A, then B, then C).
  draws <- lapply(group_means, function(m) rnorm(group_size, m, 1))
  data.frame(
    id = rep(c("A", "B", "C"), each = group_size),
    vals = unlist(draws)
  )
}
# Simulate four independent data sources.
df1 <- make_data()
df2 <- make_data()
df3 <- make_data()
df4 <- make_data()
# Tag every row with the source it came from so the sources stay
# distinguishable after stacking.
df1$src <- "source1"
df2$src <- "source2"
df3$src <- "source3"
df4$src <- "source4"
# Stack the four sources into one table.
dat <- do.call(rbind, list(df1, df2, df3, df4))
dat <- as.data.table(dat)
# Mean of `vals` per (id, src) pair, sorted by id; the unnamed aggregate is
# read back below as column `V1`.
res <- dat[ , mean(vals), by = list(id, src)][order(id)]
# One bar per (id, src) pair. The colour vector is 4 palette colours repeated
# 3 times, i.e. hard-coded to the 4 sources and 3 ids created above.
barplot(height = res$V1, col = rep(brewer.pal(4, "Set1"), 3))
EDIT
Here is the ggplot2 code provided by Sumedh:
library(ggplot2)
ggplot(res, aes(x = id, y = V1, fill = src)) +
geom_bar(stat = "identity", position = "dodge")

How to combine geom_bar for three dataframe?

Suppose I have:
a = data.frame(a = sample(1:10, 20, replace = T))
b = data.frame(b = sample(1:11, 19, replace = T))
c = data.frame(c = sample(1:9, 21, replace = T))
a.a = ggplot(data = a, aes(a)) + geom_bar()
b.b = ggplot(data = b, aes(b)) + geom_bar()
c.c = ggplot(data = c, aes(c)) + geom_bar()
How can I combine a.a, b.b and c.c into one plot? Like
I have tried
d = ggplot() +
geom_bar(data = a.a, aes(a)) +
geom_bar(data = b.b, aes(b)) +
geom_bar(data = c.c, aes(c))
d
But it doesn't work...
Combine them into a single "long" data frame that has a grouping column marking which data frame each row came from.
library(reshape2)
library(dplyr)
# Individual data frames
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
a <- data.frame(a = sample(1:10, 20, replace = TRUE))
b <- data.frame(b = sample(1:11, 19, replace = TRUE))
c <- data.frame(c = sample(1:9, 21, replace = TRUE))
Combine data frames in "long" format. The data frames have different numbers of rows, so we need our new grouping variable (called data_source below) to repeat each data frame's name a number of times equal to the number of rows in that data frame. We use the rep function to take care of this. One way is as follows: rep(c("a","b","c"), times=c(nrow(a), nrow(b), nrow(c))); however, I use sapply below because it seemed cleaner (though perhaps more opaque).
df = data.frame(value =c(a$a,b$b,c$c),
data_source=rep(c("a","b","c"), times=sapply(list(a,b,c), nrow)))
# Pre-summarise counts in order to add zero counts for empty categories
df.summary = df %>% group_by(data_source, value) %>%
tally %>%
dcast(data_source ~ value, value.var="n", fill=0) %>%
melt(id.var="data_source", variable.name="value", value.name="n")
ggplot(df.summary, aes(value, n, fill=data_source)) +
geom_bar(stat="identity", position="dodge", colour="grey20", lwd=0.3)
If we didn't have some categories with zero counts (for example, data frames b and c have no values equal to 10), then we could just do this:
ggplot(df, aes(factor(value), fill=data_source)) +
geom_bar(position="dodge", colour="grey20", lwd=0.3)
But then note how ggplot expands the remaining bars when one or two data frames don't contain a given value:

Resources