I want to show different barplots for the years and gender with the mean values of the variables Q1 to Q5, which should look like a density.
I have data that looks like this:
# Example survey data: three users, each observed in 2013-2015, with a
# gender flag and five question scores (Q1..Q5).
data <- data.frame(
  userid = rep(c(1, 2, 3), each = 3),
  year   = rep(c(2013, 2014, 2015), times = 3),
  gender = rep(c(1, 0), times = c(3, 6)),
  Q1     = c(3, 2, 3, 1, 0, 1, 2, 1, 0),
  Q2     = c(4, 3, 4, 2, 0, 2, 1, 4, 3),
  Q3     = c(1, 2, 1, 3, 5, 4, 5, 4, 5),
  Q4     = c(1, 2, 1, 2, 4, 3, 2, 2, 1),
  Q5     = c(1, 1, 1, 2, 1, 0, 0, 0, 1)
)
My solution was to filter() for year and gender first and then use summarise(),
to get a vector of the means and put this into the barplot() function:
# Mean of every question for one gender/year combination, drawn as a
# base-graphics bar plot.
data %>%
  filter(gender == 1, year == 2013) %>%
  select(-userid, -gender, -year) %>%
  summarise(across(everything(), mean)) %>%
  as.numeric() %>%
  barplot()
Instead of doing this for every combination of year and gender,
is there a more elegant way, using ggplot and facet_wrap()?
I may have misunderstood how you want the plot arranged, but if you want to show the mean score answer per year and gender group, you could do facets like this:
library(tidyverse)
data %>%
  # One row per (user, year, question) so the questions can share an axis.
  pivot_longer(cols = starts_with("Q"),
               names_to = "name", values_to = "value") %>%
  group_by(year, gender, name) %>%
  summarise(value = mean(value)) %>%
  ggplot(mapping = aes(x = name, y = value)) +
  geom_col(fill = "deepskyblue4") +
  # Years down the rows, gender across the columns.
  facet_grid(rows = vars(year), cols = vars(gender)) +
  labs(x = "Question", y = "Average score") +
  theme_minimal(base_size = 16)
Maybe you want something like this with facet_wrap and geom_col, where the mean is calculated using rowMeans:
library(dplyr)
library(ggplot2)
data %>%
  # Per-row average over all columns whose name starts with "Q".
  mutate(mean = rowMeans(across(starts_with("Q")), na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = year, y = mean, fill = factor(gender))) +
  geom_col() +
  facet_wrap(vars(userid)) +
  theme_bw() +
  labs(x = "Year", y = "Mean Q1 to Q5", fill = "Gender")
Created on 2022-10-28 with reprex v2.0.2
First, pivot your data from wide to long format, group by year, gender, and Q, and summarize to mean values.
library(tidyr)
library(dplyr)
library(ggplot2)
# Long format: one mean value per year / gender / question.
data_long <- data %>%
  pivot_longer(
    cols = all_of(paste0("Q", 1:5)),
    names_to = "Q",
    values_to = "value"
  ) %>%
  group_by(year, gender, Q) %>%
  summarise(value = mean(value), .groups = "drop")
data_long
# A tibble: 30 × 4
year gender Q value
<dbl> <dbl> <chr> <dbl>
1 2013 0 Q1 1.5
2 2013 0 Q2 1.5
3 2013 0 Q3 4
4 2013 0 Q4 2
5 2013 0 Q5 1
6 2013 1 Q1 3
7 2013 1 Q2 4
8 2013 1 Q3 1
9 2013 1 Q4 1
10 2013 1 Q5 1
# … with 20 more rows
Then plot using ggplot2::facet_grid().
# One panel per year (rows) and gender (columns), one bar per question.
ggplot(data = data_long, mapping = aes(x = Q, y = value)) +
  geom_col() +
  facet_grid(rows = vars(year), cols = vars(gender))
Use aggregate() to compute the means, then barplot().
# One panel per year plus a final panel that only holds the legend; the
# panel count follows the data instead of being hard-coded.
years <- unique(data$year)
old_par <- par(mfrow = c(1, length(years) + 1))
for (yr in years) {
  # Mean of Q1..Q5 per gender within this year. Dropping the first
  # (gender) column leaves one row per gender and one column per question.
  means <- aggregate(cbind(Q1, Q2, Q3, Q4, Q5) ~ gender,
                     data[data$year == yr, ], FUN = mean)[-1]
  barplot(as.matrix(means), beside = TRUE, col = c(2, 4), main = yr)
}
plot.new()
# NOTE(review): assumes the first row (colour 2) is the gender labelled
# 'm' -- confirm against how gender is coded.
legend('left', legend = c('m', 'f'), col = c(2, 4), cex = 1.2, pch = 15,
       bty = 'n')
# Restore the previous layout so later plots are not squeezed into panels.
par(old_par)
This approach does not require you to first calculate the mean, that is handled by stat_summary(), specifying fun = mean.
library(tidyverse)
# Example survey data: three users, three years, five question scores.
data <- data.frame(
  userid = rep(c(1, 2, 3), each = 3),
  year   = rep(c(2013, 2014, 2015), times = 3),
  gender = rep(c(1, 0), times = c(3, 6)),
  Q1     = c(3, 2, 3, 1, 0, 1, 2, 1, 0),
  Q2     = c(4, 3, 4, 2, 0, 2, 1, 4, 3),
  Q3     = c(1, 2, 1, 3, 5, 4, 5, 4, 5),
  Q4     = c(1, 2, 1, 2, 4, 3, 2, 2, 1),
  Q5     = c(1, 1, 1, 2, 1, 0, 0, 0, 1)
)

data %>%
  # Keep only the question columns, then stack them into name/value pairs.
  select(starts_with("Q")) %>%
  pivot_longer(cols = everything(),
               names_to = "name", values_to = "value") %>%
  ggplot(mapping = aes(x = name, y = value, fill = name)) +
  # The mean is computed by the stat itself; swap `fun` or `geom` as needed.
  stat_summary(geom = "bar", fun = mean) +
  labs(x = "Q", y = "Mean value", fill = NULL) +
  theme_classic()
Created on 2022-10-28 by the reprex package (v2.0.1)
I am trying to create a heatmap that should assign colors based on % vaccinated for each month (for each row)
for example Comparison by colors between all states in month of Jan, then
for example Comparison by colors between all states in month of March .. .
then Apr ... Jun etc
Issue: I would like each month to have its own low-to-high colour scale. I am trying to do that with facets, but ggplot assigns one common low-to-high scale to all the facets/months.
library(tidyverse)
library(lubridate)
library(scales)
# Keep the location as a plain string: read.csv() opens the URL itself.
# Wrapping an existing connection in url() again, as before, is an error.
file_url1 <- "https://raw.githubusercontent.com/johnsnow09/covid19-df_stack-code/main/df_vaccination.csv"
df_vaccination <- read.csv(file_url1)
# Parse the update timestamp column into Date so month() can be applied.
df_vaccination <- df_vaccination %>%
  mutate(Updated.On = as.Date(Updated.On))
Code: I have tried
# Heatmap of the monthly share of each state's population vaccinated,
# one facet per month, restricted to Delhi and Maharashtra.
df_vaccination %>%
# drop the rows whose State is "India"
filter(State != "India") %>%
# summarise each month, state's vaccination
# (month label from Updated.On; State reordered by its max Population)
mutate(month_abbr = month(Updated.On, label = TRUE, abbr = TRUE),
State = fct_reorder(State, Population, max)) %>%
group_by(month_abbr, State) %>%
summarise(monthly_ind_vaccinated = sum(Total.Individuals.Vaccinated_Dailycalc,
na.rm = TRUE),
Population = first(Population), .groups = "drop") %>%
# get % Vaccination to State population for each month
group_by(State) %>%
mutate(prc_vaccinated_per_pop = monthly_ind_vaccinated / Population) %>%
# remove rows containing NA, then drop the State grouping
na.omit() %>%
ungroup() %>%
filter(State %in% c("Delhi","Maharashtra")) %>%
# group_by(month_abbr) %>%
ggplot(aes(x = State, y = month_abbr, fill = prc_vaccinated_per_pop)) +
geom_tile() +
# a single fill scale; as described in the question, it ends up shared by
# every facet rather than being rescaled per month
scale_fill_gradient2(low = "white", high = "darkblue", labels = percent) +
# one facet row per month; only the y axis is freed between facets
facet_wrap(~as.factor(month_abbr), scales = "free_y", nrow = 6) +
# rotate the state labels and hide the facet strip titles
theme(axis.text.x = element_text(angle = 90, vjust = -.02),
strip.text = element_blank()) +
labs(title = "States with highest % Vaccination each month ?",
caption = "created by ViSa",
fill = "% Vaccinated each month",
x = "", y = "")
output:
I think that because the colour is mapped through fill, different scales cannot be applied to different facets.
Is there anything like (scales = free_fill) instead of (scales = free_y) ?
data output:
# A tibble: 12 x 5
# Groups: month_abbr [6]
month_abbr State monthly_ind_vaccina~ Population prc_vaccinated_per_~
<ord> <fct> <int> <dbl> <dbl>
1 Jan Delhi 43948 18710922 0.00235
2 Jan Maharash~ 228424 123144223 0.00185
3 Feb Delhi 322859 18710922 0.0173
4 Feb Maharash~ 794370 123144223 0.00645
5 Mar Delhi 666628 18710922 0.0356
6 Mar Maharash~ 4590035 123144223 0.0373
7 Apr Delhi 1547324 18710922 0.0827
8 Apr Maharash~ 7942882 123144223 0.0645
9 May Delhi 1613335 18710922 0.0862
10 May Maharash~ 4455440 123144223 0.0362
11 Jun Delhi 250366 18710922 0.0134
12 Jun Maharash~ 1777873 123144223 0.0144
Let's say I have a data.frame consisting of industry type and starting and ending dates (e.g. for an employee).
# Example employment spells: one row per job, with the industry and the
# first and last date of the spell (kept as character strings).
mydf <- data.frame(
  industry = c(
    "Government", "Education", "Military",
    "Private Sector", "Government", "Private Sector"
  ),
  start_date = c(
    "2014-01-01", "2016-02-01", "2012-11-01",
    "2013-03-01", "2012-12-01", "2011-12-01"
  ),
  end_date = c(
    "2020-12-01", "2016-10-01", "2014-01-01",
    "2016-10-01", "2015-10-01", "2014-09-01"
  )
)
> mydf
industry start_date end_date
1 Government 2014-01-01 2020-12-01
2 Education 2016-02-01 2016-10-01
3 Military 2012-11-01 2014-01-01
4 Private Sector 2013-03-01 2016-10-01
5 Government 2012-12-01 2015-10-01
6 Private Sector 2011-12-01 2014-09-01
I'd like to create a stacked ggplot bar chart in which each unique year in the start_date column is on the X axis (e.g. 2011-2016) and the y axis represents the total number of observations (the row count) represented in a given industry for that year.
I'm not sure what the right way to manipulate the data.frame is to allow for this. Presumably I'd need to reshape the data to have columns for industry, year and count, but I'm not sure how to produce the year column from a date range. Any ideas?
Convert the date columns to Date, create the 'date' sequence from 'start_date' to 'end_date' for each row with map2 (from purrr), unnest the list output, count by year and plot with geom_col
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
# Stacked bars: number of observations (rows of mydf) active in each year,
# split by industry.
mydf %>%
  mutate(across(c(start_date, end_date), as.Date)) %>%
  # Keep a row id so every original observation is counted once per year,
  # no matter how many days of that year it covers.
  transmute(
    id = row_number(),
    industry,
    date = map2(start_date, end_date, seq, by = "day")
  ) %>%
  unnest(c(date)) %>%
  # year() is namespace-qualified because lubridate is not attached here.
  distinct(id, industry, year = factor(lubridate::year(date))) %>%
  count(industry, year) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  geom_col() +
  theme_bw()
If the plot should be separate for each 'industry'
# Same counts as a stacked chart, but one panel per industry.
mydf %>%
  mutate(across(c(start_date, end_date), as.Date)) %>%
  # Keep a row id so every original observation is counted once per year,
  # no matter how many days of that year it covers.
  transmute(
    id = row_number(),
    industry,
    date = map2(start_date, end_date, seq, by = "day")
  ) %>%
  unnest(c(date)) %>%
  # year() is namespace-qualified because lubridate is not attached here.
  distinct(id, industry, year = factor(lubridate::year(date))) %>%
  count(industry, year) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  geom_col() +
  facet_wrap(~ industry) +
  theme_bw()
-output
As @IanCampbell suggested, the `by` argument for seq can be 'year'
# One date per calendar year covered by each row, counted per industry.
mydf %>%
  mutate(across(c(start_date, end_date), as.Date)) %>%
  transmute(
    industry,
    # Step from 1 January of the start year: stepping from start_date
    # itself skips the final year whenever the end month is earlier than
    # the start month (e.g. 2011-12-01 to 2014-09-01 would stop at 2013).
    date = map2(lubridate::floor_date(start_date, "year"), end_date,
                seq, by = "year")
  ) %>%
  unnest(c(date)) %>%
  # year() is namespace-qualified because lubridate is not attached here.
  count(industry, year = factor(lubridate::year(date))) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  geom_col() +
  facet_wrap(~ industry) +
  theme_bw()
Is this what you're looking for?
I would recommend using purrr::pmap to create a new data frame with one row for each year based on each row of the original data.
We can use the purrr::pmap_dfr to automatically return a single data frame bound by row.
We can use the ~with(list(...), ) trick to be able to reference columns by name.
Then we can use dplyr::count to count by combinations of columns. Then it's easy.
library(dplyr)
library(purrr)
library(lubridate)
library(ggplot2)  # the package is ggplot2; library(ggplot) fails to load

# Expand every row into one row per calendar year it spans, then count
# rows per year and industry.
mydf %>%
  mutate(across(c(start_date, end_date), as.Date),
         start_year = year(start_date),
         end_year = year(end_date)) %>%
  # with(list(...), ) exposes the columns of the current row by name.
  pmap_dfr(~ with(list(...),
                  data.frame(industry,
                             year = seq(start_year, end_year)))) %>%
  # A factor gives one discrete tick per year rather than a continuous axis.
  count(year = factor(year), industry) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  geom_col()
I recently started working with dygraphs in R, and wanted to achieve a ribbon line plot with it.
Currently, I have the below ggplot which displays a ribbon (for data from multiple batches over time) and its median for two groups. Below is the code for it.
# Ribbon plus line per category; both layers use stat_summary() without an
# explicit summary function.
ggplot(
  data = df,
  mapping = aes(x = variable, y = A,
                colour = `[category]`, fill = `[category]`)
) +
  stat_summary(geom = "ribbon", alpha = 0.35) +
  stat_summary(geom = "line", size = 0.9) +
  labs(x = "TimeStamp") +
  theme_minimal()
I could add the median solid line on the dygraph, but I'm unable to add the ribbon to it. Below is the dygraph and my code for it.
# Median of A (ignoring NA) for each x position and category.
df_Medians <- df %>%
  group_by(variable, `[category]`) %>%
  summarise(A = median(A, na.rm = TRUE))

# dygraph() is fed a one-column time-series matrix of the medians.
median <- cbind(as.ts(df_Medians$A))
dygraph(median) %>%
  dyRangeSelector()
Is there any way to plot something similar to the above ggplot with dygraphs? Thanks in advance.
See if the following serves your purpose:
ggplot code (for mean, replace median_se with mean_se in the stat_summary layers):
library(ggplot2)
# Ribbon (ymin..ymax) and centre line (y), both summarised by median_se().
ggplot(
  data = df,
  mapping = aes(x = variable, y = A, colour = category, fill = category)
) +
  stat_summary(fun.data = median_se, geom = "ribbon", alpha = 0.35) +
  stat_summary(fun.data = median_se, geom = "line", size = 0.9) +
  theme_minimal()
dygraph code (for mean, replace median_se with mean_se in the summarise step):
library(dplyr)
library(dygraphs)  # the package is "dygraphs"; library(dygraph) fails
# calculate summary statistics for each category, & spread results out such that each row
# corresponds to one position on the x-axis
df_dygraph <- df %>%
  group_by(variable, category) %>%
  # median_se() returns a one-row data frame (y, ymin, ymax); keep it in a
  # list column and expand it in the next step.
  summarise(data = list(median_se(A)), .groups = "drop") %>%
  tidyr::unnest(data) %>%
  # optional: standardizes the column names for summary stats
  # (categories become 1, 2, ... so the wide columns are ymin_1, y_1, ...)
  mutate(category = as.integer(factor(category))) %>%
  tidyr::pivot_wider(id_cols = variable, names_from = category,
                     values_from = c(ymin, y, ymax))
> head(df_dygraph)
# A tibble: 6 x 7
variable ymin_1 ymin_2 y_1 y_2 ymax_1 ymax_2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 3817. 2712. 4560. 2918. 5304. 3125.
2 2 3848. 2712. 4564. 2918. 5279. 3125.
3 3 3847. 2826. 4564 2961 5281. 3096.
4 4 3722. 2827. 4331 2962. 4940. 3098.
5 5 3833. 2831. 4570. 2963 5306. 3095.
6 6 3835. 2831. 4572 2964 5309. 3097.
# Each dySeries() call takes the lower / centre / upper columns of one
# category, which dygraphs draws as a line with a shaded band.
df_dygraph %>%
  dygraph(main = "Dygraph title") %>%
  dySeries(name = c("ymin_1", "y_1", "ymax_1"), label = "Category 1") %>%
  dySeries(name = c("ymin_2", "y_2", "ymax_2"), label = "Category 2") %>%
  dyRangeSelector()
Code for median counterpart of mean_se:
# Median counterpart of ggplot2's mean_se().
#
# x: numeric vector; NA values are dropped before any computation.
# Returns a one-row data frame with columns
#   y    - the median of x,
#   ymin - median minus one standard error,
#   ymax - median plus one standard error,
# where the standard error is sqrt(var(x) / length(x)).
median_se <- function(x) {
  x <- x[!is.na(x)]
  se <- sqrt(stats::var(x) / length(x))
  med <- stats::median(x)
  # Build the result with base data.frame() instead of the unexported
  # ggplot2:::new_data_frame(), which newer ggplot2 releases do not provide.
  data.frame(y = med, ymin = med - se, ymax = med + se)
}
Sample data:
# Sample data: the first 1000 "Fair" and first 1000 "Ideal" diamonds, with
# prices as A, cut as category, and an x position cycling through 1..50.
df <- diamonds %>%
  select(A = price, category = cut) %>%
  filter(category %in% c("Fair", "Ideal")) %>%
  group_by(category) %>%
  slice_head(n = 1000) %>%
  mutate(variable = rep(seq_len(50), times = 20)) %>%
  ungroup()
I have a data frame with data for max 2 years period on different objects:
ISBN Date Quantity
3457 2004-06-15 10
3457 2004-08-16 6
3457 2004-08-19 10
3457 2005-04-19 7
3457 2005-04-20 12
9885 2013-01-15 10
9885 2013-03-16 6
9855 2013-08-19 10
9885 2014-09-19 7
9885 2014-09-20 12
How can I plot Jan to Dec for the 1st year, continued by Jan to Dec for the 2nd year?
I guess the idea is to normalize the years (to have 1st, 2nd), but not the months. (here's an example)
Number of Items Sold over 2 Years Period Since Release
I'd use the lubridate package for something like this. Note I am calling for dataframe df because you didn't give it a name.
So for example:
library(lubridate)
First format the date like so:
# Parse the year-month-day values in the Date column into dates.
df[["Date"]] <- ymd(df[["Date"]])
Then extract the month and the year:
# Abbreviated month label and calendar year of every date.
df[["Month"]] <- month(df[["Date"]], label = TRUE, abbr = TRUE)
df[["Year"]] <- year(df[["Date"]])
From there you can plot your results with ggplot2:
library(ggplot2)
# Year is numeric, so mapping it directly gives a continuous colour
# gradient; converting it to a factor gives one distinct colour per year.
ggplot(df, aes(x = Month, y = Quantity, colour = factor(Year))) +
  geom_point() +
  labs(colour = "Year")
Note your question could be asked better here as you haven't provided a reproducible example.
You could try:
# Monthly totals per ISBN, with calendar years replaced by their rank
# since the first observed year ("1st Year", "2nd Year", ...).
# toOrdinal() must be available, e.g. via library(toOrdinal).
data <- df %>%
  group_by(ISBN) %>%
  arrange(Date) %>%
  mutate(Year = year(Date),
         Month = month(Date, label = TRUE),
         # Running count of year changes within each ISBN, as an ordinal.
         Rank = paste(vapply(cumsum(Year != lag(Year, default = 0)),
                             toOrdinal, character(1)),
                      "Year")) %>%
  # `.add` replaces the deprecated `add` argument of group_by().
  group_by(Rank, Month, .add = TRUE) %>%
  summarise(Sum = sum(Quantity), .groups = "drop")

ggplot(data = data, aes(x = Month, y = Sum,
                        group = factor(ISBN),
                        colour = factor(ISBN))) +
  geom_line() +
  facet_grid(. ~ Rank) +
  scale_colour_discrete(name = "ISBN") +
  # `panel.spacing` replaces the deprecated `panel.margin` theme element;
  # zero spacing makes the yearly panels read as one continuous axis.
  theme(panel.spacing = unit(0, "lines"),
        axis.text.x = element_text(angle = 90))
Assuming the following df:
# Fix the RNG seed so the example data, and the plot built from it, can be
# reproduced exactly.
set.seed(42)
# 1000 random sales records: one of two ISBNs, a month start between
# 2004-01 and 2011-12, and a quantity between 1 and 12.
df <- data.frame(
  ISBN = sample(c(3457, 9885), 1000, replace = TRUE),
  Date = sample(seq(as.Date('2004/01/01'),
                    as.Date('2011/12/31'), by = "month"),
                1000, replace = TRUE),
  Quantity = sample(1:12, 1000, replace = TRUE)
)
This would produce: