Here is the codes and the present outplot
df <- data.frame(state = c('0','1'),
male = c(26287942,9134784),
female = c(16234000,4406645))
#output
> df
state male female
1 0 26287942 16234000
2 1 9134784 4406645
library(ggplot2)
library(tidyr)
df_long <- pivot_longer(df, cols = c("female","male"))
names(df_long) <- c('state','sex','observations')
ggplot(data = df_long) +
geom_col(aes(x = sex, y =observations, fill = state)) +
theme(legend.position = c(0.1,0.9),
legend.background = element_rect(fill='lightgrey') )
I want to adjust the plots like this. (I marked what I want to change.)
Simplify the scientific records in y-axis.
Count the ratio (the number of state 1)/(the number of state 0 + state 1) and plot like this.
It may be a little complicated, and I don't know which functions to use. If possible, can anyone tell me some related functions or examples?
You can set options(scipen = 99) to disable scientific notation on y-axis. We can create a separate dataset for label data.
library(tidyverse)
options(scipen = 99)
long_data <- df %>%
pivot_longer(cols = c(male, female),
names_to = "sex",
values_to = "observations")
label_data <- long_data %>%
group_by(sex) %>%
summarise(perc = observations[match(1, state)]/sum(observations),
total = sum(observations), .groups = "drop")
ggplot(long_data) +
geom_col(aes(x = sex, y = observations, fill = state)) +
geom_text(data = label_data,
aes(label = round(perc, 2), x = sex, y = total),
vjust = -0.5) +
theme(legend.position = c(0.1,0.9),
legend.background = element_rect(fill='lightgrey'))
By searching the Internet for about two days, I have finished the work!
sex <- c('M','F')
y0 <- c(26287942,16234000)
y1 <- c(9134784, 4406645)
y0 <- y0*10^{-7}
y1 <- y1*10^{-7}
ratio <- y1/(y0+y1)
ratio <- round(ratio,2)
m <- t(matrix(c(y0,y1),ncol=2))
colnames(m) <- c(as.character(sex))
df <- as.data.frame(m)
df <- cbind(c('0','1'),df)
colnames(df)[1] <- 'observations'
df
df_long <- pivot_longer(df, cols = as.character(sex))
names(df_long) <- c('state','sex','observations')
df_r <- as.data.frame(df_long)
df_r <- data.frame(df_r,ratio=rep(ratio,2))
ggplot(data = df_r) +
geom_col(aes(x =sex, y = observations, fill = state))+
theme(legend.position = c(0.1,0.9),
legend.background = element_rect(fill=NULL) )+
geom_line(aes(x=sex,y=ratio*10),group=1)+
geom_point(aes(x=sex,y=ratio*10))+
geom_text(aes(x=sex,y=ratio*10+0.35),label=rep(ratio,2))+
scale_y_continuous(name =expression(paste('observations(','\u00D7', 10^7,')')),
sec.axis = sec_axis(~./10,name='ratio'))
The output:
Given two monthly time series data sample from this link.
I will need to create one plot containing 3 subplots: plot1 for the original values, plot2 for month over month changes, and plot3 for year over year changes.
I'm able to draw the plot with code below, but the code is too redundant. So my question is how could achieve that in a concise way? Thanks.
library(xlsx)
library(ggplot2)
library(reshape)
library(dplyr)
library(tidyverse)
library(lubridate)
library(cowplot)
library(patchwork)
df <- read.xlsx('./sample_data.xlsx', 'Sheet1')
colnames(df)
# df
cols <- c('food_index', 'energy_index')
df <- df %>% mutate(date=as.Date(date)) %>%
mutate(across(-contains('date'), as.numeric)) %>%
mutate(date= floor_date(date, 'month')) %>%
group_by(date) %>%
summarise_at(vars(cols), funs(mean(., na.rm=TRUE))) %>%
mutate(across(cols, list(yoy = ~(. - lag(., 12))/lag(., 12)))*100) %>%
mutate(across(cols, list(mom = ~(. - lag(., 1))/lag(., 1)))*100) %>%
filter(date >= '2018-01-01' & date <= '2021-12-31') %>%
as.data.frame()
df1 <- df %>%
select(!grep('mom|yoy', names(df)))
df1_long <- melt(df1, id.vars = 'date')
plot1 <- ggplot(df1_long[!is.na(df1_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: $'
)
# MoM changes
df2 <- df %>%
select(grep('date|mom', names(df)))
df2_long <- melt(df2, id.vars = 'date')
plot2 <- ggplot(df2_long[!is.na(df2_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
# YoY changes
df3 <- df %>%
select(grep('date|yoy', names(df)))
df3_long <- melt(df3, id.vars = 'date')
plot3 <- ggplot(df3_long[!is.na(df3_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
plot <- plot1 + plot2 + plot3 + plot_layout(ncol=1)
# plot <- plot_grid(plot1, plot2, plot3, labels = c('Value', 'MoM', 'YoY'), label_size = 12)
plot
Out:
The expected result will be similar to the plot below (the upper plot will display the original data, the middle plot will display the mom changes data, and the lower plot will display the yoy changes data):
References:
https://waterdata.usgs.gov/blog/beyond-basic-plotting/
http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
Side-by-side plots with ggplot2
Maybe this is what you are looking for? By reshaping your data to the right shape, using a plot function and e.g. purrr::map2 you could achieve your desired result without duplicating your code like so.
Using some fake random example data to mimic your true data:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
labs(
x = "",
y = ylab
)
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
plots <- purrr::map2(yvars, ylabs, plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
DATA
set.seed(123)
date <- seq.POSIXt(as.POSIXct("2017-01-31"), as.POSIXct("2022-12-31"), by = "month")
food_index <- runif(length(date))
energy_index <- runif(length(date))
df <- data.frame(date, food_index, energy_index)
EDIT Adding subtitles to each plot when using patchwork is (as of the moment) a bit tricky. What I would do in this case would be to use a faceting "hack". To this end I slightly adjusted the function to take a subtitle argument and switched to purrr::pmap:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab, subtitle) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
facet_wrap(~.env$subtitle) +
labs(
x = "",
y = ylab
) +
theme(strip.background = element_blank(), strip.text.x = element_text(hjust = 0))
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
subtitle <- c("Original", "Month-to-Month", "Year-to-Year")
plots <- purrr::pmap(list(y = yvars, ylab = ylabs, subtitle = subtitle), plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
The target output is done with facets rather than stitching plots together. You could do this too if you like, but it requires reshaping your data in a different way. Which approach you take is really a matter of taste.
library(ggplot2)
library(dplyr)
yoy <- function(x) 100 * (x - lag(x, 13)) / lag(x, 12)
mom <- function(x) 100 * (x - lag(x)) / lag(x)
df %>%
mutate(date = as.Date(date, origin = "1899-12-30"),
`Actual value (Dollars).Food Index` = food_index,
`Month-on-month change (%).Food Index` = mom(food_index),
`Year-on-year change (%).Food Index` = yoy(food_index),
`Actual value (Dollars).Energy Index` = energy_index,
`Month-on-month change (%).Energy Index` = mom(energy_index),
`Year-on-year change (%).Energy Index` = yoy(energy_index)) %>%
select(-food_index, -energy_index) %>%
tidyr::pivot_longer(-1) %>%
filter(date > as.Date("2018-01-01")) %>%
tidyr::separate(name, into = c("series", "index"), sep = "\\.") %>%
ggplot(aes(date, value, color = index)) +
geom_point(na.rm = TRUE) +
geom_line() +
facet_grid(series~., scales = "free_y") +
theme_bw(base_size = 16)
Reproducible data taken from link in question
df <- structure(list(date = c(42766, 42794, 42825, 42855, 42886, 42916,
42947, 42978, 43008, 43039, 43069, 43100, 43131, 43159, 43190,
43220, 43251, 43281, 43312, 43343, 43373, 43404, 43434, 43465,
43496, 43524, 43555, 43585, 43616, 43646, 43677, 43708, 43738,
43769, 43799, 43830, 43861, 43890, 43921, 43951, 43982, 44012,
44043, 44074, 44104, 44135, 44165, 44196, 44227, 44255, 44286,
44316, 44347, 44377, 44408, 44439, 44469, 44500, 44530, 44561
), food_index = c(58.53, 61.23, 55.32, 55.34, 61.73, 56.91, 54.27,
59.08, 60.11, 66.01, 60.11, 63.41, 69.8, 72.45, 81.11, 89.64,
88.64, 88.62, 98.27, 111.11, 129.39, 140.14, 143.44, 169.21,
177.39, 163.88, 135.07, 151.28, 172.81, 143.82, 162.13, 172.22,
176.67, 179.3, 157.27, 169.12, 192.51, 194.2, 179.4, 169.1, 193.17,
174.92, 181.92, 188.41, 192.14, 203.41, 194.19, 174.3, 174.86,
182.33, 182.82, 185.36, 192.41, 195.59, 202.6, 201.51, 225.01,
243.78, 270.67, 304.57), energy_index = c(127.36, 119.87, 120.96,
112.09, 112.19, 109.24, 109.56, 106.89, 109.35, 108.35, 112.39,
117.77, 119.52, 122.24, 120.91, 125.41, 129.72, 135.25, 139.33,
148.6, 169.62, 184.23, 204.38, 198.55, 189.29, 202.47, 220.23,
240.67, 263.12, 249.74, 240.84, 243.42, 261.2, 256.76, 258.69,
277.98, 289.63, 293.46, 310.81, 318.68, 310.04, 302.17, 298.62,
260.92, 269.29, 258.84, 241.68, 224.18, 216.36, 226.57, 235.98,
253.86, 267.37, 261.99, 273.37, 280.91, 291.84, 297.88, 292.78,
289.79)), row.names = c(NA, 60L), class = "data.frame")
I am trying to reproduce a map I found here: http://zevross.com/blog/2018/10/02/creating-beautiful-demographic-maps-in-r-with-the-tidycensus-and-tmap-packages/
I am using RStudio and am running the following code:
library(ggplot2) # For plotting
library(tidycensus) # For downloading Census data
library(tmap) # For creating tmap
library(tmaptools) # For reading and processing spatial data related to tmap
library(dplyr) # For data wrangling
library(sf) # For reading, writing and working with spatial objects
census_api_key("enter your API key here", overwrite = TRUE)
dat12 <- get_acs("county", table = "B27001", year = 2012,
output = "tidy", state = NULL, geometry = FALSE) %>%
rename(`2012` = estimate) %>%
select(-NAME, -moe)
dat16 <- get_acs("county", table = "B27001", year = 2016,
output = "tidy", state = NULL, geometry = TRUE, shift_geo = TRUE) %>%
rename(`2016` = estimate) %>%
select(-moe)
dat <- left_join(dat16, dat12, by = c("GEOID", "variable"))
st_geometry(dat) <- NULL # This drops the geometry and leaves a table
head(dat)
dat <- mutate(dat,
cat = case_when(
variable %in% paste0("B27001_0",
c("09","12","37","40")) ~ "pop1834",
variable %in% paste0("B27001_0",
c("11","14","39","42")) ~ "pop1834ni")) %>%
filter(!is.na(cat))
# Create long version
dat <- tidyr::gather(dat, year, estimate, c(`2012`, `2016`))
# Group the data by our new categories and sum
dat <- group_by(dat, GEOID, NAME, year, cat) %>%
summarize(estimate = sum(estimate)) %>%
ungroup() %>%
tidyr::spread(cat, estimate)
dat <- mutate(dat, est = (pop1834ni/pop1834) * 100) %>%
select(-c(pop1834, pop1834ni)) %>%
tidyr::spread(year, est) %>%
mutate(diff = `2016`-`2012`)
head(dat)
datlong <- select(dat, -diff) %>%
tidyr::gather(year, estimate, c(`2012`, `2016`)) %>%
group_by(year) %>%
mutate(med = round(median(estimate, na.rm = TRUE), 1))
ggplot(datlong, aes(estimate)) +
geom_histogram(fill = "firebrick2",
color = "white", bins = 60) +
xlab("Uninsured adults ages 18-34 by county (%)") +
theme(plot.title = element_text(hjust = 0.5)) +
facet_wrap(~year, ncol = 1) +
geom_vline(aes(xintercept = med,
group = year), lty = "dashed") +
geom_text(aes(label = paste("Median = ", med), x = med, y = 55))
d10 <- top_n(dat, 10, diff) %>%
mutate(type = "Insured population decreased",
difftemp = diff)
i10 <- top_n(dat, -10, diff) %>%
mutate(type = "Insured population increased",
difftemp = abs(diff))
id10 <- bind_rows(list(i10, d10)) %>%
arrange(desc(difftemp))
ggplot(id10) +
geom_col(aes(x = forcats::fct_reorder(NAME, difftemp),
y = difftemp, fill = type)) +
coord_flip() +
scale_fill_manual(values = c("firebrick2", "cyan4")) +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
legend.title = element_blank()) +
ggtitle("Counties with the greatest change (+/-) in
insured population, ages 18-34, 2012-2016") +
ylab("Difference in % insured (2016 - 2012)") +
xlab("")
shp <- dat16 %>%
filter(variable == "B27001_001") # much faster than using distinct()
select(GEOID, NAME) %>%
left_join(dat, by = c("GEOID", "NAME")) %>%
arrange(GEOID) %>%
rename(uninsured_2012 = `2012`,
uninsured_2016 = `2016`,
uninsured_diff = diff)
Up until the last bit of code, the one that begins with shp, everything runs perfect. Once,
shp <- dat16 %>%
filter(variable == "B27001_001") # much faster than using distinct()
select(GEOID, NAME) %>%
left_join(dat, by = c("GEOID", "NAME")) %>%
arrange(GEOID) %>%
rename(uninsured_2012 = `2012`,
uninsured_2016 = `2016`,
uninsured_diff = diff)
is run, I get the following error:
Error in select(GEOID, NAME) : object 'GEOID' not found
I have checked dat16 and dat. GEOID and NAME are present there. I am not sure what is wrong with the SELECT function as I have not loaded another library which may interfere with it. Any help would be appreciated.
I see now what was missing, a %>% (pipe) following the 'filter':
shp <- dat16 %>%
filter(variable == "B27001_001") %>% # much faster than using distinct()
select(GEOID, NAME) %>%
left_join(dat, by = c("GEOID", "NAME")) %>%
arrange(GEOID) %>%
rename(
uninsured_2012 = `2012`,
uninsured_2016 = `2016`,
uninsured_diff = diff
)
This question already has answers here:
R + ggplot2: how to hide missing dates from x-axis?
(3 answers)
Closed 5 years ago.
I would like to plot the development of different indexes with ggplot2. My problem is, that 2018-02-03 and 2018-02-04 are non-working days, and thus there is no data available for these days, but when plotting ggplot2 adds them by extrapolating. How could I omit the non-business days, so that I get 2018-02-02 followed by 2018-02-05?
library(tidyverse)
library(quantmod)
#retrieve data
getSymbols("BTCUSD=X;^DJI;^VIX;^GDAXI", from="2017-01-01")
#merge all dataframes together
df <- merge(`BTCUSD=X`,`DJI`, all = TRUE)
df <- merge(df, `VIX`, all = TRUE)
df <- merge(df, `GDAXI`, all = TRUE)
#creating a dataframe with first column as date that comes from xts object extracted by index()
df <- data.frame(date=index(df), coredata(df))
#selecting columns and filtering the time series start date
df_1 <- df%>% select(date, contains("Close"))%>% na.omit() %>% filter(date>"2018-01-25")
#df_1 <- df_1 %>%mutate(BTCUSD.X.Close=BTCUSD.X.Close/BTCUSD.X.Close[1], DJI.Close=DJI.Close/DJI.Close[1], GDAXI.Close=GDAXI.Close/GDAXI.Close[1], VIX.Close=VIX.Close/VIX.Close[1])
df_1 <- df_1 %>% gather(var, closing, 2:5)
png("indexes.png", width = 9, height = 6, units = 'in', res = 600)
plot_1 <- ggplot(data=df_1)+
geom_line(aes(x=date, y=closing))+
facet_wrap(~var, scales = "free")+
scale_x_date(breaks = df_1$date, date_minor_breaks = "1 day", date_labels = "%y-%m-%d")+
theme(text = element_text(size=7), axis.text.x = element_text(angle = 90, hjust = 1))
plot_1
dev.off()
plot_1
The package bdscale was designed for this purpose, once added, you can substitute your scale_x_date line with:
scale_x_bd(
business.dates = df_1$date,
max.major.breaks = 10,
labels = date_format("%y-%m-%d")
)
To produce this plot...
Fully reproducible code
library(tidyverse)
library(quantmod)
library(bdscale)
library(scales)
getSymbols("BTCUSD=X;^DJI;^VIX;^GDAXI", from = "2017-01-01")
df <- merge(`BTCUSD=X`,`DJI`, all = TRUE) %>%
merge(`VIX`, all = TRUE) %>%
merge(`GDAXI`, all = TRUE)
df <- data.frame(date = index(df), coredata(df))
df_1 <- df %>%
select(date, contains("Close")) %>%
na.omit %>%
filter(date > "2018-01-25") %>%
gather(var, closing,2:5)
ggplot(data = df_1, aes(x = date, y = closing)) +
geom_line() +
facet_wrap(~var, scales = "free") +
scale_x_bd(business.dates = df_1$date,
max.major.breaks = 10,
labels = date_format("%y-%m-%d")) +
theme(text = element_text(size = 7),
axis.text.x = element_text(angle = 90, hjust = 1))