Line plot with multible lines with data from Excel - r

I have a table in Excel that show which weapon where used to commit murder per year.
I want to read the data and plot it as a line plot with multiple lines:
However my code only gives me a gibberish graph:
This is my code:
library("readxl")
library(data.table)
library(ggplot2)
# Read excel data
res <- as.data.frame(readxl::read_excel("murders_per_modus_veapon.xlsx", sheet = 1))
res$r = c('Unknown','Knife/stabbing weapon','Axe','Firearms','Suffocation','Blunt violence','Other')
res = reshape::melt(res)
# Plot data
ggplot(res, aes(x=variable,y=value))+ geom_line()

You need to convert your variable column from factor to date. See below;
library(reshape2)
library(dplyr)
library(lubridate)
library(ggplot2)
reshape2::melt(res, value.name = "MR", variable.name = "Year") %>%
mutate(Year = make_date(as.character(Year), 1, 1)) %>%
ggplot(., aes(x=Year, y=MR, color = Modus)) +
geom_line() +
scale_x_date(date_breaks = "1 year",date_labels = "%Y")
#> Using Modus as id variables
Created on 2021-05-25 by the reprex package (v2.0.0)
Data:
read.table(text = "Modus 2018 2019 2020
Unknown 2 0 0
Knife_stabbing_weapon 8 14 16
Axe 1 1 0
Firearms 3 3 2
Suffocation 2 5 6
Blunt_violence 8 3 4
Other 1 1 0",
header = T, stringsAsFactors = F, check.names = F) -> res
You can still run this part from your own code to import the data, and then use mine for plotting;
res <- as.data.frame(readxl::read_excel("murders_per_modus_veapon.xlsx", sheet = 1))
res$r = c('Unknown','Knife/stabbing weapon','Axe','Firearms',
'Suffocation','Blunt violence','Other')

Since you have not provided reproducible data, I created tiny data that may look like a small subset of your original xlsx data:
df = data.frame(weapon = c("Unknown", "knife", "axe"),
x2018 = c(2, 8, 1),
x2019 = c(0, 14, 1),
x2020 = c(0, 16, 0))
Then, I make the data tidy using dplyr and tidyr. Finally I produce a line plot that you may be looking for:
df %>%
pivot_longer(cols = 2:4, names_to = "year", values_to = "amount") %>%
mutate(year = gsub("x", "", year)) %>%
ggplot(aes(as.numeric(year), amount, col = weapon)) +
geom_line() +
scale_x_continuous(breaks = c(2018, 2019, 2020))

This could be an option. Thanks to M-- for the data.
library(ggplot2)
library(ggrepel)
df <- df %>%
pivot_longer(
cols = c("2018", "2019", "2020")
) %>%
mutate(label = if_else(name == max(name), as.character(Modus), NA_character_)) %>%
mutate(name = as.factor(name))
p <- ggplot(df, aes(x=name, y=value, colour=Modus, group=Modus)) +
geom_point () +
geom_line(size = 0.8) +
theme_bw()
p + geom_label_repel(aes(label = label),
nudge_x = 1,
na.rm = TRUE) +
theme(legend.position = "none")

Related

How to get r to not remove a row in ggplot - geom_line

I'm trying to produce a graph of growth rates over time based upon the following data which has blanks in two groups.
When I try to make a growth plot of this using geom_line to join points there is no line for group c.
I'm just wondering if there is anyway to fix this
One option would be to get rid of the missing values which prevent the points to be connected by the line:
Making use of the code from the answer I provided on your previous question but adding tidyr::drop_na:
Growthplot <- data.frame(
Site = letters[1:4],
July = 0,
August = c(1, -1, NA, 2),
September = c(3, 2, 3, NA)
)
library(ggplot2)
library(tidyr)
library(dplyr, warn=FALSE)
growth_df <- Growthplot %>%
pivot_longer(-Site, names_to = "Month", values_to = "Length") %>%
mutate(Month = factor(Month, levels = c("July", "August", "September"))) %>%
drop_na()
ggplot(growth_df, aes(x = Month, y = Length, colour = Site, group = Site)) +
geom_point() +
geom_line()+
labs(color = "Site", x = "Month", y = "Growth in cm") +
theme(axis.line = element_line(colour = "black", size = 0.24))

How to adjust the the y axis like this in ggplot2?

Here is the codes and the present outplot
df <- data.frame(state = c('0','1'),
male = c(26287942,9134784),
female = c(16234000,4406645))
#output
> df
state male female
1 0 26287942 16234000
2 1 9134784 4406645
library(ggplot2)
library(tidyr)
df_long <- pivot_longer(df, cols = c("female","male"))
names(df_long) <- c('state','sex','observations')
ggplot(data = df_long) +
geom_col(aes(x = sex, y =observations, fill = state)) +
theme(legend.position = c(0.1,0.9),
legend.background = element_rect(fill='lightgrey') )
I want to adjust the plots like this. (I marked what I want to change.)
Simplify the scientific records in y-axis.
Count the ratio (the number of state 1)/(the number of state 0 + state 1) and plot like this.
It may be a little complicated, and I don't know which functions to use. If possible, can anyone tell me some related functions or examples?
You can set options(scipen = 99) to disable scientific notation on y-axis. We can create a separate dataset for label data.
library(tidyverse)
options(scipen = 99)
long_data <- df %>%
pivot_longer(cols = c(male, female),
names_to = "sex",
values_to = "observations")
label_data <- long_data %>%
group_by(sex) %>%
summarise(perc = observations[match(1, state)]/sum(observations),
total = sum(observations), .groups = "drop")
ggplot(long_data) +
geom_col(aes(x = sex, y = observations, fill = state)) +
geom_text(data = label_data,
aes(label = round(perc, 2), x = sex, y = total),
vjust = -0.5) +
theme(legend.position = c(0.1,0.9),
legend.background = element_rect(fill='lightgrey'))
By searching the Internet for about two days, I have finished the work!
sex <- c('M','F')
y0 <- c(26287942,16234000)
y1 <- c(9134784, 4406645)
y0 <- y0*10^{-7}
y1 <- y1*10^{-7}
ratio <- y1/(y0+y1)
ratio <- round(ratio,2)
m <- t(matrix(c(y0,y1),ncol=2))
colnames(m) <- c(as.character(sex))
df <- as.data.frame(m)
df <- cbind(c('0','1'),df)
colnames(df)[1] <- 'observations'
df
df_long <- pivot_longer(df, cols = as.character(sex))
names(df_long) <- c('state','sex','observations')
df_r <- as.data.frame(df_long)
df_r <- data.frame(df_r,ratio=rep(ratio,2))
ggplot(data = df_r) +
geom_col(aes(x =sex, y = observations, fill = state))+
theme(legend.position = c(0.1,0.9),
legend.background = element_rect(fill=NULL) )+
geom_line(aes(x=sex,y=ratio*10),group=1)+
geom_point(aes(x=sex,y=ratio*10))+
geom_text(aes(x=sex,y=ratio*10+0.35),label=rep(ratio,2))+
scale_y_continuous(name =expression(paste('observations(','\u00D7', 10^7,')')),
sec.axis = sec_axis(~./10,name='ratio'))
The output:

Write a function to plot original value, mom and yoy change for time series data in 3 subplots [duplicate]

Given two monthly time series data sample from this link.
I will need to create one plot containing 3 subplots: plot1 for the original values, plot2 for month over month changes, and plot3 for year over year changes.
I'm able to draw the plot with code below, but the code is too redundant. So my question is how could achieve that in a concise way? Thanks.
library(xlsx)
library(ggplot2)
library(reshape)
library(dplyr)
library(tidyverse)
library(lubridate)
library(cowplot)
library(patchwork)
df <- read.xlsx('./sample_data.xlsx', 'Sheet1')
colnames(df)
# df
cols <- c('food_index', 'energy_index')
df <- df %>% mutate(date=as.Date(date)) %>%
mutate(across(-contains('date'), as.numeric)) %>%
mutate(date= floor_date(date, 'month')) %>%
group_by(date) %>%
summarise_at(vars(cols), funs(mean(., na.rm=TRUE))) %>%
mutate(across(cols, list(yoy = ~(. - lag(., 12))/lag(., 12)))*100) %>%
mutate(across(cols, list(mom = ~(. - lag(., 1))/lag(., 1)))*100) %>%
filter(date >= '2018-01-01' & date <= '2021-12-31') %>%
as.data.frame()
df1 <- df %>%
select(!grep('mom|yoy', names(df)))
df1_long <- melt(df1, id.vars = 'date')
plot1 <- ggplot(df1_long[!is.na(df1_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: $'
)
# MoM changes
df2 <- df %>%
select(grep('date|mom', names(df)))
df2_long <- melt(df2, id.vars = 'date')
plot2 <- ggplot(df2_long[!is.na(df2_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
# YoY changes
df3 <- df %>%
select(grep('date|yoy', names(df)))
df3_long <- melt(df3, id.vars = 'date')
plot3 <- ggplot(df3_long[!is.na(df3_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
plot <- plot1 + plot2 + plot3 + plot_layout(ncol=1)
# plot <- plot_grid(plot1, plot2, plot3, labels = c('Value', 'MoM', 'YoY'), label_size = 12)
plot
Out:
The expected result will be similar to the plot below (the upper plot will display the original data, the middle plot will display the mom changes data, and the lower plot will display the yoy changes data):
References:
https://waterdata.usgs.gov/blog/beyond-basic-plotting/
http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
Side-by-side plots with ggplot2
Maybe this is what you are looking for? By reshaping your data to the right shape, using a plot function and e.g. purrr::map2 you could achieve your desired result without duplicating your code like so.
Using some fake random example data to mimic your true data:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
labs(
x = "",
y = ylab
)
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
plots <- purrr::map2(yvars, ylabs, plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
DATA
set.seed(123)
date <- seq.POSIXt(as.POSIXct("2017-01-31"), as.POSIXct("2022-12-31"), by = "month")
food_index <- runif(length(date))
energy_index <- runif(length(date))
df <- data.frame(date, food_index, energy_index)
EDIT Adding subtitles to each plot when using patchwork is (as of the moment) a bit tricky. What I would do in this case would be to use a faceting "hack". To this end I slightly adjusted the function to take a subtitle argument and switched to purrr::pmap:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab, subtitle) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
facet_wrap(~.env$subtitle) +
labs(
x = "",
y = ylab
) +
theme(strip.background = element_blank(), strip.text.x = element_text(hjust = 0))
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
subtitle <- c("Original", "Month-to-Month", "Year-to-Year")
plots <- purrr::pmap(list(y = yvars, ylab = ylabs, subtitle = subtitle), plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
The target output is done with facets rather than stitching plots together. You could do this too if you like, but it requires reshaping your data in a different way. Which approach you take is really a matter of taste.
library(ggplot2)
library(dplyr)
yoy <- function(x) 100 * (x - lag(x, 13)) / lag(x, 12)
mom <- function(x) 100 * (x - lag(x)) / lag(x)
df %>%
mutate(date = as.Date(date, origin = "1899-12-30"),
`Actual value (Dollars).Food Index` = food_index,
`Month-on-month change (%).Food Index` = mom(food_index),
`Year-on-year change (%).Food Index` = yoy(food_index),
`Actual value (Dollars).Energy Index` = energy_index,
`Month-on-month change (%).Energy Index` = mom(energy_index),
`Year-on-year change (%).Energy Index` = yoy(energy_index)) %>%
select(-food_index, -energy_index) %>%
tidyr::pivot_longer(-1) %>%
filter(date > as.Date("2018-01-01")) %>%
tidyr::separate(name, into = c("series", "index"), sep = "\\.") %>%
ggplot(aes(date, value, color = index)) +
geom_point(na.rm = TRUE) +
geom_line() +
facet_grid(series~., scales = "free_y") +
theme_bw(base_size = 16)
Reproducible data taken from link in question
df <- structure(list(date = c(42766, 42794, 42825, 42855, 42886, 42916,
42947, 42978, 43008, 43039, 43069, 43100, 43131, 43159, 43190,
43220, 43251, 43281, 43312, 43343, 43373, 43404, 43434, 43465,
43496, 43524, 43555, 43585, 43616, 43646, 43677, 43708, 43738,
43769, 43799, 43830, 43861, 43890, 43921, 43951, 43982, 44012,
44043, 44074, 44104, 44135, 44165, 44196, 44227, 44255, 44286,
44316, 44347, 44377, 44408, 44439, 44469, 44500, 44530, 44561
), food_index = c(58.53, 61.23, 55.32, 55.34, 61.73, 56.91, 54.27,
59.08, 60.11, 66.01, 60.11, 63.41, 69.8, 72.45, 81.11, 89.64,
88.64, 88.62, 98.27, 111.11, 129.39, 140.14, 143.44, 169.21,
177.39, 163.88, 135.07, 151.28, 172.81, 143.82, 162.13, 172.22,
176.67, 179.3, 157.27, 169.12, 192.51, 194.2, 179.4, 169.1, 193.17,
174.92, 181.92, 188.41, 192.14, 203.41, 194.19, 174.3, 174.86,
182.33, 182.82, 185.36, 192.41, 195.59, 202.6, 201.51, 225.01,
243.78, 270.67, 304.57), energy_index = c(127.36, 119.87, 120.96,
112.09, 112.19, 109.24, 109.56, 106.89, 109.35, 108.35, 112.39,
117.77, 119.52, 122.24, 120.91, 125.41, 129.72, 135.25, 139.33,
148.6, 169.62, 184.23, 204.38, 198.55, 189.29, 202.47, 220.23,
240.67, 263.12, 249.74, 240.84, 243.42, 261.2, 256.76, 258.69,
277.98, 289.63, 293.46, 310.81, 318.68, 310.04, 302.17, 298.62,
260.92, 269.29, 258.84, 241.68, 224.18, 216.36, 226.57, 235.98,
253.86, 267.37, 261.99, 273.37, 280.91, 291.84, 297.88, 292.78,
289.79)), row.names = c(NA, 60L), class = "data.frame")

Creating a geographic file for use with tmap and coming up with error when coding shapefile

I am trying to reproduce a map I found here: http://zevross.com/blog/2018/10/02/creating-beautiful-demographic-maps-in-r-with-the-tidycensus-and-tmap-packages/
I am using RStudio and am running the following code:
library(ggplot2) # For plotting
library(tidycensus) # For downloading Census data
library(tmap) # For creating tmap
library(tmaptools) # For reading and processing spatial data related to tmap
library(dplyr) # For data wrangling
library(sf) # For reading, writing and working with spatial objects
census_api_key("enter your API key here", overwrite = TRUE)
dat12 <- get_acs("county", table = "B27001", year = 2012,
output = "tidy", state = NULL, geometry = FALSE) %>%
rename(`2012` = estimate) %>%
select(-NAME, -moe)
dat16 <- get_acs("county", table = "B27001", year = 2016,
output = "tidy", state = NULL, geometry = TRUE, shift_geo = TRUE) %>%
rename(`2016` = estimate) %>%
select(-moe)
dat <- left_join(dat16, dat12, by = c("GEOID", "variable"))
st_geometry(dat) <- NULL # This drops the geometry and leaves a table
head(dat)
dat <- mutate(dat,
cat = case_when(
variable %in% paste0("B27001_0",
c("09","12","37","40")) ~ "pop1834",
variable %in% paste0("B27001_0",
c("11","14","39","42")) ~ "pop1834ni")) %>%
filter(!is.na(cat))
# Create long version
dat <- tidyr::gather(dat, year, estimate, c(`2012`, `2016`))
# Group the data by our new categories and sum
dat <- group_by(dat, GEOID, NAME, year, cat) %>%
summarize(estimate = sum(estimate)) %>%
ungroup() %>%
tidyr::spread(cat, estimate)
dat <- mutate(dat, est = (pop1834ni/pop1834) * 100) %>%
select(-c(pop1834, pop1834ni)) %>%
tidyr::spread(year, est) %>%
mutate(diff = `2016`-`2012`)
head(dat)
datlong <- select(dat, -diff) %>%
tidyr::gather(year, estimate, c(`2012`, `2016`)) %>%
group_by(year) %>%
mutate(med = round(median(estimate, na.rm = TRUE), 1))
ggplot(datlong, aes(estimate)) +
geom_histogram(fill = "firebrick2",
color = "white", bins = 60) +
xlab("Uninsured adults ages 18-34 by county (%)") +
theme(plot.title = element_text(hjust = 0.5)) +
facet_wrap(~year, ncol = 1) +
geom_vline(aes(xintercept = med,
group = year), lty = "dashed") +
geom_text(aes(label = paste("Median = ", med), x = med, y = 55))
d10 <- top_n(dat, 10, diff) %>%
mutate(type = "Insured population decreased",
difftemp = diff)
i10 <- top_n(dat, -10, diff) %>%
mutate(type = "Insured population increased",
difftemp = abs(diff))
id10 <- bind_rows(list(i10, d10)) %>%
arrange(desc(difftemp))
ggplot(id10) +
geom_col(aes(x = forcats::fct_reorder(NAME, difftemp),
y = difftemp, fill = type)) +
coord_flip() +
scale_fill_manual(values = c("firebrick2", "cyan4")) +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
legend.title = element_blank()) +
ggtitle("Counties with the greatest change (+/-) in
insured population, ages 18-34, 2012-2016") +
ylab("Difference in % insured (2016 - 2012)") +
xlab("")
shp <- dat16 %>%
filter(variable == "B27001_001") # much faster than using distinct()
select(GEOID, NAME) %>%
left_join(dat, by = c("GEOID", "NAME")) %>%
arrange(GEOID) %>%
rename(uninsured_2012 = `2012`,
uninsured_2016 = `2016`,
uninsured_diff = diff)
Up until the last bit of code, the one that begins with shp, everything runs perfect. Once,
shp <- dat16 %>%
filter(variable == "B27001_001") # much faster than using distinct()
select(GEOID, NAME) %>%
left_join(dat, by = c("GEOID", "NAME")) %>%
arrange(GEOID) %>%
rename(uninsured_2012 = `2012`,
uninsured_2016 = `2016`,
uninsured_diff = diff)
is run, I get the following error:
Error in select(GEOID, NAME) : object 'GEOID' not found
I have checked dat16 and dat. GEOID and NAME are present there. I am not sure what is wrong with the SELECT function as I have not loaded another library which may interfere with it. Any help would be appreciated.
I see now what was missing, a %>% (pipe) following the 'filter':
shp <- dat16 %>%
filter(variable == "B27001_001") %>% # much faster than using distinct()
select(GEOID, NAME) %>%
left_join(dat, by = c("GEOID", "NAME")) %>%
arrange(GEOID) %>%
rename(
uninsured_2012 = `2012`,
uninsured_2016 = `2016`,
uninsured_diff = diff
)

Removing the non-business days from ggplot axis [duplicate]

This question already has answers here:
R + ggplot2: how to hide missing dates from x-axis?
(3 answers)
Closed 5 years ago.
I would like to plot the development of different indexes with ggplot2. My problem is, that 2018-02-03 and 2018-02-04 are non-working days, and thus there is no data available for these days, but when plotting ggplot2 adds them by extrapolating. How could I omit the non-business days, so that I get 2018-02-02 followed by 2018-02-05?
library(tidyverse)
library(quantmod)
#retrieve data
getSymbols("BTCUSD=X;^DJI;^VIX;^GDAXI", from="2017-01-01")
#merge all dataframes together
df <- merge(`BTCUSD=X`,`DJI`, all = TRUE)
df <- merge(df, `VIX`, all = TRUE)
df <- merge(df, `GDAXI`, all = TRUE)
#creating a dataframe with first column as date that comes from xts object extracted by index()
df <- data.frame(date=index(df), coredata(df))
#selecting columns and filtering the time series start date
df_1 <- df%>% select(date, contains("Close"))%>% na.omit() %>% filter(date>"2018-01-25")
#df_1 <- df_1 %>%mutate(BTCUSD.X.Close=BTCUSD.X.Close/BTCUSD.X.Close[1], DJI.Close=DJI.Close/DJI.Close[1], GDAXI.Close=GDAXI.Close/GDAXI.Close[1], VIX.Close=VIX.Close/VIX.Close[1])
df_1 <- df_1 %>% gather(var, closing, 2:5)
png("indexes.png", width = 9, height = 6, units = 'in', res = 600)
plot_1 <- ggplot(data=df_1)+
geom_line(aes(x=date, y=closing))+
facet_wrap(~var, scales = "free")+
scale_x_date(breaks = df_1$date, date_minor_breaks = "1 day", date_labels = "%y-%m-%d")+
theme(text = element_text(size=7), axis.text.x = element_text(angle = 90, hjust = 1))
plot_1
dev.off()
plot_1
The package bdscale was designed for this purpose, once added, you can substitute your scale_x_date line with:
scale_x_bd(
business.dates = df_1$date,
max.major.breaks = 10,
labels = date_format("%y-%m-%d")
)
To produce this plot...
Fully reproducible code
library(tidyverse)
library(quantmod)
library(bdscale)
library(scales)
getSymbols("BTCUSD=X;^DJI;^VIX;^GDAXI", from = "2017-01-01")
df <- merge(`BTCUSD=X`,`DJI`, all = TRUE) %>%
merge(`VIX`, all = TRUE) %>%
merge(`GDAXI`, all = TRUE)
df <- data.frame(date = index(df), coredata(df))
df_1 <- df %>%
select(date, contains("Close")) %>%
na.omit %>%
filter(date > "2018-01-25") %>%
gather(var, closing,2:5)
ggplot(data = df_1, aes(x = date, y = closing)) +
geom_line() +
facet_wrap(~var, scales = "free") +
scale_x_bd(business.dates = df_1$date,
max.major.breaks = 10,
labels = date_format("%y-%m-%d")) +
theme(text = element_text(size = 7),
axis.text.x = element_text(angle = 90, hjust = 1))

Resources