how do i vectorise (automate) plot creation in R - r

edited to include sample data:
Sample data
I have been trying to write code to generate and save multiple plots from a large dataset and have to admit defeat. Would love some help if possible..
i have a df (dat) of 4 years of daily monitoring data (sampling year goes July - June, so Sampling.Year notation is YYYY-YYYY). I would like to export jpgs for each SITENAME, with facet wrap/facet grid so each Sampling.Year is stacked vertically. Individual Sampling.Year plots show timeseries data for the full year (x=DATE, y = Daily.Ave.PAF). End result should be individual jpg files (SITENAME saved in file name) with sampling years stacked but DATE (x axis) aligned. That way we can get a quick snapshot of differences over time. The string is below and my (probably crappy) code is below that. The code is exporting plots just fine, but the data seems to be mixed up - i.e. where a SITENAME only has 2 Sampling.Years worth of data there should only be 2 plots in the jpg but this code produces 4... it's obviously wrong but I don't know how to fix it. THanks in advance.
'data.frame': 521 obs. of 6 variables:
$ STATION : chr "1240062" "125013A" "122013A" "126001A" ...
$ SITENAME : chr "Oconnell River at Caravan Park" "Pioneer River at Dumbleton Weir Headwater" "Proserpine River at Glen Isla" "Sandy Creek at Homebush" ...
$ Sampling.Year: chr "2016-2017" "2018-2019" "2018-2019" "2018-2019" ...
$ DATE : Date, format: "2017-02-01" "2019-02-01" "2019-02-01" "2019-02-01" ...
$ Daily.Ave.PAF: num 24.344 15.226 45.529 44.936 0.208 ...
$ Site.Year : chr "Oconnell River at Caravan Park_2016-2017" "Pioneer River at Dumbleton Weir Headwater_2018-2019" "Proserpine River at Glen Isla_2018-2019" "Sandy Creek at Homebush_2018-2019" …
CODE:
for(i in 1:length(dat)){
png(filename = paste("N:/Projects and project proposals/", dat$SITENAME[i], ".png", sep=""), width = 1500, height = 1000)
print({pesticidePlot <- ggplot(dat, aes(DATE, Daily.Ave.PAF)) +
geom_point(aes(colour = Daily.Ave.PAF)) +
scale_colour_gradientn(colours=c("dark green","yellow","orange", "red"),
breaks=c(5,10,20), labels=format(c("5", "10", "20"))) +
facet_wrap(~Sampling.Year, ncol = 1,scales="free") +
labs(x = "Month", y = "Total PAF (% affected)") +
scale_x_date(breaks = "1 month", labels = date_format("%B")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))})
dev.off()
}

This code can help you. I have used the data you included (Just define a directory to save the plots):
library(tidyverse)
#Data
dat <- read.csv('Sample.csv',stringsAsFactors = F)
dat$DATE <- as.Date(dat$DATE,'%d/%m/%Y')
#Create a list
List <- split(dat,dat$SITENAME)
#Function for plots
myplot <- function(x)
{
pesticidePlot <- ggplot(x, aes(DATE, Daily.Ave.PAF)) +
geom_point(aes(colour = Daily.Ave.PAF)) +
scale_colour_gradientn(colours=c("dark green","yellow","orange", "red"),
breaks=c(5,10,20), labels=format(c("5", "10", "20"))) +
facet_wrap(~Sampling.Year, ncol = 1,scales="free") +
labs(x = "Month", y = "Total PAF (% affected)") +
scale_x_date(breaks = "1 month", labels = scales::date_format("%B-%y")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle(unique(x$SITENAME))
return(pesticidePlot)
}
#Create plots
List2 <- lapply(List,myplot)
#Export
namesvec <- paste0(names(List2),'.png')
mapply(ggsave, List2,filename=namesvec,width = 15,units = 'cm')
That code will create next plots:
You can modify myplot if you need a more customized plots.

Here is a solution that will save the plots created in a lapply loop. The files are then written in another loop, this time with mapply.
In the example below the files are saved in the working directory, change this at will.
library(ggplot2)
SITENAME_plot <- function(X){
ggplot(X, aes(DATE, Daily.Ave.PAF)) +
geom_point(aes(colour = Daily.Ave.PAF)) +
scale_colour_gradientn(colours=c("dark green","yellow","orange", "red"),
breaks=c(5,10,20), labels=format(c("5", "10", "20"))) +
labs(x = "Month", y = "Total PAF (% affected)") +
scale_x_date(breaks = "1 month", labels = scales::date_format("%B")) +
facet_wrap(~Sampling.Year, ncol = 1, scales = "free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}
SITENAME_plot_write <- function(name, g, dir = "N:/Projects and project proposals"){
flname <- file.path(dir, name)
flname <- paste0(flname, ".png")
png(filename = flname, width = 1500, height = 1000)
print(g)
dev.off()
flname
}
dat$DATE <- as.Date(dat$DATE, format = "%d/%m/%Y")
sp <- split(dat, dat$SITENAME)
gg_list <- sapply(sp, SITENAME_plot, simplify = FALSE)
mapply(SITENAME_plot_write, names(gg_list), gg_list, MoreArgs = list(dir = getwd()))
rm(sp) # final clean-up

Related

Custom manhattan plot multi x-axis

I have the following data set gwas_data
Running
head -n 23 gwas_data gives me the following table.
gwas_data <-
data.frame(
stringsAsFactors = FALSE,
udi = c("A","B","C","D","E",
"F","G","H","I","J","K","A","B","C","D","E",
"F","G","H","I","J","K"),
snp = c("rs71628639_A",
"rs71628639_A","rs71628639_A","rs71628639_A","rs71628639_A",
"rs71628639_A","rs71628639_A","rs71628639_A",
"rs71628639_A","rs71628639_A","rs71628639_A","rs12726330_A",
"rs12726330_A","rs12726330_A","rs12726330_A",
"rs12726330_A","rs12726330_A","rs12726330_A","rs12726330_A",
"rs12726330_A","rs12726330_A","rs12726330_A"),
chr = c(1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L),
bp = c(154988255L,154988255L,
154988255L,154988255L,154988255L,154988255L,154988255L,
154988255L,154988255L,154988255L,154988255L,
155108167L,155108167L,155108167L,155108167L,155108167L,
155108167L,155108167L,155108167L,155108167L,
155108167L,155108167L),
p = c(0.580621191,0.356577427,
0.494774059,0.984005886,0.492034614,0.581479389,
0.24820214,0.202720896,0.295462221,0.845848783,
0.954714162,0.343101621,0.740942238,0.929127071,0.717965027,
0.335111376,0.857154424,0.480087195,0.980307843,
0.521114038,0.583150471,0.925783695),
beta = c(0.000852277,0.003943912,
0.001091986,-3.18e-05,0.000564413,0.000120028,
0.026156467,0.000303135,0.069146449,-2.96e-07,-2.11e-05,
0.001274261,-0.001232397,0.000123948,-0.000498507,
-0.000689988,-3.41e-50,-0.013934416,5.12e-06,
-0.03696031,-7.28e-07,-3.01e-05),
bp_cum = c(1.154988255,1.154988255,
1.154988255,1.154988255,1.154988255,1.154988255,
1.154988255,1.154988255,1.154988255,1.154988255,
1.154988255,1.155108167,1.155108167,1.155108167,
1.155108167,1.155108167,1.155108167,1.155108167,1.155108167,
1.155108167,1.155108167,1.155108167)
)
I would like to make a manhattan plot, the X-axis should have chromosomal numbers from 1:22, I want each entry to be on the x-axis according to the BP position. The id should act as colour and the y-axis would be -log10(p).
I have rewritten the r command as follows, but my graph doesn't look correct.
library(plyr)
library(dplyr)
library(purrr)
library(tidyverse)
library(ggtext)
library(stringr)
gwas_data <- read.table("gwas_data", header=T)
sig <- 5e-8
manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(p), color = udi)) +
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_point(aes(color=as.factor(udi)), alpha=0.8, size=2) +
scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
#scale_color_manual(values = rep(c("#276FBF", "#183059"), (length(axis_set$chr)))) +
scale_size_continuous(range = c(0.5,3)) +
theme_minimal()
print(manhplot)
I would also like to add the name of the ID and SNP if they are above the significant threshold.
My axis_set looks as follows with test data which goes from chromosome 1:4
chr center
1 179641307
2 354697451
3 553030055
4 558565909
My final graph looks as follows:

ggplot: aggregate multi-year data by Month-Year, aesthetic length error

i've read every relevant aggregate() by month and lubridate question i could find but am still running into an error of aesthetic length. lots didn't work for me bc they grouped data by month but the dataframe only contained data from one year. i don't need the cumulative total of every January across time – i need it to be month- AND year-specific.
my sample data: (df is called "sales")
order_date_create order_sum
2020-05-19 900
2020-08-29 500
2020-08-30 900
2021-02-01 200
2021-02-06 500
aggregating by month-year:
# aggregate by month (i used _moyr short for month year)
sales$bymonth <- aggregate(cbind(order_sum)~month(order_date_create),
data=sales,FUN=sum)
sales$order_moyr <- format(sales$order_date_create, '%m-%Y') # why does this get saved under values instead of data?
here's my ggplot:
# plot
ggplot(sales, aes(order_moyr, order_sum)) +
scale_x_date(limits = c(min, as.Date(now())),
breaks = "1 month",
labels = date_format("%m-%Y")) +
scale_y_continuous(labels = function(x) format(x, big.mark = "'", decimal.mark = ".", scientific = FALSE)) +
labs(x = "Date", y = "Sales Volume", title = "Sales by Month") +
geom_bar(stat="identity")+ theme_economist(base_size = 10, base_family = "sans", horizontal = TRUE, dkpanel = FALSE) + scale_colour_economist()
if i use x = order_date_create and y = order_sum it plots correctly, with month-year axis, but each bar is still daily sum.
if i use x = order_moyr and y = bymonth, i get this error:
Error: Aesthetics must be either length 1 or the same as the data (48839): y
tangentially, if anyone knows how to use both scale::dollar AND format the thousands separator in the same scale_y_continous fcn it would be a great help. i've not found how to do both.
library(scales); library(lubridate); library(dplyr);
library(ggthemes)
sales %>%
count(order_moyr = floor_date(order_date_create, "month"),
wt = order_sum, name = "order_sum") %>%
ggplot(aes(order_moyr, order_sum)) +
scale_x_date(breaks = "1 month",
labels = date_format("%m-%Y")) +
scale_y_continuous(labels = scales::dollar_format(big.mark = "'",
decimal.mark = ".")) +
labs(x = "Date", y = "Sales Volume", title = "Sales by Month") +
geom_bar(stat="identity", width = 25)+
theme_economist(base_size = 10, base_family = "sans",
horizontal = TRUE, dkpanel = FALSE) +
scale_colour_economist()

how do i combine multiple data sources in ggplot using split and sapply?

this question is linked to a previous one answered by #Rui Barradas and #Duck, but i need more help. Previous link here:
how do i vectorise (automate) plot creation in R
Basically, I need to combine 3 datasets into one plot with a secondary y axis. All datasets need to be split by SITENAME and will facet wrap by Sampling.Year. I am using split and sapply. Being facet wrap the plots look something like this:
However, i'm now trying to add the two other data sources into the plots, to look something like this:
But i am struggling to add the two other data sources and get them to split by SITENAME. Her is my code so far...
Record plot format as a function to be applied to a split list df (ideally 'df' would be added as geom_line with a secondary y axis, and 'FF_start_dates' will be added as a vertical dashed line):
SITENAME_plot <- function(AllDates_TPAF){
ggplot(AllDates_TPAF, aes(DATE, Daily.Ave.PAF)) +
geom_point(aes(colour = Risk), size = 3) +
scale_colour_manual(values=c("Very Low" = "dark green","Low" = "light green",
"Moderate" = "yellow", "High" = "orange", "Very High" = "red"), drop = FALSE) +
labs(x = "Month", y = "Total PAF (% affected)") +
scale_x_date(breaks = "1 month", labels = scales::date_format("%B")) +
facet_wrap(~Sampling.Year, ncol = 1, scales = "free")+
scale_y_continuous(limits = c(0, 100), sec.axis = sec_axis(~., name = "Water level (m)")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
theme(legend.text=element_text(size=15)) +
theme(axis.text=element_text(size=15),
axis.title=element_text(size=15,face="bold")) +
guides(color = guide_legend(reverse = TRUE))+
theme_bw() +
ggtitle(unique(AllDates_TPAF$SITENAME))
}
plot write function:
SITENAME_plot_write <- function(name, g, dir = "N:/abc/"){
flname <- file.path(dir, name)
flname <- paste0(flname, ".jpg")
png(filename = flname, width = 1500, height = 1000)
print(g)
dev.off()
flname
}
Apply function to list split by SITENAME:
sp1 <- split(AllDates_TPAF, AllDates_TPAF$SITENAME)
gg_list <- sapply(sp1, SITENAME_plot, simplify = FALSE)
mapply(SITENAME_plot_write, names(gg_list), gg_list, MoreArgs = list(dir = getwd()))
dev.off()
I have uploaded samples of all 3 datasets here: Sample Data
Apologies for not using gsub but there was too much data and I couldn't get it to work properly
thanks in advance for any help you can give, even if it is just to point me towards a web tutorial of some kind.
You can try next code. I used the data you shared. Just be careful with names of all datasets. Ideally, the key columns as DATE and Sampling.Year should be present in all dataframes before making the split. Also some variables as Risk was absent so I added an example var with same name. Here the code, I added a function for the plot you want:
library(tidyverse)
library(readxl)
#Data
df1 <- read_excel('Sample data.xlsx',1)
#Create var
df1$Risk <- c(rep(c("Very Low","Low","Moderate","High","Very High"),67),"Very High")
#Other data
df2 <- read_excel('Sample data.xlsx',2)
df3 <- read_excel('Sample data.xlsx',3)
#Split 1
L1 <- split(df1,df1$SITENAME)
L2 <- split(df2,df2$SITENAME)
L3 <- split(df3,df3$`Site Name`)
#Function to create plots
myplot <- function(x,y,z)
{
#Merge x and y
#Check for duplicates and avoid column
y <- y[!duplicated(paste(y$DATE,y$Sampling.Year)),]
y$SITENAME <- NULL
xy <- merge(x,y,by.x = c('Sampling.Year','DATE'),by.y = c('Sampling.Year','DATE'),all.x=T)
#Format to dates
xy$DATE <- as.Date(xy$DATE)
#Scale factor
scaleFactor <- max(xy$Daily.Ave.PAF) / max(xy$Height)
#Rename for consistency in names
names(z)[4] <- 'DATE'
#Format date
z$DATE <- as.Date(z$DATE)
#Plot
#Plot
G <- ggplot(xy, aes(DATE, Daily.Ave.PAF)) +
geom_point(aes(colour = Risk), size = 3) +
scale_colour_manual(values=c("Very Low" = "dark green","Low" = "light green",
"Moderate" = "yellow", "High" = "orange", "Very High" = "red"), drop = FALSE) +
scale_x_date(breaks = "1 month", labels = scales::date_format("%b %Y")) +
geom_line(aes(x=DATE,y=Height*scaleFactor))+
scale_y_continuous(name="Total PAF (% affected)", sec.axis=sec_axis(~./scaleFactor, name="Water level (m)"))+
labs(x = "Month") +
geom_vline(data = z,aes(xintercept = DATE),linetype="dashed")+
facet_wrap(~Sampling.Year, ncol = 1, scales = "free")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
theme(legend.text=element_text(size=15)) +
theme(axis.text=element_text(size=15),
axis.title=element_text(size=15,face="bold")) +
guides(color = guide_legend(reverse = TRUE))+
theme_bw() +
ggtitle(unique(xy$SITENAME))
return(G)
}
#Create a list of plots
Lplots <- mapply(FUN = myplot,x=L1,y=L2,z=L3,SIMPLIFY = FALSE)
#Now format names
vnames <- paste0(names(Lplots),'.png')
mapply(ggsave, Lplots,filename = vnames,width = 30,units = 'cm')
You will end up with plots like these saved in your dir:
Some dashed lines do not appear in plots because they were not present in the data you provided.

How to plot a subset of forecast in R?

I have a simple R script to create a forecast based on a file.
Data has been recorded since 2014 but I am having trouble trying to accomplish below two goals:
Plot only a subset of the forecast information (starting on 11/2017 onwards).
Include month and year in a specific format (i.e. Jun 17).
Here is the link to the dataset and below you will find the code made by me so far.
# Load required libraries
library(forecast)
library(ggplot2)
# Load dataset
emea <- read.csv(file="C:/Users/nsoria/Downloads/AMS Globales/EMEA_Depuy_Finanzas.csv", header=TRUE, sep=';', dec=",")
# Create time series object
ts_fin <- ts(emea$Value, frequency = 26, start = c(2014,11))
# Pull out the seasonal, trend, and irregular components from the time series
model <- stl(ts_fin, s.window = "periodic")
# Predict the next 3 bi weeks of tickets
pred <- forecast(model, h = 5)
# Plot the results
plot(pred, include = 5, showgap = FALSE, main = "Ticket amount", xlab = "Timeframe", ylab = "Quantity")
I appreciate any help and suggestion to my two points and a clean plot.
Thanks in advance.
Edit 01/10 - Issue 1:
I added the screenshot output for suggested code.
Plot1
Edit 01/10 - Issue 2:
Once transformed with below code, it somehow miss the date count and mess with the results. Please see two screenshots and compare the last value.
Screenshot 1
Screenshot 2
Plotting using ggplot2 w/ ggfortify, tidyverse, lubridate and scales packages
library(lubridate)
library(tidyverse)
library(scales)
library(ggfortify)
# Convert pred from list to data frame object
df1 <- fortify(pred) %>% as_tibble()
# Convert ts decimal time to Date class
df1$Date <- as.Date(date_decimal(df1$Index), "%Y-%m-%d")
str(df1)
# Remove Index column and rename other columns
# Select only data pts after 2017
df1 <- df1 %>%
select(-Index) %>%
filter(Date >= as.Date("2017-01-01")) %>%
rename("Low95" = "Lo 95",
"Low80" = "Lo 80",
"High95" = "Hi 95",
"High80" = "Hi 80",
"Forecast" = "Point Forecast")
df1
### Updated: To connect the gap between the Data & Forecast,
# assign the last non-NA row of Data column to the corresponding row of other columns
lastNonNAinData <- max(which(complete.cases(df1$Data)))
df1[lastNonNAinData, !(colnames(df1) %in% c("Data", "Fitted", "Date"))] <- df1$Data[lastNonNAinData]
# Or: use [geom_segment](http://ggplot2.tidyverse.org/reference/geom_segment.html)
plt1 <- ggplot(df1, aes(x = Date)) +
ggtitle("Ticket amount") +
xlab("Time frame") + ylab("Quantity") +
geom_ribbon(aes(ymin = Low95, ymax = High95, fill = "95%")) +
geom_ribbon(aes(ymin = Low80, ymax = High80, fill = "80%")) +
geom_point(aes(y = Data, colour = "Data"), size = 4) +
geom_line(aes(y = Data, group = 1, colour = "Data"),
linetype = "dotted", size = 0.75) +
geom_line(aes(y = Fitted, group = 2, colour = "Fitted"), size = 0.75) +
geom_line(aes(y = Forecast, group = 3, colour = "Forecast"), size = 0.75) +
scale_x_date(breaks = scales::pretty_breaks(), date_labels = "%b %y") +
scale_colour_brewer(name = "Legend", type = "qual", palette = "Dark2") +
scale_fill_brewer(name = "Intervals") +
guides(colour = guide_legend(order = 1), fill = guide_legend(order = 2)) +
theme_bw(base_size = 14)
plt1

How to add diagonal lines in NA value polygons using ggplot?

I'm working to plot the consolidated Z-value deviations (for a series of factors) from the national average for Pakistan on a fortified SPDF. For the purposes of this question, my data is irrelevant. I could provide it if necessary.
I am using ggplot to create my output where the command and result look something like this:
ggplot() + geom_polygon(data = plot.pakmod_sumZ, aes(x = long, y = lat, group = group, fill = SumZ.Cat), color = "black", size = 0.25, na.rm = TRUE) + scale_fill_manual(name = "Deviations from National Average", labels = c("-7", "-6", "-5", "-4", "-3", "-2", "-1", "Positive"), values = c("darkorange4","brown", "orangered1","tomato1","darkorange3","orange","yellow", "greenyellow"), na.value = "Grey", guide = guide_legend(reverse = TRUE)) + coord_map() + labs(x = NULL, y = NULL) + scale_x_discrete(breaks = NULL) + scale_y_discrete(breaks = NULL) + theme_minimal()
Deviations from National Average
I am trying to figure out now if it's possible to add diagonal lines in the polygons which have missing values and are coloured grey. Can this be done using ggplot?
This is an example I took from here. I opted to use the horizontal error bar geom. Mind that this isn't the only way of doing this.
library(ggplot2)
library(sp)
library(rgdal)
library(rgeos)
# create a local directory for the data
localDir <- "R_GIS_data"
if (!file.exists(localDir)) {
dir.create(localDir)
}
# download and unzip the data
url <- "ftp://www.ecy.wa.gov/gis_a/inlandWaters/wria.zip"
file <- paste(localDir, basename(url), sep='/')
if (!file.exists(file)) {
download.file(url, file)
unzip(file,exdir=localDir)
}
# create a layer name for the shapefiles (text before file extension)
layerName <- "WRIA_poly"
# read data into a SpatialPolygonsDataFrame object
dataProjected <- readOGR(dsn=localDir, layer=layerName)
dataProjected#data$id <- rownames(dataProjected#data)
# create a data.frame from our spatial object
watershedPoints <- fortify(dataProjected)
# merge the "fortified" data with the data from our spatial object
watershedDF <- merge(watershedPoints, dataProjected#data, by = "id")
dataProjected#data$id <- rownames(dataProjected#data)
watershedPoints <- fortify(dataProjected)
watershedDF <- merge(watershedPoints, dataProjected#data, by = "id")
ggWatershed <- ggplot(data = watershedDF, aes(x=long, y=lat, group = group, fill = WRIA_NM)) +
geom_polygon() +
geom_path(color = "white") +
scale_fill_hue(l = 40) +
coord_equal() +
theme(legend.position = "none", title = element_blank())
# Adding coordinates to the data part of SPDF. `sd` is the variable of interest
# which is beign plotted here. Each line extends sd away from long coordinate
dataProjected#data$sd <- rnorm(nrow(xy), mean = 50000, sd = 10000)
xy <- coordinates(dataProjected)
dataProjected#data$long <- xy[, 1]
dataProjected#data$lat <- xy[, 2]
ggWatershed +
geom_errorbarh(data = dataProjected#data, aes(group = id, xmin = long - sd, xmax = long + sd))

Resources