desired_output_sample
I have following data:
#1. dates of 15 day frequency:
dates = seq(as.Date("2016-09-01"), as.Date("2020-07-30"), by=15) #96 times observation
#2. water content in crops corresponding to the times given.
water <- c(0.5702722, 0.5631781, 0.5560839, 0.5555985, 0.5519783, 0.5463459,
0.5511598, 0.546652, 0.5361545, 0.530012, 0.5360571, 0.5396569,
0.5683526, 0.6031535, 0.6417821, 0.671358, 0.7015542, 0.7177007,
0.7103561, 0.7036985, 0.6958607, 0.6775161, 0.6545367, 0.6380155,
0.6113306, 0.5846186, 0.5561815, 0.5251135, 0.5085149, 0.495352,
0.485819, 0.4730029, 0.4686458, 0.4616468, 0.4613918, 0.4615532,
0.4827496, 0.5149105, 0.5447824, 0.5776764, 0.6090217, 0.6297454,
0.6399422, 0.6428941, 0.6586344, 0.6507473, 0.6290631, 0.6011123,
0.5744375, 0.5313527, 0.5008027, 0.4770338, 0.4564025, 0.4464508,
0.4309046, 0.4351668, 0.4490393, 0.4701232, 0.4911582, 0.5162941,
0.5490387, 0.5737573, 0.6031149, 0.6400073, 0.6770058, 0.7048311,
0.7255012, 0.739107, 0.7338938, 0.7265202, 0.6940718, 0.6757214,
0.6460862, 0.6163091, 0.5743775, 0.5450822, 0.5057753, 0.4715266,
0.4469859, 0.4303232, 0.4187793, 0.4119401, 0.4201316, 0.426369,
0.4419331, 0.4757525, 0.5070846, 0.5248457, 0.5607567, 0.5859825,
0.6107531, 0.6201754, 0.6356589, 0.6336177, 0.6275579, 0.6214981)
I want to compute trend of the water content or moisture data corresponding to different subperiods. Lets say: one trend from 2016 - 09-01 to 2019-11-30.
and other trend from 2019-12-15 to the last date (in this case 2020-07-27).
And I want to make a plot like the one attached.
Appreciate your help. Can be in R or in python.
To draw a trend line, you can look on this tutorial
https://www.statology.org/ggplot-trendline/
Or on this stackoverflow question
Draw a trend line using ggplot
To split your dataset in two groups you simply need to do something like this (in R).
data <- data.frame(dates, water)
#This neat trick allows you to turn a logical value into a number
data$group <- 1 + (data$dates > "2019-11-30")
old <- subset(data,group == 1)
new <- subset(data,group == 2)
For the plots:
library(ggplot2)
ggplot(old,aes(x = dates, y = water)) +
geom_smooth(method = "lm", col = "blue") +
geom_point()
ggplot(new,aes(x = dates, y = water)) +
geom_smooth(method = "lm", col = "red") +
geom_point()
Here is a full-fledged example with added labels:
library(dplyr)
library(ggplot2)
dates <- seq(as.Date("2016-09-01"), as.Date("2020-07-30"), by=15)
wc <- as.numeric(strsplit("0.5702722 0.5631781 0.5560839 0.5555985 0.5519783 0.5463459 0.5511598 0.5466520 0.5361545 0.5300120 0.5360571 0.5396569 0.5683526 0.6031535 0.6417821 0.6713580 0.7015542 0.7177007 0.7103561 0.7036985 0.6958607 0.6775161 0.6545367 0.6380155 0.6113306 0.5846186 0.5561815 0.5251135 0.5085149 0.4953520 0.4858190 0.4730029 0.4686458 0.4616468 0.4613918 0.4615532 0.4827496 0.5149105 0.5447824 0.5776764 0.6090217 0.6297454 0.6399422 0.6428941 0.6586344 0.6507473 0.6290631 0.6011123 0.5744375 0.5313527 0.5008027 0.4770338 0.4564025 0.4464508 0.4309046 0.4351668 0.4490393 0.4701232 0.4911582 0.5162941 0.5490387 0.5737573 0.6031149 0.6400073 0.6770058 0.7048311 0.7255012 0.7391070 0.7338938 0.7265202 0.6940718 0.6757214 0.6460862 0.6163091 0.5743775 0.5450822 0.5057753 0.4715266 0.4469859 0.4303232 0.4187793 0.4119401 0.4201316 0.4263690 0.4419331 0.4757525 0.5070846 0.5248457 0.5607567 0.5859825 0.6107531 0.6201754 0.6356589 0.6336177 0.6275579 0.6214981", " |\\n")[[1]])
data <- data.frame(date=dates, water_content=wc) %>%
mutate(group = ifelse(date <= as.Date("2019-11-30"), "g1", "g2"))
# calculate linear regression and create labels
lmo <- data %>%
group_by(group) %>%
summarise(res=list(stats::lm(water_content ~ date, data = cur_data()))) %>%
.$res
lab <- sapply(lmo, \(x)
paste("Slope=", signif(x$coef[[2]], 5),
"\nAdj R2=", signif(summary(x)$adj.r.squared, 5),
"\nP=", signif(summary(x)$coef[2,4], 5)))
ggplot(data=data, aes(x=date, y=water_content, col=group)) +
geom_point() +
stat_smooth(geom="smooth", method="lm") +
geom_text(aes(date, y, label=lab),
data=data.frame(data %>% group_by(group) %>%
summarise(date=first(date)), y=Inf, lab=lab),
vjust=1, hjust=.2)
Created on 2022-11-23 with reprex v2.0.2
Here is a way. Create a grouping variable by dates, coerce it to factor and geom_smooth will draw the two regression lines.
suppressPackageStartupMessages({
library(ggplot2)
library(ggpubr)
})
df1 <- data.frame(dates, water)
breakpoint <- as.Date("2019-11-30")
df1$group <- factor(df1$dates > breakpoint, labels = c("before", "after"))
ggplot(df1, aes(dates, water, colour = group)) +
geom_line() +
geom_point(shape = 21, fill = 'white') +
geom_smooth(formula = y ~ x, method = lm) +
geom_vline(xintercept = breakpoint, linetype = "dotdash", linewidth = 1) +
stat_cor(label.y = c(0.43, 0.38), show.legend = FALSE) +
stat_regline_equation(label.y = c(0.45, 0.4), show.legend = FALSE) +
scale_color_manual(values = c(before = 'red', after = 'blue')) +
theme_bw(base_size = 15)
Created on 2022-11-23 with reprex v2.0.2
Related
I would like to show the mean of two groups in a scatterplot. I have sorted the data so the groups are next to each other. Group 1 is the first 11 records and group2 is the next 133. How can I tell ggplot to draw one line across the range for the first group (House 1-11) and a second line for the second (House 12-133).
Here is what I have so far:
And the code is here:
library(tidyverse)
library(tidymodels)
data(ames)
ames <- AmesHousing::make_ames()
set.seed(1)
split <- initial_split(ames, prop = 0.95, strata = "Sale_Price")
ames_plot <- testing(split)
model1 <- lm(Sale_Price ~ Central_Air, data = ames_plot)
p1 <- model1 %>%
broom::augment() %>%
arrange(Central_Air) %>%
mutate(House = row_number()) %>%
ggplot(aes(House, Sale_Price, color = Central_Air)) +
geom_point(size = 1, alpha = 0.3) +
geom_segment(aes(x = 1, y = .fitted, xend = 144, yend =.fitted)) +
scale_y_continuous(labels = scales::dollar)
p1
Using geom_smooth(formula = 'y ~ x', se = FALSE, method = "lm") instead of geom_segment() gets me close to what I want but I want to show the actual predicted values coming form the lm().
It would be best just to summarize your data for that layer. For example
model1 %>%
broom::augment() %>%
arrange(Central_Air) %>%
mutate(House = row_number()) %>%
ggplot(aes(House, Sale_Price, color = Central_Air)) +
geom_point(size = 1, alpha=.3) +
geom_segment(aes(x = first, y = .fitted, xend = last, yend =.fitted),
data = function(x) {
x %>%
group_by(Central_Air) %>%
summarize(first=first(House), last=last(House), .fitted=mean(.fitted), .groups="drop_last")
}) +
scale_y_continuous(labels = scales::dollar)
So i have a dataframe with 2 columns : "ID" and "Score"
ID contain the name of a simulation and each simulation have 58 different scores that are listed in the column Score.
There is 10 simulations.
I am doing a geom_density plot :
my_dataframe %>%
ggplot(aes(x=`Score`), xlim = c(0, 1)) +
geom_density(aes(color = ID)) +
theme_bw() +
labs(title = "Scores")
https://imgur.com/a/9DUTmWw
How can i tell ggplot that i want the curves of Simulation1 and Simulation2 to not be like the others, i want them to be in red and with an higher width than all the other one.
Thank you for your help,
Best,
Maxime
Something like this?
my_dataframe %>% mutate(group = ifelse(ID %in% c(1,2), 'special', 'NonSpecial')) %>%
ggplot(aes(x=`Score`, lty = group), xlim = c(0, 1)) +
geom_density(aes(color = ID)) +
theme_bw() +
labs(title = "Scores")
I used this data:
my_dataframe <- data.frame(ID = factor(sample(1:4, 100, T)), Score = sin(1:100))
I'm trying to re-create a plot like this in ggplot:.
This graph takes the residuals from a regression output, and plots them in order (with the X-axis being a rank of residuals).
My best attempt at this was something like the following:
library(ggplot2)
library(modelr)
d <- d %>% add_residuals(mod1, var = "resid")
d$resid_rank <- rank(d$resid)
ggplot(data = d, aes(x = resid_rank, y = resid)) +
geom_bar(stat="identity") +
theme_bw()
However, this yields a completely blank graph. I tried something like this:
ggplot(data = d, aes(x = resid_rank, y = resid)) +
geom_segment(yend = 0, aes(xend=resid)) +
theme_bw()
But this yields the segments that go in the wrong direction. What is the right way to do this, and to color those lines by a third factor?
FAKE DATASET:
library(estimatr)
library(fabricatr)
#simulation
dat <- fabricate(
N = 10000,
y = runif(N, 0, 10),
x = runif(N, 0, 100)
)
#add an outlier
dat <- rbind(dat, c(300, 5))
dat <- rbind(dat, c(500, 3))
dat$y_log <- log(dat$y)
dat$x_log <- log(dat$x)
dat$y_log_s <- scale(log(dat$y))
dat$x_log_s <- scale(log(dat$x))
mod1 <- lm(y_log ~ x_log, data = dat))
I used the build in dataset from the help page on lm() to create this example. I also just directly used resid() to get the residuals. It's unclear where / why the colored bars would be different, but basically you'd need to add a column to your data.frame that specificies why they are red or blue, then pass that to fill.
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 3.4.4
#example from lm
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
resids <- data.frame(resid = resid(lm.D9))
#why are some bars red and some blue? No clue - so I'll pick randomly
resids$group <- sample(c("group 1", "group 2"), nrow(resids), replace = TRUE)
#rank
resids$rank <- rank(-1 * resids$resid)
ggplot(resids, aes(rank, resid, fill = group)) +
geom_bar(stat = "identity", width = 1) +
geom_hline(yintercept = c(-1,1), colour = "darkgray", linetype = 2) +
geom_hline(yintercept = c(-2,2), colour = "lightgray", linetype = 1) +
theme_bw() +
theme(panel.grid = element_blank()) +
scale_fill_manual(values = c("group 1" = "red", "group 2" = "blue"))
Created on 2019-01-24 by the reprex package (v0.2.1)
I have a dataframe with 30 columns and I would like to create 30 (gg)plots based on these columns. When creating a plot through ggplot, you have to create a variable to which all the information of the plot is added.
Is there a way how I can create 30 of such variable names in a for loop (so that I don't have to create and store them all locally?
In earlier code I repeated the below steps 30 times:
In earlier code, I had the following:
a1 = ggplot(data = results_round_one,
aes(results_round_one$`R-0,01`))
a1 = a1 + geom_histogram()
a1 = a1 + xlim(0.46, 0.55)
a1 = a1 + geom_vline(xintercept= mean(results_round_one$`R-0,01`),
col = 'blue')
a1 = a1 + geom_vline(xintercept = max(results_round_one$`R-0,01`),
col = 'red')
a1= a1 + labs(y = 'Frequency',
x= 'Validated accuracy',
title = 'Optimizer = RMSProp',
subtitle = 'Learning rate = 0.01')
However, since I only have to change the aes and the labels, I think I should be able to do this process in a for loop as well.
In absence of some example data, here is some code that would loop through iris columns, creating density plots:
library(purrr)
library(dplyr)
df <- iris %>%
select(Sepal.Length:Petal.Width)
df %>%
map2(names(df), ~ .x %>%
as.data.frame %>%
set_names(.y) %>%
ggplot(aes_string(.y)) + geom_density() + ggtitle(.y))
Using your code, something along the lines of:
results_round_one %>%
map2(names(results_round_one), ~ .x %>%
as.data.frame %>%
set_names(.y) %>%
ggplot(aes_string(.y)) +
geom_histogram() +
xlim(0.46, 0.55) +
geom_vline(xintercept = mean(.x), col = 'blue') +
geom_vline(xintercept = max(.x), col = 'red') +
labs(y = 'Frequency',
x= 'Validated accuracy',
title = 'Optimizer = RMSProp',
subtitle = 'Learning rate = 0.01'))
You could apply histogram function:
getImage <- function(col){
a1 = ggplot(data = results_round_one,
aes(results_round_one[, col])) +
geom_histogram() +
xlim(0.46, 0.55) +
geom_vline(xintercept= mean(results_round_one[, col]),
col = 'blue') +
labs(y = 'Frequency',
x= 'Validated accuracy',
title = 'Optimizer = RMSProp',
subtitle = 'Learning rate = 0.01')
return(a1)
}
to a vector of columns iteratively. In this case col_30 is a vector of column names
# e.g. col_30 = c("col1", "col2") etc.
for(col in col_30){
getImage(col)
}
This would generate different plots.
I have a simple R script to create a forecast based on a file.
Data has been recorded since 2014 but I am having trouble trying to accomplish below two goals:
Plot only a subset of the forecast information (starting on 11/2017 onwards).
Include month and year in a specific format (i.e. Jun 17).
Here is the link to the dataset and below you will find the code made by me so far.
# Load required libraries
library(forecast)
library(ggplot2)
# Load dataset
emea <- read.csv(file="C:/Users/nsoria/Downloads/AMS Globales/EMEA_Depuy_Finanzas.csv", header=TRUE, sep=';', dec=",")
# Create time series object
ts_fin <- ts(emea$Value, frequency = 26, start = c(2014,11))
# Pull out the seasonal, trend, and irregular components from the time series
model <- stl(ts_fin, s.window = "periodic")
# Predict the next 3 bi weeks of tickets
pred <- forecast(model, h = 5)
# Plot the results
plot(pred, include = 5, showgap = FALSE, main = "Ticket amount", xlab = "Timeframe", ylab = "Quantity")
I appreciate any help and suggestion to my two points and a clean plot.
Thanks in advance.
Edit 01/10 - Issue 1:
I added the screenshot output for suggested code.
Plot1
Edit 01/10 - Issue 2:
Once transformed with below code, it somehow miss the date count and mess with the results. Please see two screenshots and compare the last value.
Screenshot 1
Screenshot 2
Plotting using ggplot2 w/ ggfortify, tidyverse, lubridate and scales packages
library(lubridate)
library(tidyverse)
library(scales)
library(ggfortify)
# Convert pred from list to data frame object
df1 <- fortify(pred) %>% as_tibble()
# Convert ts decimal time to Date class
df1$Date <- as.Date(date_decimal(df1$Index), "%Y-%m-%d")
str(df1)
# Remove Index column and rename other columns
# Select only data pts after 2017
df1 <- df1 %>%
select(-Index) %>%
filter(Date >= as.Date("2017-01-01")) %>%
rename("Low95" = "Lo 95",
"Low80" = "Lo 80",
"High95" = "Hi 95",
"High80" = "Hi 80",
"Forecast" = "Point Forecast")
df1
### Updated: To connect the gap between the Data & Forecast,
# assign the last non-NA row of Data column to the corresponding row of other columns
lastNonNAinData <- max(which(complete.cases(df1$Data)))
df1[lastNonNAinData, !(colnames(df1) %in% c("Data", "Fitted", "Date"))] <- df1$Data[lastNonNAinData]
# Or: use [geom_segment](http://ggplot2.tidyverse.org/reference/geom_segment.html)
plt1 <- ggplot(df1, aes(x = Date)) +
ggtitle("Ticket amount") +
xlab("Time frame") + ylab("Quantity") +
geom_ribbon(aes(ymin = Low95, ymax = High95, fill = "95%")) +
geom_ribbon(aes(ymin = Low80, ymax = High80, fill = "80%")) +
geom_point(aes(y = Data, colour = "Data"), size = 4) +
geom_line(aes(y = Data, group = 1, colour = "Data"),
linetype = "dotted", size = 0.75) +
geom_line(aes(y = Fitted, group = 2, colour = "Fitted"), size = 0.75) +
geom_line(aes(y = Forecast, group = 3, colour = "Forecast"), size = 0.75) +
scale_x_date(breaks = scales::pretty_breaks(), date_labels = "%b %y") +
scale_colour_brewer(name = "Legend", type = "qual", palette = "Dark2") +
scale_fill_brewer(name = "Intervals") +
guides(colour = guide_legend(order = 1), fill = guide_legend(order = 2)) +
theme_bw(base_size = 14)
plt1