Related
I am simply trying to get every month of data to plot the same color.
Plotly is falsely plotting lines that connect each factor color together, generating a giant mess on the chart.
connectgaps = FALSE does not work, because the gaps are not in the data. They are being generated by plotly on the ~factor(month), which colors each month the same.
This has something to do with multiple observations of the factor, rather than just one. If the factor was based on year, rather than month, no line traces would occur.
How can I remove these lines? Thank you!
# Reproducible example: one random value per day, coloured by calendar month.
library(tidyverse)
library(lubridate)
library(plotly)

start_date <- as.Date("2020-1-1")
end_date <- as.Date("2023-2-5")

# One row per day in the range; the number of random draws (1132) has to
# match the number of days for data.frame() to combine the two tables.
d1 <- tibble::tibble(date = seq(min(start_date), max(end_date), by = "1 day"))
d2 <- tibble::tibble(rnorm(1132, 2))

# Combine, derive the month number, then give the columns their final names.
d3 <- setNames(
  mutate(data.frame(d1, d2), month = month(date)),
  c("date", "var", "month")
)

# One trace per month; this is the call that produces the unwanted
# connecting lines described above.
plot_ly(
  d3,
  x = ~date,
  y = ~var,
  color = ~factor(month),
  type = "scatter",
  mode = "lines",
  connectgaps = FALSE
)
You could add another column which is the year so you can group on each year using group_by to prevent connecting the lines per month for each year like this:
library(lubridate)
library(dplyr)
library(plotly)

start_date <- as.Date("2020-1-1")
end_date <- as.Date("2023-2-5")

# One row per calendar day in the requested range.
d1 <- tibble::tibble(date = seq(min(start_date), max(end_date), by = "1 day"))

# Draw exactly one value per date. The count is derived from d1 instead of
# being hard-coded, so changing start_date/end_date cannot cause a row-count
# mismatch in data.frame() below.
d2 <- tibble::tibble(var = rnorm(nrow(d1), 2))

# Columns are named at creation, so no positional colnames() call is needed.
d3 <- data.frame(d1, d2) %>%
  mutate(
    month = month(date),
    year = year(date)
  )

# Grouping by year before plotting stops plotly from joining the same month
# across different years with a straight line.
d3 %>%
  group_by(year) %>%
  plot_ly(
    x = ~date,
    y = ~var,
    type = "scatter",
    mode = "lines",
    color = ~factor(month)
  )
Created on 2023-02-05 with reprex v2.0.2
Idea nr. 2:
# Idea 2: overlay the years on a shared day-of-year axis.
# NOTE(review): relies on `d3` (with columns date, var, month) having been
# created by an earlier snippet -- confirm it is in scope before running.
library(dplyr)
library(lubridate)
library(plotly)
d3 %>%
# add the calendar year so it can be used as the grouping variable
mutate(year = year(date)) %>%
group_by(year) %>%
# x is the day of the year, so every year starts at the left edge of the plot
plot_ly(x = ~ yday(date)) %>%
# one coloured line per month
add_lines(y = ~var,
color = ~ factor(month))
To answer your question about the lines:
In ggplot when we use group = 1 like:
# group = 1 places every observation in a single group, so geom_line() draws
# one continuous path while the colour is still mapped to the month.
# NOTE(review): needs ggplot2 attached and `d3` from the earlier snippet.
ggplot(d3, aes(x = date, y = var, group = 1, color = factor(month)))+
geom_line()
We get:
I am trying to compare different years' variables but I am having trouble plotting them together.
The time series is a temperature series which can be found in https://github.com/gonzalodqa/timeseries as temp.csv
I would like to plot something like the image but I find it difficult to subset the months between the years and then combine the lines in the same plot under the same months
If someone can give some advice or point me in the right direction I would really appreciate it
You can try this way.
The first chart shows all the available temperatures, the second chart is aggregated by month.
In the first chart, we force the same year so that ggplot will plot them aligned, but we separate the lines by colour.
For the second one, we just use month as x variable and year as colour variable.
Note that:
with scale_x_datetime we can hide the year so that no one can see that we forced the year 2020 to every observation
with scale_x_continuous we can show the names of the months instead of the numbers
[just try to run the charts with and without scale_x_... to understand what I'm talking about]
month.abb is a useful default variable for months names.
# libraries
library(ggplot2)
library(dplyr)

# read data (semicolon-separated file)
df <- readr::read_csv2("https://raw.githubusercontent.com/gonzalodqa/timeseries/main/temp.csv")

# Chart 1: every observation, with all years overlaid on one calendar.
df %>%
  # rebuild the timestamp with a fixed year (2020) so the years line up on x
  mutate(datetime = lubridate::make_datetime(2020, month, day, hour, minute, second)) %>%
  ggplot() +
  geom_line(
    mapping = aes(x = datetime, y = T42, colour = factor(year))
  ) +
  # one break at the start of each month, labelled with the month name only
  scale_x_datetime(
    breaks = lubridate::make_datetime(2020, 1:12),
    labels = month.abb
  ) +
  labs(title = "Temperature by Datetime", colour = "Year")

# Chart 2: monthly means, one line per year.
df %>%
  group_by(year, month) %>%
  summarise(T42 = mean(T42, na.rm = TRUE), .groups = "drop") %>%
  ggplot() +
  geom_line(
    mapping = aes(x = month, y = T42, colour = factor(year))
  ) +
  # show month names instead of 1..12 and hide the in-between grid lines
  scale_x_continuous(
    breaks = 1:12,
    labels = month.abb,
    minor_breaks = NULL
  ) +
  labs(title = "Average Temperature by Month", colour = "Year")
In case you want your chart to start from July, you can use this code instead:
# Month numbers in display order: July .. December, then January .. June.
months_order <- c(7:12, 1:6)

# Monthly means plotted as July-to-June seasons.
df %>%
  # average by year-month
  group_by(year, month) %>%
  summarise(T42 = mean(T42, na.rm = TRUE), .groups = "drop") %>%
  # a new season id starts at every July
  group_by(neworder = cumsum(month == 7)) %>%
  # drop seasons that do not have all twelve months
  filter(n() == 12) %>%
  # label each season with the calendar years it spans
  mutate(years = paste(unique(year), collapse = " / ")) %>%
  ungroup() %>%
  # turn the month number into an ordered factor that starts in July
  mutate(
    month = factor(
      month,
      levels = months_order,
      labels = month.abb[months_order],
      ordered = TRUE
    )
  ) %>%
  ggplot() +
  geom_line(
    mapping = aes(x = month, y = T42, colour = years, group = years)
  ) +
  labs(title = "Average Temperature by Month", colour = "Year")
EDIT
To have something similar to the first plot but starting from July, you could use the following code:
# Datetime chart (all observations) on a July-to-June axis.
# NOTE(review): expects `df` (columns year, month, day, hour, minute, second,
# T42) to have been read by an earlier snippet -- confirm before running.
# libraries
library(ggplot2)
library(dplyr)
library(lubridate)
# custom months order
months_order <- c(7:12,1:6)
# fake dates for plot
# note: choose 4 to include 29 Feb which exist only in leap years
# axis breaks: July-December fall in fake year 3, January-June in fake year 4
dates <- make_datetime(c(rep(3,6), rep(4,6)), months_order)
# line chart by datetime
df %>%
# create date time
mutate(datetime = make_datetime(year, month, day, hour, minute, second)) %>%
# filter years of interest
filter(datetime >= make_datetime(2018,7), datetime < make_datetime(2020,7)) %>%
# create increasing group after each july
group_by(year, month) %>%
# TRUE only on the earliest timestamp of each July
mutate(dummy = month(datetime) == 7 & datetime == min(datetime)) %>%
ungroup() %>%
# running count of those flags = season id (increments at every July)
mutate(dummy = cumsum(dummy)) %>%
# force unique years and create custom name
group_by(dummy) %>%
# shift every timestamp into fake year 4, then July-December one more year
# back into fake year 3, so each season runs left to right on the axis
mutate(datetime = datetime - years(year - 4) - years(month>=7),
years = paste(unique(year), collapse = " / ")) %>%
ungroup() %>%
# plot
ggplot() +
geom_line(aes(x = datetime, y = T42, colour = years)) +
# label the fake-year breaks with month names only
scale_x_datetime(breaks = dates, labels = month.abb[months_order]) +
labs(title = "Temperature by Datetime", colour = "Year")
To order the months differently and summarise the values over pairs of years, you have to work a bit with your data before plotting it:
library(dplyr) # work data
library(ggplot2) # plots
library(lubridate) # date
library(readr) # fetch data

# your data
df <- read_csv2("https://raw.githubusercontent.com/gonzalodqa/timeseries/main/temp.csv")

# Month labels in plotting order (July first). They come from base R's
# `month.abb`, so the labels are always the English abbreviations and no
# session-wide Sys.setlocale() call is needed. The locale name "English" is
# not valid on every platform, and setting it changes global state.
july_first <- month.abb[c(7:12, 1:6)]

df %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(
    # reorder month
    month_2 = factor(month.abb[month(date)], levels = july_first),
    # group years as you like
    year_2 = ifelse(year(date) %in% 2018:2019, "2018/2019", "2020/2021")
  ) %>%
  # you can put whatever aggregation function you need
  summarise(val = mean(T42, na.rm = TRUE), .groups = "drop") %>%
  # plot it!
  ggplot(aes(x = month_2, y = val, color = year_2, group = year_2)) +
  geom_line() +
  ylab("T42") +
  xlab("month") +
  theme_light()
A slightly different solution without the all dates to 2020 trick.
library(tidyverse)
library(lubridate)

df <- read_csv2("https://raw.githubusercontent.com/gonzalodqa/timeseries/main/temp.csv")

# Build a "MM-DD" key that is shared by all years.
# Both parts are zero-padded to exactly two digits. Unconditionally prefixing
# "0" turned day 10 into "010", so the discrete axis sorted 1, 10-19, 2, ...
df <- df |>
  filter(year %in% c(2018, 2019, 2020)) |>
  mutate(
    year = factor(year),
    month = sprintf("%02d", month),
    day = sprintf("%02d", day),
    month_day = paste0(month, "-", day)
  )

# One line per year; only the first day of each month is labelled on the axis.
df |>
  ggplot(aes(x = month_day, y = T42, group = year, col = year)) +
  geom_line() +
  scale_x_discrete(breaks = sprintf("%02d-01", 1:12))
Let's say I have a data.frame consisting of industry type and starting and ending dates (e.g. for an employee).
# Example data: one row per employment spell, with its industry and the
# first and last date of the spell (stored as "YYYY-MM-DD" text).
mydf <- data.frame(
  industry = c(
    "Government", "Education", "Military",
    "Private Sector", "Government", "Private Sector"
  ),
  start_date = c(
    "2014-01-01", "2016-02-01", "2012-11-01",
    "2013-03-01", "2012-12-01", "2011-12-01"
  ),
  end_date = c(
    "2020-12-01", "2016-10-01", "2014-01-01",
    "2016-10-01", "2015-10-01", "2014-09-01"
  )
)
> mydf
industry start_date end_date
1 Government 2014-01-01 2020-12-01
2 Education 2016-02-01 2016-10-01
3 Military 2012-11-01 2014-01-01
4 Private Sector 2013-03-01 2016-10-01
5 Government 2012-12-01 2015-10-01
6 Private Sector 2011-12-01 2014-09-01
I'd like to create a stacked ggplot bar chart in which each unique year in the start_date column is on the X axis (e.g. 2011-2016) and the y axis represents the total number of observations (the row count) represented in a given industry for that year.
I'm not sure what the right way to manipulate the data.frame is to allow for this. Presumably I'd need to reshape the data to have columns for industry, year and count. But I'm not sure how to produce the year column from a date range. Any ideas?
Convert the date columns to Date, create the 'date' sequence from the 'start_date' to 'end_date' for each row with map2 (from purrr), unnest the list output, count the year and plot with geom_bar
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(lubridate) # provides year(), which the pipeline below relies on

# Stacked bars: number of rows (spells) per industry that are active in
# each calendar year.
mydf %>%
  mutate(
    across(c(start_date, end_date), as.Date),
    # remember which original row every expanded date came from
    row_id = row_number()
  ) %>%
  # expand every spell into one row per day it covers
  transmute(
    row_id,
    industry,
    date = map2(start_date, end_date, seq, by = "day")
  ) %>%
  unnest(c(date)) %>%
  # keep one record per original row and year, so the bar height is the
  # number of observations active in that year rather than a number of days
  distinct(row_id, industry, year = factor(year(date))) %>%
  count(industry, year) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  geom_col() +
  theme_bw()
If the plot should be separate for each 'industry'
# Same counts as a stacked chart would show, drawn in one panel per industry.
# lubridate::year() is called with its namespace so this snippet does not
# depend on lubridate having been attached elsewhere.
mydf %>%
  mutate(
    across(c(start_date, end_date), as.Date),
    # remember which original row every expanded date came from
    row_id = row_number()
  ) %>%
  # expand every spell into one row per day it covers
  transmute(
    row_id,
    industry,
    date = map2(start_date, end_date, seq, by = "day")
  ) %>%
  unnest(c(date)) %>%
  # keep one record per original row and year, so n counts observations
  # active in that year rather than the number of days
  distinct(row_id, industry, year = factor(lubridate::year(date))) %>%
  count(industry, year) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  geom_col() +
  facet_wrap(~ industry) +
  theme_bw()
-output
As @IanCampbell suggested, the by argument of seq can be 'year'
# Yearly expansion: one generated date per calendar year covered by a spell.
mydf %>%
  mutate(across(c(start_date, end_date), as.Date)) %>%
  transmute(
    industry,
    # Start every sequence on 1 January of the start year. Stepping by year
    # from the raw start date skips the final year whenever the end date
    # falls earlier in the year than the start date
    # (e.g. 2012-11-01 .. 2014-01-01 would never reach 2014).
    date = map2(
      lubridate::floor_date(start_date, "year"),
      end_date,
      seq,
      by = "year"
    )
  ) %>%
  unnest(c(date)) %>%
  count(industry, year = factor(lubridate::year(date))) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  geom_col() +
  facet_wrap(~ industry) +
  theme_bw()
Is this what you're looking for?
I would recommend using purrr::pmap to create a new data frame with one row for each year based on each row of the original data.
We can use the purrr::pmap_dfr to automatically return a single data frame bound by row.
We can use the ~with(list(...), ) trick to be able to reference columns by name.
Then we can use dplyr::count to count by combinations of columns. Then it's easy.
library(dplyr)
library(purrr)
library(lubridate)
library(ggplot2) # the package is called ggplot2; library(ggplot) fails

# One output row per (original row, calendar year covered), then count them.
mydf %>%
  mutate(
    across(c(start_date, end_date), as.Date),
    start_year = year(start_date),
    end_year = year(end_date)
  ) %>%
  # pmap_dfr() calls the function once per row and row-binds the results;
  # with(list(...), ) lets the columns be referenced by name inside it
  pmap_dfr(
    ~ with(
      list(...),
      data.frame(industry, year = seq(start_year, end_year))
    )
  ) %>%
  count(year, industry) %>%
  ggplot(aes(x = year, y = n, fill = industry)) +
  # bar heights are the pre-computed counts in n
  geom_col()
I would like to produce a spaghetti plot where I can see the days of the year on the x-axis and the data on the y-axis for each Year. I would then like a separate year that has data for only 3 months (PCPNewData) to be plotted on the same figure, but in a different color and with a bold line. Here is my sample code, which produces a graph (attached) where the data for each Year for a particular Day is stacked. I don't want a bar graph; I would like to have a line graph. Thanks
library(tidyverse)
library(tidyr)

# Historical period: one row per day, split into Year / Month / Day text columns.
myDates <- data.frame(
  Date = seq(as.Date("2000-01-01"), to = as.Date("2010-12-31"), by = "days")
)
Dates <- separate(myDates, Date, sep = "-", into = c("Year", "Month", "Day"))

# Most recent period (three months), split the same way.
LatestDate <- data.frame(
  Date = seq(as.Date("2011-01-01"), to = as.Date("2011-03-31"), by = "days")
)
NewDate <- separate(LatestDate, Date, sep = "-", into = c("Year", "Month", "Day"))

# Random precipitation values, one per day of each period.
PCPDataHis <- data.frame(total_precip = runif(4018, 0, 70), Dates)
PCPNewData <- data.frame(total_precip = runif(90, 0, 70), NewDate)

# Long format: every column except Year / Month / Day is stacked into
# Variable / Value.
PCPDataHisPlot <- PCPDataHis %>%
  group_by(Year) %>%
  gather(key = "Variable", value = "Value", -Year, -Day, -Month)

# Historical years coloured by Year, with the recent period as an extra layer.
ggplot(PCPDataHisPlot, aes(Day, Value, colour = Year)) +
  geom_line() +
  geom_line(data = PCPNewData, aes(Day, total_precip))
I would like to have a Figure like below where each line represent data for a particular year
UPDATE:
I draw my desired figure with hand (see attached). I would like to have all the days of the Years on x-axis with its data on the y-axis
You have a few errors in your code.
First, your days are in character format. You need to pass them in a numerical format for the line to be continuous.
Then, you have multiple data points for each day (because you have 12 months per year), so you need to summarise these data a little:
# Collapse both tables to one mean value per year and day-of-month, so that
# each line has a single y value for every x position.
# NOTE(review): Pelly2Data and Pelly2_2011_3months are not defined in this
# snippet -- presumably the asker's long-format historical table and the
# 3-month table; confirm their column names (year, day, Value, total_precip).
Pel2 <- Pelly2Data %>%
  group_by(year, day) %>%
  summarise(Value = mean(Value, na.rm = TRUE), .groups = "drop")

Pel3 <- Pelly2_2011_3months %>%
  group_by(year, day) %>%
  summarise(total_precip = mean(total_precip, na.rm = TRUE), .groups = "drop")

ggplot(Pel2, aes(as.numeric(day), Value, color = year)) +
  geom_line() +
  # draw the summarised 3-month table; it was computed above but the raw
  # table was being plotted instead
  geom_line(data = Pel3, aes(as.numeric(day), y = total_precip), size = 2)
It looks better but it is hard to apply a specific color pattern
In my opinion, it will be less confusing if you compare the mean of each dataset, such as:
library(tidyverse)

# Mean and standard error of the mean per day for the full record.
Pel2 <- Pelly2Data %>%
  group_by(day) %>%
  summarise(
    Mean = mean(Value, na.rm = TRUE),
    SEM = sd(Value, na.rm = TRUE) / sqrt(n())
  ) %>%
  mutate(Name = "Pel_ALL")

# The same two statistics for the three-month record.
Pel3 <- Pelly2_2011_3months %>%
  group_by(day) %>%
  summarise(
    Mean = mean(total_precip, na.rm = TRUE),
    SEM = sd(total_precip, na.rm = TRUE) / sqrt(n())
  ) %>%
  mutate(Name = "Pel3")

# Stack both summaries; Name tells them apart.
Pel <- bind_rows(Pel2, Pel3)

# Mean line per dataset with a translucent mean +/- SEM band.
ggplot(Pel, aes(x = as.numeric(day), y = Mean, color = Name)) +
  geom_ribbon(
    aes(ymin = Mean - SEM, ymax = Mean + SEM),
    alpha = 0.2
  ) +
  geom_line(size = 2)
EDIT: New graph based on update
To get the graph you posted as a drawing, you need the day of the year and not the day of the month. We can get this information by building a date sequence and extracting the day of the year with the yday function from the `lubridate` package.
library(tidyverse)
library(lubridate)

# Full record: attach a daily date sequence, its day of the year, and a label.
Pelly2$Date <- seq(ymd("1990-01-01"), ymd("2010-12-31"), by = "day")
Pelly2$Year_day <- yday(Pelly2$Date)
Pelly2$Dataset <- "ALL"

# Three-month record: the same three columns.
Pelly2_2011_3months$Date <- seq(ymd("2011-01-01"), ymd("2011-03-31"), by = "day")
Pelly2_2011_3months$Year_day <- yday(Pelly2_2011_3months$Date)
Pelly2_2011_3months$Dataset <- "2011_Dataset"

# Stack the two records into one table for plotting.
Pel <- bind_rows(Pelly2, Pelly2_2011_3months)
Then, you can combine both datasets and represent them with different colors, sizes and transparency (alpha), as shown here:
# One line per year on a day-of-year axis; line width and transparency are
# driven by which dataset a row belongs to.
ggplot(
  Pel,
  aes(
    x = Year_day,
    y = total_precip,
    color = year,
    size = Dataset,
    alpha = Dataset
  )
) +
  geom_line() +
  # values are assigned to the Dataset levels in order
  scale_size_manual(values = c(2, 0.5)) +
  scale_alpha_manual(values = c(1, 0.5))
Does it answer your question ?
I have been trying to plot min and max values of temperature. I actually wanted to plot using geom_area. My data can be downloaded from here.
library(dplyr)
library(ggplot2)

dat <- read.csv("energydata_complete.csv", stringsAsFactors = FALSE)

# renaming attributes meaningfully
dat <- dat %>%
  dplyr::rename(
    temp_kitchen = T1, temp_living = T2, temp_laundry = T3,
    temp_office = T4, temp_bath = T5, temp_build = T6, temp_iron = T7,
    temp_teen = T8, temp_parent = T9,
    hum_kitchen = RH_1, hum_living = RH_2, hum_laundry = RH_3,
    hum_office = RH_4, hum_bath = RH_5, hum_build = RH_6, hum_iron = RH_7,
    hum_teen = RH_8, hum_parent = RH_9
  )

# Parse the timestamp before deriving anything from it. months() only has
# methods for date and date-time classes, so calling it on the raw character
# column fails. as.POSIXct() with a format parses the text directly, which
# makes the separate strptime() step unnecessary.
dat$date <- as.POSIXct(dat$date, format = "%Y-%m-%d %H:%M:%S")

# Month name of every observation, stored as a factor.
dat$month <- as.factor(months(dat$date))
I have created another dataframe with month and min and max temperature values of each room.
# Lowest and highest temperature per month for each room.
temparature <- dplyr::summarise(
  group_by(dat, month),
  min_temp_kitch = min(temp_kitchen),
  max_temp_kitch = max(temp_kitchen),
  min_temp_living = min(temp_living),
  max_temp_living = max(temp_living),
  min_temp_laundry = min(temp_laundry),
  max_temp_laundry = max(temp_laundry),
  min_temp_iron = min(temp_iron),
  max_temp_iron = max(temp_iron),
  min_temp_office = min(temp_office),
  max_temp_office = max(temp_office),
  min_temp_bath = min(temp_bath),
  max_temp_bath = max(temp_bath),
  min_temp_parent = min(temp_parent),
  max_temp_parent = max(temp_parent),
  min_temp_teen = min(temp_teen),
  max_temp_teen = max(temp_teen)
)
Now I am trying to plot min and max temperature values from this dataframe for each room.
Below code didn't give any plot.
# Attempt 1: two stacked area layers, kitchen minimum and kitchen maximum.
ggplot(temparature, aes(x = month)) +
  geom_area(aes(y = min_temp_kitch), position = 'stack') +
  geom_area(aes(y = max_temp_kitch), position = 'stack')
Tried to create with geom_ribbon as below.
# Attempt 2: a single band between the kitchen minimum and maximum.
ggplot(
  temparature,
  aes(x = month, ymin = min_temp_kitch, ymax = max_temp_kitch)
) +
  geom_ribbon(color = 'blue', alpha = 0.5)
This has given
But I want a plot something similar to this with points for each value.
Can someone suggest how to do this please.
You don't need to change your dates to factor and need to make the temperature dataframe into long format :
library(dplyr)
library(ggplot2)
library(lubridate)

# Read the raw file, give the sensor columns descriptive names, and add the
# first day of each observation's month as a proper date (not a factor).
dat <- read.csv("energydata_complete.csv", stringsAsFactors = FALSE) %>%
  rename(
    temp_kitchen = T1, temp_living = T2, temp_laundry = T3,
    temp_office = T4, temp_bath = T5, temp_build = T6, temp_iron = T7,
    temp_teen = T8, temp_parent = T9,
    hum_kitchen = RH_1, hum_living = RH_2, hum_laundry = RH_3,
    hum_office = RH_4, hum_bath = RH_5, hum_build = RH_6, hum_iron = RH_7,
    hum_teen = RH_8, hum_parent = RH_9
  ) %>%
  mutate(month = floor_date(date(date), unit = 'months'))
# Monthly minimum and maximum temperature for each room (wide format:
# one row per month, one column per room and statistic).
temparature <- summarise(
  group_by(dat, month),
  min_temp_kitch = min(temp_kitchen),
  max_temp_kitch = max(temp_kitchen),
  min_temp_living = min(temp_living),
  max_temp_living = max(temp_living),
  min_temp_laundry = min(temp_laundry),
  max_temp_laundry = max(temp_laundry),
  min_temp_iron = min(temp_iron),
  max_temp_iron = max(temp_iron),
  min_temp_office = min(temp_office),
  max_temp_office = max(temp_office),
  min_temp_bath = min(temp_bath),
  max_temp_bath = max(temp_bath),
  min_temp_parent = min(temp_parent),
  max_temp_parent = max(temp_parent),
  min_temp_teen = min(temp_teen),
  max_temp_teen = max(temp_teen)
)
# Long format: one row per month and statistic. The former column name goes
# into temp_min_max and its value into Temp. pivot_longer() replaces the
# superseded gather().
temp2 <- temparature %>%
  tidyr::pivot_longer(
    cols = -month,
    names_to = "temp_min_max",
    values_to = "Temp"
  )

# Overlay (not stack) the kitchen minimum and maximum as two filled areas.
ggplot() +
  geom_area(
    data = temp2 %>%
      filter(temp_min_max %in% c("min_temp_kitch", "max_temp_kitch")),
    aes(x = month, y = Temp, fill = temp_min_max, color = temp_min_max),
    position = "identity"
  )