Let me first share some dummy data from which I want to prepare ggplot graphs.
library(tidyverse)
set.seed(1)
sample_size <- 1200
# random day/month/hour/minute components; impossible dates (e.g. 31 February)
# parse to NA and are dropped below via complete.cases()
dates <- sample(seq(1, 31), sample_size, replace = TRUE)
Monthss <- sample(seq(1, 12), sample_size, replace = TRUE)
hrs <- sample(seq(1, 23), sample_size, replace = TRUE)
minutes <- sample(seq(1, 59), sample_size, replace = TRUE)
date_time_vector <- paste0(dates, "-", Monthss, "-", 2022, " ", hrs, ":", minutes) |>
  lubridate::parse_date_time("dmy HM")
Conversion <- sample(c(TRUE, FALSE), sample_size, prob = c(0.25, 0.75), replace = TRUE)
df <- data.frame(Date = date_time_vector, Conversion_Status = Conversion)
df <- df |> mutate(Leads = round(runif(sample_size, min = 0, max = 10), digits = 0))
df <- df[complete.cases(df), ]
The code above gives me a data.frame with columns Date, Leads and Conversion_Status. I want to prepare a monthly column chart of total leads per day (for example, daily leads in January, daily leads in February, etc.). So I basically need to split the data by month and prepare one chart for each month. How can I prepare such charts?
I have tried the following:
bar_function <- function(df, col1, col2, title) {
  df %>%
    ggplot2::ggplot(aes(x = {{col1}}, y = {{col2}})) +
    ggplot2::geom_col(fill = "steelblue") +
    theme(plot.background = element_rect(fill = "white")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    coord_flip() +
    ggplot2::labs(title = title)
}
mycharts <- df |> dplyr::nest_by(Month) |> dplyr::mutate(plot = bar_function(df,Date,Leads,"Daily Leads by Month"))
But it is giving me errors.
You can split the data by month-year and plot each subset.
library(ggplot2)
library(lubridate)
set.seed(1)
sample_size <- 1200
dates <- sample(seq(1,31),sample_size,replace = TRUE)
Monthss <- sample(seq(1,12),sample_size,replace = TRUE)
hrs <- sample(seq(1,23),sample_size,replace = TRUE)
minutes <- sample(seq(1,59),sample_size,replace = TRUE)
date_time_vector <- paste0(dates,"-",Monthss,"-",2022," ",hrs,":",minutes) |> lubridate::parse_date_time("dmy HM")
Conversion <- sample(c(TRUE,FALSE),sample_size, prob = c(0.25,0.75), replace = TRUE)
df <- data.frame(Date = date_time_vector, Conversion_Status = Conversion)
df$Leads <- round(runif(sample_size, min = 0,max = 10),digits = 0)
df <- df[complete.cases(df), ]
df$month_year <- strftime(df$Date, format = "%m-%Y")
df.split <- split(df, f = df$month_year)
out <- vector("list", length(df.split))
names(out) <- names(df.split)
for (i in seq_along(df.split)) {
  out[[i]] <- ggplot(data = df.split[[i]], mapping = aes(x = Date, y = Leads)) +
    geom_col(fill = "steelblue") +
    theme(plot.background = element_rect(fill = "white")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    coord_flip() +
    labs(title = "Daily leads by month")
}
To plot, you can just print, e.g., out[[1]].
If you want to change the target columns dynamically, you can use aes_string for the mapping (though note that aes_string is deprecated in recent ggplot2 releases in favour of the .data pronoun). This can naturally be wrapped into sapply, and there are probably other ways of approaching the problem. The for loop is pretty agnostic, and I find it readable even by people who do not dabble in R (compared to, say, sapply).
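For reference, here is a minimal sketch of the .data-pronoun version, for when the column names arrive as character strings (the function name bar_function_chr is my own):
library(ggplot2)
# map columns supplied as strings via the .data pronoun
bar_function_chr <- function(df, xcol, ycol, title) {
  ggplot(df, aes(x = .data[[xcol]], y = .data[[ycol]])) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(title = title)
}
# e.g. bar_function_chr(df.split[[1]], "Date", "Leads", "Daily leads by month")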
There are some issues with your code. First, your dataset has no Month column, i.e. you have to add it, for which I use lubridate::month. Second, you are passing the dataset df to your bar function instead of the split data column from your nested df. Third, in the mutate step you have to wrap the result in list():
library(ggplot2)
library(dplyr, warn.conflicts = FALSE)
mycharts <- df |>
  nest_by(Month = lubridate::month(Date)) |>
  mutate(plot = list(bar_function(data, Date, Leads, "Daily Leads by Month")))
mycharts$plot[[1]]
mycharts$plot[[5]]
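If you also want to write each chart to disk, one possible approach (the file-naming scheme here is just illustrative) is purrr::walk2():
library(purrr)
# save one PNG per month, named after the Month column
walk2(mycharts$plot, mycharts$Month,
      ~ ggsave(paste0("leads_month_", .y, ".png"), plot = .x, width = 7, height = 5))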
I finally found an answer. I used the following code:
# the Month column must exist first (as noted in the answer above), e.g.:
df$Month <- lubridate::month(df$Date)
lapply(split(df, df$Month),
       function(x)
         ggplot(x, aes(x = Date, y = Leads)) +
         geom_col(fill = "steelblue") +
         coord_flip() +
         ggtitle(x$Month[1]))
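If it helps, the charts can also be laid out on a single page; a small sketch assuming the patchwork package is installed (note the lapply result must be assigned first):
library(patchwork)
monthly_plots <- lapply(split(df, df$Month), function(x)
  ggplot(x, aes(x = Date, y = Leads)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    ggtitle(x$Month[1]))
wrap_plots(monthly_plots, ncol = 4)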
Thank you all for your support.
I have a time series with several days of data. I need to find the day with the maximum number of outliers and plot only that day's data.
Here is how I do it:
# generate sample data
library(tibble)
library(ggplot2)
Sys.setlocale("LC_ALL", "English")
Values <- rpois(24241, lambda = 75)
start <- as.POSIXct("2012-01-15 06:10:00")
interval <- 15 # seconds between observations
end <- start + as.difftime(4, units = "days") + as.difftime(5, units = "hours")
DateTimes <- seq(from = start, by = interval, to = end)
cpu_df <- tibble(datetime = DateTimes, Value = Values)
# find and plot outliers of all days ========================================
upper_bound <- quantile(cpu_df$Value, 0.975)
outlier_ind <- which(cpu_df$Value > upper_bound)
cpu_df_susp <- cpu_df[outlier_ind, ]
alldays_plot <- ggplot(data = cpu_df, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = cpu_df_susp, color = "red", size = 1) +
  geom_hline(yintercept = upper_bound, linetype = "dashed", color = "red") +
  theme_bw() +
  labs(x = "", title = paste0("% Processor Time, _Total, Percentile: 0.975, Threshold: ", round(upper_bound, 2)))
# ========== convert to xts ====================================================
suppressMessages(library(xts))
cpu_df_xts <- xts(x = cpu_df$Value, order.by = cpu_df$datetime)
days <- split(cpu_df_xts, f="days")
#========= find worst day - with biggest number of outliers
outliers_number <- 0
worstday_index <- 0
for (i in seq_along(days)) {
  upper_bound <- quantile(coredata(days[[i]]), 0.975)
  outlier_ind <- which(coredata(days[[i]]) > upper_bound)
  outlier_day_number <- length(outlier_ind)
  if (outlier_day_number > outliers_number) {
    worstday_index <- i
    outliers_number <- outlier_day_number
    worst_day_outliers_ind <- outlier_ind
  }
}
WorstDay <- days[[worstday_index]]
# recompute the threshold for the worst day: after the loop, upper_bound
# still holds the last day's quantile, not necessarily the worst day's
upper_bound <- quantile(coredata(WorstDay), 0.975)
# find outliers of worst day ====================================================
worst_day_outliers <- WorstDay[worst_day_outliers_ind, ]
# convert xts back to tibble (as.numeric drops the xts matrix structure)
WorstDayTibble <- tibble(datetime = index(WorstDay),
                         Value = as.numeric(coredata(WorstDay)))
outliersTibble <- tibble(datetime = index(worst_day_outliers),
                         Value = as.numeric(coredata(worst_day_outliers)))
# plot worst day ====================================================
worstDay_Plot <- ggplot(data = WorstDayTibble, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = outliersTibble, color = "red", size = 1) +
  geom_hline(yintercept = upper_bound, linetype = "dashed", color = "red") +
  theme_bw() +
  labs(x = "", title = paste0("% Processor Time, _Total, Percentile: 0.975, Threshold: ", round(upper_bound, 2)))
library(ggpubr)
ggpubr::ggarrange(alldays_plot, worstDay_Plot)
Here is the result:
What I don't like in my code: to split the data into days and search through them, I have to convert it to xts, and to plot via ggplot2 I have to convert it back to a tibble. Is it possible to avoid that double conversion and make the code simpler?
You don't need to convert your data to xts and back. Keeping the data in a data frame/tibble, you can get the worst day using:
library(dplyr)
#Add date column
cpu_df <- cpu_df %>% mutate(date = as.Date(datetime))
#For each date count number of Value greater than 0.975 quantile
#and select the date with max outliers.
WorstDay <- cpu_df %>%
  group_by(date) %>%
  summarise(n = sum(Value > quantile(Value, 0.975))) %>%
  slice(which.max(n)) %>%
  left_join(cpu_df, by = 'date')
You can use this data for plotting.
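For completeness, here is a sketch of the plot itself, reusing the styling from the question; I assume the dashed threshold line should show the worst day's own 0.975 quantile, so it is recomputed from WorstDay:
library(ggplot2)
day_bound <- quantile(WorstDay$Value, 0.975)
ggplot(WorstDay, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = filter(WorstDay, Value > day_bound), color = "red", size = 1) +
  geom_hline(yintercept = day_bound, linetype = "dashed", color = "red") +
  theme_bw()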
I have been struggling with my data for a long time and I don't know how to solve my problem. I work on nutritional data, which can be simulated with this data set:
library(tidyverse)
library(lubridate)
# Used for data generation
groupFunction <- function(cat){
  case_when(
    cat == "apple" ~ "food",
    cat == "bread" ~ "food",
    cat == "cheese" ~ "food",
    cat == "chocolate" ~ "candy",
    cat == "water" ~ "drink",
    cat == "tea" ~ "drink"
  )
}
# Generate the data
set.seed(0)
fakeData <- tibble(
  id = c(rep("A", 10), rep("B", 10), rep("C", 10), rep("D", 10)),
  eaten_at = sample(seq(as.POSIXct('2020/01/01'), as.POSIXct('2020/01/05'), by="15 min"), 40),
  category = sample(rep(c("apple", "bread", "cheese", "chocolate", "water", "tea"), 10), 40),
  group = groupFunction(category),
  amount = sample(10:100, 40)
)
# For every id, for each day, every hour and each category: sum the eaten amount,
# and keep 0 eaten amounts so they are counted in the mean calculation in step 2!
# PROBLEM: we lose time intervals where a given id didn't eat anything, which will
# bias the mean calculation in step 2!
step1 <- fakeData %>%
  mutate(eaten_at_hour = hour(eaten_at)) %>%
  group_by(id, eaten_at, eaten_at_hour, category, group) %>%
  summarise(eaten_amount = sum(amount)) %>%
  ungroup() %>%
  complete(nesting(id, eaten_at, eaten_at_hour),
           nesting(category, group),
           fill = list(eaten_amount = 0))
# For every id, average the eaten amount over the days for every hour interval.
# As before, keep 0 counts so they are counted in the mean calculation in step 3!
step2 <- step1 %>%
  group_by(id, eaten_at_hour, category, group) %>%
  summarise(mean_per_id = mean(eaten_amount)) %>%
  ungroup() %>%
  complete(nesting(id, eaten_at_hour),
           nesting(category, group),
           fill = list(mean_per_id = 0))
# Mean over all ids
step3 <- step2 %>%
  group_by(eaten_at_hour, category, group) %>%
  summarise(mean_for_all = mean(mean_per_id)) %>%
  ungroup()
# Plot the data
ggplot(step3, aes(x = eaten_at_hour, y = category, color = mean_for_all, shape = group)) +
  geom_point(size = 3) +
  scale_color_gradient(low = "blue", high = "red", "Mean eaten\namount [g]")
What I want to build is a plot with a 1 h time interval on the x axis and the different food categories on the y axis, showing the mean eaten amount over all ids for each X-minute period over 24 h (i.e. the time interval must be flexible). I would like a plot looking like this:
My thought was to compute:
for every id,
for every day this id has eaten something,
for every time interval of X hour (even if the id didn't eat anything),
and for every food category:
-> sum the eaten amount
Then:
for each id,
for each category,
and for each 1 hour interval over the days of participation:
-> average the eaten amount
Then:
-> average all id so that we get for each category and for each 1 hour interval of a 24h period, the mean eaten amount
For this I use the group_by(), nesting() and complete() functions. But I have 3 problems:
I want to be able to set the desired time interval; it could be 15 min but also 2 hours. I haven't found a solution to this yet.
I need all time intervals for all ids even if they didn't eat anything (so amount = 0), because when I average over the days or across ids, the mean would be biased if the zero counts were left out.
My actual data set has about 100k rows, so I think my approach is not the most efficient. Furthermore, I want to build a Shiny app for this data where a user could set the time interval manually, which means the plot must be recomputed again and again (a lot of work for a computer when the code is inefficient...).
I'm aware that my question is totally oriented towards a specific problem, but since I'm really blocked I would highly appreciate any help/inputs/ideas on one or both of my questions. Thanks a lot!
I'm not sure I fully understood your problem, but here is a draft of an answer.
First, a tricky way to group on an interval is to floor the hour (from lubridate::hour) divided by the step and then multiply the result by the step. Then I grouped by id, hour, and group to sum, and then by hour and group only to compute the mean.
eaten_n_hours = 2
df = fakeData %>%
  mutate(hour = floor(hour(eaten_at)/eaten_n_hours)*eaten_n_hours) %>%
  group_by(id, hour, group) %>%
  summarise(amount = sum(amount, na.rm=TRUE)) %>%
  group_by(hour, group) %>%
  summarise(amount_m = mean(amount, na.rm=T),
            amount_sd = sd(amount, na.rm=T)) %>%
  identity()
Then, you can plot the whole thing like this:
breaks_hour = seq(min(df$hour), max(df$hour)+1, eaten_n_hours)
ggplot(df, aes(x=hour, y=amount_m, group=group, color=group, fill=group)) +
  geom_col(position="dodge") +
  # geom_errorbar(aes(ymin=amount_m-amount_sd, ymax=amount_m+amount_sd), position="dodge") +
  scale_x_binned(breaks=breaks_hour)
This is not the prettiest plot ever, but I'm not sure whether that is due to my misunderstanding of the problem or to the example fakeData.
EDIT
I'm not familiar with tiles, but you can try geom_tile this way. Also, scales::breaks_width allows a flexible time interval.
ggplot(df, aes(x=hour, y=group, fill=amount_m)) +
  geom_tile() +
  scale_x_binned(breaks=scales::breaks_width(3)) # try other values
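Regarding your problem 2 (keeping zero counts), here is a sketch of how tidyr::complete() could be slotted in between the sum and the mean, so that intervals where an id ate nothing still pull the average down:
library(tidyr)
eaten_n_hours = 2
df_zeros = fakeData %>%
  mutate(hour = floor(hour(eaten_at)/eaten_n_hours)*eaten_n_hours) %>%
  group_by(id, hour, group) %>%
  summarise(amount = sum(amount, na.rm = TRUE), .groups = "drop") %>%
  # add a zero row for every id/interval/group combination that never occurred
  complete(id, hour = seq(0, 23, by = eaten_n_hours), group,
           fill = list(amount = 0)) %>%
  group_by(hour, group) %>%
  summarise(amount_m = mean(amount), .groups = "drop")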
So I figured out a way to do this (thanks @Dan Chaltiel). It's surely not perfect, but I'll post it here so it can be useful to others or for the discussion:
library(tidyverse)
library(lubridate)
# Used for data generation
groupFunction <- function(cat){
  case_when(
    cat == "apple" ~ "food",
    cat == "bread" ~ "food",
    cat == "cheese" ~ "food",
    cat == "chocolate" ~ "candy",
    cat == "water" ~ "drink",
    cat == "tea" ~ "drink"
  )
}
# Generate the data
set.seed(0)
fakeData <- tibble(
  id = c(rep("A", 10), rep("B", 10), rep("C", 10), rep("D", 10)),
  eaten_at = sample(seq(as.POSIXct('2020/01/01 22:00:00'), as.POSIXct('2020/01/05'), by="17 min"), 40),
  category = sample(rep(c("apple", "bread", "cheese", "chocolate", "water", "tea"), 10), 40),
  group = groupFunction(category),
  amount = sample(10:100, 40)
)
# Set time interval in minutes here (0-60 min only):
set_time <- 60
# Generate a time sequence for one day (1440 minutes) with the desired interval,
# then use it as factor levels.
timeLevels <- seq(from = as.POSIXct("2020-1-1 0:00"), by = paste(set_time, "min", sep = " "), length.out = 1440/set_time)
timeLevels <- paste(hour(timeLevels), minute(timeLevels), sep = ":")
# Calculate the means, keeping zero counts
toPlot <- fakeData %>%
  mutate(eaten_hour = floor_date(eaten_at, unit = paste(set_time, "min", sep = " ")),
         eaten_hour = paste(hour(eaten_hour), minute(eaten_hour), sep = ":"),
         eaten_hour = factor(eaten_hour, levels = timeLevels),
         eaten_date = date(eaten_at)) %>%
  group_by(eaten_date, eaten_hour, category, group) %>%
  summarise(sum_amount = sum(amount)) %>%
  ungroup() %>%
  complete(eaten_date, eaten_hour, nesting(category, group), fill = list(sum_amount = 0)) %>%
  group_by(eaten_hour, category, group) %>%
  summarise(mean_amount = mean(sum_amount)) %>%
  ungroup()
# Plot the data
gg <- ggplot(toPlot, aes(x=eaten_hour, y=category, fill=mean_amount))
gg <- gg + geom_tile(color="white", size=0.1)
gg <- gg + coord_equal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
gg <- gg + labs(x = "Time of the day", y = NULL, title = "Mean eaten quantity over one day", fill = "Mean amount [g]")
gg
Output looks like:
Still open to any input on how to improve my code!
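One small input: format() can build the hour:minute labels directly instead of pasting hour() and minute() by hand; a sketch (untested against the full pipeline, and note the labels become zero-padded, e.g. "08:00" instead of "8:0"):
timeLevels <- format(seq(from = as.POSIXct("2020-01-01 00:00"),
                         by = paste(set_time, "min"),
                         length.out = 1440/set_time), "%H:%M")
# and in the mutate step:
# eaten_hour = factor(format(floor_date(eaten_at, unit = paste(set_time, "min")), "%H:%M"),
#                     levels = timeLevels)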
I have been trying to plot min and max values of temperature. I actually wanted to plot using geom_area. My data can be downloaded from here.
library(dplyr)
library(ggplot2)
dat <- read.csv("energydata_complete.csv", stringsAsFactors = FALSE)
# rename attributes meaningfully
dat <- dat %>%
  dplyr::rename('temp_kitchen'=T1, 'temp_living'=T2, 'temp_laundry'=T3,
                'temp_office'=T4, 'temp_bath'=T5, 'temp_build'=T6, 'temp_iron'=T7,
                'temp_teen'=T8, 'temp_parent'=T9, 'hum_kitchen'=RH_1, 'hum_living'=RH_2,
                'hum_laundry'=RH_3, 'hum_office'=RH_4, 'hum_bath'=RH_5, 'hum_build'=RH_6,
                'hum_iron'=RH_7, 'hum_teen'=RH_8, 'hum_parent'=RH_9)
# convert date before deriving month, otherwise months() runs on a character vector
dat$date <- as.POSIXct(dat$date, format = "%Y-%m-%d %H:%M:%S")
dat$month <- as.factor(months(dat$date))
I have created another data frame with the month and the min and max temperature values of each room.
temparature <- dat %>%
  group_by(month) %>%
  dplyr::summarise(min_temp_kitch = min(temp_kitchen),
                   max_temp_kitch = max(temp_kitchen),
                   min_temp_living = min(temp_living),
                   max_temp_living = max(temp_living),
                   min_temp_laundry = min(temp_laundry),
                   max_temp_laundry = max(temp_laundry),
                   min_temp_iron = min(temp_iron),
                   max_temp_iron = max(temp_iron),
                   min_temp_office = min(temp_office),
                   max_temp_office = max(temp_office),
                   min_temp_bath = min(temp_bath),
                   max_temp_bath = max(temp_bath),
                   min_temp_parent = min(temp_parent),
                   max_temp_parent = max(temp_parent),
                   min_temp_teen = min(temp_teen),
                   max_temp_teen = max(temp_teen))
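As an aside, if you are on dplyr 1.0 or later, across() can generate all of these min/max pairs at once; a sketch (note the output columns are named e.g. temp_kitchen_min rather than min_temp_kitch, and this version also covers temp_build, which the manual list skips):
temparature <- dat %>%
  group_by(month) %>%
  summarise(across(starts_with("temp_"), list(min = min, max = max)))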
Now I am trying to plot the min and max temperature values from this data frame for each room.
The code below didn't give any plot:
ggplot() +
  geom_area(data = temparature, aes(x = month, y = min_temp_kitch), position = 'stack') +
  geom_area(data = temparature, aes(x = month, y = max_temp_kitch), position = 'stack')
I tried geom_ribbon as below:
ggplot(temparature) +
  geom_ribbon(aes(x = month, ymin = min_temp_kitch, ymax = max_temp_kitch), color = 'blue', alpha = 0.5)
This gave the following plot.
But I want something similar to this, with points for each value.
Can someone suggest how to do this, please?
You don't need to change your dates to a factor, but you do need to reshape the temperature data frame into long format:
library(dplyr)
library(ggplot2)
library(lubridate)
dat <- read.csv("energydata_complete.csv", stringsAsFactors = FALSE)
dat <- dat %>%
  rename('temp_kitchen'=T1, 'temp_living'=T2, 'temp_laundry'=T3,
         'temp_office'=T4, 'temp_bath'=T5, 'temp_build'=T6, 'temp_iron'=T7,
         'temp_teen'=T8, 'temp_parent'=T9, 'hum_kitchen'=RH_1, 'hum_living'=RH_2,
         'hum_laundry'=RH_3, 'hum_office'=RH_4, 'hum_bath'=RH_5, 'hum_build'=RH_6,
         'hum_iron'=RH_7, 'hum_teen'=RH_8, 'hum_parent'=RH_9) %>%
  mutate(month = floor_date(date(date), unit = 'months'))
temparature <- dat %>%
  group_by(month) %>%
  summarise(min_temp_kitch = min(temp_kitchen),
            max_temp_kitch = max(temp_kitchen),
            min_temp_living = min(temp_living),
            max_temp_living = max(temp_living),
            min_temp_laundry = min(temp_laundry),
            max_temp_laundry = max(temp_laundry),
            min_temp_iron = min(temp_iron),
            max_temp_iron = max(temp_iron),
            min_temp_office = min(temp_office),
            max_temp_office = max(temp_office),
            min_temp_bath = min(temp_bath),
            max_temp_bath = max(temp_bath),
            min_temp_parent = min(temp_parent),
            max_temp_parent = max(temp_parent),
            min_temp_teen = min(temp_teen),
            max_temp_teen = max(temp_teen))
temp2 <- temparature %>%
  tidyr::gather(temp_min_max, Temp, -month)
ggplot() +
  geom_area(data = temp2 %>%
              filter(temp_min_max %in% c('min_temp_kitch', 'max_temp_kitch')),
            aes(x = month, y = Temp, fill = temp_min_max, color = temp_min_max),
            position = 'identity')
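Two possible refinements, sketched here: gather() is superseded by tidyr::pivot_longer() in tidyr 1.0+, and the points the question asks for can be layered on with geom_point():
temp2 <- temparature %>%
  tidyr::pivot_longer(-month, names_to = "temp_min_max", values_to = "Temp")
temp2 %>%
  filter(temp_min_max %in% c('min_temp_kitch', 'max_temp_kitch')) %>%
  ggplot(aes(x = month, y = Temp, color = temp_min_max)) +
  geom_area(aes(fill = temp_min_max), position = 'identity', alpha = 0.4) +
  geom_point()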
I wish to plot the frequency of subscribers over time using start and end dates.
I have a method that creates a row for each day per subscriber, then calculates the frequency per day, then plots the frequency by day.
This works fine for small data but does not scale to large subscriber numbers, because the row-per-day-per-subscriber step is too big.
Is there an efficient method? Many thanks for any help.
library(ggplot2)
library(dplyr)
# create dummy dataset
subscribers <- data.frame(id = 1:10,
                          start = sample(seq(as.Date('2016/01/01'), as.Date('2016/06/01'), by = "day"), 10),
                          end = sample(seq(as.Date('2017/01/01'), as.Date('2017/06/01'), by = "day"), 10))
# creates a row for each day per user - OK for small datasets, but not scalable
date_map <- Map(seq, subscribers$start, subscribers$end, by = "day")
date_rows <- data.frame(
  org = rep.int(subscribers$id, vapply(date_map, length, 1L)),
  date = do.call(c, date_map))
# finds the frequency of users for each day
date_rows %>%
  group_by(date) %>%
  dplyr::summarise(users = n()) -> plot_data
ggplot(data = plot_data,
       aes(x = date, y = users)) +
  geom_line(size = 1.2, alpha = .6)
How's this? Treat each start as a +1 event and each end as a -1 event, sort by date, and take the cumulative sum: that gives the number of active subscribers at any point in time.
library(tidyverse)
df <- subscribers %>%
  gather(key, value, start, end) %>%
  mutate(key = ifelse(key == "start", 1, -1)) %>%
  arrange(value)
df$cum <- cumsum(df$key)
ggplot(data = df,
       aes(x = value, y = cum)) +
  geom_step()