I have fitted an ARIMA model using the fable R package. When I go to use the model to forecast the distribution using bootstrap resampled errors it returns all NAs.
# Convert `targets` to a tsibble (series identified by `key`, indexed by
# `time`) and fit ARIMA(y ~ x) through model().
ARIMA_model <- model(
  as_tsibble(targets, key = "key", index = "time"),
  ARIMA(y ~ x)
)

# Simulate 100 future paths over `scenarios` with bootstrap-resampled errors.
# This is the call the question reports as returning all NAs.
ARIMA_fable <- generate(
  ARIMA_model,
  new_data = scenarios,
  bootstrap = TRUE,
  times = 100
)
I can get it to run using forecast() but I want to see each ensemble member and the errors are not expected to be normally distributed.
# Forecast over `scenarios` with bootstrapping switched off.
ARIMA_fable <- forecast(ARIMA_model, new_data = scenarios, bootstrap = FALSE)
Here is a reproducible example:
# Three series, each observed on 15 consecutive days starting today.
key <- c("A", "B", "C")
time <- seq(Sys.Date(), by = 1, length.out = 15)

set.seed(123)
# 45 rows (15 dates x 3 keys); x and y are sorted uniform draws on [0, 30].
targets <- expand.grid(time = time, key = key) %>%
  mutate(
    x = sort(runif(45, 0, 30)),
    y = sort(runif(45, 0, 30))
  )

# Fit ARIMA(y ~ x) to the keyed tsibble.
ARIMA_model <- targets %>%
  as_tsibble(key = "key", index = "time") %>%
  model(ARIMA(y ~ x))

# Future scenarios: same keys, dates shifted 16 days ahead, fresh x and y.
test_scenarios <- targets %>%
  mutate(
    time = time + lubridate::days(16),
    x = sort(runif(45, 0, 30)),
    y = sort(runif(45, 0, 30))
  ) %>%
  as_tsibble(key = "key", index = "time")

# Forecast with bootstrapped errors.
ARIMA_model %>%
  forecast(new_data = test_scenarios, bootstrap = TRUE)
I still can't reproduce the NA forecasts that you get, however I have identified and fixed an issue in the code that could be causing issues here (https://github.com/tidyverts/fable/commit/685cc9ec7846a990d7c664f8eb24e4ad75e1673a).
This updated version can be installed with remotes::install_github("tidyverts/fable"), and with this version I am able to run the example you provided without issue:
library(fable)
#> Loading required package: fabletools
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Three series, each observed on 15 consecutive days starting today.
key <- c("A", "B", "C")
time <- seq(Sys.Date(), by = 1, length.out = 15)

set.seed(123)
# 45 rows (15 dates x 3 keys); x and y are sorted uniform draws on [0, 30].
targets <- expand.grid(time = time, key = key) %>%
  mutate(
    x = sort(runif(45, 0, 30)),
    y = sort(runif(45, 0, 30))
  )

# Fit ARIMA(y ~ x) to the keyed tsibble.
ARIMA_model <- targets %>%
  as_tsibble(key = "key", index = "time") %>%
  model(ARIMA(y ~ x))

# Future scenarios: same keys, dates shifted 16 days ahead, fresh x and y.
test_scenarios <- targets %>%
  mutate(
    time = time + lubridate::days(16),
    x = sort(runif(45, 0, 30)),
    y = sort(runif(45, 0, 30))
  ) %>%
  as_tsibble(key = "key", index = "time")

# Forecast with bootstrapped errors. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
ARIMA_model %>%
  forecast(new_data = test_scenarios, bootstrap = TRUE)
#> # A fable: 45 x 6 [1D]
#> # Key: key, .model [3]
#> key .model time y .mean x
#> <fct> <chr> <date> <dist> <dbl> <dbl>
#> 1 A ARIMA(y ~ x) 2022-10-02 sample[5000] 10.2 1.82
#> 2 A ARIMA(y ~ x) 2022-10-03 sample[5000] 11.1 2.73
#> 3 A ARIMA(y ~ x) 2022-10-04 sample[5000] 11.6 2.81
#> 4 A ARIMA(y ~ x) 2022-10-05 sample[5000] 11.9 3.92
#> 5 A ARIMA(y ~ x) 2022-10-06 sample[5000] 12.3 4.26
#> 6 A ARIMA(y ~ x) 2022-10-07 sample[5000] 13.0 4.27
#> 7 A ARIMA(y ~ x) 2022-10-08 sample[5000] 13.3 4.41
#> 8 A ARIMA(y ~ x) 2022-10-09 sample[5000] 14.3 4.63
#> 9 A ARIMA(y ~ x) 2022-10-10 sample[5000] 15.2 5.63
#> 10 A ARIMA(y ~ x) 2022-10-11 sample[5000] 15.6 6.59
#> # … with 35 more rows
Created on 2022-09-16 by the reprex package (v2.0.1)
Related
I have a data frame like
river
discharge
river1
500
river1
450
river1
200
river1
250
river2
375
river2
235
river2
130
river2
250
I want to apply the following list of functions to the column discharge:
# Named list of summary statistics. Each element takes a vector `x` and
# accepts (and ignores) any further arguments through `...`.
# The last element must not be followed by a comma: list() rejects an empty
# trailing argument.
f <- list(
  mean = function(x, ...) mean(x),
  Q50 = function(x, ...) lfquantile(x, exc.freq = 0.5),
  Q95 = function(x, ...) lfquantile(x, exc.freq = 0.95),
  Q90 = function(x, ...) lfquantile(x, exc.freq = 0.9),
  Q70 = function(x, ...) lfquantile(x, exc.freq = 0.7)
)
in the end I am supposed to have a table like this :
river
mean
Q50
Q95
Q90
Q70
river1
river2
rivern
I do not have any idea how to do that :(
If we have all the functions available, then use
library(dplyr)
library(purrr)
# Apply every function in `f` to `discharge` within each river. Each function
# yields a two-column table (river, <statistic>); the tables are then joined
# on `river`, so the key column appears once instead of being repeated for
# every statistic as a plain column-bind would do.
# NOTE(review): `df1` is assumed to be the data frame holding the `river` and
# `discharge` columns.
f %>%
  imap(function(fn, nm) {
    df1 %>%
      group_by(river) %>%
      reframe(!!nm := fn(discharge))
  }) %>%
  reduce(inner_join, by = "river")
You could use group_by() with summarize() and compute each statistic directly, so there is no need to write a list of functions:
library(dplyr)
# One row per river: the mean of discharge plus its 50th, 95th, 90th and 70th
# percentiles.
summarize(
  group_by(df, river),
  mean = mean(discharge),
  q50 = quantile(discharge, 0.50),
  q95 = quantile(discharge, 0.95),
  q90 = quantile(discharge, 0.90),
  q70 = quantile(discharge, 0.70)
)
and the output is:
river mean q50 q95 q90 q70
river1 350 350 492 485 455
river2 248 242 356 338 262
A base R approach. Replacing lfquantile with quantile for this example.
# Named list of summary functions: the mean, then one quantile function per
# probability. Every function accepts and ignores extra arguments via `...`.
func <- c(
  list(mean = function(x, ...) mean(x)),
  lapply(
    c(Q50 = 0.5, Q95 = 0.95, Q90 = 0.9, Q70 = 0.7),
    function(p) {
      force(p) # fix the probability for the closure returned below
      function(x, ...) quantile(x, probs = p)
    }
  )
)
# For each river, evaluate every function in `func` on that river's discharge
# values and label the results with the names of `func`. aggregate() stores
# them in a single matrix column; giving that column an empty name makes the
# printed header show only the statistic names.
setNames(
  aggregate(
    discharge ~ river,
    data = df,
    FUN = function(values) {
      stats <- sapply(names(func), function(nm) func[[nm]](values))
      setNames(stats, names(func))
    }
  ),
  c("river", "")
)
river mean Q50 Q95 Q90 Q70
1 river1 350.00 350.00 492.50 485.00 455.00
2 river2 247.50 242.50 356.25 337.50 262.50
Data
# Example data: four integer discharge readings for each of two rivers.
df <- data.frame(
  river = rep(c("river1", "river2"), each = 4),
  discharge = c(500L, 450L, 200L, 250L, 375L, 235L, 130L, 250L),
  stringsAsFactors = FALSE
)
library(tidyverse)
# Named list of summary functions: the mean, then one quantile function per
# probability. Every function accepts and ignores extra arguments via `...`.
func <- c(
  list(mean = function(x, ...) mean(x)),
  lapply(
    c(Q50 = 0.5, Q95 = 0.95, Q90 = 0.9, Q70 = 0.7),
    function(p) {
      force(p) # fix the probability for the closure returned below
      function(x, ...) quantile(x, probs = p)
    }
  )
)
# Per-river summary: run every function in `func` over `discharge`. The
# result columns are named after the elements of `func`.
summarise_at(
  group_by(df, river),
  .vars = vars(discharge),
  .funs = func
)
#> # A tibble: 2 × 6
#> river mean Q50 Q95 Q90 Q70
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 river1 350 350 492. 485 455
#> 2 river2 248. 242. 356. 338. 262.
# Same summary through across(): the result columns are named
# discharge_<function name>.
summarise(
  group_by(df, river),
  across(.cols = discharge, .fns = func)
)
#> # A tibble: 2 × 6
#> river discharge_mean discharge_Q50 discharge_Q95 discharge_Q90 discharge_Q70
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 river1 350 350 492. 485 455
#> 2 river2 248. 242. 356. 338. 262.
Created on 2023-02-08 with reprex v2.0.2
EDIT
since the quantile function is vectorized, you could do:
library(tidyverse)
# Summarise a numeric vector in one call.
#
# x: numeric vector.
# Returns a one-row data.frame with columns mean, Q50, Q95, Q90 and Q70,
# where Qnn is the nn-th percentile of `x` as computed by quantile().
func1 <- function(x) {
  labels <- paste0("Q", c(50, 95, 90, 70))
  # quantile() is vectorised over `probs`, so one call gives all four values
  qnts <- quantile(x, probs = c(0.5, 0.95, 0.9, 0.7), names = FALSE)
  names(qnts) <- labels
  data.frame(mean = mean(x), as.list(qnts))
}
# func1() returns a one-row data frame per river; `.unpack = TRUE` spreads
# its columns into discharge_<name> columns of the result.
summarise(
  df,
  across(discharge, func1, .unpack = TRUE),
  .by = river
)
#> river discharge_mean discharge_Q50 discharge_Q95 discharge_Q90 discharge_Q70
#> 1 river1 350.0 350.0 492.50 485.0 455.0
#> 2 river2 247.5 242.5 356.25 337.5 262.5
Created on 2023-02-08 with reprex v2.0.2
I would like to compute differences among several columns, per identifiers (see script below for reproducible example and target data frame).
This question is somewhat similar, but it only covers pairs of identifiers, and I can't think of how to adapt it.
I could also have several data frames, one per identifier, but in that case I also don't know how to compute differences between multiple columns.
The code below allows to create a sample dataset, and has the code I currently use. It gives me what I want, I'd just like to know if there is a way not to spell out all the differences I want to compute (in my dataset, I have more parameters and depths than in that sample data).
Thanks in advance for your help!
library(tidyverse)
# sample data
# Build one sample profile: temperature and oxygen at depths 1 to 3, with the
# three random values of each parameter sorted from largest to smallest.
#
# t: numeric offset(s) added to Sys.Date(); may be a vector.
# Returns a data.frame with columns parameter, date, depth and value. When
# `t` has several elements, data.frame() recycles the six
# parameter/depth/value entries against the longer `date` column.
create.dt <- function(t = 0) {
  data.frame(
    parameter = rep(c("temperature", "oxygen"), each = 3),
    date = rep(Sys.Date() + t, each = 6),
    depth = rep(1:3, times = 2),
    # sort() orders the draws directly; no temporary data frame is needed
    value = c(
      sort(rnorm(3, 16, 2), decreasing = TRUE),
      sort(rnorm(3, 7, 1), decreasing = TRUE)
    )
  )
}
# Multi-site dataset
# One create.dt() profile per site (A to E), each covering the offsets -3:0,
# stacked into a single data frame with a leading `site` column.
dt <- do.call(
  rbind,
  lapply(
    c("A", "B", "C", "D", "E"),
    function(site_id) cbind(site = site_id, create.dt(t = -3:0))
  )
)
# Reshape the data and compute differences
# Reshape to one row per site and date, with one column per parameter/depth
# combination (e.g. temperature_1), then compute the differences.
dt %>%
  pivot_wider(
    id_cols = c(site, date),
    names_from = c(parameter, depth),
    values_from = value,
    names_sep = "_"
  ) %>%
  # Depth-to-depth differences, parameter by parameter.
  # What I would like is not to have to write each pair out by hand.
  mutate(
    temperature_1_2 = temperature_1 - temperature_2,
    temperature_1_3 = temperature_1 - temperature_3,
    temperature_2_3 = temperature_2 - temperature_3,
    oxygen_1_2 = oxygen_1 - oxygen_2,
    oxygen_1_3 = oxygen_1 - oxygen_3,
    oxygen_2_3 = oxygen_2 - oxygen_3
  )
library(tidyverse)
library(rlang)
# Build one sample profile: temperature and oxygen at depths 1 to 3, with the
# three random values of each parameter sorted from largest to smallest.
#
# t: numeric offset(s) added to Sys.Date(); may be a vector.
# Returns a data.frame with columns parameter, date, depth and value. When
# `t` has several elements, data.frame() recycles the six
# parameter/depth/value entries against the longer `date` column.
create.dt <- function(t = 0) {
  data.frame(
    parameter = rep(c("temperature", "oxygen"), each = 3),
    date = rep(Sys.Date() + t, each = 6),
    depth = rep(1:3, times = 2),
    # sort() orders the draws directly; no temporary data frame is needed
    value = c(
      sort(rnorm(3, 16, 2), decreasing = TRUE),
      sort(rnorm(3, 7, 1), decreasing = TRUE)
    )
  )
}
# Multi-site dataset
# One create.dt() profile per site (A to E), each covering the offsets -3:0,
# stacked into a single data frame with a leading `site` column.
dt <- do.call(
  rbind,
  lapply(
    c("A", "B", "C", "D", "E"),
    function(site_id) cbind(site = site_id, create.dt(t = -3:0))
  )
)
# result
# Names of the wide columns, one per depth, for each parameter.
temperature <- str_c("temperature_", 1:3)
oxygen <- str_c("oxygen_", 1:3)

# Every pair of depths within a parameter, written as a subtraction string
# such as "temperature_1 - temperature_2".
temperature_frml <- combn(
  temperature,
  m = 2,
  FUN = function(pair) str_c(pair, collapse = " - ")
)
oxygen_frml <- combn(
  oxygen,
  m = 2,
  FUN = function(pair) str_c(pair, collapse = " - ")
)
all_frml <- c(temperature_frml, oxygen_frml)

# One row per site and date, one column per parameter/depth combination.
df_wider <- dt %>%
  pivot_wider(
    id_cols = c(site, date),
    names_from = c(parameter, depth),
    values_from = value,
    names_sep = "_"
  )

# For each subtraction string, parse it and evaluate it against df_wider,
# producing a one-column table named after the string; the resulting columns
# are appended to df_wider.
bind_cols(
  df_wider,
  map_dfc(
    .x = all_frml,
    .f = function(frml) {
      transmute(.data = df_wider, !!frml := eval(parse_expr(frml)))
    }
  )
)
#> # A tibble: 20 x 14
#> site date temperature_1 temperature_2 temperature_3 oxygen_1 oxygen_2
#> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2021-12-11 17.6 17.1 12.9 7.34 6.86
#> 2 A 2021-12-12 17.6 17.1 12.9 7.34 6.86
#> 3 A 2021-12-13 17.6 17.1 12.9 7.34 6.86
#> 4 A 2021-12-14 17.6 17.1 12.9 7.34 6.86
#> 5 B 2021-12-11 17.1 15.6 13.7 8.52 7.58
#> 6 B 2021-12-12 17.1 15.6 13.7 8.52 7.58
#> 7 B 2021-12-13 17.1 15.6 13.7 8.52 7.58
#> 8 B 2021-12-14 17.1 15.6 13.7 8.52 7.58
#> 9 C 2021-12-11 17.7 15.5 13.6 7.66 7.31
#> 10 C 2021-12-12 17.7 15.5 13.6 7.66 7.31
#> 11 C 2021-12-13 17.7 15.5 13.6 7.66 7.31
#> 12 C 2021-12-14 17.7 15.5 13.6 7.66 7.31
#> 13 D 2021-12-11 16.5 16.4 14.5 7.50 7.27
#> 14 D 2021-12-12 16.5 16.4 14.5 7.50 7.27
#> 15 D 2021-12-13 16.5 16.4 14.5 7.50 7.27
#> 16 D 2021-12-14 16.5 16.4 14.5 7.50 7.27
#> 17 E 2021-12-11 16.7 16.1 15.7 7.52 7.51
#> 18 E 2021-12-12 16.7 16.1 15.7 7.52 7.51
#> 19 E 2021-12-13 16.7 16.1 15.7 7.52 7.51
#> 20 E 2021-12-14 16.7 16.1 15.7 7.52 7.51
#> # ... with 7 more variables: oxygen_3 <dbl>,
#> # temperature_1 - temperature_2 <dbl>, temperature_1 - temperature_3 <dbl>,
#> # temperature_2 - temperature_3 <dbl>, oxygen_1 - oxygen_2 <dbl>,
#> # oxygen_1 - oxygen_3 <dbl>, oxygen_2 - oxygen_3 <dbl>
Created on 2021-12-14 by the reprex package (v2.0.1)
I built a prediction model using logistic regression, and it works well. But when I look at the estimates from the final fit, I can see that the variable I used to stratify the split shows up as a predictor, even though I want it excluded from the model. update_role() doesn't seem to do that...
# 75/25 split of `mldata`, stratified on `strata_var`.
data_split <- initial_split(mldata, prop = 3 / 4, strata = strata_var)

# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)

# Build model
# `ids` and `strata_var` are given non-predictor roles before the steps run.
mldata_recipe <- recipe(vital ~ ., data = train_data) %>%
  update_role(ids, new_role = "ID") %>%
  update_role(strata_var, new_role = "strata") %>%
  step_zv(all_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_smote(vital)

set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)

# Logistic regression via glmnet with both penalty and mixture left to tune.
glmnet_spec <- logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

glmnet_workflow <- workflow() %>%
  add_recipe(mldata_recipe) %>%
  add_model(glmnet_spec)

# Every combination of 20 penalty values and 7 mixture values.
glmnet_grid <- tidyr::crossing(
  penalty = 10^seq(-6, -1, length.out = 20),
  mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1)
)

set.seed(789)
glmnet_tune <- tune_grid(
  glmnet_workflow,
  resamples = mldata_folds,
  grid = glmnet_grid
)

# Plug the parameters selected on roc_auc into the workflow.
final_glmnet <- finalize_workflow(
  glmnet_workflow,
  select_best(glmnet_tune, "roc_auc")
)

# Resampled performance of the finalized workflow, keeping the predictions.
glmnet_results <- fit_resamples(
  final_glmnet,
  resamples = mldata_folds,
  metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
  control = control_resamples(save_pred = TRUE)
)

set.seed(789)
final_fit <- last_fit(final_glmnet, data_split)

# Plot the non-zero coefficients (intercept excluded) of the final workflow.
final_fit %>%
  pull(.workflow) %>%
  pluck(1) %>%
  tidy() %>%
  filter(term != "(Intercept)") %>%
  arrange(desc(abs(estimate))) %>%
  filter(abs(estimate) > 0) %>%
  ggplot(
    aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0)
  ) +
  geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
  geom_point() +
  scale_color_discrete(
    name = "Variable Effect \non outcome",
    labels = c("Deleterious", "Beneficial")
  ) +
  theme_minimal() +
  ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
In the last plot I can see the strata variable coming up.
You got this result because of the combination of role selection functions you used in step_dummy(). (full reprex at the end of post)
You used the following selection, which picks every nominal variable except the outcomes. It therefore also selected the strata variable, because that variable is nominal and is not an outcome.
all_nominal(), -all_outcomes()
A better option would be to use all_nominal_predictors() which won't select id/strata variables.
library(tidymodels)
data("penguins")

# `island` is given the "strata" role, but the selector below (all nominal
# columns minus the outcomes) still matches it, so it gets dummy-encoded.
rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
  update_role(island, new_role = "strata") %>%
  step_dummy(all_nominal(), -all_outcomes())

# Estimate the recipe and return the processed training data.
bake(prep(rec_spec1), new_data = NULL)
#> # A tibble: 344 × 4
#> body_mass_g species island_Dream island_Torgersen
#> <int> <fct> <dbl> <dbl>
#> 1 3750 Adelie 0 1
#> 2 3800 Adelie 0 1
#> 3 3250 Adelie 0 1
#> 4 NA Adelie 0 1
#> 5 3450 Adelie 0 1
#> 6 3650 Adelie 0 1
#> 7 3625 Adelie 0 1
#> 8 4675 Adelie 0 1
#> 9 3475 Adelie 0 1
#> 10 4250 Adelie 0 1
#> # … with 334 more rows
# Same recipe, but the dummy step only targets nominal columns that have the
# predictor role, so `island` (role "strata") is left as it is.
rec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
  update_role(island, new_role = "strata") %>%
  step_dummy(all_nominal_predictors())

# Estimate the recipe and return the processed training data.
bake(prep(rec_spec2), new_data = NULL)
#> # A tibble: 344 × 3
#> island body_mass_g species
#> <fct> <int> <fct>
#> 1 Torgersen 3750 Adelie
#> 2 Torgersen 3800 Adelie
#> 3 Torgersen 3250 Adelie
#> 4 Torgersen NA Adelie
#> 5 Torgersen 3450 Adelie
#> 6 Torgersen 3650 Adelie
#> 7 Torgersen 3625 Adelie
#> 8 Torgersen 4675 Adelie
#> 9 Torgersen 3475 Adelie
#> 10 Torgersen 4250 Adelie
#> # … with 334 more rows
Full reprex
library(tidymodels)
library(themis)
library(forcats)
data("penguins")

# Add a row id, turn the outcome into a two-level factor (Adelie or not) and
# drop rows with missing values.
penguins0 <- penguins %>%
  mutate(
    ids = row_number(),
    species = factor(species == "Adelie")
  ) %>%
  drop_na()

# 75/25 split stratified on `island`.
data_split <- initial_split(penguins0, prop = 3 / 4, strata = island)

# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)

# Build model
# `ids` and `island` get non-predictor roles, and every selector below is
# restricted to predictors, so neither column is transformed by the recipe.
mldata_recipe <- recipe(species ~ ., data = train_data) %>%
  update_role(ids, new_role = "ID") %>%
  update_role(island, new_role = "strata") %>%
  step_zv(all_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(species)

set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = island)

# Logistic regression via glmnet with both penalty and mixture left to tune.
glmnet_spec <- logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

glmnet_workflow <- workflow() %>%
  add_recipe(mldata_recipe) %>%
  add_model(glmnet_spec)

# Every combination of 20 penalty values and 7 mixture values.
glmnet_grid <- tidyr::crossing(
  penalty = 10^seq(-6, -1, length.out = 20),
  mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1)
)

set.seed(789)
glmnet_tune <- tune_grid(
  glmnet_workflow,
  resamples = mldata_folds,
  grid = glmnet_grid
)

# Plug the parameters selected on roc_auc into the workflow. `metric` is
# passed by name rather than by position.
final_glmnet <- glmnet_workflow %>%
  finalize_workflow(select_best(glmnet_tune, metric = "roc_auc"))

# Resampled performance of the finalized workflow, keeping the predictions.
glmnet_results <- final_glmnet %>%
  fit_resamples(
    resamples = mldata_folds,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = control_resamples(save_pred = TRUE)
  )

set.seed(789)
final_fit <- final_glmnet %>%
  last_fit(data_split)

# Plot the non-zero coefficients (intercept excluded) of the final workflow.
final_fit %>%
  pull(.workflow) %>%
  pluck(1) %>%
  tidy() %>%
  filter(term != "(Intercept)") %>%
  arrange(desc(abs(estimate))) %>%
  filter(abs(estimate) > 0) %>%
  ggplot(
    aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0)
  ) +
  geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
  geom_point() +
  scale_color_discrete(
    name = "Variable Effect \non outcome",
    labels = c("Deleterious", "Beneficial")
  ) +
  theme_minimal() +
  ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
Created on 2021-08-20 by the reprex package (v2.0.1)
I would like to group data into seasons such that Winter is Dec - Feb, Spring is Mar - May, Summer is Jun - Aug, and Fall is Sep - Nov. I would then like to draw boxplots of the Winter and Spring seasonal data, comparing A to B and then A to C. Here is my laborious code so far. I would appreciate an efficient way of grouping and plotting the data.
library(tidyverse)
library(reshape2)
# Daily dates for 2011-2040 and three columns of uniform random values.
Dates30s <- data.frame(
  date = seq(as.Date("2011-01-01"), to = as.Date("2040-12-31"), by = "day")
)
n_days <- nrow(Dates30s) # 10958 for this date range
FakeData <- data.frame(
  A = runif(n_days, min = 0.5, max = 1.5),
  B = runif(n_days, min = 1.6, max = 2),
  C = runif(n_days, min = 0.8, max = 1.8)
)

# Split the date into Year / Month / Day text columns and make the first two
# numeric so they can be filtered and shifted below.
myData <- data.frame(Dates30s, FakeData)
myData <- separate(myData, date, sep = "-", into = c("Year", "Month", "Day"))
myData$Year <- as.numeric(myData$Year)
myData$Month <- as.numeric(myData$Month)

# Monthly means of A, B and C. Day is dropped before summarising: it is a
# character column, so averaging it would only produce NA and a warning.
SeasonalData <- myData %>%
  select(Year, Month, A, B, C) %>%
  group_by(Year, Month) %>%
  summarise_all(mean)

Spring <- SeasonalData %>% filter(Month %in% 3:5)

# December is moved into the following year so that Dec-Jan-Feb share a Year.
Winter1 <- SeasonalData %>% filter(Month == 12)
Winter1$Year <- Winter1$Year + 1
Winter2 <- SeasonalData %>% filter(Month %in% 1:2)

# Winter mean per Year: the average of the monthly means, restricted to
# 2012-2040.
Winter <- bind_rows(Winter1, Winter2) %>%
  filter(Year >= 2012 & Year <= 2040) %>%
  group_by(Year) %>%
  summarise_all(mean) %>%
  select(-"Month")

# Long format: one row per Year and variable.
BoxData <- gather(Winter, key = "Variable", value = "value", -Year)

ggplot(BoxData, aes(x = Variable, y = value, fill = factor(Variable))) +
  geom_boxplot() +
  labs(title = "Winter") +
  facet_wrap(~Variable)
I would like to have two figures: Figure 1, split into two panels, one for the Winter season and one for the Summer season (see BoxPlot 1); and Figure 2, showing the monthly averages across the entire time period (2011-2040) (see Boxplot 2).
This is how I usually do it. All calculations and plots are based on the water year (WY), or hydrologic year, which runs from October to September.
library(tidyverse)
library(lubridate)
set.seed(123)

# Daily dates for 2011-2040 and three columns of uniform random values.
Dates30s <- data.frame(
  date = seq(as.Date("2011-01-01"), to = as.Date("2040-12-31"), by = "day")
)
FakeData <- data.frame(
  A = runif(10958, min = 0.3, max = 1.5),
  B = runif(10958, min = 1.2, max = 2),
  C = runif(10958, min = 0.6, max = 1.8)
)

### Calculate Year, Month then Water year (WY) and Season
myData <- data.frame(Dates30s, FakeData) %>%
  mutate(
    Year = year(date),
    MonthNr = month(date),
    Month = month(date, label = TRUE, abbr = TRUE),
    # months after September are counted in the next water year
    WY = if_else(MonthNr > 9, Year + 1, Year),
    Season = case_when(
      MonthNr %in% 9:11 ~ "Fall",
      MonthNr %in% c(12, 1, 2) ~ "Winter",
      MonthNr %in% 3:5 ~ "Spring",
      TRUE ~ "Summer"
    )
  ) %>%
  select(-c(date, MonthNr, Year)) %>%
  as_tibble()

myData
#> # A tibble: 10,958 x 6
#> A B C Month WY Season
#> <dbl> <dbl> <dbl> <ord> <dbl> <chr>
#> 1 0.645 1.37 1.51 Jan 2011 Winter
#> 2 1.25 1.79 1.71 Jan 2011 Winter
#> 3 0.791 1.35 1.68 Jan 2011 Winter
#> 4 1.36 1.97 0.646 Jan 2011 Winter
#> 5 1.43 1.31 1.60 Jan 2011 Winter
#> 6 0.355 1.52 0.708 Jan 2011 Winter
#> 7 0.934 1.94 0.825 Jan 2011 Winter
#> 8 1.37 1.89 1.03 Jan 2011 Winter
#> 9 0.962 1.75 0.632 Jan 2011 Winter
#> 10 0.848 1.94 0.883 Jan 2011 Winter
#> # ... with 10,948 more rows
Calculate seasonal and monthly average by WY
### Seasonal Avg by WY
# Mean of A, B and C for every water year and season, reshaped to long
# format with the source column in `State` and its mean in `MFI`.
SeasonalAvg <- myData %>%
  select(-Month) %>%
  group_by(WY, Season) %>%
  summarise_all(mean, na.rm = TRUE) %>%
  ungroup() %>%
  gather(key = "State", value = "MFI", -c(WY, Season))

SeasonalAvg
#> # A tibble: 366 x 4
#> WY Season State MFI
#> <dbl> <chr> <chr> <dbl>
#> 1 2011 Fall A 0.939
#> 2 2011 Spring A 0.907
#> 3 2011 Summer A 0.896
#> 4 2011 Winter A 0.909
#> 5 2012 Fall A 0.895
#> 6 2012 Spring A 0.865
#> 7 2012 Summer A 0.933
#> 8 2012 Winter A 0.895
#> 9 2013 Fall A 0.879
#> 10 2013 Spring A 0.872
#> # ... with 356 more rows
### Monthly Avg by WY
# Mean of A, B and C for every water year and month, reshaped to long format
# with the source column in `State` and its mean in `MFI`.
MonthlyAvg <- myData %>%
  select(-Season) %>%
  group_by(WY, Month) %>%
  summarise_all(mean, na.rm = TRUE) %>%
  ungroup() %>%
  gather(key = "State", value = "MFI", -c(WY, Month)) %>%
  mutate(Month = factor(Month))

MonthlyAvg
#> # A tibble: 1,080 x 4
#> WY Month State MFI
#> <dbl> <ord> <chr> <dbl>
#> 1 2011 Jan A 1.00
#> 2 2011 Feb A 0.807
#> 3 2011 Mar A 0.910
#> 4 2011 Apr A 0.923
#> 5 2011 May A 0.888
#> 6 2011 Jun A 0.876
#> 7 2011 Jul A 0.909
#> 8 2011 Aug A 0.903
#> 9 2011 Sep A 0.939
#> 10 2012 Jan A 0.903
#> # ... with 1,070 more rows
Plot seasonal and monthly data
### Seasonal plot
# Dodged boxplots per State with the individual points jittered alongside.
s1 <- ggplot(
  data = SeasonalAvg,
  mapping = aes(x = Season, y = MFI, color = State)
) +
  geom_boxplot(position = position_dodge(width = 0.7)) +
  geom_point(position = position_jitterdodge(seed = 123))
s1

### Monthly plot
m1 <- ggplot(
  data = MonthlyAvg,
  mapping = aes(x = Month, y = MFI, color = State)
) +
  geom_boxplot(position = position_dodge(width = 0.7)) +
  geom_point(position = position_jitterdodge(seed = 123))
m1
Bonus
### https://stackoverflow.com/a/58369424/786542
# if (!require(devtools)) {
# install.packages('devtools')
# }
# devtools::install_github('erocoar/gghalves')
library(gghalves)
# Half boxplot with a half violin on the right-hand side, legend at the bottom.
s2 <- ggplot(
  data = SeasonalAvg,
  mapping = aes(x = Season, y = MFI, color = State)
) +
  geom_half_boxplot(nudge = 0.05) +
  geom_half_violin(
    aes(fill = State),
    side = "r",
    nudge = 0.01
  ) +
  theme_light() +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 1))
s2

# Half boxplot (outliers hidden) combined with a dodged dot plot.
s3 <- ggplot(
  data = SeasonalAvg,
  mapping = aes(x = Season, y = MFI, color = State)
) +
  geom_half_boxplot(nudge = 0.05, outlier.color = NA) +
  geom_dotplot(
    aes(fill = State),
    binaxis = "y",
    method = "histodot",
    dotsize = 0.35,
    stackdir = "up",
    position = PositionDodge
  ) +
  theme_light() +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(nrow = 1))
s3
#> `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Created on 2019-10-16 by the reprex package (v0.3.0)
Let's say I have a dataframe containing the sales for some quarters, while the values for the following quarters are missing. I would like to replace the NAs by a simple formula (with mutate/dplyr like below). The issue is that I don't want to use mutate so many times. How could I do that for all NAs at the same time? Is there a way?
# Example data: 20 periods labelled 1999Q1 to 2003Q4. Sales is known for the
# first 8 periods and missing for the remaining 12. The data frame is
# assigned to `test`, the name used by the code that follows.
test <- data.frame(
  Period = paste0(rep(1999:2003, each = 4), "Q", 1:4),
  Sales = c(
    353.2925571, 425.9299841, 357.5204626, 363.80247,
    302.8081066, 394.328576, 435.15573, 387.99768,
    rep(NA, 12)
  ),
  stringsAsFactors = FALSE
)
# Each pass fills the missing Sales whose value four rows earlier is known,
# using 1.05 times that value; three passes are chained here.
test %>%
  mutate(
    Sales = ifelse(is.na(Sales), 1.05 * lag(Sales, n = 4), Sales)
  ) %>%
  mutate(
    Sales = ifelse(is.na(Sales), 1.05 * lag(Sales, n = 4), Sales)
  ) %>%
  mutate(
    Sales = ifelse(is.na(Sales), 1.05 * lag(Sales, n = 4), Sales)
  )
One dplyr and tidyr possibility could be:
# Group rows by the quarter label (characters 5-6 of Period).
df %>%
  group_by(quarter = substr(Period, 5, 6)) %>%
  # Within a quarter, replace each missing Sales by the last observed value.
  mutate(Sales_temp = replace_na(Sales, last(na.omit(Sales)))) %>%
  # Split each quarter into observed and missing rows so that the cumulative
  # 1.05 factor restarts at the first missing row.
  group_by(quarter, na = is.na(Sales)) %>%
  mutate(
    Sales_temp = Sales_temp * cumprod(rep(1.05, n())),
    Sales = coalesce(Sales, Sales_temp)
  ) %>%
  ungroup() %>%
  # keep only the first two columns
  select(1:2)
Period Sales
<chr> <dbl>
1 1999Q1 353.
2 1999Q2 426.
3 1999Q3 358.
4 1999Q4 364.
5 2000Q1 303.
6 2000Q2 394.
7 2000Q3 435.
8 2000Q4 388.
9 2001Q1 318.
10 2001Q2 414.
11 2001Q3 457.
12 2001Q4 407.
13 2002Q1 334.
14 2002Q2 435.
15 2002Q3 480.
16 2002Q4 428.
17 2003Q1 351.
18 2003Q2 456.
19 2003Q3 504.
20 2003Q4 449.
Or with just dplyr:
# Group rows by the quarter label (characters 5-6 of Period).
df %>%
  group_by(quarter = substr(Period, 5, 6)) %>%
  # Within a quarter, replace each missing Sales by the last observed value.
  mutate(
    Sales_temp = if_else(is.na(Sales), last(na.omit(Sales)), Sales)
  ) %>%
  # Split each quarter into observed and missing rows so that the cumulative
  # 1.05 factor restarts at the first missing row.
  group_by(quarter, na = is.na(Sales)) %>%
  mutate(
    Sales_temp = Sales_temp * cumprod(rep(1.05, n())),
    Sales = coalesce(Sales, Sales_temp)
  ) %>%
  ungroup() %>%
  # keep only the first two columns
  select(1:2)
x <- test$Sales
# position of the last non-NA value
last.valid <- tail(which(!is.na(x)), 1)
# Indices of the "base" values: start from the block of four rows that
# contains last.valid, and move any index lying past last.valid back by four
# rows so it points at an observed value.
base_idx <- ceiling(last.valid / 4) * 4 + (-3:0)
base_idx <- base_idx + ifelse(base_idx > last.valid, -4, 0)
bases <- x[base_idx]
# calculate the "exponents": how many blocks of four rows each position lies
# beyond last.valid (zero or negative for rows up to last.valid)
expos <- ceiling((seq_along(x) - last.valid) / 4)
# `bases` has length 4 and is recycled along the rows, so row i is paired
# with the base in the same position of its block of four.
test$Sales <- ifelse(is.na(x), bases * 1.05^expos, x)
tail(test)
# Period Sales
# 15 2002Q3 479.7592
# 16 2002Q4 427.7674
# 17 2003Q1 350.5382
# 18 2003Q2 456.4846
# 19 2003Q3 503.7472
# 20 2003Q4 449.1558
Here's another base solution:
# Observed values, and how many values were dropped as NA.
# NOTE(review): this assumes every NA sits at the end of the column, because
# the filled values are appended after the observed ones.
non_nas <- na.omit(test$Sales)
nas <- length(attr(non_nas, "na.action"))
# Exponent for each missing row: 1 for the first four, 2 for the next four,
# and so on. seq_len() also covers counts that are zero or not a multiple
# of 4.
growth_steps <- ceiling(seq_len(nas) / 4)
test$Sales <- c(
  non_nas, # keep non_nas
  rep_len(tail(non_nas, 4), nas) * 1.05^growth_steps
)
test