Removing strata variable from predictions results - r

I built a prediction model using logistic regression which works well. But when I analyze the estimates calculated on the test dataset, I can see the variable I used to stratify the split comes up when I want it to be excluded of the model as a predictor. update_role() doesn't do that...
data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(vital ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(strata_var, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_smote(vital)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05,
0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
In the last plot I can see the strata variable coming up.

You got this result because of the combination of role selection functions you used in step_dummy(). (full reprex at the end of post)
You used the following selections. Which selects all nominal, but not any outcomes. This selected the strata variables because it is both a nominal variable and not an outcome.
all_nominal(), -all_outcomes()
A better option would be to use all_nominal_predictors() which won't select id/strata variables.
library(tidymodels)
data("penguins")
rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal(), -all_outcomes())
rec_spec1 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 4
#> body_mass_g species island_Dream island_Torgersen
#> <int> <fct> <dbl> <dbl>
#> 1 3750 Adelie 0 1
#> 2 3800 Adelie 0 1
#> 3 3250 Adelie 0 1
#> 4 NA Adelie 0 1
#> 5 3450 Adelie 0 1
#> 6 3650 Adelie 0 1
#> 7 3625 Adelie 0 1
#> 8 4675 Adelie 0 1
#> 9 3475 Adelie 0 1
#> 10 4250 Adelie 0 1
#> # … with 334 more rows
rec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal_predictors())
rec_spec2 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 3
#> island body_mass_g species
#> <fct> <int> <fct>
#> 1 Torgersen 3750 Adelie
#> 2 Torgersen 3800 Adelie
#> 3 Torgersen 3250 Adelie
#> 4 Torgersen NA Adelie
#> 5 Torgersen 3450 Adelie
#> 6 Torgersen 3650 Adelie
#> 7 Torgersen 3625 Adelie
#> 8 Torgersen 4675 Adelie
#> 9 Torgersen 3475 Adelie
#> 10 Torgersen 4250 Adelie
#> # … with 334 more rows
Full reprex
library(tidymodels)
library(themis)
library(forcats)
data("penguins")
penguins0 <- penguins %>%
mutate(ids = row_number(),
species = factor(species == "Adelie")) %>%
drop_na()
data_split <- initial_split(penguins0, prop = 3/4, strata = island)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(species ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(island, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(species)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = island)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20),
mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
Created on 2021-08-20 by the reprex package (v2.0.1)

Related

fable ARIMA with bootstrap giving all NAs

I have fitted an ARIMA model using the fable R package. When I go to use the model to forecast the distribution using bootstrap resampled errors it returns all NAs.
ARIMA_model <- targets %>%
as_tsibble(key = 'key', index = 'time') %>%
model(ARIMA(y ~ x))
ARIMA_fable <- ARIMA_model %>%
generate(new_data = scenarios, bootstrap = TRUE, times = 100)
I can get it to run using forecast() but I want to see each ensemble member and the errors are not expected to be normally distributed.
ARIMA_fable <- ARIMA_model %>% forecast(new_data = scenarios, bootstrap = FALSE)
Here is a reproducible example:
key <- c('A', 'B', 'C')
time <- seq(start, by = 1, length.out = 15)
set.seed(123)
targets <- expand.grid(time = time, key = key) %>%
mutate(x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30)))
ARIMA_model <- targets %>%
as_tsibble(key = 'key', index = 'time') %>%
model(ARIMA(y ~ x))
test_scenarios <- targets %>%
mutate(time = time + lubridate::days(16),
x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30))) %>%
as_tsibble(key = 'key', index = 'time')
ARIMA_model %>%
forecast(new_data = test_scenarios, bootstrap = T)
I still can't reproduce the NA forecasts that you get, however I have identified and fixed an issue in the code that could be causing issues here (https://github.com/tidyverts/fable/commit/685cc9ec7846a990d7c664f8eb24e4ad75e1673a).
This updated version can be installed with remotes::install_github("tidyverts/fable"), and with this version I am able to run the example you provided without issue:
library(fable)
#> Loading required package: fabletools
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
key <- c('A', 'B', 'C')
time <- seq(Sys.Date(), by = 1, length.out = 15)
set.seed(123)
targets <- expand.grid(time = time, key = key) %>%
mutate(x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30)))
ARIMA_model <- targets %>%
as_tsibble(key = 'key', index = 'time') %>%
model(ARIMA(y ~ x))
test_scenarios <- targets %>%
mutate(time = time + lubridate::days(16),
x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30))) %>%
as_tsibble(key = 'key', index = 'time')
ARIMA_model %>%
forecast(new_data = test_scenarios, bootstrap = T)
#> # A fable: 45 x 6 [1D]
#> # Key: key, .model [3]
#> key .model time y .mean x
#> <fct> <chr> <date> <dist> <dbl> <dbl>
#> 1 A ARIMA(y ~ x) 2022-10-02 sample[5000] 10.2 1.82
#> 2 A ARIMA(y ~ x) 2022-10-03 sample[5000] 11.1 2.73
#> 3 A ARIMA(y ~ x) 2022-10-04 sample[5000] 11.6 2.81
#> 4 A ARIMA(y ~ x) 2022-10-05 sample[5000] 11.9 3.92
#> 5 A ARIMA(y ~ x) 2022-10-06 sample[5000] 12.3 4.26
#> 6 A ARIMA(y ~ x) 2022-10-07 sample[5000] 13.0 4.27
#> 7 A ARIMA(y ~ x) 2022-10-08 sample[5000] 13.3 4.41
#> 8 A ARIMA(y ~ x) 2022-10-09 sample[5000] 14.3 4.63
#> 9 A ARIMA(y ~ x) 2022-10-10 sample[5000] 15.2 5.63
#> 10 A ARIMA(y ~ x) 2022-10-11 sample[5000] 15.6 6.59
#> # … with 35 more rows
Created on 2022-09-16 by the reprex package (v2.0.1)

summarise with mean, median, range and quants in R

I am currently working with the palmer penguins data set in R and want to summarise data that combines means, median, range and quants, grouping by sex.
My current solution has the quant data split from the summary data. Is there a way to do this in one go. If not how do I combine the data sets. The group quant is currently in long format, and I am not sure how to combine them.
group_summary <- penguins %>% group_by(sex) %>% summarize(mean = mean(bill_length_mm,
na.rm = TRUE), meadian = median(bill_length_mm, na.rm = TRUE), range =
max(bill_length_mm, na.rm = TRUE) - min(bill_length_mm, na.rm = TRUE))
group_quant <- penguins %>% group_by(sex) %>% summarize(quantile(bill_length_mm,
probs =seq(.1, 1, by = .1), na.rm =TRUE, .groups = 'drop'))
I had the following solution but it drops the NA values from Sex and I am not sure why.
group_summary <- do.call(data.frame,aggregate(bill_length_mm ~ sex, penguins,
function(x) c(mean = mean(x, na.rm = TRUE), median = median(x, na.rm = TRUE), range =
max(x, na.rm = TRUE) - min(x, na.rm = TRUE), quantile(x, probs = seq(.1, 1, by = .1),
na.rm = TRUE, .groups = 'drop'))))
You may save the quantiles in a list and then use unnest_wider to create new columns from them. To calculate range I used diff(range(...)) instead of max(...) - min(...). Both of them are fine but I included it to show an alternative.
library(palmerpenguins)
library(dplyr)
library(tidyr)
penguins %>%
group_by(sex) %>%
summarize(mean = mean(bill_length_mm, na.rm = TRUE),
median = median(bill_length_mm, na.rm = TRUE),
range = diff(range(bill_length_mm, na.rm = TRUE)),
quantile = list(quantile(bill_length_mm, probs = seq(.1, 1, by = .1), na.rm = TRUE))) %>%
unnest_wider(quantile)
# sex mean median range `10%` `20%` `30%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 female 42.1 42.8 25.9 35.8 36.7 38.2 40 42.8 45.1 45.7 46.5 47.5 58
#2 male 45.9 46.8 25 38.8 40.5 41.3 43.2 46.8 49.0 50.0 50.8 51.9 59.6
#3 NA 41.3 42 13.2 36.8 37.7 37.8 38.6 42 44 44.5 45.2 46.4 47.3

Correlation Coefficients between population and suicide in Rstudio

Percentage Change
selectedCountry <- dataset %>%
filter(country == 'Japan') %>%
select(population, suicides_no, year) %>%
group_by(year) %>%
summarise(s_count = sum(suicides_no), p_count = sum(population))
year s_count p_count
<dbl> <dbl> <dbl>
1 1979 20711 107268500
2 1980 20416 108473500
3 1981 19976 109674700
4 1982 20535 110722900
5 1983 24853 111070000
6 1984 24221 111950000
I want to find the correlation between population and suicide after I aggregate population and suicide like this
percentage <- selectedCountry %>%
arrange(year) %>%
mutate(pct.chg.s = 100 * (s_count - lag(s_count,default=first(s_count))) / lag(s_count,default=first(s_count))) %>%
mutate(pct.chg.p = 100 * (p_count - lag(p_count,default=first(p_count))) / lag(p_count,default=first(p_count))) %>%
mutate(correlation = cor(pct.chg.s, pct.chg.p))
head(percentage)
I end up with a result
year s_count p_count pct.chg.s pct.chg.p correlation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1979 20711 107268500 0 0 0.00789
2 1980 20416 108473500 -1.42 1.12 0.00789
3 1981 19976 109674700 -2.16 1.11 0.00789
4 1982 20535 110722900 2.80 0.956 0.00789
5 1983 24853 111070000 21.0 0.313 0.00789
6 1984 24221 111950000 -2.54 0.792 0.00789
to plot the corr between two variables using two way (I can't sure if this true or false)
ggscatter(percentage, x = "pct.chg.s", y = "pct.chg.p",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Percentage Change in Suicide",
ylab = "Percentage Change in Population",
color = "blue", shape = 19,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
ellipse = TRUE, mean.point = TRUE,
star.plot = TRUE)
ggplot(percentage, aes(x = pct.chg.p, y = pct.chg.s)) +
geom_point() +
geom_smooth(method = "lm",formula = y ~ x) +
labs(title = "Correlation between Pubulation and Suicides",
x = "Percentage Change in Suicide",
y = "Percentage Change in Population")

R tidyeval passing a list containing multiple character vectors to dplyr functions

In this problem I want to map a function over multiple data inputs to create output showing frequencies of item responses.
First I create two data sets using the psych package:
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(psych))
set.seed(123)
data_input_sim <-
as_tibble(sim.poly.ideal(nvar = 50, n = 1000, cat = 4, )[["items"]]) %>%
mutate_all(
~ case_when(
.x == 0 ~ "never",
.x == 1 ~ "occasionally",
.x == 2 ~ "frequently",
.x == 3 ~ "always"
)
) %>%
rename_all( ~ str_c("i", str_pad(
as.character(1:50), 2, side = "left", pad = "0"
))) %>%
mutate(
ID = 100001:101000,
age = sample(c(5:12), 1000, replace = TRUE),
age_range = case_when(
age <=8 ~ "5 to 8 yo",
T ~ "9 to 12 yo"
),
gender = sample(
c("female", "male"),
1000,
replace = TRUE,
prob = c(0.53, 0.47)
),
educ = sample(
c("no_HS", "HS_grad", "some_college", "BA_plus"),
1000,
replace = TRUE,
prob = c(0.119, 0.263, 0.306, 0.311)
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
select(ID:clin_status, i01:i50)
data_input_bfi <- bfi %>%
drop_na() %>%
sample_n(1000) %>%
mutate(
ID = 200001:201000,
age_range = case_when(
age <= 18 ~ "18 yo or younger",
between(age, 19, 24) ~ "19 to 24 yo",
between(age, 25, 39) ~ "25 to 39 yo",
T ~ "40 yo or older"
),
gender = case_when(gender == 1 ~ "male",
gender == 2 ~ "female"),
educ = case_when(
education == 1 ~ "no_HS",
education == 2 ~ "HS_grad",
education == 3 ~ "some_college",
T ~ "BA_plus"
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
mutate_at(
vars(A1:O5),
~
case_when(
.x == 1 ~ "very_inaccurate",
.x == 2 ~ "moderately_inaccurate",
.x == 3 ~ "slightly_inaccurate",
.x == 4 ~ "slightly_accurate",
.x == 5 ~ "moderately_accurate",
.x == 6 ~ "very_accurate",
)
) %>%
select(ID, age:clin_status, A1:O5)
Then I extract and sequence elements unique to each data set: the suffix of its name, the names of its item columns, and the names of its item categories:
data_name_suffix <- c("sim", "bfi")
sim_item_cols <- str_c("i", str_pad(as.character(1:50), 2, side = "left", pad = "0"))
bfi_item_cols <- cross(list(c("A", "C", "E", "N", "O"), seq(1:5))) %>%
map_chr(str_c, collapse = "") %>%
sort()
sim_item_cats <- c("never", "occasionally","frequently", "always")
bfi_item_cats <- c("very_inaccurate", "moderately_inaccurate", "slightly_inaccurate",
"slightly_accurate", "moderately_accurate", "very_accurate")
data_name_suffix is a two-element character vector; I then create two-element lists (using quos()) to hold the item column and category names:
item_cols <- quos(sim_item_cols, bfi_item_cols)
item_cats <- quos(sim_item_cats, bfi_item_cats)
Now I attempt to map the output-creating function over the three inputs, using purrr::pmap():
pmap_df(
list(data_name_suffix,
item_cols,
item_cats),
~
eval(as.name(str_c("data_input_", data_name_suffix))) %>%
select(!!!item_cols) %>%
gather(var, value) %>%
group_by(var, value) %>%
count(var, value) %>%
ungroup() %>%
spread(value, n) %>%
arrange(match(var, !!!item_cols)) %>%
select(var, !!!item_cats) %>%
assign(str_c("freq_item_val_", data_name_suffix), ., envir = .GlobalEnv)
)
And it returns this error:
Error: Unknown columns `A1`, `A2`, `A3`, `A4`, `A5` and ...
Which suggests to me that R is seeing the list item_cols as a single long character vector, rather than two separate character vectors to iterate over.
And here we reach the limit of my understanding of and experience with tidyeval techniques. I suspect that I'm doing something wrong with quos() and !!!.
Thanks in advance for any help, and I hope whoever reads this is safe and healthy during this surreal time.
Here, we could use mget to get the values of the objects
library(stringr)
library(purrr)
library(dplyr)
library(tidyr)
list(mget(str_c('data_input_', data_name_suffix)),
item_cols,
item_cats) %>%
pmap(~ ..1 %>%
select(!!! ..2) %>%
pivot_longer(everything(), names_to = 'var', values_to = 'value') %>%
count(var, value) %>%
pivot_wider(names_from = value, values_from = n) %>%
arrange(match(var, !!!..2)) %>%
select(var, !!! ..3) )
#$data_input_sim
# A tibble: 50 x 5
# var never occasionally frequently always
# <chr> <int> <int> <int> <int>
# 1 i01 465 366 141 28
# 2 i02 489 336 147 28
# 3 i03 457 367 146 30
# 4 i04 433 385 162 20
# 5 i05 418 362 171 49
# 6 i06 420 369 169 42
# 7 i07 405 367 182 46
# 8 i08 361 401 194 44
# 9 i09 346 391 211 52
#10 i10 334 425 203 38
# … with 40 more rows
#$data_input_bfi
# A tibble: 25 x 7
# var very_inaccurate moderately_inaccurate slightly_inaccurate slightly_accurate moderately_accurate very_accurate
# <chr> <int> <int> <int> <int> #<int> <int>
# 1 A1 334 278 151 130 75 32
# 2 A2 18 49 48 197 365 323
# 3 A3 32 51 72 210 353 282
# 4 A4 48 69 60 159 243 421
# 5 A5 26 66 89 207 340 272
# 6 C1 17 48 82 213 383 257
# 7 C2 26 85 98 212 361 218
# 8 C3 35 80 102 272 322 189
# 9 C4 296 270 166 163 83 22
#10 C5 197 212 118 207 167 99
# … with 15 more rows
NOTE: assigning to create multiple objects is not recommended. Instead keep the output in a list and make changes in each of the list elements (if needed) by looping over it with map

Group data into multiple season and boxplot side by side using ggplot in R?

I would like to group data into multiple seasin such that my season are winter: Dec - Feb; Spring: Mar - May; Summer: Jun -Aug, and Fall: Sep - Nov. I would then like to boxplot the Winter and Spring seasonal data comparing A to B and then A to C. Here is my laborious code so far. I would appreciate an efficient way of data grouping and plotting.
library(tidyverse)
library(reshape2)
Dates30s = data.frame(seq(as.Date("2011-01-01"), to= as.Date("2040-12-31"),by="day"))
colnames(Dates30s) = "date"
FakeData = data.frame(A = runif(10958, min = 0.5, max = 1.5), B = runif(10958, min = 1.6, max = 2), C = runif(10958, min = 0.8, max = 1.8))
myData = data.frame(Dates30s, FakeData)
myData = separate(myData, date, sep = "-", into = c("Year", "Month", "Day"))
myData$Year = as.numeric(myData$Year)
myData$Month = as.numeric(myData$Month)
SeasonalData = myData %>% group_by(Year, Month) %>% summarise_all(funs(mean)) %>% select(Year, Month, A, B, C)
Spring = SeasonalData %>% filter(Month == 3 | Month == 4 |Month == 5)
Winter1 = SeasonalData %>% filter(Month == 12)
Winter1$Year = Winter1$Year+1
Winter2 = SeasonalData %>% filter(Month == 1 | Month == 2 )
Winter = rbind(Winter1, Winter2) %>% filter(Year >= 2012 & Year <= 2040) %>% group_by(Year) %>% summarise_all(funs(mean)) %>% select(-"Month")
BoxData = gather(Winter, key = "Variable", value = "value", -Year )
ggplot(BoxData, aes(x=Variable, y=value,fill=factor(Variable)))+
geom_boxplot() + labs(title="Winter") +facet_wrap(~Variable)
I would like to have Two figures: Figure 1 split in two; one for Winter season and one for Summer season (see BoxPlot 1) and one for Monthly annual average representing average monthly values across the entire time period (2011 -2040) see Boxplot 2
This is what I usually do it. All calculation and plotting are based on water year (WY) or hydrologic year from October to September.
library(tidyverse)
library(lubridate)
set.seed(123)
Dates30s <- data.frame(seq(as.Date("2011-01-01"), to = as.Date("2040-12-31"), by = "day"))
colnames(Dates30s) <- "date"
FakeData <- data.frame(A = runif(10958, min = 0.3, max = 1.5),
B = runif(10958, min = 1.2, max = 2),
C = runif(10958, min = 0.6, max = 1.8))
### Calculate Year, Month then Water year (WY) and Season
myData <- data.frame(Dates30s, FakeData) %>%
mutate(Year = year(date),
MonthNr = month(date),
Month = month(date, label = TRUE, abbr = TRUE)) %>%
mutate(WY = case_when(MonthNr > 9 ~ Year + 1,
TRUE ~ Year)) %>%
mutate(Season = case_when(MonthNr %in% 9:11 ~ "Fall",
MonthNr %in% c(12, 1, 2) ~ "Winter",
MonthNr %in% 3:5 ~ "Spring",
TRUE ~ "Summer")) %>%
select(-date, -MonthNr, -Year) %>%
as_tibble()
myData
#> # A tibble: 10,958 x 6
#> A B C Month WY Season
#> <dbl> <dbl> <dbl> <ord> <dbl> <chr>
#> 1 0.645 1.37 1.51 Jan 2011 Winter
#> 2 1.25 1.79 1.71 Jan 2011 Winter
#> 3 0.791 1.35 1.68 Jan 2011 Winter
#> 4 1.36 1.97 0.646 Jan 2011 Winter
#> 5 1.43 1.31 1.60 Jan 2011 Winter
#> 6 0.355 1.52 0.708 Jan 2011 Winter
#> 7 0.934 1.94 0.825 Jan 2011 Winter
#> 8 1.37 1.89 1.03 Jan 2011 Winter
#> 9 0.962 1.75 0.632 Jan 2011 Winter
#> 10 0.848 1.94 0.883 Jan 2011 Winter
#> # ... with 10,948 more rows
Calculate seasonal and monthly average by WY
### Seasonal Avg by WY
SeasonalAvg <- myData %>%
select(-Month) %>%
group_by(WY, Season) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Season)
SeasonalAvg
#> # A tibble: 366 x 4
#> WY Season State MFI
#> <dbl> <chr> <chr> <dbl>
#> 1 2011 Fall A 0.939
#> 2 2011 Spring A 0.907
#> 3 2011 Summer A 0.896
#> 4 2011 Winter A 0.909
#> 5 2012 Fall A 0.895
#> 6 2012 Spring A 0.865
#> 7 2012 Summer A 0.933
#> 8 2012 Winter A 0.895
#> 9 2013 Fall A 0.879
#> 10 2013 Spring A 0.872
#> # ... with 356 more rows
### Monthly Avg by WY
MonthlyAvg <- myData %>%
select(-Season) %>%
group_by(WY, Month) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Month) %>%
mutate(Month = factor(Month))
MonthlyAvg
#> # A tibble: 1,080 x 4
#> WY Month State MFI
#> <dbl> <ord> <chr> <dbl>
#> 1 2011 Jan A 1.00
#> 2 2011 Feb A 0.807
#> 3 2011 Mar A 0.910
#> 4 2011 Apr A 0.923
#> 5 2011 May A 0.888
#> 6 2011 Jun A 0.876
#> 7 2011 Jul A 0.909
#> 8 2011 Aug A 0.903
#> 9 2011 Sep A 0.939
#> 10 2012 Jan A 0.903
#> # ... with 1,070 more rows
Plot seasonal and monthly data
### Seasonal plot
s1 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
s1
### Monthly plot
m1 <- ggplot(MonthlyAvg, aes(x = Month, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
m1
Bonus
### https://stackoverflow.com/a/58369424/786542
# if (!require(devtools)) {
# install.packages('devtools')
# }
# devtools::install_github('erocoar/gghalves')
library(gghalves)
s2 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05) +
geom_half_violin(aes(fill = State),
side = "r", nudge = 0.01) +
theme_light() +
theme(legend.position = "bottom") +
guides(fill = guide_legend(nrow = 1))
s2
s3 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05, outlier.color = NA) +
geom_dotplot(aes(fill = State),
binaxis = "y", method = "histodot",
dotsize = 0.35,
stackdir = "up", position = PositionDodge) +
theme_light() +
theme(legend.position = "bottom") +
guides(color = guide_legend(nrow = 1))
s3
#> `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Created on 2019-10-16 by the reprex package (v0.3.0)

Resources