I built a prediction model using logistic regression which works well. But when I analyze the estimates calculated on the test dataset, I can see the variable I used to stratify the split comes up when I want it to be excluded of the model as a predictor. update_role() doesn't do that...
data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(vital ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(strata_var, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_smote(vital)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05,
0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
In the last plot I can see the strata variable coming up.
You got this result because of the combination of role selection functions you used in step_dummy(). (full reprex at the end of post)
You used the following selections. Which selects all nominal, but not any outcomes. This selected the strata variables because it is both a nominal variable and not an outcome.
all_nominal(), -all_outcomes()
A better option would be to use all_nominal_predictors() which won't select id/strata variables.
library(tidymodels)
data("penguins")
rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal(), -all_outcomes())
rec_spec1 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 4
#> body_mass_g species island_Dream island_Torgersen
#> <int> <fct> <dbl> <dbl>
#> 1 3750 Adelie 0 1
#> 2 3800 Adelie 0 1
#> 3 3250 Adelie 0 1
#> 4 NA Adelie 0 1
#> 5 3450 Adelie 0 1
#> 6 3650 Adelie 0 1
#> 7 3625 Adelie 0 1
#> 8 4675 Adelie 0 1
#> 9 3475 Adelie 0 1
#> 10 4250 Adelie 0 1
#> # … with 334 more rows
rec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal_predictors())
rec_spec2 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 3
#> island body_mass_g species
#> <fct> <int> <fct>
#> 1 Torgersen 3750 Adelie
#> 2 Torgersen 3800 Adelie
#> 3 Torgersen 3250 Adelie
#> 4 Torgersen NA Adelie
#> 5 Torgersen 3450 Adelie
#> 6 Torgersen 3650 Adelie
#> 7 Torgersen 3625 Adelie
#> 8 Torgersen 4675 Adelie
#> 9 Torgersen 3475 Adelie
#> 10 Torgersen 4250 Adelie
#> # … with 334 more rows
Full reprex
library(tidymodels)
library(themis)
library(forcats)
data("penguins")
penguins0 <- penguins %>%
mutate(ids = row_number(),
species = factor(species == "Adelie")) %>%
drop_na()
data_split <- initial_split(penguins0, prop = 3/4, strata = island)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(species ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(island, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(species)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = island)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20),
mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
Created on 2021-08-20 by the reprex package (v2.0.1)
Related
I have fitted an ARIMA model using the fable R package. When I go to use the model to forecast the distribution using bootstrap resampled errors it returns all NAs.
ARIMA_model <- targets %>%
as_tsibble(key = 'key', index = 'time') %>%
model(ARIMA(y ~ x))
ARIMA_fable <- ARIMA_model %>%
generate(new_data = scenarios, bootstrap = TRUE, times = 100)
I can get it to run using forecast() but I want to see each ensemble member and the errors are not expected to be normally distributed.
ARIMA_fable <- ARIMA_model %>% forecast(new_data = scenarios, bootstrap = FALSE)
Here is a reproducible example:
key <- c('A', 'B', 'C')
time <- seq(start, by = 1, length.out = 15)
set.seed(123)
targets <- expand.grid(time = time, key = key) %>%
mutate(x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30)))
ARIMA_model <- targets %>%
as_tsibble(key = 'key', index = 'time') %>%
model(ARIMA(y ~ x))
test_scenarios <- targets %>%
mutate(time = time + lubridate::days(16),
x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30))) %>%
as_tsibble(key = 'key', index = 'time')
ARIMA_model %>%
forecast(new_data = test_scenarios, bootstrap = T)
I still can't reproduce the NA forecasts that you get, however I have identified and fixed an issue in the code that could be causing issues here (https://github.com/tidyverts/fable/commit/685cc9ec7846a990d7c664f8eb24e4ad75e1673a).
This updated version can be installed with remotes::install_github("tidyverts/fable"), and with this version I am able to run the example you provided without issue:
library(fable)
#> Loading required package: fabletools
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
key <- c('A', 'B', 'C')
time <- seq(Sys.Date(), by = 1, length.out = 15)
set.seed(123)
targets <- expand.grid(time = time, key = key) %>%
mutate(x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30)))
ARIMA_model <- targets %>%
as_tsibble(key = 'key', index = 'time') %>%
model(ARIMA(y ~ x))
test_scenarios <- targets %>%
mutate(time = time + lubridate::days(16),
x = sort(runif(45, 0, 30)),
y = sort(runif(45, 0, 30))) %>%
as_tsibble(key = 'key', index = 'time')
ARIMA_model %>%
forecast(new_data = test_scenarios, bootstrap = T)
#> # A fable: 45 x 6 [1D]
#> # Key: key, .model [3]
#> key .model time y .mean x
#> <fct> <chr> <date> <dist> <dbl> <dbl>
#> 1 A ARIMA(y ~ x) 2022-10-02 sample[5000] 10.2 1.82
#> 2 A ARIMA(y ~ x) 2022-10-03 sample[5000] 11.1 2.73
#> 3 A ARIMA(y ~ x) 2022-10-04 sample[5000] 11.6 2.81
#> 4 A ARIMA(y ~ x) 2022-10-05 sample[5000] 11.9 3.92
#> 5 A ARIMA(y ~ x) 2022-10-06 sample[5000] 12.3 4.26
#> 6 A ARIMA(y ~ x) 2022-10-07 sample[5000] 13.0 4.27
#> 7 A ARIMA(y ~ x) 2022-10-08 sample[5000] 13.3 4.41
#> 8 A ARIMA(y ~ x) 2022-10-09 sample[5000] 14.3 4.63
#> 9 A ARIMA(y ~ x) 2022-10-10 sample[5000] 15.2 5.63
#> 10 A ARIMA(y ~ x) 2022-10-11 sample[5000] 15.6 6.59
#> # … with 35 more rows
Created on 2022-09-16 by the reprex package (v2.0.1)
I am currently working with the palmer penguins data set in R and want to summarise data that combines means, median, range and quants, grouping by sex.
My current solution has the quant data split from the summary data. Is there a way to do this in one go. If not how do I combine the data sets. The group quant is currently in long format, and I am not sure how to combine them.
group_summary <- penguins %>% group_by(sex) %>% summarize(mean = mean(bill_length_mm,
na.rm = TRUE), meadian = median(bill_length_mm, na.rm = TRUE), range =
max(bill_length_mm, na.rm = TRUE) - min(bill_length_mm, na.rm = TRUE))
group_quant <- penguins %>% group_by(sex) %>% summarize(quantile(bill_length_mm,
probs =seq(.1, 1, by = .1), na.rm =TRUE, .groups = 'drop'))
I had the following solution but it drops the NA values from Sex and I am not sure why.
group_summary <- do.call(data.frame,aggregate(bill_length_mm ~ sex, penguins,
function(x) c(mean = mean(x, na.rm = TRUE), median = median(x, na.rm = TRUE), range =
max(x, na.rm = TRUE) - min(x, na.rm = TRUE), quantile(x, probs = seq(.1, 1, by = .1),
na.rm = TRUE, .groups = 'drop'))))
You may save the quantiles in a list and then use unnest_wider to create new columns from them. To calculate range I used diff(range(...)) instead of max(...) - min(...). Both of them are fine but I included it to show an alternative.
library(palmerpenguins)
library(dplyr)
library(tidyr)
penguins %>%
group_by(sex) %>%
summarize(mean = mean(bill_length_mm, na.rm = TRUE),
median = median(bill_length_mm, na.rm = TRUE),
range = diff(range(bill_length_mm, na.rm = TRUE)),
quantile = list(quantile(bill_length_mm, probs = seq(.1, 1, by = .1), na.rm = TRUE))) %>%
unnest_wider(quantile)
# sex mean median range `10%` `20%` `30%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 female 42.1 42.8 25.9 35.8 36.7 38.2 40 42.8 45.1 45.7 46.5 47.5 58
#2 male 45.9 46.8 25 38.8 40.5 41.3 43.2 46.8 49.0 50.0 50.8 51.9 59.6
#3 NA 41.3 42 13.2 36.8 37.7 37.8 38.6 42 44 44.5 45.2 46.4 47.3
Percentage Change
selectedCountry <- dataset %>%
filter(country == 'Japan') %>%
select(population, suicides_no, year) %>%
group_by(year) %>%
summarise(s_count = sum(suicides_no), p_count = sum(population))
year s_count p_count
<dbl> <dbl> <dbl>
1 1979 20711 107268500
2 1980 20416 108473500
3 1981 19976 109674700
4 1982 20535 110722900
5 1983 24853 111070000
6 1984 24221 111950000
I want to find the correlation between population and suicide after I aggregate population and suicide like this
percentage <- selectedCountry %>%
arrange(year) %>%
mutate(pct.chg.s = 100 * (s_count - lag(s_count,default=first(s_count))) / lag(s_count,default=first(s_count))) %>%
mutate(pct.chg.p = 100 * (p_count - lag(p_count,default=first(p_count))) / lag(p_count,default=first(p_count))) %>%
mutate(correlation = cor(pct.chg.s, pct.chg.p))
head(percentage)
I end up with a result
year s_count p_count pct.chg.s pct.chg.p correlation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1979 20711 107268500 0 0 0.00789
2 1980 20416 108473500 -1.42 1.12 0.00789
3 1981 19976 109674700 -2.16 1.11 0.00789
4 1982 20535 110722900 2.80 0.956 0.00789
5 1983 24853 111070000 21.0 0.313 0.00789
6 1984 24221 111950000 -2.54 0.792 0.00789
to plot the corr between two variables using two way (I can't sure if this true or false)
ggscatter(percentage, x = "pct.chg.s", y = "pct.chg.p",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Percentage Change in Suicide",
ylab = "Percentage Change in Population",
color = "blue", shape = 19,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
ellipse = TRUE, mean.point = TRUE,
star.plot = TRUE)
ggplot(percentage, aes(x = pct.chg.p, y = pct.chg.s)) +
geom_point() +
geom_smooth(method = "lm",formula = y ~ x) +
labs(title = "Correlation between Pubulation and Suicides",
x = "Percentage Change in Suicide",
y = "Percentage Change in Population")
In this problem I want to map a function over multiple data inputs to create output showing frequencies of item responses.
First I create two data sets using the psych package:
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(psych))
set.seed(123)
data_input_sim <-
as_tibble(sim.poly.ideal(nvar = 50, n = 1000, cat = 4, )[["items"]]) %>%
mutate_all(
~ case_when(
.x == 0 ~ "never",
.x == 1 ~ "occasionally",
.x == 2 ~ "frequently",
.x == 3 ~ "always"
)
) %>%
rename_all( ~ str_c("i", str_pad(
as.character(1:50), 2, side = "left", pad = "0"
))) %>%
mutate(
ID = 100001:101000,
age = sample(c(5:12), 1000, replace = TRUE),
age_range = case_when(
age <=8 ~ "5 to 8 yo",
T ~ "9 to 12 yo"
),
gender = sample(
c("female", "male"),
1000,
replace = TRUE,
prob = c(0.53, 0.47)
),
educ = sample(
c("no_HS", "HS_grad", "some_college", "BA_plus"),
1000,
replace = TRUE,
prob = c(0.119, 0.263, 0.306, 0.311)
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
select(ID:clin_status, i01:i50)
data_input_bfi <- bfi %>%
drop_na() %>%
sample_n(1000) %>%
mutate(
ID = 200001:201000,
age_range = case_when(
age <= 18 ~ "18 yo or younger",
between(age, 19, 24) ~ "19 to 24 yo",
between(age, 25, 39) ~ "25 to 39 yo",
T ~ "40 yo or older"
),
gender = case_when(gender == 1 ~ "male",
gender == 2 ~ "female"),
educ = case_when(
education == 1 ~ "no_HS",
education == 2 ~ "HS_grad",
education == 3 ~ "some_college",
T ~ "BA_plus"
),
ethnic = sample(
c("hispanic", "asian", "black", "white", "other"),
1000,
replace = TRUE,
prob = c(0.239, 0.048, 0.136, 0.521, .056)
),
region = sample(
c("northeast", "south", "midwest", "west"),
1000,
replace = TRUE,
prob = c(0.166, 0.383, 0.212, 0.238)
),
clin_status = sample(
c("typ", "clin"),
1000,
replace = TRUE,
prob = c(0.8, 0.2)
)
) %>%
mutate_at(
vars(A1:O5),
~
case_when(
.x == 1 ~ "very_inaccurate",
.x == 2 ~ "moderately_inaccurate",
.x == 3 ~ "slightly_inaccurate",
.x == 4 ~ "slightly_accurate",
.x == 5 ~ "moderately_accurate",
.x == 6 ~ "very_accurate",
)
) %>%
select(ID, age:clin_status, A1:O5)
Then I extract and sequence elements unique to each data set: the suffix of its name, the names of its item columns, and the names of its item categories:
data_name_suffix <- c("sim", "bfi")
sim_item_cols <- str_c("i", str_pad(as.character(1:50), 2, side = "left", pad = "0"))
bfi_item_cols <- cross(list(c("A", "C", "E", "N", "O"), seq(1:5))) %>%
map_chr(str_c, collapse = "") %>%
sort()
sim_item_cats <- c("never", "occasionally","frequently", "always")
bfi_item_cats <- c("very_inaccurate", "moderately_inaccurate", "slightly_inaccurate",
"slightly_accurate", "moderately_accurate", "very_accurate")
data_name_suffix is a two-element character vector; I then create two-element lists (using quos()) to hold the item column and category names:
item_cols <- quos(sim_item_cols, bfi_item_cols)
item_cats <- quos(sim_item_cats, bfi_item_cats)
Now I attempt to map the output-creating function over the three inputs, using purrr::pmap():
pmap_df(
list(data_name_suffix,
item_cols,
item_cats),
~
eval(as.name(str_c("data_input_", data_name_suffix))) %>%
select(!!!item_cols) %>%
gather(var, value) %>%
group_by(var, value) %>%
count(var, value) %>%
ungroup() %>%
spread(value, n) %>%
arrange(match(var, !!!item_cols)) %>%
select(var, !!!item_cats) %>%
assign(str_c("freq_item_val_", data_name_suffix), ., envir = .GlobalEnv)
)
And it returns this error:
Error: Unknown columns `A1`, `A2`, `A3`, `A4`, `A5` and ...
Which suggests to me that R is seeing the list item_cols as a single long character vector, rather than two separate character vectors to iterate over.
And here we reach the limit of my understanding of and experience with tidyeval techniques. I suspect that I'm doing something wrong with quos() and !!!.
Thanks in advance for any help, and I hope whoever reads this is safe and healthy during this surreal time.
Here, we could use mget to get the values of the objects
library(stringr)
library(purrr)
library(dplyr)
library(tidyr)
list(mget(str_c('data_input_', data_name_suffix)),
item_cols,
item_cats) %>%
pmap(~ ..1 %>%
select(!!! ..2) %>%
pivot_longer(everything(), names_to = 'var', values_to = 'value') %>%
count(var, value) %>%
pivot_wider(names_from = value, values_from = n) %>%
arrange(match(var, !!!..2)) %>%
select(var, !!! ..3) )
#$data_input_sim
# A tibble: 50 x 5
# var never occasionally frequently always
# <chr> <int> <int> <int> <int>
# 1 i01 465 366 141 28
# 2 i02 489 336 147 28
# 3 i03 457 367 146 30
# 4 i04 433 385 162 20
# 5 i05 418 362 171 49
# 6 i06 420 369 169 42
# 7 i07 405 367 182 46
# 8 i08 361 401 194 44
# 9 i09 346 391 211 52
#10 i10 334 425 203 38
# … with 40 more rows
#$data_input_bfi
# A tibble: 25 x 7
# var very_inaccurate moderately_inaccurate slightly_inaccurate slightly_accurate moderately_accurate very_accurate
# <chr> <int> <int> <int> <int> #<int> <int>
# 1 A1 334 278 151 130 75 32
# 2 A2 18 49 48 197 365 323
# 3 A3 32 51 72 210 353 282
# 4 A4 48 69 60 159 243 421
# 5 A5 26 66 89 207 340 272
# 6 C1 17 48 82 213 383 257
# 7 C2 26 85 98 212 361 218
# 8 C3 35 80 102 272 322 189
# 9 C4 296 270 166 163 83 22
#10 C5 197 212 118 207 167 99
# … with 15 more rows
NOTE: assigning to create multiple objects is not recommended. Instead keep the output in a list and make changes in each of the list elements (if needed) by looping over it with map
I would like to group data into multiple seasin such that my season are winter: Dec - Feb; Spring: Mar - May; Summer: Jun -Aug, and Fall: Sep - Nov. I would then like to boxplot the Winter and Spring seasonal data comparing A to B and then A to C. Here is my laborious code so far. I would appreciate an efficient way of data grouping and plotting.
library(tidyverse)
library(reshape2)
Dates30s = data.frame(seq(as.Date("2011-01-01"), to= as.Date("2040-12-31"),by="day"))
colnames(Dates30s) = "date"
FakeData = data.frame(A = runif(10958, min = 0.5, max = 1.5), B = runif(10958, min = 1.6, max = 2), C = runif(10958, min = 0.8, max = 1.8))
myData = data.frame(Dates30s, FakeData)
myData = separate(myData, date, sep = "-", into = c("Year", "Month", "Day"))
myData$Year = as.numeric(myData$Year)
myData$Month = as.numeric(myData$Month)
SeasonalData = myData %>% group_by(Year, Month) %>% summarise_all(funs(mean)) %>% select(Year, Month, A, B, C)
Spring = SeasonalData %>% filter(Month == 3 | Month == 4 |Month == 5)
Winter1 = SeasonalData %>% filter(Month == 12)
Winter1$Year = Winter1$Year+1
Winter2 = SeasonalData %>% filter(Month == 1 | Month == 2 )
Winter = rbind(Winter1, Winter2) %>% filter(Year >= 2012 & Year <= 2040) %>% group_by(Year) %>% summarise_all(funs(mean)) %>% select(-"Month")
BoxData = gather(Winter, key = "Variable", value = "value", -Year )
ggplot(BoxData, aes(x=Variable, y=value,fill=factor(Variable)))+
geom_boxplot() + labs(title="Winter") +facet_wrap(~Variable)
I would like to have Two figures: Figure 1 split in two; one for Winter season and one for Summer season (see BoxPlot 1) and one for Monthly annual average representing average monthly values across the entire time period (2011 -2040) see Boxplot 2
This is what I usually do it. All calculation and plotting are based on water year (WY) or hydrologic year from October to September.
library(tidyverse)
library(lubridate)
set.seed(123)
Dates30s <- data.frame(seq(as.Date("2011-01-01"), to = as.Date("2040-12-31"), by = "day"))
colnames(Dates30s) <- "date"
FakeData <- data.frame(A = runif(10958, min = 0.3, max = 1.5),
B = runif(10958, min = 1.2, max = 2),
C = runif(10958, min = 0.6, max = 1.8))
### Calculate Year, Month then Water year (WY) and Season
myData <- data.frame(Dates30s, FakeData) %>%
mutate(Year = year(date),
MonthNr = month(date),
Month = month(date, label = TRUE, abbr = TRUE)) %>%
mutate(WY = case_when(MonthNr > 9 ~ Year + 1,
TRUE ~ Year)) %>%
mutate(Season = case_when(MonthNr %in% 9:11 ~ "Fall",
MonthNr %in% c(12, 1, 2) ~ "Winter",
MonthNr %in% 3:5 ~ "Spring",
TRUE ~ "Summer")) %>%
select(-date, -MonthNr, -Year) %>%
as_tibble()
myData
#> # A tibble: 10,958 x 6
#> A B C Month WY Season
#> <dbl> <dbl> <dbl> <ord> <dbl> <chr>
#> 1 0.645 1.37 1.51 Jan 2011 Winter
#> 2 1.25 1.79 1.71 Jan 2011 Winter
#> 3 0.791 1.35 1.68 Jan 2011 Winter
#> 4 1.36 1.97 0.646 Jan 2011 Winter
#> 5 1.43 1.31 1.60 Jan 2011 Winter
#> 6 0.355 1.52 0.708 Jan 2011 Winter
#> 7 0.934 1.94 0.825 Jan 2011 Winter
#> 8 1.37 1.89 1.03 Jan 2011 Winter
#> 9 0.962 1.75 0.632 Jan 2011 Winter
#> 10 0.848 1.94 0.883 Jan 2011 Winter
#> # ... with 10,948 more rows
Calculate seasonal and monthly average by WY
### Seasonal Avg by WY
SeasonalAvg <- myData %>%
select(-Month) %>%
group_by(WY, Season) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Season)
SeasonalAvg
#> # A tibble: 366 x 4
#> WY Season State MFI
#> <dbl> <chr> <chr> <dbl>
#> 1 2011 Fall A 0.939
#> 2 2011 Spring A 0.907
#> 3 2011 Summer A 0.896
#> 4 2011 Winter A 0.909
#> 5 2012 Fall A 0.895
#> 6 2012 Spring A 0.865
#> 7 2012 Summer A 0.933
#> 8 2012 Winter A 0.895
#> 9 2013 Fall A 0.879
#> 10 2013 Spring A 0.872
#> # ... with 356 more rows
### Monthly Avg by WY
MonthlyAvg <- myData %>%
select(-Season) %>%
group_by(WY, Month) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Month) %>%
mutate(Month = factor(Month))
MonthlyAvg
#> # A tibble: 1,080 x 4
#> WY Month State MFI
#> <dbl> <ord> <chr> <dbl>
#> 1 2011 Jan A 1.00
#> 2 2011 Feb A 0.807
#> 3 2011 Mar A 0.910
#> 4 2011 Apr A 0.923
#> 5 2011 May A 0.888
#> 6 2011 Jun A 0.876
#> 7 2011 Jul A 0.909
#> 8 2011 Aug A 0.903
#> 9 2011 Sep A 0.939
#> 10 2012 Jan A 0.903
#> # ... with 1,070 more rows
Plot seasonal and monthly data
### Seasonal plot
s1 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
s1
### Monthly plot
m1 <- ggplot(MonthlyAvg, aes(x = Month, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
m1
Bonus
### https://stackoverflow.com/a/58369424/786542
# if (!require(devtools)) {
# install.packages('devtools')
# }
# devtools::install_github('erocoar/gghalves')
library(gghalves)
s2 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05) +
geom_half_violin(aes(fill = State),
side = "r", nudge = 0.01) +
theme_light() +
theme(legend.position = "bottom") +
guides(fill = guide_legend(nrow = 1))
s2
s3 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05, outlier.color = NA) +
geom_dotplot(aes(fill = State),
binaxis = "y", method = "histodot",
dotsize = 0.35,
stackdir = "up", position = PositionDodge) +
theme_light() +
theme(legend.position = "bottom") +
guides(color = guide_legend(nrow = 1))
s3
#> `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Created on 2019-10-16 by the reprex package (v0.3.0)