I have this data:
library(highcharter)
library(dplyr)
the_dates <- as.Date(c(rep("2021-01-01",3),
rep("2021-02-01",3)))
the_values <- c(2,3,4,5,6,7)
the_group <- c("Group_A","Group_B","Group_B",
"Group_A","Group_B","Group_B")
the_class <- c("X","Y","Z",
"X","Y","Z")
the_data <- data.frame(the_dates,
the_group,
the_class,
the_values,
stringsAsFactors = FALSE)
> the_data
the_dates the_group the_class the_values
1 2021-01-01 Group_A X 2
2 2021-01-01 Group_B Y 3
3 2021-01-01 Group_B Z 4
4 2021-02-01 Group_A X 5
5 2021-02-01 Group_B Y 6
6 2021-02-01 Group_B Z 7
And I want to create a drill down plot. So I would like to see the groups and if I drill down, I would like to see the class. What I have tried is:
the_data %>%
hchart(
type = "spline",
hcaes(x = the_dates, y = the_values, drilldown = the_class),
colorByPoint = TRUE)
But the link to drill down is in the dates. Any help will be greatly appreciated.
Here's one potential solution with a few caveats.
I had some issues with as.Date() in the drilldown x axis names, so I've left them as characters. I've also done a quick mean() on the the_values by the_date so there's actually something to drilldown to.
library(highcharter)
library(dplyr)
library(purrr) # for the map() function
the_dates <- c(rep("2021-01-01",3),
rep("2021-02-01",3))
the_values <- c(2,3,4,5,6,7)
the_group <- c("Group_A","Group_B","Group_B",
"Group_A","Group_B","Group_B")
the_class <- c("X","Y","Z",
"X","Y","Z")
the_data <- data.frame(the_dates,
the_group,
the_class,
the_values,
stringsAsFactors = FALSE)
mean_data <- the_data %>%
group_by(the_dates) %>%
summarise(mean_values = mean(the_values))
drill_data <- the_data %>%
group_nest(the_dates) %>%
mutate(
id = the_dates,
type = "column",
data = map(data, ~ .x %>%
mutate(
name = the_class,
y = the_values
) %>%
list_parse())
)
Now let's build the plot:
mean_data %>%
hchart(
type = "spline",
hcaes(x = the_dates, y = mean_values, drilldown = the_dates, name = the_dates),
name = "Dates",
colorByPoint = TRUE) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = list_parse(drill_data)
)
Related
I have a data frame that contains nested list based on ID. I am trying to apply a function to the nested list within this data frame, but I am running into this error:
Error in make_track(tbl = x, .x = x, .y = y, .t = date, uid = ID, crs = sp::CRS("+init=epsg:32612")) : Non existent columns from tbl were requested.
Here is my reproducible example. I was wondering what the best way to apply a function to a nested list might be, and how I can go about fixing this error. Do I have to do a double lapply to fix this problem?
set.seed(12345)
library(lubridate)
library(dplyr)
library(amt)
f = function(data){
data %>% mutate(
new = floor_date(data$date, "10 days"),
new = if_else(day(new) == 31, new - days(10), new)
) %>%
group_split(new)
}
nested <- tibble(
ID = rep(c("A","B","C","D", "E"), 100),
date = rep_len(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"), 500),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000)
) %>% group_by(ID) %>%
nest() %>%
mutate(data = map(data, f))
track_list <- lapply(nested, function (x){
make_track(tbl = x, .x = x, .y = y, .t = date,
uid = ID,
# lat/long: 4326 (lat/long, WGS84 datum).
# utm: crs = sp::CRS("+init=epsg:32612"))
crs = sp::CRS("+init=epsg:32612"))
})
The issue is that the data is nested, so we need to do one more level inside to pick up the data. Also, the make_track requires all columns to be in the same data object, so we need to create the corresponding uid from the 'ID' column of nested object
library(purrr)
library(dplyr)
library(amt)
out <- map2_dfr(nested$ID, nested$data, function(z, lst1)
map_dfr(lst1, ~ {
dat <- .x %>%
mutate(ID = z)
make_track(tbl = dat, .x = x, .y = y, .t = date, uid = ID,
crs = sp::CRS("+init=epsg:32612"))
}))
-output
> out
# A tibble: 500 x 4
x_ y_ t_ uid
<dbl> <dbl> <date> <chr>
1 74418. 820935. 2010-01-01 A
2 63327. 885896. 2010-01-06 A
3 60691. 873949. 2010-01-11 A
4 69250. 868411. 2010-01-16 A
5 69075. 876142. 2010-01-21 A
6 67797. 829892. 2010-01-26 A
7 75860. 843542. 2010-01-31 A
8 67233. 882318. 2010-02-05 A
9 75644. 826283. 2010-02-10 A
10 66424. 853789. 2010-02-15 A
# … with 490 more rows
If we want the output as a nested list, use remove the _dfr
out <- map2(nested$ID, nested$data, function(z, lst1)
map(lst1, ~ {
dat <- .x %>%
mutate(ID = z)
make_track(tbl = dat, .x = x, .y = y, .t = date, uid = ID,
crs = sp::CRS("+init=epsg:32612"))
}))
I'm trying to make grid search for my ARIMA model working and I need additional help with it.
I have the following data:
head(train)
Date Count
<date> <int>
1 2016-06-15 21
2 2016-06-16 21
3 2016-06-17 12
4 2016-06-18 20
5 2016-06-19 29
6 2016-06-20 30
Train data Date variable ranges from 2016-06-15 to 2019-06-30 with 1111 observations in total
Train data Count variable ranges from min=3 to max=154 with mean=23.83 and sd=13.84.
I was able to define hyper parameters and create 36 ARIMA models with the following code:
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models = map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train,
order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
And than I get an error when I try to calculate RMSE across my test data with the following code:
final_models <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((.x - .y) ** 2))))
I get one error and one warning message:
Error in .x - .y : non-numeric argument to binary operator
In addition: Warning message:
In mean((.x - .y)^2) :
Incompatible methods ("Ops.ts", "Ops.data.frame") for "-"
Can someone please help me with that?
Here is my test data if it's needed to create dummy data:
head(test)
Date Count
<date> <int>
1 2019-07-02 20
2 2019-07-03 28
3 2019-07-04 35
4 2019-07-05 34
5 2019-07-06 60
6 2019-07-07 63
Test data Date variable ranges from 2019-07-01 to 2019-12-31 with 184 observations in total
Train data Count variable ranges from min=6 to max=63 with mean=21.06 and sd=9.89.
The problem is that when you are computing the RMSE you are using time series rather than vectors. So, you have to change the class of both predictions and true values to numeric.
Here is my solution:
# Load libraries
library(fpp2)
library(dplyr)
library(xts)
library(purrr)
library(tidyr)
# Create sample dataset
dates <- seq.Date(as.Date("2019-07-02"), by = "day", length.out = length(WWWusage))
train <- data.frame(Date = dates, Count = WWWusage)
# Get test dataset using drift method
test <- forecast::rwf(WWWusage, h = 183, drift = TRUE)$mean
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models =
map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train, order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
# Estimate RSME for each candidate
# Note: you have to make sure that both .x and .y are numeric
final_models2 <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((as.numeric(.x) - as.numeric(.y)) ** 2))))
I'm trying to count instances where a certain string appears in a dataframe (this will be a sub-string, i.e. "blue" will appear within a larger block of text), and then summarize those counts by another field. Here's the code:
totals_by_county <- county_data %>%
group_by(county_data$county)%>%
summarise(number_occurences = count(grepl('blue', county_data$color,ignore.case = TRUE)))
totals_by_county
And I get this error:
no applicable method for 'summarise_' applied to an object of class "logical"
Is there a way to do this in the method I'm trying to use above? Thanks in advance!
With grepl:
totals_by_county <- county_data %>%
group_by(county) %>%
summarise(number_occurences = sum(grepl('blue', color, ignore.case = TRUE)))
or, with count_if from expss:
totals_by_county <- county_data %>%
group_by(county) %>%
summarise(number_occurences = count_if(perl('blue', ignore.case = TRUE), color))
UPDATE with reproducible example:
library(dplyr)
library(expss)
county_data = data.frame(
county = c("A", "A", "A", "B", "B"),
color = c("blue-blue", "red", "orange-blue", "yellow", "green"),
stringsAsFactors = FALSE)
county_data %>%
group_by(county) %>%
summarise(number_occurences = count_if(perl('blue', ignore.case = TRUE), color))
# A tibble: 2 x 2
# county number_occurences
# <chr> <int>
# 1 A 2
# 2 B 0
I have two functions: date_diff and group_stat. So I have read this article tidyverse and I try so create simple functions and use the pipe.
The first function creates a difftime and names them timex_minus_timey but when I pipe this result into the next function I have to look at the name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)
data <- dplyr::bind_rows(
tibble::tibble(Hosp = rep("A", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60)),
tibble::tibble(Hosp = rep("B", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60))
)
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
out
}
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_function(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, summary_var = time2_minus_time1, mean)
#> # A tibble: 2 x 2
#> Hosp time2_minus_time1_mean
#> <chr> <dbl>
#> 1 A 15.1
#> 2 B 14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way you could add an attribute containing the new column's name with date_diff, and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
attr(out, 'date_diff_nm') <- name
out
}
group_stat <- function(df, group_var, summary_var, .f) {
if(!is.null(attr(df, 'date_diff_nm')) & missing(summary_var))
summary_var <- attr(df, 'date_diff_nm')
group_var <- rlang::enquo(group_var)
name <- paste0(summary_var, "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise_at(summary_var, funs(!!name := .f), na.rm = T)
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, .f = mean)
# # A tibble: 2 x 2
# Hosp time2_minus_time1_mean
# <chr> <dbl>
# 1 A 15.1
# 2 B 14.9
I am trying to build a table, and one of my variables should have another variable superscriptet after it. I can find several related answers here on SO, but they all involve fixed values that need to be superscriptet, instead of vectors as in my case.
Also most examples involve plot legends and not tables like in my case (Although I don't think that makes much of a difference).
Example data:
library(tidyverse)
library(knitr)
df <- crossing(
X = seq(1:2),
Y = c("A", "B"))
df
# A tibble: 4 x 2
X Y
<int> <chr>
1 1 A
2 1 B
3 2 A
4 2 B
I would like to mutate a new variable that is just X with Y values superscriptet after it.
Here is what I have tried (Doesn't work):
df %>% mutate(
New = paste0(X, "^Y")) %>%
kable()
df %>% mutate(
New = paste0(X, ^{Y})) %>%
kable()
df %>% mutate(
New = paste0(X, bquote(^~{.Y}~))) %>%
kable()
Any help appreciated.
You could use tableHTML:
df <- data.frame(
X = seq(1:2),
Y = c("A", "B"))
library(dplyr)
library(tableHTML)
You can slightly modify X with the HTML tag <sup> to display Y as a superset:
df %>%
mutate(X = paste0(X, "<sup>", Y, "</sup>")) %>%
select(X) %>%
tableHTML(rownames = FALSE,
escape = FALSE,
widths = 50)
Edit
As pointed out by Steen, this also works with knitr:
df %>%
mutate(X = paste0(X, "<sup>", Y, "</sup>")) %>%
select(X) %>%
knitr::kable(escape = FALSE)
Is it for a pdf output?
Because in this case the following could work:
library(tidyverse)
library(knitr)
df <- crossing(
X = seq(1:2),
Y = c("A", "B"))
df %>% mutate(
New = paste0(X, "\\textsuperscript{", Y, "}")) %>%
kable(escape = FALSE)
Using escape = FALSE to add LaTeX inside the table.