Applying a function to a nested list - r

I have a data frame that contains nested list based on ID. I am trying to apply a function to the nested list within this data frame, but I am running into this error:
Error in make_track(tbl = x, .x = x, .y = y, .t = date, uid = ID, crs = sp::CRS("+init=epsg:32612")) : Non existent columns from tbl were requested.
Here is my reproducible example. I was wondering what the best way to apply a function to a nested list might be, and how I can go about fixing this error. Do I have to do a double lapply to fix this problem?
set.seed(12345)
library(lubridate)
library(dplyr)
library(amt)
f = function(data){
data %>% mutate(
new = floor_date(data$date, "10 days"),
new = if_else(day(new) == 31, new - days(10), new)
) %>%
group_split(new)
}
nested <- tibble(
ID = rep(c("A","B","C","D", "E"), 100),
date = rep_len(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"), 500),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000)
) %>% group_by(ID) %>%
nest() %>%
mutate(data = map(data, f))
track_list <- lapply(nested, function (x){
make_track(tbl = x, .x = x, .y = y, .t = date,
uid = ID,
# lat/long: 4326 (lat/long, WGS84 datum).
# utm: crs = sp::CRS("+init=epsg:32612"))
crs = sp::CRS("+init=epsg:32612"))
})

The issue is that the data is nested, so we need to do one more level inside to pick up the data. Also, the make_track requires all columns to be in the same data object, so we need to create the corresponding uid from the 'ID' column of nested object
library(purrr)
library(dplyr)
library(amt)
out <- map2_dfr(nested$ID, nested$data, function(z, lst1)
map_dfr(lst1, ~ {
dat <- .x %>%
mutate(ID = z)
make_track(tbl = dat, .x = x, .y = y, .t = date, uid = ID,
crs = sp::CRS("+init=epsg:32612"))
}))
-output
> out
# A tibble: 500 x 4
x_ y_ t_ uid
<dbl> <dbl> <date> <chr>
1 74418. 820935. 2010-01-01 A
2 63327. 885896. 2010-01-06 A
3 60691. 873949. 2010-01-11 A
4 69250. 868411. 2010-01-16 A
5 69075. 876142. 2010-01-21 A
6 67797. 829892. 2010-01-26 A
7 75860. 843542. 2010-01-31 A
8 67233. 882318. 2010-02-05 A
9 75644. 826283. 2010-02-10 A
10 66424. 853789. 2010-02-15 A
# … with 490 more rows
If we want the output as a nested list, use remove the _dfr
out <- map2(nested$ID, nested$data, function(z, lst1)
map(lst1, ~ {
dat <- .x %>%
mutate(ID = z)
make_track(tbl = dat, .x = x, .y = y, .t = date, uid = ID,
crs = sp::CRS("+init=epsg:32612"))
}))

Related

NA values in vectors passed to map_dfr

I'm trying to pass vectors, each with a different number of NA values, through to a map() function but it's returning an error.
I have a tibble of N numeric columns and 1 categorical column. I want to compare the distributions for each of the numeric columns against the other split by the values of the categorical column. I use overlapping::overlap() to calculate the overlap of the distributions, and i feed the numeric columns into a map_dfr function for the iteration. For example:
require(overlapping)
require(dplyr)
require(purrr)
set.seed( 1 )
n <- 100
G1 <- sample( 0:30, size = n, replace = TRUE )
G2 <- sample( 0:30, size = n, replace = TRUE, prob = dbinom( 0:30, 31, .55 ))
G3 <- sample( 0:30, size = n, replace = TRUE, prob = dbinom( 0:30, 41, .65 ))
Data <- data.frame(y = G1, x = G2, z = G3, group = rep(c("G1","G2", "G3"), each = n), class = rep(c("C1","C2", "C3"), each = 1)) %>% as_tibble()
Data
overlap_fcn <- function(.x) {
## construct list of vectors
dist_list <- list(
"C1" = Data %>%
filter(class == 'C1', !is.na(.x)) %>%
pull(.x),
"C2" = Data %>%
filter(class == 'C2', !is.na(.x)) %>%
pull(.x),
"C3" = Data %>%
filter(class == 'C3', !is.na(.x)) %>%
pull(.x)
)
## calculate distribution overlaps
return(
enframe(
overlapping::overlap(dist_list)$OV*100
) %>%
mutate(value = paste0(round(value, 2), "%"),
class = .x) %>%
rename(comparison = name, overlap = value) %>%
relocate(class)
)
}
overlap_table <- purrr::map_dfr(
.x = c('y', 'x', "z"),
.f = ~overlap_fcn(.x))
overlap_table
The above works as intended. However, in practice I have different amounts of missingess in each of x, y, and z. I try to account for this with the filter on !is.na(.x) but it's not working. For example:
Data$x[1:3] <- NA
Data$y[10:20] <- NA
Data$z[100:150] <- NA
overlap_table <- purrr::map_dfr(
.x = c('x', 'y', "z"),
.f = ~overlap_fcn(.x))
returns this error:
Error in density.default(x[[j]], n = nbins, ...): 'x' contains missing values
Error in density.default(x[[j]], n = nbins, ...): 'x' contains missing values
Traceback:
1. purrr::map_dfr(.x = c("x", "y", "z"), .f = ~overlap_fcn(.x))
2. map(.x, .f, ...)
3. .f(.x[[i]], ...)
4. overlap_fcn(.x)
5. enframe(overlapping::overlap(dist_list)$OV * 100) %>% mutate(value = paste0(round(value,
. 2), "%"), class = .x) %>% rename(comparison = name, overlap = value) %>%
. relocate(class) # at line 25-33 of file <text>
6. relocate(., class)
7. rename(., comparison = name, overlap = value)
8. mutate(., value = paste0(round(value, 2), "%"), class = .x)
9. enframe(overlapping::overlap(dist_list)$OV * 100)
10. overlapping::overlap(dist_list)
11. density(x[[j]], n = nbins, ...)
12. density.default(x[[j]], n = nbins, ...)
13. stop("'x' contains missing values")
Can anyone help me out here please? I'm sure it's something super obvious i'm missing; i just can't see what!
Here, the .x is character class. We may need to convert to symbol and evaluate (!!)
overlap_fcn <- function(.x) {
## construct list of vectors
dist_list <- list(
"C1" = Data %>%
filter(class == 'C1', !is.na(!! rlang::sym(.x))) %>%
pull(.x),
"C2" = Data %>%
filter(class == 'C2', !is.na(!! rlang::sym(.x))) %>%
pull(.x),
"C3" = Data %>%
filter(class == 'C3', !is.na(!! rlang::sym(.x))) %>%
pull(.x)
)
## calculate distribution overlaps
return(
enframe(
overlapping::overlap(dist_list)$OV*100
) %>%
mutate(value = paste0(round(value, 2), "%"),
class = .x) %>%
rename(comparison = name, overlap = value) %>%
relocate(class)
)
}
-testing after creating the NAs in Data
> purrr::map_dfr(
+ .x = c('x', 'y', "z"),
+ .f = ~overlap_fcn(.x))
# A tibble: 9 × 3
class comparison overlap
<chr> <chr> <chr>
1 x C1-C2 98.61%
2 x C1-C3 97.46%
3 x C2-C3 97.5%
4 y C1-C2 95.47%
5 y C1-C3 96.22%
6 y C2-C3 97.14%
7 z C1-C2 90.17%
8 z C1-C3 94.9%
9 z C2-C3 89.24%

Extract each data frame name from a list of data frames in lapply function

I have a list of many data frames and I am trying to perform manipulations to each data frame in the list. I created this lapply function and then the list is then merged together. However when trying to rename certain columns so that they include the respective data frame name:
paste(deparse(substitute(x)),"_start"
the dataframe names are being extracted like this :
x[[i]]_start_1
Here is the full code:
df_list <-lapply(df_list, function(x){
lookup <- c(start = paste(deparse(substitute(x)),"_start"),
end = paste(deparse(substitute(x)),"_end"),
top = paste(deparse(substitute(x)),"_top"),
left = paste(deparse(substitute(x)),"_left"),
height = paste(deparse(substitute(x)),"_height"),
width = paste(deparse(substitute(x)),"_width"),
type = paste(deparse(substitute(x)),"_type"),
value = paste(deparse(substitute(x)),"_value"))
x <- x %>% rename_with(.fn = ~lookup[.x], .cols = intersect(names(.), names(lookup)))
x <- arrange(x, creativeId)
x <- x[,-1]
x <- x %>% distinct()
x$counter <- with(x, ave(creativeId, with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
x <- x %>% relocate(counter)
x <- x %>% pivot_wider(names_from =counter, values_from= -names(.)[1:2])
})
new_df <- Reduce(function(x,y) merge(x,y,all=TRUE), df_list)
Please let me know if there is a workaround so that the data frame names are printed correctly. Thank you!
We may use Map
df_list2 <- Map(function(x, nm) {
lookup <- c(start = paste0(nm,"_start"),
end = paste0(nm, "_end"),
top = paste0(nm,"_top"),
left = paste0(nm,"_left"),
height = paste0(nm,"_height"),
width = paste0(nm,"_width"),
type = paste0(nm,"_type"),
value = paste0(nm,"_value"))
x <- x %>%
rename_with(.fn = ~lookup[.x], .cols = intersect(names(.), names(lookup)))
x <- arrange(x, creativeId)
x <- x[,-1]
x <- x %>% distinct()
x$counter <- with(x, ave(creativeId,
with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
x <- x %>% relocate(counter)
x <- x %>% pivot_wider(names_from =counter, values_from= -names(.)[1:2])
}, df_list, names(df_list))

Drilldown by class group in HighcharteR - R

I have this data:
library(highcharter)
library(dplyr)
the_dates <- as.Date(c(rep("2021-01-01",3),
rep("2021-02-01",3)))
the_values <- c(2,3,4,5,6,7)
the_group <- c("Group_A","Group_B","Group_B",
"Group_A","Group_B","Group_B")
the_class <- c("X","Y","Z",
"X","Y","Z")
the_data <- data.frame(the_dates,
the_group,
the_class,
the_values,
stringsAsFactors = FALSE)
> the_data
the_dates the_group the_class the_values
1 2021-01-01 Group_A X 2
2 2021-01-01 Group_B Y 3
3 2021-01-01 Group_B Z 4
4 2021-02-01 Group_A X 5
5 2021-02-01 Group_B Y 6
6 2021-02-01 Group_B Z 7
And I want to create a drill down plot. So I would like to see the groups and if I drill down, I would like to see the class. What I have tried is:
the_data %>%
hchart(
type = "spline",
hcaes(x = the_dates, y = the_values, drilldown = the_class),
colorByPoint = TRUE)
But the link to drill down is in the dates. Any help will be greatly appreciated.
Here's one potential solution with a few caveats.
I had some issues with as.Date() in the drilldown x axis names, so I've left them as characters. I've also done a quick mean() on the the_values by the_date so there's actually something to drilldown to.
library(highcharter)
library(dplyr)
library(purrr) # for the map() function
the_dates <- c(rep("2021-01-01",3),
rep("2021-02-01",3))
the_values <- c(2,3,4,5,6,7)
the_group <- c("Group_A","Group_B","Group_B",
"Group_A","Group_B","Group_B")
the_class <- c("X","Y","Z",
"X","Y","Z")
the_data <- data.frame(the_dates,
the_group,
the_class,
the_values,
stringsAsFactors = FALSE)
mean_data <- the_data %>%
group_by(the_dates) %>%
summarise(mean_values = mean(the_values))
drill_data <- the_data %>%
group_nest(the_dates) %>%
mutate(
id = the_dates,
type = "column",
data = map(data, ~ .x %>%
mutate(
name = the_class,
y = the_values
) %>%
list_parse())
)
Now let's build the plot:
mean_data %>%
hchart(
type = "spline",
hcaes(x = the_dates, y = mean_values, drilldown = the_dates, name = the_dates),
name = "Dates",
colorByPoint = TRUE) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = list_parse(drill_data)
)

Grid seach on ARIMA model in R

I'm trying to make grid search for my ARIMA model working and I need additional help with it.
I have the following data:
head(train)
Date Count
<date> <int>
1 2016-06-15 21
2 2016-06-16 21
3 2016-06-17 12
4 2016-06-18 20
5 2016-06-19 29
6 2016-06-20 30
Train data Date variable ranges from 2016-06-15 to 2019-06-30 with 1111 observations in total
Train data Count variable ranges from min=3 to max=154 with mean=23.83 and sd=13.84.
I was able to define hyper parameters and create 36 ARIMA models with the following code:
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models = map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train,
order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
And than I get an error when I try to calculate RMSE across my test data with the following code:
final_models <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((.x - .y) ** 2))))
I get one error and one warning message:
Error in .x - .y : non-numeric argument to binary operator
In addition: Warning message:
In mean((.x - .y)^2) :
Incompatible methods ("Ops.ts", "Ops.data.frame") for "-"
Can someone please help me with that?
Here is my test data if it's needed to create dummy data:
head(test)
Date Count
<date> <int>
1 2019-07-02 20
2 2019-07-03 28
3 2019-07-04 35
4 2019-07-05 34
5 2019-07-06 60
6 2019-07-07 63
Test data Date variable ranges from 2019-07-01 to 2019-12-31 with 184 observations in total
Train data Count variable ranges from min=6 to max=63 with mean=21.06 and sd=9.89.
The problem is that when you are computing the RMSE you are using time series rather than vectors. So, you have to change the class of both predictions and true values to numeric.
Here is my solution:
# Load libraries
library(fpp2)
library(dplyr)
library(xts)
library(purrr)
library(tidyr)
# Create sample dataset
dates <- seq.Date(as.Date("2019-07-02"), by = "day", length.out = length(WWWusage))
train <- data.frame(Date = dates, Count = WWWusage)
# Get test dataset using drift method
test <- forecast::rwf(WWWusage, h = 183, drift = TRUE)$mean
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models =
map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train, order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
# Estimate RSME for each candidate
# Note: you have to make sure that both .x and .y are numeric
final_models2 <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((as.numeric(.x) - as.numeric(.y)) ** 2))))

Find max of rows from specific columns and extract column name and corresponding row value from another column

Here is a data structure that I have:
structure(list(UDD_beta = c(1.17136554204268, 0.939587997289016
), UDD_pval = c(0, 0), UDD_R.sq = c(0.749044972637797, 0.516943886705951
), SSX_beta = c(1.05356804780772, 0.927948300464624), SSX_pval = c(0,
0), SSX_R.sq = c(0.60226298037862, 0.629111666509209), SPP_beta = c(0.675765151939885,
0.516425218613404), SPP_pval = c(0, 0), SPP_R.sq = c(0.479849538274406,
0.378266618442121), EEE_beta = c(0.690521022226874, 0.639380962824289
), EEE_pval = c(0, 0), EEE_R.sq = c(0.585610742768951, 0.676073352909597
)), .Names = c("UDD_beta", "UDD_pval", "UDD_R.sq", "SSX_beta",
"SSX_pval", "SSX_R.sq", "SPP_beta", "SPP_pval", "SPP_R.sq",
"EEE_beta", "EEE_pval", "EEE_R.sq"), row.names = c("DDK", "DDL"
), class = "data.frame")
I want to take R.sq columns and for each row find the max and the column name of the max value. Then take corresponding beta. Expected output:
Name Value
DDK UDD 1.17136554204268
DDL EEE 0.690521022226874
Sorry, the second expected value should be 0.639380962824289.
We could use max.col. Subset the columns of interest i.e. columns that have 'R.sq' using the grep, then get the column index of max value with max.col. Use that to get the column names and also the values that correspond to a particular row (row/column indexing)
i1 <- grep("R.sq", names(df1))
i2 <- max.col(df1[i1], "first")
i3 <- grep("beta", names(df1))
res <- data.frame(Names = sub("_.*", "", names(df1)[i1][i2]),
Value = df1[i3][cbind(1:nrow(df1), i2)])
row.names(res) <- row.names(df1)
sub_data <- data[grep("R.sq", colnames(data))]
colnames(sub_data) <- gsub("_R.sq", "", colnames(sub_data))
sub_data$Name <- NA
sub_data$Value <- NA
for (i in 1:nrow(sub_data)){
sub_data$Name[i] <- names(sub_data[i,])[which.max(apply(sub_data[i,], 2, max))]
sub_data$Value[i] <- max(data[grep(paste0(sub_data$Name[i], "_beta"), colnames(data))], na.rm=T)
}
sub_data[c("Name", "Value")]
# Name Value
#DDK UDD 1.171366
#DDL EEE 0.690521
You can use a tidyverse approach via gathering your df to long and filtering both R.sq vars and max value, i.e.
library(tidyverse)
df %>%
rownames_to_column('ID') %>%
gather(var, val, -ID) %>%
filter(grepl('R.sq|beta', var)) %>%
group_by(ID) %>%
mutate(max1=as.integer(val == max(val[grepl('R.sq', var)]))) %>%
group_by(ID, grp = sub('_.*', '', var)) %>%
filter(!all(max1 == 0) & grepl('beta', var)) %>%
ungroup() %>% select(-c(max1, grp))
which gives,
# A tibble: 2 x 3
ID var val
<chr> <chr> <dbl>
1 DDK UDD_beta 1.171366
2 DDL EEE_beta 0.639381
# Need ID for all possible betas and Rsq
ID <- gsub("_R.sq", "", grep("_R.sq$", names(INPUT), value = TRUE))
dummy <- function(x) {
# Find out which Rsq is largest
i <- ID[which.max(x[paste0(ID, "_R.sq")])]
# Extract beta for largest Rsq
data.frame(Name = i, Value = x[paste0(i, "_beta")])
}
do.call("rbind", apply(INPUT, 1, dummy))

Resources