This question already has an answer here:
How Can I Quickly Inspect Built-in Data Sets (PSA)?
(1 answer)
Closed 2 years ago.
The package datasets and various packages come with a fair amount of useful datasets, however there seems to be no easy way to find your perfect dataset when you need it for your package examples, for teaching purposes, or to ask/answer a question here on SO.
Say for instance I want a dataset that is a data.frame, has at least 2 character columns, and is less than 100 rows long.
How can I explore EVERY dataset available and see a maximum of relevant information to make my choice ?
My past tries were messy, taking time, and crashed with some packages which have an unusual object structure like caret.
I've packaged a solution in a one function github package.
I'm copying the whole code at the bottom but the simplest is :
remotes::install_github("moodymudskipper/datasearch")
library(datasearch)
All data sets from package "dplyr"
dplyr_all <-
datasearch("dplyr")
View(dplyr_all)
Datasets from package "datasets" restricted by condition
datasets_ncol5 <-
datasearch("datasets", filter = ~is.data.frame(.) && ncol(.) == 5)
View(datasets_ncol5)
All datasets from all installed packages, no restriction
# might take more or less time, depends what you have installed
all_datasets <- datasearch()
View(all_datasets)
# subsetting the output
my_subset <- subset(
all_datasets,
class1 == "data.frame" &
grepl("treatment", names_collapsed) &
nrow < 100
)
View(my_subset)
datasearch <- function(pkgs = NULL, filter = NULL){
# make function silent
w <- options()$warn
options(warn = -1)
search_ <- search()
file_ <- tempfile()
file_ <- file(file_, "w")
on.exit({
options(warn = w)
to_detach <- setdiff(search(), search_)
for(pkg in to_detach) eval(bquote(detach(.(pkg))))
# note : we still have loaded namespaces, we could unload those that we ddn't
# have in the beginning but i'm worried about surprising effects, I think
# the S3 method tables should be cleaned too, and maybe other things
# note2 : tracing library and require didn't work
})
# convert formula to function
if(inherits(filter, "formula")) {
filter <- as.function(c(alist(.=), filter[[length(filter)]]))
}
## by default fetch all available packages in .libPaths()
if(is.null(pkgs)) pkgs <- .packages(all.available = TRUE)
## fetch all data sets description
df <- as.data.frame(data(package = pkgs, verbose = FALSE)$results)
names(df) <- tolower(names(df))
item <- NULL # for cmd check note
df <- transform(
df,
data_name = sub('.*\\((.*)\\)', '\\1', item),
dataset = sub(' \\(.*', '', item),
libpath = NULL,
item = NULL
)
df <- df[order(df$package, df$data_name),]
pkg_data_names <- aggregate(dataset ~ package + data_name, df, c)
pkg_data_names <- pkg_data_names[order(pkg_data_names$package, pkg_data_names$data_name),]
env <- new.env()
n <- nrow(pkg_data_names)
pb <- progress::progress_bar$new(
format = "[:bar] :percent :pkg",
total = n)
row_dfs <- vector("list", n)
for(i in seq(nrow(pkg_data_names))) {
pkg <- pkg_data_names$package[i]
data_name <- pkg_data_names$data_name[i]
datasets <- pkg_data_names$dataset[[i]]
pb$tick(tokens = list(pkg = format(pkg, width = 12)))
sink(file_, type = "message")
data(list=data_name, package = pkg, envir = env)
row_dfs_i <- lapply(datasets, function(dataset) {
dat <- get(dataset, envir = env)
if(!is.null(filter) && !filter(dat)) return(NULL)
cl <- class(dat)
nms <- names(dat)
nc <- ncol(dat)
if (is.null(nc)) nc <- NA
nr <- nrow(dat)
if (is.null(nr)) nr <- NA
out <- data.frame(
package = pkg,
data_name = data_name,
dataset = dataset,
class = I(list(cl)),
class1 = cl[1],
type = typeof(dat),
names = I(list(nms)),
names_collapsed = paste(nms, collapse = "/"),
nrow = nr,
ncol = nc,
length = length(dat))
if("data.frame" %in% cl) {
classes <- lapply(dat, class)
cl_flat <- unlist(classes)
out <- transform(
out,
classes = I(list(classes)),
types = I(list(vapply(dat, typeof, character(1)))),
logical = sum(cl_flat == 'logical'),
integer = sum(cl_flat == 'integer'),
numeric = sum(cl_flat == 'numeric'),
complex = sum(cl_flat == 'complex'),
character = sum(cl_flat == 'character'),
raw = sum(cl_flat == 'raw'),
list = sum(cl_flat == 'list'),
data.frame = sum(cl_flat == 'data.frame'),
factor = sum(cl_flat == 'factor'),
ordered = sum(cl_flat == 'ordered'),
Date = sum(cl_flat == 'Date'),
POSIXt = sum(cl_flat == 'POSIXt'),
POSIXct = sum(cl_flat == 'POSIXct'),
POSIXlt = sum(cl_flat == 'POSIXlt'))
} else {
out <- transform(
out,
nrow = NA,
ncol = NA,
classes = NA,
types = NA,
logical = NA,
integer = NA,
numeric = NA,
complex = NA,
character = NA,
raw = NA,
list = NA,
data.frame = NA,
factor = NA,
ordered = NA,
Date = NA,
POSIXt = NA,
POSIXct = NA,
POSIXlt = NA)
}
if(is.matrix(dat)) {
out$names <- list(colnames(dat))
out$names_collapsed = paste(out$names, collapse = "/")
}
out
})
row_dfs_i <- do.call(rbind, row_dfs_i)
if(!is.null(row_dfs_i)) row_dfs[[i]] <- row_dfs_i
sink(type = "message")
}
df2 <- do.call(rbind, row_dfs)
df <- merge(df, df2)
df
}
Extend/modify to your liking.
library(data.table)
dt = as.data.table(data(package = .packages(all.available = TRUE))$results)
dt = dt[, `:=`(Item = sub(' \\(.*', '', Item),
Object = sub('.*\\((.*)\\)', '\\1', Item))]
dt[, {
data(list = Object, package = Package)
d = eval(parse(text = Item))
classes = if (sum(class(d) %in% c('data.frame')) > 0) unlist(lapply(d, class))
else NA_integer_
.(class = paste(class(d), collapse = ","),
nrow = if (!is.null(nrow(d))) nrow(d) else NA_integer_,
ncol = if (!is.null(ncol(d))) ncol(d) else NA_integer_,
charCols = sum(classes == 'character'),
facCols = sum(classes == 'factor'))
}
, by = .(Package, Item)]
# Package Item class nrow ncol charCols facCols
# 1: datasets AirPassengers ts NA NA NA NA
# 2: datasets BJsales ts NA NA NA NA
# 3: datasets BJsales.lead ts NA NA NA NA
# 4: datasets BOD data.frame 6 2 0 0
# 5: datasets CO2 nfnGroupedData,nfGroupedData,groupedData,data.frame 84 5 0 3
# ---
#492: survival transplant data.frame 815 6 0 3
#493: survival uspop2 array 101 2 NA NA
#494: survival veteran data.frame 137 8 0 1
#495: viridis viridis.map data.frame 1024 4 1 0
#496: xtable tli data.frame 100 5 0 3
In package datasets there is no dataset of class data.frame that fulfills your conditions, more exactly if they are of class data.frame and have at most 100 columns, then none of them has two or more columns of class character. I've just found that out with a first version of the following code.
library(datasets)
res <- library(help = "datasets")
dat <- unlist(lapply(strsplit(res$info[[2]], " "), '[[', 1))
dat <- dat[dat != ""]
df_names <- NULL
for(i in seq_along(dat)){
d <- tryCatch(get(dat[i]), error = function(e) e)
if(inherits(d, "data.frame")){
if(nrow(d) <= 100){
char <- sum(sapply(d, is.character))
fact <- sum(sapply(d, is.factor))
if(char >= 2 || fact >= 2){
print(dat[i])
df_names <- c(df_names, dat[i])
}
}
}
}
df_names
[1] "CO2" "esoph" "npk" "sleep" "warpbreaks"
So I had to include extra instructions to handle columns of class factor. By default data frames are created with stringsAsFactors = TRUE. If you can do with those, there you have it, their names are in vector df_names. In order to make them available in the global environment just get the one you want.
The table returned by myfun() can be filtered with appropriate conditions, and the columns of datasets can be identified by its class given in the classes coulmn.
The problem with caret package is that it does not have any data frames or matrix object in it. The datasets may be present in the caret inside the list object. I am not sure about it, some list objects in the caret package contains a list of functions.
Also, if interested, you can make myfun() function to be more specific for returning information about data frames or matrix objects only.
myfun <- function( package )
{
t( sapply( ls( paste0( 'package:', package ) ), function(x){
y <- eval(parse(text = paste0( package, "::`", x, "`")))
data.frame( data_class = paste0(class(y), collapse = ","),
nrow = ifelse( any(class(y) %in% c( "data.frame", "matrix" ) ),
nrow(y),
NA_integer_ ),
ncol = ifelse( any(class(y) %in% c( "data.frame", "matrix" ) ),
ncol(y),
NA_integer_),
classes = ifelse( any(class(y) %in% c( "data.frame", "matrix" ) ),
paste0( unlist(lapply(y, class)), collapse = "," ),
NA),
stringsAsFactors = FALSE )
} ) )
}
library( datasets )
meta_data <- myfun( package = "datasets")
head(meta_data)
# data_class nrow ncol classes
# ability.cov "list" NA NA NA
# airmiles "ts" NA NA NA
# AirPassengers "ts" NA NA NA
# airquality "data.frame" 153 6 "integer,integer,numeric,integer,integer,integer"
# anscombe "data.frame" 11 8 "numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric"
# attenu "data.frame" 182 5 "numeric,numeric,factor,numeric,numeric"
meta_data[ "ChickWeight", ]
# $data_class
# [1] "nfnGroupedData,nfGroupedData,groupedData,data.frame"
#
# $nrow
# [1] 578
#
# $ncol
# [1] 4
#
# $classes
# [1] "numeric,numeric,ordered,factor,factor"
library( 'caret' )
meta_data <- myfun( package = "caret")
# data_class nrow ncol classes
# anovaScores "function" NA NA NA
# avNNet "function" NA NA NA
# bag "function" NA NA NA
# bagControl "function" NA NA NA
# bagEarth "function" NA NA NA
# bagEarthStats "function" NA NA NA
If the loaded packages needs to unloaded after applying the myfun() function on the package, try this:
loaded_pkgs <- search()
library( 'caret' )
meta_data <- myfun( package = "caret")
unload_pkgs <- setdiff( search(), loaded_pkgs )
for( i in unload_pkgs ) {
detach( pos = which( search() %in% i ) )
}
Related
I want to parse a JSON to a data.frame. Possibly the JSON doesn't contain all values. My minimum example shows two ways, one using tidyr::unnest_wider, one using data.table::rbindlist.
rm(list=ls())
library(tibble)
library(tidyr)
library(data.table)
n <- 10000
# Example data - parsed from a json-text-stream, possibly sparse
data <- list()
for(i in seq(n)) {
if(i%%7==0) {
l = list(
b=i*i/n,
c=exp(-i/100)
)
} else {
l = list(
a=i,
b=i*i/n,
c=exp(-i/100)
)
}
data[[i]] = l
}
# Using the tidyverse way
t0 <- Sys.time()
d1 <- tibble::tibble(json=data) %>% tidyr::unnest_wider(json)
t1 <- as.numeric(Sys.time()-t0,units="secs")
paste("This took",t1,"seconds.") %>% message()
# Using the data.table way
t0 <- tic()
d2 <- data %>% data.table::rbindlist(fill=T)
t1 <- as.numeric(Sys.time()-t0,units="secs")
paste("This took",t1,"seconds.") %>% message()
The data looks like this after parsing:
a
b
c
1
0.0001
0.9900498
2
0.0004
0.9801987
3
0.0009
0.9704455
4
0.0016
0.9607894
5
0.0025
0.9512294
6
0.0036
0.9417645
NA
0.0049
0.9323938
8
0.0064
0.9231163
9
0.0081
0.9139312
10
0.0100
0.9048374
The output is:
This took 3.77877902984619 seconds.
This took 0.0209999084472656 seconds.
Why is tidyr::unnest_wider so much slower?
Edit: In my application n can be as large as 10^6, 10^7, that's how I found out about the bottleneck.
unnest_wider is a R function including a loop on columns and calling many other R functions (col_to_wide,unchop,unpack, ...).
Such functions can't compete with a C implementation like data.table, see why.
To view source code, just type :
tidyr::unnest_wider
function (data, col, names_sep = NULL, simplify = TRUE, strict = FALSE,
names_repair = "check_unique", ptype = NULL, transform = NULL)
{
if (!is.data.frame(data)) {
abort("`data` must be a data frame.")
}
check_present(col)
cols <- tidyselect::eval_select(enquo(col), data)
col_names <- names(cols)
if (!is.null(names_sep) && !is_string(names_sep)) {
abort("`names_sep` must be a single string or `NULL`.")
}
if (!is_bool(strict)) {
abort("`strict` must be a single `TRUE` or `FALSE`.")
}
for (i in seq_along(cols)) {
col <- cols[[i]]
col_name <- col_names[[i]]
data[[col]] <- col_to_wide(col = data[[col]], name = col_name,
strict = strict, names_sep = names_sep)
}
data <- unchop(data, all_of(cols))
for (i in seq_along(cols)) {
col <- cols[[i]]
data[[col]] <- df_simplify(x = data[[col]], ptype = ptype,
transform = transform, simplify = simplify)
}
unpack(data, all_of(cols), names_repair = names_repair)
}
In my split(w7, w7$study.name)[48] call below there are 4 rows for which variable control == FALSE.
But I'm wondering why ctlistG(split(w7, w7$study.name)[48]) returns only one of such rows?
ps. I suspect, instead of lapply() I should have used mapply() in ctlistG().
Reproducible R code:
ctlist <- function(List, cont=FALSE, pos=1, outcom=1){
if(!inherits(List, "list")) List <- list(List)
h <- setNames(lapply(List, function(i) i[i$control==cont & i$post == pos & i$outcome == outcom, , drop = FALSE]), names(List))
Filter(NROW, h) }
#====================
ctlistG <- function(m){
input <- setNames(lapply(m, function(i) rev(expand.grid(outcom = seq_len(max(i$outcome, na.rm = TRUE)), pos = seq_len(max(i$post, na.rm = TRUE))))), names(m))
lapply(input, function(i) ctlist(m, cont = FALSE, pos = i$pos, outcom = i$outcom)) }
#==================== EXAMPLE OF USE:
w7 <- read.csv('https://raw.githubusercontent.com/rnorouzian/m/master/w7.csv')
ctlistG(split(w7, w7$study.name)[48]) # I expect 4 rows not 1 below!
#$VanBe_Jng_KenA
#$VanBe_Jng_KenA$VanBe_Jng_KenA
# study.name YofPub group.name n d
#406 VanBe_Jng_KenA 2012 NA 34 NA
If we need 4 rows, based on the function, we may need Map instead of lapply
out <- do.call(rbind, lapply(input, function(inp)
do.call(rbind, Map(function(p, o)
do.call(rbind, lapply(m, function(m1)
m1[m1$control == FALSE & m1$post == p & m1$outcome ==o, , drop = FALSE])),
inp$pos, inp$outcom))))
data
lst1 <- split(w7, w7$study.name)
m <- lst1[48]
I wish to apply a function only to some elements of a nested list
l <- list()
l$a$forecast <- rnorm(3)
l$a$model <- "arima"
l$b$forecast <- rnorm(3)
l$b$model <- "prophet"
The desired output would be like this:
applying the sum function to the $forecast element of the list
fcst <- c(sum(l$a$forecast),sum(l$b$forecast))
mdl <- c(l$a$model,l$b$model)
df <- data.frame(fcst,mdl)
I tried something like this:
df <- lapply(l$forecast, function(x) sum(x))
df <- do.call(rbind, Map(cbind, sku = names(df)))
Another approach using rrapply() in the rrapply-package combined with dplyr's bind_rows(). This also extends to lists containing deeper nested levels.
rrapply::rrapply(l, condition = function(x, .xname) .xname == "forecast", f = sum) %>%
dplyr::bind_rows()
#> # A tibble: 2 x 2
#> forecast model
#> <dbl> <chr>
#> 1 -1.28 arima
#> 2 1.10 prophet
Data
set.seed(1)
l <- list(
a = list(forecast = rnorm(3), model = "arima"),
b = list(forecast = rnorm(3), model = "prophet")
)
do.call(
rbind,
lapply(
l,
function(x) list(fcst = sum(x$forecast), model = x$model)
)
)
Since you know the exact dimensions of your returned object you can use vapply in cases like this for a minor performance improvement:
vapply(
l,
FUN = function(x) list(fcst = sum(x$forecast), model = x$model),
FUN.VALUE = list(fcst = numeric(1), model = character(1))
)
However, the resulting object can be hard to work with.
You can get the letters with the object letters, then using its output in a loop:
n = 2 #number of lists you have
sumfore = model = vector()
for(i in letters[seq(1,n,1)]){
sumfore[i] = sum(l[[i]]$forecast)
model[i] =l[[i]]$model}
newdf = data.frame(sumfore, model)
I am using the following code to obtain the mean of all possible combination (m=2) of the variables whose name starts with "form".
k=which(grepl("^form",colnames(data)))
combined <- combn(data[,k], 2, FUN = rowMeans)
colnames(combined) <- combn(names(data[,k]), 2, paste0, collapse="")
data <- cbind(data, combined)
The dataset "data" is the following:
structure(list(id = c(5309039, 5284969, 5300279, 5270289, 5259957,
5267086, 5173196, 5057536, 5246135, 5255558, 5241070, 5280194,
5112387, 444459, 5054590, 5048412, 5296390, 5093742, 5293520),
form13 = c(1300.81321145176, 1130.23869905075, 1292.03253463863,
1358.23586808642, 1250.66417156907, 1388.37813595599, 1277.89625553694,
1242.17552321015, 1275.95068420011, 1449.97932094858, 1494.93158409261,
1183.72005024492, 1319.72081010904, 1153.43556746197, 1451.47500658524,
1502.05308533551, 1641.66472289938, 1407.07852441646, 1444.3815517771
), form12 = c(1329.6, 1104.4, 1272, 1322.8, 1195.5, 1487.4,
1195.6, 1258, 1256.4, 1455, 1524, 1170, 1291.4, 1224.6, 1414,
1606, 1765.2, 1441, 1406.8), form11 = c(1325.578, 1201.752,
1346.42, 1424.884, 1328.03, 1367.262, 1294.928, 1278.99,
1330.482, 1493.54, 1524.19, 1242.21, 1379.522, 1178.458,
1438.37, 1475.15, 1611.236, 1426.11, 1431.014), form10 = c(1056.7264,
940.4956, 1076.29, 1149.9412, 1059.028, 1095.8536, 1027.9564,
1012.996, 1061.3296, 1214.386, 1243.156, 978.472, 1107.3616,
918.6304, 1162.6, 1197.124, 1324.8628, 1151.092, 1155.6952
), form9 = c(1265.95883621535, 1104.13796282321, 1292.61038190038,
1391.60226122629, 1269.10247448997, 1319.10781736395, 1226.47462059388,
1205.80097696249, 1272.24391797013, 1476.61400008329, 1514.11964245256,
1157.70450530205, 1334.62450699242, 1072.96302932, 1408.41424685422,
1453.98138963552, 1619.24856353662, 1393.1329826012, 1399.25113387699
), form8 = c(1482.14960970768, 1302.96011430734, 1455.11530997823,
1507.60187999797, 1403.62372119021, 1590.3115445541, 1392.70107590683,
1422.72772811208, 1440.68241714823, 1606.14610155669, 1656.53381495283,
1357.47229571355, 1476.63693689195, 1356.28387443873, 1567.80354390345,
1697.01564123702, 1829.93948069795, 1581.30521692185, 1561.45650301116
), form7 = c(1444.56088362196, 1256.09569669502, 1416.12716131828,
1471.33068319787, 1361.97012558123, 1558.32178921338, 1350.4820727773,
1382.06304580259, 1400.94715403591, 1574.97601740197, 1627.97203596215,
1313.42968513872, 1438.7628489193, 1312.17974558614, 1534.64866852904,
1670.54939207752, 1810.35399499291, 1548.84925168016, 1527.97307493173
), form6 = c(1199.39256844313, 1030.51525282711, 1173.91406615889,
1223.38008553142, 1125.38576782367, 1301.32988998026, 1115.09171006788,
1143.39035787661, 1160.31177216137, 1316.25318375141, 1363.74113364133,
1081.8903116367, 1194.19714454337, 1080.77028284113, 1280.11720270038,
1401.89327051093, 1527.16747332837, 1292.84186767351, 1274.13542778885
), form5 = c(1297.78687926793, 1159.12885718351, 1290.6491699916,
1344.46508388198, 1257.02131246849, 1368.96738018114, 1239.89545043121,
1250.12098970015, 1277.57642224122, 1419.04226152712, 1455.58342941928,
1202.60322079507, 1313.15664462902, 1177.98531965952, 1380.99558290387,
1461.37241431927, 1574.8610783177, 1384.16870680163, 1375.22939662201
), form4 = c(1335.97776730397, 1108.36308048125, 1324.2608292059,
1412.60257966574, 1269.05887158687, 1452.82443206729, 1240.94583733479,
1257.73161635649, 1302.80120256198, 1535.02507407783, 1595.00938916382,
1179.7286135352, 1361.20807332313, 1139.31698950533, 1472.56938122075,
1604.51232282192, 1790.81013902909, 1477.77823673001, 1463.10387273464
), form3 = c(1354.228, 1167.277, 1385.695, 1504.159, 1357.93,
1417.162, 1307.953, 1283.89, 1361.632, 1607.815, 1654.09,
1228.36, 1435.672, 1132.108, 1524.52, 1580.05, 1785.511,
1506.01, 1513.414), form2 = c(2275.7324829005, 1960.23260237236,
2259.163108513, 2384.94888103794, 2181.57337654262, 2442.86896126772,
2142.36120747078, 2165.7494001933, 2228.9072421228, 2562.48497832825,
2650.8148703194, 2057.68931533889, 2311.5302827576, 2002.33637794664,
2471.44922673607, 2664.88828208925, 2945.12448823488, 2479.00498842122,
2457.73611045874), form1 = c(1180.88828860349, 1056.82591443514,
1162.17101167316, 1198.5102427986, 1126.52065872992, 1255.77452231775,
1118.95833314255, 1139.74737411054, 1152.17835587263, 1266.73762443072,
1301.62370599969, 1094.56758356167, 1177.07157336578, 1093.7447765967,
1240.19104186727, 1329.65141749175, 1421.68162869499, 1249.53896489237,
1235.79664943772)), row.names = c(NA, -19L), class = c("tbl_df",
"tbl", "data.frame"))
>
The code works well and I am trying to implement it in order to take all possible combination with m from 2 to 8. I've tried the following code, but it doesn't work.
x<-2:8
k=which(grepl("^form",colnames(data)))
combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
data <- cbind(data, combined)
as I get the following error:
> x<-2:8
> k=which(grepl("^form",colnames(data)))
> combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
**Error in combn(data[, k], seq_along(x), FUN = rowMeans) :
length(m) == 1L is not TRUE**
> colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
**Error in combn(names(data[, k]), seq_along(x), paste0, collapse = "") :
length(m) == 1L is not TRUE**
> data <- cbind(data, combined)
Where am I wrong?
Also, I would like to add the following prephix "comb_" to the name of all generated variables. How should I modify the above code?
Thank you!
The reason is simply that combn only takes one m at a time. Just use sapply to iterate over the ms. In order to get the column names in one step we can use 'colnames<-()'. 'colnames<-'(x, names) is actually the same as colnames(x) <- names but with the advantage that everything is on the RHS. "form" suffixes can be deleted with gsub.
k <- 2:14
combined.2.lst <- sapply(2:8, function(m)
`colnames<-`(combn(data[,k], m, rowMeans),
combn(names(data[,k]), m, function(x)
paste0("comb.", paste0(gsub("form", "", x), collapse=".")))))
This gives you a list which then can be cbinded.
combined.2 <- do.call(cbind, combined.2.lst)
dim(combined.2)
# [1] 19 7085
Result
combined.2[1:5, c(1, 50, 100, 500, 1000, 5000)] # example columns
# comb.13.12 comb.9.1 comb.13.10.9 comb.13.10.2.1 comb.9.5.4.3 comb.13.7.6.5.4.3.2
# [1,] 1315.207 1223.424 1207.833 1453.540 1313.488 1458.356
# [2,] 1117.319 1080.482 1058.291 1271.948 1134.727 1258.836
# [3,] 1282.016 1227.391 1220.311 1447.414 1323.304 1448.835
# [4,] 1340.518 1295.056 1299.926 1522.909 1413.207 1528.446
# [5,] 1223.082 1197.812 1192.932 1404.447 1288.278 1400.515
Finally just use cbind(data, combined.2).
The function combn, can only take 1 element for the number of combinations, so you, need to use lapply and finally combine them with do.call(cbind..):
First we define the function for combination x:
func = function(x,DATA){
mat = combn(DATA,x,FUN=rowMeans)
colnames(mat) = combn(names(DATA),x, paste0, collapse="")
mat
}
Then we iterate:
k=which(grepl("^form",colnames(data)))
combined = lapply(2:8,func,DATA=data[,k])
combined <- do.call(cbind, combined)
If you are familiar with purrr, you can also do:
library(purrr)
library(dplyr)
combined = 2:8 %>% map(~as.tibble(func(.x,DATA=data[,k]))) %>% bind_cols()
You need do iteration over m<-2:8, using lapply() or sapply(). I tried to keep your main structure of your code and make minimal changes to let it work:
m <- 2:8
k=which(grepl("^form",colnames(data)))
combined <- Reduce(cbind,lapply(m, function(m) combn(data[,k], m, FUN = rowMeans)))
colnames(combined) <-unlist(sapply(m, function(m) combn(names(data[,k]), m, paste0, collapse="")))
data <- cbind(data, combined)
In short, I have a larger function that creates data.frames that are subsets of a larger data.frame and are named after the arguments of the function. It's building data.frames for raw data AND the outputs and the predictive output of Holt-Winters...meaning it is creating multiple data.frames. A small example is the following (though there's not enough intervals here to actually generate a ts class data.frame):
Group <- c("Primary_Group","Primary_Group","Primary_Group","Primary_Group","Primary_Group","Primary_Group","Secondary_Group","Secondary_Group","Secondary_Group","Secondary_Group","Secondary_Group","Secondary_Group","Tertiary_Group","Tertiary_Group","Tertiary_Group","Tertiary_Group","Tertiary_Group","Tertiary_Group")
Day <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
Type <- c("A","A","A","B","B","B","A","A","A","B","B","B","A","A","A","B","B","B")
Value <- c(7,3,10,3,9,4,0,9,3,10,1,6,3,4,10,2,3,1)
df <- as.data.frame(cbind(Group,Day,Type,Value))
Fun <- function(Group,Type, A, B, G){
df <- Data[Data$Group== Group & Data$Type== Type, ]
assign(paste(Group,Type,"_df",sep = ''), df, envir = parent.frame())
df_holtwinters <- HoltWinters(ts(Data[Data$Group== Group & Data$Type== Type, ],
frequency = 365), alpha = A, beta = B, gamma = G)
assign(paste(Group,Type,"_hw",sep = ''), df_holtwinters, envir = parent.frame())
}
You'll notice that the Group and Type are characters, while A, B, G are either numeric or NULL.
If I now have a data.frame composed of lists values, how could I best loop the above function (likely with mapply) to use the values from each column in row one...then each column from row 2 etc - creating several data frames.
argGroup <- c("Primary_Group","Primary_Group","Secondary_Group","Secondary_Group","Tertiary_Group","Tertiary_Group")
argType <- c("A","B","A","B","A","B")
argA <- c(NA, NA, NA, NA, NA, NA)
argB <- c(0.05, 0.05, NA, NA, NA, NULL)
argG <- c(NA, NA, NA, NA, NA, NA)
argGroup[is.na(argGroup)] <- list(NULL)
argType[is.na(argType)] <- list(NULL)
argA[is.na(argA)] <- list(NULL)
argB[is.na(argB)] <- list(NULL)
argG[is.na(argG)] <- list(NULL)
Arguments <- cbind(argType, argType, argA, argB, argG)
Ideally, I would get the following data.frames to generate...
Primary_Group_A_df
Primary_Group_A_hw
Primary_Group_B_df
Primary_Group_B_hw
Secondary_Group_A_df
Secondary_Group_A_hw
Secondary_Group_B_df
Secondary_Group_B_hw
Tertiary_Group_A_df
Tertiary_Group_A_hw
Tertiary_Group_B_df
Tertiary_Group_B_hw
It would also be helpful to understand how to best (most automated way) rbind all the _df together and all the _hw together.
Any help would be amazing and very appreciated. Thanks so much!
Avoid flooding your global environment with many similarly structured objects. Consider using a container such as a list to hold the many dataframes. One useful method is by to subset your dataframe by one or more factor(s) such as Group and Type to return a list of dataframes. Also, don't iterate by rows but merge arguments with data for one pass of arguments per subset.
Specifically, call by twice for df and hw lists. But first, merge the df and Arguments data frames by Group and Type. One challenge is NULL cannot be stored in a data frame, so consider saving "NULL" string and assign temp variables to pass into the HW arguments. Unfortunately, this will cast entire column as character type which you will need to convert with as.numeric for non-NULL values.
Merge
Group <- c("Primary_Group","Primary_Group","Secondary_Group","Secondary_Group",
"Tertiary_Group","Tertiary_Group")
Type <- c("A","B","A","B","A","B")
argA <- c("NULL", "NULL", "NULL", "NULL", "NULL", "NULL")
argB <- c(0.05, 0.05, "NULL", "NULL", "NULL", "NULL")
argG <- c("NULL", "NULL", "NULL", "NULL", "NULL", "NULL")
Arguments <- data.frame(Group, Type, argA, argB, argG, stringsAsFactors=FALSE)
df <- merge(df, Arguments, by=c("Group", "Type"))
Dataframe List (with named df elements)
# ORDER FOR NAMING LATER
df <- with(df, df[order(Type, Group),])
# DATAFRAME LIST
df_list <- by(df, df[c("Group", "Type")], identity)
# RENAME LIST
df_list <- setNames(df_list, unique(paste0(df$Group, "_", df$Type, "_df")))
# REFERENCE ELEMENTS
df_list$Primary_Group_A_df
df_list$Secondary_Group_A_df
df_list$Tertiary_Group_A_df
...
HW List (with named hw elements)
# HW LIST
hw_list <- by(df, df[c("Group", "Type")], function(sub) {
# CONDITIONALLY ASSIGN TEMP VARIABLES
# (BEING SUBSETS: max(arg*)==min(arg*)==mean(arg*)==median(arg*))
if(!is.na(max(sub$argA)) & max(sub$argA) == "NULL") { tmpA <- NULL }
else { tmpA <- max(as.numeric(sub$argA)) }
if(!is.na(max(sub$argB)) & max(sub$argB) == "NULL") { tmpB <- NULL }
else { tmpB <- max(as.numeric(sub$argB)) }
if(!is.na(max(sub$argG)) & max(sub$argG) == "NULL") { tmpG <- NULL }
else { tmpG <- max(as.numeric(sub$argG)) }
# PASS ARGS ONCE PER SUBSET
return(HoltWinters(ts(sub, frequency = 365), alpha=tmpA, beta=tmpB, gamma=tmpG))
})
# RENAME LIST
hw_list <- setNames(hw_list, unique(paste0(df$Group, "_", df$Type, "_hw")))
# REFERENCE ELEMENTS
hw_list$Primary_Group_A_hw
hw_list$Secondary_Group_A_hw
hw_list$Tertiary_Group_A_hw
...
Output (using 3 for HW's frequency to align with posted data)
> hw_list$Primary_Group_A_hw
Holt-Winters exponential smoothing with trend and additive seasonal component.
Call:
HoltWinters(x = ts(sub[c("Group", "Day", "Type", "Value")], frequency = 3), alpha = tmpA, beta = tmpB, gamma = tmpG)
Smoothing parameters:
alpha: 0.2169231
beta : 0.05
gamma: 0.1
Coefficients:
[,1]
a 2.89129621
b 0.08783715
s1 0.54815382
s2 -0.12485260
s3 0.21087038
> hw_list$Secondary_Group_A_hw
Holt-Winters exponential smoothing with trend and additive seasonal component.
Call:
HoltWinters(x = ts(sub[c("Group", "Day", "Type", "Value")], frequency = 3), alpha = tmpA, beta = tmpB, gamma = tmpG)
Smoothing parameters:
alpha: 0.752124
beta : 0
gamma: 0
Coefficients:
[,1]
a 3.691664e+00
b 3.333333e-01
s1 3.333333e-01
s2 -1.480388e-16
s3 -3.333333e-01
> hw_list$Tertiary_Group_A_hw
Holt-Winters exponential smoothing with trend and additive seasonal component.
Call:
HoltWinters(x = ts(sub[c("Group", "Day", "Type", "Value")], frequency = 3), alpha = tmpA, beta = tmpB, gamma = tmpG)
Smoothing parameters:
alpha: 0.3145406
beta : 0
gamma: 0
Coefficients:
[,1]
a 3.022946e+00
b -3.333333e-01
s1 -3.333333e-01
s2 -1.480388e-16
s3 3.333333e-01
You're losing type information by using as.data.frame(cbind(...)),
just use data.frame directly:
Data <- data.frame(
Group = rep(c("Primary_Group", "Secondary_Group", "Tertiary_Group"), each = 6L),
Day = rep(1L:3L, 6L),
Type = rep(rep(c("A", "B"), each = 3L), 3L),
Value = c(7,3,10,3,9,4,0,9,3,10,1,6,3,4,10,2,3,1)
)
Afterwards, I presume you can do the following:
split_data <- split(Data, as.list(Data[, c("Group", "Type")]))
dfs <- do.call(rbind, split_data)
dfs_hw <- lapply(split_data, function(sub_data) {
Map(argA, argB, argG, f = function(A, B, G) {
HoltWinters(ts(sub_data, frequency = 365), alpha = A, beta = B, gamma = G)
})
})
dfs_hw <- do.call(rbind, unlist(dfs_hw, recursive = FALSE))
But I get an error from HoltWinters,
so I can't say for sure.
Also, I think dfs simply has Data again, just reordered.