tidyr::spread() with multiple keys and values

I assume this has been asked multiple times, but I couldn't find the right search terms to reach a workable solution.
How can I spread() a data frame based on multiple keys for multiple values?
A simplified version of the data I'm working with looks like this (the real data has many more columns to spread, but still only two keys: id and the time point of a given measurement):
df <- data.frame(id = rep(1:10, 3),
                 time = rep(1:3, each = 10),
                 x = rnorm(n = 30),
                 y = rnorm(n = 30))
> head(df)
id time x y
1 1 1 -2.62671241 0.01669755
2 2 1 -1.69862885 0.24992634
3 3 1 1.01820778 -1.04754037
4 4 1 0.97561596 0.35216040
5 5 1 0.60367158 -0.78066767
6 6 1 -0.03761868 1.08173157
> tail(df)
id time x y
25 5 3 0.03621258 -1.1134368
26 6 3 -0.25900538 1.6009824
27 7 3 0.13996626 0.1359013
28 8 3 -0.60364935 1.5750232
29 9 3 0.89618748 0.0294315
30 10 3 0.14709567 0.5461084
What I'd like to have is a data frame populated like this:
one row per id, with a column for each combination of time point and measurement variable.

With the development version of tidyr (tidyr_0.8.3.9000), we can use pivot_wider to reshape multiple value columns from long to wide format:
library(dplyr)
library(tidyr)
library(stringr)
df %>%
  mutate(time = str_c("time", time)) %>%
  pivot_wider(names_from = time, values_from = c("x", "y"), names_sep = "")
# A tibble: 10 x 7
# id xtime1 xtime2 xtime3 ytime1 ytime2 ytime3
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 -0.256 0.483 -0.254 -0.652 0.655 0.291
# 2 2 1.10 -0.596 -1.85 1.09 -0.401 -1.24
# 3 3 0.756 -2.19 -0.0779 -0.763 -0.335 -0.456
# 4 4 -0.238 -0.675 0.969 -0.829 1.37 -0.830
# 5 5 0.987 -2.12 0.185 0.834 2.14 0.340
# 6 6 0.741 -1.27 -1.38 -0.968 0.506 1.07
# 7 7 0.0893 -0.374 -1.44 -0.0288 0.786 1.22
# 8 8 -0.955 -0.688 0.362 0.233 -0.902 0.736
# 9 9 -0.195 -0.872 -1.76 -0.301 0.533 -0.481
#10 10 0.926 -0.102 -0.325 -0.678 -0.646 0.563
NOTE: The numbers differ because no seed (set.seed()) was set when creating the sample dataset.
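In more recent tidyr releases, the mutate() step can be folded into pivot_wider() itself via the names_glue argument — a small sketch of the same reshape:
df %>%
  pivot_wider(names_from = time, values_from = c(x, y),
              names_glue = "{.value}time{time}")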

Reshaping with multiple value variables can best be done with dcast from data.table or reshape from base R.
library(data.table)
out <- dcast(setDT(df), id ~ paste0("time", time), value.var = c("x", "y"), sep = "")
out
# id xtime1 xtime2 xtime3 ytime1 ytime2 ytime3
# 1: 1 0.4334921 -0.5205570 -1.44364515 0.49288757 -1.26955148 -0.83344256
# 2: 2 0.4785870 0.9261711 0.68173681 1.24639813 0.91805332 0.34346260
# 3: 3 -1.2067665 1.7309593 0.04923993 1.28184341 -0.69435556 0.01609261
# 4: 4 0.5240518 0.7481787 0.07966677 -1.36408357 1.72636849 -0.45827205
# 5: 5 0.3733316 -0.3689391 -0.11879819 -0.03276689 0.91824437 2.18084692
# 6: 6 0.2363018 -0.2358572 0.73389984 -1.10946940 -1.05379502 -0.82691626
# 7: 7 -1.4979165 0.9026397 0.84666801 1.02138768 -0.01072588 0.08925716
# 8: 8 0.3428946 -0.2235349 -1.21684977 0.40549497 0.68937085 -0.15793111
# 9: 9 -1.1304688 -0.3901419 -0.10722222 -0.54206830 0.34134397 0.48504564
#10: 10 -0.5275251 -1.1328937 -0.68059800 1.38790593 0.93199593 -1.77498807
Using reshape we could do
# setDF(df) # in case df is a data.table now
reshape(df, idvar = "id", timevar = "time", direction = "wide")
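With the default separator this produces columns named x.1, y.1, x.2, and so on; reshape's sep argument controls this if you want names matching the output above:
reshape(df, idvar = "id", timevar = "time", direction = "wide", sep = "time")
# columns: id, xtime1, ytime1, xtime2, ytime2, xtime3, ytime3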

Your input data frame is not tidy. You can use gather to make it so, then spread:
gather(df, key, value, -id, -time) %>%
  mutate(key = paste0(key, "time", time)) %>%
  select(-time) %>%
  spread(key, value)
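Note that gather() and spread() are superseded; the same idea written with the current pivot_* verbs might look like this (a sketch, not from the original answer):
df %>%
  pivot_longer(c(x, y)) %>%
  mutate(name = paste0(name, "time", time)) %>%
  select(-time) %>%
  pivot_wider()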


Regex to catch similar matching word until it hits a number

I have this df:
data1 <- structure(list(attr = c("kind1", "kind2", "kind3", "price1",
"price2", "packing1", "weight1", "weight2", "calorie1"), coef = c(-1.08908045977012,
-0.732758620689656, -0.922413793103449, -0.570881226053641, 0.118773946360153,
-0.0287356321839081, -0.168582375478927, 0.173371647509578, -0.646551724137931
), pval = c(0.0000000461586619475345, 0.000225855110699109, 0.00000354973103147522,
0.000189625500287816, 0.506777189443937, 0.801713589134903, 0.269271977099465,
0.33257496253009, 0.0000000192904668116847)), row.names = c(NA,
-9L), class = "data.frame")
# attr coef pval
#1 kind1 -1.08908046 0.00000004615866
#2 kind2 -0.73275862 0.00022585511070
#3 kind3 -0.92241379 0.00000354973103
#4 price1 -0.57088123 0.00018962550029
#5 price2 0.11877395 0.50677718944394
#6 packing1 -0.02873563 0.80171358913490
#7 weight1 -0.16858238 0.26927197709946
#8 weight2 0.17337165 0.33257496253009
#9 calorie1 -0.64655172 0.00000001929047
I'm trying to add group totals according to a regex that identifies the shared prefix of similar words, in this case everything up to where a number appears.
For example, in the case of my variables, there would be 5 groups:
kind
Total = kind sum
price
Total = price sum
packing
Total = packing sum
weight
Total = weight sum
calorie
Total = calorie sum
I wrote the code below, but I don't know how to create the regex or where to put it. I tried stringr but couldn't get what I want:
data1 %>%
  dplyr::arrange(attr) %>%
  split(f = .[, "attr"]) %>%
  purrr::map_df(janitor::adorn_totals)
# attr coef pval
# calorie1 -0.64655172 0.00000001929047
# Total -0.64655172 0.00000001929047
# kind1 -1.08908046 0.00000004615866
# Total -1.08908046 0.00000004615866
# kind2 -0.73275862 0.00022585511070
# Total -0.73275862 0.00022585511070
# kind3 -0.92241379 0.00000354973103
# Total -0.92241379 0.00000354973103
# packing1 -0.02873563 0.80171358913490
# Total -0.02873563 0.80171358913490
# price1 -0.57088123 0.00018962550029
# Total -0.57088123 0.00018962550029
# price2 0.11877395 0.50677718944394
# Total 0.11877395 0.50677718944394
# weight1 -0.16858238 0.26927197709946
# Total -0.16858238 0.26927197709946
# weight2 0.17337165 0.33257496253009
# Total 0.17337165 0.33257496253009
This sums individual rows separately, because the groups still differ by their trailing number. I need a regex that captures this:
kind
price
packing
weight
calorie
That is, it should capture the letters up to the point where a number appears.
You can create a grouping variable by removing the digits from the attr variable, and then use group_modify:
library(dplyr)
library(stringr)
data1 %>%
  group_by(grp = str_remove_all(attr, "[0-9]")) %>%
  group_modify(~ janitor::adorn_totals(.x, where = "row")) %>%
  ungroup() %>%
  select(-grp)
# # A tibble: 14 × 3
# attr coef pval
# <chr> <dbl> <dbl>
# 1 calorie1 -0.647 0.0000000193
# 2 Total -0.647 0.0000000193
# 3 kind1 -1.09 0.0000000462
# 4 kind2 -0.733 0.000226
# 5 kind3 -0.922 0.00000355
# 6 Total -2.74 0.000229
# 7 packing1 -0.0287 0.802
# 8 Total -0.0287 0.802
# 9 price1 -0.571 0.000190
# 10 price2 0.119 0.507
# 11 Total -0.452 0.507
# 12 weight1 -0.169 0.269
# 13 weight2 0.173 0.333
# 14 Total 0.00479 0.602
Something like this: we could use group_split() after extracting the leading word that identifies the group. That gives a list over which we can iterate with map_dfr(), applying adorn_totals():
library(tidyverse)
library(janitor)
data1 %>%
  group_split(id = str_extract(attr, '[A-Za-z]+')) %>%
  map_dfr(adorn_totals) %>%
  select(-id) %>%
  as_tibble()
attr coef pval
<chr> <dbl> <dbl>
1 calorie1 -0.647 0.0000000193
2 Total -0.647 0.0000000193
3 kind1 -1.09 0.0000000462
4 kind2 -0.733 0.000226
5 kind3 -0.922 0.00000355
6 Total -2.74 0.000229
7 packing1 -0.0287 0.802
8 Total -0.0287 0.802
9 price1 -0.571 0.000190
10 price2 0.119 0.507
11 Total -0.452 0.507
12 weight1 -0.169 0.269
13 weight2 0.173 0.333
14 Total 0.00479 0.602
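If only the per-group totals are needed, rather than Total rows interleaved with the data, a base R cross-check is short (sub() strips the trailing digits, as in the answers above):
grp <- sub("[0-9]+$", "", data1$attr)
rowsum(data1[c("coef", "pval")], grp)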

Stack dataframe columns with two distinct suffixes into two columns, preferably using tidyverse [duplicate]

Suppose I have a list of data frames, mylist, and I want to apply the same operation to each data frame.
Say my data frames look like this:
set.seed(1)
test.tbl <- tibble(
  case1_diff = rnorm(10, 0),
  case1_avg = rnorm(10, 0),
  case2_diff = rnorm(10, 0),
  case2_avg = rnorm(10, 0),
  case3_diff = rnorm(10, 0),
  case3_avg = rnorm(10, 0),
  case4_diff = rnorm(10, 0),
  case4_avg = rnorm(10, 0)
)
> head(test.tbl)
# A tibble: 6 x 8
case1_diff case1_avg case2_diff case2_avg case3_diff case3_avg case4_diff case4_avg
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.626 1.51 0.919 1.36 -0.165 0.398 2.40 0.476
2 0.184 0.390 0.782 -0.103 -0.253 -0.612 -0.0392 -0.710
3 -0.836 -0.621 0.0746 0.388 0.697 0.341 0.690 0.611
4 1.60 -2.21 -1.99 -0.0538 0.557 -1.13 0.0280 -0.934
5 0.330 1.12 0.620 -1.38 -0.689 1.43 -0.743 -1.25
6 -0.820 -0.0449 -0.0561 -0.415 -0.707 1.98 0.189 0.291
and I wish to stack them into two columns, diff and avg, as a 40 x 2 data frame.
Normally, I would separate it into two objects with select(ends_with("diff")) and select(ends_with("avg")), pivot each one, then bind_rows them.
However, since my original object is a list, I want to do it using map, like:
mylist %>%
  map(*insertfunction1*) %>%
  map(*insertfunction2*)
meaning I need to do this without separating, and I also need to make sure that diff and avg are correctly paired.
What I have tried so far is
test.tbl %>%
  pivot_longer(cols = everything(),
               names_to = "metric") %>%
  mutate(metric = str_remove(metric, "[0-9]+")) %>%
  pivot_wider(id_cols = metric,
              values_from = value)
We don't need both pivot_longer and pivot_wider. It can be done within pivot_longer itself by specifying the names_to and names_sep arguments:
library(dplyr)
library(tidyr)
test.tbl %>%
  pivot_longer(cols = everything(), names_to = c('grp', '.value'),
               names_sep = "_") %>%
  select(-grp)
-output
# A tibble: 40 x 2
# diff avg
# <dbl> <dbl>
# 1 -0.626 1.51
# 2 0.919 1.36
# 3 -0.165 0.398
# 4 2.40 0.476
# 5 0.184 0.390
# 6 0.782 -0.103
# 7 -0.253 -0.612
# 8 -0.0392 -0.710
# 9 -0.836 -0.621
#10 0.0746 0.388
# … with 30 more rows
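Since the original object is a list of such data frames, the same reshape can simply be mapped over every element — a minimal sketch, assuming mylist is a list of tibbles with this column layout:
library(purrr)
mylist %>%
  map(~ pivot_longer(.x, cols = everything(),
                     names_to = c('grp', '.value'),
                     names_sep = "_") %>%
        select(-grp))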

row bind list columns using dplyr

I would like to find a better way to bind together the results of any number of regressions after adding an identifier for each model. The code below is my current solution, but it is too manual for a large number of regressions. This is part of a larger tidy workflow, so a solution inside the tidyverse is preferred, but whatever works is fine. Thanks!
library(tidyverse)
library(broom)
model_dat = mtcars %>%
  do(lm_1 = tidy(lm(disp ~ wt * vs, data = .), conf.int = TRUE),
     lm_2 = tidy(lm(cyl ~ wt * vs, data = .), conf.int = TRUE),
     lm_3 = tidy(lm(mpg ~ wt * vs, data = .), conf.int = TRUE))
df = model_dat %>%
  select(lm_1) %>%
  unnest(c(lm_1)) %>%
  mutate(model = "one") %>%
  select(model, term, estimate, p.value:conf.high) %>%
  bind_rows(
    model_dat %>%
      select(lm_2) %>%
      unnest(c(lm_2)) %>%
      mutate(model = "two") %>%
      select(model, term, estimate, p.value:conf.high)) %>%
  bind_rows(
    model_dat %>%
      select(lm_3) %>%
      unnest(c(lm_3)) %>%
      mutate(model = "three") %>%
      select(model, term, estimate, p.value:conf.high))
It may be easier with map2, i.e. loop across the columns and the corresponding English word for the sequence of columns, pluck the list element, create the 'model' column from the second argument (.y, the English words), select the columns of interest, and create a single dataset by using the _dfr suffix of map:
library(purrr)
library(english)
library(dplyr)
library(broom)
map2_dfr(model_dat, as.character(english(seq_along(model_dat))),
         ~ .x %>%
             pluck(1) %>%
             mutate(model = .y) %>%
             select(model, term, estimate, p.value:conf.high))
-output
# A tibble: 12 x 6
# model term estimate p.value conf.low conf.high
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 one (Intercept) -70.0 1.55e- 1 -168. 28.2
# 2 one wt 102. 8.20e- 9 76.4 128.
# 3 one vs 31.2 6.54e- 1 -110. 172.
# 4 one wt:vs -36.7 1.10e- 1 -82.2 8.82
# 5 two (Intercept) 4.31 1.28e- 5 2.64 5.99
# 6 two wt 0.849 4.90e- 4 0.408 1.29
# 7 two vs -2.19 7.28e- 2 -4.59 0.216
# 8 two wt:vs 0.0869 8.20e- 1 -0.689 0.862
# 9 three (Intercept) 29.5 6.55e-12 24.2 34.9
#10 three wt -3.50 2.33e- 5 -4.92 -2.08
#11 three vs 11.8 4.10e- 3 4.06 19.5
#12 three wt:vs -2.91 2.36e- 2 -5.40 -0.419
Or use summarise with across, unclass the result, and then bind with bind_rows:
model_dat %>%
  summarise(across(everything(), ~ {
    # // get the column name
    nm1 <- cur_column()
    # // extract the list element (.[[1]])
    list(.[[1]] %>%
           # // create new column by extracting the numeric part
           mutate(model = english(readr::parse_number(nm1))) %>%
           # // select the subset of columns, wrap in a list
           select(model, term, estimate, p.value:conf.high))
  })) %>%
  # // unclass to list
  unclass %>%
  # // bind the list elements
  bind_rows
-output
# A tibble: 12 x 6
# model term estimate p.value conf.low conf.high
# <english> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 one (Intercept) -70.0 1.55e- 1 -168. 28.2
# 2 one wt 102. 8.20e- 9 76.4 128.
# 3 one vs 31.2 6.54e- 1 -110. 172.
# 4 one wt:vs -36.7 1.10e- 1 -82.2 8.82
# 5 two (Intercept) 4.31 1.28e- 5 2.64 5.99
# 6 two wt 0.849 4.90e- 4 0.408 1.29
# 7 two vs -2.19 7.28e- 2 -4.59 0.216
# 8 two wt:vs 0.0869 8.20e- 1 -0.689 0.862
# 9 three (Intercept) 29.5 6.55e-12 24.2 34.9
#10 three wt -3.50 2.33e- 5 -4.92 -2.08
#11 three vs 11.8 4.10e- 3 4.06 19.5
#12 three wt:vs -2.91 2.36e- 2 -5.40 -0.419
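For completeness, the model-fitting step itself can be driven from a named vector of responses, which avoids the manual do() block entirely — a sketch (not from the answers above) reproducing the same three mtcars models:
library(purrr)
library(dplyr)
library(broom)
responses <- c(one = "disp", two = "cyl", three = "mpg")
map_dfr(responses,
        ~ tidy(lm(reformulate("wt * vs", response = .x), data = mtcars),
               conf.int = TRUE),
        .id = "model") %>%
  select(model, term, estimate, p.value:conf.high)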

Create new column based on regular expression match

Problem
I would like to create a new column for the relative standard deviation using the following formula: stdev * 100 / abs(mean). I have over 40 variables, each with their own stdev and mean (so 80 columns). What I would like to do is use regular expressions to calculate the relative standard deviation from the two columns (stdev and mean) based on their shared prefix. For example, for the columns AceticAcid.stdev and AceticAcid.mean, calculate the relative standard deviation to automatically create a new column AceticAcid.rsd, the equation being: AceticAcid.stdev * 100 / abs(AceticAcid.mean).
Example Dataframe
print(df)
AceticAcid.mean AceticAcid.stdev Glucose.mean Glucose.stdev Propanol.mean Propanol.stdev
1 28.75775 0.911130 48.27333 4.4991249 144.4770 38.34122
2 78.83051 10.562110 28.13337 1.2304387 134.6402 31.76264
3 40.89769 17.848381 37.10283 0.2102977 132.0253 33.76568
4 88.30174 11.028700 32.90534 1.6396036 149.7135 21.56639
5 94.04673 9.132295 14.11699 4.7725182 132.7853 15.88455
Desired Output (Don't care about the order of the new columns)
print(df_rsd)
AceticAcid.mean AceticAcid.stdev Glucose.mean Glucose.stdev Propanol.mean Propanol.stdev AceticAcid.rsd Glucose.rsd Propanol.rsd
1 28.75775 0.911130 48.27333 4.4991249 144.4770 38.34122 3.168294 9.3201039 26.53795
2 78.83051 10.562110 28.13337 1.2304387 134.6402 31.76264 13.398504 4.3735921 23.59076
3 40.89769 17.848381 37.10283 0.2102977 132.0253 33.76568 43.641536 0.5667969 25.57515
4 88.30174 11.028700 32.90534 1.6396036 149.7135 21.56639 12.489788 4.9827894 14.40511
5 94.04673 9.132295 14.11699 4.7725182 132.7853 15.88455 9.710380 33.8069175 11.96258
Repetitive Attempt...
I do not want to write these out 40 times (there has to be a nice regex way to achieve this):
df_rsd <- df %>%
  mutate(AceticAcid.rsd = AceticAcid.stdev * 100 / abs(AceticAcid.mean),
         Glucose.rsd = Glucose.stdev * 100 / abs(Glucose.mean),
         Propanol.rsd = Propanol.stdev * 100 / abs(Propanol.mean))
Reproducible Data
structure(list(AceticAcid.mean = c(28.7577520124614, 78.8305135443807,
40.89769218117, 88.3017404004931, 94.0467284293845), AceticAcid.stdev = c(0.911129987798631,
10.5621097609401, 17.8483808878809, 11.0287002893165, 9.13229470606893
), Glucose.mean = c(48.2733338139951, 28.1333662476391, 37.1028254181147,
32.9053360782564, 14.1169873066247), Glucose.stdev = c(4.49912485200912,
1.2304386717733, 0.210297667654231, 1.63960359641351, 4.77251824573614
), Propanol.mean = c(144.476965803187, 134.64017030783, 132.025340688415,
149.713488831185, 132.785289955791), Propanol.stdev = c(38.3412187267095,
31.7626409884542, 33.7656808178872, 21.5663894917816, 15.884545892477
)), class = "data.frame", row.names = c(NA, -5L))
We can use split.default to split the dataset into a list of data frames, one per prefix (obtained by removing the suffix part of the column names), then loop over the list with lapply, do the calculation, and assign the results to new columns in 'df':
out <- lapply(split.default(df, sub("\\..*", "", names(df))),
              function(x) x[[2]] * 100 / abs(x[[1]]))
df[paste0(names(out), ".rsd")] <- out
df
# AceticAcid.mean AceticAcid.stdev Glucose.mean Glucose.stdev Propanol.mean Propanol.stdev AceticAcid.rsd Glucose.rsd Propanol.rsd
#1 28.75775 0.911130 48.27333 4.4991249 144.4770 38.34122 3.168294 9.3201039 26.53795
#2 78.83051 10.562110 28.13337 1.2304387 134.6402 31.76264 13.398504 4.3735921 23.59076
#3 40.89769 17.848381 37.10283 0.2102977 132.0253 33.76568 43.641536 0.5667969 25.57515
#4 88.30174 11.028700 32.90534 1.6396036 149.7135 21.56639 12.489788 4.9827894 14.40511
#5 94.04673 9.132295 14.11699 4.7725182 132.7853 15.88455 9.710380 33.8069175 11.96258
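Because the .mean/.stdev pairs line up by prefix, plain data frame arithmetic on the two column blocks also works, without splitting at all — a base R sketch, assuming every .stdev column has a matching .mean column:
stdev_cols <- grep("\\.stdev$", names(df), value = TRUE)
mean_cols <- sub("stdev$", "mean", stdev_cols)
rsd <- df[stdev_cols] * 100 / abs(df[mean_cols])
names(rsd) <- sub("stdev$", "rsd", stdev_cols)
df_rsd <- cbind(df, rsd)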
Or with tidyverse
library(purrr)
library(dplyr)
library(stringr)
df %>%
  split.default(str_remove(names(.), "\\..*")) %>%
  map_dfc(~ .x[[2]] * 100 / abs(.x[[1]])) %>%
  rename_all(~ str_c(., '.rsd')) %>%
  bind_cols(df, .)
An alternative, also with the tidyverse:
library(tidyverse)
df_long <- df %>%
  mutate(measurement_number = row_number(), .before = 1) %>%
  pivot_longer(cols = -measurement_number, names_to = "var", values_to = "value") %>%
  separate(var, into = c("var", "indicator")) %>%
  pivot_wider(id_cols = c("measurement_number", "var"),
              names_from = indicator, values_from = value) %>%
  mutate(rsd = stdev * 100 / abs(mean)) %>%
  arrange(var, measurement_number)
df_long
#> # A tibble: 15 x 5
#> measurement_number var mean stdev rsd
#> <int> <chr> <dbl> <dbl> <dbl>
#> 1 1 AceticAcid 28.8 0.911 3.17
#> 2 2 AceticAcid 78.8 10.6 13.4
#> 3 3 AceticAcid 40.9 17.8 43.6
#> 4 4 AceticAcid 88.3 11.0 12.5
#> 5 5 AceticAcid 94.0 9.13 9.71
#> 6 1 Glucose 48.3 4.50 9.32
#> 7 2 Glucose 28.1 1.23 4.37
#> 8 3 Glucose 37.1 0.210 0.567
#> 9 4 Glucose 32.9 1.64 4.98
#> 10 5 Glucose 14.1 4.77 33.8
#> 11 1 Propanol 144. 38.3 26.5
#> 12 2 Propanol 135. 31.8 23.6
#> 13 3 Propanol 132. 33.8 25.6
#> 14 4 Propanol 150. 21.6 14.4
#> 15 5 Propanol 133. 15.9 12.0
df_wide <- df_long %>%
  pivot_wider(id_cols = c("measurement_number"),
              names_from = c(var),
              values_from = c(mean, stdev, rsd),
              names_sep = ".")
df_wide
#> # A tibble: 5 x 10
#> measurement_num~ mean.AceticAcid mean.Glucose mean.Propanol stdev.AceticAcid
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 28.8 48.3 144. 0.911
#> 2 2 78.8 28.1 135. 10.6
#> 3 3 40.9 37.1 132. 17.8
#> 4 4 88.3 32.9 150. 11.0
#> 5 5 94.0 14.1 133. 9.13
#> # ... with 5 more variables: stdev.Glucose <dbl>, stdev.Propanol <dbl>,
#> # rsd.AceticAcid <dbl>, rsd.Glucose <dbl>, rsd.Propanol <dbl>
Created on 2020-05-26 by the reprex package (v0.3.0)

How to divide dataset into balanced sets based on multiple variables

I have a large dataset I need to divide into multiple balanced sets.
The set looks something like the following:
> data<-matrix(runif(4000, min=0, max=10), nrow=500, ncol=8 )
> colnames(data)<-c("A","B","C","D","E","F","G","H")
The sets, each containing for example 20 rows, need to be balanced across multiple variables, so that each subset ends up with means of B, C, and D similar to those of all the other subsets.
Is there a way to do that with R? Any advice would be much appreciated. Thank you in advance!
library(tidyverse)
# Reproducible data
set.seed(2)
data <- matrix(runif(4000, min = 0, max = 10), nrow = 500, ncol = 8)
colnames(data) <- c("A", "B", "C", "D", "E", "F", "G", "H")
data <- as.data.frame(data)
Updated Answer
It's probably not possible to get similar means across sets within each column if you want to keep the observations from a given row together. With 8 columns (as in your sample data), you'd need 25 twenty-row sets where each column-A set has the same mean, each column-B set has the same mean, and so on. That's a lot of constraints. There are probably algorithms, however, that could find the set-membership assignment that minimizes the differences in set means.
However, if you can separately take 20 observations from each column without regard to which row it came from, then here's one option:
# Group into sets with same means
same_means = data %>%
  gather(key, value) %>%
  arrange(value) %>%
  group_by(key) %>%
  # deal the sorted values out cyclically to sets 1..25 for the lower half
  # and 25..1 for the upper half, so each set gets a similar value spread
  mutate(set = c(rep(1:25, 10), rep(25:1, 10)))
# Check means by set for each column
same_means %>%
group_by(key, set) %>%
summarise(mean=mean(value)) %>%
spread(key, mean) %>% as.data.frame
set A B C D E F G H
1 1 4.940018 5.018584 5.117592 4.931069 5.016401 5.171896 4.886093 5.047926
2 2 4.946496 5.018578 5.124084 4.936461 5.017041 5.172817 4.887383 5.048850
3 3 4.947443 5.021511 5.125649 4.929010 5.015181 5.173983 4.880492 5.044192
4 4 4.948340 5.014958 5.126480 4.922940 5.007478 5.175898 4.878876 5.042789
5 5 4.943010 5.018506 5.123188 4.924283 5.019847 5.174981 4.869466 5.046532
6 6 4.942808 5.019945 5.123633 4.924036 5.019279 5.186053 4.870271 5.044757
7 7 4.945312 5.022991 5.120904 4.919835 5.019173 5.187910 4.869666 5.041317
8 8 4.947457 5.024992 5.125821 4.915033 5.016782 5.187996 4.867533 5.043262
9 9 4.936680 5.020040 5.128815 4.917770 5.022527 5.180950 4.864416 5.043587
10 10 4.943435 5.022840 5.122607 4.921102 5.018274 5.183719 4.872688 5.036263
11 11 4.942015 5.024077 5.121594 4.921965 5.015766 5.185075 4.880304 5.045362
12 12 4.944416 5.024906 5.119663 4.925396 5.023136 5.183449 4.887840 5.044733
13 13 4.946751 5.020960 5.127302 4.923513 5.014100 5.186527 4.889140 5.048425
14 14 4.949517 5.011549 5.127794 4.925720 5.006624 5.188227 4.882128 5.055608
15 15 4.943008 5.013135 5.130486 4.930377 5.002825 5.194421 4.884593 5.051968
16 16 4.939554 5.021875 5.129392 4.930384 5.005527 5.197746 4.883358 5.052474
17 17 4.935909 5.019139 5.131258 4.922536 5.003273 5.204442 4.884018 5.059162
18 18 4.935830 5.022633 5.129389 4.927106 5.008391 5.210277 4.877859 5.054829
19 19 4.936171 5.025452 5.127276 4.927904 5.007995 5.206972 4.873620 5.054192
20 20 4.942925 5.018719 5.127394 4.929643 5.005699 5.202787 4.869454 5.055665
21 21 4.941351 5.014454 5.125727 4.932884 5.008633 5.205170 4.870352 5.047728
22 22 4.933846 5.019311 5.130156 4.923804 5.012874 5.213346 4.874263 5.056290
23 23 4.928815 5.021575 5.139077 4.923665 5.017180 5.211699 4.876333 5.056836
24 24 4.928739 5.024419 5.140386 4.925559 5.012995 5.214019 4.880025 5.055182
25 25 4.929357 5.025198 5.134391 4.930061 5.008571 5.217005 4.885442 5.062630
Original Answer
# Randomly group data into 20-row groups
set.seed(104)
data = data %>%
  mutate(set = sample(rep(1:(500/20), each = 20)))
head(data)
A B C D E F G H set
1 1.848823 6.920055 3.2283369 6.633721 6.794640 2.0288792 1.984295 2.09812642 10
2 7.023740 5.599569 0.4468325 5.198884 6.572196 0.9269249 9.700118 4.58840437 20
3 5.733263 3.426912 7.3168797 3.317611 8.301268 1.4466065 5.280740 0.09172101 19
4 1.680519 2.344975 4.9242313 6.163171 4.651894 2.2253335 1.175535 2.51299726 25
5 9.438393 4.296028 2.3563249 5.814513 1.717668 0.8130327 9.430833 0.68269106 19
6 9.434750 7.367007 1.2603451 5.952936 3.337172 5.2892300 5.139007 6.52763327 5
# Mean by set for each column
data %>%
  group_by(set) %>%
  summarise_all(mean)
set A B C D E F G H
1 1 5.240236 6.143941 4.638874 5.367626 4.982008 4.200123 5.521844 5.083868
2 2 5.520983 5.257147 5.209941 4.504766 4.231175 3.642897 5.578811 6.439491
3 3 5.943011 3.556500 5.366094 4.583440 4.932206 4.725007 5.579103 5.420547
4 4 4.729387 4.755320 5.582982 4.763171 5.217154 5.224971 4.972047 3.892672
5 5 4.824812 4.527623 5.055745 4.556010 4.816255 4.426381 3.520427 6.398151
6 6 4.957994 7.517130 6.727288 4.757732 4.575019 6.220071 5.219651 5.130648
7 7 5.344701 4.650095 5.736826 5.161822 5.208502 5.645190 4.266679 4.243660
8 8 4.003065 4.578335 5.797876 4.968013 5.130712 6.192811 4.282839 5.669198
9 9 4.766465 4.395451 5.485031 4.577186 5.366829 5.653012 4.550389 4.367806
10 10 4.695404 5.295599 5.123817 5.358232 5.439788 5.643931 5.127332 5.089670
# ... with 15 more rows
If the total number of rows in the data frame is not divisible by the number of rows you want in each set, you can do the following when creating the sets:
data = data %>%
  mutate(set = sample(rep(1:ceiling(500/20), each = 20))[1:n()])
In this case, the set sizes will vary a bit when the number of data rows is not divisible by the desired number of rows per set.
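A quick check of the resulting set sizes — a sketch with a hypothetical 510 rows, where the ten dropped labels land on random sets, so a few sets come out slightly smaller than 20:
set.seed(1)
table(sample(rep(1:ceiling(510/20), each = 20))[1:510])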
The following approach could be worth trying for someone in a similar position.
It is based on the numerical balancing in groupdata2's fold() function, which allows creating groups with balanced means for a single column. By standardizing each of the columns and numerically balancing their rowwise sum, we might increase the chance of getting balanced means in the individual columns.
I compared this approach to creating groups randomly a few times and selecting the split with the least variance in means. It seems to be a bit better, but I'm not too convinced that this will hold in all contexts.
# Attach dplyr and groupdata2
library(dplyr)
library(groupdata2)
set.seed(1)
# Create the dataset
data <- matrix(runif(4000, min = 0, max = 10), nrow = 500, ncol = 8)
colnames(data) <- c("A", "B", "C", "D", "E", "F", "G", "H")
data <- dplyr::as_tibble(data)
# Standardize all columns and calculate row sums
data_std <- data %>%
  dplyr::mutate_all(.funs = function(x){(x - mean(x)) / sd(x)}) %>%
  dplyr::mutate(total = rowSums(across(where(is.numeric))))
# Create groups (new column called ".folds")
# We numerically balance the "total" column
data_std <- data_std %>%
  groupdata2::fold(k = 25, num_col = "total")  # k = 500/20 = 25
# Transfer the groups to the original (non-standardized) data frame
data$group <- data_std$.folds
# Check the means
data %>%
  dplyr::group_by(group) %>%
  dplyr::summarise_all(.funs = mean)
> # A tibble: 25 x 9
> group A B C D E F G H
> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
> 1 1 4.48 5.05 4.80 5.65 5.04 4.60 5.12 4.85
> 2 2 5.57 5.17 3.21 5.46 4.46 5.89 5.06 4.79
> 3 3 4.33 6.02 4.57 6.18 4.76 3.79 5.94 3.71
> 4 4 4.51 4.62 4.62 5.27 4.65 5.41 5.26 5.23
> 5 5 4.55 5.10 4.19 5.41 5.28 5.39 5.57 4.23
> 6 6 4.82 4.74 6.10 4.34 4.82 5.08 4.89 4.81
> 7 7 5.88 4.49 4.13 3.91 5.62 4.75 5.46 5.26
> 8 8 4.11 5.50 5.61 4.23 5.30 4.60 4.96 5.35
> 9 9 4.30 3.74 6.45 5.60 3.56 4.92 5.57 5.32
> 10 10 5.26 5.50 4.35 5.29 4.53 4.75 4.49 5.45
> # … with 15 more rows
# Check the standard deviations of the means
# Could be used to compare methods
data %>%
  dplyr::group_by(group) %>%
  dplyr::summarise_all(.funs = mean) %>%
  dplyr::summarise(across(where(is.numeric), sd))
> # A tibble: 1 x 8
> A B C D E F G H
> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
> 1 0.496 0.546 0.764 0.669 0.591 0.611 0.690 0.475
It might be best to compare the means and the variances of the means (or standard deviations, as above) of the different methods on the standardized data, though. In that case, one could calculate the sum of the variances and minimize it:
data_std %>%
  dplyr::select(-total) %>%
  dplyr::group_by(.folds) %>%
  dplyr::summarise_all(.funs = mean) %>%
  dplyr::summarise(across(where(is.numeric), sd)) %>%
  sum()
> 1.643989
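As a point of comparison, the same score can be computed for one purely random split (the baseline mentioned earlier) — a sketch:
set.seed(1)
data_std %>%
  dplyr::ungroup() %>%
  dplyr::select(A:H) %>%
  dplyr::mutate(rand_group = sample(rep(1:25, each = 20))) %>%
  dplyr::group_by(rand_group) %>%
  dplyr::summarise(across(A:H, mean)) %>%
  dplyr::summarise(across(A:H, sd)) %>%
  sum()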
Comparing multiple balanced splits
The fold() function allows creating multiple unique grouping factors (splits) at once. So here, I will perform the numerically balanced split 20 times and find the grouping with the lowest sum of the standard deviations of the means. I'll also wrap the procedure in a function.
create_multi_balanced_groups <- function(data, cols, k, num_tries){
  # Extract the variables of interest
  # We assume these are numeric but we could add a check
  data_to_balance <- data[, cols]
  # Standardize all columns
  # And calculate rowwise sums
  data_std <- data_to_balance %>%
    dplyr::mutate_all(.funs = function(x){(x - mean(x)) / sd(x)}) %>%
    dplyr::mutate(total = rowSums(across(where(is.numeric))))
  # Create `num_tries` unique numerically balanced splits
  data_std <- data_std %>%
    groupdata2::fold(
      k = k,
      num_fold_cols = num_tries,
      num_col = "total"
    )
  # The new fold column names: ".folds_1", ".folds_2", etc.
  fold_col_names <- paste0(".folds_", seq_len(num_tries))
  # Remove total column
  data_std <- data_std %>%
    dplyr::select(-total)
  # Calculate score for each split
  # This could probably be done more efficiently without a for loop
  variance_scores <- c()
  for (fcol in fold_col_names){
    score <- data_std %>%
      dplyr::group_by(!!as.name(fcol)) %>%
      dplyr::summarise(across(where(is.numeric), mean)) %>%
      dplyr::summarise(across(where(is.numeric), sd)) %>%
      sum()
    variance_scores <- append(variance_scores, score)
  }
  # Get the fold column with the lowest score
  lowest_fcol_index <- which.min(variance_scores)
  best_fcol <- fold_col_names[[lowest_fcol_index]]
  # Add the best fold column / grouping factor to the original data
  data[["group"]] <- data_std[[best_fcol]]
  # Return the original data and the score of the best fold column
  list(data, min(variance_scores))
}
# Run with 20 splits
set.seed(1)
data_grouped_and_score <- create_multi_balanced_groups(
  data = data,
  cols = c("A", "B", "C", "D", "E", "F", "G", "H"),
  k = 25,
  num_tries = 20
)
# Check data
data_grouped_and_score[[1]]
> # A tibble: 500 x 9
> A B C D E F G H group
> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
> 1 5.86 6.54 0.500 2.88 5.70 9.67 2.29 3.01 2
> 2 0.0895 4.69 5.71 0.343 8.95 7.73 5.76 9.58 1
> 3 2.94 1.78 2.06 6.66 9.54 0.600 4.26 0.771 16
> 4 2.77 1.52 0.723 8.11 8.95 1.37 6.32 6.24 7
> 5 8.14 2.49 0.467 8.51 0.889 6.28 4.47 8.63 13
> 6 2.60 8.23 9.17 5.14 2.85 8.54 8.94 0.619 23
> 7 7.24 0.260 6.64 8.35 8.59 0.0862 1.73 8.10 5
> 8 9.06 1.11 6.01 5.35 2.01 9.37 7.47 1.01 1
> 9 9.49 5.48 3.64 1.94 3.24 2.49 3.63 5.52 7
> 10 0.731 0.230 5.29 8.43 5.40 8.50 3.46 1.23 10
> # … with 490 more rows
# Check score
data_grouped_and_score[[2]]
> 1.552656
By commenting out the num_col = "total" line, we can run this without the numerical balancing. For me, this gave a score of 1.615257.
Disclaimer: I am the author of the groupdata2 package. The fold() function can also balance a categorical column (cat_col) and keep all data points with the same ID in the same fold (id_col) (e.g. to avoid leakage in cross-validation). There's a very similar partition() function as well.
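For reference, a hedged sketch of those extra arguments (the data frame and column names here are hypothetical):
# keep the class distribution of 'diagnosis' similar across folds and
# keep all rows sharing a 'patient_id' in the same fold
folded <- groupdata2::fold(patient_data, k = 5,
                           cat_col = "diagnosis", id_col = "patient_id")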
