95% winsorization by groups over multiple variables in R

In my real data, I have multiple outliers across multiple variables. My data looks something like the example below, but the numbers here are completely random. I would like to pull in all data points that lie more than 2 SD from the mean by applying a 95% winsorization.
df <- read.csv(header=TRUE, text="
id, group, test1, test2
1, 0, 57, 82
2, 0, 77, 80
3, 0, 67, 90
4, 0, 15, 70
5, 0, 58, 72
6, 1, 18, 44
7, 1, 44, 44
8, 1, 18, 46
9, 1, 20, 44
10, 1, 14, 38")
I am aware of the winsorize() function in the robustHD package, but am not sure how to make the winsorization account for the 2 different groups, or how to include multiple variables in it.
I have tried this code to fix the problem, but it is not complete:
library(robustHD)
library(dplyr)
new.df.wins <- df %>%
  group_by(group) %>%
  mutate(measure_winsorized = winsorize(c(test1, test2)))
An error is returned, because c(test1, test2) concatenates both columns into a single vector twice the length of each group:
Error: Column `measure_winsorized` must be length 45 (the group size) or one, not 90
I am open to other ideas too. Thanks!

Consider creating a new field for each numeric field to be winsorized:
new.df.wins <- df %>%
  group_by(group) %>%
  mutate(measure_winsorized_test1 = winsorize(test1),
         measure_winsorized_test2 = winsorize(test2))
Alternatively with base R's ave:
new.df.wins <- within(df, {
  measure_winsorized_test2 <- ave(test2, group, FUN=winsorize)
  measure_winsorized_test1 <- ave(test1, group, FUN=winsorize)
})
Should you want to winsorize both simultaneously, assign to two new columns at once:
# TIDYVERSE (dplyr)
new.df.wins <- df %>%
  group_by(group) %>%
  mutate_at(.funs = list(wins = winsorize), .vars = vars(test1:test2))
# TINYVERSE (I.E. BASE R)
df[c("test1_wins", "test2_wins")] <- with(df, ave(cbind(test1, test2),
                                                  group, FUN=winsorize))
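In current dplyr (1.0+), mutate_at is superseded by across(). A minimal sketch of the same grouped winsorization, assuming robustHD's winsorize() and the df above:
library(robustHD)
library(dplyr)
new.df.wins <- df %>%
  group_by(group) %>%
  # winsorize test1 and test2 within each group, adding *_wins columns
  mutate(across(test1:test2, winsorize, .names = "{.col}_wins")) %>%
  ungroup()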

You can make a version of winsorize() that works on data frames and use it with by():
# Example data
set.seed(1)
df2 <- round(matrix(rt(100, 4), 20), 3)
df2 <- data.frame(id=seq_len(nrow(df2)),
                  group=sort(rep(1:2, length=nrow(df2))),
                  test=df2)
df2[c(1:3, 11:13),]
# id group test.1 test.2 test.3 test.4 test.5
# 1 1 1 -0.673 -1.227 0.015 -0.831 0.024
# 2 2 1 -0.584 1.059 1.492 0.833 -0.377
# 3 3 1 0.572 0.613 -1.924 -0.672 1.184
# 11 11 2 0.054 0.020 2.241 -0.103 -0.047
# 12 12 2 1.746 -0.788 -0.268 -1.921 4.577
# 13 13 2 -0.472 -1.294 -0.258 0.795 -1.110
# data frame version of winsorize
winsorizedf <- function(x, ...) {
  do.call(cbind, lapply(x, winsorize, ...))
}
# winsorize every column, except the two first ones, grouped by df2$group
w <- do.call(rbind,
             by(df2[, -(1:2)], df2$group, winsorizedf))
# combine the winsorized columns with the original id and group columns
dfw <- data.frame(df2[, 1:2], round(w, 2))
dfw[c(1:3, 11:13),]
# id group test.1 test.2 test.3 test.4 test.5
# 1 1 1 -0.63 -1.23 0.02 -0.83 0.02
# 2 2 1 -0.58 1.06 1.49 0.26 -0.38
# 3 3 1 0.57 0.61 -1.60 -0.67 1.18
# 11 11 2 0.05 0.02 1.23 -0.10 -0.05
# 12 12 2 1.70 -0.79 -0.27 -1.92 4.58
# 13 13 2 -0.47 -1.07 -0.26 0.80 -1.11

Related

Apply two different formulas on four data frame columns

I want to apply two different formulas to four columns of my dataframe df. I have done this manually, but since my original data frame has several columns, I want to be able to use loops or case_when() to do this faster.
Here's what the sample dataframe df looks like:
A B C D
20 100 4 1200
40 150 6 2300
34 200 3 1230
32 225 9 1100
12 220 10 1000
Formula 1:
(x-min(x))/(max(x)-min(x))
Formula 2:
(min(x)-x)/(max(x)-min(x))
I'd like to apply formula 1 on columns B and D and formula 2 on columns A and C.
After applying the formula, I want to store the values in a different dataframe but with the same column names.
Here's what I did:
formula_1 <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
formula_2 <- function(x) {
  (min(x) - x) / (max(x) - min(x))
}
Create an empty dataframe BI_score
BI_score$B <- formula_1(df$B)
BI_score$D <- formula_1(df$D)
BI_score$A <- formula_2(df$A)
BI_score$C <- formula_2(df$C)
EDIT
If there are NA and Inf values that we want to exclude from the calculation, we can handle them by updating the functions as below and then applying them to the columns as shown previously.
formula_1 <- function(x) {
  temp <- x[is.finite(x)]
  replace(x, is.finite(x), (temp - min(temp)) / (max(temp) - min(temp)))
}
formula_2 <- function(x) {
  temp <- x[is.finite(x)]
  replace(x, is.finite(x), (min(temp) - temp) / (max(temp) - min(temp)))
}
The most straightforward approach is to use lapply to apply each function separately to the selected columns.
BI_score <- df
fm1_cols <- c("B", "D")
fm2_cols <- c("A", "C")
BI_score[fm1_cols] <- lapply(df[fm1_cols], formula_1)
BI_score[fm2_cols] <- lapply(df[fm2_cols], formula_2)
BI_score
# A B C D
#1 -0.29 0.00 -0.14 0.154
#2 -1.00 0.40 -0.43 1.000
#3 -0.79 0.80 0.00 0.177
#4 -0.71 1.00 -0.86 0.077
#5 0.00 0.96 -1.00 0.000
As mentioned by @Sotos, if you want to apply the functions to alternating columns you could do
BI_score[c(TRUE, FALSE)] <- lapply(df[c(TRUE, FALSE)], formula_1)
BI_score[c(FALSE, TRUE)] <- lapply(df[c(FALSE, TRUE)], formula_2)
Just for fun, approach using dplyr
library(dplyr)
bind_cols(df %>% select(fm1_cols) %>% mutate_all(formula_1),
          df %>% select(fm2_cols) %>% mutate_all(formula_2))
If your goal is to apply the two functions on alternating columns, then you can do it via logical indexing
cbind.data.frame(sapply(df[c(TRUE, FALSE)], formula_2),
                 sapply(df[c(FALSE, TRUE)], formula_1))
# A C B D
#1 -0.2857143 -0.1428571 0.00 0.15384615
#2 -1.0000000 -0.4285714 0.40 1.00000000
#3 -0.7857143 0.0000000 0.80 0.17692308
#4 -0.7142857 -0.8571429 1.00 0.07692308
#5 0.0000000 -1.0000000 0.96 0.00000000
We can use mutate_at from dplyr:
library(dplyr)
df %>%
  mutate_at(vars(B, D), formula_1) %>%
  mutate_at(vars(A, C), formula_2)
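In current dplyr, mutate_at is likewise superseded by across(); a minimal sketch, assuming the same formula_1 and formula_2:
library(dplyr)
df %>%
  # apply formula_1 to B and D, formula_2 to A and C, in place
  mutate(across(c(B, D), formula_1),
         across(c(A, C), formula_2))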

Create and plot a table which preserves the ordering of the factor

When creating and plotting a table whose names are numeric values, I would like them to stay in numeric order.
Code:
library(plyr)
set.seed(1234)
# create a random vector of different categories
number_of_categories <- 11
probability_of_each_category <- c(0.1, 0.05, 0.05, 0.08, 0.01,
                                  0.1, 0.2, 0.3, 0.01, 0.02, 0.08)
number_of_samples <- 1000
x <- sample(LETTERS[1:number_of_categories],
            number_of_samples,
            replace=TRUE,
            prob=probability_of_each_category)
# just a vector of zeros and ones
outcome <- rbinom(number_of_samples, 1, 0.4)
# I want x to be 1,2,...,11 so that it demonstrates the issue when
# creating the table
x <- mapvalues(x,
               c(LETTERS[1:number_of_categories]),
               seq(1:number_of_categories))
# the table shows the ordering
prop.table(table(x))
plot(table(x, outcome))
Table:
> prop.table(table(x))
x
1 10 11 2 3 4 5 6 7 8 9
0.105 0.023 0.078 0.044 0.069 0.083 0.018 0.097 0.195 0.281 0.007
Plot: (not shown; the mosaic plot of table(x, outcome) displays the categories in the same lexicographic order as the table above)
I would like the plot and the table in the order
1 2 3 4 5 ... 10 11
Rather than
1 10 11 2 3 4 5 6 7 8 9
You can either convert x to numeric before feeding it to table:
plot(table(as.numeric(x), outcome))
Or order the table's rows by the as.numeric of their rownames:
tab <- table(x, outcome)  # named tab to avoid masking base t()
tab <- tab[order(as.numeric(rownames(tab))), ]
plot(tab)
A simple way to solve this problem is to format the numbers to include a leading zero during mapvalues(), using sprintf().
x <- mapvalues(x,
               c(LETTERS[1:number_of_categories]),
               sprintf("%02d", seq(1:number_of_categories)))
# the table shows the ordering
prop.table(table(x))
plot(table(x, outcome))
...and the output:
> prop.table(table(x))
x
01 02 03 04 05 06 07 08 09 10 11
0.104 0.067 0.038 0.073 0.019 0.112 0.191 0.291 0.011 0.019 0.075
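Another option (a sketch, not from the answers above): keep the unpadded labels but convert x to a factor with explicitly ordered levels, since table() and plot() follow the level order:
# applied to the original unpadded x ("1" ... "11")
x <- factor(x, levels = as.character(1:number_of_categories))
prop.table(table(x))
plot(table(x, outcome))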

Cutting intervals based on limits in a list

I have the following data frame with 4 numeric columns:
df <- structure(list(
  a = c(0.494129340746821, 1.0182303327812, 0.412227511922328,
        0.204436644926016, 0.707038309818134, -0.0547300783473556,
        1.02124944293185, 0.381284586356091, 0.375197843213519,
        -1.18172401075089),
  b = c(-1.34374367808722, -0.724644569211516, -0.618107980582741,
        -1.79274868750102, -3.03559838445132, -0.205726144151615,
        -0.441511286334811, 0.126660637747845, 0.353737902975931,
        -0.26601393471207),
  c = c(1.36922677098999, -1.81698348029464, -0.846111260721092,
        0.121256015837603, -1.16499681749603, 1.14145675696301,
        -0.782988942359773, 3.25142254765012, -0.132099541183856,
        -0.242831877642412),
  d = c(-0.30002630673509, -0.507496812070994, -2.59870853299723,
        -1.30109828239028, 1.05029458887117, -0.606381379180569,
        -0.928822706709913, -0.68324741261771, -1.17980245487707,
        2.20174180936794)),
  row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
I would like to create two new factor columns, in which I group columns 2 and 3 according to the values given in the list L:
ColsToChoose = c(2,3)
L = list()
L[[1]] = c(-0.3, 0.7)
L[[2]] = c(-1, 0.5, 1)
df %>% mutate_at(ColsToChoose, funs(intervals = cut(., c(-Inf, L[[.]], Inf))))
That is, I am expecting to get two new columns, the first called intervals_b indicating if the values of column b (column 2) are between -Inf and -0.3, -0.3 and 0.7 or 0.7 and Inf, and similarly for column c: -Inf to -1, -1 to 0.5, 0.5 to 1 and 1 to Inf.
I am getting an error:
Error in mutate_impl(.data, dots) :
Evaluation error: recursive indexing failed at level 2
I would like to do this for the general case, that's why I am using implicit names.
Any ideas?
You could do this with base R's mapply, passing the ColsToChoose columns of df and the list L in parallel to get the cut intervals.
df[paste0("interval", names(df)[ColsToChoose])] <-
  mapply(function(x, y) cut(x, c(-Inf, y, Inf)), df[ColsToChoose], L)
df
# a b c d intervalb intervalc
# <dbl> <dbl> <dbl> <dbl> <chr> <chr>
# 1 0.494 -1.34 1.37 -0.300 (-Inf,-0.3] (1, Inf]
# 2 1.02 -0.725 -1.82 -0.507 (-Inf,-0.3] (-Inf,-1]
# 3 0.412 -0.618 -0.846 -2.60 (-Inf,-0.3] (-1,0.5]
# 4 0.204 -1.79 0.121 -1.30 (-Inf,-0.3] (-1,0.5]
# 5 0.707 -3.04 -1.16 1.05 (-Inf,-0.3] (-Inf,-1]
# 6 -0.0547 -0.206 1.14 -0.606 (-0.3,0.7] (1, Inf]
# 7 1.02 -0.442 -0.783 -0.929 (-Inf,-0.3] (-1,0.5]
# 8 0.381 0.127 3.25 -0.683 (-0.3,0.7] (1, Inf]
# 9 0.375 0.354 -0.132 -1.18 (-0.3,0.7] (-1,0.5]
#10 -1.18 -0.266 -0.243 2.20 (-0.3,0.7] (-1,0.5]
A tidyverse approach along the same lines:
library(tidyverse)
bind_cols(df,
          map2(df[ColsToChoose], L, ~ cut(.x, c(-Inf, .y, Inf))) %>%
            data.frame() %>%
            rename_all(paste0, "_interval"))
This gives the same output as above.
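With dplyr 1.0+, the same cut can also be written with across() and cur_column(); a sketch, assuming L is first named after the target columns:
library(dplyr)
names(L) <- names(df)[ColsToChoose]
df %>%
  mutate(across(all_of(names(L)),
                ~ cut(.x, c(-Inf, L[[cur_column()]], Inf)),
                .names = "interval_{.col}"))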

How to divide dataset into balanced sets based on multiple variables

I have a large dataset I need to divide into multiple balanced sets.
The set looks something like the following:
> data<-matrix(runif(4000, min=0, max=10), nrow=500, ncol=8 )
> colnames(data)<-c("A","B","C","D","E","F","G","H")
The sets, each containing for example 20 rows, need to be balanced across multiple variables, so that each subset ends up with a similar mean of B, C, and D compared to all the other subsets.
Is there a way to do that with R? Any advice would be much appreciated. Thank you in advance!
library(tidyverse)
# Reproducible data
set.seed(2)
data <- matrix(runif(4000, min=0, max=10), nrow=500, ncol=8)
colnames(data) <- c("A","B","C","D","E","F","G","H")
data <- as.data.frame(data)
Updated Answer
It's probably not possible to get similar means across sets within each column if you want to keep the observations from a given row together. With 8 columns (as in your sample data), you'd need 25 twenty-row sets where each column-A set has the same mean, each column-B set has the same mean, and so on. That's a lot of constraints. There are probably algorithms, however, that could find the set-membership assignment that minimizes the differences in set means.
However, if you can separately take 20 observations from each column without regard to which row they came from, then here's one option:
# Group into sets with similar means: sort each column's values, then
# deal them out in the repeating pattern 1..25 for the lower half and
# 25..1 for the upper half, so each set gets a mix of low and high values
same_means = data %>%
  gather(key, value) %>%
  arrange(value) %>%
  group_by(key) %>%
  mutate(set = c(rep(1:25, 10), rep(25:1, 10)))
# Check means by set for each column
same_means %>%
  group_by(key, set) %>%
  summarise(mean = mean(value)) %>%
  spread(key, mean) %>%
  as.data.frame
set A B C D E F G H
1 1 4.940018 5.018584 5.117592 4.931069 5.016401 5.171896 4.886093 5.047926
2 2 4.946496 5.018578 5.124084 4.936461 5.017041 5.172817 4.887383 5.048850
3 3 4.947443 5.021511 5.125649 4.929010 5.015181 5.173983 4.880492 5.044192
4 4 4.948340 5.014958 5.126480 4.922940 5.007478 5.175898 4.878876 5.042789
5 5 4.943010 5.018506 5.123188 4.924283 5.019847 5.174981 4.869466 5.046532
6 6 4.942808 5.019945 5.123633 4.924036 5.019279 5.186053 4.870271 5.044757
7 7 4.945312 5.022991 5.120904 4.919835 5.019173 5.187910 4.869666 5.041317
8 8 4.947457 5.024992 5.125821 4.915033 5.016782 5.187996 4.867533 5.043262
9 9 4.936680 5.020040 5.128815 4.917770 5.022527 5.180950 4.864416 5.043587
10 10 4.943435 5.022840 5.122607 4.921102 5.018274 5.183719 4.872688 5.036263
11 11 4.942015 5.024077 5.121594 4.921965 5.015766 5.185075 4.880304 5.045362
12 12 4.944416 5.024906 5.119663 4.925396 5.023136 5.183449 4.887840 5.044733
13 13 4.946751 5.020960 5.127302 4.923513 5.014100 5.186527 4.889140 5.048425
14 14 4.949517 5.011549 5.127794 4.925720 5.006624 5.188227 4.882128 5.055608
15 15 4.943008 5.013135 5.130486 4.930377 5.002825 5.194421 4.884593 5.051968
16 16 4.939554 5.021875 5.129392 4.930384 5.005527 5.197746 4.883358 5.052474
17 17 4.935909 5.019139 5.131258 4.922536 5.003273 5.204442 4.884018 5.059162
18 18 4.935830 5.022633 5.129389 4.927106 5.008391 5.210277 4.877859 5.054829
19 19 4.936171 5.025452 5.127276 4.927904 5.007995 5.206972 4.873620 5.054192
20 20 4.942925 5.018719 5.127394 4.929643 5.005699 5.202787 4.869454 5.055665
21 21 4.941351 5.014454 5.125727 4.932884 5.008633 5.205170 4.870352 5.047728
22 22 4.933846 5.019311 5.130156 4.923804 5.012874 5.213346 4.874263 5.056290
23 23 4.928815 5.021575 5.139077 4.923665 5.017180 5.211699 4.876333 5.056836
24 24 4.928739 5.024419 5.140386 4.925559 5.012995 5.214019 4.880025 5.055182
25 25 4.929357 5.025198 5.134391 4.930061 5.008571 5.217005 4.885442 5.062630
Original Answer
# Randomly group data into 20-row groups
set.seed(104)
data = data %>%
  mutate(set = sample(rep(1:(500/20), each=20)))
head(data)
A B C D E F G H set
1 1.848823 6.920055 3.2283369 6.633721 6.794640 2.0288792 1.984295 2.09812642 10
2 7.023740 5.599569 0.4468325 5.198884 6.572196 0.9269249 9.700118 4.58840437 20
3 5.733263 3.426912 7.3168797 3.317611 8.301268 1.4466065 5.280740 0.09172101 19
4 1.680519 2.344975 4.9242313 6.163171 4.651894 2.2253335 1.175535 2.51299726 25
5 9.438393 4.296028 2.3563249 5.814513 1.717668 0.8130327 9.430833 0.68269106 19
6 9.434750 7.367007 1.2603451 5.952936 3.337172 5.2892300 5.139007 6.52763327 5
# Mean by set for each column
data %>%
  group_by(set) %>%
  summarise_all(mean)
set A B C D E F G H
1 1 5.240236 6.143941 4.638874 5.367626 4.982008 4.200123 5.521844 5.083868
2 2 5.520983 5.257147 5.209941 4.504766 4.231175 3.642897 5.578811 6.439491
3 3 5.943011 3.556500 5.366094 4.583440 4.932206 4.725007 5.579103 5.420547
4 4 4.729387 4.755320 5.582982 4.763171 5.217154 5.224971 4.972047 3.892672
5 5 4.824812 4.527623 5.055745 4.556010 4.816255 4.426381 3.520427 6.398151
6 6 4.957994 7.517130 6.727288 4.757732 4.575019 6.220071 5.219651 5.130648
7 7 5.344701 4.650095 5.736826 5.161822 5.208502 5.645190 4.266679 4.243660
8 8 4.003065 4.578335 5.797876 4.968013 5.130712 6.192811 4.282839 5.669198
9 9 4.766465 4.395451 5.485031 4.577186 5.366829 5.653012 4.550389 4.367806
10 10 4.695404 5.295599 5.123817 5.358232 5.439788 5.643931 5.127332 5.089670
# ... with 15 more rows
If the total number of rows in the data frame is not divisible by the number of rows you want in each set, then you can do the following when you create the sets:
data = data %>%
  mutate(set = sample(rep(1:ceiling(500/20), each=20))[1:n()])
In this case, the set sizes will vary a bit when the number of data rows is not divisible by the desired number of rows per set.
The following approach could be worth trying for someone in a similar position.
It is based on the numerical balancing in groupdata2's fold() function, which allows creating groups with balanced means for a single column. By standardizing each of the columns and numerically balancing their rowwise sum, we might increase the chance of getting balanced means in the individual columns.
I compared this approach to creating groups randomly a few times and selecting the split with the least variance in means. It seems to be a bit better, but I'm not too convinced that this will hold in all contexts.
# Attach dplyr and groupdata2
library(dplyr)
library(groupdata2)
set.seed(1)
# Create the dataset
data <- matrix(runif(4000, min = 0, max = 10), nrow = 500, ncol = 8)
colnames(data) <- c("A", "B", "C", "D", "E", "F", "G", "H")
data <- dplyr::as_tibble(data)
# Standardize all columns and calculate row sums
data_std <- data %>%
  dplyr::mutate_all(.funs = function(x){(x - mean(x)) / sd(x)}) %>%
  dplyr::mutate(total = rowSums(across(where(is.numeric))))
# Create groups (new column called ".folds")
# We numerically balance the "total" column
data_std <- data_std %>%
  groupdata2::fold(k = 25, num_col = "total")  # k = 500/20 = 25
# Transfer the groups to the original (non-standardized) data frame
data$group <- data_std$.folds
# Check the means
data %>%
  dplyr::group_by(group) %>%
  dplyr::summarise_all(.funs = mean)
> # A tibble: 25 x 9
> group A B C D E F G H
> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
> 1 1 4.48 5.05 4.80 5.65 5.04 4.60 5.12 4.85
> 2 2 5.57 5.17 3.21 5.46 4.46 5.89 5.06 4.79
> 3 3 4.33 6.02 4.57 6.18 4.76 3.79 5.94 3.71
> 4 4 4.51 4.62 4.62 5.27 4.65 5.41 5.26 5.23
> 5 5 4.55 5.10 4.19 5.41 5.28 5.39 5.57 4.23
> 6 6 4.82 4.74 6.10 4.34 4.82 5.08 4.89 4.81
> 7 7 5.88 4.49 4.13 3.91 5.62 4.75 5.46 5.26
> 8 8 4.11 5.50 5.61 4.23 5.30 4.60 4.96 5.35
> 9 9 4.30 3.74 6.45 5.60 3.56 4.92 5.57 5.32
> 10 10 5.26 5.50 4.35 5.29 4.53 4.75 4.49 5.45
> # … with 15 more rows
# Check the standard deviations of the means
# Could be used to compare methods
data %>%
  dplyr::group_by(group) %>%
  dplyr::summarise_all(.funs = mean) %>%
  dplyr::summarise(across(where(is.numeric), sd))
> # A tibble: 1 x 8
> A B C D E F G H
> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
> 1 0.496 0.546 0.764 0.669 0.591 0.611 0.690 0.475
It might be best to compare the means and mean variances (or standard deviations as above) of different methods on the standardized data though. In that case, one could calculate the sum of the variances and minimize it.
data_std %>%
  dplyr::select(-total) %>%
  dplyr::group_by(.folds) %>%
  dplyr::summarise_all(.funs = mean) %>%
  dplyr::summarise(across(where(is.numeric), sd)) %>%
  sum()
> 1.643989
Comparing multiple balanced splits
The fold() function allows creating multiple unique grouping factors (splits) at once. So here, I will perform the numerically balanced split 20 times and find the grouping with the lowest sum of the standard deviations of the means. I'll further convert it to a function.
create_multi_balanced_groups <- function(data, cols, k, num_tries){
  # Extract the variables of interest
  # We assume these are numeric but we could add a check
  data_to_balance <- data[, cols]
  # Standardize all columns and calculate rowwise sums
  data_std <- data_to_balance %>%
    dplyr::mutate_all(.funs = function(x){(x - mean(x)) / sd(x)}) %>%
    dplyr::mutate(total = rowSums(across(where(is.numeric))))
  # Create `num_tries` unique numerically balanced splits
  data_std <- data_std %>%
    groupdata2::fold(
      k = k,
      num_fold_cols = num_tries,
      num_col = "total"
    )
  # The new fold columns are named ".folds_1", ".folds_2", etc.
  fold_col_names <- paste0(".folds_", seq_len(num_tries))
  # Remove the total column
  data_std <- data_std %>%
    dplyr::select(-total)
  # Calculate a score for each split
  # (this could probably be done more efficiently without a for loop)
  variance_scores <- c()
  for (fcol in fold_col_names){
    score <- data_std %>%
      dplyr::group_by(!!as.name(fcol)) %>%
      dplyr::summarise(across(where(is.numeric), mean)) %>%
      dplyr::summarise(across(where(is.numeric), sd)) %>%
      sum()
    variance_scores <- append(variance_scores, score)
  }
  # Get the fold column with the lowest score
  lowest_fcol_index <- which.min(variance_scores)
  best_fcol <- fold_col_names[[lowest_fcol_index]]
  # Add the best fold column / grouping factor to the original data
  data[["group"]] <- data_std[[best_fcol]]
  # Return the original data and the score of the best fold column
  list(data, min(variance_scores))
}
# Run with 20 splits
set.seed(1)
data_grouped_and_score <- create_multi_balanced_groups(
  data = data,
  cols = c("A", "B", "C", "D", "E", "F", "G", "H"),
  k = 25,
  num_tries = 20
)
# Check data
data_grouped_and_score[[1]]
> # A tibble: 500 x 9
> A B C D E F G H group
> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
> 1 5.86 6.54 0.500 2.88 5.70 9.67 2.29 3.01 2
> 2 0.0895 4.69 5.71 0.343 8.95 7.73 5.76 9.58 1
> 3 2.94 1.78 2.06 6.66 9.54 0.600 4.26 0.771 16
> 4 2.77 1.52 0.723 8.11 8.95 1.37 6.32 6.24 7
> 5 8.14 2.49 0.467 8.51 0.889 6.28 4.47 8.63 13
> 6 2.60 8.23 9.17 5.14 2.85 8.54 8.94 0.619 23
> 7 7.24 0.260 6.64 8.35 8.59 0.0862 1.73 8.10 5
> 8 9.06 1.11 6.01 5.35 2.01 9.37 7.47 1.01 1
> 9 9.49 5.48 3.64 1.94 3.24 2.49 3.63 5.52 7
> 10 0.731 0.230 5.29 8.43 5.40 8.50 3.46 1.23 10
> # … with 490 more rows
# Check score
data_grouped_and_score[[2]]
> 1.552656
By commenting out the num_col = "total" line, we can run this without the numerical balancing. For me, this gave a score of 1.615257.
Disclaimer: I am the author of the groupdata2 package. The fold() function can also balance a categorical column (cat_col) and keep all data points with the same ID in the same fold (id_col) (e.g. to avoid leakage in cross-validation). There's a very similar partition() function as well.
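For illustration, a hedged sketch of those arguments (patient_data, diagnosis, and patient_id are hypothetical names, not from the question):
library(groupdata2)
# Create 5 folds that keep the diagnosis distribution similar across folds
# and never split one patient's rows between folds
folded <- fold(patient_data, k = 5, cat_col = "diagnosis", id_col = "patient_id")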

Generate summary table from bins of a plot

I have a dataset of the form:
library(ggplot2)
d <- data.frame(seq(0.01, 1, by=0.01),
                c(seq(0.27, 0.1, -0.01), seq(0.1, 0.5, 0.01), seq(0.5, 0.1, -0.01)))
names(d) <- c("X", "Y")
ggplot(d, aes(x=X, y=Y)) + geom_line()
I am trying to generate a summary table that bins the Y variable into equal groups of 10% and generate the summary statistics of X for each bin. This is how I would like my result to look like:
Y Group X Group
0-10% {Range1: 10-30%, mean1, median1, sd1} {Range2: 85-100%, mean2, median2, sd2}
10-20% ...
20-30% ...
30-40% ...
40-50% ...
The number of X ranges is not always two: the 20-30% bin of Y has three X ranges, and the 40-50% bin has one.
I have many large datasets on which this has to be implemented; the data above is just for reproducing the problem. My actual data could have many inflection points, as this code has to run on many combinations of X and Y.
The output is not formatted like yours, but here is a close solution that you can easily reformat to your liking. It seems you are binning Y into 10 groups; it's not clear how you want to bin X, so I am using 10 groups on X too.
d <- data.frame(seq(0.01, 1, by=0.01),
                c(seq(0.27, 0.1, -0.01), seq(0.1, 0.5, 0.01), seq(0.5, 0.1, -0.01)))
names(d) <- c("X", "Y")
library(dplyr)
d$x.decile <- ntile(d$X, 10)
d$y.decile <- ntile(d$Y, 10)
summary <- data.frame(d %>%
  group_by(y.decile, x.decile) %>%
  summarise(mean=mean(X), median=median(X), min=min(X), max=max(X), sd=sd(X)))
> summary
y.decile x.decile mean median min max sd
1 1 2 0.175 0.175 0.15 0.20 0.018708287
2 1 3 0.210 0.210 0.21 0.21 NaN
3 1 10 0.990 0.990 0.98 1.00 0.010000000
4 2 2 0.135 0.135 0.13 0.14 0.007071068
5 2 3 0.235 0.235 0.22 0.25 0.012909944
6 2 10 0.955 0.955 0.94 0.97 0.012909944
7 3 1 0.095 0.095 0.09 0.10 0.007071068
You can get the format you want with melt and dcast from the reshape2 package.
In the code below, I've cut the data into 10 Y groups and 2 X groups, just to keep the width of the output reasonable. Change 2 to 10 in the ntile function to get actual deciles for X. Also, I haven't included every summary item, but hopefully the code below will guide you in adding additional information.
library(dplyr)
library(reshape2)
sm = d %>%
  group_by(`Y decile`=ntile(Y,10), X.decile=ntile(X,2)) %>%
  summarise(`X decile` = paste0("{Count: ", n(), ", Range: ", min(X), "-", max(X),
                                ", Median: ", median(X), "}"))
sm %>%
  melt(id.var=c("Y decile","X.decile")) %>%
  dcast(`Y decile` ~ variable + X.decile, value.var="value", fill="")
Y decile X decile_1 X decile_2
1 1 {Count: 7, Range: 0.15-0.21, Median: 0.18} {Count: 3, Range: 0.98-1, Median: 0.99}
2 2 {Count: 6, Range: 0.13-0.25, Median: 0.225} {Count: 4, Range: 0.94-0.97, Median: 0.955}
3 3 {Count: 7, Range: 0.09-0.28, Median: 0.12} {Count: 3, Range: 0.91-0.93, Median: 0.92}
4 4 {Count: 6, Range: 0.06-0.31, Median: 0.185} {Count: 4, Range: 0.87-0.9, Median: 0.885}
5 5 {Count: 8, Range: 0.02-0.35, Median: 0.185} {Count: 2, Range: 0.85-0.86, Median: 0.855}
6 6 {Count: 5, Range: 0.01-0.39, Median: 0.37} {Count: 5, Range: 0.8-0.84, Median: 0.82}
7 7 {Count: 5, Range: 0.4-0.44, Median: 0.42} {Count: 5, Range: 0.75-0.79, Median: 0.77}
8 8 {Count: 5, Range: 0.45-0.49, Median: 0.47} {Count: 5, Range: 0.7-0.74, Median: 0.72}
9 9 {Count: 1, Range: 0.5-0.5, Median: 0.5} {Count: 9, Range: 0.51-0.69, Median: 0.65}
10 10 {Count: 10, Range: 0.55-0.64, Median: 0.595}
melt isn't actually necessary here. You could do the following, where the extra line at the end gives more explanatory names.
sm = d %>%
  group_by(`Y decile`=ntile(Y,10), X.decile=ntile(X,2)) %>%
  summarise(`X decile` = paste0("{N: ", n(), ", Range: ", min(X), "-", max(X),
                                ", Median: ", median(X), "}")) %>%
  dcast(`Y decile` ~ X.decile, value.var="X decile", fill="") %>%
  setNames(., c(names(.)[1], paste0("X decile ", names(.)[-1])))
The quantile and aggregate functions can help you.
# Create the data frame
d <- data.frame(seq(0.01, 1, by=0.01),
                c(seq(0.27, 0.1, -0.01), seq(0.1, 0.5, 0.01), seq(0.5, 0.1, -0.01)))
names(d) <- c("X", "Y")
# Define bin upper bounds as deciles of Y
bins <- quantile(d$Y, seq(0.1, 1, length.out=10))
# Create an indicator variable for which bin each Y belongs in
ag <- c()
for (i in 1:nrow(d)) { ag[i] <- which(d$Y[i] < bins)[1] }
# Compute summary statistics of X within each bin
means <- aggregate(d$X, by=list(ag), mean)
medians <- aggregate(d$X, by=list(ag), median)
variances <- aggregate(d$X, by=list(ag), var)
# Put them all into a new data frame
data.frame(group=1:10, mean=means[,2], median=medians[,2], variance=variances[,2])
## group mean median variance
##1 1 0.4533333 0.200 0.162250000
##2 2 0.4709091 0.240 0.148969091
##3 3 0.3990000 0.265 0.134543333
##4 4 0.4650000 0.305 0.139583333
##5 5 0.3525000 0.325 0.114278571
##6 6 0.4983333 0.385 0.097178788
##7 7 0.5950000 0.595 0.034250000
##8 8 0.5950000 0.595 0.017583333
##9 9 0.5950000 0.595 0.006472222
##10 10 0.5950000 0.595 0.001171429
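The indicator loop can also be replaced by a single vectorized cut() call; a sketch that reproduces the loop's bin indices (including the NA it assigns to the maximum Y):
# labels=FALSE returns the bin index; right=FALSE makes the intervals
# left-closed, matching the loop's "first bin with Y < upper bound"
ag <- cut(d$Y, breaks = c(-Inf, bins), labels = FALSE, right = FALSE)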
