I'm trying to perform a forloop to apply a custom summarise function to all the numeric columns in the dataframe. The forloop output seems to ignore the grouping factor- however, if I perform the function alone on a single column (without the for loop), it provides the correct output.
#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73",
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56",
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26",
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"),
fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976,
11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436,
28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932,
19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787,
20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608,
16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636,
23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208,
21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436,
22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847,
24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182,
14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361,
17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999,
-0.244843034968998, -0.298887493573998, -0.223614002904,
-1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999,
-0.216792667809003, -0.596428704428, -0.668238310155001,
-1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001,
-0.269967200167002, -0.524678463173998, -0.360871620761998,
0.214251247972999, -0.966030101111, -0.166404124383998),
fm_percchg = c(0.00052913132097943, -0.0217928016671462,
-0.0141722355981437, -0.0259313896588437, -0.00939355370091154,
-0.0656180855522784, 0.00795459423472242, -0.0286337438132355,
-0.0165832282022865, -0.00963877445980213, -0.0408260722701251,
-0.0338607155572751, -0.0328573534170568, -0.0262673392452288,
-0.028017619615079, -0.025419001203338, -0.0120940958619099,
-0.0355048596062299, -0.0168713805332318, 0.0126083147698213,
-0.0683277073949869, -0.00936051767758492)), row.names = c(NA,
-22L), class = "data.frame")
#my function:
summbygrp <- function(x) {
group_by(dexadf, group) %>%
summarise(
count = n(),
mean = mean({{x}}, na.rm = TRUE),
sd = sd({{x}}, na.rm = TRUE)
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
}
#apply function to all numeric columns and print column names before output
coln = 1
for (col in dexadf) {
print(colnames(dexadf)[coln])
coln = coln + 1
if(is.numeric(col)) {
print(summbygrp(col))
} else {next}
}
#output:
[1] "fm_bdc3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.6 5.48 1.65 16.9 24.3
2 e 11 20.6 5.48 1.65 16.9 24.3
[1] "fm_rec3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.1 5.41 1.63 16.5 23.8
2 e 11 20.1 5.41 1.63 16.5 23.8
[1] "fm_chg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.450 0.406 0.122 -0.723 -0.178
2 e 11 -0.450 0.406 0.122 -0.723 -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
2 e 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
As you can see, all the means for both groups are the same, and I know this shouldn't be true. Could someone identify the error in the code? Thank you!
So instead of using for-loops you can do better,
library(dplyr)
library(rlang)
library(purrr)
library(tibble)
dexadf <- data.frame(
stringsAsFactors = FALSE,
participant = c("pt04","pt75","pt21","pt73",
"pt27","pt39","pt43","pt52","pt69","pt49","pt50",
"pt56","pt62","pt68","pt22","pt64","pt54","pt79",
"pt36","pt26","pt65","pt38"),
fm_bdc3 = c(18.535199635968,23.52996574649,
17.276246451976,11.526088555461,23.805048656112,
23.08597823716,28.691020942436,28.968097858499,
23.378093165331,22.491725344661,14.609015054932,19.734914019306,
31.947412973684,25.152298171274,12.007356801787,
20.836128108938,22.322230884349,14.777652101515,
21.389572717608,16.992853675086,14.138189878472,17.777235203826),
fm_rec3 = c(18.545007190636,
23.017181869742,17.031403417007,11.227201061887,23.581434653208,
21.571120542136,28.919246372213,28.138632765662,
22.990408911436,22.274932676852,14.012586350504,19.066675709151,
30.897705534847,24.491614222412,11.670939246332,
20.306494543464,22.052263684182,14.252973638341,
21.028701096846,17.207104923059,13.172159777361,17.610831079442),
fm_chg = c(0.00980755466799721,
-0.512783876747999,-0.244843034968998,-0.298887493573998,
-0.223614002904,-1.514857695024,0.228225429777002,
-0.829465092836998,-0.387684253894999,-0.216792667809003,
-0.596428704428,-0.668238310155001,-1.049707438837,
-0.660683948862001,-0.336417555455,-0.529633565474001,
-0.269967200167002,-0.524678463173998,-0.360871620761998,
0.214251247972999,-0.966030101111,-0.166404124383998),
fm_percchg = c(0.00052913132097943,
-0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
-0.00939355370091154,-0.0656180855522784,
0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
-0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
-0.0328573534170568,-0.0262673392452288,-0.028017619615079,
-0.025419001203338,-0.0120940958619099,
-0.0355048596062299,-0.0168713805332318,0.0126083147698213,
-0.0683277073949869,-0.00936051767758492),
group = as.factor(c("c","e",
"e","c","c","e","c","e","c","e","e","c",
"e","c","c","e","e","c","e","c","e",
"c")),
sex = as.factor(c("f","m",
"m","m","m","m","m","f","m","f","f","f",
"f","f","f","f","m","f","m","m","f",
"m"))
)
dexadf <- as_tibble(dexadf)
# Note the use of .data pronoun, since columns will passed to this function as characters
summbygrp <- function(df, x) {
df %>%
group_by(group) %>%
summarise(
count = n(),
mean = mean(.data[[x]], na.rm = TRUE), # use of .data
sd = sd(.data[[x]], na.rm = TRUE) # use of .data
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
}
# Here we extract the numerical columns of the dataset
cols <- dexadf %>%
select(where(is.numeric)) %>% colnames(.)
cols
#> [1] "fm_bdc3" "fm_rec3" "fm_chg" "fm_percchg"
# Then instead of for loops we can simply use this map function
map(.x = cols, ~ summbygrp(dexadf, .x))
#> [[1]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 e 11 21.9 5.40 1.63 18.2 25.5
#>
#> [[2]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.1 5.54 1.67 15.3 22.8
#> 2 e 11 21.2 5.31 1.60 17.7 24.8
#>
#> [[3]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 2 e 11 -0.645 0.407 0.123 -0.918 -0.371
#>
#> [[4]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 2 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
# -------------------------------------------------------------------
# we can also bind all the output results (dataframes) in a single dataframe
map_dfr(.x = cols, ~ summbygrp(dexadf, .x), .id = "vars")
#> # A tibble: 8 × 8
#> vars group count mean sd se lower.ci upper.ci
#> <chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 1 e 11 21.9 5.40 1.63 18.2 25.5
#> 3 2 c 11 19.1 5.54 1.67 15.3 22.8
#> 4 2 e 11 21.2 5.31 1.60 17.7 24.8
#> 5 3 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 6 3 e 11 -0.645 0.407 0.123 -0.918 -0.371
#> 7 4 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 8 4 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
Created on 2022-07-09 by the reprex package (v2.0.1)
out <- df %>%
pivot_longer(starts_with('fm')) %>%
group_by(name, group) %>%
summarise(
count = n(),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
out
# A tibble: 8 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
3 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
4 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
5 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
6 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
7 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
8 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
if you need the list, just split it:
split(out, ~name)
$fm_bdc3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
$fm_chg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
2 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
$fm_percchg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
2 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
$fm_rec3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
2 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
A similar answer to the above, but combining across and summarise:
df |>
group_by(group) |>
summarise(
across(
where(is.numeric),
list(
mean = ~mean(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE),
n = ~n()
),
.names = "{.col}.{.fn}"
)
) |>
pivot_longer(
-group,
names_to = c("measure", "stat"),
names_sep = "\\."
) |>
pivot_wider(
names_from = stat,
values_from = value
) |>
mutate(
se = sd / sqrt(n),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
) |>
arrange(measure)
I currently have a list with columns as individual elements.
I would like to combine list elements with the same column names (i.e. bind rows) and merge across the different columns (i.e. bind columns) into a single data frame. I'm having difficulty finding examples of how to do this.
l = list(est = c(0, 0.062220390087795, 1.1020213968139, 0.0359939361491544
), se = c(0.0737200634874046, 0.237735179934829, 0.18105632705918,
0.111359438298789), rf = structure(c(NA, NA, NA, 4L), levels = c("Never\nsmoker",
"Occasional\nsmoker", "Ex-regular\nsmoker", "Smoker"), class = "factor"),
n = c(187L, 18L, 32L, 82L), model = c("Crude", "Crude", "Crude",
"Crude"), est = c(0, 0.112335510453586, 0.867095253670329,
0.144963556944891), se = c(0.163523775933409, 0.237039485900481,
0.186247776987999, 0.119887623484768), rf = structure(c(NA,
NA, NA, 4L), levels = c("Never\nsmoker", "Occasional\nsmoker",
"Ex-regular\nsmoker", "Smoker"), class = "factor"), n = c(187L,
18L, 32L, 82L), model = c("Model 1", "Model 1", "Model 1",
"Model 1"), est = c(0, 0.107097305324242, 0.8278765140371,
0.0958220447859447), se = c(0.164787596943329, 0.237347836229364,
0.187201880036661, 0.120882616647714), rf = structure(c(NA,
NA, NA, 4L), levels = c("Never\nsmoker", "Occasional\nsmoker",
"Ex-regular\nsmoker", "Smoker"), class = "factor"), n = c(187L,
18L, 32L, 82L), model = c("Model 2", "Model 2", "Model 2",
"Model 2"))
I would like the data to have the following format:
data.frame(
est = c(),
se = c(),
rf = c(),
model = c()
)
Any help would be appreciated. Thank you!
In this solution, first the elements of l are grouped by name and then are combined using c. Finally, the resulting list is converted to a dataframe using map_dfc.
library(dplyr)
library(purrr)
cols <- c("est", "se", "rf", "model")
setNames(cols,cols) |>
map(~l[names(l) == .x]) |>
map_dfc(~do.call(c, .x))
#> # A tibble: 12 × 4
#> est se rf model
#> <dbl> <dbl> <fct> <chr>
#> 1 0 0.0737 NA Crude
#> 2 0.0622 0.238 NA Crude
#> 3 1.10 0.181 NA Crude
#> 4 0.0360 0.111 Smoker Crude
#> 5 0 0.164 NA Model 1
#> 6 0.112 0.237 NA Model 1
#> 7 0.867 0.186 NA Model 1
#> 8 0.145 0.120 Smoker Model 1
#> 9 0 0.165 NA Model 2
#> 10 0.107 0.237 NA Model 2
#> 11 0.828 0.187 NA Model 2
#> 12 0.0958 0.121 Smoker Model 2
another option
library(purrr)
grp <- (seq(length(l)) - 1) %/% 5
l_split <- split(l, grp)
map_df(l_split, c)
#> # A tibble: 12 × 5
#> est se rf n model
#> <dbl> <dbl> <fct> <int> <chr>
#> 1 0 0.0737 <NA> 187 Crude
#> 2 0.0622 0.238 <NA> 18 Crude
#> 3 1.10 0.181 <NA> 32 Crude
#> 4 0.0360 0.111 Smoker 82 Crude
#> 5 0 0.164 <NA> 187 Model 1
#> 6 0.112 0.237 <NA> 18 Model 1
#> 7 0.867 0.186 <NA> 32 Model 1
#> 8 0.145 0.120 Smoker 82 Model 1
#> 9 0 0.165 <NA> 187 Model 2
#> 10 0.107 0.237 <NA> 18 Model 2
#> 11 0.828 0.187 <NA> 32 Model 2
#> 12 0.0958 0.121 Smoker 82 Model 2
I have 2 R data frames that looks like this:
DATA FRAME 1:
identifier
ef_posterior
position_no
classification
11111
0.260
1
yes
11111
0.0822
2
yes
11111
0.00797
3
yes
11111
0.04
4
no
11111
0.245
5
yes
11111
0.432
6
yes
11112
0.342
1
maybe
11112
0.453
2
yes
11112
0.0032
3
yes
11112
0.241
5
no
11112
0.0422
6
yes
11112
0.311
4
no
DATAFRAME 2:
study_identifier
%LVEF
11111
62
11112
76
I want to merge and rearrange these two data frames into something like this:
Study_identifier and identifier are the same thing (just different column names). Additionally, I would like to recode the classification so that yes = 0, no = 1, maybe = 2
identifier
pos_1
pos_1_class
pos_2
pos_2_class
pos_3
pos_3_class
pos_4
pos_4_class
pos_5
pos_5_class
pos_6
pos_6_class
%LVEF
11111
0.260
0
0.0822
0
0.00797
0
0.04
1
0.245
0
0.432
0
62
11112
0.342
2
0.453
0
0.0032
0
0.311
1
0.241
1
0.0422
0
76
df1 %>% mutate(position_no = paste0("position_", position_no)) %>%
pivot_wider(id_cols = identifier, names_from = position_no, values_from = ef_posterior) %>%
left_join(df2 %>% mutate(study_identifier = as.numeric(as.character(study_identifier))), by = c("identifier" = "study_identifier"))
This is the code I have right now, but I can't figure out where to put in the code for the classification column
How would I go about doing this?
Any help would be very much appreciated!
You can recode quite easily with dplyr and case_when:
df1 %>% mutate(
classification =
case_when( classification == "yes" ~ 1,
classification == "no" ~ 0,
classification == "maybe" ~ 2)
)
I would solve it the following way:
library(tidyverse)
df1 <- data.frame(
stringsAsFactors = FALSE,
identifier = c(11111L,11111L,11111L,11111L,
11111L,11111L,11112L,11112L,11112L,11112L,11112L,
11112L),
ef_posterior = c(0.26,0.0822,0.00797,0.04,
0.245,0.432,0.342,0.453,0.0032,0.241,0.0422,0.311),
position_no = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 5L, 6L, 4L),
classification = c("yes","yes","yes","no",
"yes","yes","maybe","yes","yes","no","yes","no")
)
df2 <- data.frame(
check.names = FALSE,
study_identifier = c(11111L, 11112L),
`%LVEF` = c(62L, 76L)
)
df1 %>% mutate(
classification =
case_when( classification == "yes" ~ 1,
classification == "no" ~ 0,
classification == "maybe" ~ 2)
) %>%
pivot_wider(
id_cols = c(identifier), names_from = c(position_no), values_from = c(classification,ef_posterior)) %>%
left_join(df2, by = c("identifier" = "study_identifier"))
#> # A tibble: 2 x 14
#> identifier classification_1 classification_2 classification_3 classification_4
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 11111 1 1 1 0
#> 2 11112 2 1 1 0
#> # … with 9 more variables: classification_5 <dbl>, classification_6 <dbl>,
#> # ef_posterior_1 <dbl>, ef_posterior_2 <dbl>, ef_posterior_3 <dbl>,
#> # ef_posterior_4 <dbl>, ef_posterior_5 <dbl>, ef_posterior_6 <dbl>,
#> # `%LVEF` <int>
Created on 2021-04-12 by the reprex package (v0.3.0)
I am trying to perform cor.test in R in a dataframe:
For a toy dataset of X and Y, I used the following:
library(dplyr)
library(broom)
X = c(0.88,1.3,5.6,3.1)
Y = c(0,1,1,1)
ft<-cor.test(X,Y)
tidy(ft) %>%
select(estimate, p.value, conf.low, conf.high) %>%
bind_rows(.id = 'grp')
which gives me the following result:
grp estimate p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl>
1 1 0.571 0.429 -0.864 0.989
Now, a short version of my dataframe is like:
df<-structure(list(X_sample1 = c(0.11, 0.98, 0.88), X_sample2 = c(0.13,
0, 1.3), X_sample3 = c(1.5, 3.5, 5.6), X_sample4 = c(3.2, 2.4,
3.1), Y_sample1 = c(0L, 1L, 0L), Y_sample2 = c(0L, 0L, 1L), Y_sample3 = c(1L,
1L, 1L), Y_sample4 = c(1L, 1L, 1L)), class = "data.frame", row.names = c("Product1",
"Product2", "Product3"))
I want to perform cor.test in each row of the df between X and Y groups. Thus, in the above example df, the groups are:
X = c(0.11,0.13,1.5,3.2)
Y = c(0,0,1,1)
---------------
X = c(0.98,0,3.5,2.4)
Y = c(1,0,1,1)
---------------
X = c(0.88,1.3,5.6,3.1)
Y = c(0,1,1,1)
I want a output like:
grp estimate p.value conf.low conf.high
Product1 0.88 0.12 -0.525 0.997
Product2 0.743 0.257 -0.762 0.994
Product3 0.571 0.429 -0.864 0.989
Thanks for your help!
One option could be:
df %>%
rownames_to_column(var = "grp") %>%
rowwise() %>%
transmute(grp,
tidy(cor.test(c_across(starts_with("X")), c_across(starts_with("Y"))))) %>%
select(grp, estimate, p.value, conf.low, conf.high)
grp estimate p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl>
1 Product1 0.880 0.120 -0.525 0.997
2 Product2 0.743 0.257 -0.762 0.994
3 Product3 0.571 0.429 -0.864 0.989
You can use dplyr and broom:
library(dplyr)
library(broom)
df %>%
rownames_to_column() %>%
pivot_longer(-rowname, names_to = c(".value", "sample"),
names_sep = "_sample") %>%
nest_by(rowname) %>%
summarize(cors1 = tidy(cor.test(data$X, data$Y)))
# A tibble: 3 x 2
# Groups: rowname [3]
rowname cors1$estimate $statistic $p.value $parameter $conf.low $conf.high
<chr> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 Produc~ 0.880 2.62 0.120 2 -0.525 0.997
2 Produc~ 0.743 1.57 0.257 2 -0.762 0.994
3 Produc~ 0.571 0.984 0.429 2 -0.864 0.989
My objective is to compare which of the two clustering methods I've used cluster_method_1 and cluster_method_2 has the largest between cluster sum of squares in order to identify which one achieves better separation.
I'm basically looking for an efficient way to calculate the distance between each point of cluster 1 and all points of cluster 2,3,4, and so on.
example dataframe:
structure(list(x1 = c(0.01762376, -1.147739752, 1.073605848,
2.000420899, 0.01762376, 0.944438811, 2.000420899, 0.01762376,
-1.147739752, -1.147739752), x2 = c(0.536193126, 0.885609849,
-0.944699546, -2.242627057, -1.809984553, 1.834120637, 0.885609849,
0.96883563, 0.186776403, -0.678508604), x3 = c(0.64707104, -0.603759684,
-0.603759684, -0.603759684, -0.603759684, 0.64707104, -0.603759684,
-0.603759684, -0.603759684, 1.617857394), x4 = c(-0.72712328,
0.72730861, 0.72730861, -0.72712328, -0.72712328, 0.72730861,
0.72730861, -0.72712328, -0.72712328, -0.72712328), cluster_method_1 = structure(c(1L,
3L, 3L, 3L, 2L, 2L, 3L, 2L, 1L, 4L), .Label = c("1", "2", "4",
"6"), class = "factor"), cluster_method_2 = structure(c(5L, 3L,
1L, 3L, 4L, 2L, 1L, 1L, 1L, 6L), .Label = c("1", "2", "3", "4",
"5", "6"), class = "factor")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
x1 x2 x3 x4 cluster_method_1 cluster_method_2
<dbl> <dbl> <dbl> <dbl> <fct> <fct>
1 0.0176 0.536 0.647 -0.727 1 5
2 -1.15 0.886 -0.604 0.727 4 3
3 1.07 -0.945 -0.604 0.727 4 1
4 2.00 -2.24 -0.604 -0.727 4 3
5 0.0176 -1.81 -0.604 -0.727 2 4
6 0.944 1.83 0.647 0.727 2 2
7 2.00 0.886 -0.604 0.727 4 1
8 0.0176 0.969 -0.604 -0.727 2 1
9 -1.15 0.187 -0.604 -0.727 1 1
10 -1.15 -0.679 1.62 -0.727 6 6
The within sum-of-squares for cluster Si can be written as the sum of all pairwise (Euclidean) distances squared, divided by twice the number of points in that cluster (see e.g. the Wikipedia article on k-means clustering)
For convenience we define a function calc_SS that returns the within sum-of-squares for a (numeric) data.frame
calc_SS <- function(df) sum(as.matrix(dist(df)^2)) / (2 * nrow(df))
It's then straightforward to calculate the within (cluster) sum-of-squares for every cluster for every method
library(tidyverse)
df %>%
gather(method, cluster, cluster_method_1, cluster_method_2) %>%
group_by(method, cluster) %>%
nest() %>%
transmute(
method,
cluster,
within_SS = map_dbl(data, ~calc_SS(.x))) %>%
spread(method, within_SS)
## A tibble: 6 x 3
# cluster cluster_method_1 cluster_method_2
# <chr> <dbl> <dbl>
#1 1 1.52 9.99
#2 2 10.3 0
#3 3 NA 10.9
#4 4 15.2 0
#5 5 NA 0
#6 6 0 0
The total within sum-of-squares is then just the sum of the within sum-of-squares for every cluster
df %>%
gather(method, cluster, cluster_method_1, cluster_method_2) %>%
group_by(method, cluster) %>%
nest() %>%
transmute(
method,
cluster,
within_SS = map_dbl(data, ~calc_SS(.x))) %>%
group_by(method) %>%
summarise(total_within_SS = sum(within_SS)) %>%
spread(method, total_within_SS)
## A tibble: 1 x 2
# cluster_method_1 cluster_method_2
# <dbl> <dbl>
#1 27.0 20.9
By the way, we can confirm that calc_SS does indeed return the within sum-of-squares using the iris dataset:
set.seed(2018)
df2 <- iris[, 1:4]
kmeans <- kmeans(as.matrix(df2), 3)
df2$cluster <- kmeans$cluster
df2 %>%
group_by(cluster) %>%
nest() %>%
mutate(within_SS = map_dbl(data, ~calc_SS(.x))) %>%
arrange(cluster)
## A tibble: 3 x 3
# cluster data within_SS
# <int> <list> <dbl>
#1 1 <tibble [38 × 4]> 23.9
#2 2 <tibble [62 × 4]> 39.8
#3 3 <tibble [50 × 4]> 15.2
kmeans$within
#[1] 23.87947 39.82097 15.15100
The total sum of squares, sum_x sum_y ||x-y||² is constant.
The total sum of squares can be computed trivially from variance.
If you now subtract the within-cluster sum of squares where x and y belong to the same cluster, then the between cluster sum of squares remains.
If you do this approach, it takes O(n) time instead of O(n²).
Corollary: the solution with the smallest WCSS has the largest BCSS.
Consider the package clValid. It calculates a large number of indexes for validating clustering. The Dunn index is particularly appropriate for what you are trying to do. The documentation says that the Dunn index is the ratio between the smallest distance between observation not in the same cluster to the largest intra-cluster distance. The documentation for it can be found at https://cran.r-project.org/web/packages/clValid/clValid.pdf.