I scripted the following code
# Per tests0 x GROUP cell: mean and standard error of score0 and score7,
# plus the estimate and p-value of the paired t-test between them.
# NOTE(review): std.error() is presumably plotrix::std.error — confirm.
out %>%
  group_by(tests0, GROUP) %>%
  summarise(
    mean0 = mean(score0, na.rm = TRUE),
    stderr0 = std.error(score0, na.rm = TRUE),
    mean7 = mean(score7, na.rm = TRUE),
    # Column name kept as posted so it matches the printed output.
    stederr7 = std.error(score7, na.rm = TRUE),
    diff.std.mean = t.test(score0, score7, paired = TRUE)$estimate,
    p.value = t.test(score0, score7, paired = TRUE)$p.value
  )
and I have obtained the following output
tests0 GROUP mean0 stderr0 mean7 stederr7 diff.std.mean p.value
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ADAS_CogT0 CONTROL 12.6 0.525 13.6 0.662 -1.15 0.00182
2 ADAS_CogT0 TRAINING 14.0 0.613 12.6 0.570 1.40 0.00295
3 PVF_T0 CONTROL 32.1 1.22 31.3 1.45 0.498 0.636
4 PVF_T0 TRAINING 31.6 1.37 34.3 1.51 -2.48 0.0102
5 ROCF_CT0 CONTROL 29.6 0.893 30.3 0.821 -0.180 0.835
6 ROCF_CT0 TRAINING 30.1 0.906 29.5 0.929 0.489 0.615
7 ROCF_IT0 CONTROL 12.8 0.563 12.2 0.683 0.580 0.356
8 ROCF_IT0 TRAINING 10.9 0.735 12.3 0.768 -1.44 0.0238
9 ROCF_RT0 CONTROL 12.1 0.725 12.5 0.797 -0.370 0.598
10 ROCF_RT0 TRAINING 10.5 0.746 10.9 0.742 -0.534 0.370
11 SVF_T0 CONTROL 35.5 1.05 34 1.15 1.42 0.107
12 SVF_T0 TRAINING 34.1 1.04 32.9 1.16 0.962 0.231
If I wanted to do the same with the across() function, what would I need to do to get the same results as the code above? I am stuck because I was following an example from the answer to the question "Reproduce a complex table with double headers", but I was not able to adapt it properly.
Here the dataset
Below is the way I would like to obtain the same result. It is a method that relies on .x manipulation.
# Non-working attempt, kept exactly as posted: the parentheses do not balance,
# all_of() is handed two bare names, and each t.test() call only sees one
# column (.x), so it cannot run a paired test between score0 and score7.
out %>%
group_by(across(all_of(tests0, GROUP))) %>% summarise(across(starts_with('score'),
list(mean = ~ mean(.x,na.rm = T),
stderr = ~ std.error(.x, na.rm = TRUE),
diff.std.mean = ~ t.test(.x, na.rm = T)))$estimate,
p.value = ~ t.test(.x, na.rm = T)))$p.value)),.groups = "drop")
You can use the argument .names in across():
library(dplyr)
# Per tests0 x GROUP cell: SD and mean of each score column (output columns
# named through `.names`), plus the paired t-test estimate and p-value.
out %>%
  group_by(tests0, GROUP) %>%
  summarize(
    # Lambdas are used instead of `across(cols, sd, na.rm = TRUE)`: forwarding
    # extra arguments through across()'s `...` is deprecated since dplyr 1.1.0.
    across(c(score0, score7), ~ sd(.x, na.rm = TRUE), .names = "sd_{.col}"),
    across(c(score0, score7), ~ mean(.x, na.rm = TRUE), .names = "mean_{.col}"),
    diff.std.mean = t.test(score0, score7, paired = TRUE)$estimate,
    p.value = t.test(score0, score7, paired = TRUE)$p.value
  ) %>%
  ungroup()
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.
#> # A tibble: 2 x 8
#> tests0 GROUP sd_score0 sd_score7 mean_score0 mean_score7 diff.std.mean p.value
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ADAS_~ CONT~ 3.72 4.81 12.5 13.5 -1.24 0.00471
#> 2 ADAS_~ TRAI~ 4.55 4.15 14.0 12.6 1.40 0.00295
Created on 2021-11-26 by the reprex package (v2.0.1)
EDIT
If you prefer a list it would be easier to determine the separate parts and then bind them together:
library(data.table)
# Compute each piece (means, SDs, t-test estimate, t-test p-value) separately
# per tests0 x GROUP, then return them together as a named list.
by <- c("tests0", "GROUP")
out_dt <- data.table::data.table(out)

# Column-wise statistics over every column whose name starts with "score".
means <- out_dt[
  ,
  sapply(.SD, function(x) list(mean = mean(x, na.rm = TRUE))),
  by = by,
  .SDcols = patterns("^score")
]
sds <- out_dt[
  ,
  sapply(.SD, function(x) list(sd = sd(x, na.rm = TRUE))),
  by = by,
  .SDcols = patterns("^score")
]

# Paired t-test between score0 and score7 within each group.
t_est <- out_dt[
  ,
  .(diff.std.mean = t.test(score0, score7, paired = TRUE)$estimate),
  by = by
]
tpvalue <- out_dt[
  ,
  .(p.value = t.test(score0, score7, paired = TRUE)$p.value),
  by = by
]

list(means = means, sds = sds, diff.std.mean = t_est, p.value = tpvalue)
Here is another approach you may want to consider. First I took your code and cut and pasted it into a function. Abstracting the column names and removing the dependency on the plotrix package for calculating the standard error are the only changes.
# Summarise two score columns within the groups defined by the first two
# columns of `df`.
#
# Layout expected in `df`: columns 1-2 are grouping variables, columns 3-4 are
# the paired score columns. Any further columns are ignored.
#
# Returns one row per group with: the two group columns, mean and standard
# error of each score column (named mean<suffix> / stderr<suffix>, where the
# suffix is whatever follows the last lowercase letter of the score column
# name, e.g. "score0" -> "0"), the paired t-test estimate (diff.std.mean) and
# its p-value (p.value).
g <- function(df) {
  group_cols <- names(df)[1:2]
  x_col <- names(df)[3]
  y_col <- names(df)[4]

  name_suffix <- function(nm) sub(".*[a-z]", "", nm)
  # Standard error of the mean, counting only non-missing observations.
  std_err <- function(v) sd(v, na.rm = TRUE) / sqrt(sum(!is.na(v)))

  nms <- c(
    group_cols,
    paste0("mean", name_suffix(x_col)),
    paste0("stderr", name_suffix(x_col)),
    paste0("mean", name_suffix(y_col)),
    paste0("stderr", name_suffix(y_col)),
    "diff.std.mean", "p.value"
  )

  # `.data[[col]]` is evaluated per group. The previous `pull(df[, 3])` read
  # the whole input frame, so every group received identical statistics
  # unless the caller had already split the data to a single group.
  z <- df %>%
    group_by(across(all_of(group_cols))) %>%
    summarize(
      x1 = mean(.data[[x_col]], na.rm = TRUE),
      x2 = std_err(.data[[x_col]]),
      x3 = mean(.data[[y_col]], na.rm = TRUE),
      x4 = std_err(.data[[y_col]]),
      x5 = t.test(.data[[x_col]], .data[[y_col]], paired = TRUE)$estimate,
      x6 = t.test(.data[[x_col]], .data[[y_col]], paired = TRUE)$p.value
    )
  colnames(z) <- nms
  z
}
Then, because the test data only had one level of a factor and insufficient sample size for the plotrix::std.error function that you used, I introduced variation in the 'tests0' factor, doubled the sample size, and dropped the unused levels because they would cause iterations on empty frames. In addition, I added a score8 to show how you could run on other variables.
# Build a larger test set from `t` (the example data): split tests0 into two
# levels by Education, add a derived score8, and duplicate the rows.
s <- t %>%
  mutate(
    # "ADAS_CogT0" ends in a zero, matching the level name used in the rest of
    # the example (the earlier literal ended in the letter O).
    tests0 = case_when(Education <= 8 ~ "ADAS_CogT0", TRUE ~ "PVF_T0"),
    score8 = score0 + score7
  )
q <- rbind(s, s)
# Store the result: fct_drop() returns a new factor and does not modify `q`
# in place, so the bare call had no effect.
q$tests0 <- fct_drop(factor(q$tests0))
Then I split the frame by the factor levels, applied the function to each of the splits, then merged the data back together inside a function that allows you to manipulate the score and group variables. I assumed 2 each, which is safe with the score variables since you are doing a paired t-test, and it is easily extensible with the group variables (if you simply move the score variables to positions 1 and 2, and use all remaining variables passed to the function as group variables).
# Apply g() to every group of `df` and stack the per-group results.
#
# df         data frame containing the group and score columns.
# group_vars character vector of exactly two grouping column names
#            (g() treats columns 1-2 of its input as the groups).
# score_vars character vector of score column names; g() uses the first two.
#
# Returns the row-bound output of g() across all groups.
h <- function(df, group_vars, score_vars) {
  stopifnot(length(group_vars) == 2L)

  # all_of() makes it explicit that these are character vectors of names and
  # avoids the ambiguity of bare external vectors inside select().
  z <- df %>% select(all_of(group_vars), all_of(score_vars))

  # Last expression is the pipeline itself so the result is returned visibly
  # rather than as the invisible value of an assignment.
  z %>%
    group_by(across(all_of(group_vars))) %>%
    group_map(~ g(.x), .keep = TRUE) %>%
    bind_rows()
}
Note that if you want to apply this to other data, you only need to change the columns passed as the group and score variables. It should be fairly easy to alter that if you want to as well; I just thought this was a good framework for what you seem to be trying to do. Think about how you handle the case where test0 is null and test7 is non-null (or vice versa), since these observations are included in some of your summary statistics, but necessarily excluded from the t-test. Good luck.
# Run h() on the test data, then spread the per-GROUP statistics into
# columns so that each tests0 value occupies a single row.
x <- pivot_wider(
  group_by(
    h(q, c("tests0", "GROUP"), c("score0", "score7")),
    tests0
  ),
  id_cols = tests0,
  names_from = GROUP,
  values_from = c(
    "mean0", "stderr0", "mean7", "stderr7",
    "diff.std.mean", "p.value"
  )
)
I don't have a function called std.error so I've used sd, but of course you can change it.
library(dplyr)
library(readr)
# Per tests0 x GROUP cell: mean and sd of score0/score7, named
# <fn><digit> (e.g. mean0, stderr7) via parse_number(), followed by the paired
# t-test estimate and p-value computed with a single t.test() call.
out %>%
  group_by(tests0, GROUP) %>%
  summarise(
    # Lambdas replace `across(cols, fns, na.rm = TRUE)`: forwarding arguments
    # through across()'s `...` is deprecated since dplyr 1.1.0.
    across(
      c(score0, score7),
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        stderr = ~ sd(.x, na.rm = TRUE)
      ),
      .names = "{.fn}{parse_number(.col)}"
    ),
    # An unnamed tibble in summarise() is unpacked into separate columns.
    with(
      t.test(score0, score7, paired = TRUE),
      tibble(diff.std.mean = estimate, p.value)
    )
  )
# # A tibble: 2 × 8
# tests0 GROUP mean0 stderr0 mean7 stderr7 diff.std.mean p.value
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 ADAS_CogT0 CONTROL 12.5 3.72 13.5 4.81 -1.24 0.00471
# 2 ADAS_CogT0 TRAINING 14.0 4.55 12.6 4.15 1.40 0.00295
In reality I would just put the above code in a function that takes an x and y argument and then run fun(df, x = score0, y = score7). But, just for fun, if you must use .x and .y, here's one way (although imo it would be a little silly to do this)
# Variant that feeds the two score columns to t.test() as .x and .y through
# pmap_dfr(), after computing mean and sd for every selected column.
df %>%
group_by(tests0, GROUP) %>%
select(starts_with('score')) %>%
summarise(
# Mean and sd of every selected column; output names are built from the
# function name plus the number parsed out of the column name.
across(everything(), list(mean = mean, stderr = sd), na.rm = TRUE,
.names = '{.fn}{parse_number(.col)}'),
# Wrap each column in a list so whole columns can be handed to t.test().
# NOTE(review): appears to rely on the two score columns being the first
# two columns matched by everything() — confirm.
across(everything(), list(list)) %>%
pmap_dfr(~ t.test(.x, .y, paired = TRUE)[c('estimate', 'p.value')]) %>%
transmute(diff.std.mean = estimate, p.value))
# # A tibble: 2 × 8
# # Groups: tests0 [1]
# tests0 GROUP mean0 stderr0 mean7 stderr7 diff.std.mean p.value
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 ADAS_CogT0 CONTROL 12.5 3.72 13.5 4.81 -1.24 0.00471
# 2 ADAS_CogT0 TRAINING 14.0 4.55 12.6 4.15 1.40 0.00295
I thought of a possible workaround (that may or may not help) by using across() "manually", without applying functions one column at a time. The resulting output is a data.frame with list columns that are deeply nested, so unnest() will come in handy. I also used possibly() to address the case when two columns are not present, remember that across() can match any number of columns and t.test() needs x and y arguments.
Code:
library(tidyverse)
# Build one list-column (`all`) per tests0 x GROUP cell that holds a tibble of
# three nested data frames (t-test results, means, sds), then unnest it.
data <-
df %>%
group_by(tests0, GROUP) %>%
summarize(
# across() without a function returns the selected score columns as a
# tibble, which is piped into the braces below as `.`.
all = list(across(starts_with("score")) %>%
{
tibble(
# reduce() passes the score columns pairwise to t.test(); possibly() yields
# NA instead of an error when the call fails.
ttest = data.frame(possibly(~ reduce(., ~ t.test(.x, .y, paired = TRUE))[c("estimate", 'p.value')], NA)(.)),
# Column-wise mean; the first run of non-digits in each name becomes "mean".
means = data.frame(map(., ~ mean(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\\D+", "mean"))),
# Column-wise sd; the first run of non-digits in each name becomes "stederr".
stderrs = data.frame(map(., ~ sd(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\\D+", "stederr")))
)
})
)
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.
# First unnest the list-column, then unnest every remaining nested data frame
# (everything except the two grouping columns).
data %>%
unnest(all) %>%
unnest(-c("tests0", "GROUP"))
#> # A tibble: 2 × 8
#> # Groups: tests0 [1]
#> tests0 GROUP estimate p.value mean0 mean7 stederr0 stederr7
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ADAS_CogT0 CONTROL -1.24 0.00471 12.5 13.5 3.72 4.81
#> 2 ADAS_CogT0 TRAINING 1.40 0.00295 14.0 12.6 4.55 4.15
Created on 2021-11-29 by the reprex package (v2.0.1)
Related
I am making linear models across a large dataset which is unbalanced (not all contrasts are present for all groupings). Is there an efficient way to ignore groupings where there are less than 2 contrasts? In the examples below testData1 represents a balanced dataset where the workflow works correctly. testData2 represents an unbalanced dataset which throws a contrast error.
# Fit outcomeVar ~ predVar1 on `dat` and return the coefficient table as
# produced by broom::tidy().
aovFxn <- function(dat) {
  fit <- lm(outcomeVar ~ predVar1, data = dat)
  broom::tidy(fit)
}
# Balanced example: both groups contain all three predVar1 levels.
testData1 <- data.frame(
  groupVar = rep(c("a", "b"), each = 12),
  predVar1 = rep(c("x", "y", "z"), each = 4, times = 2),
  outcomeVar = sample(1:100, 24)
)

# Unbalanced example: group "b" only contains predVar1 == "x".
testData2 <- data.frame(
  groupVar = rep(c("a", "b"), each = 12),
  predVar1 = c(
    rep(c("x", "y", "z"), each = 4),
    rep("x", 12)
  ),
  outcomeVar = sample(1:100, 24)
)
# Balanced data: nest everything except groupVar, fit the model per group with
# aovFxn(), and unnest the per-group coefficient tables.
testStats1 <- testData1 %>%
nest(groupData = -groupVar) %>%
mutate(df = purrr::map(groupData, aovFxn)) %>%
unnest_legacy(df)
# Same pipeline on the unbalanced data; this is the call reported to fail
# with a contrast error.
testStats2 <- testData2 %>%
nest(groupData = -groupVar) %>%
mutate(df = purrr::map(groupData, aovFxn)) %>%
unnest_legacy(df)
We may use either tryCatch or purrr::possibly to return a desired value when there is an error
library(dplyr)
library(purrr)
# nest() and unnest() are tidyr functions, so tidyr must be attached too.
library(tidyr)

# Wrap aovFxn() so that a group whose model cannot be fitted yields NULL
# instead of aborting the whole pipeline.
paovFxn <- possibly(aovFxn, otherwise = NULL)

testData2 %>%
  nest(groupData = -groupVar) %>%
  mutate(df = purrr::map(groupData, paovFxn)) %>%
  # Groups whose result is NULL are dropped by unnest().
  unnest(df) %>%
  select(-groupData)
-output
A tibble: 3 × 6
groupVar term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 a (Intercept) 42.5 17.3 2.45 0.0367
2 a predVar1y 19.7 24.5 0.805 0.441
3 a predVar1z 2.25 24.5 0.0917 0.929
Another option is to create an if condition
# Only fit the model for groups that contain more than one predVar1 level;
# other groups get NULL and are kept as an all-NA row by keep_empty = TRUE.
testData2 %>%
  nest(groupData = -groupVar) %>%
  mutate(
    df = map(groupData, function(grp) {
      if (n_distinct(grp$predVar1) > 1) aovFxn(grp) else NULL
    })
  ) %>%
  unnest(df, keep_empty = TRUE) %>%
  select(-groupData)
-output
# A tibble: 4 × 6
groupVar term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 a (Intercept) 42.5 17.3 2.45 0.0367
2 a predVar1y 19.7 24.5 0.805 0.441
3 a predVar1z 2.25 24.5 0.0917 0.929
4 b <NA> NA NA NA NA
NOTE: If we don't use keep_empty = TRUE, it will be FALSE by default and the 'groupVar' 'b' row will not be there in the output
I am using a self-declared function that runs a regression analysis. I want to run this for thousands of companies over multiple years, so speed is essential. My function creates three outputs (a coefficient, the p-value and R-squared). The function runs fine individually; however, when I use mutate() to run it over the whole dataset, it gives the same values for all rows. The weirdest thing is that I can't reproduce those particular values by running the function individually. I made a reproducible example below. I have used lapply successfully before with this data, but I would like to keep it in mutate and, above all, I would like to know what exactly is happening here.
So my question is: how can I make this function work for each individual row for the companies dataset using mutate?
library(tidyverse)
# Five companies, and 100 individuals randomly assigned to them with a random
# age (18-67) and wage (1700-10000).
companies <- data.frame(comp_id = 1:5)
individuals <- data.frame(
  id = 1:100,
  comp_id = sample(1:5, 100, replace = TRUE),
  age = sample(18:67, 100, replace = TRUE),
  wage = sample(1700:10000, 100, replace = TRUE)
)
# Regress wage on age for the individuals whose comp_id equals `x`.
#
# x  company id to filter the global `individuals` data frame on.
#
# Returns a list with the age coefficient (coeff), its p-value (p) and the
# model R-squared (r2).
regger <- function(x) {
  df <- individuals %>% filter(comp_id == x)
  # Summarise the fit once instead of three times.
  fit_summary <- summary(lm(wage ~ age, data = df))
  list(
    # Full element name: `$coefficient` only worked through partial matching.
    coeff = fit_summary$coefficients[2, 1],
    p = fit_summary$coefficients[2, 4],
    r2 = fit_summary$r.squared
  )
}
# NOTE: regger() is called once here with the whole comp_id column rather than
# once per row, and the single result is recycled to every row.
companies %>%
mutate(data = list(regger(comp_id))) %>%
unnest_wider(data)
output:
# A tibble: 5 x 4
comp_id coeff p r2
<int> <dbl> <dbl> <dbl>
1 1 -4.92 0.916 0.000666
2 2 -4.92 0.916 0.000666
3 3 -4.92 0.916 0.000666
4 4 -4.92 0.916 0.000666
5 5 -4.92 0.916 0.000666
Use map from the purrr package if a function is not vectorized:
library(tidyverse)
# Fixed seed so the sampled example data is reproducible.
set.seed(1337)

companies <- data.frame(comp_id = seq_len(5))

individuals <- data.frame(
  id = seq_len(100),
  comp_id = sample(1:5, 100, replace = TRUE),
  age = sample(18:67, 100, replace = TRUE),
  wage = sample(1700:10000, 100, replace = TRUE)
)
# Regress wage on age for the individuals whose comp_id equals `x`.
#
# x  a single company id to filter the global `individuals` data frame on.
#
# Returns a list with the age coefficient (coeff), its p-value (p) and the
# model R-squared (r2).
regger <- function(x) {
  df <- individuals %>% filter(comp_id == x)
  # Summarise the fit once instead of three times.
  fit_summary <- summary(lm(wage ~ age, data = df))
  list(
    # Full element name: `$coefficient` only worked through partial matching.
    coeff = fit_summary$coefficients[2, 1],
    p = fit_summary$coefficients[2, 4],
    r2 = fit_summary$r.squared
  )
}
# map() calls regger() once per comp_id; each returned list becomes one
# element of the `data` list-column, which is then widened into columns.
companies %>%
  mutate(data = map(comp_id, regger)) %>%
  unnest_wider(data)
#> # A tibble: 5 x 4
#> comp_id coeff p r2
#> <int> <dbl> <dbl> <dbl>
#> 1 1 67.1 0.108 0.218
#> 2 2 23.7 0.466 0.0337
#> 3 3 31.2 0.292 0.0462
#> 4 4 18.4 0.582 0.0134
#> 5 5 0.407 0.994 0.00000371
Created on 2021-09-09 by the reprex package (v2.0.1)
I'm not sure what the output should look like, but could it be that you need to work on a row-by-row basis?
# rowwise() makes mutate() evaluate regger() separately for each row, so
# comp_id is a single value inside each call.
companies %>%
rowwise() %>%
mutate(data = list(regger(comp_id))) %>%
unnest_wider(data)
comp_id coeff p r2
<int> <dbl> <dbl> <dbl>
1 1 21.6 0.470 0.0264
2 2 13.5 0.782 0.00390
3 3 0.593 0.984 0.0000175
4 4 -9.33 0.824 0.00394
5 5 64.9 0.145 0.156
I have already asked a similar question here, and the answer is shown below. I wanted to aggregate my data frame by "number" and calculate a weighted mean. Now I would like to compute a weighted sum, but I cannot work out how to apply one to my data frame. The weighted.sum function no longer works in my R version.
# Example data: values y and z with a per-row weight, keyed by `number`.
df <- data.frame(
  number = c("a", "a", "a", "b", "c", "c"),
  y = c(1, 2, 3, 4, 1, 7),
  z = c(2, 2, 6, 8, 9, 1),
  weight = c(1, 1, 3, 1, 2, 1)
)

# Per `number`: mean, sd and weighted mean of y and z.
df %>%
  group_by(number) %>%
  summarise(
    across(
      c(y, z),
      list(
        mean = ~ mean(., na.rm = TRUE),
        sd = ~ sd(., na.rm = TRUE),
        weighted = ~ weighted.mean(., w = weight)
      )
    ),
    .groups = "drop"
  )
We could use
library(dplyr)
# Per `number`: mean, sd, weighted mean and weighted sum of y and z.
df %>%
  group_by(number) %>%
  summarise(
    across(
      c(y, z),
      list(
        mean = ~ mean(., na.rm = TRUE),
        sd = ~ sd(., na.rm = TRUE),
        weighted = ~ weighted.mean(., w = weight),
        # Weighted sum: each value times its weight, summed within the group.
        weightedsum = ~ sum(. * weight)
      )
    ),
    # `.groups` is an argument of summarise(); it previously sat inside
    # across(), where it was forwarded to the lambdas instead.
    .groups = "drop"
  )
# A tibble: 3 x 9
# number y_mean y_sd y_weighted y_weightedsum z_mean z_sd z_weighted z_weightedsum
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a 2 1 2.4 12 3.33 2.31 4.4 22
#2 b 4 NA 4 4 8 NA 8 8
#3 c 4 4.24 3 9 5 5.66 6.33 19
I am having some trouble with the new dplyr::summarise() function
Here is the data
# Ten rows in two groups of five, three random numeric columns and one
# character column that must stay character (not factor).
df <- data.frame(
  id = factor(1:10),
  group = factor(rep(letters[1:2], each = 5)),
  w1 = rnorm(10),
  w2 = rnorm(10),
  w3 = rnorm(10),
  dummy = LETTERS[1:10],
  stringsAsFactors = FALSE
)
Now I want to get means and standard deviations for the numeric variables only. So I ran the following code
# Group means and standard deviations of the numeric columns, plus a row
# count. As posted, this also yields the unwanted sd_mean_* columns shown in
# the output below.
df %>%
dplyr::select(id, group, w1:w3) %>%
group_by(group) %>%
dplyr::summarise(across(where(is.numeric), ~ mean(.x, na.rm = T), .names = "mean_{col}"),
# This second across() also matches the mean_* columns created just above.
across(where(is.numeric), ~ sd(.x, na.rm = T), .names = "sd_{col}"),
count = n())
Which gives me the following output
# A tibble: 2 x 11
# group mean_w1 mean_w2 mean_w3 sd_w1 sd_w2 sd_w3 sd_mean_w1 sd_mean_w2 sd_mean_w3 count
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
# a -0.399 0.152 -0.151 1.07 0.703 1.15 NA NA NA 5
# b 0.560 -0.107 -0.0439 1.18 0.612 0.862 NA NA NA 5
Now the columns starting with mean_ and sd_ are exactly what I want, but I'm also getting this set of sd_mean_ columns, I assume because it is trying to find the sd of the new mean_ columns.
How do I get the output without the superfluous columns?
The issue is that by the time the second across() runs, the number of numeric columns has increased, so it applies the sd function to the new columns as well. To avoid this, apply multiple functions in the same across() using list().
library(dplyr)
# One across() call with both functions, so sd is never applied to the newly
# created mean columns.
df %>%
  group_by(group) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE)
      ),
      .names = "{fn}_{col}"
    ),
    count = n()
  )
# group mean_w1 sd_w1 mean_w2 sd_w2 mean_w3 sd_w3 count
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#1 a 0.0746 0.696 0.760 1.39 0.0530 1.29 5
#2 b 0.522 0.686 0.0979 0.566 -0.0133 1.12 5
Also, your attempt would work as expected if you don't select columns by their type :
# Selecting w1:w3 by name keeps the second across() from picking up the
# mean_* columns created by the first.
df %>%
  group_by(group) %>%
  summarise(
    across(w1:w3, ~ mean(.x, na.rm = TRUE), .names = "mean_{col}"),
    across(w1:w3, ~ sd(.x, na.rm = TRUE), .names = "sd_{col}"),
    count = n()
  )
I have a very large data set in tibble form. I'd like to summarize the data using some functions which return lists. I'm interested in several components of the list, and I'd like to return each of the components I need into new tibble columns.
Here's an example
library(tibble)
library(dplyr)
# Create a data set of 1,000 random values in 100 subgroups with sample size 10
contrived_data <- tibble(
  subgroup = rep(1:100, each = 10),
  value = rnorm(1000, mean = 5, sd = 1)
)
# Run the KS test vs. normal distribution on each sample of size 10. Return the KS statistic and p-value
# into new tibble columns
# Per subgroup: mean, sd, and the KS statistic and p-value against N(5, 1).
# ks.test() is evaluated twice per group here, once for each component.
contrived_data %>%
  group_by(subgroup) %>%
  summarize(
    avg = mean(value),
    std_dev = sd(value),
    ks_stat = ks.test(value, "pnorm", mean = 5, sd = 1)$statistic,
    ks_pval = ks.test(value, "pnorm", mean = 5, sd = 1)$p.value
  )
Running it this way gets the results I want, but not very efficiently. Calling the ks.test function twice means the execution time is (almost) doubled. It seems there must be a more efficient way to extract these two list components with a single function call, but I don't know how to do that.
you can define the function and use map from purrr:
library(tibble)
library(dplyr)
library(purrr)
# Summarise one subgroup: its id, mean and sd of `value`, and the statistic
# and p-value of a single KS test of `value` against N(5, 1).
#
# DA  data frame with columns `subgroup` and `value`.
#
# Returns a data.frame with columns subgroup, avg, std_dev, ks_stat, ks_pval.
func <- function(DA) {
  values <- DA$value
  ks_result <- ks.test(values, "pnorm", mean = 5, sd = 1)
  data.frame(
    subgroup = unique(DA$subgroup),
    avg = mean(values),
    std_dev = sd(values),
    ks_stat = ks_result$statistic,
    ks_pval = ks_result$p.value
  )
}
# Split the data by subgroup and row-bind func()'s result for each piece.
map_dfr(split(contrived_data, contrived_data$subgroup), func)
The test can be run once and wrapped in a list and then use map (from purrr) to extract the values
library(purrr)
library(dplyr)
library(tidyr)
# Run ks.test() once per subgroup, keep the result in a list-column, then pull
# the statistic and p-value out of it into ordinary columns.
contrived_data %>%
  group_by(subgroup) %>%
  summarize(
    avg = mean(value),
    std_dev = sd(value),
    test = list(ks.test(value, "pnorm", mean = 5, sd = 1))
  ) %>%
  mutate(
    out = map(test, function(res) {
      tibble(ks_stat = res$statistic, ks_pval = res$p.value)
    })
  ) %>%
  unnest_wider(out) %>%
  select(-test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 4.52 0.675 0.375 0.0907
# 2 2 5.17 1.02 0.342 0.152
# 3 3 5.02 0.909 0.141 0.972
# 4 4 5.08 0.846 0.313 0.227
# 5 5 4.82 0.819 0.225 0.614
# 6 6 5.07 0.866 0.159 0.928
# 7 7 4.94 0.914 0.145 0.966
# 8 8 5.52 1.01 0.290 0.306
# 9 9 5.17 0.787 0.258 0.443
#10 10 4.61 1.15 0.476 0.0132
# … with 90 more rows
Another option is to tidy the output (with broom) and extract all the components at once
library(broom)
# tidy() turns the ks.test() result into a one-row data frame, which is stored
# in a list-column and then widened into regular columns.
contrived_data %>%
  group_by(subgroup) %>%
  summarize(
    avg = mean(value),
    std_dev = sd(value),
    out = list(tidy(ks.test(value, "pnorm", mean = 5, sd = 1)))
  ) %>%
  unnest_wider(out)
A dplyr solution using the rowwise command which performs the same task as map does.
# Store one ks.test() result per subgroup in a list-column, then read its
# components row by row.
contrived_data %>%
  group_by(subgroup) %>%
  summarise(
    avg = mean(value),
    std_dev = sd(value),
    ks_test = list(ks.test(value, "pnorm", mean = 5, sd = 1))
  ) %>%
  ungroup() %>%
  # rowwise() lets `ks_test` refer to the single test object of each row.
  rowwise() %>%
  mutate(
    ks_stat = ks_test[["statistic"]],
    ks_pval = ks_test[["p.value"]]
  ) %>%
  ungroup() %>%
  select(-ks_test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 5.10 1.24 0.186 0.819
# 2 2 4.86 0.805 0.231 0.584
# 3 3 5.24 0.729 0.258 0.445
# 4 4 5.16 0.642 0.307 0.247
# 5 5 4.63 0.752 0.393 0.0664
# Benchmark using rbenchmark:
# test replications elapsed relative user.self sys.self user.child sys.child
#2 nested 1000 10.58 1.000 10.58 0 NA NA
#1 original 1000 16.75 1.583 16.73 0 NA NA
You can use group_modify
library(tidyverse)
# group_modify() passes each subgroup's rows as .x; the test runs once per
# group and a one-row tibble of results is returned for it.
contrived_data %>%
  group_by(subgroup) %>%
  group_modify(~ {
    ks_result <- ks.test(.x$value, "pnorm", mean = 5, sd = 1)
    tibble(
      avg = mean(.x$value),
      std_dev = sd(.x$value),
      ks_stat = ks_result$statistic,
      ks_pval = ks_result$p.value
    )
  })
Or with data.table
library(data.table)
# Convert in place to a data.table, then compute all statistics per subgroup
# with a single ks.test() call inside `j`.
setDT(contrived_data)
contrived_data[
  ,
  {
    ks_result <- ks.test(value, "pnorm", mean = 5, sd = 1)
    list(
      avg = mean(value),
      std_dev = sd(value),
      ks_stat = ks_result$statistic,
      ks_pval = ks_result$p.value
    )
  },
  by = subgroup
]