I have the following DF:
structure(list(AgeGroup = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Young",
"Old"), class = "factor"), variable = structure(c(1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L,
10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L,
23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L,
30L, 30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L,
36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L,
43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L,
49L, 50L, 50L), .Label = c("Point.1", "Point.2", "Point.3", "Point.4",
"Point.5", "Point.6", "Point.7", "Point.8", "Point.9", "Point.10",
"Point.11", "Point.12", "Point.13", "Point.14", "Point.15", "Point.16",
"Point.17", "Point.18", "Point.19", "Point.20", "Point.21", "Point.22",
"Point.23", "Point.24", "Point.25", "Point.26", "Point.27", "Point.28",
"Point.29", "Point.30", "Point.31", "Point.32", "Point.33", "Point.34",
"Point.35", "Point.36", "Point.37", "Point.38", "Point.39", "Point.40",
"Point.41", "Point.42", "Point.43", "Point.44", "Point.45", "Point.46",
"Point.47", "Point.48", "Point.49", "Point.50"), class = "factor"),
value = c(0.714518666666667, 0.723876630952381, 0.728961368421053,
0.735228897233202, 0.701283807017544, 0.71396457312253, 0.663229964912281,
0.68923661660079, 0.613014666666667, 0.652671079051383, 0.547104,
0.602951166007905, 0.504106245614035, 0.558832648221344,
0.487034052631579, 0.515752438735178, 0.451825245614035,
0.476300007905138, 0.442370175438596, 0.441173656126482,
0.438668315789474, 0.435859173913043, 0.450059526315789,
0.434047494071146, 0.478947649122807, 0.450561841897233,
0.481134438596491, 0.461228027667984, 0.446763543859649,
0.451031316205534, 0.396206754385965, 0.406836889328063,
0.357049368421053, 0.368716249011858, 0.343943631578947,
0.368048932806324, 0.376060403508772, 0.398834193675889,
0.413613877192982, 0.434683889328063, 0.434964894736842,
0.448746023715415, 0.451208631578947, 0.450663276679842,
0.470569192982456, 0.473143399209486, 0.515300736842105,
0.502499193675889, 0.543379719298246, 0.507495533596838,
0.550050701754386, 0.498506288537549, 0.541725807017544,
0.482379664031621, 0.517293315789474, 0.458068636363636,
0.485205245614035, 0.423109671936759, 0.438844403508772,
0.385925747035573, 0.39522349122807, 0.362403612648221, 0.374209192982456,
0.350889750988142, 0.354036315789474, 0.336213118577075,
0.340668122807018, 0.327800648221344, 0.326388666666667,
0.322577146245059, 0.328114842105263, 0.319440624505929,
0.342721666666667, 0.323974818181818, 0.357620473684211,
0.335501339920949, 0.372856842105263, 0.343831292490119,
0.377362315789474, 0.361571442687747, 0.393890736842105,
0.377489727272727, 0.419330684210526, 0.38274228458498, 0.419797666666667,
0.387899881422925, 0.423127684210526, 0.385955055335968,
0.42140750877193, 0.377730351778656, 0.403711631578947, 0.366319122529644,
0.390753140350877, 0.355189754940711, 0.373226596491228,
0.347452173913044, 0.348689877192982, 0.340376324110672,
0.329466947368421, 0.344867375494071)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
which a subset looks like:
A tibble: 100 x 3
AgeGroup variable value
<fct> <fct> <dbl>
1 Young Point.1 0.715
2 Old Point.1 0.724
3 Young Point.2 0.729
4 Old Point.2 0.735
5 Young Point.3 0.701
6 Old Point.3 0.714
7 Young Point.4 0.663
8 Old Point.4 0.689
9 Young Point.5 0.613
10 Old Point.5 0.653
I have an output using:
Cho_D <- DF %>%
rstatix::cohens_d(value ~ variable, var.equal = TRUE)
But this provides me with a lot of unnecessary calculations like Point.1 and Point.3, Point.1 and Point.4, etc.
I would like to calculate Cohen's D for each successive points. So for example:
Point.1:Point.2, Point.2:Point.3, etc. The end goal is to plot D values on the Y-axis and Points 1 through 50 on the X-axis.
In base R you can accomplish the same using embed + apply:
L <- split(DF$value, DF$variable)
mat <- embed(names(L), 2)[,2:1]
res <- apply(mat, 1, function(x) rstatix::cohens_d(stack(L[x]), values~ind))
do.call(rbind, res)
A tibble: 49 x 7
.y. group1 group2 effsize n1 n2 magnitude
<chr> <chr> <chr> <dbl> <int> <int> <ord>
1 values Point.1 Point.2 -2.29 2 2 large
2 values Point.2 Point.3 3.46 2 2 large
3 values Point.3 Point.4 2.17 2 2 large
4 values Point.4 Point.5 1.83 2 2 large
5 values Point.5 Point.6 1.69 2 2 large
6 values Point.6 Point.7 1.11 2 2 large
7 values Point.7 Point.8 0.973 2 2 large
8 values Point.8 Point.9 1.98 2 2 large
9 values Point.9 Point.10 1.82 2 2 large
10 values Point.10 Point.11 2.95 2 2 large
# ... with 39 more rows
if you can use the effsize::cohen.d then this function will be faster than all the options given so far:
my_cohen <- function(data){
L <- split(data$value, data$variable)
mat <- embed(names(L), 2)
res <- apply(mat, 1, function(x)
effsize::cohen.d(L[[x[2]]], L[[x[1]]])$estimate)
data.frame(mat, res)
}
my_cohen(DF)
X1 X2 res
1 Point.2 Point.1 -2.29025540
2 Point.3 Point.2 3.45998958
3 Point.4 Point.3 2.16986489
4 Point.5 Point.4 1.82991671
5 Point.6 Point.5 1.68816593
6 Point.7 Point.6 1.11414226
We could nest the 'value', get the lead of the list column, and apply cohen.d by looping over the two list
library(dplyr)
library(effsize)
library(purrr)
out <- DF %>%
select(-AgeGroup) %>%
nest(data = value) %>%
mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
slice(-n()) %>%
mutate(cohen_d = map2_dbl(data, data_lead,
~ cohen.d(.x$value, .y$value)$estimate))
-output
head(out, 2)
# A tibble: 2 x 5
variable data variable_lead data_lead cohen_d
<fct> <list> <fct> <list> <dbl>
1 Point.1 <tibble [2 × 1]> Point.2 <tibble [2 × 1]> -2.29
2 Point.2 <tibble [2 × 1]> Point.3 <tibble [2 × 1]> 3.46
comparing with OP's filtered output
Cho_D %>%
slice(seq(1, n(), by = 49)) %>%
slice_head(n = 2)
# A tibble: 2 x 7
# .y. group1 group2 effsize n1 n2 magnitude
# <chr> <chr> <chr> <dbl> <int> <int> <ord>
#1 value Point.1 Point.2 -2.29 2 2 large
#2 value Point.2 Point.3 3.46 2 2 large
Benchmarks
With the number of comparisons reduced, the timings are below
system.time({Cho_D <- DF %>%
rstatix::cohens_d(value ~ variable, var.equal = TRUE)
})
# user system elapsed
# 16.316 0.060 16.330
system.time({out <- DF %>%
select(-AgeGroup) %>%
nest(data = value) %>%
mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
slice(-n()) %>%
mutate(cohen_d = map2_dbl(data, data_lead,
~ cohen.d(.x$value, .y$value)$estimate))})
# user system elapsed
# 0.031 0.005 0.037
Related
I'm trying to extract the values for a dataset smoothed by group (like how I would get them with geom_smooth(), but I need to be able to plot the actual fitted values). I tried all the solutions presented in this thread loess regression on each group with dplyr::group_by() but I keep getting the following errors for all of them
From the 'neat tidyverse way' answer:
library(dplyr)
library(tidyr)
library(purrr)
models <- test %>%
tidyr::nest(-id) %>%
dplyr::mutate(
# Perform loess calculation on each CpG group
m = purrr::map(data, loess,
formula = y ~ x, span = .5),
# Retrieve the fitted values from each model
fitted = purrr::map(m, `[[`, "fitted")
)
Warning message:
All elements of `...` must be named.
Did you want `data = c(x, y)`?
From the accepted answer:
test <- test %>%
group_by(id) %>%
arrange(id, x) %>%
mutate(Loess = predict(loess(y ~ x, span = .5, data=.),
data.frame(x = seq(min(x), max(x), 1))))
Error: Problem with `mutate()` input `Loess`.
x Input `Loess` can't be recycled to size 60.
ℹ Input `Loess` is `predict(...)`.
ℹ Input `Loess` must be size 60 or 1, not 3.
ℹ The error occurred in group 2: id = 2.
From the comments
test2 <- test %>% nest(-id) %>%
mutate(fit = map(data, ~ loess(y ~ x, .))) %>%
unnest(map2(fit, data, augment))
Error: object 'fit' not found
Run `rlang::last_error()` to see where the error occurred.
In addition: Warning message:
All elements of `...` must be named.
Did you want `data = c(x, y)`?
I'm not very experienced with the tidyverse, so I'm not completely sure what I'm doing differently compared to those examples. Any help is greatly appreciated.
#data
test<-structure(list(x = c(-8.09566199976713, -8.06147618447585, -8.00975809213924,
-7.96200959687978, -7.91437069721383, -7.89794310600054, -7.90270753398918,
-7.90758707202371, -7.91285423074951, -7.92478106752289, -7.93691825045689,
-8.01874278066444, -7.963527427585, -7.93407938329611, -7.94323616248026,
-7.94085332876942, -7.93203791319322, -7.94357702787401, -7.91377080575536,
-7.88252433889846, -7.85499829071686, -7.86566601358475, -7.91275434062848,
-7.96059415201768, -8.00264819208551, -8.02577207019623, -7.95146178490521,
-7.86605403953235, -7.78476412041177, -7.88927033537369, -7.8876796916652,
-7.95187190641719, -7.98401088593829, -8.00012692860024, -8.01433523159787,
-8.00807092407049, -8.00178416213873, -8.00011043534128, -8.02862971973644,
-8.14101616921556, -8.05197547777147, -8.06579012466988, -8.07977339594891,
-9.91048749031936, -9.66560592747977, -10.6156432290275, -10.8015737085674,
-11.1501986169795, -11.7159178445694, -11.852600007071, -11.9500158919174,
-11.8099796968354, -11.5041369336859, -11.2139686794437, -10.9382871968791,
-10.6564566029614, -9.79377558892992, -9.09630034541423, -10.9883787360288,
-11.2269861445731, -11.4824854018762, -11.6079686487163, -10.428108852748,
-9.6062104865303, -9.46949542587671, -9.85124539788883, -10.276205029234,
-10.4776426742835, -10.6400079178346, -10.4147379554317, -10.2038339860429,
-10.0934609736433, -10.0824486553681, -9.96530249613719, -9.94853909056431,
-9.93151185565445, -9.91417478382807, -9.90180880165919, -9.91333691008172,
-10.0373498803589, -10.1768025338422, -10.320572551138, -10.4688466820861,
-10.6218220581835, -10.6343234090347, -10.3264187862842, -9.98775786739598,
-9.58260323586509, -9.42043071710701, -9.43924036934577, -9.52799164804214,
-9.62866487252844, -9.72887391315455, -9.81243902432872, -9.70821477146116,
-10.349558979838, -10.1458405548694, -9.99415308822563, -9.84536020757233,
-9.79599547035806, -9.75295135996101, -9.79904377903862, -9.25340041876441,
-15.3471460066526, -15.8962098900031, -16.4807765780841, -16.3237393813877,
-17.3021769512891, -16.3800352448021, -15.8070127149332, -15.3891912546897,
-14.9758961247955, -14.3049193201036, -13.6951530131572, -13.6426403073008,
-13.7192755520964, -13.317953401665, -12.9213325990039, -12.4594681682978,
-12.0191583645286, -11.8053727831142, -11.6872352172752, -11.5737935404884,
-11.4107390421452, -11.1892161252634, -10.993043829208, -10.9330292789471,
-10.8749587447015, -10.839113401307, -10.812108561392, -10.9713463919414,
-11.1680590680373, -11.2539207698049, -11.3405390697112, -11.4280103071663,
-11.5164358909023, -11.6059223744961, -11.6965817015189, -11.6020622433579,
-11.5436710566386, -11.5081631613201, -11.6534368262729, -11.808341217397,
-11.9736843692008, -12.1503511448775), y = c(0.657500808303136,
0.738124328538192, 0.846237100479524, 0.905780104908876, 0.964065611141255,
0.97130448683885, 0.961952809454258, 0.952520035393847, 0.942792088995166,
0.927852423620437, 0.912757550459065, 0.759400682943845, 0.816109383509902,
0.87020033850233, 0.915815641213451, 0.967873386043728, 1.02174887297134,
1.05888614752619, 1.09417181629346, 1.12927656453054, 1.15845597740891,
1.14281223660865, 1.09951983390484, 1.05556252495082, 1.02150454110485,
1.00407662866727, 1.0383078668771, 1.07743614470557, 1.1142850320069,
0.849172703363773, 0.928570803393031, 1.03718890041412, 1.04033830574734,
1.03347118859473, 1.02808408212257, 1.03805023835948, 1.04800826979005,
1.05065203275594, 1.00517755278862, 0.816786352254452, 0.987099861643988,
0.986294681427012, 0.985411697682411, 6.30490194156339, 6.23591979381374,
6.54324631274467, 6.69393500338639, 6.89041829514468, 7.17507154906183,
7.18466565037316, 7.17069075506272, 7.09749953887751, 6.99963625007418,
6.90682321485407, 6.818675464992, 6.72319228655747, 6.35779144412249,
6.07675755691038, 6.71116104273407, 6.84049060491686, 6.97924607466303,
7.05864889503111, 6.59454969605345, 6.27016677264436, 6.19033669905758,
6.37773429443436, 6.58723774958637, 6.68536624180541, 6.76713349215161,
6.63576249672338, 6.51308895413708, 6.43262161144123, 6.42705851495013,
6.37935821335848, 6.37095186987229, 6.36245251810731, 6.35384217814464,
6.34823344504427, 6.3568189796202, 6.40093531258066, 6.45018168045593,
6.50106303671575, 6.55364891437324, 6.60801264620102, 6.61714449186606,
6.55356637339973, 6.47759629080218, 6.34852444038794, 6.26379776807968,
6.23209002392533, 6.22130259164532, 6.21289851462945, 6.20309983882547,
6.18744917387348, 6.10894055296196, 6.25148917581075, 6.17293529625129,
6.15075529839679, 6.12848960212141, 6.1326318372733, 6.13833262010545,
6.16395466665727, 6.05567879845981, 6.25186104520219, 6.44281768293323,
6.65324901403219, 6.72950091493087, 7.03322640156021, 6.88309899890625,
6.70207702789022, 6.54079770390583, 6.38549695194146, 6.18839711001894,
6.01102800783953, 5.87133691513539, 5.77306022102593, 5.67740669749789,
5.58788061886448, 5.50325665174288, 5.42363100637011, 5.35172803418452,
5.28384694735869, 5.21822380115916, 5.14919582516762, 5.07712142295026,
5.0106828132089, 4.95786444890208, 4.90608940527112, 4.85658988565748,
4.80812641990368, 4.77955320604372, 4.75486746882052, 4.73784287455848,
4.72038983462283, 4.70251299307059, 4.68421637386314, 4.66550330220186,
4.64637629394034, 4.64370313837652, 4.65439969630172, 4.67177327419859,
4.74790550644152, 4.82753161090928, 4.91100496765981, 4.9987159528997
), id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L,
22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L,
35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L,
48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L,
61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L,
74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L,
87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L,
100L, 101L, 102L, 103L, 131L, 132L, 133L, 134L, 135L, 136L, 137L,
138L, 139L, 140L, 141L, 142L, 143L, 144L, 145L, 146L, 147L, 148L,
149L, 150L, 151L, 152L, 153L, 154L, 155L, 156L, 157L, 158L, 159L,
160L, 161L, 162L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L,
171L, 172L), class = "data.frame")
First group by id, then fit a model to each group's data and use broom::augment to extract the fitted values and, as a bonus, the residuals. This preserves the inputs x and y; it will be easier to plot x vs .fitted for example.
library("tidyverse")
test %>%
group_by(id) %>%
group_modify(
# .x refers to the subset of rows that belong to a group.
# It's a smaller data frame with the same columns as the input
# but fewer rows.
~ loess(y ~ x, span = .5, data = .x) %>% broom::augment()
)
#> # A tibble: 145 × 5
#> # Groups: id [3]
#> id y x .fitted .resid
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.658 -8.10 0.834 -0.176
#> 2 1 0.738 -8.06 0.897 -0.159
#> 3 1 0.846 -8.01 0.989 -0.142
#> 4 1 0.906 -7.96 0.975 -0.0688
#> 5 1 0.964 -7.91 0.982 -0.0180
#> 6 1 0.971 -7.90 0.983 -0.0121
#> 7 1 0.962 -7.90 0.980 -0.0179
#> 8 1 0.953 -7.91 0.981 -0.0289
#> 9 1 0.943 -7.91 0.983 -0.0404
#> 10 1 0.928 -7.92 0.977 -0.0487
#> # … with 135 more rows
The original data frame might have more columns than x and y that are not used in the loess fit. To keep those columns, pass the group data .x to the augment function as well.
test %>%
mutate(
# Extra columns that we don't need for the loess fit but we want to keep.
z = rnorm(n()),
w = rnorm(n())
) %>%
group_by(id) %>%
group_modify(
# Now `broom::augment` appends .fitted and .resid to the original columns.
~ loess(y ~ x, span = .5, data = .x) %>% broom::augment(.x)
)
#> # A tibble: 145 × 7
#> # Groups: id [3]
#> id x y z w .fitted .resid
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -8.10 0.658 1.31 -0.199 0.834 -0.176
#> 2 1 -8.06 0.738 0.395 -1.74 0.897 -0.159
#> 3 1 -8.01 0.846 -1.34 0.382 0.989 -0.142
#> 4 1 -7.96 0.906 0.376 0.231 0.975 -0.0688
#> 5 1 -7.91 0.964 -0.224 0.327 0.982 -0.0180
#> 6 1 -7.90 0.971 0.678 -0.504 0.983 -0.0121
#> 7 1 -7.90 0.962 0.736 -0.0186 0.980 -0.0179
#> 8 1 -7.91 0.953 0.368 0.0313 0.981 -0.0289
#> 9 1 -7.91 0.943 -1.35 1.02 0.983 -0.0404
#> 10 1 -7.92 0.928 0.280 -0.471 0.977 -0.0487
#> # … with 135 more rows
Created on 2022-03-08 by the reprex package (v2.0.1)
The input data in predict is going to be of different length than the original data I would suggest to save the output in list and then use unnest to get one long dataframe.
library(dplyr)
library(tidyr)
test %>%
arrange(id, x) %>%
group_by(id) %>%
summarise(Loess = list(predict(loess(y ~ x, span = .5, data=cur_data()),
data.frame(x = seq(min(x), max(x), 1))))) %>%
unnest(Loess)
# id Loess
# <int> <dbl>
# 1 1 0.784
# 2 2 7.20
# 3 2 6.78
# 4 2 6.34
# 5 3 7.01
# 6 3 6.73
# 7 3 6.43
# 8 3 6.11
# 9 3 5.73
#10 3 5.31
#11 3 4.82
[![enter image description here][2]][2][![i need help in writing gstummary r code to produce following table output.dummy table shown in above table][2]][2]
i need help in writing gstummary r code to produce following table output.dummy table shown in above table
[![enter image description here][2]][2]
library(gtsummary)
[![enter image description here][2]][2]
[![enter image description here][3]][3]
id
age
sex
country
edu
ln
ivds
n2
p5
1
a
M
eng
x
45
15
40
15
2
a
M
eng
x
23
26
70
15
4
a
M
eng
x
26
36
35
40
5
b
F
eng
x
26
25
36
47
6
b
F
wal
y
45
45
60
12
7
b
M
wal
y
60
25
36
15
8
c
M
wal
y
70
08
25
36
9
c
F
sco
z
80
25
36
15
10
c
F
sco
z
90
25
26
39
structure(list(id = 1:15, age = structure(c(1L, 1L, 2L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 2L), .Label = c("a", "b",
"c"), class = "factor"), sex = structure(c(2L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("F", "M"), class = "factor"),
country = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 3L), .Label = c("eng", "scot", "wale"
), class = "factor"), edu = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L), .Label = c("x",
"y", "z"), class = "factor"), lon = c(45L, 23L,
25L, 45L, 70L, 69L, 90L, 50L, 62L, 45L, 23L, 25L, 45L, 70L,
69L), is = c(15L, 26L, 36L, 34L, 2L, 4L, 5L, 8L, 9L,
15L, 26L, 36L, 34L, 2L, 4L), n2 = c(40L, 70L, 50L, 60L,
30L, 25L, 80L, 89L, 10L, 40L, 70L, 50L, 60L, 30L, 25L), p5 = c(15L,
20L, 36L, 48L, 25L, 36L, 28L, 15L, 25L, 15L, 20L, 36L, 48L,
25L, 36L)), row.names = c(NA, 15L), class = "data.frame")
[
I made a table similar to what you have above (more similar to the table you had before you updated it). But I think it'll get you most of the way there.
The type of table you're requesting it something that is in the works. In the meantime, you will need to use the bstfun::tbl_2way_summary() function. This function exists in another package while we work to make it better before integrating with gtsummary.
library(bstfun) # install with `remotes::install_github("ddsjoberg/bstfun")`
library(gtsummary)
packageVersion("gtsummary")
#> [1] '1.4.1'
# add a column that is all the same value
trial2 <- trial %>% mutate(constant = TRUE)
# loop over each continuous variable, construct table, then merge them together
tbls_row1 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = grade, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
modify_spanning_header(everything() ~ NA)
# repeat for the second row
tbls_row2 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = stage, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
modify_spanning_header(everything() ~ NA)
# stack these tables
tbl_stacked <- tbl_stack(list(tbls_row1, tbls_row2))
# lastly, add calculated summary stats for categorical variables, and merge them
tbl_summary_stats <-
trial2 %>%
tbl_summary(
include = c(grade, stage),
missing = "no"
) %>%
modify_header(stat_0 ~ "**n (%)**") %>%
modify_footnote(everything() ~ NA)
tbl_final <-
tbl_merge(list(tbl_summary_stats, tbl_stacked)) %>%
modify_spanning_header(everything() ~ NA) %>%
# column spanning column headers
modify_spanning_header(
list(c(stat_1_1_2, stat_1_2_2) ~ "**Group 1**",
stat_1_3_2 ~ "**Group 2**")
)
Created on 2021-07-10 by the reprex package (v2.0.0)
I have data frame like this dummy sample, my real dataset had 56 variables.
I would like to drop the date and aggregate by id and sum last 4 total variables while keep the other unchanged.
df <- data.frame(stringsAsFactors=FALSE,
date = c("2019-02-10", "2019-02-10", "2019-02-11", "2019-02-11",
"2019-02-12", "2019-02-12", "2019-02-13", "2019-02-13",
"2019-02-14", "2019-02-14"),
id = c("18100410-aa", "18101080-ae", "18100410-aa", "18101080-ae",
"18100410-aa", "18101080-ae", "18100410-aa", "18101080-ae",
"18100410-aa", "18101080-ae"),
f_type = c(4L, 2L, 4L, 2L, 4L, 2L, 4L, 2L, 4L, 2L),
reg = c(6L, 7L, 6L, 7L, 6L, 7L, 6L, 7L, 6L, 7L),
hh_p10 = c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L),
internet = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L),
youngest = c(5L, 7L, 5L, 7L, 5L, 7L, 5L, 7L, 5L, 7L),
a_group = c(3L, 6L, 3L, 6L, 3L, 6L, 3L, 6L, 3L, 6L),
total_prd = c(130L, 337L, 374L, 261L, 106L, 230L, 150L, 36L, 15L, 123L),
B_totalprod = c(20L, 0L, 256L, 0L, 32L, 0L, 0L, 36L, 0L, 45L),
p_totalprod = c(0L, 81L, 11L, 260L, 26L, 230L, 0L, 0L, 15L, 0L),
n_totalprod = c(110L, 256L, 107L, 1L, 48L, 0L, 150L, 0L, 0L, 78L)
)
I found this solution from plyr package here it is working but I need to specify all my 52 unaffected variables. I am just wondering is there any other way to do this task?
library(plyr)
ddply(df,.(id,f_type, reg, internet,hh_p10 ,youngest, a_group ),summarise,total_prd = sum(total_prd) ,
B_totalprod = sum(B_totalprod) , p_totalprod = sum(p_totalprod) ,
n_totalprod = sum(n_totalprod))
If your real dataset also has columns that contain "total" this should work:
library(tidyverse)
df %>%
select(-date) %>%
group_by(.dots = str_subset(names(.), "total", negate = TRUE)) %>%
summarise_all(list(sum = sum))
# A tibble: 2 x 11
# Groups: id, f_type, reg, hh_p10, internet, youngest [2]
id f_type reg hh_p10 internet youngest a_group total_prd_sum B_totalprod_sum p_totalprod_sum n_totalprod_sum
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 18100410-aa 4 6 2 1 5 3 775 308 52 415
2 18101080-ae 2 7 1 2 7 6 987 81 571 335
The line group_by(.dots = str_subset(names(.), "total", negate = TRUE)) means we are going to group by all the column names in our this dataset that do not contain the word "total".
I have a data frame, call it d, containing one continuous variable, and two categorical (0/1) variables.
Here is an example
structure(list(s = c(35.33, 39.51, 42.35, 42.35, 43.62, 43.77, 44.28, 44.32,44.74, 44.81, 47.71, 48.05, 48.13, 48.75, 49.4,49.44, 49.98, 50.27, 50.33, 50.54, 50.97, 51.2, 51.67, 51.94, 52.05, 52.7, 52.74, 52.82, 52.92, 54.17, 54.38, 54.57, 54.71, 55.53, 55.71, 56.11, 56.24, 56.29, 56.53, 57.16, 57.53, 58.04, 58.6, 58.8, 59.01, 59.26, 59.48, 59.61, 59.98, 60.54, 60.85, 61.89,62.01, 62.8, 63.22, 63.38, 63.78, 63.95, 67.08, 67.24, 67.54, 68.69, 70.16, 70.59, 72.15, 72.87, 76.69), age = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L), .Label = c(">=30", "<30"), class = "factor"), sex = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L), .Label = c("Men", "Women"), class = "factor")), .Names = c("s", "age", "sex"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L), class = "data.frame")
I would like to create 4 data frames containing the same continuous variable, one for each possible combination of the categorical variables: 00, 01, 10, 11. How can I do that in R ?
You can do it using the split() function:
# Create a list holding the four dataframes
list.of.dfs <- split(df, paste(df$age, df$sex, sep="_"))
# check the result
lapply(list.of.dfs, head)
#> $`<30_Men`
#> s age sex
#> 10 44.74 <30 Men
#> 12 47.71 <30 Men
#> 16 49.40 <30 Men
#> 18 49.98 <30 Men
#> 19 50.27 <30 Men
#> 20 50.33 <30 Men
#>
#> $`<30_Women`
#> s age sex
#> 4 42.35 <30 Women
#> 5 42.35 <30 Women
#> 6 43.62 <30 Women
#> 7 43.77 <30 Women
#> 8 44.28 <30 Women
#> 11 44.81 <30 Women
#>
#> $`>=30_Men`
#> s age sex
#> 15 48.75 >=30 Men
#> 25 51.94 >=30 Men
#> 27 52.70 >=30 Men
#> 37 56.11 >=30 Men
#> 38 56.24 >=30 Men
#> 40 56.53 >=30 Men
#>
#> $`>=30_Women`
#> s age sex
#> 1 35.33 >=30 Women
#> 3 39.51 >=30 Women
#> 9 44.32 >=30 Women
#> 14 48.13 >=30 Women
#> 21 50.54 >=30 Women
#> 30 52.92 >=30 Women
suppose I have two boxplots.
trial1 <- ggplot(completionTime, aes(fill=Condition, x=Scenario, y=Trial1))
trial1 + geom_boxplot()+geom_point(position=position_dodge(width=0.75)) + ylim(0, 160)
trial2 <- ggplot(completionTime, aes(fill=Condition, x=Scenario, y=Trial2))
trial2 + geom_boxplot()+geom_point(position=position_dodge(width=0.75)) + ylim(0, 160)
How can I plot trial 1 and trial 2 on the same plot and same respective X? they have the same range of y.
I looked at geom_boxplot(position="identity"), but that plots the two conditions(fill) on the same X.
I want to plot two y column on the same X.
Edit: the dataset
User Condition Scenario Trial1 Trial2
1 1 ME a 67 41
2 1 ME b 70 42
3 1 ME c 40 15
4 1 ME d 65 23
5 1 ME e 45 45
6 1 SE a 100 34
7 1 SE b 54 23
8 1 SE c 70 23
9 1 SE d 56 15
10 1 SE e 30 20
11 2 ME a 42 23
12 2 ME b 22 12
13 2 ME c 28 8
14 2 ME d 22 8
15 2 ME e 38 37
16 2 SE a 59 18
17 2 SE b 65 14
18 2 SE c 75 7
19 2 SE d 37 9
20 2 SE e 31 7
dput()
structure(list(User = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Condition = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L), .Label = c("ME", "SE"), class = "factor"), Scenario =
structure(c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L), .Label = c("a", "b", "c", "d", "e"), class = "factor"),
Trial1 = c(67L, 70L, 40L, 65L, 45L, 100L, 54L, 70L, 56L,
30L, 42L, 22L, 28L, 22L, 38L, 59L, 65L, 75L, 37L, 31L), Trial2 = c(41L,
42L, 15L, 23L, 45L, 34L, 23L, 23L, 15L, 20L, 23L, 12L, 8L,
8L, 37L, 18L, 14L, 7L, 9L, 7L)), .Names = c("User", "Condition",
"Scenario", "Trial1", "Trial2"), class = "data.frame", row.names = c(NA,
-20L))
You could try using interaction to combine two of your factors and plot against a third. For example, assuming you want to fill by condition as in your original code:
library(tidyr)
completionTime %>%
gather(trial, value, -Scenario, -Condition, -User) %>%
ggplot(aes(interaction(Scenario, trial), value)) + geom_boxplot(aes(fill = Condition))
Result: