Related
I have the following DF:
structure(list(AgeGroup = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Young",
"Old"), class = "factor"), variable = structure(c(1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L,
10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L,
23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L,
30L, 30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L,
36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L,
43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L,
49L, 50L, 50L), .Label = c("Point.1", "Point.2", "Point.3", "Point.4",
"Point.5", "Point.6", "Point.7", "Point.8", "Point.9", "Point.10",
"Point.11", "Point.12", "Point.13", "Point.14", "Point.15", "Point.16",
"Point.17", "Point.18", "Point.19", "Point.20", "Point.21", "Point.22",
"Point.23", "Point.24", "Point.25", "Point.26", "Point.27", "Point.28",
"Point.29", "Point.30", "Point.31", "Point.32", "Point.33", "Point.34",
"Point.35", "Point.36", "Point.37", "Point.38", "Point.39", "Point.40",
"Point.41", "Point.42", "Point.43", "Point.44", "Point.45", "Point.46",
"Point.47", "Point.48", "Point.49", "Point.50"), class = "factor"),
value = c(0.714518666666667, 0.723876630952381, 0.728961368421053,
0.735228897233202, 0.701283807017544, 0.71396457312253, 0.663229964912281,
0.68923661660079, 0.613014666666667, 0.652671079051383, 0.547104,
0.602951166007905, 0.504106245614035, 0.558832648221344,
0.487034052631579, 0.515752438735178, 0.451825245614035,
0.476300007905138, 0.442370175438596, 0.441173656126482,
0.438668315789474, 0.435859173913043, 0.450059526315789,
0.434047494071146, 0.478947649122807, 0.450561841897233,
0.481134438596491, 0.461228027667984, 0.446763543859649,
0.451031316205534, 0.396206754385965, 0.406836889328063,
0.357049368421053, 0.368716249011858, 0.343943631578947,
0.368048932806324, 0.376060403508772, 0.398834193675889,
0.413613877192982, 0.434683889328063, 0.434964894736842,
0.448746023715415, 0.451208631578947, 0.450663276679842,
0.470569192982456, 0.473143399209486, 0.515300736842105,
0.502499193675889, 0.543379719298246, 0.507495533596838,
0.550050701754386, 0.498506288537549, 0.541725807017544,
0.482379664031621, 0.517293315789474, 0.458068636363636,
0.485205245614035, 0.423109671936759, 0.438844403508772,
0.385925747035573, 0.39522349122807, 0.362403612648221, 0.374209192982456,
0.350889750988142, 0.354036315789474, 0.336213118577075,
0.340668122807018, 0.327800648221344, 0.326388666666667,
0.322577146245059, 0.328114842105263, 0.319440624505929,
0.342721666666667, 0.323974818181818, 0.357620473684211,
0.335501339920949, 0.372856842105263, 0.343831292490119,
0.377362315789474, 0.361571442687747, 0.393890736842105,
0.377489727272727, 0.419330684210526, 0.38274228458498, 0.419797666666667,
0.387899881422925, 0.423127684210526, 0.385955055335968,
0.42140750877193, 0.377730351778656, 0.403711631578947, 0.366319122529644,
0.390753140350877, 0.355189754940711, 0.373226596491228,
0.347452173913044, 0.348689877192982, 0.340376324110672,
0.329466947368421, 0.344867375494071)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
which a subset looks like:
A tibble: 100 x 3
AgeGroup variable value
<fct> <fct> <dbl>
1 Young Point.1 0.715
2 Old Point.1 0.724
3 Young Point.2 0.729
4 Old Point.2 0.735
5 Young Point.3 0.701
6 Old Point.3 0.714
7 Young Point.4 0.663
8 Old Point.4 0.689
9 Young Point.5 0.613
10 Old Point.5 0.653
I have an output using:
# Cohen's d for every pair of `variable` levels (pooled variance).
Cho_D <- rstatix::cohens_d(
  DF,
  value ~ variable,
  var.equal = TRUE
)
But this provides me with a lot of unnecessary calculations like Point.1 and Point.3, Point.1 and Point.4, etc.
I would like to calculate Cohen's D only for each pair of successive points. For example:
Point.1:Point.2, Point.2:Point.3, etc. The end goal is to plot D values on the Y-axis and Points 1 through 50 on the X-axis.
In base R you can accomplish the same using embed + apply:
# Values grouped by point name.
L <- split(DF$value, DF$variable)
# Two-column matrix of neighbouring names: (earlier point, later point).
mat <- cbind(head(names(L), -1), tail(names(L), -1))
# One effect-size table per neighbouring pair.
res <- lapply(seq_len(nrow(mat)), function(i) {
  pair <- mat[i, ]
  rstatix::cohens_d(stack(L[pair]), values ~ ind)
})
do.call(rbind, res)
A tibble: 49 x 7
.y. group1 group2 effsize n1 n2 magnitude
<chr> <chr> <chr> <dbl> <int> <int> <ord>
1 values Point.1 Point.2 -2.29 2 2 large
2 values Point.2 Point.3 3.46 2 2 large
3 values Point.3 Point.4 2.17 2 2 large
4 values Point.4 Point.5 1.83 2 2 large
5 values Point.5 Point.6 1.69 2 2 large
6 values Point.6 Point.7 1.11 2 2 large
7 values Point.7 Point.8 0.973 2 2 large
8 values Point.8 Point.9 1.98 2 2 large
9 values Point.9 Point.10 1.82 2 2 large
10 values Point.10 Point.11 2.95 2 2 large
# ... with 39 more rows
If you can use effsize::cohen.d, then this function will be faster than all the options given so far:
# Cohen's d between each pair of successive levels of `data$variable`.
#
# data: data frame with a numeric `value` column and a grouping `variable`
#       column.
# Returns a data.frame with columns X1 (later level), X2 (earlier level) and
# res (estimate for earlier vs. later).
my_cohen <- function(data) {
  groups <- split(data$value, data$variable)
  # embed() puts the later name in column 1 and the earlier name in column 2.
  pairs <- embed(names(groups), 2)
  res <- vapply(
    seq_len(nrow(pairs)),
    function(i) {
      earlier <- groups[[pairs[i, 2]]]
      later <- groups[[pairs[i, 1]]]
      effsize::cohen.d(earlier, later)$estimate
    },
    numeric(1)
  )
  data.frame(pairs, res)
}
my_cohen(DF)
X1 X2 res
1 Point.2 Point.1 -2.29025540
2 Point.3 Point.2 3.45998958
3 Point.4 Point.3 2.16986489
4 Point.5 Point.4 1.82991671
5 Point.6 Point.5 1.68816593
6 Point.7 Point.6 1.11414226
We could nest the 'value' column, take the lead of the resulting list column, and apply cohen.d by looping over the two list columns in parallel:
library(dplyr)
library(tidyr) # provides nest(); dplyr does not export it
library(effsize)
library(purrr)

# Cohen's d between each point and the one that follows it.
out <- DF %>%
  select(-AgeGroup) %>%
  # one row per point, its values packed into the list column `data`
  nest(data = value) %>%
  # pair every row with the next one: adds `variable_lead` and `data_lead`
  mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
  # the last point has no successor, so drop its row
  slice(-n()) %>%
  mutate(cohen_d = map2_dbl(data, data_lead,
                            ~ cohen.d(.x$value, .y$value)$estimate))
-output
head(out, 2)
# A tibble: 2 x 5
variable data variable_lead data_lead cohen_d
<fct> <list> <fct> <list> <dbl>
1 Point.1 <tibble [2 × 1]> Point.2 <tibble [2 × 1]> -2.29
2 Point.2 <tibble [2 × 1]> Point.3 <tibble [2 × 1]> 3.46
comparing with OP's filtered output
# Keep only comparisons between a level and its immediate successor
# (Point.1 vs Point.2, Point.2 vs Point.3, ...). The rows are selected by level
# position rather than by a fixed stride: in an all-pairs result the number of
# rows per first group shrinks (49, 48, 47, ...), so successive pairs are not
# evenly spaced.
Cho_D %>%
  filter(
    match(group2, levels(DF$variable)) ==
      match(group1, levels(DF$variable)) + 1L
  ) %>%
  slice_head(n = 2)
# A tibble: 2 x 7
# .y. group1 group2 effsize n1 n2 magnitude
# <chr> <chr> <chr> <dbl> <int> <int> <ord>
#1 value Point.1 Point.2 -2.29 2 2 large
#2 value Point.2 Point.3 3.46 2 2 large
Benchmarks
With the number of comparisons reduced, the timings are below
# Timing of the all-pairs computation.
system.time({
  Cho_D <- rstatix::cohens_d(DF, value ~ variable, var.equal = TRUE)
})
# user system elapsed
# 16.316 0.060 16.330
# Timing of the successive-pairs computation.
system.time({
  out <- DF %>%
    select(-AgeGroup) %>%
    nest(data = value) %>%
    mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
    slice(-n()) %>%
    mutate(
      cohen_d = map2_dbl(
        data,
        data_lead,
        function(current, following) {
          cohen.d(current$value, following$value)$estimate
        }
      )
    )
})
# user system elapsed
# 0.031 0.005 0.037
[![enter image description here][2]][2][![i need help in writing gstummary r code to produce following table output.dummy table shown in above table][2]][2]
I need help writing gtsummary R code to produce the following table output. A dummy table is shown above.
[![enter image description here][2]][2]
library(gtsummary)
[![enter image description here][2]][2]
[![enter image description here][3]][3]
id
age
sex
country
edu
ln
ivds
n2
p5
1
a
M
eng
x
45
15
40
15
2
a
M
eng
x
23
26
70
15
4
a
M
eng
x
26
36
35
40
5
b
F
eng
x
26
25
36
47
6
b
F
wal
y
45
45
60
12
7
b
M
wal
y
60
25
36
15
8
c
M
wal
y
70
08
25
36
9
c
F
sco
z
80
25
36
15
10
c
F
sco
z
90
25
26
39
structure(list(id = 1:15, age = structure(c(1L, 1L, 2L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 2L), .Label = c("a", "b",
"c"), class = "factor"), sex = structure(c(2L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("F", "M"), class = "factor"),
country = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 3L), .Label = c("eng", "scot", "wale"
), class = "factor"), edu = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L), .Label = c("x",
"y", "z"), class = "factor"), lon = c(45L, 23L,
25L, 45L, 70L, 69L, 90L, 50L, 62L, 45L, 23L, 25L, 45L, 70L,
69L), is = c(15L, 26L, 36L, 34L, 2L, 4L, 5L, 8L, 9L,
15L, 26L, 36L, 34L, 2L, 4L), n2 = c(40L, 70L, 50L, 60L,
30L, 25L, 80L, 89L, 10L, 40L, 70L, 50L, 60L, 30L, 25L), p5 = c(15L,
20L, 36L, 48L, 25L, 36L, 28L, 15L, 25L, 15L, 20L, 36L, 48L,
25L, 36L)), row.names = c(NA, 15L), class = "data.frame")
[
I made a table similar to what you have above (more similar to the table you had before you updated it). But I think it'll get you most of the way there.
The type of table you're requesting is something that is in the works. In the meantime, you will need to use the bstfun::tbl_2way_summary() function. This function exists in another package while we work to make it better before integrating with gtsummary.
library(bstfun) # install with `remotes::install_github("ddsjoberg/bstfun")`
library(gtsummary)
packageVersion("gtsummary")
#> [1] '1.4.1'
# Overall goal: one table whose rows are the levels of `grade` and `stage`,
# showing n (%) per level next to "mean (sd) - min, max" of age, marker and
# ttdeath within that level.

# add a column that is all the same value; it is passed as `col` below
# NOTE(review): presumably this makes each 2-way table a single statistics
# column (stat_1) — confirm against the bstfun documentation
trial2 <- trial %>% mutate(constant = TRUE)
# loop over each continuous variable, construct table, then merge them together
# (rows = levels of grade; one merged column per continuous variable)
tbls_row1 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = grade, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
# label the statistics column with the variable name, in bold markdown
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
# NOTE(review): `~ NA` presumably clears the spanning headers that
# tbl_merge() adds by default — confirm
modify_spanning_header(everything() ~ NA)
# repeat for the second row (same construction, rows = levels of stage)
tbls_row2 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = stage, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
modify_spanning_header(everything() ~ NA)
# stack these tables (grade block on top of stage block)
tbl_stacked <- tbl_stack(list(tbls_row1, tbls_row2))
# lastly, add calculated summary stats for categorical variables, and merge them
tbl_summary_stats <-
trial2 %>%
tbl_summary(
include = c(grade, stage),
missing = "no"
) %>%
# relabel the single statistics column and drop all footnotes
modify_header(stat_0 ~ "**n (%)**") %>%
modify_footnote(everything() ~ NA)
# final table: n (%) column on the left, stacked continuous summaries on the right
tbl_final <-
tbl_merge(list(tbl_summary_stats, tbl_stacked)) %>%
modify_spanning_header(everything() ~ NA) %>%
# column spanning column headers
# NOTE(review): the stat_1_<k>_2 names look like suffixes generated by the two
# nested tbl_merge() calls — verify against the merged table's column names
# before reusing them
modify_spanning_header(
list(c(stat_1_1_2, stat_1_2_2) ~ "**Group 1**",
stat_1_3_2 ~ "**Group 2**")
)
Created on 2021-07-10 by the reprex package (v2.0.0)
This question already has answers here:
How to group by two columns in R
(4 answers)
Closed 4 years ago.
I have a dataframe with 5 columns. I know how to calculate the mean for one column grouped by another column. However, I need to group it by two columns. For example, I want to calculate the mean for column 5 grouped by column 1 and column 2.
df <- structure(list(Country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("AT", "CH", "DE"), class = "factor"),
Occupation = c(1L, 3L, 5L, 3L, 1L, 2L, 5L, 3L, 5L, 3L, 1L,
2L, 1L, 5L, 3L, 3L, 1L, 3L, 2L, 5L, 5L, 1L, 2L, 1L, 3L),
Age = c(20L, 46L, 30L, 12L, 73L, 53L, 19L, 43L, 65L, 53L,
19L, 34L, 76L, 25L, 45L, 39L, 18L, 59L, 37L, 24L, 19L, 60L,
51L, 32L, 29L), Gender = structure(c(1L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L), .Label = c("female", "male"), class = "factor"),
Income = c(100L, 80L, 78L, 29L, 156L, 56L, 95L, 104L, 87L,
56L, 203L, 45L, 112L, 78L, 56L, 140L, 99L, 67L, 89L, 109L,
43L, 145L, 30L, 101L, 77L)), class = "data.frame", row.names = c(NA,
-25L))
head(df)
Country Occupation Age Gender Income
1 AT 1 20 female 100
2 AT 3 46 female 80
3 AT 5 30 male 78
4 AT 3 12 male 29
5 AT 1 73 male 156
6 AT 2 53 female 56
So what I want to do is calculate the mean for column ‘income’, grouped by country and occupation. E.g., I want to calculate the mean of ‘income’ for all those people living in country ‘AT’ with occupation ‘3’, the mean of ‘income’ for all those living in country ‘CH’ with occupation ‘1’ and so on.
(1) base method (aggregate)
# Mean income per Country/Occupation cell ...
mean.df <- aggregate(Income ~ Country + Occupation, data = df, FUN = mean)
# ... with the aggregated column renamed so it does not clash with df$Income ...
colnames(mean.df)[3] <- "Income_Mean"
# ... joined back onto every row of df via the shared Country/Occupation columns.
merge(x = df, y = mean.df)
(2) base method (tapply)
# Country x Occupation matrix of mean incomes.
mean.df1 <- with(df, tapply(Income, list(Country, Occupation), mean))
# Long format: one row per cell, the cell value stored as Income_Mean.
mean.df2 <- as.data.frame(as.table(mean.df1), responseName = "Income_Mean")
names(mean.df2)[1:2] <- c("Country", "Occupation")
# Join the cell means back onto the original rows.
merge(x = df, y = mean.df2)
(3) stats method (ave)
# Copy of df with the per-Country/Occupation mean income attached to each row.
df2 <- df
df2[["Income_Mean"]] <- with(df, ave(Income, Country, Occupation, FUN = mean))
(4) dplyr method
# Add the group mean as a new column; the result stays grouped by
# Country and Occupation.
mutate(
  group_by(df, Country, Occupation),
  Income_Mean = mean(Income)
)
Output :
Country Occupation Age Gender Income Income_Mean
<fct> <int> <int> <fct> <int> <dbl>
1 AT 1 20 female 100 128
2 AT 3 46 female 80 71
3 AT 5 30 male 78 86.5
4 AT 3 12 male 29 71
5 AT 1 73 male 156 128
6 AT 2 53 female 56 56
7 AT 5 19 male 95 86.5
8 AT 3 43 male 104 71
9 CH 5 65 male 87 82.5
10 CH 3 53 female 56 84
# ... with 15 more rows
Using sqldf:
library(sqldf)

# Mean income per Country/Occupation via SQL.
# NOTE(review): Age and Gender are neither grouped nor aggregated, so each
# group reports them from a single row of that group (SQLite "bare column"
# behaviour) — treat those two columns as illustrative only, or drop them
# from the select list.
sqldf("select Country,Occupation,Age,Gender,avg(Income) from df group by Country,Occupation")
OR
Using data.table:
library(data.table)

# Convert under a new name so the original data.frame `df` stays untouched.
dt <- as.data.table(df)
# One row per Country/Occupation pair; the result column is named explicitly
# instead of receiving data.table's default name V1.
dt[, .(Income_Mean = mean(Income)), by = .(Country, Occupation)]
Output:
Country Occupation Age Gender avg(Income)
1 AT 1 73 male 128.0
2 AT 2 53 female 56.0
3 AT 3 43 male 71.0
4 AT 5 19 male 86.5
5 CH 1 18 female 138.0
6 CH 2 34 male 45.0
7 CH 3 39 male 84.0
8 CH 5 25 female 82.5
9 DE 1 32 female 123.0
10 DE 2 51 female 59.5
11 DE 3 29 male 72.0
12 DE 5 19 male 76.0
I've got a simple dataset.
structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L,
4L, 5L, 5L), Primrely = c(0L, 2L, 1L, 1L, 1L, 1L, 3L, 4L, 4L,
3L, 1L, 2L, 2L), Primset = c(-4L, -3L, 1L, 2L, -4L, 5L, 3L, 1L,
2L, -4L, -2L, -3L, 3L), Primvalue = c(45L, 5L, 6L, 15L, 53L,
45L, 44L, 65L, 1L, 5L, 1L, 12L, 5L), Secrely = c(5L, 7L, 2L,
1L, 2L, 0L, 4L, 5L, 1L, 1L, 1L, 0L, 2L), Secset = c(-3L, 1L,
2L, -2L, -3L, 2L, 5L, 7L, 7L, 4L, 3L, 2L, 1L), Secvalue = c(38L,
-2L, -1L, 8L, 46L, 38L, 37L, 58L, -6L, -2L, -6L, 5L, -2L), Desired = structure(c(NA,
1L, NA, NA, 2L, 2L, NA, NA, NA, NA, NA, 1L, 1L), .Label = c("Primary",
"Secondary"), class = "factor")), .Names = c("ID", "Primrely",
"Primset", "Primvalue", "Secrely", "Secset", "Secvalue", "Desired"
), class = "data.frame", row.names = c(NA, -13L))
ID Primrely Primset Primvalue Secrely Secset Secvalue Desired
1 1 0 -4 45 5 -3 38 <NA>
2 1 2 -3 5 7 1 -2 Primary
3 1 1 1 6 2 2 -1 <NA>
4 1 1 2 15 1 -2 8 <NA>
5 2 1 -4 53 2 -3 46 Secondary
6 2 1 5 45 0 2 38 Secondary
7 2 3 3 44 4 5 37 <NA>
8 3 4 1 65 5 7 58 <NA>
9 4 4 2 1 1 7 -6 <NA>
10 4 3 -4 5 1 4 -2 <NA>
11 4 1 -2 1 1 3 -6 <NA>
12 5 2 -3 12 0 2 5 Primary
13 5 2 3 5 2 1 -2 Primary
For each ID, I'd like to select rows that meet the criteria (Prim = primary, Sec = secondary): If Primrely is 0 or 2 and Primset is -3:3, select all rows for each ID. If no rows for a given ID meet the primary criteria, select rows that meet the secondary criteria (Secrely is 0 or 2 and Secset is -3:3). Ideally, I'd like to add a column (Desired) that indicate which criteria was met (primary/secondary/NA).
I've been working with ifelse and if else functions without much luck, mainly because I don't know how to command R to ignore a given ID if the primary criteria was already met (e.g. ID #1 meets the secondary criteria but doesn't need it because it already met the primary criteria). In other words, if a 'primary' shows up in a given ID, it trumps all the 'secondary' criteria that were met. I would appreciate any advice.
If I understand you correctly now:
(left in the steps to show you what I was doing, you can remove them and/or do this all in one step if you want)
dat <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L,
4L, 5L, 5L), Primrely = c(0L, 2L, 1L, 1L, 1L, 1L, 3L, 4L, 4L,
3L, 1L, 2L, 2L), Primset = c(-4L, -3L, 1L, 2L, -4L, 5L, 3L, 1L,
2L, -4L, -2L, -3L, 3L), Primvalue = c(45L, 5L, 6L, 15L, 53L,
45L, 44L, 65L, 1L, 5L, 1L, 12L, 5L), Secrely = c(5L, 7L, 2L,
1L, 2L, 0L, 4L, 5L, 1L, 1L, 1L, 0L, 2L), Secset = c(-3L, 1L,
2L, -2L, -3L, 2L, 5L, 7L, 7L, 4L, 3L, 2L, 1L), Secvalue = c(38L,
-2L, -1L, 8L, 46L, 38L, 37L, 58L, -6L, -2L, -6L, 5L, -2L), Desired = structure(c(NA,
1L, NA, NA, 2L, 2L, NA, NA, NA, NA, NA, 1L, 1L), .Label = c("Primary",
"Secondary"), class = "factor")), .Names = c("ID", "Primrely",
"Primset", "Primvalue", "Secrely", "Secset", "Secvalue", "Desired"
), class = "data.frame", row.names = c(NA, -13L))
# Classify every row, then keep only the best criterion met within each ID.
within(dat, {
  # Row-level code: 1 = primary criteria met, 2 = secondary criteria met,
  # 3 = neither.
  Desired_step1 <- ifelse(Primrely %in% c(0, 2) & Primset %in% -3:3,
                          1,
                          ifelse(Secrely %in% c(0, 2) & Secset %in% -3:3,
                                 2, 3))
  # Per ID, only rows carrying that ID's lowest (best) code keep it; all other
  # rows become NA. Code 3 is intentionally absent from `levels`, so "neither"
  # turns into a real missing value instead of a factor level spelled "NA".
  Desired_new <- factor(ave(Desired_step1, ID, FUN = function(x)
                          ifelse(x == min(x), x, NA)),
                        levels = 1:2, labels = c('Primary', 'Secondary'))
  # Human-readable version of the row-level code (kept to show the steps).
  Desired_step1 <- c('Primary', 'Secondary', NA)[Desired_step1]
})
# ID Primrely Primset Primvalue Secrely Secset Secvalue Desired Desired_new Desired_step1
# 1 1 0 -4 45 5 -3 38 <NA> <NA> <NA>
# 2 1 2 -3 5 7 1 -2 Primary Primary Primary
# 3 1 1 1 6 2 2 -1 <NA> <NA> Secondary
# 4 1 1 2 15 1 -2 8 <NA> <NA> <NA>
# 5 2 1 -4 53 2 -3 46 Secondary Secondary Secondary
# 6 2 1 5 45 0 2 38 Secondary Secondary Secondary
# 7 2 3 3 44 4 5 37 <NA> <NA> <NA>
# 8 3 4 1 65 5 7 58 <NA> <NA> <NA>
# 9 4 4 2 1 1 7 -6 <NA> <NA> <NA>
# 10 4 3 -4 5 1 4 -2 <NA> <NA> <NA>
# 11 4 1 -2 1 1 3 -6 <NA> <NA> <NA>
# 12 5 2 -3 12 0 2 5 Primary Primary Primary
# 13 5 2 3 5 2 1 -2 Primary Primary Primary
Here's my quick & dirty solution assuming your data.frame is named df. You can refine it yourself I think:
# First pass: rows that satisfy the primary criteria are tagged "Primary".
df$Desired <- with(
  df,
  ifelse(
    (Primrely == 0 | Primrely == 2) & (Primset >= -3 & Primset <= 3),
    "Primary",
    NA
  )
)
# Second pass: only rows still untagged are checked against the secondary
# criteria.
idx <- is.na(df$Desired)
df$Desired[idx] <- with(
  df[idx, ],
  ifelse(
    (Secrely == 0 | Secrely == 2) & (Secset >= -3 & Secset <= 3),
    "Secondary",
    NA
  )
)
I have a melted data.frame, dput(x), below:
## dput(x)
x <- structure(list(variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L), .Label = c("a", "b", "c", "d"), class = "factor"),
value = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("Never Heard of",
"Heard of but Not at all Familiar",
"Somewhat Familiar", "Familiar", "Very Familiar", "Extremely Familiar"
), class = "factor"), freq = c(10L, 24L, 32L, 90L, 97L, 69L,
15L, 57L, 79L, 94L, 58L, 19L, 11L, 17L, 34L, 81L, 94L, 85L, 4L,
28L, 59L, 114L, 82L, 35L)), .Names = c("variable", "value", "freq"
), row.names = c(NA, -24L), class = "data.frame")
Which looks like this (for those of you who don't need a test set):
variable value freq
1 a Never Heard of 10
2 a Heard of but Not at all Familiar 24
3 a Somewhat Familiar 32
4 a Familiar 90
5 a Very Familiar 97
6 a Extremely Familiar 69
7 b Never Heard of 15
8 b Heard of but Not at all Familiar 57
9 b Somewhat Familiar 79
10 b Familiar 94
11 b Very Familiar 58
12 b Extremely Familiar 19
13 c Never Heard of 11
14 c Heard of but Not at all Familiar 17
15 c Somewhat Familiar 34
16 c Familiar 81
17 c Very Familiar 94
18 c Extremely Familiar 85
19 d Never Heard of 4
20 d Heard of but Not at all Familiar 28
21 d Somewhat Familiar 59
22 d Familiar 114
23 d Very Familiar 82
24 d Extremely Familiar 35
Now, I can make a nice and pretty plot akin to this:
library(ggplot2)

# Horizontal bars, one per `variable`, each stacked by `value` and scaled to
# 100%.
ggplot(x, aes(variable, freq, fill = value)) +
  # `freq` already holds the counts, so draw it as given (geom_col) rather than
  # letting geom_bar() count rows.
  geom_col(position = "fill") +
  coord_flip() +
  # `formatter =` is no longer accepted by scale functions; percentage labels
  # are supplied through `labels`.
  scale_y_continuous("", labels = scales::percent)
Question
What I would like to do is sort a,b,c,d by the highest to lowest "freq" of "Extremely Familiar"
?relevel and ?reorder haven't provided any constructive examples for this usage.
Your help, is always appreciated.
Cheers,
BEB
Here is another way to do it:
# Rows holding the "Extremely Familiar" count of each variable.
tmp <- subset(x, value == "Extremely Familiar")
# Order the levels by descending count. The level names are read from
# tmp$variable itself, so the result does not depend on the subset rows
# happening to appear in the same order as levels(x$variable).
x$variable <- factor(
  x$variable,
  levels = as.character(tmp$variable)[order(-tmp$freq)]
)
Here is one way:
# Sort key for reorder(): given row indices `i` into the global data frame `x`,
# return the negated `freq` of the 'Extremely Familiar' row(s) among them, so
# that larger counts sort first.
tmpfun <- function(i) {
  rows <- x[i, ]
  is_top <- rows$value == 'Extremely Familiar'
  -rows$freq[is_top]
}
# Reorder the levels of `variable` by the key tmpfun() computes from each
# level's row indices. seq_len() is used instead of 1:nrow(x) so an empty data
# frame yields an empty index vector rather than c(1, 0).
x$variable <- reorder(x$variable, seq_len(nrow(x)), FUN = tmpfun)