Adding sample size to ggplot boxplot - r

I'm interested to see how age is related to a continuous outcome, for which I have the following data:
library(dplyr)
library(tidyverse)
library(magrittr)
library(ggplot2)
mydata <-
structure(list(ID = c(104, 157, 52, 152, 114, 221, 320, 125,
75, 171, 80, 76, 258, 82, 142, 203, 37, 92, 202, 58, 194, 38,
4, 137, 25, 87, 40, 117, 21, 255, 277, 315, 96, 134, 185, 94,
3, 153, 172, 65, 279, 209, 60, 13, 154, 160, 24, 29, 159, 213,
127, 74, 48, 126, 184, 132, 61, 141, 27, 49, 8, 39, 164, 162,
34, 205, 179, 119, 77, 135, 138, 165, 103, 253, 14, 20, 310,
84, 30, 273, 22, 105, 262, 116, 86, 83, 145, 31, 95, 51, 81,
271, 36, 50, 189, 2, 115, 7, 197, 54), age = c(67.1, 70.7, 53,
61.7, 66.1, 57.7, 54.1, 67.2, 60.9, 55.8, 40.7, 57.6, 64.1, 70.7,
47.5, 46.3, 66.7, 55, 63.3, 68.2, 61.2, 60.5, 52, 65.3, 48.9,
56.9, 62.7, 75.2, 61.4, 57.9, 53.6, 58.1, 51, 67.3, 63.9, 57,
43.2, 64.7, 62.8, 56.3, 51.7, 39.4, 45.2, 57.8, 55.7, 69.6, 61.5,
50.1, 73.7, 55.5, 65.2, 54.6, 49, 35.2, 52.9, 46.3, 55, 52.5,
54.2, 61, 57.4, 56.5, 53.6, 47.7, 64.2, 53.4, 60.9, 58.2, 60.7,
50.3, 48.3, 74.7, 52.1, 59.9, 52.4, 70.8, 61.2, 66.5, 55.4, 57.5,
59.2, 60.1, 52.3, 60.2, 54.8, 36.3, 61.5, 48.6, 56, 62, 64.8,
40.4, 68.3, 60, 69.1, 56.6, 45.3, 58.5, 52.3, 52), continuous_outcome = c(3636.6,
1128.2, 2007.5, 802.9, 332.3, 2636.1, 169.5, 67.9, 3261.8, 1920.3,
155.2, 1677.2, 198.2, 11189.7, 560.9, 633.1, 196.1, 13.9, 100.7,
7594.5, 1039.8, 83.9, 2646.8, 284.6, 306, 1135.6, 1883.1, 5681.4,
1706.2, 2241.1, 97.7, 1106.8, 1107.1, 290.8, 2123.4, 267, 115.3,
138.5, 152.7, 1338.9, 6709.8, 561.7, 1931.7, 3112.4, 1876.3,
3795.9, 5706.7, 7.4, 1324.9, 4095.4, 205.4, 1886, 177.3, 304.4,
1319.1, 415.9, 537.2, 3141.1, 740, 1976.7, 624.8, 983.1, 1163.5,
1432.6, 3730.4, 2023.4, 498.2, 652.5, 982.7, 1345.3, 138.4, 1505.1,
3528.1, 11.9, 884.5, 10661.6, 1911.4, 2800.8, 81.5, 396.4, 409.1,
417.3, 186, 1892.4, 1689.7, 0, 210.1, 210.5, 3484.5, 3196.8,
57.2, 20.2, 947, 540, 1603.1, 1571.8, 9.1, 149.2, 122, 63.2),
age_decades = structure(c(3L, 4L, 2L, 3L, 3L, 2L, 2L, 3L,
3L, 2L, 1L, 2L, 3L, 4L, 1L, 1L, 3L, 2L, 3L, 3L, 3L, 3L, 2L,
3L, 1L, 2L, 3L, 4L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 1L, 3L,
3L, 2L, 2L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 4L, 2L, 3L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 3L, 2L, 3L, 2L,
3L, 2L, 1L, 4L, 2L, 2L, 2L, 4L, 3L, 3L, 2L, 2L, 2L, 3L, 2L,
3L, 2L, 1L, 3L, 1L, 2L, 3L, 3L, 1L, 3L, 2L, 3L, 2L, 1L, 2L,
2L, 2L), .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
To make a boxplot of age decades on the x axis and my continuous outcome I'm using ggplot2.
I want to make several, and automatically plot the sample size on the x-axis ticks. To do so I've computed labels in the dataset as follows:
# Build one x-axis label per age decade: a readable age range plus the number
# of observations in that decade on a second line.
mydata <-
  mydata %>%
  group_by(age_decades) %>%
  # n() counts the rows of the current decade; base as.character() turns it
  # into text (as_character() is not a base or dplyr function).
  mutate(n_decades = as.character(n())) %>%
  mutate(label_decades = case_when(
    age_decades == 1 ~ "Below 50",
    age_decades == 2 ~ "Between 50 and 60",
    age_decades == 3 ~ "Between 60 and 70",
    age_decades == 4 ~ "Above 70"
  )) %>%
  mutate(label_decades = paste0(label_decades, '\n n = ', n_decades)) %>%
  ungroup() %>%
  relocate(age_decades, label_decades, .after = age) %>%
  # The count is now embedded in the label, so the helper column can go.
  select(-n_decades) %>%
  arrange(ID)
Then I've tried to plot the boxplot using the newly created variable label_decades to label. The first thing I tried was:
ggplot(mydata, aes(x=age_decades, y=continuous_outcome)) +
geom_boxplot() +
scale_x_discrete(labels=mydata$label_decades)
But that just plots the first few labels as they occur in the dataset (so they don't correspond to the actual boxplots):
Then I tried:
ggplot(mydata, aes(x=age_decades, y=continuous_outcome)) +
geom_boxplot() +
geom_text(data=mydata, aes(age_decades, Inf, label=label_decades),
vjust = 15, size=4)
Which works better but the font is really weird and also the original x axis labels/ticks are still showing.
Anyone know how to solve this issue? Thanks!

The font looks weird because there are many labels with the same text plotted on top of each other. You can use distinct to get only one label per x tick and use the theme function to get rid of x tick labels:
# Boxplot per age decade with a single text label per decade.
# distinct() reduces the per-row labels to one row per decade, so each label
# is drawn once instead of once per observation.
ggplot(mydata, aes(x = age_decades, y = continuous_outcome)) +
  geom_boxplot() +
  geom_text(
    aes(label = label_decades),
    data = distinct(mydata, age_decades, label_decades),
    y = 9e3 # fixed vertical position of the labels, in outcome units
  ) +
  # Hide the default tick labels; the text layer replaces them.
  theme(axis.text.x = element_blank())

One way would be to turn the labels to factor as well.
library(dplyr)
library(ggplot2)
# Build a factor label per age decade ("<range>\n n = <count>") whose level
# order follows age_decades, so levels(label_decades) can be used positionally
# as axis labels for the age_decades axis.
mydata <- mydata %>%
  group_by(age_decades) %>%
  mutate(n_decades = as.character(n())) %>%
  mutate(label_decades = case_when(
    age_decades == 1 ~ "Below 50",
    age_decades == 2 ~ "Between 50 and 60",
    age_decades == 3 ~ "Between 60 and 70",
    age_decades == 4 ~ "Above 70"
  )) %>%
  mutate(label_decades = paste0(label_decades, '\n n = ', n_decades)) %>%
  ungroup() %>%
  # Set the level order explicitly from age_decades instead of relying on how
  # the per-group results happen to be combined (a plain factor() call on the
  # full column would sort the labels alphabetically).
  mutate(label_decades = factor(
    label_decades,
    levels = unique(label_decades[order(age_decades)])
  )) %>%
  relocate(age_decades, label_decades, .after = age) %>%
  select(-n_decades) %>%
  arrange(ID)
You can then use its levels in scale_x_discrete.
ggplot(mydata, aes(x=age_decades, y=continuous_outcome)) +
geom_boxplot() +
scale_x_discrete(labels= levels(mydata$label_decades))

Related

Randomly sample by group based on numeric variables

Is it possible to randomly sample patients by group so that they have similar distributions based on other variables? To me, this sounds like a matching problem, but there's no "treatment" here, so I'm not sure if the concept applies.
Sample data:
structure(list(id = c(8350L, 22543L, 24144L, 9392L, 27648L, 2943L,
34686L, 27153L, 11143L, 15209L, 11952L, 22669L, 8211L, 27765L,
28671L, 9693L, 30274L, 25807L, 14839L, 22400L, 24494L, 6540L,
6861L, 31825L, 34190L, 19606L, 21077L, 5037L, 25943L, 20530L,
23730L, 34774L, 7210L, 2051L, 28410L, 18318L, 34848L, 26596L,
8973L, 24885L, 9652L, 8387L, 16168L, 36893L, 24048L, 17769L,
1273L, 22734L, 36796L, 25497L, 28300L, 166L, 21172L, 20026L,
16265L, 1699L, 33140L, 23997L, 10216L, 27408L, 6813L, 10196L,
15015L, 2748L, 34979L, 21763L, 27438L, 6255L, 17047L, 30593L,
30723L, 7914L, 218L, 20134L, 29952L, 27126L, 3795L, 1367L, 33585L,
5940L, 26250L, 22519L, 35611L, 26168L, 26848L, 21276L, 8971L,
22554L, 16655L, 5315L, 18121L, 32526L, 21513L, 9262L, 36882L,
7408L, 18873L, 17238L, 15216L, 23667L, 30138L, 2978L, 25451L,
2492L, 30983L, 7677L, 22880L, 29674L, 7093L, 24910L, 20839L,
18176L, 23031L, 17197L, 4613L, 35801L, 30822L, 3889L, 11752L,
11314L, 22317L, 12825L, 17433L, 4407L, 3986L, 10173L, 32409L,
2697L, 3410L, 26834L, 3203L, 5474L, 34678L, 35336L, 19462L, 15835L,
7888L, 27897L, 9245L, 16524L, 13316L, 21604L, 30458L, 9191L,
1220L, 1779L, 1724L, 26382L, 11566L, 21310L, 12600L, 25063L,
30912L, 31189L, 9480L, 16804L, 2372L, 26238L, 20113L, 33753L,
32711L, 11543L, 10578L, 4475L, 13187L, 23395L, 35342L, 6903L,
26905L, 12026L, 5697L, 15352L, 33985L, 1132L, 15806L, 13611L,
29930L, 15896L, 6057L, 10849L, 12944L, 25561L, 3328L, 27481L,
28790L, 3260L, 24986L, 22177L, 26580L, 11639L, 2256L, 4839L,
22805L, 616L, 6702L, 18360L, 4439L, 1300L, 33779L, 24940L, 10043L,
21268L, 35127L, 36621L, 17618L, 6688L, 15937L, 31057L, 2144L,
30866L, 12500L, 29753L, 36497L, 21247L, 9481L, 36465L, 20665L,
15017L, 21234L, 34258L, 576L, 31187L, 4528L, 15314L, 3657L, 24489L,
33871L, 106L, 24916L, 2524L, 17469L, 2799L, 13311L, 26585L, 7131L,
21401L, 6191L, 22338L, 11647L, 11681L, 22744L, 14000L, 5356L,
2892L, 24481L, 24116L, 21461L, 13992L, 22751L, 11129L, 8802L,
29963L, 4660L, 29020L, 20843L, 21796L, 3607L, 10692L, 29168L,
25034L, 3307L, 35010L, 20280L, 31894L, 7276L, 24259L, 34059L,
35867L, 11165L, 16010L, 34082L, 26586L, 30958L, 25030L, 34851L,
29185L, 25721L, 8968L, 29427L, 20213L, 34667L, 28721L, 21472L,
17132L, 35247L, 9798L, 36826L, 21226L, 28335L, 16077L, 2654L,
20466L, 21324L, 36969L, 22553L, 5895L, 16514L, 10644L, 4376L,
13592L, 11206L, 32440L, 13413L, 31416L, 22540L, 15986L, 11506L,
16928L, 18652L, 17858L, 13522L, 8566L, 10665L, 29442L, 28219L,
22549L, 2209L, 8017L, 6066L, 21718L, 21930L, 11540L, 4100L, 35236L,
240L, 24900L, 425L, 26880L, 21409L, 18885L, 5803L, 33335L, 25597L,
12547L, 8930L, 4328L, 17360L, 4696L, 25198L, 26469L, 14679L,
1691L, 32989L, 6099L, 14427L, 31797L, 23408L, 29296L, 23928L,
31889L, 31737L, 6420L, 11304L, 34798L, 20785L, 9806L, 35018L,
35008L, 1450L, 3246L, 15123L, 19603L, 8519L, 32012L, 3397L, 11682L,
27102L, 18022L, 20408L, 15836L, 18284L, 12897L, 29580L, 14510L,
23925L, 28821L, 35825L, 14922L, 36643L, 10948L, 4220L, 23791L,
65L, 35772L, 1423L, 29386L, 755L, 23627L, 27201L, 12353L, 3578L,
1914L, 35373L, 16702L, 13057L, 3021L, 27531L, 1990L, 205L, 21559L,
29081L, 26301L, 18894L, 3088L, 9782L, 10522L, 12570L, 8948L,
36240L, 33943L, 33022L, 2750L, 32649L, 30134L, 13920L, 11498L,
8314L, 16849L, 15559L, 22529L, 31406L, 5680L, 17908L, 14931L,
2122L, 2581L, 33546L, 12143L, 17220L, 16713L, 7454L, 13659L,
15973L, 20116L, 27689L, 35285L, 36106L, 21834L, 29850L, 29030L,
7957L, 31698L, 12307L, 23642L, 5615L, 12016L, 1161L, 15291L,
32738L, 1089L, 32988L, 33382L, 3642L, 18661L, 35584L, 8009L,
24000L, 30587L, 25870L, 19944L, 34970L, 29983L, 24774L, 28702L,
21199L, 17292L, 29831L, 476L, 18881L, 29923L, 31476L, 4570L,
31081L, 10544L, 3373L, 13435L, 22651L, 17861L, 3818L, 35387L,
11459L, 35637L, 308L, 35697L, 12696L, 15175L, 7990L, 16691L,
19494L, 9008L, 30695L, 28889L, 446L, 22178L, 13000L, 26166L,
15431L, 19332L, 35991L, 2840L), race_f = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 4L, 2L, 3L, 4L, 1L, 1L, 3L, 3L, 3L, 3L, 1L,
3L, 1L, 3L, 3L, 1L, 1L, 3L, 2L, 2L, 1L, 4L, 5L, 1L, 4L, 1L, 1L,
5L, 1L, 1L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 1L,
2L, 1L, 3L, 1L, 2L, 1L, 1L, 1L, 2L, 3L, 3L, 1L, 1L, 3L, 3L, 3L,
1L, 1L, 1L, 3L, 3L, 2L, 1L, 1L, 3L, 4L, 4L, 1L, 1L, 3L, 1L, 2L,
3L, 4L, 1L, 1L, 1L, 3L, 1L, 1L, 5L, 3L, 1L, 1L, 3L, 2L, 1L, 1L,
3L, 1L, 4L, 1L, 1L, 3L, 1L, 4L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 2L,
3L, 3L, 4L, 4L, 1L, 2L, 1L, 4L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 2L, 3L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 3L,
1L, 2L, 1L, 2L, 2L, 4L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 4L, 1L, 5L,
2L, 1L, 2L, 3L, 1L, 5L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 3L,
1L, 4L, 4L, 3L, 2L, 4L, 2L, 1L, 3L, 3L, 1L, 4L, 3L, 3L, 3L, 1L,
1L, 4L, 1L, 4L, 2L, 3L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 1L,
3L, 4L, 1L, 3L, 1L, 1L, 4L, 3L, 4L, 1L, 3L, 1L, 2L, 4L, 3L, 3L,
1L, 1L, 3L, 1L, 5L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 2L, 1L, 4L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
3L, 1L, 1L, 1L, 4L, 1L, 4L, 3L, 1L, 3L, 2L, 1L, 1L, 2L, 3L, 1L,
4L, 2L, 3L, 1L, 3L, 4L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 5L, 4L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 4L, 1L, 1L, 3L,
1L, 3L, 3L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 2L, 1L, 3L, 1L, 4L,
1L, 4L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 4L, 1L, 1L, 2L, 1L, 1L, 1L,
4L, 1L, 1L, 3L, 3L, 1L, 4L, 3L, 3L, 4L, 1L, 3L, 1L, 5L, 3L, 4L,
1L, 4L, 4L, 1L, 3L, 4L, 1L, 4L, 1L, 1L, 1L, 3L, 2L, 1L, 2L, 4L,
1L, 1L, 5L, 4L, 1L, 1L, 4L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 2L,
1L, 3L, 3L, 3L, 1L, 2L, 3L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 3L, 1L,
1L, 3L, 4L, 1L, 1L, 2L, 5L, 3L, 3L, 1L, 1L, 4L, 1L, 4L, 1L, 4L,
2L, 3L, 3L, 1L, 1L, 1L, 4L, 1L, 4L, 3L, 1L, 1L, 1L, 1L, 3L, 1L,
3L, 1L, 1L, 1L, 1L, 4L, 3L, 4L, 3L, 3L, 3L, 2L, 3L, 1L, 1L, 1L,
3L, 5L, 2L, 5L, 1L, 1L, 4L, 3L, 1L, 3L, 2L, 1L, 1L, 5L, 1L, 3L,
3L, 4L, 1L, 1L, 1L, 2L, 5L, 1L, 1L, 4L, 3L, 1L, 1L, 1L, 2L, 1L,
1L, 3L, 3L, 1L, 1L, 4L, 3L, 2L, 4L, 4L, 1L, 1L, 2L, 3L, 1L, 3L,
3L, 1L), .Label = c("White", "Black", "Hispanic", "Asian", "Other"
), class = "factor"), cops2_avg_12mo = c(82.9166666666667, 66,
23.3333333333333, 28, 9.33333333333333, 69.9166666666667, 6,
33.3333333333333, 0, 12, 102, NA, 66, 6, 45, 58.5, 10, 55.9166666666667,
19.5, 6, 10, 234.666666666667, 28, 23, 51.5833333333333, 10,
38, 123.5, 0, 24, 10, 0, 73, 10, 25, 6, 20, 13.4166666666667,
13.8333333333333, 8, 14.8333333333333, 53.5, 42, NA, 57.1666666666667,
0, 24.6666666666667, 10, NA, 54.6666666666667, 38.75, 41, 22,
0.833333333333333, 13, 113.083333333333, 27.3333333333333, 9,
33.1666666666667, 18.75, 57.75, 30, 60.3333333333333, 23.1666666666667,
37, 16.5, 0, 145.5, 45, 31.3333333333333, 0, 10, 187.5, 27.4166666666667,
10, 54.9166666666667, 78.8333333333333, 103.75, 6.66666666666667,
30.4166666666667, 10, 10, 24.6666666666667, 10, 118.333333333333,
61.25, 17, 10, 28, 51, 6, 32.0833333333333, 80.75, 8.83333333333333,
NA, 10, 74.25, 42.25, 47, 60, 41.6666666666667, 19.0833333333333,
98.5, 73.5, 10, 6.66666666666667, 49.8333333333333, 10, 79.8333333333333,
10, 42, 95.8333333333333, 130.583333333333, 5.41666666666667,
47.25, 6, 8, 17.8333333333333, 10, 73.9166666666667, 10, 8, 27.8333333333333,
125.916666666667, 134.166666666667, 88, 10, 58, 62.5, 10.3333333333333,
28.8333333333333, 100.083333333333, 35.5, 0, 0, 10, 105, 7.33333333333333,
35, 9.66666666666667, 10, 4.16666666666667, 10, 8.33333333333333,
70.6666666666667, 28.4166666666667, 38.1666666666667, 8, 101.5,
26.75, 61.1666666666667, 14, 95.5833333333333, 35, 65, 0, 51.75,
57.5, 10, 13.6666666666667, 10, 67.5, 10, 62.3333333333333, 72.6666666666667,
10, 45.5, 20.8333333333333, 31, 84.5, 10, 98.1666666666667, 47.5,
56, 126, 14, 10, 10, 8, 36, 111.5, 54.5, 45.5, 8, 37.5, 84.8333333333333,
39.1666666666667, 56.25, 37.9166666666667, 37.75, 27, 55.6666666666667,
10, 34, 5.83333333333333, 37, 80.0833333333333, 57, 102.166666666667,
12.6666666666667, 10, 19.3333333333333, 10, NA, 51, 25.9166666666667,
14, 36.9090909090909, 38.6666666666667, 0, 6.33333333333333,
NA, 31, 43, 26.5, 10, 34.4166666666667, 77.1666666666667, 10,
10, 89.9166666666667, 59, 37, 77.3333333333333, 64, 52, 19.6666666666667,
66.5, 24, 106.083333333333, 29.6666666666667, 38.1666666666667,
6.66666666666667, 10, 16.75, NA, 86.75, 1, 14, 20.3333333333333,
8, 21, 38.9166666666667, 50.8333333333333, 57.5, 29, 0, 26.5,
51.9166666666667, 71.25, 42.6666666666667, 82, 58.0833333333333,
11.3333333333333, 82, 9.5, 78.6666666666667, 102.5, 71, 10, 70.6666666666667,
NA, 33.8333333333333, 61.25, 87, 36.5, 10, 40.4166666666667,
51.8333333333333, 23, 9.66666666666667, 44.5, 8, 10, 4.16666666666667,
0, 48.8333333333333, 49.25, 15, 70, 10, 6, 10, 34.8333333333333,
108.75, 36, NA, 31, 51, 69.5, 122.5, 48, 43.5833333333333, NA,
10, 20, 80.75, 54.75, 106.916666666667, 53.5, 90.6666666666667,
8.33333333333333, 85.5, 40.5833333333333, 5.5, 10, 61.3333333333333,
69.8333333333333, 10, 51, 0, 49.0833333333333, 13.6666666666667,
13.3333333333333, 5.83333333333333, 33.8333333333333, 14.4166666666667,
11.25, 14, 6, 14.5833333333333, 36, 21, 10, 29.5833333333333,
13, 34, 10, 2.5, 10, 211.916666666667, 19.75, 7.33333333333333,
6, 59.6666666666667, 30.25, 34.25, 16.1666666666667, 10, NA,
NA, 97, 75, 26.5, 8, 32.25, 0, 39, 37, 165.333333333333, 45,
33.1666666666667, 21, 10, 57, 70.3333333333333, 10, 10, 62, 79.1666666666667,
38, 26.1666666666667, 13, 8, 69.6666666666667, 40.5, 100, 0.833333333333333,
8, 82.5, 10, 19.8333333333333, 20.0833333333333, 8, 25.8333333333333,
16.75, 10, 36, NA, 12.8333333333333, 31.4166666666667, 10, 61.4166666666667,
14, 67.5, 3, 83.1666666666667, 48, 43.75, 35.4166666666667, 73,
44.1666666666667, 8, 29.75, 10, 10, 62.6666666666667, 26.9166666666667,
29.6666666666667, 10, NA, 15, 19.4166666666667, 112, 29, 3, 33.5,
62.5, 10, 84.6666666666667, 8, 84.4166666666667, 81.5, 56.1666666666667,
10, 101.416666666667, 16, 10, 19.6666666666667, 60, 73.6666666666667,
74.9166666666667, 21, 5, 15.0833333333333, 17.0833333333333,
17.5, 46, 61.8333333333333, 115.333333333333, 92, 30, 0, 22.75,
16.6666666666667, 15, 15, 10, NA, 56.25, 54, 10, 40, 9.83333333333333,
10.9166666666667, 22.25, 84.75, 80, 1.66666666666667, 99.8333333333333,
10, 38.6666666666667, 169.75, 35.0833333333333, 8, 78.5, 6.33333333333333,
21, 10, 42, 105.166666666667, 162.416666666667, 14, 69.25, 35.8333333333333,
13, 5.83333333333333, 34, 51, 12.75, 44.3333333333333, 39.5,
10, 23, 46.8333333333333, 89.9166666666667, 15, 28, 128.416666666667,
10, 91.6666666666667, 3.5, 54, 23, NA, 29.75, 37.1666666666667,
12.6666666666667, 31.9166666666667, 23, 0, 11, 67.9166666666667,
3.16666666666667, 8.33333333333333, 51, NA, 10, 0, 58.8333333333333
), AGE = c(86, 82, 83, 92, 45, 81, 52, 64, 71, 96, 79, 64, 76,
37, 81, 79, 72, 79, 74, 46, 45, 71, 89, 76, 53, 48, 52, 77, 63,
52, 57, 62, 84, 88, 55, 69, 67, 63, 67, 51, 86, 53, 65, 59, 71,
60, 70, 20, 78, 62, 58, 73, 68, 71, 66, 72, 71, 65, 95, 67, 79,
70, 86, 77, 81, 54, 44, 66, 80, 71, 30, 77, 67, 75, 48, 65, 83,
85, 70, 70, 74, 58, 81, 28, 78, 66, 79, 47, 74, 41, 74, 58, 73,
55, 53, 56, 84, 74, 62, 85, 68, 47, 78, 72, 57, 56, 64, 55, 86,
76, 77, 58, 74, 55, 71, 61, 74, 62, 65, 75, 81, 68, 39, 58, 65,
76, 27, 79, 86, 61, 87, 52, 72, 58, 53, 69, 78, 65, 81, 69, 66,
68, 61, 72, 74, 80, 88, 46, 53, 77, 89, 83, 41, 67, 83, 62, 90,
70, 60, 62, 33, 78, 80, 62, 81, 37, 55, 90, 81, 73, 67, 97, 32,
71, 70, 69, 46, 57, 60, 79, 79, 56, 75, 60, 52, 78, 61, 51, 70,
67, 71, 36, 53, 70, 53, 74, 89, 78, 70, 56, 58, 83, 50, 77, 70,
50, 75, 53, 86, 65, 45, 63, 62, 78, 65, 69, 75, 79, 71, 56, 88,
63, 72, 85, 68, 72, 45, 81, 46, 70, 84, 71, 82, 63, 57, 77, 70,
42, 87, 84, 61, 64, 79, 53, 65, 64, 69, 68, 71, 89, 49, 70, 82,
63, 79, 65, 64, 54, 73, 36, 80, 38, 68, 62, 84, 80, 65, 73, 91,
59, 35, 80, 67, 68, 65, 47, 60, 67, 72, 81, 22, 35, 58, 57, 68,
94, 38, 77, 75, 73, 78, 71, 78, 53, 58, 61, 77, 44, 95, 53, 72,
68, 72, 73, 78, 41, 75, 80, 60, 53, 68, 79, 80, 74, 25, 79, 55,
68, 85, 64, 72, 78, 78, 71, 73, 82, 73, 73, 58, 69, 58, 72, 78,
56, 74, 67, 66, 72, 38, 58, 62, 77, 81, 37, 46, 88, 55, 76, 50,
57, 72, 39, 56, 29, 76, 77, 36, 31, 70, 70, 70, 54, 74, 47, 81,
46, 81, 55, 53, 70, 28, 71, 79, 68, 78, 81, 30, 83, 43, 70, 79,
47, 94, 60, 64, 82, 81, 92, 57, 90, 86, 58, 61, 69, 50, 64, 79,
56, 76, 52, 55, 53, 85, 89, 64, 86, 58, 82, 64, 74, 45, 64, 71,
75, 61, 79, 82, 63, 81, 60, 70, 79, 63, 59, 80, 53, 80, 41, 83,
67, 90, 60, 82, 74, 75, 52, 62, 35, 53, 49, 71, 69, 73, 67, 44,
77, 81, 96, 52, 75, 30, 83, 74, 56, 62, 78, 63, 63, 62, 71, 62,
89, 83, 77, 66, 64, 24, 96, 63, 51, 65, 71, 50, 68, 83, 82, 90,
91, 84, 90, 76, 62, 79, 20, 75, 79, 80, 62, 62, 71, 51, 81, 84,
65, 65, 55, 65, 51, 26, 70)), row.names = c(NA, -500L), class = c("tbl_df",
"tbl", "data.frame"))
I'm hoping to sample by race_f so that the different race groups are similar in AGE and cops2_avg_12mo. Is this at all possible? Thank you!
The answer depends on if you want to ensure that their ages/cops2_avg_12mo will always be within a specific range - in which case you would simply create a subset of your data frame with only the patients whose age and cops2_avg_12mo are within some range. I do think that this is the safer thing to do in terms of quality control. You can view a plot of the two columns of your data (AGE and cops2_avg_12mo) to get an idea of what ranges of values most of the patients fall into:
plot(x[,c("AGE", "cops2_avg_12mo")])
Pick ranges for these values that contain enough patients to sample from. (I don't know how many samples you need). Basically, draw a box in the dot plot which contains enough patients to sample from.
So once you determine the ranges/boundaries of the box, just create indexes like so:
# Logical row filter for the chosen "box": 50 < AGE < 75 and
# 0 < cops2_avg_12mo < 75, with missing cops2_avg_12mo values excluded.
# `[[` extracts plain vectors, so idx is a logical vector even when x is a
# tibble (x[, "AGE"] on a tibble stays a one-column tibble).
idx <- !is.na(x[["cops2_avg_12mo"]]) &
  x[["AGE"]] > 50 & x[["AGE"]] < 75 &
  x[["cops2_avg_12mo"]] > 0 & x[["cops2_avg_12mo"]] < 75
then get the subset of your data:
subsetX = x[idx,]
After you create that subset, you can randomly sample using R's sample() function. If you want to do sampling from each race equally, then call sample() with the subsetX data, with each race selected at a time, to get n samples at a time:
# Draw n patients (rows) without replacement from one race group.
# sample() applied to a data frame draws from its columns, so sample row
# positions and index the rows with them instead.
asian <- subsetX[subsetX[["race_f"]] == "Asian", ]
asian[sample.int(nrow(asian), n, replace = FALSE), ]
Alternatively, if you are ok with sampling patients that have outlier values (but I feel like this will produce more variation in your results), then you can create a histogram of each of the columns - for example, AGE - then get the histogram bin counts, divide them by the total number of patients to get a probability distribution, then create a vector the same length as the number of patients where each value is the probability we calculated for the bin it belongs to (found by getting bin indexes when calculating the histogram), then pass that vector into the sample() function as the prob input argument so that values are sampled with their specified probability.

ggplot facet_grid: plotting hourly data for different days, directly below each other

I am trying to plot count of visitors for different days. I would like to use facet_grid to have the plots on a common X-axis directly below each other. Every time I try, the second plot (day 2) ends up on the right. Does somebody know what I have done wrong? Below is the code I am using:
ggplot(count_visitors, aes(x = date)) +
geom_line(aes(y=average_count), colour=colour[1], size = 0.5) +
geom_line(aes(y=count_max), colour=colour[1], size = 0.5, alpha="0.2") +
geom_line(aes(y=count_min), colour=colour[1], size = 0.5, alpha="0.2") +
geom_ribbon(aes(ymin=count_min,ymax=count_max), fill=colour[1], alpha="0.2") +
labs(x = "Time", y = "Visitors Count") +
scale_y_continuous(breaks = seq(0, 600, by=100), limits = c(0, 600)) +
scale_x_datetime(labels = date_format("%H:%M")) +
facet_grid(day_month ~ .)
And this is what the data looks like:
$ date : POSIXct, format: "2017-12-02 07:00:00" "2017-12-02 07:15:00" "2017-12-02 07:30:00" "2017-12-02 07:45:00" ...
$ day_month : int 2 2 2 2 2 2 2 2 2 2 ...
$ average_count: num 1 2 2.5 3.5 9 11 19.5 31.5 62 90.5 ...
$ count_min : num 0 0 0 0 2 4 9 15 39 61 ...
$ count_max : num 2 4 5 7 16 18 30 48 85 120 ...
structure(list(date = structure(c(1512198000, 1512198900, 1512199800,
1512200700, 1512201600, 1512202500, 1512203400, 1512204300, 1512205200,
1512206100, 1512207000, 1512207900, 1512208800, 1512209700, 1512210600,
1512211500, 1512212400, 1512213300, 1512214200, 1512215100, 1512216000,
1512216900, 1512217800, 1512218700, 1512219600, 1512220500, 1512221400,
1512222300, 1512223200, 1512224100, 1512225000, 1512225900, 1512226800,
1512227700, 1512228600, 1512229500, 1512230400, 1512231300, 1512232200,
1512233100, 1512234000, 1512234900, 1512235800, 1512236700, 1512237600,
1512238500, 1512239400, 1512240300, 1512241200, 1512242100, 1512243000,
1512243900, 1512244800, 1512245700, 1512246600, 1512247500, 1512248400,
1512249300, 1512250200, 1512251100, 1512252000, 1512252900, 1512253800,
1512254700, 1512255600, 1512111600, 1512112500, 1512113400, 1512114300,
1512115200, 1512116100, 1512117000, 1512117900, 1512118800, 1512119700,
1512120600, 1512121500, 1512122400, 1512123300, 1512124200, 1512125100,
1512126000, 1512126900, 1512127800, 1512128700, 1512129600, 1512130500,
1512131400, 1512132300, 1512133200, 1512134100, 1512135000, 1512135900,
1512136800, 1512137700, 1512138600, 1512139500, 1512140400, 1512141300,
1512142200, 1512143100, 1512144000, 1512144900, 1512145800, 1512146700,
1512147600, 1512148500, 1512149400, 1512150300, 1512151200, 1512152100,
1512153000, 1512153900, 1512154800, 1512155700, 1512156600, 1512157500,
1512158400, 1512159300, 1512160200, 1512161100, 1512162000, 1512162900,
1512163800, 1512164700, 1512165600, 1512166500, 1512167400, 1512168300,
1512169200), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
day_month = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
count_min = c(0, 0, 0, 0, 2, 4, 9, 15, 39, 61, 78, 95, 110,
121, 135, 151, 154, 173, 175, 187, 201, 227, 239, 254, 265,
275, 278, 288, 293, 290, 290, 293, 295, 299, 297, 284, 279,
278, 255, 250, 231, 224, 204, 184, 178, 170, 161, 149, 146,
148, 153, 150, 138, 127, 119, 112, 91, 79, 53, 40, 29, 15,
11, 9, 5, 1, 1, 1, 1, 1, 2, 3, 5, 14, 16, 26, 35, 58, 67,
89, 114, 141, 159, 183, 187, 198, 208, 207, 206, 209, 209,
204, 194, 180, 175, 156, 142, 145, 133, 128, 121, 104, 100,
85, 74, 75, 81, 93, 106, 104, 116, 121, 137, 151, 153, 159,
168, 165, 159, 156, 144, 119, 102, 84, 60, 35, 23, 17, 15,
10), count_max = c(2, 4, 5, 7, 16, 18, 30, 48, 85, 120, 146,
176, 207, 229, 253, 295, 312, 327, 348, 370, 392, 418, 446,
457, 489, 501, 509, 507, 514, 515, 533, 550, 564, 554, 557,
552, 552, 524, 502, 476, 447, 432, 411, 400, 380, 352, 341,
322, 314, 312, 303, 292, 288, 262, 239, 219, 202, 177, 138,
108, 81, 43, 32, 22, 12, 2, 2, 2, 2, 2, 7, 10, 21, 33, 44,
64, 89, 117, 153, 186, 222, 260, 279, 298, 323, 332, 341,
345, 349, 361, 361, 367, 364, 352, 324, 309, 291, 282, 267,
256, 240, 220, 197, 192, 185, 181, 184, 195, 203, 208, 202,
218, 245, 269, 297, 312, 320, 315, 317, 301, 284, 250, 220,
194, 166, 124, 77, 41, 30, 20), average_count = c(1, 2, 2.5,
3.5, 9, 11, 19.5, 31.5, 62, 90.5, 112, 135.5, 158.5, 175,
194, 223, 233, 250, 261.5, 278.5, 296.5, 322.5, 342.5, 355.5,
377, 388, 393.5, 397.5, 403.5, 402.5, 411.5, 421.5, 429.5,
426.5, 427, 418, 415.5, 401, 378.5, 363, 339, 328, 307.5,
292, 279, 261, 251, 235.5, 230, 230, 228, 221, 213, 194.5,
179, 165.5, 146.5, 128, 95.5, 74, 55, 29, 21.5, 15.5, 8.5,
1.5, 1.5, 1.5, 1.5, 1.5, 4.5, 6.5, 13, 23.5, 30, 45, 62,
87.5, 110, 137.5, 168, 200.5, 219, 240.5, 255, 265, 274.5,
276, 277.5, 285, 285, 285.5, 279, 266, 249.5, 232.5, 216.5,
213.5, 200, 192, 180.5, 162, 148.5, 138.5, 129.5, 128, 132.5,
144, 154.5, 156, 159, 169.5, 191, 210, 225, 235.5, 244, 240,
238, 228.5, 214, 184.5, 161, 139, 113, 79.5, 50, 29, 22.5,
15)), class = "data.frame", row.names = c(NA, -130L))
Example Image
One option is to use facet_wrap instead.
Note that I removed lots of your rather redundant code (for the question). Would recommend to have a look at how to create an MCVE
# One panel per day, stacked vertically (nrow = 2), each with its own x range.
ggplot(count_visitors, aes(x = date)) +
  geom_line(aes(y = average_count), size = 0.5) +
  # alpha is a numeric transparency, so pass a number rather than a string.
  geom_ribbon(aes(ymin = count_min, ymax = count_max), alpha = 0.2) +
  facet_wrap(day_month ~ ., nrow = 2, scales = 'free_x')

Superimposing two plots in R with same axis and limits

I have two plots from two different data frames
The DPUT from data frame 1 is as follows
ppv_npv2 <- structure(list(pred.prob = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50), variable = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("ppv_2.5", "ppv_50", "ppv_97.5"), class = "factor"),
value = c(4.8, 9.3, 13.4, 17.2, 20.8, 24.2, 27.3, 30.3, 33.1,
35.7, 38.2, 40.5, 42.8, 44.9, 46.9, 48.8, 50.6, 52.3, 54,
55.6, 57.1, 58.5, 59.9, 61.2, 62.5, 63.7, 64.9, 66, 67.1,
68.2, 69.2, 70.2, 71.1, 72, 72.9, 73.8, 74.6, 75.4, 76.2,
76.9, 77.7, 78.4, 79, 79.7, 80.4, 81, 81.6, 82.2, 82.8, 83.3,
7.2, 13.6, 19.3, 24.4, 28.9, 33, 36.8, 40.2, 43.3, 46.2,
48.9, 51.3, 53.6, 55.7, 57.7, 59.6, 61.3, 62.9, 64.5, 65.9,
67.3, 68.6, 69.8, 70.9, 72, 73.1, 74.1, 75, 75.9, 76.8, 77.6,
78.4, 79.2, 79.9, 80.6, 81.3, 82, 82.6, 83.2, 83.8, 84.3,
84.8, 85.4, 85.9, 86.3, 86.8, 87.3, 87.7, 88.1, 88.5, 11.7,
21.1, 28.8, 35.3, 40.8, 45.5, 49.7, 53.3, 56.4, 59.3, 61.8,
64.1, 66.2, 68.1, 69.8, 71.4, 72.9, 74.2, 75.5, 76.6, 77.7,
78.7, 79.7, 80.5, 81.4, 82.2, 82.9, 83.6, 84.3, 84.9, 85.5,
86, 86.6, 87.1, 87.6, 88.1, 88.5, 88.9, 89.3, 89.7, 90.1,
90.5, 90.8, 91.1, 91.5, 91.8, 92.1, 92.4, 92.6, 92.9)),
.Names =c("pred.prob","variable", "value"), row.names = c(NA, -150L),
class = "data.frame")
The plot that I have created is from the following code
# Predicted PPV against prevalence: the "ppv_50" curve in red (linetype 2) and
# the "ppv_2.5" / "ppv_97.5" curves in blue (linetype 4).
p1 <- ggplot(ppv_npv2, aes(x = pred.prob, y = value)) +
  geom_line(
    data = subset(ppv_npv2, variable == "ppv_50"),
    colour = "red", linetype = 2
  ) +
  geom_line(
    data = subset(ppv_npv2, variable == "ppv_2.5"),
    colour = "blue", linetype = 4
  ) +
  geom_line(
    data = subset(ppv_npv2, variable == "ppv_97.5"),
    colour = "blue", linetype = 4
  ) +
  theme_classic() +
  labs(
    x = "\n Prevalence (%)",
    y = "Predicted positive predictive value (%) \n"
  ) +
  scale_x_continuous(limits = c(0, 50), breaks = seq(0, 50, 2)) +
  scale_y_continuous(
    limits = c(0, 100), breaks = seq(0, 100, 10), expand = c(0, 0)
  ) +
  # Axis text and axis title styling, set in a single theme() call.
  theme(
    axis.text.x = element_text(size = 12, hjust = .5, vjust = .8, face = "plain"),
    axis.text.y = element_text(size = 12, hjust = .5, vjust = .8, face = "plain"),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold")
  )
p1
The dput for the second data frame is
dat <- structure(list(PPV = c(57, 89, 19, 52, 52, 62, 63, 46, 31, 52,
54, 13, 17, 47, 48, 52, 96, 88, 64, 33, 62, 77, 75, 72), Prevalence = c(19,
35, 12, 16, 24, 6, 28, 13, 8, 19, 30, 6, 8, 20, 11, 25, 29, 55,
46, 13, 16, 22, 23, 20), total = c(939L, 323L, 306L, 703L, 137L,
833L, 360L, 317L, 440L, 2072L, 209L, 386L, 142L, 358L, 167L,
503L, 180L, 233L, 342L, 478L, 4870L, 1104L, 1813L, 1567L),
Author = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L,
15L,18L, 19L, 8L, 14L, 16L, 17L, 21L, 20L, 20L, 13L, 10L),
.Label = c("Aldous",
"Bahrmann", "Body", "Christ ", "Collinson", "Eggers", "Freund",
"Giannitis", "Hammerer-Lercher", "Hoeller", "Inoue", "Invernizi",
"Keller", "Khan", "Lotze", "Melki ", "Normann", "Santalol", "Sebbane",
"Shah", "Thelin "), class = "factor"), Study.assay = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("TnI", "TnT"), class = "factor")),
.Names = c("PPV", "Prevalence", "total", "Author", "Study.assay"),
class ="data.frame", row.names = c(NA, -24L))
And the plot from dataframe 2 is as follows
# Bubble plot of observed PPV against prevalence; bubble area is scaled by the
# study's `total`, and each point is labelled with its Author.
# Columns are referenced by bare name inside aes() (no dat$...), and the stray
# `guide` argument to ggplot() is dropped.
p2 <- ggplot(dat, aes(x = Prevalence, y = PPV, size = total, label = Author)) +
  geom_point(colour = "white", fill = "red", shape = 21) +
  scale_size_area(max_size = 10) +
  geom_text(size = 2.5) +
  theme_classic() +
  ylab("Predicted positive predictive value (%) \n") +
  xlab("\n Prevalence (%)") +
  # Only one x and one y scale are kept: a second scale for the same aesthetic
  # replaces the first, so the earlier pair had no effect on the final plot.
  scale_x_continuous(limits = c(0, 50), breaks = seq(0, 50, 2)) +
  scale_y_continuous(
    limits = c(0, 100), breaks = seq(0, 100, 10), expand = c(0, 0)
  ) +
  theme(
    axis.text.x = element_text(size = 12, hjust = .5, vjust = .8, face = "plain"),
    axis.text.y = element_text(size = 12, hjust = .5, vjust = .8, face = "plain")
  ) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold")
  ) +
  theme(legend.position = 'none')
p2
As you can see both plots have the same axis and limits. I have two questions:
a) Can I overlay plot 2 onto plot 1?
b) Can I make the bubbles on plot 2 more transparent and choose colours by the factor dat$Study.assay (green and purple)?
Many thanks in advance - have spent a day researching this but no solution yet.
Here's a start using your data,
# Overlay both data frames in one plot by giving each layer its own data.
# The outer parentheses print the plot as well as assigning it.
(plot2 <- ggplot() +
  # One line per `variable` in ppv_npv2, coloured by that variable.
  geom_line(
    data = ppv_npv2,
    aes(pred.prob, value, group = variable, colour = variable)
  ) +
  # Semi-transparent bubbles sized by `total` and coloured by Study.assay.
  # The label aesthetic belongs to the text layer only.
  geom_point(
    data = dat,
    aes(Prevalence, PPV, size = total, colour = Study.assay),
    alpha = 0.4
  ) +
  # Author labels at a fixed text size (size is set once, not also mapped).
  geom_text(
    data = dat,
    aes(Prevalence, PPV, label = Author),
    size = 3, hjust = -1, vjust = 0
  )
)
It's not the orthodox ggplot2 way, but it's a start.

specific stripchart with ggplot2

I've got this dataframe
df <- structure(list(rang = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25.5, 25.5, 27.5,
27.5, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42.5,
42.5, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54.5, 54.5, 56,
57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88
), dr = c(164, 176, 260, 297, 308, 313, 327, 333, 339, 365, 396,
403, 404, 410, 413, 414, 422, 424, 440, 442, 443, 451, 477, 496,
530, 530, 546, 546, 548, 565, 567, 574, 576, 587, 590, 603, 619,
626, 629, 630, 642, 653, 653, 660, 667, 670, 677, 682, 689, 711,
716, 737, 763, 772, 772, 776, 778, 792, 794, 820, 835, 838, 842,
855, 861, 888, 890, 899, 906, 908, 969, 1011, 1046, 1058, 1069,
1072, 1074, 1100, 1153, 1348, 1368, 1432, 1468, 1516, 1612, 1712,
1714, 1731), signe = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L), .Label = c("negatif", "positif"), class = "factor")), .Names = c("rang",
"dr", "signe"), row.names = c(NA, -88L), class = "data.frame")
and this chart when I use the stripchart function in base R
stripchart(df[,1]~df[,3],
method="stack", vertical=FALSE, ylim=c(0.5,2.5),
group.names=levels(df[,3]),
xlab="Rang des différences dr", pch=18, cex=1.2)
Can I have the same plot with the library ggplot2?
I used geom_dotplot but I didn't get the same plot. This is an example:
ggplot(data = df, aes(y=df[,1], x=factor(df[,3]))) +
geom_dotplot(binaxis = "y", dotsize = 0.5) +
coord_cartesian(ylim=c(0, 88)) +
scale_y_continuous(breaks=seq(0, 88, 1))
Help me, please!
You have to flip coordinates, and set binwidth = 1 to get the same plot:
# Stacked dot plot of `rang` per `signe` group, drawn horizontally.
ggplot(data = df, aes(y = rang, x = factor(signe))) +
  geom_dotplot(binaxis = "y", dotsize = 0.8, binwidth = 1) +
  scale_y_continuous(name = 'Rang des différences dr') +
  scale_x_discrete(name = '') +
  # A plot keeps only one coordinate system, so the y limits are passed to
  # coord_flip() directly; a separate coord_cartesian() would be replaced.
  coord_flip(ylim = c(0, 88)) +
  theme_bw(base_size = 20)
Is this along the lines of what you were looking for:
# Plain scatter of rang against the signe group. Columns are mapped by name
# inside aes() rather than by position with df[, i].
ggplot(df, aes(x = rang, y = signe)) +
  geom_point() +
  theme_bw()

How to combine multiple outputs in summarise in ddply in r?

For the Orange data frame why doesn't the following work?
library(plyr)
> ddply(Orange, .(Tree), summarise, circum = list(circumference))
Error: unsupported type for column 'circum' (VECSXP)
I want to combine all circumference values in 1 column.
Orange data frame:
> dput(Orange)
structure(list(Tree = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("3",
"1", "5", "2", "4"), class = c("ordered", "factor")), age = c(118,
484, 664, 1004, 1231, 1372, 1582, 118, 484, 664, 1004, 1231,
1372, 1582, 118, 484, 664, 1004, 1231, 1372, 1582, 118, 484,
664, 1004, 1231, 1372, 1582, 118, 484, 664, 1004, 1231, 1372,
1582), circumference = c(30, 58, 87, 115, 120, 142, 145, 33,
69, 111, 156, 172, 203, 203, 30, 51, 75, 108, 115, 139, 140,
32, 62, 112, 167, 179, 209, 214, 30, 49, 81, 125, 142, 174, 177
)), .Names = c("Tree", "age", "circumference"), row.names = c(NA,
35L), class = c("nfnGroupedData", "nfGroupedData", "groupedData",
"data.frame"), formula = circumference ~ age | Tree, labels = structure(list(
x = "Time since December 31, 1968", y = "Trunk circumference"), .Names = c("x",
"y")), units = structure(list(x = "(days)", y = "(mm)"), .Names = c("x",
"y")))

Resources