Is it possible to randomly sample patients by group so that they have similar distributions based on other variables? To me, this sounds like a matching problem, but there's no "treatment" here, so I'm not sure if the concept applies.
Sample data:
structure(list(id = c(8350L, 22543L, 24144L, 9392L, 27648L, 2943L,
34686L, 27153L, 11143L, 15209L, 11952L, 22669L, 8211L, 27765L,
28671L, 9693L, 30274L, 25807L, 14839L, 22400L, 24494L, 6540L,
6861L, 31825L, 34190L, 19606L, 21077L, 5037L, 25943L, 20530L,
23730L, 34774L, 7210L, 2051L, 28410L, 18318L, 34848L, 26596L,
8973L, 24885L, 9652L, 8387L, 16168L, 36893L, 24048L, 17769L,
1273L, 22734L, 36796L, 25497L, 28300L, 166L, 21172L, 20026L,
16265L, 1699L, 33140L, 23997L, 10216L, 27408L, 6813L, 10196L,
15015L, 2748L, 34979L, 21763L, 27438L, 6255L, 17047L, 30593L,
30723L, 7914L, 218L, 20134L, 29952L, 27126L, 3795L, 1367L, 33585L,
5940L, 26250L, 22519L, 35611L, 26168L, 26848L, 21276L, 8971L,
22554L, 16655L, 5315L, 18121L, 32526L, 21513L, 9262L, 36882L,
7408L, 18873L, 17238L, 15216L, 23667L, 30138L, 2978L, 25451L,
2492L, 30983L, 7677L, 22880L, 29674L, 7093L, 24910L, 20839L,
18176L, 23031L, 17197L, 4613L, 35801L, 30822L, 3889L, 11752L,
11314L, 22317L, 12825L, 17433L, 4407L, 3986L, 10173L, 32409L,
2697L, 3410L, 26834L, 3203L, 5474L, 34678L, 35336L, 19462L, 15835L,
7888L, 27897L, 9245L, 16524L, 13316L, 21604L, 30458L, 9191L,
1220L, 1779L, 1724L, 26382L, 11566L, 21310L, 12600L, 25063L,
30912L, 31189L, 9480L, 16804L, 2372L, 26238L, 20113L, 33753L,
32711L, 11543L, 10578L, 4475L, 13187L, 23395L, 35342L, 6903L,
26905L, 12026L, 5697L, 15352L, 33985L, 1132L, 15806L, 13611L,
29930L, 15896L, 6057L, 10849L, 12944L, 25561L, 3328L, 27481L,
28790L, 3260L, 24986L, 22177L, 26580L, 11639L, 2256L, 4839L,
22805L, 616L, 6702L, 18360L, 4439L, 1300L, 33779L, 24940L, 10043L,
21268L, 35127L, 36621L, 17618L, 6688L, 15937L, 31057L, 2144L,
30866L, 12500L, 29753L, 36497L, 21247L, 9481L, 36465L, 20665L,
15017L, 21234L, 34258L, 576L, 31187L, 4528L, 15314L, 3657L, 24489L,
33871L, 106L, 24916L, 2524L, 17469L, 2799L, 13311L, 26585L, 7131L,
21401L, 6191L, 22338L, 11647L, 11681L, 22744L, 14000L, 5356L,
2892L, 24481L, 24116L, 21461L, 13992L, 22751L, 11129L, 8802L,
29963L, 4660L, 29020L, 20843L, 21796L, 3607L, 10692L, 29168L,
25034L, 3307L, 35010L, 20280L, 31894L, 7276L, 24259L, 34059L,
35867L, 11165L, 16010L, 34082L, 26586L, 30958L, 25030L, 34851L,
29185L, 25721L, 8968L, 29427L, 20213L, 34667L, 28721L, 21472L,
17132L, 35247L, 9798L, 36826L, 21226L, 28335L, 16077L, 2654L,
20466L, 21324L, 36969L, 22553L, 5895L, 16514L, 10644L, 4376L,
13592L, 11206L, 32440L, 13413L, 31416L, 22540L, 15986L, 11506L,
16928L, 18652L, 17858L, 13522L, 8566L, 10665L, 29442L, 28219L,
22549L, 2209L, 8017L, 6066L, 21718L, 21930L, 11540L, 4100L, 35236L,
240L, 24900L, 425L, 26880L, 21409L, 18885L, 5803L, 33335L, 25597L,
12547L, 8930L, 4328L, 17360L, 4696L, 25198L, 26469L, 14679L,
1691L, 32989L, 6099L, 14427L, 31797L, 23408L, 29296L, 23928L,
31889L, 31737L, 6420L, 11304L, 34798L, 20785L, 9806L, 35018L,
35008L, 1450L, 3246L, 15123L, 19603L, 8519L, 32012L, 3397L, 11682L,
27102L, 18022L, 20408L, 15836L, 18284L, 12897L, 29580L, 14510L,
23925L, 28821L, 35825L, 14922L, 36643L, 10948L, 4220L, 23791L,
65L, 35772L, 1423L, 29386L, 755L, 23627L, 27201L, 12353L, 3578L,
1914L, 35373L, 16702L, 13057L, 3021L, 27531L, 1990L, 205L, 21559L,
29081L, 26301L, 18894L, 3088L, 9782L, 10522L, 12570L, 8948L,
36240L, 33943L, 33022L, 2750L, 32649L, 30134L, 13920L, 11498L,
8314L, 16849L, 15559L, 22529L, 31406L, 5680L, 17908L, 14931L,
2122L, 2581L, 33546L, 12143L, 17220L, 16713L, 7454L, 13659L,
15973L, 20116L, 27689L, 35285L, 36106L, 21834L, 29850L, 29030L,
7957L, 31698L, 12307L, 23642L, 5615L, 12016L, 1161L, 15291L,
32738L, 1089L, 32988L, 33382L, 3642L, 18661L, 35584L, 8009L,
24000L, 30587L, 25870L, 19944L, 34970L, 29983L, 24774L, 28702L,
21199L, 17292L, 29831L, 476L, 18881L, 29923L, 31476L, 4570L,
31081L, 10544L, 3373L, 13435L, 22651L, 17861L, 3818L, 35387L,
11459L, 35637L, 308L, 35697L, 12696L, 15175L, 7990L, 16691L,
19494L, 9008L, 30695L, 28889L, 446L, 22178L, 13000L, 26166L,
15431L, 19332L, 35991L, 2840L), race_f = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 4L, 2L, 3L, 4L, 1L, 1L, 3L, 3L, 3L, 3L, 1L,
3L, 1L, 3L, 3L, 1L, 1L, 3L, 2L, 2L, 1L, 4L, 5L, 1L, 4L, 1L, 1L,
5L, 1L, 1L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 1L,
2L, 1L, 3L, 1L, 2L, 1L, 1L, 1L, 2L, 3L, 3L, 1L, 1L, 3L, 3L, 3L,
1L, 1L, 1L, 3L, 3L, 2L, 1L, 1L, 3L, 4L, 4L, 1L, 1L, 3L, 1L, 2L,
3L, 4L, 1L, 1L, 1L, 3L, 1L, 1L, 5L, 3L, 1L, 1L, 3L, 2L, 1L, 1L,
3L, 1L, 4L, 1L, 1L, 3L, 1L, 4L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 2L,
3L, 3L, 4L, 4L, 1L, 2L, 1L, 4L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 2L, 3L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 3L,
1L, 2L, 1L, 2L, 2L, 4L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 4L, 1L, 5L,
2L, 1L, 2L, 3L, 1L, 5L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 3L,
1L, 4L, 4L, 3L, 2L, 4L, 2L, 1L, 3L, 3L, 1L, 4L, 3L, 3L, 3L, 1L,
1L, 4L, 1L, 4L, 2L, 3L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 1L,
3L, 4L, 1L, 3L, 1L, 1L, 4L, 3L, 4L, 1L, 3L, 1L, 2L, 4L, 3L, 3L,
1L, 1L, 3L, 1L, 5L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 2L, 1L, 4L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
3L, 1L, 1L, 1L, 4L, 1L, 4L, 3L, 1L, 3L, 2L, 1L, 1L, 2L, 3L, 1L,
4L, 2L, 3L, 1L, 3L, 4L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 5L, 4L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 4L, 1L, 1L, 3L,
1L, 3L, 3L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 2L, 1L, 3L, 1L, 4L,
1L, 4L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 4L, 1L, 1L, 2L, 1L, 1L, 1L,
4L, 1L, 1L, 3L, 3L, 1L, 4L, 3L, 3L, 4L, 1L, 3L, 1L, 5L, 3L, 4L,
1L, 4L, 4L, 1L, 3L, 4L, 1L, 4L, 1L, 1L, 1L, 3L, 2L, 1L, 2L, 4L,
1L, 1L, 5L, 4L, 1L, 1L, 4L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 2L,
1L, 3L, 3L, 3L, 1L, 2L, 3L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 3L, 1L,
1L, 3L, 4L, 1L, 1L, 2L, 5L, 3L, 3L, 1L, 1L, 4L, 1L, 4L, 1L, 4L,
2L, 3L, 3L, 1L, 1L, 1L, 4L, 1L, 4L, 3L, 1L, 1L, 1L, 1L, 3L, 1L,
3L, 1L, 1L, 1L, 1L, 4L, 3L, 4L, 3L, 3L, 3L, 2L, 3L, 1L, 1L, 1L,
3L, 5L, 2L, 5L, 1L, 1L, 4L, 3L, 1L, 3L, 2L, 1L, 1L, 5L, 1L, 3L,
3L, 4L, 1L, 1L, 1L, 2L, 5L, 1L, 1L, 4L, 3L, 1L, 1L, 1L, 2L, 1L,
1L, 3L, 3L, 1L, 1L, 4L, 3L, 2L, 4L, 4L, 1L, 1L, 2L, 3L, 1L, 3L,
3L, 1L), .Label = c("White", "Black", "Hispanic", "Asian", "Other"
), class = "factor"), cops2_avg_12mo = c(82.9166666666667, 66,
23.3333333333333, 28, 9.33333333333333, 69.9166666666667, 6,
33.3333333333333, 0, 12, 102, NA, 66, 6, 45, 58.5, 10, 55.9166666666667,
19.5, 6, 10, 234.666666666667, 28, 23, 51.5833333333333, 10,
38, 123.5, 0, 24, 10, 0, 73, 10, 25, 6, 20, 13.4166666666667,
13.8333333333333, 8, 14.8333333333333, 53.5, 42, NA, 57.1666666666667,
0, 24.6666666666667, 10, NA, 54.6666666666667, 38.75, 41, 22,
0.833333333333333, 13, 113.083333333333, 27.3333333333333, 9,
33.1666666666667, 18.75, 57.75, 30, 60.3333333333333, 23.1666666666667,
37, 16.5, 0, 145.5, 45, 31.3333333333333, 0, 10, 187.5, 27.4166666666667,
10, 54.9166666666667, 78.8333333333333, 103.75, 6.66666666666667,
30.4166666666667, 10, 10, 24.6666666666667, 10, 118.333333333333,
61.25, 17, 10, 28, 51, 6, 32.0833333333333, 80.75, 8.83333333333333,
NA, 10, 74.25, 42.25, 47, 60, 41.6666666666667, 19.0833333333333,
98.5, 73.5, 10, 6.66666666666667, 49.8333333333333, 10, 79.8333333333333,
10, 42, 95.8333333333333, 130.583333333333, 5.41666666666667,
47.25, 6, 8, 17.8333333333333, 10, 73.9166666666667, 10, 8, 27.8333333333333,
125.916666666667, 134.166666666667, 88, 10, 58, 62.5, 10.3333333333333,
28.8333333333333, 100.083333333333, 35.5, 0, 0, 10, 105, 7.33333333333333,
35, 9.66666666666667, 10, 4.16666666666667, 10, 8.33333333333333,
70.6666666666667, 28.4166666666667, 38.1666666666667, 8, 101.5,
26.75, 61.1666666666667, 14, 95.5833333333333, 35, 65, 0, 51.75,
57.5, 10, 13.6666666666667, 10, 67.5, 10, 62.3333333333333, 72.6666666666667,
10, 45.5, 20.8333333333333, 31, 84.5, 10, 98.1666666666667, 47.5,
56, 126, 14, 10, 10, 8, 36, 111.5, 54.5, 45.5, 8, 37.5, 84.8333333333333,
39.1666666666667, 56.25, 37.9166666666667, 37.75, 27, 55.6666666666667,
10, 34, 5.83333333333333, 37, 80.0833333333333, 57, 102.166666666667,
12.6666666666667, 10, 19.3333333333333, 10, NA, 51, 25.9166666666667,
14, 36.9090909090909, 38.6666666666667, 0, 6.33333333333333,
NA, 31, 43, 26.5, 10, 34.4166666666667, 77.1666666666667, 10,
10, 89.9166666666667, 59, 37, 77.3333333333333, 64, 52, 19.6666666666667,
66.5, 24, 106.083333333333, 29.6666666666667, 38.1666666666667,
6.66666666666667, 10, 16.75, NA, 86.75, 1, 14, 20.3333333333333,
8, 21, 38.9166666666667, 50.8333333333333, 57.5, 29, 0, 26.5,
51.9166666666667, 71.25, 42.6666666666667, 82, 58.0833333333333,
11.3333333333333, 82, 9.5, 78.6666666666667, 102.5, 71, 10, 70.6666666666667,
NA, 33.8333333333333, 61.25, 87, 36.5, 10, 40.4166666666667,
51.8333333333333, 23, 9.66666666666667, 44.5, 8, 10, 4.16666666666667,
0, 48.8333333333333, 49.25, 15, 70, 10, 6, 10, 34.8333333333333,
108.75, 36, NA, 31, 51, 69.5, 122.5, 48, 43.5833333333333, NA,
10, 20, 80.75, 54.75, 106.916666666667, 53.5, 90.6666666666667,
8.33333333333333, 85.5, 40.5833333333333, 5.5, 10, 61.3333333333333,
69.8333333333333, 10, 51, 0, 49.0833333333333, 13.6666666666667,
13.3333333333333, 5.83333333333333, 33.8333333333333, 14.4166666666667,
11.25, 14, 6, 14.5833333333333, 36, 21, 10, 29.5833333333333,
13, 34, 10, 2.5, 10, 211.916666666667, 19.75, 7.33333333333333,
6, 59.6666666666667, 30.25, 34.25, 16.1666666666667, 10, NA,
NA, 97, 75, 26.5, 8, 32.25, 0, 39, 37, 165.333333333333, 45,
33.1666666666667, 21, 10, 57, 70.3333333333333, 10, 10, 62, 79.1666666666667,
38, 26.1666666666667, 13, 8, 69.6666666666667, 40.5, 100, 0.833333333333333,
8, 82.5, 10, 19.8333333333333, 20.0833333333333, 8, 25.8333333333333,
16.75, 10, 36, NA, 12.8333333333333, 31.4166666666667, 10, 61.4166666666667,
14, 67.5, 3, 83.1666666666667, 48, 43.75, 35.4166666666667, 73,
44.1666666666667, 8, 29.75, 10, 10, 62.6666666666667, 26.9166666666667,
29.6666666666667, 10, NA, 15, 19.4166666666667, 112, 29, 3, 33.5,
62.5, 10, 84.6666666666667, 8, 84.4166666666667, 81.5, 56.1666666666667,
10, 101.416666666667, 16, 10, 19.6666666666667, 60, 73.6666666666667,
74.9166666666667, 21, 5, 15.0833333333333, 17.0833333333333,
17.5, 46, 61.8333333333333, 115.333333333333, 92, 30, 0, 22.75,
16.6666666666667, 15, 15, 10, NA, 56.25, 54, 10, 40, 9.83333333333333,
10.9166666666667, 22.25, 84.75, 80, 1.66666666666667, 99.8333333333333,
10, 38.6666666666667, 169.75, 35.0833333333333, 8, 78.5, 6.33333333333333,
21, 10, 42, 105.166666666667, 162.416666666667, 14, 69.25, 35.8333333333333,
13, 5.83333333333333, 34, 51, 12.75, 44.3333333333333, 39.5,
10, 23, 46.8333333333333, 89.9166666666667, 15, 28, 128.416666666667,
10, 91.6666666666667, 3.5, 54, 23, NA, 29.75, 37.1666666666667,
12.6666666666667, 31.9166666666667, 23, 0, 11, 67.9166666666667,
3.16666666666667, 8.33333333333333, 51, NA, 10, 0, 58.8333333333333
), AGE = c(86, 82, 83, 92, 45, 81, 52, 64, 71, 96, 79, 64, 76,
37, 81, 79, 72, 79, 74, 46, 45, 71, 89, 76, 53, 48, 52, 77, 63,
52, 57, 62, 84, 88, 55, 69, 67, 63, 67, 51, 86, 53, 65, 59, 71,
60, 70, 20, 78, 62, 58, 73, 68, 71, 66, 72, 71, 65, 95, 67, 79,
70, 86, 77, 81, 54, 44, 66, 80, 71, 30, 77, 67, 75, 48, 65, 83,
85, 70, 70, 74, 58, 81, 28, 78, 66, 79, 47, 74, 41, 74, 58, 73,
55, 53, 56, 84, 74, 62, 85, 68, 47, 78, 72, 57, 56, 64, 55, 86,
76, 77, 58, 74, 55, 71, 61, 74, 62, 65, 75, 81, 68, 39, 58, 65,
76, 27, 79, 86, 61, 87, 52, 72, 58, 53, 69, 78, 65, 81, 69, 66,
68, 61, 72, 74, 80, 88, 46, 53, 77, 89, 83, 41, 67, 83, 62, 90,
70, 60, 62, 33, 78, 80, 62, 81, 37, 55, 90, 81, 73, 67, 97, 32,
71, 70, 69, 46, 57, 60, 79, 79, 56, 75, 60, 52, 78, 61, 51, 70,
67, 71, 36, 53, 70, 53, 74, 89, 78, 70, 56, 58, 83, 50, 77, 70,
50, 75, 53, 86, 65, 45, 63, 62, 78, 65, 69, 75, 79, 71, 56, 88,
63, 72, 85, 68, 72, 45, 81, 46, 70, 84, 71, 82, 63, 57, 77, 70,
42, 87, 84, 61, 64, 79, 53, 65, 64, 69, 68, 71, 89, 49, 70, 82,
63, 79, 65, 64, 54, 73, 36, 80, 38, 68, 62, 84, 80, 65, 73, 91,
59, 35, 80, 67, 68, 65, 47, 60, 67, 72, 81, 22, 35, 58, 57, 68,
94, 38, 77, 75, 73, 78, 71, 78, 53, 58, 61, 77, 44, 95, 53, 72,
68, 72, 73, 78, 41, 75, 80, 60, 53, 68, 79, 80, 74, 25, 79, 55,
68, 85, 64, 72, 78, 78, 71, 73, 82, 73, 73, 58, 69, 58, 72, 78,
56, 74, 67, 66, 72, 38, 58, 62, 77, 81, 37, 46, 88, 55, 76, 50,
57, 72, 39, 56, 29, 76, 77, 36, 31, 70, 70, 70, 54, 74, 47, 81,
46, 81, 55, 53, 70, 28, 71, 79, 68, 78, 81, 30, 83, 43, 70, 79,
47, 94, 60, 64, 82, 81, 92, 57, 90, 86, 58, 61, 69, 50, 64, 79,
56, 76, 52, 55, 53, 85, 89, 64, 86, 58, 82, 64, 74, 45, 64, 71,
75, 61, 79, 82, 63, 81, 60, 70, 79, 63, 59, 80, 53, 80, 41, 83,
67, 90, 60, 82, 74, 75, 52, 62, 35, 53, 49, 71, 69, 73, 67, 44,
77, 81, 96, 52, 75, 30, 83, 74, 56, 62, 78, 63, 63, 62, 71, 62,
89, 83, 77, 66, 64, 24, 96, 63, 51, 65, 71, 50, 68, 83, 82, 90,
91, 84, 90, 76, 62, 79, 20, 75, 79, 80, 62, 62, 71, 51, 81, 84,
65, 65, 55, 65, 51, 26, 70)), row.names = c(NA, -500L), class = c("tbl_df",
"tbl", "data.frame"))
I'm hoping to sample by race_f so that the different race groups are similar in AGE and cops2_avg_12mo. Is this at all possible? Thank you!
The answer depends on whether you want to guarantee that AGE and cops2_avg_12mo always fall within a specific range. If so, simply create a subset of your data frame containing only the patients whose AGE and cops2_avg_12mo are within that range. I think this is the safer approach in terms of quality control. You can plot the two columns of your data (AGE and cops2_avg_12mo) to get an idea of which ranges of values most of the patients fall into:
# Scatter plot of the two matching variables to see where most patients lie.
plot(x[, c("AGE", "cops2_avg_12mo")])
Pick ranges for these values that contain enough patients to sample from. (I don't know how many samples you need). Basically, draw a box in the dot plot which contains enough patients to sample from.
So once you determine the ranges/boundaries of the box, just create indexes like so:
# Logical mask for the patients inside the chosen AGE / cops2_avg_12mo box.
# Columns are extracted with [[ ]] so every comparison yields a plain logical
# vector even when x is a tibble (x[, "AGE"] would stay a one-column tibble).
# The is.na() checks turn missing values into FALSE rather than NA, so the
# mask can be used directly as a row index.
idx <- !is.na(x[["AGE"]]) & !is.na(x[["cops2_avg_12mo"]]) &
  x[["AGE"]] > 50 & x[["AGE"]] < 75 &
  x[["cops2_avg_12mo"]] > 0 & x[["cops2_avg_12mo"]] < 75
then get the subset of your data:
# Keep only the rows selected by idx; all columns are retained.
subsetX <- x[idx, ]
After you create that subset, you can randomly sample using R's sample() function. If you want to sample from each race equally, call sample() on the rows of subsetX for one race at a time, drawing n patients each time:
# sample() applied to a data frame draws from its columns, not its rows, so
# sample row positions instead and use them to pick n patients of one race.
# sample() raises an error if the group holds fewer than n patients.
asian <- subsetX[subsetX[["race_f"]] == "Asian", ]
asian[sample(nrow(asian), n, replace = FALSE), ]
Alternatively, if you are OK with sampling patients that have outlier values (although I suspect this will produce more variation in your results), you can weight the sampling by a histogram of each column - for example, AGE. First, compute the histogram and divide the bin counts by the total number of patients to get a probability distribution. Next, create a vector the same length as the number of patients, where each value is the probability calculated for the bin that patient belongs to (found from the bin indexes obtained when computing the histogram). Finally, pass that vector to the sample() function as the prob argument so that patients are sampled with their specified probability.
I am trying to plot the count of visitors for different days. I would like to use facet_grid to have the plots share a common X-axis and sit directly below each other. Every time I try, the second plot (day 2) ends up on the right. Does somebody know what I have done wrong? Below is the code I am using:
# Visitor counts over time: the average as a line, the min/max bounds as
# faint lines with a shaded ribbon between them, one facet row per day_month.
ggplot(data = count_visitors, mapping = aes(x = date)) +
  geom_line(aes(y = average_count), colour = colour[1], size = 0.5) +
  geom_line(
    aes(y = count_max),
    colour = colour[1], size = 0.5, alpha = "0.2"
  ) +
  geom_line(
    aes(y = count_min),
    colour = colour[1], size = 0.5, alpha = "0.2"
  ) +
  geom_ribbon(
    aes(ymin = count_min, ymax = count_max),
    fill = colour[1], alpha = "0.2"
  ) +
  xlab("Time") +
  ylab("Visitors Count") +
  scale_y_continuous(limits = c(0, 600), breaks = seq(0, 600, by = 100)) +
  # Show only the time of day on the x-axis tick labels.
  scale_x_datetime(labels = date_format("%H:%M")) +
  facet_grid(day_month ~ .)
And this is what the data looks like:
$ date : POSIXct, format: "2017-12-02 07:00:00" "2017-12-02 07:15:00" "2017-12-02 07:30:00" "2017-12-02 07:45:00" ...
$ day_month : int 2 2 2 2 2 2 2 2 2 2 ...
$ average_count: num 1 2 2.5 3.5 9 11 19.5 31.5 62 90.5 ...
$ count_min : num 0 0 0 0 2 4 9 15 39 61 ...
$ count_max : num 2 4 5 7 16 18 30 48 85 120 ...
structure(list(date = structure(c(1512198000, 1512198900, 1512199800,
1512200700, 1512201600, 1512202500, 1512203400, 1512204300, 1512205200,
1512206100, 1512207000, 1512207900, 1512208800, 1512209700, 1512210600,
1512211500, 1512212400, 1512213300, 1512214200, 1512215100, 1512216000,
1512216900, 1512217800, 1512218700, 1512219600, 1512220500, 1512221400,
1512222300, 1512223200, 1512224100, 1512225000, 1512225900, 1512226800,
1512227700, 1512228600, 1512229500, 1512230400, 1512231300, 1512232200,
1512233100, 1512234000, 1512234900, 1512235800, 1512236700, 1512237600,
1512238500, 1512239400, 1512240300, 1512241200, 1512242100, 1512243000,
1512243900, 1512244800, 1512245700, 1512246600, 1512247500, 1512248400,
1512249300, 1512250200, 1512251100, 1512252000, 1512252900, 1512253800,
1512254700, 1512255600, 1512111600, 1512112500, 1512113400, 1512114300,
1512115200, 1512116100, 1512117000, 1512117900, 1512118800, 1512119700,
1512120600, 1512121500, 1512122400, 1512123300, 1512124200, 1512125100,
1512126000, 1512126900, 1512127800, 1512128700, 1512129600, 1512130500,
1512131400, 1512132300, 1512133200, 1512134100, 1512135000, 1512135900,
1512136800, 1512137700, 1512138600, 1512139500, 1512140400, 1512141300,
1512142200, 1512143100, 1512144000, 1512144900, 1512145800, 1512146700,
1512147600, 1512148500, 1512149400, 1512150300, 1512151200, 1512152100,
1512153000, 1512153900, 1512154800, 1512155700, 1512156600, 1512157500,
1512158400, 1512159300, 1512160200, 1512161100, 1512162000, 1512162900,
1512163800, 1512164700, 1512165600, 1512166500, 1512167400, 1512168300,
1512169200), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
day_month = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
count_min = c(0, 0, 0, 0, 2, 4, 9, 15, 39, 61, 78, 95, 110,
121, 135, 151, 154, 173, 175, 187, 201, 227, 239, 254, 265,
275, 278, 288, 293, 290, 290, 293, 295, 299, 297, 284, 279,
278, 255, 250, 231, 224, 204, 184, 178, 170, 161, 149, 146,
148, 153, 150, 138, 127, 119, 112, 91, 79, 53, 40, 29, 15,
11, 9, 5, 1, 1, 1, 1, 1, 2, 3, 5, 14, 16, 26, 35, 58, 67,
89, 114, 141, 159, 183, 187, 198, 208, 207, 206, 209, 209,
204, 194, 180, 175, 156, 142, 145, 133, 128, 121, 104, 100,
85, 74, 75, 81, 93, 106, 104, 116, 121, 137, 151, 153, 159,
168, 165, 159, 156, 144, 119, 102, 84, 60, 35, 23, 17, 15,
10), count_max = c(2, 4, 5, 7, 16, 18, 30, 48, 85, 120, 146,
176, 207, 229, 253, 295, 312, 327, 348, 370, 392, 418, 446,
457, 489, 501, 509, 507, 514, 515, 533, 550, 564, 554, 557,
552, 552, 524, 502, 476, 447, 432, 411, 400, 380, 352, 341,
322, 314, 312, 303, 292, 288, 262, 239, 219, 202, 177, 138,
108, 81, 43, 32, 22, 12, 2, 2, 2, 2, 2, 7, 10, 21, 33, 44,
64, 89, 117, 153, 186, 222, 260, 279, 298, 323, 332, 341,
345, 349, 361, 361, 367, 364, 352, 324, 309, 291, 282, 267,
256, 240, 220, 197, 192, 185, 181, 184, 195, 203, 208, 202,
218, 245, 269, 297, 312, 320, 315, 317, 301, 284, 250, 220,
194, 166, 124, 77, 41, 30, 20), average_count = c(1, 2, 2.5,
3.5, 9, 11, 19.5, 31.5, 62, 90.5, 112, 135.5, 158.5, 175,
194, 223, 233, 250, 261.5, 278.5, 296.5, 322.5, 342.5, 355.5,
377, 388, 393.5, 397.5, 403.5, 402.5, 411.5, 421.5, 429.5,
426.5, 427, 418, 415.5, 401, 378.5, 363, 339, 328, 307.5,
292, 279, 261, 251, 235.5, 230, 230, 228, 221, 213, 194.5,
179, 165.5, 146.5, 128, 95.5, 74, 55, 29, 21.5, 15.5, 8.5,
1.5, 1.5, 1.5, 1.5, 1.5, 4.5, 6.5, 13, 23.5, 30, 45, 62,
87.5, 110, 137.5, 168, 200.5, 219, 240.5, 255, 265, 274.5,
276, 277.5, 285, 285, 285.5, 279, 266, 249.5, 232.5, 216.5,
213.5, 200, 192, 180.5, 162, 148.5, 138.5, 129.5, 128, 132.5,
144, 154.5, 156, 159, 169.5, 191, 210, 225, 235.5, 244, 240,
238, 228.5, 214, 184.5, 161, 139, 113, 79.5, 50, 29, 22.5,
15)), class = "data.frame", row.names = c(NA, -130L))
Example Image
One option is to use facet_wrap instead.
Note that I removed a lot of code that is not needed to reproduce the question. I would recommend having a look at how to create an MCVE (minimal, complete, and verifiable example).
# One panel per day, stacked in a single column. scales = "free_x" gives each
# panel its own x-axis range, so every day spans the full panel width.
ggplot(count_visitors, aes(x = date)) +
  geom_line(aes(y = average_count), size = 0.5) +
  # alpha is a numeric aesthetic, so pass a number rather than a string.
  geom_ribbon(aes(ymin = count_min, ymax = count_max), alpha = 0.2) +
  # ncol = 1 keeps the panels stacked however many days are present.
  facet_wrap(~ day_month, ncol = 1, scales = "free_x")
I have two plots from two different data frames
The DPUT from data frame 1 is as follows
ppv_npv2 <- structure(list(pred.prob = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50), variable = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("ppv_2.5", "ppv_50", "ppv_97.5"), class = "factor"),
value = c(4.8, 9.3, 13.4, 17.2, 20.8, 24.2, 27.3, 30.3, 33.1,
35.7, 38.2, 40.5, 42.8, 44.9, 46.9, 48.8, 50.6, 52.3, 54,
55.6, 57.1, 58.5, 59.9, 61.2, 62.5, 63.7, 64.9, 66, 67.1,
68.2, 69.2, 70.2, 71.1, 72, 72.9, 73.8, 74.6, 75.4, 76.2,
76.9, 77.7, 78.4, 79, 79.7, 80.4, 81, 81.6, 82.2, 82.8, 83.3,
7.2, 13.6, 19.3, 24.4, 28.9, 33, 36.8, 40.2, 43.3, 46.2,
48.9, 51.3, 53.6, 55.7, 57.7, 59.6, 61.3, 62.9, 64.5, 65.9,
67.3, 68.6, 69.8, 70.9, 72, 73.1, 74.1, 75, 75.9, 76.8, 77.6,
78.4, 79.2, 79.9, 80.6, 81.3, 82, 82.6, 83.2, 83.8, 84.3,
84.8, 85.4, 85.9, 86.3, 86.8, 87.3, 87.7, 88.1, 88.5, 11.7,
21.1, 28.8, 35.3, 40.8, 45.5, 49.7, 53.3, 56.4, 59.3, 61.8,
64.1, 66.2, 68.1, 69.8, 71.4, 72.9, 74.2, 75.5, 76.6, 77.7,
78.7, 79.7, 80.5, 81.4, 82.2, 82.9, 83.6, 84.3, 84.9, 85.5,
86, 86.6, 87.1, 87.6, 88.1, 88.5, 88.9, 89.3, 89.7, 90.1,
90.5, 90.8, 91.1, 91.5, 91.8, 92.1, 92.4, 92.6, 92.9)),
.Names =c("pred.prob","variable", "value"), row.names = c(NA, -150L),
class = "data.frame")
The plot that I have created comes from the following code
# Line plot of value against pred.prob: the ppv_50 series in red (linetype 2)
# and the ppv_2.5 / ppv_97.5 series in blue (linetype 4).
p1 <- ggplot(ppv_npv2, aes(x = pred.prob, y = value)) +
  geom_line(
    data = subset(ppv_npv2, variable == "ppv_50"),
    colour = "red", linetype = 2
  ) +
  geom_line(
    data = subset(ppv_npv2, variable == "ppv_2.5"),
    colour = "blue", linetype = 4
  ) +
  geom_line(
    data = subset(ppv_npv2, variable == "ppv_97.5"),
    colour = "blue", linetype = 4
  ) +
  theme_classic() +
  labs(
    x = "\n Prevalence (%)",
    y = "Predicted positive predictive value (%) \n"
  ) +
  scale_x_continuous(limits = c(0, 50), breaks = seq(0, 50, 2)) +
  scale_y_continuous(
    limits = c(0, 100), breaks = seq(0, 100, 10), expand = c(0, 0)
  ) +
  # Axis text and axis titles are styled in a single theme() call.
  theme(
    axis.text.x = element_text(size = 12, hjust = .5, vjust = .8, face = "plain"),
    axis.text.y = element_text(size = 12, hjust = .5, vjust = .8, face = "plain"),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold")
  )
# Print the plot.
p1
The dput for the second data frame is
dat <- structure(list(PPV = c(57, 89, 19, 52, 52, 62, 63, 46, 31, 52,
54, 13, 17, 47, 48, 52, 96, 88, 64, 33, 62, 77, 75, 72), Prevalence = c(19,
35, 12, 16, 24, 6, 28, 13, 8, 19, 30, 6, 8, 20, 11, 25, 29, 55,
46, 13, 16, 22, 23, 20), total = c(939L, 323L, 306L, 703L, 137L,
833L, 360L, 317L, 440L, 2072L, 209L, 386L, 142L, 358L, 167L,
503L, 180L, 233L, 342L, 478L, 4870L, 1104L, 1813L, 1567L),
Author = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L,
15L,18L, 19L, 8L, 14L, 16L, 17L, 21L, 20L, 20L, 13L, 10L),
.Label = c("Aldous",
"Bahrmann", "Body", "Christ ", "Collinson", "Eggers", "Freund",
"Giannitis", "Hammerer-Lercher", "Hoeller", "Inoue", "Invernizi",
"Keller", "Khan", "Lotze", "Melki ", "Normann", "Santalol", "Sebbane",
"Shah", "Thelin "), class = "factor"), Study.assay = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("TnI", "TnT"), class = "factor")),
.Names = c("PPV", "Prevalence", "total", "Author", "Study.assay"),
class ="data.frame", row.names = c(NA, -24L))
And the plot from dataframe 2 is as follows
# Bubble plot of PPV against Prevalence for each row of dat: bubble area is
# scaled by total and each bubble is labelled with its Author.
# Columns are referenced by bare name inside aes() (not dat$...), so the
# mapping is evaluated against whatever data the layer actually uses.
p2 <- ggplot(dat, aes(x = Prevalence, y = PPV, size = total, label = Author)) +
  geom_point(colour = "white", fill = "red", shape = 21) +
  scale_size_area(max_size = 10) +
  # The fixed size here overrides the size = total mapping for the labels.
  geom_text(size = 2.5) +
  theme_classic() +
  ylab("Predicted positive predictive value (%) \n") +
  xlab("\n Prevalence (%)") +
  # Only one x and one y scale are declared: an earlier pair of
  # scale_*_continuous() calls was replaced by this pair and had no effect.
  # NOTE: with limits = c(0, 50), rows whose Prevalence exceeds 50 fall
  # outside the x scale and are not drawn.
  scale_x_continuous(limits = c(0, 50), breaks = seq(0, 50, 2)) +
  scale_y_continuous(
    limits = c(0, 100), breaks = seq(0, 100, 10), expand = c(0, 0)
  ) +
  theme(
    axis.text.x = element_text(size = 12, hjust = .5, vjust = .8, face = "plain"),
    axis.text.y = element_text(size = 12, hjust = .5, vjust = .8, face = "plain")
  ) +
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold")
  ) +
  theme(legend.position = "none")
# Print the plot.
p2
As you can see both plots have the same axis and limits. I have two questions:
a) Can I overlay plot 2 onto plot 1?
b) Can I make the bubbles on plot 2 more transparent and choose colours by the factor dat$Study.assay (green and purple)?
Many thanks in advance - have spent a day researching this but no solution yet.
Here's a start using your data,
# Overlay the bubbles from dat on the lines from ppv_npv2 by giving each
# layer its own data instead of sharing one data frame in ggplot().
# The outer parentheses print the plot as well as assigning it.
(plot2 <- ggplot() +
  geom_line(
    data = ppv_npv2,
    aes(pred.prob, value, group = variable, colour = variable)
  ) +
  # Semi-transparent bubbles, sized by total and coloured by Study.assay.
  # geom_point() has no label aesthetic, so none is mapped here.
  geom_point(
    data = dat,
    aes(Prevalence, PPV, size = total, colour = Study.assay),
    alpha = 0.4
  ) +
  # Labels use a fixed text size, so size is not mapped to total here.
  geom_text(
    data = dat,
    aes(Prevalence, PPV, label = Author),
    size = 3, hjust = -1, vjust = 0
  )
)
It's not the orthodox ggplot2 way, but it's a start.