I get an error when analyzing "Simple slope" with an R function - r

I tried to run a "simple slopes" analysis for a moderated regression
using the interactions package,
but it turns out it doesn't work.
I've already searched on Google, but it seems no one has had the same problem.
# Install the package only when it is missing, rather than on every run.
if (!requireNamespace("interactions", quietly = TRUE)) {
  install.packages("interactions", dependencies = TRUE)
}
library(interactions)

# Additive model vs. the model that adds the malehappy:femalehappy interaction.
out1 <- lm(timetogether ~ malehappy + femalehappy, data = df)
out2 <- lm(timetogether ~ malehappy * femalehappy, data = df)
summary(out1)
summary(out2)

# Compare the two nested models (does the interaction term help?).
anova(out1, out2)

# Simple slopes of malehappy at levels of the moderator femalehappy.
# NOTE: on R < 3.5 this call stops with
# 'could not find function "isFALSE"'; isFALSE() is in base R from 3.5 on.
sim_slopes(out2, pred = "malehappy", modx = "femalehappy")
When I call sim_slopes(out2, pred = ...),
it returns:
"Error in isFALSE(row.names) : could not find function "isFALSE""
Some people may be able to run sim_slopes() without any error,
but not me...
What should I do to resolve it, or to diagnose it?
Thank you
and here, is the output of dput(df)
structure(list(malehappy = structure(c(62, 53, 55, 36, 60, 50, 45,
53, 48, 50, 63, 46, 72, 40, 40, 30, 49, 49, 45, 59, 46.1513513513514,
51, 36, 47, 53, 65, 46, 39, 41, 56, 54, 41, 36, 46.1513513513514, 51,
50, 47, 56, 44, 42, 61, 44, 47, 55, 57, 55, 32, 62, 53, 60, 59, 65,
49, 49, 60, 56, 67, 54, 46.1513513513514, 46.1513513513514,
46.1513513513514, 34, 57, 61, 73, 42, 84, 46.1513513513514, 47, 43, 46.1513513513514, 59, 40, 42, 49, 55, 46, 56, 50, 48, 57, 50, 53, 46.1513513513514, 50, 46.1513513513514, 61, 64, 48, 42, 31, 71, 54, 29, 45, 56, 53, 56, 47, 48, 39, 58, 51, 48, 54, 52, 57, 89, 53, 53,
44, 53, 40, 47, 40, 47, 54, 69, 60, 56, 47, 65, 50, 29, 58, 50,
46.1513513513514, 39, 66, 50, 46.1513513513514, 47, 38, 50, 70, 36, 59, 71, 41, 54, 18, 46.1513513513514, 38, 29, 71, 46.1513513513514,
51, 46, 48, 61, 52, 41, 48, 44, 37, 43, 54, 56, 44, 55, 51, 64, 52,
38, 48, 60, 45, 43, 44, 39, 54, 56, 47, 53, 51, 43, 49, 50, 56, 41,
37, 49, 59, 60, 72, 31, 58, 52, 49, 58, 60, 52, 47, 65, 63, 67,
46.1513513513514, 54, 60,
46.1513513513514, 52, 43, 45, 26, 50, 40, 35, 43, 38, 40, 53, 36, 62, 30, 30, 46.1513513513514, 39, 39, 35, 49, 34, 41, 26, 37, 43, 55, 36,
29, 31, 46, 44, 31, 26, 28, 41, 40, 37, 46, 34,
46.1513513513514, 51, 34, 37, 45, 47, 45, 22, 52, 43, 50, 49, 55, 39, 39, 50, 46, 46.1513513513514, 44, 46.1513513513514, 43,
46.1513513513514, 24, 47, 51, 63, 32, 74, 24, 37, 33, 42, 49, 30, 32, 39, 45, 36, 46, 40, 46.1513513513514, 47, 40, 43, 58, 40, 47, 51, 54,
38, 32, 21, 61, 44, 19, 35, 46, 43, 46, 37, 38, 29, 48, 41, 38, 44,
42, 47, 79, 43, 43, 34, 43, 30, 37, 30, 37, 44, 59, 50, 46,
46.1513513513514, 55, 40, 19, 48, 40, 37, 29, 56, 40, 49, 37, 28, 46.1513513513514, 60, 26, 49, 61, 31, 44, 8, 36, 28, 19, 61, 38, 41, 36, 38, 51, 42, 31, 38, 34, 27, 33, 44, 46, 46.1513513513514,
46.1513513513514, 46.1513513513514, 54, 42, 28, 38, 50, 35, 46.1513513513514, 34, 29, 46.1513513513514, 46, 37, 43, 41, 33, 39, 40, 46, 31, 27, 39, 49, 46.1513513513514, 62, 46.1513513513514, 48,
42, 39, 48, 50, 42, 37, 55, 53, 57, 44, 44, 50, 52), imputed = c(21L,
34L, 59L, 60L, 61L, 68L, 71L, 84L, 86L, 127L, 131L, 142L, 146L, 197L,
200L, 216L, 240L, 257L, 259L, 261L, 280L, 321L, 334L, 359L, 360L,
361L, 368L, 371L, 384L, 386L), class = "impute"), femalehappy =
structure(c(59, 54, 51, 35, 50, 55.5978260869565, 45, 59, 49, 63, 53,
57, 65, 38, 45, 45, 34, 48, 35, 89, 45, 53, 46, 30, 54, 59, 31, 44,
37, 55, 46, 63, 41, 43, 57, 65, 41, 67, 52, 55, 69, 41, 55, 37, 50,
39, 23, 63, 63, 47, 53, 52, 37, 51, 52, 34, 58, 55, 55.5978260869565,
60, 55.5978260869565, 42, 42, 55.5978260869565, 55, 39, 71,
55.5978260869565, 41, 51, 38, 38, 44, 72, 57, 44, 45, 57, 56, 43, 55.5978260869565, 51, 46, 64, 64, 65, 74, 58, 54, 51, 45, 61, 56, 39, 48, 49, 57, 56, 39, 51, 35, 42, 49, 43, 43, 53, 64, 67, 43, 54, 49,
57, 43, 44, 57, 48, 64, 56, 57, 69, 55.5978260869565, 65, 65, 37, 52,
50, 55.5978260869565, 55.5978260869565, 61, 57, 55.5978260869565, 46,
62, 55, 66, 50, 70, 63, 44, 62, 36, 55.5978260869565, 23, 47, 54,
55.5978260869565, 41, 40, 57, 40, 61, 45, 57, 30, 40, 42, 55.5978260869565, 57, 45, 44, 46, 48, 33, 45, 49, 55, 47, 40, 47, 42, 60, 55.5978260869565, 38, 55.5978260869565, 41, 55, 36, 52, 50, 36,
44, 50, 59, 59, 55.5978260869565, 49, 62, 57, 37, 59, 63, 43, 38, 63,
53, 58, 60, 47, 49, 55.5978260869565, 69, 64, 61, 45, 60, 61, 55, 69,
59, 73, 63, 67, 75, 48, 55, 55.5978260869565, 44, 58, 45, 99, 55, 63,
56, 40, 64, 69, 55.5978260869565, 54, 47, 65, 56, 73, 51, 53, 67, 75,
51, 77, 62, 55.5978260869565, 79, 51, 65, 47, 60, 49, 33, 73, 73,
55.5978260869565, 63, 62, 47, 61, 62, 44, 68, 65, 55.5978260869565, 70, 55.5978260869565, 52, 52, 64, 65, 49, 81, 48, 51, 61, 48, 48, 54,
55.5978260869565, 67, 54, 55, 67, 66, 55.5978260869565, 55.5978260869565, 61, 56, 74, 74, 75, 84, 68, 64, 61, 55, 71, 66, 49, 58, 59, 67, 66, 49, 61, 45, 52, 59, 53, 53, 55.5978260869565, 74, 77,
53, 64, 59, 67, 53, 54, 67, 58, 74, 66, 67, 79, 57, 75, 75, 47, 62,
60, 57, 42, 71, 67, 63, 56, 72, 65, 76, 60, 80, 73, 54, 72, 46, 57,
33, 57, 64, 72, 51, 50, 67, 50, 71, 55, 67, 40, 50, 52, 56, 67,
55.5978260869565, 54, 55.5978260869565, 58, 43, 55.5978260869565, 59, 65, 57, 55.5978260869565, 57, 52, 70, 56, 48, 65, 51, 65, 46, 62, 60,
46, 55.5978260869565, 60, 69, 69, 84, 59, 72, 67, 47, 69, 73, 53, 48,
73, 63, 68, 70, 57, 59, 72), imputed = c(6L, 59L, 61L, 64L, 68L, 81L,
121L, 127L, 128L, 131L, 142L, 146L, 157L, 172L, 174L, 185L, 200L,
216L, 227L, 240L, 250L, 259L, 261L, 274L, 280L, 281L, 306L, 359L,
361L, 364L, 368L, 381L), class = "impute"), timetogether =
structure(c(132, 89, 86, 19, 96, 74, 47, 91.7415143603133, 62, 104,
114, 76, 195, 27, 39, 18, 30, 63, 28, 91.7415143603133, 45, 79, 29,
18, 89, 145, 20, 34, 26, 101, 69, 70, 25, 32, 93, 107, 43, 136, 60,
59, 165, 37, 73, 43, 89, 49, 6, 146, 91.7415143603133, 85,
91.7415143603133, 115, 36, 71, 103, 35, 145, 93, 37, 104, 69, 91.7415143603133, 64, 114, 152, 31, 91.7415143603133, 20, 43, 54, 43, 51, 36, 87, 85, 65, 50, 109, 85, 48, 89, 74, 67, 178, 105, 136, 186,
138, 75, 51, 19, 172, 96, 14, 55, 84, 98, 91.7415143603133, 38, 68,
22, 64, 70, 49, 60, 82, 132, 277, 60, 89, 54, 98, 36, 51, 57, 58,
122, 142, 118, 146, 57, 165, 109, 13, 95, 70, 55, 17, 153, 88, 103,
52, 58, 82, 190, 36, 162, 184, 38, 91.7415143603133, 0, 56, 5, 17,
139, 90, 48, 39, 82, 61, 103, 41, 82, 16, 26, 38, 68, 108, 45, 66,
61, 98, 29, 34, 64, 114, 51, 35, 51, 30, 109, 74, 35, 89, 50,
91.7415143603133, 34, 75, 85, 26, 31, 67, 122, 128, 237, 21, 130, 95, 36, 123, 141, 55, 37, 158, 116, 145, 109, 72, 92, 91.7415143603133,
164, 113, 120, 47, 137, 100, 73, 119, 88, 111, 157, 87, 231, 57, 59,
23, 78, 91, 71, 205, 71, 103, 41, 70, 116, 181, 70, 54, 60, 130, 108,
63, 43, 51, 111, 111, 79, 147, 75, 65, 179, 69, 87, 97, 127, 101, 47,
171, 124, 130, 139, 163, 81, 95, 142, 95, 185, 118, 66, 121, 96, 39,
113, 151, 206, 63, 325, 41, 79, 71, 90, 110, 56, 69, 101, 109, 79,
134, 103, 82, 125, 99, 103, 211, 110, 150, 194, 175, 93, 66, 25, 214,
120, 26, 78, 121, 120, 132, 78, 91, 55, 114, 100, 83, 103, 108, 148,
91.7415143603133, 102, 115, 74, 120, 59, 83, 59, 84, 134, 189, 150, 91.7415143603133, 85, 193, 114, 28, 131, 98, 83, 55, 188, 105, 138, 81, 49, 102, 223, 42, 172, 222, 61, 132, 0, 81, 54, 16, 191, 96, 90,
74, 91.7415143603133, 119, 91.7415143603133, 62, 97, 64, 50, 67, 106,
133, 71, 109, 96, 91.7415143603133, 84, 51, 89, 149,
91.7415143603133, 68, 74, 53, 128, 116, 76, 113, 91, 70, 80, 98, 121, 60, 48, 92, 149, 157, 262, 21, 151, 114, 81, 148, 164, 97, 78, 188,
158, 186, 126, 91.7415143603133, 136, 174), imputed = c(8L, 20L, 49L,
51L, 62L, 67L, 98L, 140L, 176L, 200L, 308L, 320L, 349L, 351L, 362L,
367L, 398L), class = "impute"),
kids = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("nokids", "kids"), class = "factor")), .Names = c("malehappy", "femalehappy", "timetogether", "kids"), row.names =
c(NA, -400L ), class = "data.frame")

It is related to the R version. The function isFALSE() has been built into base R only since version 3.5. I was getting the same error with the package "jtools" when using R 3.4. After I upgraded R to version 3.6, the problem was gone.

Related

Randomly sample by group based on numeric variables

Is it possible to randomly sample patients by group so that they have similar distributions based on other variables? To me, this sounds like a matching problem, but there's no "treatment" here, so I'm not sure if the concept applies.
Sample data:
structure(list(id = c(8350L, 22543L, 24144L, 9392L, 27648L, 2943L,
34686L, 27153L, 11143L, 15209L, 11952L, 22669L, 8211L, 27765L,
28671L, 9693L, 30274L, 25807L, 14839L, 22400L, 24494L, 6540L,
6861L, 31825L, 34190L, 19606L, 21077L, 5037L, 25943L, 20530L,
23730L, 34774L, 7210L, 2051L, 28410L, 18318L, 34848L, 26596L,
8973L, 24885L, 9652L, 8387L, 16168L, 36893L, 24048L, 17769L,
1273L, 22734L, 36796L, 25497L, 28300L, 166L, 21172L, 20026L,
16265L, 1699L, 33140L, 23997L, 10216L, 27408L, 6813L, 10196L,
15015L, 2748L, 34979L, 21763L, 27438L, 6255L, 17047L, 30593L,
30723L, 7914L, 218L, 20134L, 29952L, 27126L, 3795L, 1367L, 33585L,
5940L, 26250L, 22519L, 35611L, 26168L, 26848L, 21276L, 8971L,
22554L, 16655L, 5315L, 18121L, 32526L, 21513L, 9262L, 36882L,
7408L, 18873L, 17238L, 15216L, 23667L, 30138L, 2978L, 25451L,
2492L, 30983L, 7677L, 22880L, 29674L, 7093L, 24910L, 20839L,
18176L, 23031L, 17197L, 4613L, 35801L, 30822L, 3889L, 11752L,
11314L, 22317L, 12825L, 17433L, 4407L, 3986L, 10173L, 32409L,
2697L, 3410L, 26834L, 3203L, 5474L, 34678L, 35336L, 19462L, 15835L,
7888L, 27897L, 9245L, 16524L, 13316L, 21604L, 30458L, 9191L,
1220L, 1779L, 1724L, 26382L, 11566L, 21310L, 12600L, 25063L,
30912L, 31189L, 9480L, 16804L, 2372L, 26238L, 20113L, 33753L,
32711L, 11543L, 10578L, 4475L, 13187L, 23395L, 35342L, 6903L,
26905L, 12026L, 5697L, 15352L, 33985L, 1132L, 15806L, 13611L,
29930L, 15896L, 6057L, 10849L, 12944L, 25561L, 3328L, 27481L,
28790L, 3260L, 24986L, 22177L, 26580L, 11639L, 2256L, 4839L,
22805L, 616L, 6702L, 18360L, 4439L, 1300L, 33779L, 24940L, 10043L,
21268L, 35127L, 36621L, 17618L, 6688L, 15937L, 31057L, 2144L,
30866L, 12500L, 29753L, 36497L, 21247L, 9481L, 36465L, 20665L,
15017L, 21234L, 34258L, 576L, 31187L, 4528L, 15314L, 3657L, 24489L,
33871L, 106L, 24916L, 2524L, 17469L, 2799L, 13311L, 26585L, 7131L,
21401L, 6191L, 22338L, 11647L, 11681L, 22744L, 14000L, 5356L,
2892L, 24481L, 24116L, 21461L, 13992L, 22751L, 11129L, 8802L,
29963L, 4660L, 29020L, 20843L, 21796L, 3607L, 10692L, 29168L,
25034L, 3307L, 35010L, 20280L, 31894L, 7276L, 24259L, 34059L,
35867L, 11165L, 16010L, 34082L, 26586L, 30958L, 25030L, 34851L,
29185L, 25721L, 8968L, 29427L, 20213L, 34667L, 28721L, 21472L,
17132L, 35247L, 9798L, 36826L, 21226L, 28335L, 16077L, 2654L,
20466L, 21324L, 36969L, 22553L, 5895L, 16514L, 10644L, 4376L,
13592L, 11206L, 32440L, 13413L, 31416L, 22540L, 15986L, 11506L,
16928L, 18652L, 17858L, 13522L, 8566L, 10665L, 29442L, 28219L,
22549L, 2209L, 8017L, 6066L, 21718L, 21930L, 11540L, 4100L, 35236L,
240L, 24900L, 425L, 26880L, 21409L, 18885L, 5803L, 33335L, 25597L,
12547L, 8930L, 4328L, 17360L, 4696L, 25198L, 26469L, 14679L,
1691L, 32989L, 6099L, 14427L, 31797L, 23408L, 29296L, 23928L,
31889L, 31737L, 6420L, 11304L, 34798L, 20785L, 9806L, 35018L,
35008L, 1450L, 3246L, 15123L, 19603L, 8519L, 32012L, 3397L, 11682L,
27102L, 18022L, 20408L, 15836L, 18284L, 12897L, 29580L, 14510L,
23925L, 28821L, 35825L, 14922L, 36643L, 10948L, 4220L, 23791L,
65L, 35772L, 1423L, 29386L, 755L, 23627L, 27201L, 12353L, 3578L,
1914L, 35373L, 16702L, 13057L, 3021L, 27531L, 1990L, 205L, 21559L,
29081L, 26301L, 18894L, 3088L, 9782L, 10522L, 12570L, 8948L,
36240L, 33943L, 33022L, 2750L, 32649L, 30134L, 13920L, 11498L,
8314L, 16849L, 15559L, 22529L, 31406L, 5680L, 17908L, 14931L,
2122L, 2581L, 33546L, 12143L, 17220L, 16713L, 7454L, 13659L,
15973L, 20116L, 27689L, 35285L, 36106L, 21834L, 29850L, 29030L,
7957L, 31698L, 12307L, 23642L, 5615L, 12016L, 1161L, 15291L,
32738L, 1089L, 32988L, 33382L, 3642L, 18661L, 35584L, 8009L,
24000L, 30587L, 25870L, 19944L, 34970L, 29983L, 24774L, 28702L,
21199L, 17292L, 29831L, 476L, 18881L, 29923L, 31476L, 4570L,
31081L, 10544L, 3373L, 13435L, 22651L, 17861L, 3818L, 35387L,
11459L, 35637L, 308L, 35697L, 12696L, 15175L, 7990L, 16691L,
19494L, 9008L, 30695L, 28889L, 446L, 22178L, 13000L, 26166L,
15431L, 19332L, 35991L, 2840L), race_f = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 4L, 2L, 3L, 4L, 1L, 1L, 3L, 3L, 3L, 3L, 1L,
3L, 1L, 3L, 3L, 1L, 1L, 3L, 2L, 2L, 1L, 4L, 5L, 1L, 4L, 1L, 1L,
5L, 1L, 1L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 1L,
2L, 1L, 3L, 1L, 2L, 1L, 1L, 1L, 2L, 3L, 3L, 1L, 1L, 3L, 3L, 3L,
1L, 1L, 1L, 3L, 3L, 2L, 1L, 1L, 3L, 4L, 4L, 1L, 1L, 3L, 1L, 2L,
3L, 4L, 1L, 1L, 1L, 3L, 1L, 1L, 5L, 3L, 1L, 1L, 3L, 2L, 1L, 1L,
3L, 1L, 4L, 1L, 1L, 3L, 1L, 4L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 2L,
3L, 3L, 4L, 4L, 1L, 2L, 1L, 4L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 2L, 3L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 3L,
1L, 2L, 1L, 2L, 2L, 4L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 4L, 1L, 5L,
2L, 1L, 2L, 3L, 1L, 5L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 3L,
1L, 4L, 4L, 3L, 2L, 4L, 2L, 1L, 3L, 3L, 1L, 4L, 3L, 3L, 3L, 1L,
1L, 4L, 1L, 4L, 2L, 3L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 1L,
3L, 4L, 1L, 3L, 1L, 1L, 4L, 3L, 4L, 1L, 3L, 1L, 2L, 4L, 3L, 3L,
1L, 1L, 3L, 1L, 5L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 2L, 1L, 4L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
3L, 1L, 1L, 1L, 4L, 1L, 4L, 3L, 1L, 3L, 2L, 1L, 1L, 2L, 3L, 1L,
4L, 2L, 3L, 1L, 3L, 4L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 5L, 4L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 4L, 1L, 1L, 3L,
1L, 3L, 3L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 2L, 1L, 3L, 1L, 4L,
1L, 4L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 4L, 1L, 1L, 2L, 1L, 1L, 1L,
4L, 1L, 1L, 3L, 3L, 1L, 4L, 3L, 3L, 4L, 1L, 3L, 1L, 5L, 3L, 4L,
1L, 4L, 4L, 1L, 3L, 4L, 1L, 4L, 1L, 1L, 1L, 3L, 2L, 1L, 2L, 4L,
1L, 1L, 5L, 4L, 1L, 1L, 4L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 2L,
1L, 3L, 3L, 3L, 1L, 2L, 3L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 3L, 1L,
1L, 3L, 4L, 1L, 1L, 2L, 5L, 3L, 3L, 1L, 1L, 4L, 1L, 4L, 1L, 4L,
2L, 3L, 3L, 1L, 1L, 1L, 4L, 1L, 4L, 3L, 1L, 1L, 1L, 1L, 3L, 1L,
3L, 1L, 1L, 1L, 1L, 4L, 3L, 4L, 3L, 3L, 3L, 2L, 3L, 1L, 1L, 1L,
3L, 5L, 2L, 5L, 1L, 1L, 4L, 3L, 1L, 3L, 2L, 1L, 1L, 5L, 1L, 3L,
3L, 4L, 1L, 1L, 1L, 2L, 5L, 1L, 1L, 4L, 3L, 1L, 1L, 1L, 2L, 1L,
1L, 3L, 3L, 1L, 1L, 4L, 3L, 2L, 4L, 4L, 1L, 1L, 2L, 3L, 1L, 3L,
3L, 1L), .Label = c("White", "Black", "Hispanic", "Asian", "Other"
), class = "factor"), cops2_avg_12mo = c(82.9166666666667, 66,
23.3333333333333, 28, 9.33333333333333, 69.9166666666667, 6,
33.3333333333333, 0, 12, 102, NA, 66, 6, 45, 58.5, 10, 55.9166666666667,
19.5, 6, 10, 234.666666666667, 28, 23, 51.5833333333333, 10,
38, 123.5, 0, 24, 10, 0, 73, 10, 25, 6, 20, 13.4166666666667,
13.8333333333333, 8, 14.8333333333333, 53.5, 42, NA, 57.1666666666667,
0, 24.6666666666667, 10, NA, 54.6666666666667, 38.75, 41, 22,
0.833333333333333, 13, 113.083333333333, 27.3333333333333, 9,
33.1666666666667, 18.75, 57.75, 30, 60.3333333333333, 23.1666666666667,
37, 16.5, 0, 145.5, 45, 31.3333333333333, 0, 10, 187.5, 27.4166666666667,
10, 54.9166666666667, 78.8333333333333, 103.75, 6.66666666666667,
30.4166666666667, 10, 10, 24.6666666666667, 10, 118.333333333333,
61.25, 17, 10, 28, 51, 6, 32.0833333333333, 80.75, 8.83333333333333,
NA, 10, 74.25, 42.25, 47, 60, 41.6666666666667, 19.0833333333333,
98.5, 73.5, 10, 6.66666666666667, 49.8333333333333, 10, 79.8333333333333,
10, 42, 95.8333333333333, 130.583333333333, 5.41666666666667,
47.25, 6, 8, 17.8333333333333, 10, 73.9166666666667, 10, 8, 27.8333333333333,
125.916666666667, 134.166666666667, 88, 10, 58, 62.5, 10.3333333333333,
28.8333333333333, 100.083333333333, 35.5, 0, 0, 10, 105, 7.33333333333333,
35, 9.66666666666667, 10, 4.16666666666667, 10, 8.33333333333333,
70.6666666666667, 28.4166666666667, 38.1666666666667, 8, 101.5,
26.75, 61.1666666666667, 14, 95.5833333333333, 35, 65, 0, 51.75,
57.5, 10, 13.6666666666667, 10, 67.5, 10, 62.3333333333333, 72.6666666666667,
10, 45.5, 20.8333333333333, 31, 84.5, 10, 98.1666666666667, 47.5,
56, 126, 14, 10, 10, 8, 36, 111.5, 54.5, 45.5, 8, 37.5, 84.8333333333333,
39.1666666666667, 56.25, 37.9166666666667, 37.75, 27, 55.6666666666667,
10, 34, 5.83333333333333, 37, 80.0833333333333, 57, 102.166666666667,
12.6666666666667, 10, 19.3333333333333, 10, NA, 51, 25.9166666666667,
14, 36.9090909090909, 38.6666666666667, 0, 6.33333333333333,
NA, 31, 43, 26.5, 10, 34.4166666666667, 77.1666666666667, 10,
10, 89.9166666666667, 59, 37, 77.3333333333333, 64, 52, 19.6666666666667,
66.5, 24, 106.083333333333, 29.6666666666667, 38.1666666666667,
6.66666666666667, 10, 16.75, NA, 86.75, 1, 14, 20.3333333333333,
8, 21, 38.9166666666667, 50.8333333333333, 57.5, 29, 0, 26.5,
51.9166666666667, 71.25, 42.6666666666667, 82, 58.0833333333333,
11.3333333333333, 82, 9.5, 78.6666666666667, 102.5, 71, 10, 70.6666666666667,
NA, 33.8333333333333, 61.25, 87, 36.5, 10, 40.4166666666667,
51.8333333333333, 23, 9.66666666666667, 44.5, 8, 10, 4.16666666666667,
0, 48.8333333333333, 49.25, 15, 70, 10, 6, 10, 34.8333333333333,
108.75, 36, NA, 31, 51, 69.5, 122.5, 48, 43.5833333333333, NA,
10, 20, 80.75, 54.75, 106.916666666667, 53.5, 90.6666666666667,
8.33333333333333, 85.5, 40.5833333333333, 5.5, 10, 61.3333333333333,
69.8333333333333, 10, 51, 0, 49.0833333333333, 13.6666666666667,
13.3333333333333, 5.83333333333333, 33.8333333333333, 14.4166666666667,
11.25, 14, 6, 14.5833333333333, 36, 21, 10, 29.5833333333333,
13, 34, 10, 2.5, 10, 211.916666666667, 19.75, 7.33333333333333,
6, 59.6666666666667, 30.25, 34.25, 16.1666666666667, 10, NA,
NA, 97, 75, 26.5, 8, 32.25, 0, 39, 37, 165.333333333333, 45,
33.1666666666667, 21, 10, 57, 70.3333333333333, 10, 10, 62, 79.1666666666667,
38, 26.1666666666667, 13, 8, 69.6666666666667, 40.5, 100, 0.833333333333333,
8, 82.5, 10, 19.8333333333333, 20.0833333333333, 8, 25.8333333333333,
16.75, 10, 36, NA, 12.8333333333333, 31.4166666666667, 10, 61.4166666666667,
14, 67.5, 3, 83.1666666666667, 48, 43.75, 35.4166666666667, 73,
44.1666666666667, 8, 29.75, 10, 10, 62.6666666666667, 26.9166666666667,
29.6666666666667, 10, NA, 15, 19.4166666666667, 112, 29, 3, 33.5,
62.5, 10, 84.6666666666667, 8, 84.4166666666667, 81.5, 56.1666666666667,
10, 101.416666666667, 16, 10, 19.6666666666667, 60, 73.6666666666667,
74.9166666666667, 21, 5, 15.0833333333333, 17.0833333333333,
17.5, 46, 61.8333333333333, 115.333333333333, 92, 30, 0, 22.75,
16.6666666666667, 15, 15, 10, NA, 56.25, 54, 10, 40, 9.83333333333333,
10.9166666666667, 22.25, 84.75, 80, 1.66666666666667, 99.8333333333333,
10, 38.6666666666667, 169.75, 35.0833333333333, 8, 78.5, 6.33333333333333,
21, 10, 42, 105.166666666667, 162.416666666667, 14, 69.25, 35.8333333333333,
13, 5.83333333333333, 34, 51, 12.75, 44.3333333333333, 39.5,
10, 23, 46.8333333333333, 89.9166666666667, 15, 28, 128.416666666667,
10, 91.6666666666667, 3.5, 54, 23, NA, 29.75, 37.1666666666667,
12.6666666666667, 31.9166666666667, 23, 0, 11, 67.9166666666667,
3.16666666666667, 8.33333333333333, 51, NA, 10, 0, 58.8333333333333
), AGE = c(86, 82, 83, 92, 45, 81, 52, 64, 71, 96, 79, 64, 76,
37, 81, 79, 72, 79, 74, 46, 45, 71, 89, 76, 53, 48, 52, 77, 63,
52, 57, 62, 84, 88, 55, 69, 67, 63, 67, 51, 86, 53, 65, 59, 71,
60, 70, 20, 78, 62, 58, 73, 68, 71, 66, 72, 71, 65, 95, 67, 79,
70, 86, 77, 81, 54, 44, 66, 80, 71, 30, 77, 67, 75, 48, 65, 83,
85, 70, 70, 74, 58, 81, 28, 78, 66, 79, 47, 74, 41, 74, 58, 73,
55, 53, 56, 84, 74, 62, 85, 68, 47, 78, 72, 57, 56, 64, 55, 86,
76, 77, 58, 74, 55, 71, 61, 74, 62, 65, 75, 81, 68, 39, 58, 65,
76, 27, 79, 86, 61, 87, 52, 72, 58, 53, 69, 78, 65, 81, 69, 66,
68, 61, 72, 74, 80, 88, 46, 53, 77, 89, 83, 41, 67, 83, 62, 90,
70, 60, 62, 33, 78, 80, 62, 81, 37, 55, 90, 81, 73, 67, 97, 32,
71, 70, 69, 46, 57, 60, 79, 79, 56, 75, 60, 52, 78, 61, 51, 70,
67, 71, 36, 53, 70, 53, 74, 89, 78, 70, 56, 58, 83, 50, 77, 70,
50, 75, 53, 86, 65, 45, 63, 62, 78, 65, 69, 75, 79, 71, 56, 88,
63, 72, 85, 68, 72, 45, 81, 46, 70, 84, 71, 82, 63, 57, 77, 70,
42, 87, 84, 61, 64, 79, 53, 65, 64, 69, 68, 71, 89, 49, 70, 82,
63, 79, 65, 64, 54, 73, 36, 80, 38, 68, 62, 84, 80, 65, 73, 91,
59, 35, 80, 67, 68, 65, 47, 60, 67, 72, 81, 22, 35, 58, 57, 68,
94, 38, 77, 75, 73, 78, 71, 78, 53, 58, 61, 77, 44, 95, 53, 72,
68, 72, 73, 78, 41, 75, 80, 60, 53, 68, 79, 80, 74, 25, 79, 55,
68, 85, 64, 72, 78, 78, 71, 73, 82, 73, 73, 58, 69, 58, 72, 78,
56, 74, 67, 66, 72, 38, 58, 62, 77, 81, 37, 46, 88, 55, 76, 50,
57, 72, 39, 56, 29, 76, 77, 36, 31, 70, 70, 70, 54, 74, 47, 81,
46, 81, 55, 53, 70, 28, 71, 79, 68, 78, 81, 30, 83, 43, 70, 79,
47, 94, 60, 64, 82, 81, 92, 57, 90, 86, 58, 61, 69, 50, 64, 79,
56, 76, 52, 55, 53, 85, 89, 64, 86, 58, 82, 64, 74, 45, 64, 71,
75, 61, 79, 82, 63, 81, 60, 70, 79, 63, 59, 80, 53, 80, 41, 83,
67, 90, 60, 82, 74, 75, 52, 62, 35, 53, 49, 71, 69, 73, 67, 44,
77, 81, 96, 52, 75, 30, 83, 74, 56, 62, 78, 63, 63, 62, 71, 62,
89, 83, 77, 66, 64, 24, 96, 63, 51, 65, 71, 50, 68, 83, 82, 90,
91, 84, 90, 76, 62, 79, 20, 75, 79, 80, 62, 62, 71, 51, 81, 84,
65, 65, 55, 65, 51, 26, 70)), row.names = c(NA, -500L), class = c("tbl_df",
"tbl", "data.frame"))
I'm hoping to sample by race_f so that the different race groups are similar in AGE and cops2_avg_12mo. Is this at all possible? Thank you!
The answer depends on if you want to ensure that their ages/cops2_avg_12mo will always be within a specific range - in which case you would simply create a subset of your data frame with only the patients whose age and cops2_avg_12mo are within some range. I do think that this is the safer thing to do in terms of quality control. You can view a plot of the two columns of your data (AGE and cops2_avg_12mo) to get an idea of what ranges of values most of the patients fall into:
# Scatter plot of the two matching variables (AGE against cops2_avg_12mo).
# NOTE(review): x is presumably the sample data frame from the question.
plot(x[,c("AGE", "cops2_avg_12mo")])
Pick ranges for these values that contain enough patients to sample from. (I don't know how many samples you need). Basically, draw a box in the dot plot which contains enough patients to sample from.
So once you determine the ranges/boundaries of the box, just create indexes like so:
# Logical row mask for the chosen "box" of AGE and cops2_avg_12mo values.
# with() reads the columns as plain vectors, so idx is a logical vector;
# x[, "AGE"] on a tibble stays a one-column tibble and the comparison then
# yields a one-column logical matrix instead.
# The is.na() term turns rows with a missing cops2_avg_12mo into FALSE.
idx <- with(
  x,
  !is.na(cops2_avg_12mo) &
    AGE > 50 & AGE < 75 &
    cops2_avg_12mo > 0 & cops2_avg_12mo < 75
)
then get the subset of your data:
# Keep only the rows selected by the idx mask.
subsetX <- x[idx, ]
After you create that subset, you can randomly sample using R's sample() function. If you want to do sampling from each race equally, then call sample() with the subsetX data, with each race selected at a time, to get n samples at a time:
# sample() applied to a data frame draws from its columns, not its rows.
# Pick the rows of one race group, then draw n row positions without
# replacement and index the rows with them.
asian <- subsetX[subsetX[["race_f"]] == "Asian", ]
asian[sample.int(nrow(asian), n, replace = FALSE), ]
Alternatively, if you are ok with sampling patients that have outlier values (but I feel like this will produce more variation in your results), then you can create a histogram of each of the columns - for example, AGE - then get the histogram bin counts, divide them by the total number of patients to get a probability distribution, then create a vector the same length as the number of patients where each value is the probability we calculated for the bin it belongs to (found by getting bin indexes when calculating the histogram), then pass that vector into the sample() function as the prob input argument so that values are sampled with their specified probability.

Adding sample size to ggplot boxplot

I'm interested to see how age is related to a continuous outcome, for which I have the following data:
library(dplyr)
library(tidyverse)
library(magrittr)
library(ggplot2)
mydata <-
structure(list(ID = c(104, 157, 52, 152, 114, 221, 320, 125,
75, 171, 80, 76, 258, 82, 142, 203, 37, 92, 202, 58, 194, 38,
4, 137, 25, 87, 40, 117, 21, 255, 277, 315, 96, 134, 185, 94,
3, 153, 172, 65, 279, 209, 60, 13, 154, 160, 24, 29, 159, 213,
127, 74, 48, 126, 184, 132, 61, 141, 27, 49, 8, 39, 164, 162,
34, 205, 179, 119, 77, 135, 138, 165, 103, 253, 14, 20, 310,
84, 30, 273, 22, 105, 262, 116, 86, 83, 145, 31, 95, 51, 81,
271, 36, 50, 189, 2, 115, 7, 197, 54), age = c(67.1, 70.7, 53,
61.7, 66.1, 57.7, 54.1, 67.2, 60.9, 55.8, 40.7, 57.6, 64.1, 70.7,
47.5, 46.3, 66.7, 55, 63.3, 68.2, 61.2, 60.5, 52, 65.3, 48.9,
56.9, 62.7, 75.2, 61.4, 57.9, 53.6, 58.1, 51, 67.3, 63.9, 57,
43.2, 64.7, 62.8, 56.3, 51.7, 39.4, 45.2, 57.8, 55.7, 69.6, 61.5,
50.1, 73.7, 55.5, 65.2, 54.6, 49, 35.2, 52.9, 46.3, 55, 52.5,
54.2, 61, 57.4, 56.5, 53.6, 47.7, 64.2, 53.4, 60.9, 58.2, 60.7,
50.3, 48.3, 74.7, 52.1, 59.9, 52.4, 70.8, 61.2, 66.5, 55.4, 57.5,
59.2, 60.1, 52.3, 60.2, 54.8, 36.3, 61.5, 48.6, 56, 62, 64.8,
40.4, 68.3, 60, 69.1, 56.6, 45.3, 58.5, 52.3, 52), continuous_outcome = c(3636.6,
1128.2, 2007.5, 802.9, 332.3, 2636.1, 169.5, 67.9, 3261.8, 1920.3,
155.2, 1677.2, 198.2, 11189.7, 560.9, 633.1, 196.1, 13.9, 100.7,
7594.5, 1039.8, 83.9, 2646.8, 284.6, 306, 1135.6, 1883.1, 5681.4,
1706.2, 2241.1, 97.7, 1106.8, 1107.1, 290.8, 2123.4, 267, 115.3,
138.5, 152.7, 1338.9, 6709.8, 561.7, 1931.7, 3112.4, 1876.3,
3795.9, 5706.7, 7.4, 1324.9, 4095.4, 205.4, 1886, 177.3, 304.4,
1319.1, 415.9, 537.2, 3141.1, 740, 1976.7, 624.8, 983.1, 1163.5,
1432.6, 3730.4, 2023.4, 498.2, 652.5, 982.7, 1345.3, 138.4, 1505.1,
3528.1, 11.9, 884.5, 10661.6, 1911.4, 2800.8, 81.5, 396.4, 409.1,
417.3, 186, 1892.4, 1689.7, 0, 210.1, 210.5, 3484.5, 3196.8,
57.2, 20.2, 947, 540, 1603.1, 1571.8, 9.1, 149.2, 122, 63.2),
age_decades = structure(c(3L, 4L, 2L, 3L, 3L, 2L, 2L, 3L,
3L, 2L, 1L, 2L, 3L, 4L, 1L, 1L, 3L, 2L, 3L, 3L, 3L, 3L, 2L,
3L, 1L, 2L, 3L, 4L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 1L, 3L,
3L, 2L, 2L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 4L, 2L, 3L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 3L, 2L, 3L, 2L,
3L, 2L, 1L, 4L, 2L, 2L, 2L, 4L, 3L, 3L, 2L, 2L, 2L, 3L, 2L,
3L, 2L, 1L, 3L, 1L, 2L, 3L, 3L, 1L, 3L, 2L, 3L, 2L, 1L, 2L,
2L, 2L), .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
To make a boxplot of age decades on the x axis and my continuous outcome I'm using ggplot2.
I want to make several, and automatically plot the sample size on the x-axis ticks. To do so I've computed labels in the dataset as follows:
# Add label_decades: a readable name for each age decade plus the number of
# rows in that decade, for use as x-axis tick labels.
mydata <- mydata %>%
  group_by(age_decades) %>%
  # n() is the row count of the current age_decades group. Base
  # as.character() is used because as_character() is not a base R or dplyr
  # function.
  mutate(n_decades = as.character(n())) %>%
  mutate(
    label_decades = case_when(
      age_decades == 1 ~ "Below 50",
      age_decades == 2 ~ "Between 50 and 60",
      age_decades == 3 ~ "Between 60 and 70",
      age_decades == 4 ~ "Above 70"
    )
  ) %>%
  mutate(label_decades = paste0(label_decades, '\n n = ', n_decades)) %>%
  ungroup() %>%
  relocate(age_decades, label_decades, .after = age) %>%
  # n_decades was only needed to build the label text.
  select(-n_decades) %>%
  arrange(ID)
Then I've tried to plot the boxplot using the newly created variable label_decades to label. The first thing I tried was:
# First attempt: pass the label column straight to the x scale.
# mydata$label_decades holds one entry per row of mydata, not one per
# age_decades level.
ggplot(mydata, aes(x=age_decades, y=continuous_outcome)) +
geom_boxplot() +
scale_x_discrete(labels=mydata$label_decades)
But that just plots the first few labels as they occur in the dataset (so they don't correspond to the actual boxplots):
Then I tried:
# Second attempt: draw the labels as text inside the panel instead of on
# the axis. data=mydata supplies every row, so geom_text() receives one
# label per row of mydata.
ggplot(mydata, aes(x=age_decades, y=continuous_outcome)) +
geom_boxplot() +
geom_text(data=mydata, aes(age_decades, Inf, label=label_decades),
vjust = 15, size=4)
Which works better but the font is really weird and also the original x axis labels/ticks are still showing.
Anyone know how to solve this issue? Thanks!
The font looks weird because there are many labels with the same text plotted on top of each other. You can use distinct() to get only one label per x tick and use the theme() function to get rid of the x tick labels:
# Boxplot per age decade with one in-panel text label per decade.
ggplot(mydata, aes(x = age_decades, y = continuous_outcome)) +
  geom_boxplot() +
  geom_text(
    aes(label = label_decades),
    # distinct() leaves one row per decade/label pair, so each label is
    # drawn once.
    data = distinct(mydata, age_decades, label_decades),
    y = 9e3
  ) +
  # Hide the default x tick labels; the text layer replaces them.
  theme(axis.text.x = element_blank())
One way would be to turn the labels to factor as well.
library(dplyr)
library(ggplot2)
# Add label_decades as a factor: a readable name for each age decade plus
# the number of rows in that decade.
mydata <- mydata %>%
  group_by(age_decades) %>%
  # n() is the row count of the current age_decades group.
  mutate(n_decades = as.character(n())) %>%
  ungroup() %>%
  mutate(
    label_decades = case_when(
      age_decades == 1 ~ "Below 50",
      age_decades == 2 ~ "Between 50 and 60",
      age_decades == 3 ~ "Between 60 and 70",
      age_decades == 4 ~ "Above 70"
    ),
    label_decades = paste0(label_decades, '\n n = ', n_decades),
    # Set the level order explicitly to follow age_decades, so that
    # levels(label_decades) lines up with the x axis instead of depending
    # on how per-group factors are combined.
    label_decades = factor(
      label_decades,
      levels = unique(label_decades[order(age_decades)])
    )
  ) %>%
  relocate(age_decades, label_decades, .after = age) %>%
  # n_decades was only needed to build the label text.
  select(-n_decades) %>%
  arrange(ID)
You can then use its levels in scale_x_discrete.
# Boxplot per age decade; the x tick labels come from the levels of the
# label_decades factor.
ggplot(
  mydata,
  aes(x = age_decades, y = continuous_outcome)
) +
  geom_boxplot() +
  scale_x_discrete(labels = levels(mydata[["label_decades"]]))

Randomly sampling and assigning a variable using dplyr

I have a data frame of 200 individuals, and using dplyr I would like to randomly select half of them, create a variable called 'sex,' and assign 100 with sex as male. For the remaining 100 individuals, I would like to assign the sex as female. A reproducible example of the data set is available below.
df <- dput(input)
structure(list(id = 1:200, age = c(6L, 4L, 4L, 6L, 1L, 5L, 3L,
1L, 0L, 0L, 0L, 5L, 5L, 5L, 3L, 4L, 4L, 2L, 2L, 3L, 3L, 4L, 6L,
4L, 4L, 0L, 4L, 6L, 1L, 5L, 2L, 6L, 2L, 2L, 0L, 3L, 1L, 6L, 0L,
2L, 5L, 3L, 5L, 3L, 1L, 6L, 6L, 0L, 4L, 5L, 0L, 5L, 3L, 6L, 1L,
2L, 1L, 1L, 4L, 2L, 1L, 2L, 0L, 4L, 3L, 3L, 6L, 2L, 1L, 2L, 5L,
0L, 5L, 2L, 5L, 3L, 3L, 3L, 2L, 5L, 1L, 0L, 0L, 1L, 6L, 3L, 1L,
5L, 6L, 4L, 4L, 4L, 0L, 6L, 6L, 3L, 4L, 6L, 5L, 2L, 5L, 6L, 2L,
2L, 4L, 0L, 4L, 6L, 5L, 6L, 0L, 6L, 2L, 1L, 5L, 5L, 5L, 5L, 3L,
1L, 6L, 3L, 1L, 1L, 3L, 4L, 2L, 4L, 2L, 0L, 5L, 0L, 3L, 1L, 1L,
2L, 0L, 5L, 2L, 3L, 6L, 5L, 2L, 6L, 0L, 0L, 6L, 6L, 1L, 4L, 2L,
0L, 4L, 1L, 3L, 6L, 3L, 4L, 3L, 0L, 1L, 6L, 6L, 5L, 4L, 1L, 1L,
6L, 0L, 1L, 2L, 1L, 1L, 2L, 0L, 4L, 1L, 2L, 2L, 2L, 1L, 6L, 5L,
3L, 2L, 3L, 5L, 2L, 3L, 4L, 5L, 0L, 6L, 5L, 1L, 4L, 5L, 3L, 5L,
5L), x = c(21, 9, 31, 55, 5, 63, 63, 3, 13, 21, 53, 77, 5, 67,
63, 31, 17, 5, 21, 45, 79, 3, 7, 43, 27, 1, 63, 11, 37, 33, 27,
53, 71, 73, 97, 87, 77, 17, 85, 91, 49, 87, 89, 61, 65, 17, 71,
33, 53, 85, 49, 41, 75, 85, 79, 75, 23, 63, 89, 31, 29, 47, 75,
63, 65, 27, 27, 71, 89, 29, 25, 49, 91, 91, 39, 65, 45, 99, 53,
21, 29, 81, 35, 7, 27, 81, 93, 41, 79, 83, 31, 51, 33, 75, 15,
69, 7, 29, 7, 35, 87, 93, 57, 13, 91, 87, 95, 77, 7, 37, 81,
99, 83, 69, 85, 5, 77, 69, 55, 7, 39, 5, 41, 1, 63, 25, 13, 39,
97, 73, 25, 49, 35, 95, 59, 75, 23, 35, 67, 73, 91, 83, 79, 9,
27, 89, 79, 53, 89, 69, 95, 57, 11, 45, 63, 5, 25, 61, 3, 89,
1, 61, 85, 75, 67, 73, 63, 77, 43, 31, 69, 39, 47, 59, 75, 45,
57, 73, 5, 85, 57, 13, 91, 69, 79, 89, 13, 33, 15, 23, 89, 85,
39, 87, 7, 97, 57, 5, 61, 85), y = c(41, 57, 29, 59, 83, 77,
35, 73, 99, 69, 85, 23, 85, 11, 63, 97, 73, 47, 57, 73, 77, 1,
91, 17, 71, 57, 11, 3, 81, 31, 5, 41, 69, 93, 3, 11, 45, 97,
81, 87, 43, 9, 53, 61, 11, 63, 59, 33, 49, 89, 87, 79, 47, 59,
41, 25, 47, 13, 69, 11, 93, 83, 91, 85, 13, 95, 13, 37, 99, 35,
11, 63, 19, 99, 71, 55, 5, 21, 43, 59, 49, 15, 99, 15, 75, 77,
53, 51, 91, 45, 83, 21, 29, 35, 3, 27, 97, 95, 29, 53, 55, 41,
45, 31, 75, 37, 15, 47, 3, 1, 99, 55, 81, 37, 1, 41, 51, 45,
27, 83, 9, 69, 13, 81, 91, 55, 51, 31, 17, 97, 1, 47, 35, 7,
53, 59, 5, 51, 7, 5, 93, 63, 95, 51, 33, 43, 75, 67, 59, 89,
49, 83, 21, 49, 5, 5, 19, 45, 29, 41, 25, 3, 9, 1, 73, 53, 43,
99, 69, 41, 21, 3, 3, 13, 39, 21, 55, 75, 91, 31, 79, 17, 43,
91, 73, 11, 75, 15, 49, 77, 77, 23, 83, 47, 51, 53, 57, 99, 35,
15)), row.names = c(NA, -200L), class = "data.frame", .Names = c("id",
"age", "x", "y"))
I'm new to using dplyr, so I'm not exactly sure how to perform this operation. I'm thinking it would look something like this:
# Draw a random half of the rows (sample_frac, or sample_n with an explicit
# count, would select the 100 individuals) and label every drawn row as male.
new_df <- mutate(sample_frac(df, 0.5), sex = "male")
but obviously that just results in a new data frame. Is there a way to select 100 males from the original data frame, then use something like an ifelse statement to assign the rest as female?
If you absolutely need a 50/50 distribution between male and female, you could run with dplyr:
# Randomly choose half of the ids to be male; every other id is female.
# Sizing the draw from nrow(df) keeps the split 50/50 without hard-coding 100
# (with an odd number of rows the extra individual is labelled female).
male_ids <- sample(df$id, size = nrow(df) %/% 2)

# Labelling in place keeps df's original row order, appends sex as the last
# column, and avoids the select/right_join round trip.
dfs <- df %>%
  mutate(sex = if_else(id %in% male_ids, "male", "female"))
results:
table(dfs$sex)
female male
100 100

Date format in hover for ggplot2 and plotly

I have a question about date formats in plotly. I made a time series plot in ggplot2 that I'm trying to visualize with plotly, but the date-time shown on hover is not formatted the way I want (see image). I would like the date format to be YYMMDD-hh:mm. How could I get this format?
Relevant R code on my script:
library(lubridate)

# Reshape to long format, keeping fecha as the identifier column.
datosO3.melt <- melt(datosO3.plot, id.vars = "fecha", value.name = "value")

# One line per level of `variable`, then hand the ggplot object to plotly.
ozono.plot <- ggplot() +
  geom_line(
    data = datosO3.melt,
    mapping = aes(x = fecha, y = value, colour = variable)
  )
ggplotly(ozono.plot)
The point is that column fecha in the dataframe is a date-time object created in a previous dataframe.
# Build the date-time column by pasting the date and time columns together
# and parsing the result with lubridate's ymd_hm().
datosO3$fecha.hora <- ymd_hm(paste0(datosO3$AAMMDD, datosO3$hhmm))
and inherited by datosO3.melt
str(datosO3.melt)
'data.frame': 23328 obs. of 3 variables:
$ fecha : POSIXct, format: "2017-06-13 00:00:00" "2017-06-13 00:10:00" ...
$ variable: Factor w/ 54 levels "Alcoi.Verge_dels_Lliris",..: 1 1 1 1 1 1 1 1 1 1 ...
$ value : num 75 76 73 72 71 72 73 74 74 73 ...
But when I dput I get:
> dput(data)
structure(list(fecha = structure(c(1497312000, 1497312600, 1497313200,
1497313800, 1497314400, 1497315000, 1497315600, 1497316200, 1497316800,
1497317400, 1497318000, 1497318600, 1497319200, 1497319800, 1497320400,
1497321000, 1497321600, 1497322200, 1497322800, 1497323400, 1497324000,
1497324600, 1497325200, 1497325800, 1497326400, 1497327000, 1497327600,
1497328200, 1497328800, 1497329400, 1497330000, 1497330600, 1497331200,
1497331800, 1497332400, 1497333000, 1497333600, 1497334200, 1497334800,
1497335400, 1497336000, 1497336600, 1497337200, 1497337800, 1497338400,
1497339000, 1497339600, 1497340200, 1497340800, 1497341400, 1497342000,
1497342600, 1497343200, 1497343800, 1497344400, 1497345000, 1497345600,
1497346200, 1497346800, 1497347400, 1497348000, 1497348600, 1497349200,
1497349800, 1497350400, 1497351000, 1497351600, 1497352200, 1497352800,
1497353400, 1497354000, 1497354600, 1497355200, 1497355800, 1497356400,
1497357000, 1497357600, 1497358200, 1497358800, 1497359400, 1497360000,
1497360600, 1497361200, 1497361800, 1497362400, 1497363000, 1497363600,
1497364200, 1497364800, 1497365400, 1497366000, 1497366600, 1497367200,
1497367800, 1497368400, 1497369000, 1497369600, 1497370200, 1497370800,
1497371400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Alcoi.Verge_dels_Lliris", "Alacant.El_Pla",
"Alacant.Florida_Babel", "Alacant.Rabassa", "Benidorm", "Elx.Agroalimentari",
"Elx.Parc_de_Bombers", "Elda.Lacy", "Orihuela", "El_Pinos",
"Torrevieja", "L.Alcora", "Burriana", "Castello.Penyeta",
"Castello.Ermita", "Castello.Grau", "Castello.Patronat_d.Esports",
"Cirat", "Morella", "Onda", "Coratxar", "Sant_Jordi", "Torre_Endomenech",
"La_Vall_d.Uixo", "Vilafranca", "Vinaros_Planta", "Viver",
"Zorita", "Albalat_dels_Tarongers", "Alzira", "Algar_de_Palancia",
"Beniganim", "Bunnol.Cemex", "Burjassot.Facultats", "Caudete_de_las_Fuentes",
"Cortes_de_Pallas", "Quart_de_Poblet", "Gandia", "Ontinyent",
"Paterna.CEAM", "Sagunt.Port", "Sagunt.Nord", "Sagunt.CEA",
"Torrebaja", "Valencia.Pista_de_Silla", "Valencia.Vivers",
"Valencia.Politecnic", "Valencia.Avd._Francia", "Valencia.Moli_del_Sol",
"Valencia.Bulevard_Sud", "Vilamarxant", "Villar_del_Arzobispo",
"Torrent.El_Vedat", "Chiva_UM"), class = "factor"), value = c(75,
76, 73, 72, 71, 72, 73, 74, 74, 73, 71, 72, 71, 72, 74, 74,
73, 73, 73, 74, 74, 74, 72, 72, 72, 71, 70, 70, 70, 70, 72,
71, 68, 66, 68, 68, 65, 61, 63, 65, 71, 71, 79, 91, 84, 82,
91, 94, 91, 88, 88, 92, 99, 102, 103, 100, 105, 104, 104,
101, 102, 100, 101, 104, 109, 109, 112, 115, 116, 116, 113,
111, 110, 113, 113, 114, 115, 115, 114, 113, 111, 112, 115,
114, 112, 112, 114, 116, 116, 115, 114, 115, 113, 112, 112,
110, 109, 110, 110, 111)), .Names = c("fecha", "variable",
"value"), row.names = c(NA, 100L), class = "data.frame")
>
How do I change the fecha format to be reflected in the hover?
EDIT 1: Added data
> dput(datosO3.plot)
structure(list(Alcoi.Verge_dels_Lliris = c(75, 76, 73, 72, 71,
72), Alacant.El_Pla = c(56, 55, 53, 56, 55, 54), Alacant.Florida_Babel = c(56,
49, 48, 45, 44, 42), Alacant.Rabassa = c(43, 42, 43, 41, 41,
43), Benidorm = c(110, 105, 95, 107, 110, 107), Elx.Agroalimentari = c(80,
77, 75, 69, 64, 62), Elx.Parc_de_Bombers = c(71, 68, 67, 68,
65, 66), Elda.Lacy = c(39, 34, 32, 28, 25, 26), Orihuela = c(16,
13, 25, 13, 17, 9), El_Pinos = c(48, 35, 36, 35, 33, 43), Torrevieja = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), L.Alcora = c(40,
45, 42, 44, 48, 50), Burriana = c(14, 8, 8, 9, 7, 7), Castello.Penyeta = c(57,
61, 68, 65, 58, 59), Castello.Ermita = c(18, 20, 14, 16, 21,
19), Castello.Grau = c(20, 21, 19, 13, 11, 16), Castello.Patronat_d.Esports = c(36,
26, 29, 28, 28, 29), Cirat = c(56, 56, 54, 54, 51, 51), Morella = c(119,
121, 122, 122, 123, 123), Onda = c(57, 58, 57, 58, 60, 60), Coratxar = c(123,
125, 126, 127, 128, 125), Sant_Jordi = c(37, 36, 37, 38, 40,
39), Torre_Endomenech = c(28, 34, 35, 32, 30, 30), La_Vall_d.Uixo = c(63,
64, 65, 65, 64, 65), Vilafranca = c(100, 101, 97, 98, 97, 99),
Vinaros_Planta = c(26.7, 31.3, 31.6, 31.7, 37.8, 41.7), Viver = c(40.6,
36.9, 47.6, 36.7, 43.5, 46.1), Zorita = c(67, 70, 69, 64,
64, 68), Albalat_dels_Tarongers = c(33, 32, 32, 29, 26, 26
), Alzira = c(24, 26, 23, 19, 20, 39), Algar_de_Palancia = c(47,
50, 48, 49, 47, 52), Beniganim = c(53, 58, 56, 56, 54, 53
), Bunnol.Cemex = c(64, 55, 53, 53, 53, 55), Burjassot.Facultats = c(43,
30, 30, 28, 16, 20), Caudete_de_las_Fuentes = c(71, 68, 66,
72, 74, 72), Cortes_de_Pallas = c(88, 74, 78, 82, 82, 85),
Quart_de_Poblet = c(13, 18, 21, 23, 30, 38), Gandia = c(45,
39, 49, 49, 48, 46), Ontinyent = c(88, 83, 83, 89, 86, 82
), Paterna.CEAM = c(48, 49, 47, 47, 48, 47), Sagunt.Port = c(52,
51, 51, 50, 50, 49), Sagunt.Nord = c(33, 34, 34, 32, 31,
31), Sagunt.CEA = c(34.8, 36.3, 37.6, 43.8, 40.7, 37.5),
Torrebaja = c(51, 42, 56, 52, 45, 65), Valencia.Pista_de_Silla = c(37,
52, 57, 60, 35, 7), Valencia.Vivers = c(45, 42, 39, 34, 32,
33), Valencia.Politecnic = c(40, 33, 30, 26, 25, 23), Valencia.Avd._Francia = c(54,
50, 50, 48, 45, 43), Valencia.Moli_del_Sol = c(9, 10, 10,
8, 7, 9), Valencia.Bulevard_Sud = c(1, 0, 0, 2, 0, 0), Vilamarxant = c(21,
29, 33, 27, 33, 22), Villar_del_Arzobispo = c(55, 57, 57,
54, 53, 55), Torrent.El_Vedat = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), Chiva_UM = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), fecha = structure(c(1497312000,
1497312600, 1497313200, 1497313800, 1497314400, 1497315000
), class = c("POSIXct", "POSIXt"), tzone = "UTC")), .Names = c("Alcoi.Verge_dels_Lliris",
"Alacant.El_Pla", "Alacant.Florida_Babel", "Alacant.Rabassa",
"Benidorm", "Elx.Agroalimentari", "Elx.Parc_de_Bombers", "Elda.Lacy",
"Orihuela", "El_Pinos", "Torrevieja", "L.Alcora", "Burriana",
"Castello.Penyeta", "Castello.Ermita", "Castello.Grau", "Castello.Patronat_d.Esports",
"Cirat", "Morella", "Onda", "Coratxar", "Sant_Jordi", "Torre_Endomenech",
"La_Vall_d.Uixo", "Vilafranca", "Vinaros_Planta", "Viver", "Zorita",
"Albalat_dels_Tarongers", "Alzira", "Algar_de_Palancia", "Beniganim",
"Bunnol.Cemex", "Burjassot.Facultats", "Caudete_de_las_Fuentes",
"Cortes_de_Pallas", "Quart_de_Poblet", "Gandia", "Ontinyent",
"Paterna.CEAM", "Sagunt.Port", "Sagunt.Nord", "Sagunt.CEA", "Torrebaja",
"Valencia.Pista_de_Silla", "Valencia.Vivers", "Valencia.Politecnic",
"Valencia.Avd._Francia", "Valencia.Moli_del_Sol", "Valencia.Bulevard_Sud",
"Vilamarxant", "Villar_del_Arzobispo", "Torrent.El_Vedat", "Chiva_UM",
"fecha"), row.names = 289:294, class = "data.frame")
We can map the "hidden" text aesthetic and then tell ggplotly to use it as the tooltip:
# Build the plot with an extra `text` aesthetic. geom_line() has no use for it
# (ggplot2 may warn about an unknown aesthetic), but ggplotly() can display it
# as the hover label.
p <- ggplot(datosO3.melt) +
  geom_line(aes(x = fecha,
                y = value,
                colour = variable,
                group = variable,
                text = paste('fecha: ', fecha, '\n',
                             'variable: ', variable, '\n',
                             'value: ', value, '\n')))

# Pass the plot explicitly rather than relying on the implicit last_plot()
# default, and restrict the tooltip to the custom text.
ggplotly(p, tooltip = 'text')
However, for anything that's slightly more complicated than the default, especially when working with hover tooltips, I usually prefer to work directly in plotly:
# Same plot built directly with plotly: one line per level of `variable`,
# with the hover label assembled by hand and shown on its own.
plot_ly(
  data = datosO3.melt,
  x = ~fecha,
  y = ~value,
  color = ~variable,
  type = "scatter",
  mode = "lines",
  hoverinfo = "text",
  text = ~paste(
    "fecha: ", fecha, "\n",
    "variable: ", variable, "\n",
    "value: ", value, "\n"
  )
)
To use a custom date format other than the default POSIXct formatting, just replace fecha in the hover text with a formatted version in the format you prefer, e.g.:
# As above, but the date-time in the hover label is rendered through format()
# so that its layout is controlled by the format string.
plot_ly(
  data = datosO3.melt,
  x = ~fecha,
  y = ~value,
  color = ~variable,
  type = "scatter",
  mode = "lines",
  hoverinfo = "text",
  text = ~paste(
    "fecha: ", format(fecha, "%Y-%m-%d %H:%M"), "\n",
    "variable: ", variable, "\n",
    "value: ", value, "\n"
  )
)

sampling based on frequency in R

I want to draw 20,000 samples from a fairly large data set, weighted by how frequently each value occurs, in order to fill in the NA values.
To do this I used the output of a histogram, but it was not successful and gives me an error. How can I avoid it?
# Asker's attempt, reproduced as posted (it does not run as written).
# Histogram of maindata; its breaks/density components are reused below.
# NOTE(review): this hist() call is missing its closing parenthesis.
y=hist(maindata,col="red",breaks=length(unique(maindata))
# Intended to repeat the fill-in 20000 times; each pass restarts from maindata.
for(k in 1:20000){
data=maindata
# NOTE(review): maindata is reported further down as a plain vector, for which
# nrow() is NULL; length() looks like what is meant, confirm.
for(i in 1:nrow(data)){
# NOTE(review): `=` here is not the comparison operator `==`, and missing
# values in a numeric vector are normally detected with is.na() rather than
# by comparing to the string "Na".
if (data[i]="Na"){
# Draws one replacement value. y$breaks has one more element than y$density
# (bin edges versus bins), which is the length mismatch behind
# "incorrect number of probabilities"; y$mids has the same length as y$density.
data[i]=sample(y$breaks,size=1,replace=FALSE,prob=y$density)}}}
I get this error :
Error in sample.int(length(x), size, replace, prob) :
incorrect number of probabilities
I checked length(y$breaks) and length(y$density): length(y$breaks) is larger by one. How should I fix it?
thank you in advance
EDIT :
structure(list(breaks = c(15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
103, 104, 105, 106, 107, 108, 109), counts = c(27L, 17L, 31L,
83L, 118L, 144L, 211L, 279L, 354L, 312L, 300L, 377L, 407L, 443L,
481L, 351L, 302L, 236L, 248L, 178L, 141L, 101L, 77L, 80L, 63L,
44L, 64L, 44L, 60L, 46L, 24L, 29L, 15L, 28L, 21L, 13L, 19L, 10L,
30L, 11L, 12L, 12L, 7L, 12L, 12L, 11L, 11L, 7L, 7L, 4L, 4L, 4L,
1L, 2L, 3L, 6L, 1L, 1L, 3L, 3L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 1L, 2L), density = c(0.00453172205438067,
0.00285330647868412, 0.00520308828465928, 0.0139308492782813,
0.0198053037932192, 0.0241691842900302, 0.035414568647197, 0.0468277945619335,
0.0594159113796576, 0.0523665659617321, 0.0503524672708963, 0.0632762672037596,
0.0683115139308493, 0.0743538100033568, 0.0807317891910037, 0.0589123867069486,
0.0506881503860356, 0.0396106075864384, 0.0416247062772743, 0.0298757972473985,
0.0236656596173212, 0.0169519973145351, 0.0129237999328634, 0.0134273246055723,
0.0105740181268882, 0.00738502853306479, 0.0107418596844579,
0.00738502853306479, 0.0100704934541793, 0.0077207116482041,
0.0040281973816717, 0.00486740516951997, 0.00251762336354481,
0.00469956361195032, 0.00352467270896274, 0.00218194024840551,
0.00318898959382343, 0.00167841557569654, 0.00503524672708963,
0.0018462571332662, 0.00201409869083585, 0.00201409869083585,
0.00117489090298758, 0.00201409869083585, 0.00201409869083585,
0.0018462571332662, 0.0018462571332662, 0.00117489090298758,
0.00117489090298758, 0.000671366230278617, 0.000671366230278617,
0.000671366230278617, 0.000167841557569654, 0.000335683115139308,
0.000503524672708963, 0.00100704934541793, 0.000167841557569654,
0.000167841557569654, 0.000503524672708963, 0.000503524672708963,
0, 0, 0, 0.000167841557569654, 0.000167841557569654, 0, 0, 0,
0.000167841557569654, 0, 0, 0.000167841557569654, 0, 0.000167841557569654,
0, 0.000167841557569654, 0, 0.000167841557569654, 0.000167841557569654,
0, 0, 0.000167841557569654, 0.000167841557569654, 0, 0, 0, 0,
0, 0.000503524672708963, 0, 0, 0, 0.000167841557569654, 0.000335683115139308
), mids = c(15.5, 16.5, 17.5, 18.5, 19.5, 20.5, 21.5, 22.5, 23.5,
24.5, 25.5, 26.5, 27.5, 28.5, 29.5, 30.5, 31.5, 32.5, 33.5, 34.5,
35.5, 36.5, 37.5, 38.5, 39.5, 40.5, 41.5, 42.5, 43.5, 44.5, 45.5,
46.5, 47.5, 48.5, 49.5, 50.5, 51.5, 52.5, 53.5, 54.5, 55.5, 56.5,
57.5, 58.5, 59.5, 60.5, 61.5, 62.5, 63.5, 64.5, 65.5, 66.5, 67.5,
68.5, 69.5, 70.5, 71.5, 72.5, 73.5, 74.5, 75.5, 76.5, 77.5, 78.5,
79.5, 80.5, 81.5, 82.5, 83.5, 84.5, 85.5, 86.5, 87.5, 88.5, 89.5,
90.5, 91.5, 92.5, 93.5, 94.5, 95.5, 96.5, 97.5, 98.5, 99.5, 100.5,
101.5, 102.5, 103.5, 104.5, 105.5, 106.5, 107.5, 108.5), xname = "b",
equidist = TRUE), .Names = c("breaks", "counts", "density",
"mids", "xname", "equidist"), class = "histogram")
Data information :
> head(maindata)
[1] 30 44 -1 32 30 34
> is.numeric(maindata)
[1] TRUE
> is.vector(maindata)
[1] TRUE
> length(maindata)
[1] 36203
Do you just want 20,000 samples from the distribution of the non-missing data? If so, another way to approach this would be to just calculate a kernel density estimate directly from the non-missing data and then sample from that. For example, using fake data:
# Simulated stand-in for the real data: 30,000 normal draws (mean 20, sd 10),
# 5,000 of which are then blanked out as missing.
set.seed(31)
dat <- rnorm(30000, mean = 20, sd = 10)
is.na(dat) <- sample.int(30000, 5000)

# Kernel density estimate of the observed (non-missing) values.
# n is the number of grid points used in the estimate (should be a power of 2).
dat.dens <- density(dat[!is.na(dat)], n = 2^10)

# Draw 20,000 values from the density grid, weighting each grid point by its
# estimated density.
sim.sample <- sample(dat.dens$x, size = 2e4, replace = TRUE, prob = dat.dens$y)

# Overlay the density of the simulated sample (red) on the original estimate.
plot(dat.dens)
lines(density(sim.sample), col = "red")
Please let me know if I've misunderstood what you're trying to do.

Resources