Remove incorrect coloured outline from ggplot legend in plots using multiple dataframes? - r

I've created a ggplot out of multiple dataframes. One of the dataframes is used to draw filled polygons/rectangles while the other is used to create contour plots. If I specify a colour in the contour plot element, it appears as an outline in the legend for the polygons (but not as an outline for the plotted polygons themselves).
How can I remove the outline only from the single unrelated legend element?
Reproducible example:
df1 <- data.frame(
person = c("Avery", "Doug", "Avery", "Doug", "Avery", "Doug", "Avery", "Doug"),
place = c("A","A","B","B","C","C","D","D"),
Coord1 = c(10, 30, 70, 90, 70, 90, 10, 30),
Coord2 = c(70, 90, 70, 90, 10, 30, 10, 30)
)
df2 <- structure(list(place = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "D"), Coord1 = c(11.9321233389815,
5.65317891793307, 19.1326709658932, 27.9338809206353, 19.9660510683355,
21.0749048744809, 7.21913717149466, 20.1920548762508, 10.4543717875714,
25.757795926256, 12.1828845087838, -0.465353420455905, 7.11311351850768,
14.8155028526132, -2.1637642702189, 13.0813689628972, 4.7982168502595,
5.57686199262573, 4.00687188024056, 22.9457782880066, 18.3539124739061,
4.33984900298331, 17.8841935003471, -1.65929325512088, 4.69179993740943,
42.6151590341647, 27.615643609024, 10.0218739210389, 37.0107595807328,
42.2081901427891, 21.4907606643235, 24.3639224345837, 32.772343673693,
39.707188961395, 39.526308329454, 26.0304665678886, 29.9780452839205,
26.3307539307013, 30.3989518780307, 48.3076387197071, 18.9469498637539,
20.2952072925756, 16.6669862005188, 37.4700660304315, 17.8662145857614,
37.2600750819877, 25.7007521883461, 28.6742428655807, 40.314957702891,
30.0697139406434, 73.6583322470224, 79.3142178409643, 50.9322840242791,
70.5198852398892, 61.5150022794989, 66.3183258649759, 76.2469408002744,
82.2025917335132, 47.6021718929377, 66.3132462976689, 69.1291434239167,
83.4197156933065, 86.9407992718112, 76.4950208109435, 73.661826873959,
66.9162963609677, 75.1456713179674, 60.9887903971435, 71.9565881057142,
68.4266063305417, 57.6911530548141, 74.7773517795878, 69.6101844803799,
67.790228250546, 69.3010177449088, 102.70586821213, 70.9690781293641,
81.8211641205855, 89.1142208240172, 103.752341186373, 89.5593092939362,
80.6305724472789, 82.0007363853625, 65.8943427952359, 78.3936032524407,
83.0803062787287, 92.1760929466364, 78.9701119884041, 86.8932236845231,
97.1763793934393, 84.058342477451, 93.7849946833988, 83.1484361525486,
80.1491902221492, 85.9961991961048, 93.2550659070971, 78.3156355536891,
94.2129927101896, 93.4865526165191, 93.4934076661858, 75.4629285622502,
47.6731852393923, 72.88284171046, 75.438344587857, 58.0801523419401,
69.8230768794789, 60.5302214082446, 65.9778757219936, 70.5612025149866,
68.4370521177005, 58.1149422572855, 82.8226830631184, 65.7390566694014,
65.6285158427551, 64.1736763473425, 93.616329884223, 60.1827131711359,
48.6282110427849, 66.7586165042503, 75.5318338818184, 67.3206665449225,
59.0440215809036, 51.0189160405021, 84.6055490872487, 54.892120364794,
97.5277806090821, 87.3463009673921, 102.066058009738, 82.0203143022486,
90.9586925853003, 88.6949600210823, 103.884302040987, 87.4998069331291,
76.3591677407161, 81.8943250717613, 100.824382142449, 102.946636154418,
81.8333852359431, 94.7599932319231, 93.7316328376966, 79.8039952376426,
92.3881270619707, 71.2558895720234, 99.8119134521082, 112.401228355728,
83.0814548497863, 94.0539152929977, 79.4016592347326, 85.7829067433969,
75.948295755515, 3.73649805560632, 23.0903794794387, 8.67023149246646,
-6.07704226517408, -3.07641645158295, 9.54851319649848, 17.8889045992558,
5.10657758386499, 18.7016029853773, 25.4522147195668, 6.8023795377009,
15.3767651131782, 20.3683653225209, -20.3254139776893, 15.6691879496879,
9.45068374562864, 18.7361628847212, 12.623384426461, 10.17849269891,
-16.9581826824957, 17.4909319409575, 9.44644950565353, -5.38702617407262,
6.21805297402208, 14.1749418466783, 12.9105548124602, 27.1855440225952,
29.8482741922039, 29.7496920817558, 38.524389832422, 33.7023405759537,
45.3217768361544, 40.1308769909943, 34.4402395511235, 25.068583150376,
26.2353756696678, 21.8111750232141, 33.8629499457315, 29.7131703823705,
34.1556614684988, 34.1245105271038, 18.3147946286069, 27.3066155417929,
26.5207184421978, 34.0884687114446, 26.639481645307, 38.6173778182637,
37.8098024973006, 13.0741049202031, 45.5358046032432), Coord2 = c(69.9894126325745,
64.8743763485989, 82.4286751251657, 63.3941714859759, 71.6662421480457,
64.4679475985789, 70.9818141479016, 58.5333414006127, 57.5007274347188,
67.978883174351, 71.7750032484243, 83.0764732083471, 56.5706459545147,
78.6108303958077, 72.3287500312082, 80.4225709350104, 83.2909825415992,
79.4595795345076, 60.1437449584155, 57.8413316685676, 53.7079406423028,
70.9231623851271, 69.7788257690901, 75.1347791237738, 67.7044636198011,
94.3340903929599, 102.873693143552, 65.8173572622545, 76.2734860629939,
85.2060002479906, 90.1101352051085, 96.2874409924653, 109.760633210447,
94.6155820846595, 95.2601614162185, 69.4981680111657, 84.4916642548314,
84.207250490172, 95.2814397728582, 88.6024629127233, 105.45937855117,
101.73736873413, 96.3994065298907, 89.5148162082444, 85.656552291334,
87.1768687739584, 103.397297430617, 100.466406474931, 97.492149943699,
95.7799794375843, 58.689787023173, 91.7856309013427, 70.284597504488,
82.5264495002628, 72.563622499658, 67.9271039063317, 76.9752642043184,
77.0427295652995, 60.493215584824, 71.9177768464766, 85.2536338000138,
78.6037513000414, 79.9167528070661, 45.2538960490244, 73.7270619632537,
74.1775039120291, 79.3753619281975, 72.5802290852752, 77.6589441840394,
65.0442876775209, 75.9474471250248, 67.6324983845202, 66.7269504282668,
61.4684933637993, 84.9370241657568, 100.807408193669, 90.6253499380336,
82.4392258080415, 85.978657057683, 85.5556977940698, 80.0069018830692,
92.0205306026153, 92.2799374212157, 92.075887951686, 84.046647740422,
95.2469867954641, 90.4378832940894, 103.868141681032, 96.1678027516943,
97.556915790983, 75.8136031118073, 87.269403250105, 102.176170079899,
83.303319611484, 77.0422898434913, 94.9522487152654, 93.5056368803038,
99.6066565241826, 93.2540063907868, 75.3235503045194, 4.79683173913879,
-1.77333441979239, 22.2927646778249, 21.2002682830845, 9.2462559574228,
12.9328717922413, 9.71104299233956, 6.02183609528862, 16.5548144989934,
14.9138467115507, 26.4290598776307, 13.2859100121563, 24.6139296632551,
6.01920473648369, 6.44403579988305, 15.0356163044265, 14.0083469209857,
-0.983183926446912, 11.305792234271, 17.6465875981944, -8.94856332381213,
7.36961719889383, 7.92121709811615, -0.418766330916343, -5.42220613778122,
33.1278663709957, 22.5146385498018, 46.6482380411691, 21.1315055557743,
23.3535222367336, 37.0589046029771, 20.5508910389587, 34.3154424989857,
37.5906644856971, 28.9704448512218, 30.6411807554989, 35.4051444540154,
30.0311466912122, 28.7535914372174, 28.4106840578911, 36.8662136858688,
12.898132148862, 46.7570642830409, 30.8475454612592, 19.28210909481,
19.5978468800625, 34.6039683478465, 21.0831790287953, 34.2925742059241,
53.7786968993373, 19.0172370755723, 9.21397428746006, -2.37886313478754,
6.79453923040543, 22.6413009302484, 33.2099355799597, 6.58849285636098,
-11.7836723204909, 5.86738511098067, 3.81888634358306, 20.5825579992258,
9.8827774793369, 14.4441862792073, 12.3532766231854, 10.2334773998435,
22.8131908769118, 23.8680133376392, 9.59780868195676, 24.4068082968655,
11.2009206443726, 13.4244085452577, 22.0625356242069, 12.2526091738859,
-2.39754291555823, 2.00084520595361, 36.7097665883357, 16.9122643222195,
20.0379767031296, 41.4658882877744, 24.0612860080661, 24.7786616325539,
33.9205960391195, 32.0684115690426, 26.5208653130137, 47.3248746356669,
37.8352815877295, 42.4135950414131, 33.8021211637222, 28.309616360779,
24.4645546802529, 32.9897820259554, 17.8194684139265, 36.2084800081616,
37.6500899883031, 22.4961071408182, 47.6368585912597, 57.0362400846021,
43.7894593761563, 23.2634280784332, 54.7477709366998)), row.names = c(NA,
-200L), class = c("tbl_df", "tbl", "data.frame"))
ggplot() +
stat_density_2d(data = df2,
aes(x = Coord1, y = Coord2, group=place, shape="Yuki"),
geom = "polygon", contour = TRUE, colour="darkorchid3", bins = 7, alpha=0.2, show.legend=TRUE) +
geom_polygon(data = df1, aes(x = Coord1, y = Coord2, fill = person), alpha = 0.6) +
geom_point(data = df1, aes(x = Coord1, y = Coord2), size = 1.5, show.legend=FALSE) +
geom_text_repel(data = df1, aes(x = Coord1, y = Coord2, label = place), colour="black") +
ggtitle("Combined Plot", subtitle = "Avery, Doug, Yuki") +
labs(fill = "Person", shape = "") +
guides(shape = guide_legend(order = 2), fill = guide_legend(order = 1))
The above code produces this plot:
imgdesc: a ggplot generated from the cold listed above, representing two filled rectangles and a collection of contour plots
This is what I'd like it to look like (moving the second element up or otherwise combining the legends would be awesome, but is a little outside the scope of my question.)
imgdesc: A plot similar to the one generated by the code above, except the purple outline around two of the three legend elements has been removed. The space between the elements is also even.
Things I've tried: rearranging the elements, forcing other colours to be called in the geom_polygon, and using extra elements to try and cover up the outline. The closest I've achieved, is removing the "colour" code from stat_density_2d removes the outline from all the elements in the legend, but it also removes the contour lines from the plot itself, which is not ideal. Removing the "dummy" shape aes from stat_density_2d removes the contour plots from the legend but does not remove the outlines from the other legend elements. Combining the dataframes is also not an ideal/workable solution in this case (sorry).
My actual plots also use custom scale_colour and fill values, which can't be easily included in a minimum reproducible example. For that reason, I haven't gone very far in trying to utilize custom labelling assigned to those, and would appreciate if there's a solution that doesn't depend on scale_color_manual, etc. However, I also know beggers can't be choosers and I'd appreciate any help on this at all!

Put the legend in correct layout but it seem really tricky to do anything regard the border. In stead of using your color I use the white color which still distinctive but would not show on legend border.
fill_color <- c("green", "red", "#444444")
names(fill_color) <- c("Avery", "Doug", "Yuki")
line_color <- c("green", "red", "darkorchid3")
names(line_color) <- c("Avery", "Doug", "Yuki")
ggplot() +
stat_density_2d(data = df2,
# Here using the fill aes instead of shape aes like your
# original input which cause the legend was put into
# two different categories.
aes(x = Coord1, y = Coord2, group=place, fill = "Yuki", color="Yuki",),
geom = "polygon", contour = TRUE,
bins = 7, alpha=0.2, show.legend=TRUE) +
geom_polygon(data = df1, aes(x = Coord1, y = Coord2, fill = person, color = person),
colour = "transparent", alpha = 0.6) +
geom_point(data = df1, aes(x = Coord1, y = Coord2), size = 1.5,
show.legend=FALSE) +
geom_text_repel(data = df1, aes(x = Coord1, y = Coord2, label = place),
colour="black") +
ggtitle("Combined Plot", subtitle = "Avery, Doug, Yuki") +
scale_color_manual(values = line_color, guide = FALSE) +
scale_fill_manual(values = fill_color) +
labs(fill = "Person", shape = "") +
guides(shape = guide_legend(order = 2), fill = guide_legend(order = 1)) +
theme(legend.key = element_rect(fill = NA))
[Update with maniputlate color pallete using scale_fill_manual and scale_color_manual]

Related

Creating a treemap, based on count, using R

I would like to create a tree map based on the count of "names". However, I am not sure how to do so. Seeking you help on this matter.
names <- c("A", "B", "B", "C", "D", "A", "A", "A", "A", "G", "B", "F", "F", "H")
names <- names %>% as.factor()
ggplot(names, aes(area= names, fill= names) + geom_treemap()
Many thanks
names <- c("A", "B", "B", "C", "D", "A", "A", "A", "A", "G", "B", "F", "F", "H")
names <- data.frame(names)
names <- names %>%
count(names)
ggplot(names, aes(area= n, fill= names)) + geom_treemap()

Adjust the multiple fills(color) of different label regions

0
Forgive my stupid to disturb you again.
#teunbrand answered my question yesterday and I used it in my real data but it doesn’t work .
Here is my question in stackoverfow:Can I adjust the fill(color) of different label regions when using ggh4x package
And # teunbrand created a function : assign_strip_colours <- function(gt, index, colours){…}
I don’t know where is wrong with my real data and code. There are 42 regions need to be filled with different colors.
gt <- assign_strip_colours(gt, 1:42,rainbow(42)) Warning message: In gt$grobs[is_strips] <- strips : 被替换的项目不是替换值长度的倍数(The item being replaced is not a multiple of the length of the replacement value. ) ?
If there is sth need to be adjust in assign_strip_colours <- function(gt, index, colours){…} ?
Forgive me I’m really new to ggplotGrob. I need your help.Thanks.
sample data and code:
structure(list(Name = 1:71, Disease = 72:142, Organ = c("A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A"), fill = c("a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a"
), mean =..., row.names = c(NA, 71L), class = "data.frame")
p1<-ggplot(data = data, aes(Name,mean, label = Name, fill=Organ)) +
geom_bar(position="dodge2", stat="identity",width = 0.85,color="black") +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd),position = position_dodge(0.95), width = .2) +
# scale_alpha_manual(values = datamean_sd$Alpha) +
# scale_color_manual(name = "Organ", values = c("A"="#f15a24", "B"="#00FF00","C"="#7570B3","D"="#FF00FF","E"="#FFFF33","F"="#00F5FF","G"="#666666","H"="#7FC97F","I"="#BEAED4","J"="#A6D854"))+
# guides(
# colour = guide_legend(title.position = "right")
# )+
facet_nested(.~Organ+Disease, scales = "free_x", space = "free_x",switch = "x")+
## facet_wrap(strip.position="bottom") +
labs(title = "123", x = NULL, y = "value") +
rotate_x_text(angle = 45)+
scale_fill_manual(name = "Organ",values = unique(datamean_sd$Organ_fill))
p1
####
gt <- ggplotGrob(p1)
###############
assign_strip_colours <- function(gt, index, colours) {
if (length(index) != length(colours))
stop()
# Decide which strips to recolour, here: the first 3
is_strips <- which(startsWith(gt$layout$name, "strip-b"))[index]
# Extract strips
strips <- gt$grobs[is_strips]
# Loop over strips
strips <- mapply(function(strip, colour) {
# Find actual strip
is_strip <- strip$layout$name == "strip"
grob <- strip$grobs[is_strip][[1]]
# Find rectangle
is_rect <- which(vapply(grob$children, inherits, logical(1), "rect"))
# Change colour
grob$children[[is_rect]]$gp$fill <- colour
# Put back into strip
strip$grobs[is_strip][[1]] <- grob
return(strip)
}, strip = strips, colour = colours)
# Put strips back into gtable
gt$grobs[is_strips] <- strips
return(gt)
}
gt <- assign_strip_colours(gt, 1:42,rainbow(42))
grid::grid.newpage(); grid::grid.draw(gt)
My bad, I think there should have been a SIMPLIFY = FALSE at the mapply() function which I forgot earlier.
gt <- ggplotGrob(p1)
assign_strip_colours <- function(gt, index, colours) {
if (length(index) != length(colours))
stop()
# Decide which strips to recolour, here: the first 3
is_strips <- which(startsWith(gt$layout$name, "strip-b"))[index]
# Extract strips
strips <- gt$grobs[is_strips]
# Loop over strips
strips <- mapply(function(strip, colour) {
# Find actual strip
is_strip <- strip$layout$name == "strip"
grob <- strip$grobs[is_strip][[1]]
# Find rectangle
is_rect <- which(vapply(grob$children, inherits, logical(1), "rect"))
# Change colour
grob$children[[is_rect]]$gp$fill <- colour
# Put back into strip
strip$grobs[is_strip][[1]] <- grob
return(strip)
}, strip = strips, colour = colours, SIMPLIFY = FALSE)
# Put strips back into gtable
gt$grobs[is_strips] <- strips
return(gt)
}
gt <- assign_strip_colours(gt, 1:42,rainbow(42))
grid::grid.newpage(); grid::grid.draw(gt)
Created on 2021-04-11 by the reprex package (v1.0.0)
Data / plot construction:
library(ggplot2)
library(ggh4x)
data <- [Censored upon request]
p1<-ggplot(data = data, aes(Name,mean, label = Name, fill=Organ)) +
geom_bar(position="dodge2", stat="identity",width = 0.85,color="black") +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd),position = position_dodge(0.95), width = .2) +
facet_nested(.~Organ+Disease, scales = "free_x", space = "free_x",switch = "x")+
theme_classic() +
theme(legend.position = "bottom",
legend.box = "horizontal",
plot.title = element_text(hjust = 0.5),
plot.margin = unit(c(5, 10, 20, 7), "mm"),
strip.background = element_rect(colour="black", fill="white"),
strip.text.x = element_text(size = 6, angle=0),
axis.text.x=element_text(size=8),
strip.placement = "outside"
) +
labs(title = "123", x = NULL, y = "value")

Efficient way to use geom_boxplot with specified quantiles and long data

I have a dataset with calculated quantiles for each department and country. It looks like this:
df <- structure(list(quantile = c("p5", "p25", "p50", "p75", "p95",
"p5", "p25", "p50", "p75", "p95", "p5", "p25", "p50", "p75",
"p95", "p5", "p25", "p50", "p75", "p95"), value = c(6, 12, 20,
33, 61, 6, 14, 23, 38, 63, 7, 12, 17, 26, 50, 7, 12, 18, 26,
51), country = c("A", "A", "A", "A", "A", "B", "B", "B", "B",
"B", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B"), dep = c("D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "I", "I", "I", "I",
"I", "I", "I", "I", "I", "I"), kpi = c("F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F")), row.names = c(NA, -20L), class = c("tbl_df", "tbl",
"data.frame"))
Now, I would like to build a boxplot for each department comparing countries and using p5/p95 instead of min/max similar to this plot but without outliers (hence, Train_number would be countries):
The corresponding code to this plot is (from question ggplot2, geom_boxplot with custom quantiles and outliers):
ggplot(MyData, aes(factor(Stations), Arrival_Lateness,
fill = factor(Train_number))) +
stat_summary(fun.data = f, geom="boxplot",
position=position_dodge(1))+
stat_summary(aes(color=factor(Train_number)),fun.y = q, geom="point",
position=position_dodge(1))
I tried to derive a solution from the code above and the provided answers. Unfortunately I lack the knowledge how to provide the neccessary values from the variables quantile and value to ggplot(). Is there an argument in the stat_summary() function I missed and could use? Or just another simple solution?
Whatever data you have provided from that you can generate the following plot
library(ggplot2)
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
ggplot(df, aes(factor(dep), value)) +
stat_summary(fun.data = f, geom="boxplot",
position=position_dodge(1))+
facet_grid(.~country, scales="free")
I don't know whether it is correct or not.

Efficient subsetting of a data.frame based on another jagged data.frame

I'm working on a project where I need to repeatedly subset a data.frame based on different combinations of attributes. Right now I'm subsetting the data.frame using the merge function as I don't know what the attributes input will be at run time, and this works. However, I'm wondering if there is a faster way to create the subsets.
require(data.table)
df <- structure(list(att1 = c("e", "a", "c", "a", "d", "e", "a", "d", "b", "a", "c", "a", "b", "e", "e", "c", "d", "d", "a", "e", "b"),
att2 = c("b", "d", "c", "a", "e", "c", "e", "d", "e", "b", "e", "e", "c", "e", "a", "a", "e", "c", "b", "b", "d"),
att3 = c("c", "b", "e", "b", "d", "d", "d", "c", "c", "d", "e", "a", "d", "c", "e", "a", "d", "e", "d", "a", "e"),
att4 = c("c", "a", "b", "a", "e", "c", "a", "a", "b", "a", "a", "e", "c", "d", "b", "e", "b", "d", "d", "b", "e")),
.Names = c("att1", "att2", "att3", "att4"), class = "data.frame", row.names = c(NA, -21L))
#create combinations of attributes
#attributes to search through
cnames <- colnames(df)
att_combos <- data.table()
for(i in 2:length(cnames)){
combos <- combn(cnames, i)
for(x in 1:ncol(combos)){
df_sub <- unique(df[,combos[1:nrow(combos), x]])
att_combos <- rbind(att_combos, df_sub, fill = T)
}
}
rm(df_sub, i, x, combos, cnames)
for(i in 1:nrow(att_combos)){
att_sub <- att_combos[i, ]
att_sub <- att_sub[, is.na(att_sub)==F, with = F]
#need to subset data.frame here - very slow on large data.frames
#anyway to speed this up?
df_subset_for_analysis <- merge(df, att_sub)
}
I would use data.table keys on the columns you want to subset on, and then generate a data.table (at runtime) with the combinations you are interested in, and then merge the two.
Here is an example with a single combination of attributes (simple_combinations) and one with multiple combinations of attributes (multiple_combinations):
require(data.table)
df <- structure(list(att1 = c("e", "a", "c", "a", "d", "e", "a", "d", "b", "a", "c", "a", "b", "e", "e", "c", "d", "d", "a", "e", "b"),
att2 = c("b", "d", "c", "a", "e", "c", "e", "d", "e", "b", "e", "e", "c", "e", "a", "a", "e", "c", "b", "b", "d"),
att3 = c("c", "b", "e", "b", "d", "d", "d", "c", "c", "d", "e", "a", "d", "c", "e", "a", "d", "e", "d", "a", "e"),
att4 = c("c", "a", "b", "a", "e", "c", "a", "a", "b", "a", "a", "e", "c", "d", "b", "e", "b", "d", "d", "b", "e")),
.Names = c("att1", "att2", "att3", "att4"), class = "data.frame", row.names = c(NA, -21L))
# Convert to data.table
dt <- data.table(df)
# Set key on the columns used for "subsetting"
setkey(dt, att1, att2, att3, att4)
# Simple subset on a single set of attributes
simple_combinations <- data.table(att1 = "d", att2 = "e", att3 = "d", att4 = "e")
setkey(simple_combinations, att1, att2, att3, att4)
# Merge to generate simple output subset (simple_combinations of att present in dt)
simple_subset <- merge(dt, simple_combinations)
# Complex (multiple) sets of attributes
multiple_combinations <- data.table(expand.grid(att1=c("d"), att2=c("c", "d", "e"),
att3 = c("d"), att4 = c("b", "e")))
setkey(multiple_combinations, att1, att2, att3, att4)
# Merge to generate output subset (multiple_combinations of att present in dt)
multiple_subset <- merge(dt, multiple_combinations)
The output is in simple_subset and multiple_subset.

OptimalCutoff Youden index calculation

After calculating the ROC curve for a dichotomous variable (a vs b). I want to calculate the optimal cut off value to differentiate this variable. The Youden index is the value that optimizes sensitivity and specificity for the differentiation.
Apparently, the package "OptimalCutpoints" should be able to do it. However, I get this strange error. Code inserted below:
library(pROC)
library(OptimalCutpoints)
df <- structure(list(value = c(1945.523629, 2095.549323, 2066.585153,
2445.878083, 2112.252632, 2115.92955, 2000.285032, 2224.611905,
1616.534694, 1668.017699, 1475.980978, 1940.849817, 1716.666667,
2153.284314, 2063.353635, 2163.070313, 1856.319149, 1499.986928,
2240.440449, 1869.083916, 1807.196078, 2025.603604, 1638.22973,
1781.602941, 2014.013809, 1906.027356, 2033.148718, 1923.403162,
1687.107744, 2632.280305, 1774.073084, 2196.162393, 2164.108659,
2055.031216, 2229.501425, 1273.872576, 2224.126126, 2006.858974,
1956.601942, 1808.214521, 1535.387136, 1382.15, 1596.69693, 1779.477273,
1577.174699, 1908.321526, 1833.124454, 1679.492978, 1777.31114,
1988.249023, 1736.75, 1985.68521, 1821.025974, 1745.325862, 1805.640777,
2326.821229, 1858.558824, 2025.622727, 2197.781321, 1475.685446,
2000.906423, 1714.749573, 1436.529412, 1981.15572, 1939.612779,
2007.679335, 2029.189536, 1644.298246, 1824.697342, 2281.990385,
2131.331776, 1143.722714, 1784.578076, 2143.131579, 982.4908457,
2217.021592, 1799.512346, 526.7047753, 1613.25, 951.9103079,
1006.241888, 1146.276835, 1651.474138, 1568.484778, 1938.867704,
792.5410822, 1602.037383, 1244.281863, 957.5739437, 819.6116071,
879.2128326, 1189.638632, 775.5525292, 1148.193333, 1130.812183,
902.34, 994.3302961), type = c("a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "b",
"b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b"
)), .Names = c("value", "type"), row.names = c(NA, -97L), class = "data.frame")
rocobj <- plot.roc(df$type, df$value, percent = TRUE, main="ROC", col="#1c61b6", add=FALSE)
optimal.cutpoint.Youden <- optimal.cutpoints(X = "value", status = "type", tag.healthy = 0, methods = "Youden",
data = df, pop.prev = NULL,
control = control.cutpoints(), ci.fit = FALSE, conf.level = 0.95, trace = FALSE)
summary(optimal.cutpoint.Youden)
plot(optimal.cutpoint.Youden)
Error: There are no healthy subjects in your dataset. Please review data and
variables. Prevalence must be a value higher than 0 and lower than 1.
I am probably missing something very obvious here. I tried to modify the code based on the package help file, but I cannot get rid of the error.
Thank you very much and my apologies for my R "skills"
PS: I understand the limitations of defining an "optimal cutoff" because it depends on how important your sensitivity is versus your specificity etc. I just want to have an idea of what value we would get using this technique.
the problem is how you have defined the tag.healthy argument. It should be 'a' or 'b' as these are in your data. You have defined it as 0.
Hope this helps.

Resources