How do I make ggrepel move (some) labels outside US map boundaries? - r

I'm trying to create my first map using ggrepel, but as you can see I've instead created a dumpster fire of overlapping labels. Most of the locations I'm mapping and labelling are clustered in the northeast, so the labels overlap. How do I get some of the labels to slide over beyond the map boundaries (in the ocean, so to speak)? Here's the code I used to create this monster:
plot_usmap(fill = "light blue", alpha = 0.5) +
ggrepel::geom_label_repel(data = top_18_2_transformed, aes(x=x, y=y, label=INSTNM),
size=3,
label.padding = unit(.75,"mm"),
nudge_y = 20,
nudge_x = 20,
box.padding=0.3,
max.overlaps=30,
point.padding=NA,
family="Avenir Next",
fill="gray99",
alpha=1.0,
label.r=unit(0.2,"lines"),
min.segment.length = 0.1,
label.size=unit(.15,"mm"),
segment.color="black",
segment.size=1,seed=1000) +
geom_point(data = top_18_2_transformed, aes(x = x, y = y, size = UGDS),
color = "red",
alpha = 0.75) +
labs(title = "Select Colleges",
size = "Undergrad Enrollment") +
theme(legend.position = "right")
And here's a picture of my problematic map:
Thanks in advance for any corrections you may be able to offer.
UPDATE 31 March 2022: here's the dput(top_18_2_transformed):
structure(list(lon = c(-74.659365, -122.167359, -78.937624, -75.19391,
-71.093226, -77.073463, -118.125878, -117.709837, -71.222839,
-79.941993, -72.926688, -76.483084, -73.961885, -71.169242, -74.025334,
-75.380236, -70.624084, -71.118313), lat = c(40.348732, 37.429434,
36.001135, 39.950929, 42.359243, 38.908809, 34.137349, 34.106515,
42.385995, 40.44357, 41.311158, 42.4472, 40.808286, 42.336213,
40.744776, 40.606822, 41.739072, 42.374471), UNITID = c(186131,
243744, 198419, 215062, 166683, 131496, 110404, 115409, 164739,
211440, 130794, 190415, 190150, 164924, 186867, 213543, 166692,
166027), OPEID = c(262700, 130500, 292000, 337800, 217800, 144500,
113100, 117100, 212400, 324200, 142600, 271100, 270700, 212800,
263900, 328900, 218100, 215500), OPEID6 = c(2627, 1305, 2920,
3378, 2178, 1445, 1131, 1171, 2124, 3242, 1426, 2711, 2707, 2128,
2639, 3289, 2181, 2155), INSTNM = c("Princeton University", "Stanford University",
"Duke University", "University of Pennsylvania", "Massachusetts Institute of Technology",
"Georgetown University", "California Institute of Technology",
"Harvey Mudd College", "Bentley University", "Carnegie Mellon University",
"Yale University", "Cornell University", "Columbia University in the City of New York",
"Boston College", "Stevens Institute of Technology", "Lehigh University",
"Massachusetts Maritime Academy", "Harvard University"), CITY = c("Princeton",
"Stanford", "Durham", "Philadelphia", "Cambridge", "Washington",
"Pasadena", "Claremont", "Waltham", "Pittsburgh", "New Haven",
"Ithaca", "New York", "Chestnut Hill", "Hoboken", "Bethlehem",
"Buzzards Bay", "Cambridge"), STABBR = c("NJ", "CA", "NC", "PA",
"MA", "DC", "CA", "CA", "MA", "PA", "CT", "NY", "NY", "MA", "NJ",
"PA", "MA", "MA"), ZIP = c("08544-0070", "94305", "27708", "19104-6303",
"02139-4307", "20057-0001", "91125", "91711", "02452-4705", "15213-3890",
"6520", "14853", "10027", "2467", "07030-5991", "18015", "02532-1803",
"2138"), ACCREDAGENCY = c("Middle States Commission on Higher Education",
"Western Association of Schools and Colleges Senior Colleges and University Commission",
"Southern Association of Colleges and Schools Commission on Colleges",
"Middle States Commission on Higher Education", "New England Commission on Higher Education",
"Middle States Commission on Higher Education", "Western Association of Schools and Colleges Senior Colleges and University Commission",
"Western Association of Schools and Colleges Senior Colleges and University Commission",
"New England Commission on Higher Education", "Middle States Commission on Higher Education",
"New England Commission on Higher Education", "Middle States Commission on Higher Education",
"Middle States Commission on Higher Education", "New England Commission on Higher Education",
"Middle States Commission on Higher Education", "Middle States Commission on Higher Education",
"New England Commission on Higher Education", "New England Commission on Higher Education"
), INSTURL = c("www.princeton.edu/", "www.stanford.edu/", "www.duke.edu/",
"www.upenn.edu/", "web.mit.edu/", "www.georgetown.edu/", "www.caltech.edu/",
"https://www.hmc.edu/", "www.bentley.edu/", "www.cmu.edu/", "https://www.yale.edu/",
"www.cornell.edu/", "www.columbia.edu/", "www.bc.edu/", "www.stevens.edu/",
"www.lehigh.edu/", "https://www.maritime.edu/", "www.harvard.edu/"
), SCH_DEG = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3), PREDDEG = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3), HIGHDEG = c(4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4), REGION = c(2, 8, 5, 2, 1, 2, 8, 8, 1, 2, 1,
2, 2, 1, 2, 2, 1, 1), CCBASIC = c(15, 15, 15, 15, 15, 15, 15,
21, 18, 15, 15, 15, 15, 15, 16, 16, 22, 15), ADM_RATE = c(0.0578,
0.0434, 0.076, 0.0766, 0.067, 0.1436, 0.0642, 0.1367, 0.4672,
0.1544, 0.0608, 0.1085, 0.0545, 0.2722, 0.3996, 0.321, 0.9146,
0.0464), ACTCM25 = c(33, 32, 33, 33, 34, 31, 35, 33, 27, 33,
33, 32, 33, 31, 31, 29, 19, 33), ACTCM75 = c(35, 35, 35, 35,
36, 35, 36, 35, 31, 35, 35, 35, 35, 34, 34, 33, 24, 35), SAT_AVG = c(1517,
1503, 1522, 1511, 1547, 1473, 1557, 1526, 1327, 1513, 1517, 1487,
1511, 1437, 1429, 1380, 1100, 1517), DISTANCEONLY = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), UGDS = c(5308,
6994, 6546, 10774, 4516, 7141, 938, 893, 4157, 6535, 6089, 14976,
8221, 9637, 3641, 5164, 1654, 7547), CURROPER = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), COSTT4_A = c(70900,
71587, 75105, 75303, 70240, 73840, 72084, 76953, 68577, 72265,
73900, 73879, 76907, 73053, 68734, 68383, 27858, 73485), COSTT4_P = c("NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
"NULL"), TUITIONFEE_IN = c(52800, 53529, 58031, 57770, 53790,
56058, 54600, 58660, 51830, 57119, 55500, 57222, 61788, 57910,
54014, 55240, 10018, 51925), TUITIONFEE_OUT = c(52800, 53529,
58031, 57770, 53790, 56058, 54600, 58660, 51830, 57119, 55500,
57222, 61788, 57910, 54014, 55240, 25752, 51925), AVGFACSAL = c(20724,
20865, 16863, 18277, 19624, 15798, 20595, 14397, 14592, 12296,
19830, 15574, 19431, 15599, 15318, 13763, 8928, 20988), PFTFAC = c("0.835",
"0.9881", "0.9364", "0.7779", "0.9885", "0.4815", "0.9289", "0.8992",
"0.6696", "0.9161", "0.717", "0.9074", "0.4521", "0.6662", "1",
"0.8392", "0.5867", "0.862"), C150_4 = c(0.979, 0.9432, 0.9462,
0.96, 0.954, 0.9491, 0.9357, 0.9167, 0.8952, 0.9049, 0.972, 0.9453,
0.9549, 0.9404, 0.8473, 0.8981, 0.7629, 0.971), RET_FT4 = c(0.9768,
0.9876, 0.9827, 0.9808, 0.9946, 0.9679, 0.9826, 0.9744, 0.9201,
0.9732, 0.9892, 0.9748, 0.9853, 0.9467, 0.9394, 0.9349, 0.8672,
0.9722), RET_PT4 = c("NULL", "NULL", "NULL", "0.9245", "NULL",
"0.6667", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "0.95",
"NULL", "NULL", "NULL", "NULL", "NULL"), MD_EARN_WNE_P10 = c("95689",
"97798", "93115", "103246", "111222", "96375", "112166", "108988",
"107974", "99998", "88655", "91176", "89871", "93021", "98159",
"95033", "91668", "84918"), PCT25_EARN_WNE_P10 = c("52729", "61965",
"61558", "65218", "67120", "61372", "67501", "69466", "73117",
"62003", "60311", "59566", "56005", "62006", "72669", "65644",
"68187", "56301"), PCT75_EARN_WNE_P10 = c("167686", "172245",
"151838", "174907", "169465", "147685", "175675", "173725", "146079",
"159483", "146102", "147189", "141158", "147010", "127298", "134075",
"129421", "153746"), MD_EARN_WNE_P6 = c("84713", "88873", "77260",
"80445", "112623", "71107", "129420", "112059", "78514", "87824",
"72046", "78779", "79434", "70858", "82237", "79832", "79354",
"77816"), GRAD_DEBT_MDN_SUPP = c("10450", "12000", "13500", "16763",
"13418", "16500", "PrivacySuppressed", "22089", "25000", "22014",
"13142", "14500", "21500", "18000", "27000", "23000", "26000",
"12665"), GRAD_DEBT_MDN10YR_SUPP = c("104.4654099", "119.9602793",
"134.9553142", "167.5745134", "134.1355856", "164.945384", "PrivacySuppressed",
"220.8168841", "249.9172485", "220.0671323", "131.3764992", "144.9520041",
"214.9288337", "179.9404189", "269.9106283", "229.9238686", "259.9139384",
"126.6080781"), C100_4 = c(0.898, 0.7288, 0.8831, 0.8571, 0.8691,
0.9076, 0.8434, 0.8565, 0.8479, 0.7599, 0.8777, 0.8694, 0.8635,
0.9003, 0.4566, 0.8003, 0.6322, 0.8476), ICLEVEL = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), OPENADMP = c(2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), GRADS = c("2997",
"10253", "10037", "14803", "6990", "12080", "1299", "NULL", "1086",
"7562", "7517", "8984", "23235", "4846", "3624", "1775", "97",
"21592"), ACCREDCODE = c("MSACHE", "WASCSR", "SACSCC", "MSACHE",
"NECHE", "MSACHE", "WASCSR", "WASCSR", "NECHE", "MSACHE", "NECHE",
"MSACHE", "MSACHE", "NECHE", "MSACHE", "MSACHE", "NECHE", "NECHE"
), RET_FT4_POOLED = c(0.9788, 0.9879, 0.9793, 0.9821, 0.9909,
0.9651, 0.9806, 0.9716, 0.9262, 0.97, 0.9892, 0.9741, 0.9825,
0.9479, 0.9423, 0.9378, 0.8633, 0.9817), C100_4_POOLED = c(0.8856,
0.739, 0.8788, 0.8546, 0.8602, 0.9009, 0.8242, 0.8551, 0.8326,
0.7546, 0.8772, 0.8766, 0.8677, 0.8918, 0.4515, 0.7621, 0.5955,
0.8573), BOOKSUPPLY = c("1050", "1245", "1434", "1358", "820",
"1200", "1428", "800", "1300", "1000", "1050", "970", "1294",
"1250", "1200", "1000", "1500", "1000"), ADMCON7 = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1), MDCOMP_ALL = c(0.5845,
0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845,
0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845,
0.5845), MDCOST_ALL = c(15387.5, 15387.5, 15387.5, 15387.5, 15387.5,
15387.5, 15387.5, 15387.5, 15387.5, 15387.5, 15387.5, 15387.5,
15387.5, 15387.5, 15387.5, 15387.5, 15387.5, 15387.5), MDEARN_ALL = c(37078,
37078, 37078, 37078, 37078, 37078, 37078, 37078, 37078, 37078,
37078, 37078, 37078, 37078, 37078, 37078, 37078, 37078), PPTUG_EF = c(0,
0, 0.0031, 0.0537, 0.0064, 0.0214, 0, 0.0011, 0.0118, 0.017,
2e-04, 3e-04, 0.0633, 0.0127, 0, 0.0128, 0.023, 0.0745), INEXPFTE = c(60048,
113338, 68756, 56874, 80756, 31693, 105185, 34419, 15842, 28167,
57231, 29893, 96463, 23266, 12504, 24995, 9687, 46272), C150_4_POOLED = c(0.9712,
0.9435, 0.9512, 0.9574, 0.9477, 0.9452, 0.9278, 0.9179, 0.8917,
0.8968, 0.969, 0.9452, 0.9566, 0.9297, 0.8608, 0.886, 0.7484,
0.974), GRAD_DEBT_MDN = c("10450", "12000", "13500", "16763",
"13418", "16500", "17747", "22089", "25000", "22014", "13142",
"14500", "21500", "18000", "27000", "23000", "26000", "12665"
), x = c(2107384.76948701, -1933340.27810509, 1876178.25472949,
2077243.02501463, 2314261.77712267, 1955381.08673633, -1660141.85673732,
-1623368.30493136, 2303424.70345276, 1678023.03854027, 2211596.23078863,
1896995.53745184, 2147624.50302849, 2309370.68277906, 2144734.86774305,
2041573.64168227, 2373567.48443726, 2311783.20749272), y = c(-188894.792987744,
-582296.149881856, -762721.806918975, -245389.810253038, 123275.753360416,
-404107.357328073, -1027748.36033576, -1039201.65863312, 122405.777575308,
-300870.762534603, -39714.5927185968, -7748.73302456512, -121333.925485063,
118650.586978148, -129820.607837031, -179439.260821836, 71069.0976923304,
124173.1993115)), class = "data.frame", row.names = c(NA, -18L
))

With a little data manipulation, you could move the labels out to either side of the country an draw segments to connect the labels to the universities:
top_18_2_transformed <- top_18_2_transformed[order(-top_18_2_transformed$y),]
colleges_east <- top_18_2_transformed[top_18_2_transformed$x > 0,]
colleges_west <- top_18_2_transformed[top_18_2_transformed$x < 0,]
colleges_west$lab_x <- -2300000
colleges_west$lab_y <- seq(-1000000, -1500000, -250000)
colleges_east$lab_x <- 2800000
colleges_east$lab_y <- seq(1000000, -2500000, -250000)
plot_usmap(fill = "light blue", alpha = 0.5) +
geom_text(data = colleges_west,
aes(x = lab_x, y = lab_y, label =stringr::str_wrap(INSTNM, 25)),
hjust = 1, size = 3, lineheight = 0.8) +
geom_text(data = colleges_east,
aes(x = lab_x, y = lab_y, label = stringr::str_wrap(INSTNM, 25)),
hjust = 0, size = 3, lineheight = 0.8) +
geom_point(data = top_18_2_transformed, aes(x = x, y = y, size = UGDS),
color = "red",
alpha = 0.75) +
geom_segment(data = colleges_east,
aes(x, y, xend = lab_x - 100000, yend = lab_y)) +
geom_segment(data = colleges_west,
aes(x, y, xend = lab_x + 100000, yend = lab_y)) +
labs(title = "Select Colleges",
size = "Undergrad Enrollment") +
theme(legend.position = c(0.35, 0),
legend.direction = 'horizontal') +
coord_cartesian(xlim = c( -3500000, 4000000),
ylim = c(-3000000, 1500000))

It looks like ggrepel::geom_*_repel() won't take xlim within aes() nor can it accept a list of vectors to split the constraints of west and east coast labels. However, you can just split them into two separate layers and then it's easier to control. Below I made a function to supply that position to avoid duplicating the code for those layers. Then you have to customize the exact values used in xlim and expand_limits() to get things to look nice depending on your graphics device etc.
Also IMHO this visualization is very hard to easily get much information out of. The points in the northeast are mostly overlapping and there are so many labels that even when spaced very nicely it is a bit tricky to follow them all. Instead it may be better to have a zoomed in plot for that region and avoid showing lots of space with no data in your plot or other ways to increase the legibility of the plot.
library(tidyverse)
library(ggrepel)
library(usmap)
# create function to generate labels and constrain outside map away from the center of the map
college_layers <- function(d) {
xlimz <- if (all(d$x > 0)) {c(2.5e6, NA)} else {c(NA, -2e6)}
geom_text_repel(
data = d,
aes(x, y, label = INSTNM),
xlim = xlimz,
ylim = c(-Inf, Inf),
size = 3,
force = 20,
box.padding = 0.3,
max.overlaps = 30,
point.padding = NA,
alpha = 1.0,
min.segment.length = 0.1,
segment.color = "black",
segment.size = 1,
seed = 1000
)
}
# plot with separate layer for west coast and east coast
plot_usmap(fill = "light blue", alpha = 0.5) +
geom_point(
data = d,
aes(x = x, y = y, size = UGDS),
color = "red",
alpha = 0.75
) +
college_layers(d = filter(d, x > 0)) +
college_layers(d = filter(d, x < 0)) +
expand_limits(x = c(-3.9e6, 4.6e6),
y = c(-3e6, 2e6)) +
labs(title = "Select Colleges",
size = "Undergrad Enrollment") +
theme(legend.position = c(0.35, 0),
legend.direction = 'horizontal',
plot.title = element_text(hjust = 0.5))
Created on 2022-04-01 by the reprex package (v2.0.1)
Data:
d <- structure(list(INSTNM = c("Princeton University", "Stanford University",
"Duke University", "University of Pennsylvania", "Massachusetts Institute of Technology",
"Georgetown University", "California Institute of Technology",
"Harvey Mudd College", "Bentley University", "Carnegie Mellon University",
"Yale University", "Cornell University", "Columbia University in the City of New York",
"Boston College", "Stevens Institute of Technology", "Lehigh University",
"Massachusetts Maritime Academy", "Harvard University"), x = c(2107384.76948701,
-1933340.27810509, 1876178.25472949, 2077243.02501463, 2314261.77712267,
1955381.08673633, -1660141.85673732, -1623368.30493136, 2303424.70345276,
1678023.03854027, 2211596.23078863, 1896995.53745184, 2147624.50302849,
2309370.68277906, 2144734.86774305, 2041573.64168227, 2373567.48443726,
2311783.20749272), y = c(-188894.792987744, -582296.149881856,
-762721.806918975, -245389.810253038, 123275.753360416, -404107.357328073,
-1027748.36033576, -1039201.65863312, 122405.777575308, -300870.762534603,
-39714.5927185968, -7748.73302456512, -121333.925485063, 118650.586978148,
-129820.607837031, -179439.260821836, 71069.0976923304, 124173.1993115
), UGDS = c(5308, 6994, 6546, 10774, 4516, 7141, 938, 893, 4157,
6535, 6089, 14976, 8221, 9637, 3641, 5164, 1654, 7547)), class = "data.frame", row.names = c(NA,
-18L))

Related

How to locate bar plot label in a fixed position of the plot area in ggplot

I am working in bar race plot where only one bar in each plot will have a label. I want that label to appear in the same position in the bar race plot regardless of the data and ranking in each year (for example, I would like to place the label at the bottom right side of the chart). In other words, I would like to locate the label in the same coordinates (x,y) in the chart. The following code is an example of what I can do right now.
library(ggimage)
library(janitor)
library(gganimate)
library(ggplot2)
library(tidyverse)
library(transformr)
library(ggforce)
theme_set(theme_classic())
data_bar <- structure(list(firm = c("audi", "ford", "mercedes", "toyota",
"audi", "ford", "mercedes", "toyota", "audi", "ford", "mercedes",
"toyota", "audi", "ford", "mercedes", "toyota", "audi", "ford",
"mercedes", "toyota", "audi", "ford", "mercedes", "toyota", "audi",
"ford", "mercedes", "toyota", "audi", "ford", "mercedes", "toyota",
"audi", "ford", "mercedes", "toyota", "audi", "ford", "mercedes",
"toyota", "audi", "ford", "mercedes", "toyota"), year = c(1945,
1945, 1945, 1945, 1946, 1946, 1946, 1946, 1947, 1947, 1947, 1947,
1948, 1948, 1948, 1948, 1949, 1949, 1949, 1949, 1950, 1950, 1950,
1950, 1951, 1951, 1951, 1951, 1952, 1952, 1952, 1952, 1953, 1953,
1953, 1953, 1954, 1954, 1954, 1954, 1955, 1955, 1955, 1955),
value = c(1156L, 2750L, 48436L, NA, 1170L, 3118L, 48653L,
NA, 1189L, 3453L, 47990L, NA, 1240L, 4406L, 49284L, NA, 1245L,
5249L, 51490L, NA, 1156L, 5907L, 48436L, 2007L, 1170L, 6311L,
48653L, 2042L, 1189L, 6323L, 47990L, 2077L, 1240L, 6594L,
49284L, 2110L, 1245L, 7261L, 51490L, 2146L, 1246L, 8054L,
49632L, 2179L), rank = c(3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1,
4, 3, 2, 1, 4, 3, 2, 1, 4, 4, 2, 1, 3, 4, 2, 1, 3, 4, 2,
1, 3, 4, 2, 1, 3, 4, 2, 1, 3, 4, 2, 1, 3), Value_rel = c(0.0238665455446362,
0.0567759517714097, 1, NA, 0.0240478490535013, 0.0640864900417241,
1, NA, 0.0247759949989581, 0.0719524901021046, 1, NA, 0.0251602954305657,
0.0894002110218326, 1, NA, 0.024179452320839, 0.101942124684405,
1, NA, 0.0238665455446362, 0.121954744404988, 1, 0.041436121892807,
0.0240478490535013, 0.129714508868929, 1, 0.0419706903993587,
0.0247759949989581, 0.131756615961659, 1, 0.0432798499687435,
0.0251602954305657, 0.133795958120282, 1, 0.0428130833536239,
0.024179452320839, 0.141017673334628, 1, 0.0416779957273257,
0.0251047711154094, 0.162274339136041, 1, 0.0439031270148291
), Value_lbl = c(" 0", " 0", " 0", " NA", " 0", " 0", " 0",
" NA", " 0", " 0", " 0", " NA", " 0", " 0", " 0", " NA",
" 0", " 0", " 0", " NA", " 0", " 0", " 0", " 0", " 0", " 0",
" 0", " 0", " 0", " 0", " 0", " 0", " 0", " 0", " 0", " 0",
" 0", " 0", " 0", " 0", " 0", " 0", " 0", " 0")), row.names = c(NA,
-44L), class = c("tbl_df", "tbl", "data.frame"))
staticplot = ggplot(data_bar, aes(rank, group = firm, fill = as.factor(if_else(firm == "ford","darkblue", "darkred")), color = as.factor(if_else(firm == "ford","darkblue", "darkred")))) +
geom_tile(aes(y = value/2,
height = value,
width = 0.9), alpha = 0.8, color = NA) +
geom_text(aes(y = value, label = as.factor(ifelse(firm == "ford", round(value,2),NA))), vjust= 0.3, hjust = 1, size = 20, color = "red") +
coord_flip(clip = "off", expand = FALSE) +
scale_y_continuous(labels = scales::comma) +
scale_x_reverse() +
guides(color = "none", fill = "none") +
theme(axis.line=element_blank(),
axis.text.x=element_text(size = 28),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
legend.position="none",
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.grid.major.x = element_line( size=.1, color="black" ),
panel.grid.minor.x = element_line( size=.1, color="black" ),
plot.title=element_blank(),
plot.subtitle=element_blank(),
plot.caption =element_blank(),
plot.background=element_blank(),
plot.tag = element_text(size=102, face="italic", color="red"),
plot.tag.position = c(0.893,0.2),
plot.margin = margin(2,2, 2, 4, "cm"))
anim = staticplot + transition_states(year, transition_length = 1, state_length = 1, wrap = FALSE)
animate(anim, 200, fps = 10, duration = 30, width = 1600, height = 900,
renderer = ffmpeg_renderer()) -> for_mp4
anim_save("firm position by year.mp4", animation = for_mp4)
By "playing" with hjust and vjust in geom_text(), I can easily move the label (the number in the chart), but always with respect to the bar corresponding to each year. What I want is to place the number in a fixed position with respect to the plotting area. Therefore, regardless of the data in each year, I will have the label in the same position of the plotting area.
Thanks in advance
Try moving the y out of aes() in geom_text and give it a specific value.
df <- data.frame(dose = c("D0.5", "D1", "D2"), len = c(4.2, 10, 29.5))
p <- ggplot(data = df, aes(x = dose, y = len)) +
geom_bar(stat = "identity")
p + geom_text(aes(label = as.factor(ifelse(dose == "D1", len, NA))),
x = 2,
y = 15,
vjust = 0.3, hjust = 1, size = 20)

How to correlate multiple subsets in R

How do I correlate 8 subsets separately against two different dependent variables? I keep getting the same correlation coefficient for the two different subsets (example below). Here is the input:
with(subset(mydata2, PARTYID_Strength = 1), cor.test(PARTYID_Strength,
mean.legit))
with(subset(mydata2, PARTYID_Strength = 1), cor.test(PARTYID_Strength,
mean.leegauthor))
with(subset(mydata2, PARTYID_Strength = 2), cor.test(PARTYID_Strength,
mean.legit))
with(subset(mydata2, PARTYID_Strength = 2), cor.test(PARTYID_Strength,
mean.leegauthor))
Output (I get this for both PARTY_Strength = 1 and 2):
Pearson's product-moment correlation
data: PARTYID_Strength and mean.legit t = 3.1005, df = 607, p-value
= 0.002022 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval:
0.0458644 0.2023031 sample estimates:
cor
0.1248597
Pearson's product-moment correlation
data: PARTYID_Strength and mean.leegauthor t = 2.8474, df = 607,
p-value = 0.004557 alternative hypothesis: true correlation is not
equal to 0 95 percent confidence interval:
0.03568431 0.19250344 sample estimates:
cor
0.1148091
Sample data:
> dput(head(mydata2, 10))
``structure(list(PARTYID = c(1, 3, 1, 1, 1, 4, 3, 1, 1, 1), PARTYID_Other =
c("NA",
"NA", "NA", "NA", "NA", "Green", "NA", "NA", "NA", "NA"), PARTYID_Strength =
c(1,
7, 1, 2, 1, 8, 1, 6, 1, 1), PARTYID_Strength_Other = c("NA",
"NA", "NA", "NA", "NA", "Green", "NA", "NA", "NA", "NA"), THERM_Dem = c(80,
65, 85, 30, 76, 15, 55, 62, 90, 95), THERM_Rep = c(1, 45, 10,
5, 14, 14, 0, 4, 10, 3), Gender = c("Female", "Male", "Male",
"Female", "Female", "Male", "Male", "Female", "Female", "Male"
), `MEAN Age` = c(29.5, 49.5, 29.5, 39.5, 29.5, 21, 39.5, 39.5,
29.5, 65), Age = c("25 - 34", "45 - 54", "25 - 34", "35 - 44",
"25 - 34", "18 - 24", "35 - 44", "35 - 44", "25 - 34", "65+"),
Ethnicity = c("White or Caucasian", "Asian or Asian American",
"White or Caucasian", "White or Caucasian", "Hispanic or Latino",
"White or Caucasian", "White or Caucasian", "White or Caucasian",
"White or Caucasian", "White or Caucasian"), Ethnicity_Other = c("NA",
"NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"), States = c("Texas",
"Texas", "Ohio", "Texas", "Puerto Rico", "New Hampshire",
"South Carolina", "Texas", "Texas", "Texas"), Education = c("Master's
degree",
"Bachelor's degree in college (4-year)", "Bachelor's degree in college (4-
year)",
"Master's degree", "Master's degree", "Less than high school degree",
"Some college but no degree", "Master's degree", "Master's degree",
"Some college but no degree"), `MEAN Income` = c(30000, 140000,
150000, 60000, 80000, 30000, 30000, 120000, 150000, 60000
), Income = c("Less than $30,000", "$130,001 to $150,000",
"More than $150,000", "$50,001 to $70,000", "$70,001 to $90,000",
"Less than $30,000", "Less than $30,000", "$110,001 to $130,000",
"More than $150,000", "$50,001 to $70,000"), mean.partystrength = c(3.875,
2.875, 2.375, 3.5, 2.625, 3.125, 3.375, 3.125, 3.25, 3.625
), mean.traitrep = c(2.5, 2.625, 2.25, 2.625, 2.75, 1.875,
2.75, 2.875, 2.75, 3), mean.traitdem = c(2.25, 2.625, 2.375,
2.75, 2.625, 2.125, 1.875, 3, 2, 2.5), mean.leegauthor = c(1,
2, 2, 4, 1, 4, 1, 1, 1, 1), mean.legit = c(1.71428571428571,
3.28571428571429, 2.42857142857143, 2.42857142857143, 2.14285714285714,
1.28571428571429, 1.42857142857143, 1.14285714285714, 2.14285714285714,
1.28571428571429)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))``
Thank you!
To run the tests, create a vector of the columns of interest and then sapply an anonymous function to each of them.
fixed <- "PARTYID_Strength"
cols <- c("mean.leegauthor", "mean.legit")
cor_test_result <- sapply(cols, function(x){
fmla <- paste(fixed, x, sep = "+")
fmla <- as.formula(paste("~", fmla))
cor.test(fmla, mydata2)
}, simplify = FALSE)
cor_test_result$mean.leegauthor
#
# Pearson's product-moment correlation
#
#data: PARTYID_Strength and mean.leegauthor
#t = 1.4804, df = 8, p-value = 0.177
#alternative hypothesis: true correlation is not equal to 0
#95 percent confidence interval:
# -0.2343269 0.8462610
#sample estimates:
# cor
#0.4637152

How to get individual plot percentages on facet wrap plots in r

New to R, apologies in advance. I have run the following code to get separate plots showing the percentage of proportion of Injury_Areas for each Occupation_Proper.
SelectOccupation_InjuryAreas %>%
group_by(Occupation_Proper) %>%
mutate(prop = counnt / sum(counnt)) %>%
ggplot(aes(Injury_Area, prop)) +
ylab('prop') +
geom_bar(stat="identity") +
scale_y_continuous(labels=percent) +
facet_wrap(~ Occupation_Proper,ncol=2)+
geom_col(aes(fill = Occupation_Proper), position = "dodge") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label = scales::percent(prop),
y = prop,
group = Occupation_Proper),
position = position_dodge(width = 0.9),
vjust = 1.5)
I would like to show the percentage of each Injury_Area as a percentage of the total of all Injury_Area for each Occupation_Proper separately. The code that I ran for some reason is displaying the Injury_Area percentages as a percentage of all of the plots combined instead of displaying the percentages for each plot separately (e.g. percentage for each plot separately should add up to 100), see picture of plots 1. How do I get percentages of proportions for each plot separately? Thank you!
My data is as follows (using R Studio so hopefully this is the correct way to display.
structure(list(Occupation_Proper = c("HTO", "HTO", "HTO", "HTO",
"HTO", "HTO", "HTO", "HTO", "HTO", "HTO", "HTO", "HTO", "HTO",
"HTO", "HTO", "HTO", "HTO", "HTO", "Bl", "Bl", "Bl", "Bl", "Bl",
"Bl", "Bl", "Bl", "Bl", "Bl", "Bl", "Cb", "Cb", "Cb", "Cb", "Cb",
"Cb", "Cb", "Cb", "Cb", "Cb", "Cb", "Cb", "CrO", "EO", "EO",
"EO", "EO", "EO", "EO", "EO", "EO", "EO", "EO", "EO", "EO", "EO",
"EO", "EO", "L", "L", "L", "TR", "TR", "TR", "TR", "TR", "TR",
"TR", "TR", "WH", "WH", "WH", "WH", "WH", "WH", "WH", "WH", "WH",
"WH", "WH", "WH"), Injury_Area = c("Back", "Neck", "Hand", "Head",
"Face", "Arm", "Leg", "Foot", "Knee", "Chest", "Eye", "Ear",
"Ribs", "Throat_Tongue", "Other", "Hip", "Buttock_Pelvis", "Torso",
"Neck", "Hand", "Head", "Arm", "Leg", "Knee", "Eye", "Ear", "Other",
"Hip", "Torso", "Neck", "Hand", "Head", "Arm", "Leg", "Foot",
"Knee", "Eye", "Other", "Hip", "Buttock_Pelvis", "Torso", "Knee",
"Neck", "Hand", "Head[", "Face", "Arm", "Leg", "Foot", "Knee",
"Torso_Buttock", "Eye", "Ear", "Spill", "Other", "Buttock_Pelvis",
"Torso", "Arm", "Eye", "Other", "Neck", "Hand", "Face", "Arm",
"Knee", "Eye", "Ear", "Hip", "Hand", "Head", "Face", "Arm", "Leg",
"Knee", "Eye", "Ear", "Other", "Hip", "Buttock_Pelvis", "Torso"
), counnt = c(4, 30, 20, 17, 15, 18, 32, 6, 13, 5, 12, 9, 2,
3, 50, 5, 6, 3, 2, 1, 1, 2, 7, 7, 1, 2, 2, 1, 1, 4, 7, 2, 5,
16, 1, 5, 1, 2, 3, 1, 2, 2, 14, 18, 6, 7, 19, 14, 3, 6, 1, 5,
4, 2, 26, 1, 2, 1, 1, 1, 1, 11, 2, 7, 1, 1, 5, 1, 12, 1, 3, 8,
2, 1, 2, 2, 2, 1, 1, 1)), row.names = c(NA, -80L), class = "data.frame")
df %>%
rename(count = counnt) %>%
group_by(Occupation_Proper) %>%
mutate(percent = (count/sum(count) * 100)) %>%
ggplot(aes(x = Injury_Area, y = percent, fill = Occupation_Proper)) +
geom_bar(stat="identity") +
facet_wrap(~ Occupation_Proper, ncol=2) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
geom_text(aes(label = paste(sprintf("%.1f", percent), "%", sep=""),
y = percent+7.5))

Change labels in facet wrap ggplot [duplicate]

This question already has answers here:
How to change facet labels?
(23 answers)
Closed 4 years ago.
Greeting,
I am having some difficulties with correctly naming labels in ggplot2 using facet_wrap. My graphing code is:
library(ggplot2)
mlm.plots <- ggplot(positions.df, aes(x, y)) +
geom_point(size=0.75, shape=1, colour="darkred", fill="pink")+
geom_abline(data = multilevel.df, aes(intercept=V2, slope=V3,
group=party), color="red", size=.1)+
facet_wrap(~party, ncol=2) +
xlab("")+
ylab("")
This produces the following:
What I really want is instead of the labels ("1", "2"...) to have the names of the political parties in each case. I have a feeling there should be an elegant solution using labeller() but I can't figure one out.
I think part of the problem is that I have set up party as a factor as follows:
party <- as.factor(rep(c("National Coalition Party", "Centre Party",
"Social Democratic Party", "Left Alliance", "Christian Democrats",
"True Finns", "Swedish People's Party", "Greens"), J))
I also know that the plot goes very wonky if party is as.character.
The data to make the graphs is as follows:
structure(list(party = c(5, 1, 6, 4, 2, 8, 7, 3, 5, 1, 6, 4,
2, 8, 7, 3, 5, 1, 6, 4, 2, 8, 7, 3, 5, 1, 6, 4, 2, 8, 7, 3, 5,
1, 6, 4, 2, 8, 7, 3, 5, 1, 6, 4, 2, 8, 7, 3, 5, 1, 6, 4, 2, 8,
7, 3, 5, 1, 6, 4, 2, 8, 7, 3), x = c(-16.667, -36.735, 43.243,
-64.286, 37.963, -9.091, 6.593, -38.614, -30.496, -10.549, -45.455,
-46.515, 25.926, -23.81, -29.53, -38.614, -4.478, 1.266, 1.176,
-30.357, 5, 13.084, -7.692, -38.614, 14.62, 26.829, -13.725,
-14.894, 24.299, 13.084, 4.145, -13.433, 29.464, -1.049, -1.37,
-38.168, 19.444, -5.882, 14.516, -6.25, 9.756, -7.636, -24.742,
-45.946, 7.813, -5.882, -19.931, -33.523, -20.556, -15.09, -36.432,
-42.051, -15.108, -6.518, -25.472, -21.471, 13.75, -16.383, -11.384,
-44.767, -16.771, 0.472, -23.392, -27.715), y = c(-0.295492376,
0.187137648, -0.209073538, 1.026732887, -0.524148543, 0.232093035,
-1.617201837, -0.038851011, -0.351777544, 0.637192933, -0.783167803,
1.549387151, -0.742792721, -0.054633476, -2.204811412, 0.009461977,
-0.594714182, 1.172333694, -0.951553793, 1.59911439, -1.246200649,
-0.337551454, -2.631499836, 0.6051641, -0.885991535, 1.492537342,
-1.275241929, 1.658246706, -1.331133971, -0.676627085, -3.220241861,
0.82922329, -0.841711554, 1.611623219, -1.531110402, 1.469424694,
-1.979679497, -0.724442893, -3.523278033, 1.187782421, -0.842631246,
1.35252299, -1.950335, 0.859798616, -2.152810527, -0.623310324,
-3.48956421, 1.718330701, -0.809637545, 0.741273409, -1.96458669,
1.466255347, -2.675088542, -1.066556748, -3.436585287, 1.935368096,
-0.870188157, 0.477034948, -2.49292584, 1.93375064, -2.924310472,
-1.033098158, -3.250669464, 2.086336567)), .Names = c("party",
"x", "y"), row.names = c(NA, -64L), class = "data.frame")
and
structure(list(party = c(5, 1, 6, 4, 2, 8, 7, 3), V2 =
c(-0.671389852256272,
1.07302815113772, -1.26372215643281, 1.79721076947721,
-1.95951156748975,
-0.541929683566524, -2.8106689095983, 1.42500879635995), V3 =
c(-0.000574845695491941,
0.018171274525851, 0.0127869327689727, 0.00934727979573554,
0.0251920546515927,
0.00326951650086729, 0.00867962541673107, 0.0153496027643832),
V4 = c(-0.417933984027918, -0.417933984027918, -0.417933984027918,
-0.417933984027918, -0.417933984027918, -0.417933984027918,
-0.417933984027918, -0.417933984027918), V5 = c(0.0114033982479481,
0.0114033982479481, 0.0114033982479481, 0.0114033982479481,
0.0114033982479481, 0.0114033982479481, 0.0114033982479481,
0.0114033982479481)), .Names = c("party", "V2", "V3", "V4",
"V5"), row.names = c("National.Coalition.Party", "Centre.Party",
"Social.Democratic.Party", "Left.Alliance", "Christian.Democrats",
"True.Finns", "Swedish.People.s.Party", "Greens"), class =
"data.frame")
When converting the party column to factor, use the labels argument to specify the labels.
positions.df$party <- factor(positions.df$party,
labels = c("National Coalition Party", "Centre Party",
"Social Democratic Party", "Left Alliance", "Christian Democrats",
"True Finns", "Swedish People's Party", "Greens"))
multilevel.df$party <- factor(multilevel.df$party,
labels = c("National Coalition Party", "Centre Party",
"Social Democratic Party", "Left Alliance", "Christian Democrats",
"True Finns", "Swedish People's Party", "Greens"))
After that, you can plot your data using your original code.
mlm.plots <- ggplot(positions.df, aes(x, y)) +
geom_point(size=0.75, shape=1, colour="darkred", fill="pink")+
geom_abline(data = multilevel.df, aes(intercept=V2, slope=V3,
group=party), color="red", size=.1)+
facet_wrap(~party, ncol=2) +
xlab("")+
ylab("")
mlm.plots

Selectively apply custom function based on criteria

I am working with this dataframe:
structure(list(year = c("2012", "2016", "2012", "2016"), month = c("12",
"12", "12", "12"), company = c("ALSN", "ALSN", "DAN", "DAN"),
Revenue = c(2141.8, 1840.2, 7224, 5826), `Cost of Goods Sold` = c(1187.5,
976, 6250, 4982), `Gross Profit` = c(954.3, 864.2, 974, 844
), `Gross Margin %` = c(44.56, 46.96, 13.48, 14.49), `Selling, General, & Admin. Expense` = c(419,
323.9, 424, 406), `Impairment Of Capital Assets` = c(0, 0,
2, 0), Advertising = c(1, 1, 1, 1), `Research & Development` = c(115.1,
88.8, 0, 0), `Restructuring And Mergern Acquisition` = c(0,
0, 47, 0), `Other Operating Expense` = c(-5.68434188608e-14,
1.13686837722e-13, 121, 8), `Operating Income` = c(420.2,
451.5, 429, 430), `Operating Margin %` = c(19.62, 24.54,
5.94, 7.38), `Interest Income` = c(0.9, 0.7, 24, 13), `Interest Expense` = c(-152.1,
-101.6, -84, -113), `Net Interest Income` = c(-151.2, -100.9,
-60, -100), `Other Income (Expense)` = c(-52.8, -9.3, -5,
-115), `Non Operating Income` = c(-52.8, -9.3, -5, -115),
`Other Income (Minority Interest)` = c(0, 0, -15, -13), `Gain on Sale of Security` = c(-1.3,
-0.8, 0, 7), `Write Off` = c(1, 1, 1, 1), `Pre-Tax Income` = c(216.2,
341.3, 364, 215), `Tax Provision` = c(298, -126.4, -51, 424
), `Tax Rate %` = c(-137.84, 37.03, 14.01, -197.21), `Net Income (Continuing Operations)` = c(514.2,
214.9, 315, 653), `Net Income (Discontinued Operations)` = c(0,
0, 0, 0), `Net Income` = c(514.2, 214.9, 300, 640), `Net Margin %` = c(24.01,
11.68, 4.15, 10.99), `Preferred Dividends` = c(0, 0, 31,
0), `EPS (Basic)` = c(2.83, 1.28, 1.82, 4.38), `EPS (Diluted)` = c(2.76,
1.27, 1.4, 4.36), `Shares Outstanding (Diluted Average)` = c(186.2,
168.8, 214.7, 146.8), `Depreciation, Depletion and Amortization` = c(252.5,
175.9, 277, 182), EBITDA = c(620.8, 618.8, 725, 510)), .Names = c("year",
"month", "company", "Revenue", "Cost of Goods Sold", "Gross Profit",
"Gross Margin %", "Selling, General, & Admin. Expense", "Impairment Of Capital Assets",
"Advertising", "Research & Development", "Restructuring And Mergern Acquisition",
"Other Operating Expense", "Operating Income", "Operating Margin %",
"Interest Income", "Interest Expense", "Net Interest Income",
"Other Income (Expense)", "Non Operating Income", "Other Income (Minority Interest)",
"Gain on Sale of Security", "Write Off", "Pre-Tax Income", "Tax Provision",
"Tax Rate %", "Net Income (Continuing Operations)", "Net Income (Discontinued Operations)",
"Net Income", "Net Margin %", "Preferred Dividends", "EPS (Basic)",
"EPS (Diluted)", "Shares Outstanding (Diluted Average)", "Depreciation, Depletion and Amortization",
"EBITDA"), row.names = c(NA, 4L), class = "data.frame")
Constants:
startDate <- "2012-01-01"
endDate <- "2016-12-31"
What I want: to create a function that applies a custom function to all numeric columns. I am trying to calculate CAGRs. The CAGR formula is as such:
((End Value / Beginning Value)^(1/number of years)-1)
So as you can see, I need for each column to be able to find the correct end value and beginning value.
My function right now is this:
cagr <- function(startval,endval,x,y,years){
return(((endval[x == year(endDate)]/startval[y == year(startDate)])^(1/(years-1)))-1)
}
cagrNew <- function(df,colum,x,y,years){
colum <- quo(colum)
x <- quo(x)
y <- quo(y)
out <- df %>%
group_by(!!company) %>%
summarise(xxxx = cagr(!!colum[!!x == year(endDate)],!!colum[!!y == year(startDate)],!!x,!!y,numYears))
return(out)
}
When I run the above function (cagrNEW), I get this error:
Error in `[.formula`(colum, !(!x == year(endDate))) :
attempt to set an attribute on NULL
My desired output:
Company RevenueCagr Cost of Goods Sold CAGR ....
ALSN .5% .3%
DAN .3% .2%
I haven't repeated the data above, to conserve space. Convert to tibble and assign.
# df <- as_tibble(...)
library(tidyverse)
library(scales) #< For percentage formatting
start_year <- 2012
end_year <- 2016
df %>%
filter(year %in% c(start_year, end_year)) %>%
group_by(company) %>%
arrange(desc(year), .by_group = TRUE) %>%
summarise_if(is.double, funs(CAGR = percent( (.[[1]]/.[[2]])^ (1/(end_year - start_year)) - 1) ) )
# CAGR = ((End Value / Beginning Value)^(1/number of years)-1)
#Checksum: ALSN Company, Revenue
# (End Value / Beginning Value)^((1/number of years))-1
percent(( (1840 / 2142) ^ (1/(2016-2012)) - 1))
#> [1] "-3.73%"

Resources