I'm trying to create my first map using ggrepel, but as you can see I've instead created a dumpster fire of overlapping labels. Most of the locations I'm mapping and labelling are clustered in the northeast, so the labels overlap. How do I get some of the labels to slide over beyond the map boundaries (in the ocean, so to speak)? Here's the code I used to create this monster:
plot_usmap(fill = "light blue", alpha = 0.5) +
ggrepel::geom_label_repel(data = top_18_2_transformed, aes(x=x, y=y, label=INSTNM),
size=3,
label.padding = unit(.75,"mm"),
nudge_y = 20,
nudge_x = 20,
box.padding=0.3,
max.overlaps=30,
point.padding=NA,
family="Avenir Next",
fill="gray99",
alpha=1.0,
label.r=unit(0.2,"lines"),
min.segment.length = 0.1,
label.size=unit(.15,"mm"),
segment.color="black",
segment.size=1,seed=1000) +
geom_point(data = top_18_2_transformed, aes(x = x, y = y, size = UGDS),
color = "red",
alpha = 0.75) +
labs(title = "Select Colleges",
size = "Undergrad Enrollment") +
theme(legend.position = "right")
And here's a picture of my problematic map:
Thanks in advance for any corrections you may be able to offer.
UPDATE 31 March 2022: here's the dput(top_18_2_transformed):
structure(list(lon = c(-74.659365, -122.167359, -78.937624, -75.19391,
-71.093226, -77.073463, -118.125878, -117.709837, -71.222839,
-79.941993, -72.926688, -76.483084, -73.961885, -71.169242, -74.025334,
-75.380236, -70.624084, -71.118313), lat = c(40.348732, 37.429434,
36.001135, 39.950929, 42.359243, 38.908809, 34.137349, 34.106515,
42.385995, 40.44357, 41.311158, 42.4472, 40.808286, 42.336213,
40.744776, 40.606822, 41.739072, 42.374471), UNITID = c(186131,
243744, 198419, 215062, 166683, 131496, 110404, 115409, 164739,
211440, 130794, 190415, 190150, 164924, 186867, 213543, 166692,
166027), OPEID = c(262700, 130500, 292000, 337800, 217800, 144500,
113100, 117100, 212400, 324200, 142600, 271100, 270700, 212800,
263900, 328900, 218100, 215500), OPEID6 = c(2627, 1305, 2920,
3378, 2178, 1445, 1131, 1171, 2124, 3242, 1426, 2711, 2707, 2128,
2639, 3289, 2181, 2155), INSTNM = c("Princeton University", "Stanford University",
"Duke University", "University of Pennsylvania", "Massachusetts Institute of Technology",
"Georgetown University", "California Institute of Technology",
"Harvey Mudd College", "Bentley University", "Carnegie Mellon University",
"Yale University", "Cornell University", "Columbia University in the City of New York",
"Boston College", "Stevens Institute of Technology", "Lehigh University",
"Massachusetts Maritime Academy", "Harvard University"), CITY = c("Princeton",
"Stanford", "Durham", "Philadelphia", "Cambridge", "Washington",
"Pasadena", "Claremont", "Waltham", "Pittsburgh", "New Haven",
"Ithaca", "New York", "Chestnut Hill", "Hoboken", "Bethlehem",
"Buzzards Bay", "Cambridge"), STABBR = c("NJ", "CA", "NC", "PA",
"MA", "DC", "CA", "CA", "MA", "PA", "CT", "NY", "NY", "MA", "NJ",
"PA", "MA", "MA"), ZIP = c("08544-0070", "94305", "27708", "19104-6303",
"02139-4307", "20057-0001", "91125", "91711", "02452-4705", "15213-3890",
"6520", "14853", "10027", "2467", "07030-5991", "18015", "02532-1803",
"2138"), ACCREDAGENCY = c("Middle States Commission on Higher Education",
"Western Association of Schools and Colleges Senior Colleges and University Commission",
"Southern Association of Colleges and Schools Commission on Colleges",
"Middle States Commission on Higher Education", "New England Commission on Higher Education",
"Middle States Commission on Higher Education", "Western Association of Schools and Colleges Senior Colleges and University Commission",
"Western Association of Schools and Colleges Senior Colleges and University Commission",
"New England Commission on Higher Education", "Middle States Commission on Higher Education",
"New England Commission on Higher Education", "Middle States Commission on Higher Education",
"Middle States Commission on Higher Education", "New England Commission on Higher Education",
"Middle States Commission on Higher Education", "Middle States Commission on Higher Education",
"New England Commission on Higher Education", "New England Commission on Higher Education"
), INSTURL = c("www.princeton.edu/", "www.stanford.edu/", "www.duke.edu/",
"www.upenn.edu/", "web.mit.edu/", "www.georgetown.edu/", "www.caltech.edu/",
"https://www.hmc.edu/", "www.bentley.edu/", "www.cmu.edu/", "https://www.yale.edu/",
"www.cornell.edu/", "www.columbia.edu/", "www.bc.edu/", "www.stevens.edu/",
"www.lehigh.edu/", "https://www.maritime.edu/", "www.harvard.edu/"
), SCH_DEG = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3), PREDDEG = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3), HIGHDEG = c(4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4), REGION = c(2, 8, 5, 2, 1, 2, 8, 8, 1, 2, 1,
2, 2, 1, 2, 2, 1, 1), CCBASIC = c(15, 15, 15, 15, 15, 15, 15,
21, 18, 15, 15, 15, 15, 15, 16, 16, 22, 15), ADM_RATE = c(0.0578,
0.0434, 0.076, 0.0766, 0.067, 0.1436, 0.0642, 0.1367, 0.4672,
0.1544, 0.0608, 0.1085, 0.0545, 0.2722, 0.3996, 0.321, 0.9146,
0.0464), ACTCM25 = c(33, 32, 33, 33, 34, 31, 35, 33, 27, 33,
33, 32, 33, 31, 31, 29, 19, 33), ACTCM75 = c(35, 35, 35, 35,
36, 35, 36, 35, 31, 35, 35, 35, 35, 34, 34, 33, 24, 35), SAT_AVG = c(1517,
1503, 1522, 1511, 1547, 1473, 1557, 1526, 1327, 1513, 1517, 1487,
1511, 1437, 1429, 1380, 1100, 1517), DISTANCEONLY = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), UGDS = c(5308,
6994, 6546, 10774, 4516, 7141, 938, 893, 4157, 6535, 6089, 14976,
8221, 9637, 3641, 5164, 1654, 7547), CURROPER = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), COSTT4_A = c(70900,
71587, 75105, 75303, 70240, 73840, 72084, 76953, 68577, 72265,
73900, 73879, 76907, 73053, 68734, 68383, 27858, 73485), COSTT4_P = c("NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
"NULL"), TUITIONFEE_IN = c(52800, 53529, 58031, 57770, 53790,
56058, 54600, 58660, 51830, 57119, 55500, 57222, 61788, 57910,
54014, 55240, 10018, 51925), TUITIONFEE_OUT = c(52800, 53529,
58031, 57770, 53790, 56058, 54600, 58660, 51830, 57119, 55500,
57222, 61788, 57910, 54014, 55240, 25752, 51925), AVGFACSAL = c(20724,
20865, 16863, 18277, 19624, 15798, 20595, 14397, 14592, 12296,
19830, 15574, 19431, 15599, 15318, 13763, 8928, 20988), PFTFAC = c("0.835",
"0.9881", "0.9364", "0.7779", "0.9885", "0.4815", "0.9289", "0.8992",
"0.6696", "0.9161", "0.717", "0.9074", "0.4521", "0.6662", "1",
"0.8392", "0.5867", "0.862"), C150_4 = c(0.979, 0.9432, 0.9462,
0.96, 0.954, 0.9491, 0.9357, 0.9167, 0.8952, 0.9049, 0.972, 0.9453,
0.9549, 0.9404, 0.8473, 0.8981, 0.7629, 0.971), RET_FT4 = c(0.9768,
0.9876, 0.9827, 0.9808, 0.9946, 0.9679, 0.9826, 0.9744, 0.9201,
0.9732, 0.9892, 0.9748, 0.9853, 0.9467, 0.9394, 0.9349, 0.8672,
0.9722), RET_PT4 = c("NULL", "NULL", "NULL", "0.9245", "NULL",
"0.6667", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "0.95",
"NULL", "NULL", "NULL", "NULL", "NULL"), MD_EARN_WNE_P10 = c("95689",
"97798", "93115", "103246", "111222", "96375", "112166", "108988",
"107974", "99998", "88655", "91176", "89871", "93021", "98159",
"95033", "91668", "84918"), PCT25_EARN_WNE_P10 = c("52729", "61965",
"61558", "65218", "67120", "61372", "67501", "69466", "73117",
"62003", "60311", "59566", "56005", "62006", "72669", "65644",
"68187", "56301"), PCT75_EARN_WNE_P10 = c("167686", "172245",
"151838", "174907", "169465", "147685", "175675", "173725", "146079",
"159483", "146102", "147189", "141158", "147010", "127298", "134075",
"129421", "153746"), MD_EARN_WNE_P6 = c("84713", "88873", "77260",
"80445", "112623", "71107", "129420", "112059", "78514", "87824",
"72046", "78779", "79434", "70858", "82237", "79832", "79354",
"77816"), GRAD_DEBT_MDN_SUPP = c("10450", "12000", "13500", "16763",
"13418", "16500", "PrivacySuppressed", "22089", "25000", "22014",
"13142", "14500", "21500", "18000", "27000", "23000", "26000",
"12665"), GRAD_DEBT_MDN10YR_SUPP = c("104.4654099", "119.9602793",
"134.9553142", "167.5745134", "134.1355856", "164.945384", "PrivacySuppressed",
"220.8168841", "249.9172485", "220.0671323", "131.3764992", "144.9520041",
"214.9288337", "179.9404189", "269.9106283", "229.9238686", "259.9139384",
"126.6080781"), C100_4 = c(0.898, 0.7288, 0.8831, 0.8571, 0.8691,
0.9076, 0.8434, 0.8565, 0.8479, 0.7599, 0.8777, 0.8694, 0.8635,
0.9003, 0.4566, 0.8003, 0.6322, 0.8476), ICLEVEL = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), OPENADMP = c(2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), GRADS = c("2997",
"10253", "10037", "14803", "6990", "12080", "1299", "NULL", "1086",
"7562", "7517", "8984", "23235", "4846", "3624", "1775", "97",
"21592"), ACCREDCODE = c("MSACHE", "WASCSR", "SACSCC", "MSACHE",
"NECHE", "MSACHE", "WASCSR", "WASCSR", "NECHE", "MSACHE", "NECHE",
"MSACHE", "MSACHE", "NECHE", "MSACHE", "MSACHE", "NECHE", "NECHE"
), RET_FT4_POOLED = c(0.9788, 0.9879, 0.9793, 0.9821, 0.9909,
0.9651, 0.9806, 0.9716, 0.9262, 0.97, 0.9892, 0.9741, 0.9825,
0.9479, 0.9423, 0.9378, 0.8633, 0.9817), C100_4_POOLED = c(0.8856,
0.739, 0.8788, 0.8546, 0.8602, 0.9009, 0.8242, 0.8551, 0.8326,
0.7546, 0.8772, 0.8766, 0.8677, 0.8918, 0.4515, 0.7621, 0.5955,
0.8573), BOOKSUPPLY = c("1050", "1245", "1434", "1358", "820",
"1200", "1428", "800", "1300", "1000", "1050", "970", "1294",
"1250", "1200", "1000", "1500", "1000"), ADMCON7 = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1), MDCOMP_ALL = c(0.5845,
0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845,
0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845, 0.5845,
0.5845), MDCOST_ALL = c(15387.5, 15387.5, 15387.5, 15387.5, 15387.5,
15387.5, 15387.5, 15387.5, 15387.5, 15387.5, 15387.5, 15387.5,
15387.5, 15387.5, 15387.5, 15387.5, 15387.5, 15387.5), MDEARN_ALL = c(37078,
37078, 37078, 37078, 37078, 37078, 37078, 37078, 37078, 37078,
37078, 37078, 37078, 37078, 37078, 37078, 37078, 37078), PPTUG_EF = c(0,
0, 0.0031, 0.0537, 0.0064, 0.0214, 0, 0.0011, 0.0118, 0.017,
2e-04, 3e-04, 0.0633, 0.0127, 0, 0.0128, 0.023, 0.0745), INEXPFTE = c(60048,
113338, 68756, 56874, 80756, 31693, 105185, 34419, 15842, 28167,
57231, 29893, 96463, 23266, 12504, 24995, 9687, 46272), C150_4_POOLED = c(0.9712,
0.9435, 0.9512, 0.9574, 0.9477, 0.9452, 0.9278, 0.9179, 0.8917,
0.8968, 0.969, 0.9452, 0.9566, 0.9297, 0.8608, 0.886, 0.7484,
0.974), GRAD_DEBT_MDN = c("10450", "12000", "13500", "16763",
"13418", "16500", "17747", "22089", "25000", "22014", "13142",
"14500", "21500", "18000", "27000", "23000", "26000", "12665"
), x = c(2107384.76948701, -1933340.27810509, 1876178.25472949,
2077243.02501463, 2314261.77712267, 1955381.08673633, -1660141.85673732,
-1623368.30493136, 2303424.70345276, 1678023.03854027, 2211596.23078863,
1896995.53745184, 2147624.50302849, 2309370.68277906, 2144734.86774305,
2041573.64168227, 2373567.48443726, 2311783.20749272), y = c(-188894.792987744,
-582296.149881856, -762721.806918975, -245389.810253038, 123275.753360416,
-404107.357328073, -1027748.36033576, -1039201.65863312, 122405.777575308,
-300870.762534603, -39714.5927185968, -7748.73302456512, -121333.925485063,
118650.586978148, -129820.607837031, -179439.260821836, 71069.0976923304,
124173.1993115)), class = "data.frame", row.names = c(NA, -18L
))
With a little data manipulation, you could move the labels out to either side of the country an draw segments to connect the labels to the universities:
top_18_2_transformed <- top_18_2_transformed[order(-top_18_2_transformed$y),]
colleges_east <- top_18_2_transformed[top_18_2_transformed$x > 0,]
colleges_west <- top_18_2_transformed[top_18_2_transformed$x < 0,]
colleges_west$lab_x <- -2300000
colleges_west$lab_y <- seq(-1000000, -1500000, -250000)
colleges_east$lab_x <- 2800000
colleges_east$lab_y <- seq(1000000, -2500000, -250000)
plot_usmap(fill = "light blue", alpha = 0.5) +
geom_text(data = colleges_west,
aes(x = lab_x, y = lab_y, label =stringr::str_wrap(INSTNM, 25)),
hjust = 1, size = 3, lineheight = 0.8) +
geom_text(data = colleges_east,
aes(x = lab_x, y = lab_y, label = stringr::str_wrap(INSTNM, 25)),
hjust = 0, size = 3, lineheight = 0.8) +
geom_point(data = top_18_2_transformed, aes(x = x, y = y, size = UGDS),
color = "red",
alpha = 0.75) +
geom_segment(data = colleges_east,
aes(x, y, xend = lab_x - 100000, yend = lab_y)) +
geom_segment(data = colleges_west,
aes(x, y, xend = lab_x + 100000, yend = lab_y)) +
labs(title = "Select Colleges",
size = "Undergrad Enrollment") +
theme(legend.position = c(0.35, 0),
legend.direction = 'horizontal') +
coord_cartesian(xlim = c( -3500000, 4000000),
ylim = c(-3000000, 1500000))
It looks like ggrepel::geom_*_repel() won't take xlim within aes() nor can it accept a list of vectors to split the constraints of west and east coast labels. However, you can just split them into two separate layers and then it's easier to control. Below I made a function to supply that position to avoid duplicating the code for those layers. Then you have to customize the exact values used in xlim and expand_limits() to get things to look nice depending on your graphics device etc.
Also IMHO this visualization is very hard to easily get much information out of. The points in the northeast are mostly overlapping and there are so many labels that even when spaced very nicely it is a bit tricky to follow them all. Instead it may be better to have a zoomed in plot for that region and avoid showing lots of space with no data in your plot or other ways to increase the legibility of the plot.
library(tidyverse)
library(ggrepel)
library(usmap)
# create function to generate labels and constrain outside map away from the center of the map
college_layers <- function(d) {
xlimz <- if (all(d$x > 0)) {c(2.5e6, NA)} else {c(NA, -2e6)}
geom_text_repel(
data = d,
aes(x, y, label = INSTNM),
xlim = xlimz,
ylim = c(-Inf, Inf),
size = 3,
force = 20,
box.padding = 0.3,
max.overlaps = 30,
point.padding = NA,
alpha = 1.0,
min.segment.length = 0.1,
segment.color = "black",
segment.size = 1,
seed = 1000
)
}
# plot with separate layer for west coast and east coast
plot_usmap(fill = "light blue", alpha = 0.5) +
geom_point(
data = d,
aes(x = x, y = y, size = UGDS),
color = "red",
alpha = 0.75
) +
college_layers(d = filter(d, x > 0)) +
college_layers(d = filter(d, x < 0)) +
expand_limits(x = c(-3.9e6, 4.6e6),
y = c(-3e6, 2e6)) +
labs(title = "Select Colleges",
size = "Undergrad Enrollment") +
theme(legend.position = c(0.35, 0),
legend.direction = 'horizontal',
plot.title = element_text(hjust = 0.5))
Created on 2022-04-01 by the reprex package (v2.0.1)
Data:
d <- structure(list(INSTNM = c("Princeton University", "Stanford University",
"Duke University", "University of Pennsylvania", "Massachusetts Institute of Technology",
"Georgetown University", "California Institute of Technology",
"Harvey Mudd College", "Bentley University", "Carnegie Mellon University",
"Yale University", "Cornell University", "Columbia University in the City of New York",
"Boston College", "Stevens Institute of Technology", "Lehigh University",
"Massachusetts Maritime Academy", "Harvard University"), x = c(2107384.76948701,
-1933340.27810509, 1876178.25472949, 2077243.02501463, 2314261.77712267,
1955381.08673633, -1660141.85673732, -1623368.30493136, 2303424.70345276,
1678023.03854027, 2211596.23078863, 1896995.53745184, 2147624.50302849,
2309370.68277906, 2144734.86774305, 2041573.64168227, 2373567.48443726,
2311783.20749272), y = c(-188894.792987744, -582296.149881856,
-762721.806918975, -245389.810253038, 123275.753360416, -404107.357328073,
-1027748.36033576, -1039201.65863312, 122405.777575308, -300870.762534603,
-39714.5927185968, -7748.73302456512, -121333.925485063, 118650.586978148,
-129820.607837031, -179439.260821836, 71069.0976923304, 124173.1993115
), UGDS = c(5308, 6994, 6546, 10774, 4516, 7141, 938, 893, 4157,
6535, 6089, 14976, 8221, 9637, 3641, 5164, 1654, 7547)), class = "data.frame", row.names = c(NA,
-18L))
I'm trying to plot a series of demographics factors. Each plot show the frequency distributions of demographic variables by gender. It runs nicely, but some of the labels are ordered in alphabetic order and not in meaningful order eg. Education, Marital Status and SIC2007.
Data structure
structure(list(DMSex = c("Male", "Female", "Male", "Male"), Income = c(980,
-8, 3000, 120), IncCat = c("-1", "-8", "-1", "-1"), HrWkAc = c(-1,
-1, -1, -1), ShiftWk = c(-1, -1, -1, -1), ShiftPat = c(-1, -1,
-1, -1), SOC2010C = c("-1", "9.2.3.3", "-1", "-1"), XSOC2010 = c(-1,
9233, -1, -1), IndexNo = c(-1, 1398, -1, -1), ES2010 = c(-1,
7, -1, -1), nssec = c(-1, 13.4, -1, -1), SECFlag = c(-1, 0, -1,
-1), LSOC2000 = c("-1", "9.2.3.3", "-1", "-1"), XSOC2000 = c(-1,
9233, -1, -1), seg = c(-1, 11, -1, -1), sc = c(-1, 5, -1, -1),
SIC2007 = c(-1, 87, -1, -1), Educ = c(1, 1, -1, 2), EducCur = c(10,
1, -1, -1), FinFTEd = c(-1, -1, -1, 1), FinFTEdY = c(-1,
-1, -1, 21), HiQual = c(22, 10, -1, 1), sic20070 = c(-1,
87, -1, -1), dhhtype = c(6, 8, 7, 3), dagegrp = c(2, 3, 3,
3), dmarsta = c("Single, never married", "Single, never married",
"Interview not achieved", "Married/cohabitating"), dhiqual = c(" Secondary",
" A level or equivalent", "Item not applicable", "Degree or higher"
), dnssec8 = c(-1, 8, -1, -1), duresmc = c(14, 15, 11, 16
), dgorpaf = c(7, 8, 5, 10), dukcntr = c(1, 1, 1, 1), dnrkid04 = c(0,
0, 0, 0), dilodefr = c(3, 3, -1, 3), deconact = c(8, 8, -1,
11), dtenure = c(2, 3, 2, 3), dtotac = c(-1, -1, -1, -1),
dtotus = c(-1, -1, -1, -1), dsic = c("Item not applicable",
"Public admin, education and health", "Item not applicable",
"Item not applicable"), dsoc = c(-1, 9, -1, -1), DVAge_category = c("15 to 30",
"15 to 30", "15 to 30", "15 to 30"), Income_category = c("Less than 1000",
"Less than 1000", "1001 to 3000", "Less than 1000"), HoursWorked_category = c("Less than 20 hours",
"Less than 20 hours", "Less than 20 hours", "Less than 20 hours"
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
#Age variable
demographics$dagegrp_category<-ifelse(demographics$dagegrp_01 > 2 & demographics$dagegrp < 6, age<-"15 to 30",
ifelse(demographics$dagegrp> 6 & demographics$dagegrp < 9, age<-"31 to 45",
ifelse(demographics$dagegrp > 9 & demographics$dagegrp < 12 , age<-"46 to 60",
ifelse(demographics$dagegrp > 12 & demographics$dagegrp < 15 , age<-"61 to 75",
ifelse(demographics$dagegrp > 15 & demographics$dagegrp < 18 , age<-"76+",
age<- "zombie")))))
demographics$DVAge_category<-c("15 to 30","31 to 45", "46 to 60","61 to 75", "76+", "zombie")[findInterval(demographics$dagegrp , c(-Inf, 6, 10, 12, 15,18, Inf))]
Age<-as.vector(demographics$DVAge_category)
#Gender variable
demographics$DMSex[demographics$DMSex==1]<-"Male"
demographics$DMSex[demographics$DMSex==2]<-"Female"
Gender<-as.vector(demographics$DMSex)
#Income variable
demographics$Income_category<-ifelse(demographics$Income < 1001, income<-"Less than 1000",
ifelse(demographics$Income > 999 & demographics$Income < 3001, income<-"1001 to 3000",
ifelse(demographics$Income > 3001 & demographics$Income < 6001, income <-"3001 to 6000",
ifelse(demographics$Income > 6001 & demographics$Income < 10001 , income<-"6001 to 10000",
income<- "zombie"))))
demographics$Income_category<-c("Less than 1000","1001 to 3000", "3001 to 6000", "6001 to 10000","zombie")[findInterval(demographics$Income , c(-Inf, 1001, 3001, 6001,10001, Inf) ) ]
Income<-as.vector(demographics$Income_category)
#Marital status variable
demographics$dmarsta[demographics$dmarsta==-1]<-"Interview not achieved"
demographics$dmarsta[demographics$dmarsta==1]<-"Single, never married"
demographics$dmarsta[demographics$dmarsta==2]<-"Married/cohabitating"
demographics$dmarsta[demographics$dmarsta==3]<-"Divorced/widowed"
MaritalStatus<-as.vector(demographics$dmarsta)
#Education
demographics$dhiqual[demographics$dhiqual==-8]<-"Don't know"
demographics$dhiqual[demographics$dhiqual==-1]<-"Item not applicable"
demographics$dhiqual[demographics$dhiqual==1]<-"Degree or higher"
demographics$dhiqual[demographics$dhiqual==2]<-"Higher education"
demographics$dhiqual[demographics$dhiqual==3]<-" A level or equivalent"
demographics$dhiqual[demographics$dhiqual==4]<-" Secondary"
demographics$dhiqual[demographics$dhiqual==5]<-" Other"
Education<-as.vector(demographics$dhiqual)
#Hours worked per week in main job variable
demographics$HoursWorked_category<-ifelse(demographics$dtotac < 21, workhours<-"Less than 20 hours",
ifelse(demographics$dtotac > 20 & demographics$dtotac< 41, workhours <-"Between 21 to 40 hours",
ifelse(demographics$dtotac > 40 & demographics$dtotac < 61, workhours <-"Between 41 to 60 hours",
ifelse(demographics$dtotac > 62, workhours<-"More than 61 hours",
workhours<- "Not Applicable"))))
demographics$HoursWorked_category<-c("Less than 20 hours", "Between 21 to 40 hours", "Between 41 to 60 hours","More than 61 hours","Not Applicable")[findInterval(demographics$dtotac, c(-Inf, 21, 41, 61, 62, Inf) ) ]
WorkHours<-as.vector(demographics$HoursWorked_category)
#DV: SIC 2007 industry divisions (grouped)
demographics$dsic[demographics$dsic==-8]<-"Don't know"
demographics$dsic[demographics$dsic==-1]<-"Item not applicable"
demographics$dsic[demographics$dsic==1]<-"Agriculture, forestry and fishing"
demographics$dsic[demographics$dsic==2]<-"Manufacturing"
demographics$dsic[demographics$dsic==3]<-"Energy and water supply"
demographics$dsic[demographics$dsic==4]<-"Construction"
demographics$dsic[demographics$dsic==5]<-"Distribution, hotels and restaurants"
demographics$dsic[demographics$dsic==6]<-"Transport and communication"
demographics$dsic[demographics$dsic==7]<-"Banking and finances"
demographics$dsic[demographics$dsic==8]<-"Public admin, education and health"
demographics$dsic[demographics$dsic==9]<-"Other services"
demographics$industry_category<-c("Don't know", "Item not applicable", "Agriculture, forestry and fishing","Manufacturing","Energy and water supply",
"Construction", "Distribution, hotels and restaurants", "Transport and communication", "Banking and finances",
"Public admin, education and health", "Other service")
SIC2007<-as.vector(demographics$dsic)
# creating df
df<-data.frame(Gender, Age, Education, MaritalStatus, Income, WorkHours, SIC2007)
df %>%
#tidy, not gender
gather(variable, value, -c(Gender))%>%
#group by value, variable, then gender
group_by(value, variable, Gender) %>%
#summarise to obtain table cell frequencies
summarise(freq=n()) %>%
#Plot
ggplot(aes(x=value, y=freq, group=Gender))+geom_bar(aes(fill=Gender), stat='identity', position='dodge')+ facet_wrap(~variable, scales='free_x') + theme(legend.position="right", axis.text.x = element_text(angle = 60, hjust = 1)) + labs(x="Characteristics", y="Frequencies")
In ggplot2, the data is ordered according to the factor levels of the data.frame column.
To (re)set the order in your plot, just set the order of the factor by:
df$variable <- factor(df$variable, levels = c(...))
You could do this by first storing the data.frame, before piping to the ggplot function, then manually setting the levels of the variables you want to change. It is maybe a bit inefficient, but this should do the trick:
## Make your plotting data.frame
df2 <- df %>%
gather(variable, value, -c(Gender))%>%
group_by(value, variable, Gender) %>%
summarise(freq=n())
## Apply custom order to MaritalStatus variable:
custom <- c(sort(unique(MaritalStatus))[c(4,3,1,2)],
....)
df2$variable <- factor(df2$variable, levels = c(levels(df2$variable)[!levels(df2$variable) %in% custom],
custom))