Related
I am running a logistic regression model using complex survey data using the survey package in R. After fitting the model, I performed regression diagnostics using the car package. I noticed outlying and influential observations that I would like to remove and then refit the model to check for their effects on the regression coefficients but my current approach is not giving me want I expect.
My dataset has about 10,000 observations. Here is sample data and code I have tried using:
library(car); library(survey)
dat <- structure(list(id = c(1009918, 1012826, 1029625, 1000926, 1027525,
1000115, 1000201, 1000202, 1000214, 1000219, 1000313, 1000324,
1000510, 1000521, 1000624, 1000708, 1000811, 1000817, 1000818,
1000906, 1000922, 1001002, 1001005, 1001401, 1001411, 1001413,
1001420, 1001424, 1001501, 1001510, 1001518, 1001526, 1001621,
1001807, 1001922, 1001926, 1002106, 1002217, 1002406, 1002416,
1002618, 1002709, 1003004, 1003017, 1003103, 1003108, 1003304,
1003319, 1003723, 1003804, 1003811, 1003819, 1004014, 1008902,
1008913, 1009011, 1009022, 1009123, 1009212, 1009215), strata = c(1,
2, 6, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), cluster = c(785,
938, 2337, 28, 2122, 3, 6, 6, 6, 6, 10, 10, 16, 16, 19, 22, 24,
24, 24, 28, 28, 33, 33, 45, 45, 45, 45, 45, 50, 50, 50, 50, 53,
60, 63, 63, 69, 74, 96, 96, 100, 102, 111, 111, 115, 115, 122,
122, 178, 193, 193, 193, 210, 755, 755, 759, 759, 762, 765, 765
), weights = c(621.921704979739, 5440.9107594311, 8450.49341643626,
2457.37241774248, 7174.79930450487, 930.492019594546, 443.253676607562,
443.253676607562, 886.507353215123, 443.253676607562, 1552.30979801343,
517.436599337811, 403.146111343943, 806.292222687886, 439.775494378883,
839.561001668328, 1210.77101540146, 403.590338467152, 403.590338467152,
457.23211170669, 914.464223413381, 584.557580338056, 584.557580338056,
233.135312658304, 233.135312658304, 233.135312658304, 466.270625316608,
233.135312658304, 287.94933168791, 287.94933168791, 287.94933168791,
287.94933168791, 2354.32022397843, 213.628591090648, 300.596873749779,
300.596873749779, 1121.27419052962, 528.482361549292, 1936.60489456861,
1291.06992971241, 282.360930726457, 3526.73915258957, 337.531162185852,
337.531162185852, 2183.63202546241, 2729.54003182802, 1035.32340123929,
1552.98510185893, 1400.62601417017, 717.92144006312, 358.96072003156,
1435.84288012624, 275.058410167952, 557.874242565598, 278.937121282799,
1687.48015279064, 1012.48809167438, 424.663883556537, 227.805527040477,
227.805527040477), age = c(20, 19, 93, 24, 18, 23, 22, 23, 24,
19, 18, 24, 20, 19, 18, 17, 19, 23, 19, 19, 21, 22, 21, 20, 23,
24, 24, 19, 21, 22, 20, 23, 21, 23, 20, 22, 23, 15, 20, 23, 24,
18, 24, 24, 15, 21, 24, 16, 22, 20, 20, 18, 21, 20, 21, 21, 24,
22, 24, 18), gender = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), levels = c("Male", "Female"), class = "factor"),
educ = structure(c(4L, 2L, 1L, 3L, 2L, 3L, 2L, 2L, 2L, 2L,
3L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 4L, 2L, 3L, 2L,
4L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L,
4L, 4L, 4L, 3L, 3L), levels = c("No formal education", "Primary",
"Secondary", "Tertiary"), class = "factor"), employ = structure(c(4L,
3L, 4L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 4L, 3L, 2L, 2L, 2L, 4L,
3L, 3L, 3L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 3L, 3L, 1L, 3L, 2L, 3L, 3L, 4L, 3L, 2L, 4L, 4L,
2L, 3L, 1L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 1L, 4L, 3L, 4L), levels = c("Unemployed",
"Employed", "Self-employed", "Other"), class = "factor"),
know = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), levels = c("No/Don't know", "Yes"), class = "factor"),
status = structure(c(2L, 4L, 2L, 1L, 5L, 3L, 2L, 2L, 2L,
4L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 5L, 4L, 2L, 5L, 5L, 4L, 3L,
2L, 3L, 2L, 2L, 2L, 2L, 4L, 3L, 3L, 4L, 3L, 2L, 2L, 3L, 1L,
2L, 1L, 1L, 2L, 2L, 2L), levels = c("1", "2", "3", "4", "5"
), class = "factor"), smoker = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("no",
"yes"), class = "factor")), row.names = c(81L, 4174L, 6722L,
1255L, 2712L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L,
26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L), class = "data.frame")
## Create survey design object
dat_svy <- survey::svydesign(ids = ~cluster, strat = ~strata, weights = ~weights, data = dat, nest = TRUE)
options(survey.lonely.psu = "adjust")
## fit logistic regression model
mod <- survey::svyglm(formula = smoker ~ age + educ + gender + employ + educ + know + status, design = dat_svy, family = "quasibinomial")
I have tried the following:
update(mod, subset = !(rownames(dat_svy) %in% c(2, 5, 9, 13, 21))) # returns an error
update(mod, subset = -c(2, 5, 9, 13, 21)) # only removes one (first specified) observation
First time I ever cared to sort because I need to report this in a specific way. Typically, smallest to largest sort in excel on column quarter returns YTD, 3,2,1. etc. However, I cannot sort using Order(-begin$Quarter) or rev(begin$Quarter). Other solutions with mix sort prevent me from then also sorting other columns, like here I want to sort by quarter then segment. I have dput examples.
Thanks,
begin<- structure(list(Quarter = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L), .Label = c("1", "2", "3", "YTD"), class = "factor"),
Segment = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Beverages", "Food"), class = "factor"), Person = structure(c(3L,
2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L,
2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L), .Label = c("Chris", "Jackie",
"Josh"), class = "factor"), Sales = c(4, 4, 3, 2, 3, 3, 7,
7, 1, 1, 2, 3, 7, 7, 8, 5, 7, 8, 9, 6, 6, 7, 5, 6)), class = "data.frame", row.names = c(NA,
-24L))
end<- structure(list(Quarter = structure(c(4L, 4L, 4L, 4L, 4L, 4L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("1", "2", "3", "YTD"), class = "factor"),
Segment = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L
), .Label = c("Beverages", "Food"), class = "factor"), Person = structure(c(1L,
1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L,
3L, 3L, 1L, 1L, 2L, 2L, 3L, 3L), .Label = c("Chris", "Jackie",
"Josh"), class = "factor"), Sales = c(3, 6, 2, 5, 1, 7, 1,
6, 7, 6, 7, 9, 3, 8, 3, 7, 2, 5, 3, 8, 4, 7, 4, 7)), class = "data.frame", row.names = c(NA,
-24L))
library(dplyr)
begin %>%
arrange(desc(Quarter), Person, Segment)
So I'm learning to use R/GGplot, it was simple enough to create a single bar chart but I'm struggling to understand how to properly manipulate the data to get the chart I want.
So I have a basic example data file that looks like this in RStudio:
Basically, I wanted to make a bar for each option, which counts the "Yes" options. The Y axis would then be equal to the total number of records, with the scale measured in %.
Here is where I think I went completely wrong:
data_Q1 <- data.frame(Q1 = c("Red", "Blue", "Green", "Yellow", "Pink"))
I believe here I might need to remove the "No" level, then rename the "Yes" level to the colour name before I can work with it, but I seem to be greatly misunderstanding how I do this.
I've tried using droplevels() and raw_data$Q1_1[grepl("Yes", raw_data$Q1, ignore.case=T)] <- "Red" but neither seem to achieve the goal I want.
Here is the code I'm using for the graph:
ggplot(
data_Q1,
aes(
x=Q1,
y=sum(
complete.cases(raw_data)
)
)
)
+geom_bar(
aes(
fill=Q1
),
colour="black",
stat="identity"
)
+labs(
title="Colours respondents liked",
subtitle="Q1. Select all the colours you like",
caption="source: example data"
)
+ylab("Total completes")
+scale_y_continuous(labels = scales::percent)
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
133, 134, 135, 136, 137, 138, 139), Q1_1 = structure(c(2L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
Q1_2 = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L), .Label = c("No", "Yes"
), class = "factor"), Q1_3 = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
), .Label = c("No", "Yes"), class = "factor"), Q1_4 = structure(c(1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
Q1_5 = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"
), class = "factor")), row.names = c(NA, -139L), variable.labels = c(id = "id",
Q1_1 = "[Red] Q1. Select all the colours you like", Q1_2 = "[Blue] Q1. Select all the colours you like",
Q1_3 = "[Green] Q1. Select all the colours you like", Q1_4 = "[Yellow] Q1. Select all the colours you like",
Q1_5 = "[Pink] Q1. Select all the colours you like"), codepage = 65001L, class = "data.frame")
Okay, I am assuming this is from some survey, so your data is (probably) a bit messy.
An approach with tidyverse and transposing (t()) would look something like this:
library(tidyverse)
df <- raw_data
df2 <- data.frame(t( df %>% summarise(Q1 = sum(Q1_1=="Yes") / length(complete.cases(df)) ,
Q2 = sum(Q1_2=="Yes") / length(complete.cases(df)),
Q3 = sum(Q1_3=="Yes") / length(complete.cases(df)),
Q4 = sum(Q1_4=="Yes") / length(complete.cases(df)),
Q5 = sum(Q1_5=="Yes") / length(complete.cases(df)),) ))
names(df2) <- ("sum_yes")
df2$q <- rownames(df2)
Output of df2:
> df2
sum_yes q
Q1 0.8417266 Q1
Q2 0.7338129 Q2
Q3 0.7122302 Q3
Q4 0.4820144 Q4
Q5 0.7122302 Q5
>
Then, we plot the results:
library(ggplot2)
ggplot(
df2,
aes(
x=q,
y = sum_yes,
fill = q
)
) +
geom_bar(
colour="black",
stat="identity"
) + labs(
title="Colours respondents liked",
subtitle="Q1. Select all the colours you like",
caption="source: example data"
) + ylab("Total completes")+scale_y_continuous(labels = scales::percent) +
scale_fill_manual(values= c("Red", "Blue", "Green", "Yellow", "Pink") )
with scale_fill_manual, you can just specify what the colors of your results should be.
If I understood the data correctly, you would need to replace the names (Q1 to Q5 ) with the respective colors.
I am trying to make a stacked barplot with two variables. My desired outcome looks like this:
This is the first part of my data. There are 220 more rows:
Type Week Stage
<chr> <dbl> <dbl>
1 Captured 1 2
2 Captured 1 1
3 Captured 1 1
4 Captured 1 2
5 Captured 1 1
6 Captured 1 3
7 Captured 1 NA
8 Captured 1 3
9 Captured 1 2
10 Captured 1 1
So far I'm not getting anywhere, this is my code so far
library(data.table)
dat.m <- melt(newrstudio2, id.vars="Type")
dat.m
library(ggplot2)
ggplot(dat.m, aes(x=Type, y=value, fill=variable)) +
geom_bar(stat="identity")
I guess I need to calculate the number of observations of each stage in each week of each type? I've tried both long and wide data, but I somehow need to combine week with type? I don't know, I'm at a loss.
Alternative way:
set.seed(123)
# sample data
my_data <- data.frame(Type = sample(c("W", "C"), 220, replace = TRUE),
Week = sample(paste0("Week ", 1:4), 220, replace = TRUE),
Stage = sample(paste0('S', 1:4), 220, replace = TRUE))
head(my_data)
library(ggplot2)
ggplot(my_data, aes(x = Type, fill = Stage)) +
geom_bar(aes(y = (..count..)/sum(..count..)), position = "fill") +
facet_grid(. ~ Week, switch="both") +
scale_y_continuous(labels = scales::percent) +
ylab("Stage [%]") +
theme(strip.background = element_blank(),
strip.placement = "outside",
panel.spacing = unit(0, "lines"))
Alternatively we could use base graphics. First, what you're probably most interested in, we should reshape the data.
For this we could split the data per week and run a dcast() over it.
L <- lapply(split(d, d$week), function(x)
data.table::dcast(x, type ~ stage, value.var="stage", fun=length))
d2 <- do.call(rbind, L) # transform back into a data frame
Now – with credits to #alemol – we want the proportions.
d2[-1] <- t(apply(d2[-1], 1, prop.table))
Then we are able to plot relatively simply. Note, that barplot() additionally gives us a vector of bar coordinates which we can use later for the axis() labels.
cols <- c("#ed1c24", "#ff7f27", "#00a2e8", "#fff200") # define stage colors
par(mar=c(5, 5, 3, 5) + .1, xpd=TRUE) # set plot margins
p <- barplot(t(d2[-1]), col=cols, border="white", space=rep(c(.2, 0), 5),
font.axis=2, xaxt="n", yaxt="n", xlab="Week")
axis(1, at=p, labels=rep(c("C", "W"), 5), tick=FALSE, line=0)
axis(1, at=apply(matrix(p, , 2, byrow=TRUE), 1, mean), labels=1:5, tick=FALSE, line=1)
axis(2, at=0:10/10, labels=paste0(seq(0, 100, 10), "%"), line=0, las=2)
legend(12, .5, legend=rev(names(d2[-1])), col=rev(cols), pch=15, title="Stage")
Result:
Data:
d <- structure(list(type = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L), .Label = c("C", "W"), class = "factor"), week = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), stage = c(3L,
1L, 1L, 2L, 2L, 2L, 1L, 3L, 2L, 4L, 1L, 1L, 2L, 2L, 3L, 4L, 3L,
2L, 4L, 1L, 1L, 3L, 1L, 2L, 3L, 1L, 4L, 1L, 2L, 4L, 2L, 3L, 4L,
4L, 2L, 4L, 4L, 2L, 3L, 1L, 1L, 4L, 4L, 1L, 4L, 3L, 3L, 3L, 2L,
1L, 3L, 4L, 2L, 4L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 1L, 3L, 2L,
1L, 1L, 1L, 4L, 2L, 4L, 1L, 4L, 3L, 4L, 4L, 4L, 2L, 2L, 2L, 2L,
2L, 1L, 3L, 4L, 2L, 4L, 4L, 2L, 2L, 3L, 4L, 4L, 3L, 3L, 1L, 1L,
1L, 2L, 4L, 3L, 1L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 4L, 2L, 1L,
2L, 1L, 3L, 3L, 2L, 4L, 3L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 2L, 2L,
2L, 1L, 3L, 4L, 3L, 4L, 3L, 4L, 4L, 3L, 1L, 1L, 2L, 1L, 2L, 3L,
2L, 2L, 1L, 4L, 3L, 4L, 2L, 2L, 3L, 1L, 2L, 3L, 3L, 3L, 3L, 2L,
1L, 2L, 2L, 1L, 1L, 3L, 4L, 3L, 4L, 2L, 4L, 1L, 1L, 2L, 1L, 3L,
2L, 1L, 3L, 3L, 2L, 2L, 1L, 3L, 2L, 2L, 2L, 1L, 4L, 2L, 4L, 2L,
4L, 3L, 3L, 1L, 3L, 4L, 3L, 2L, 1L, 2L, 4L, 1L, 2L, 4L, 2L, 1L,
2L, 1L, 2L, 2L, 3L, 1L, 3L, 3L, 3L, 2L, 2L, 1L, 2L, 3L, 2L, 2L,
1L, 2L, 1L, 3L, 3L, 2L, 1L, 3L, 4L, 2L, 1L, 2L, 4L, 3L, 4L, 2L,
3L, 2L, 4L, 1L, 4L, 4L, 2L, 1L, 2L)), row.names = c(NA, -250L
), class = "data.frame")
Is this what you're looking for:
set.seed(123)
# sample data
my_data <- data.frame(Type = sample(paste0('T', 1:4), 220, replace = TRUE),
Week = sample(paste0('W', 1:4), 220, replace = TRUE),
Stage = sample(paste0('S', 1:4), 220, replace = TRUE))
ggplot(my_data, aes(x=Week:Type, fill = Stage)) + geom_bar()
I'm trying to remove the redundant "pro/retro" labels on the second row of panels on my plot. However, I still want to keep the top row of panel labels intact. I've tried for the past hour to selectively remove the 1st strip on the 2nd panel row and I was wondering if anyone here knows how to do this. See below for technical details.
I have the following plot:
It was generated from the following data:
absBtwnDat <- structure(list(setSize = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L,
6L, 7L), .Label = c("2", "3", "4", "5", "6", "7", "8"), class = "factor"),
Measure = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Actual", "Predicted"), class = "factor"),
Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("fix", "forced"), class = "factor"),
JudgementType = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("pro", "retro"), class = "factor"),
Accuracy = c(1.91388888888889, 2.95555555555556, 3.74861111111111,
4.37777777777778, 4.21527777777778, 3.0875, 2.85277777777778,
2, 2.99444444444444, 4, 4.77222222222222, 5.24444444444444,
5.18472222222222, 5.20277777777778, 1.98888888888889, 3,
3.97222222222222, 4.85972222222222, 5.70555555555556, 6.56944444444444,
7.27222222222222, 2, 3, 3.99444444444444, 4.99444444444444,
5.86944444444444, 6.75555555555556, 7.57777777777778, 1.96111111111111,
2.97777777777778, 3.78333333333333, 3.97222222222222, 4.22361111111111,
3.64722222222222, 3.68888888888889, 2, 3, 3.97222222222222,
4.67777777777778, 5.26944444444444, 5.4625, 5.8, 2, 3, 3.98333333333333,
4.87777777777778, 5.73055555555556, 6.48333333333333, 7.62916666666667,
2, 3, 3.98333333333333, 4.96666666666667, 5.96944444444444,
6.94444444444444, 7.93333333333333), LL = c(1.85, 2.87777777777778,
3.59861111111111, 4.15555555555556, 3.78888888888889, 2.73055555555556,
2.55555555555556, 2, 2.96111111111111, 4, 4.64444444444444,
5.01666666666667, 4.88333333333333, 4.88611111111111, 1.91111111111111,
3, 3.89444444444444, 4.73611111111111, 5.47777777777778,
6.20277777777778, 6.71666666666667, 2, 3, 3.96666666666667,
4.95555555555556, 5.65096686319131, 6.48333333333333, 7.17222222222222,
1.86637442123568, 2.92222222222222, 3.65, 3.61666666666667,
3.88333333333333, 3.17092476055122, 3.18888888888889, 2,
3, 3.92222222222222, 4.49444444444444, 5.0375, 5.09444444444444,
5.40555555555556, 2, 3, 3.92777777777778, 4.72222222222222,
5.52777777777778, 6.24444444444444, 7.37361111111111, 2,
3, 3.95, 4.88888888888889, 5.93333333333333, 6.88333333333333,
7.73065763697428), UL = c(1.95555555555556, 2.98333333333333,
3.84444444444444, 4.56666666666667, 4.6, 3.43611111111111,
3.17916666666667, 2, 3, 4, 4.86111111111111, 5.42777777777778,
5.48656054159421, 5.58611111111111, 2, 3, 4, 4.93888888888889,
5.83888888888889, 6.76944444444444, 7.6, 2, 3, 4, 5, 5.94166666666667,
6.88888888888889, 7.78888888888889, 1.98888888888889, 2.99444444444444,
3.87777777777778, 4.22777777777778, 4.53611111111111, 4.19722222222222,
4.20555555555556, 2, 3, 3.98888888888889, 4.78333333333333,
5.45555555555556, 5.79583333333333, 6.16666666666667, 2,
3, 3.99444444444444, 4.95, 5.85972222222222, 6.67222222222222,
7.80138888888889, 2, 3, 3.99444444444444, 4.98888888888889,
5.9875, 6.97222222222222, 7.98333333333333)), .Names = c("setSize",
"Measure", "Location", "JudgementType", "Accuracy", "LL", "UL"
), row.names = c(NA, -56L), class = "data.frame")
I visualized it using using the following code:
library(ggplot2)
p1 <- ggplot(data = absBtwnDat, aes(x = as.numeric(as.character(setSize)),
y = Accuracy, group = Measure,
colour = Measure))+
geom_point()+
geom_line(aes(linetype = Measure))+
scale_x_continuous("Trial Set Size", breaks = 2:8)+
scale_y_continuous("Accuracy (# Correct)", breaks = 0:8, limits = c(0, 8))+
geom_errorbar(aes(ymin = LL, ymax = UL), width = .1, size = .75)+
scale_colour_grey(start = .8, end = .4)+
facet_wrap(~JudgementType+Location, dir = "v")+
theme(legend.position = "top")
Just to be certain, I've highlighted unwanted strip in the following image:
With this you'll only have one row of labels per panel, but they still include both words.
p1 <- ggplot(data = absBtwnDat,
aes(x = as.numeric(as.character(setSize)), y = Accuracy,
group = Measure,
colour = Measure))+
geom_point()+
geom_line(aes(linetype = Measure))+
scale_x_continuous("Trial Set Size", breaks = 2:8)+
scale_y_continuous("Accuracy (# Correct)",
breaks = 0:8, limits = c(0, 8))+
geom_errorbar(aes(ymin = LL, ymax = UL),
width = .1, size = .75)+
scale_colour_grey(start = .8, end = .4)+
facet_wrap(~JudgementType + Location,
dir = "v",
labeller = label_wrap_gen(multi_line=FALSE)) +
theme(legend.position = "top")
p1
Here is a possible solution:
g1 <- ggplotGrob(p1)
k <- which(g1$layout$name=="strip-t-1-2")
g1$grobs[[k]]$grobs[[1]]$children[[2]]$children[[1]]$label <- ""
g1$grobs[[k]]$grobs[[1]]$children[[1]]$gp$fill <- NA
k <- which(g1$layout$name=="strip-t-2-2")
g1$grobs[[k]]$grobs[[1]]$children[[2]]$children[[1]]$label <- ""
g1$grobs[[k]]$grobs[[1]]$children[[1]]$gp$fill <- NA
library(grid)
grid.draw(g1)