R scatterplot3d: a custom axis step and ticks - r

Greeting to all.
I am striving with a scatterplot3d plot -- a graphical representation of a data.frame of three variables where one of them is a response variable, where I have a wrong representation of the axis steps. Here is the code ("temp" is a data.frame):
library(scatterplot3d)
temp[,1] <- as.numeric(levels(temp[,1]))[temp[,1]]
for (m in temp[,2]) m <- as.factor(as.numeric(m))
for (m in temp[,3]) m <- as.factor(as.numeric(m))
colnames(temp) = c("Values", "Factors", "AntiFactors") # "Values" is that responce variable
xtickmarks<-c("AntiFactor1","AntiFactor1", "AntiFactor3")
ytickmarks<-c("Factor1","Factor2")
plot3d <- scatterplot3d(temp[,3], temp[,2], temp[,1], color = "blue",
pch = 19, type = "h", box = T, xaxt = "n",
x.ticklabs=xtickmarks, y.ticklabs=ytickmarks,
zlab = "Time, min.")
dput(temp)
structure(list(Values = c(395, 310, 235, 290, 240, 490, 270,
225, 430, 385, 170, 55, 295, 320, 270, 130, 300, 285, 130, 200,
225, 90, 205, 340, 3, 8, 1, 0, 0, 0, 3, 2, 5, 2, 3, 5, 2, 3,
200, 5, 5, 10, 5, 5, 5, 10, 10, 130, 5, 200, 80, 10, 360, 10,
5, 8, 30, 8, 10, 10, 10, 5, 240, 120, 3, 10, 25, 5, 5, 10, 190,
30, 115, 1, 1, 1, 2, 3, 5, 2, 5, 3, 3, 3, 2, 3, 2, 3, 0, 0, 195,
150, 2, 2, 0, 2, 1, 1, 2, 1, 2, 1, 1, 1, 3, 2, 2, 1, 2, 2, 1,
1, 2, 3, 2, 2, 1, 3, 1, 1), Factors = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Factor1", "Factor2"), class = "factor"),
AntiFactors = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("AntiFactor1", "AntiFactor2",
"AntiFactor3"), class = "factor")), .Names = c("Values",
"Factors", "AntiFactors"), row.names = c(NA, -120L), class = "data.frame")
Here is the picture of that plot I got:
The trouble is what I got twice more ticks at the x and y axis than it is needed. It is intended to have just one set of those Factor1..2 and AntiFactor1..3 ticks at each of those x, y axis. If I run that scatterplot3d without using x.ticklabs option, it gives "0, 0.5, 1, 1.5, 2.0, ...3.0" ticks etc at the axis. What is the way to set my step in x, y axis to be just a strong integer "1", so that all my discrete ticks to be displayed in their right place?

It seems that scatterplot3d coerces your discrete explanatory variables 'Factor' and 'AntiFactor' from factor to numeric. See e.g.:
levels(df$Factors)
# [1] "Factor1" "Factor2"
unique(as.numeric(df$Factors))
# [1] 1 2
levels(df$AntiFactors)
# [1] "AntiFactor1" "AntiFactor2" "AntiFactor3"
unique(as.numeric(df$AntiFactors))
# [1] 1 2 3
The labels you have created are recycled to get a label at each (default) tick mark. Also note your typo in 'xtickmarks' - I assume the second 'AntiFactor1' should be 'AntiFactor2'.
You may consider alternative ways to visualize your data, e.g. something like this:
library(ggplot2)
ggplot(data = temp, aes(x = AntiFactors, y = Values, fill = Factors)) +
geom_boxplot()

Related

Removing (influential) cases from fitted svyglm model

I am running a logistic regression model using complex survey data using the survey package in R. After fitting the model, I performed regression diagnostics using the car package. I noticed outlying and influential observations that I would like to remove and then refit the model to check for their effects on the regression coefficients but my current approach is not giving me want I expect.
My dataset has about 10,000 observations. Here is sample data and code I have tried using:
library(car); library(survey)
dat <- structure(list(id = c(1009918, 1012826, 1029625, 1000926, 1027525,
1000115, 1000201, 1000202, 1000214, 1000219, 1000313, 1000324,
1000510, 1000521, 1000624, 1000708, 1000811, 1000817, 1000818,
1000906, 1000922, 1001002, 1001005, 1001401, 1001411, 1001413,
1001420, 1001424, 1001501, 1001510, 1001518, 1001526, 1001621,
1001807, 1001922, 1001926, 1002106, 1002217, 1002406, 1002416,
1002618, 1002709, 1003004, 1003017, 1003103, 1003108, 1003304,
1003319, 1003723, 1003804, 1003811, 1003819, 1004014, 1008902,
1008913, 1009011, 1009022, 1009123, 1009212, 1009215), strata = c(1,
2, 6, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), cluster = c(785,
938, 2337, 28, 2122, 3, 6, 6, 6, 6, 10, 10, 16, 16, 19, 22, 24,
24, 24, 28, 28, 33, 33, 45, 45, 45, 45, 45, 50, 50, 50, 50, 53,
60, 63, 63, 69, 74, 96, 96, 100, 102, 111, 111, 115, 115, 122,
122, 178, 193, 193, 193, 210, 755, 755, 759, 759, 762, 765, 765
), weights = c(621.921704979739, 5440.9107594311, 8450.49341643626,
2457.37241774248, 7174.79930450487, 930.492019594546, 443.253676607562,
443.253676607562, 886.507353215123, 443.253676607562, 1552.30979801343,
517.436599337811, 403.146111343943, 806.292222687886, 439.775494378883,
839.561001668328, 1210.77101540146, 403.590338467152, 403.590338467152,
457.23211170669, 914.464223413381, 584.557580338056, 584.557580338056,
233.135312658304, 233.135312658304, 233.135312658304, 466.270625316608,
233.135312658304, 287.94933168791, 287.94933168791, 287.94933168791,
287.94933168791, 2354.32022397843, 213.628591090648, 300.596873749779,
300.596873749779, 1121.27419052962, 528.482361549292, 1936.60489456861,
1291.06992971241, 282.360930726457, 3526.73915258957, 337.531162185852,
337.531162185852, 2183.63202546241, 2729.54003182802, 1035.32340123929,
1552.98510185893, 1400.62601417017, 717.92144006312, 358.96072003156,
1435.84288012624, 275.058410167952, 557.874242565598, 278.937121282799,
1687.48015279064, 1012.48809167438, 424.663883556537, 227.805527040477,
227.805527040477), age = c(20, 19, 93, 24, 18, 23, 22, 23, 24,
19, 18, 24, 20, 19, 18, 17, 19, 23, 19, 19, 21, 22, 21, 20, 23,
24, 24, 19, 21, 22, 20, 23, 21, 23, 20, 22, 23, 15, 20, 23, 24,
18, 24, 24, 15, 21, 24, 16, 22, 20, 20, 18, 21, 20, 21, 21, 24,
22, 24, 18), gender = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), levels = c("Male", "Female"), class = "factor"),
educ = structure(c(4L, 2L, 1L, 3L, 2L, 3L, 2L, 2L, 2L, 2L,
3L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 4L, 2L, 3L, 2L,
4L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L,
4L, 4L, 4L, 3L, 3L), levels = c("No formal education", "Primary",
"Secondary", "Tertiary"), class = "factor"), employ = structure(c(4L,
3L, 4L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 4L, 3L, 2L, 2L, 2L, 4L,
3L, 3L, 3L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 3L, 3L, 1L, 3L, 2L, 3L, 3L, 4L, 3L, 2L, 4L, 4L,
2L, 3L, 1L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 1L, 4L, 3L, 4L), levels = c("Unemployed",
"Employed", "Self-employed", "Other"), class = "factor"),
know = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), levels = c("No/Don't know", "Yes"), class = "factor"),
status = structure(c(2L, 4L, 2L, 1L, 5L, 3L, 2L, 2L, 2L,
4L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 5L, 4L, 2L, 5L, 5L, 4L, 3L,
2L, 3L, 2L, 2L, 2L, 2L, 4L, 3L, 3L, 4L, 3L, 2L, 2L, 3L, 1L,
2L, 1L, 1L, 2L, 2L, 2L), levels = c("1", "2", "3", "4", "5"
), class = "factor"), smoker = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("no",
"yes"), class = "factor")), row.names = c(81L, 4174L, 6722L,
1255L, 2712L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L,
26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L), class = "data.frame")
## Create survey design object
dat_svy <- survey::svydesign(ids = ~cluster, strat = ~strata, weights = ~weights, data = dat, nest = TRUE)
options(survey.lonely.psu = "adjust")
## fit logistic regression model
mod <- survey::svyglm(formula = smoker ~ age + educ + gender + employ + educ + know + status, design = dat_svy, family = "quasibinomial")
I have tried the following:
update(mod, subset = !(rownames(dat_svy) %in% c(2, 5, 9, 13, 21))) # returns an error
update(mod, subset = -c(2, 5, 9, 13, 21)) # only removes one (first specified) observation

Order function in R cannot be reversed with - or rev() when column has alpha and numeric characters

First time I ever cared to sort because I need to report this in a specific way. Typically, smallest to largest sort in excel on column quarter returns YTD, 3,2,1. etc. However, I cannot sort using Order(-begin$Quarter) or rev(begin$Quarter). Other solutions with mix sort prevent me from then also sorting other columns, like here I want to sort by quarter then segment. I have dput examples.
Thanks,
begin<- structure(list(Quarter = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L), .Label = c("1", "2", "3", "YTD"), class = "factor"),
Segment = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Beverages", "Food"), class = "factor"), Person = structure(c(3L,
2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L,
2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L), .Label = c("Chris", "Jackie",
"Josh"), class = "factor"), Sales = c(4, 4, 3, 2, 3, 3, 7,
7, 1, 1, 2, 3, 7, 7, 8, 5, 7, 8, 9, 6, 6, 7, 5, 6)), class = "data.frame", row.names = c(NA,
-24L))
end<- structure(list(Quarter = structure(c(4L, 4L, 4L, 4L, 4L, 4L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("1", "2", "3", "YTD"), class = "factor"),
Segment = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L
), .Label = c("Beverages", "Food"), class = "factor"), Person = structure(c(1L,
1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L,
3L, 3L, 1L, 1L, 2L, 2L, 3L, 3L), .Label = c("Chris", "Jackie",
"Josh"), class = "factor"), Sales = c(3, 6, 2, 5, 1, 7, 1,
6, 7, 6, 7, 9, 3, 8, 3, 7, 2, 5, 3, 8, 4, 7, 4, 7)), class = "data.frame", row.names = c(NA,
-24L))
library(dplyr)
begin %>%
arrange(desc(Quarter), Person, Segment)

How to transform data for bar graph based on multi choice?

So I'm learning to use R/GGplot, it was simple enough to create a single bar chart but I'm struggling to understand how to properly manipulate the data to get the chart I want.
So I have a basic example data file that looks like this in RStudio:
Basically, I wanted to make a bar for each option, which counts the "Yes" options. The Y axis would then be equal to the total number of records, with the scale measured in %.
Here is where I think I went completely wrong:
data_Q1 <- data.frame(Q1 = c("Red", "Blue", "Green", "Yellow", "Pink"))
I believe here I might need to remove the "No" level, then rename the "Yes" level to the colour name before I can work with it, but I seem to be greatly misunderstanding how I do this.
I've tried using droplevels() and raw_data$Q1_1[grepl("Yes", raw_data$Q1, ignore.case=T)] <- "Red" but neither seem to achieve the goal I want.
Here is the code I'm using for the graph:
ggplot(
data_Q1,
aes(
x=Q1,
y=sum(
complete.cases(raw_data)
)
)
)
+geom_bar(
aes(
fill=Q1
),
colour="black",
stat="identity"
)
+labs(
title="Colours respondents liked",
subtitle="Q1. Select all the colours you like",
caption="source: example data"
)
+ylab("Total completes")
+scale_y_continuous(labels = scales::percent)
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
133, 134, 135, 136, 137, 138, 139), Q1_1 = structure(c(2L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
Q1_2 = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L), .Label = c("No", "Yes"
), class = "factor"), Q1_3 = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
), .Label = c("No", "Yes"), class = "factor"), Q1_4 = structure(c(1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
Q1_5 = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"
), class = "factor")), row.names = c(NA, -139L), variable.labels = c(id = "id",
Q1_1 = "[Red] Q1. Select all the colours you like", Q1_2 = "[Blue] Q1. Select all the colours you like",
Q1_3 = "[Green] Q1. Select all the colours you like", Q1_4 = "[Yellow] Q1. Select all the colours you like",
Q1_5 = "[Pink] Q1. Select all the colours you like"), codepage = 65001L, class = "data.frame")
Okay, I am assuming this is from some survey, so your data is (probably) a bit messy.
An approach with tidyverse and transposing (t()) would look something like this:
library(tidyverse)
df <- raw_data
df2 <- data.frame(t( df %>% summarise(Q1 = sum(Q1_1=="Yes") / length(complete.cases(df)) ,
Q2 = sum(Q1_2=="Yes") / length(complete.cases(df)),
Q3 = sum(Q1_3=="Yes") / length(complete.cases(df)),
Q4 = sum(Q1_4=="Yes") / length(complete.cases(df)),
Q5 = sum(Q1_5=="Yes") / length(complete.cases(df)),) ))
names(df2) <- ("sum_yes")
df2$q <- rownames(df2)
Output of df2:
> df2
sum_yes q
Q1 0.8417266 Q1
Q2 0.7338129 Q2
Q3 0.7122302 Q3
Q4 0.4820144 Q4
Q5 0.7122302 Q5
>
Then, we plot the results:
library(ggplot2)
ggplot(
df2,
aes(
x=q,
y = sum_yes,
fill = q
)
) +
geom_bar(
colour="black",
stat="identity"
) + labs(
title="Colours respondents liked",
subtitle="Q1. Select all the colours you like",
caption="source: example data"
) + ylab("Total completes")+scale_y_continuous(labels = scales::percent) +
scale_fill_manual(values= c("Red", "Blue", "Green", "Yellow", "Pink") )
with scale_fill_manual, you can just specify what the colors of your results should be.
If I understood the data correctly, you would need to replace the names (Q1 to Q5 ) with the respective colors.

How to make a stacked barplot with nested grouping variables?

I am trying to make a stacked barplot with two variables. My desired outcome looks like this:
This is the first part of my data. There are 220 more rows:
Type Week Stage
<chr> <dbl> <dbl>
1 Captured 1 2
2 Captured 1 1
3 Captured 1 1
4 Captured 1 2
5 Captured 1 1
6 Captured 1 3
7 Captured 1 NA
8 Captured 1 3
9 Captured 1 2
10 Captured 1 1
So far I'm not getting anywhere, this is my code so far
library(data.table)
dat.m <- melt(newrstudio2, id.vars="Type")
dat.m
library(ggplot2)
ggplot(dat.m, aes(x=Type, y=value, fill=variable)) +
geom_bar(stat="identity")
I guess I need to calculate the number of observations of each stage in each week of each type? I've tried both long and wide data, but I somehow need to combine week with type? I don't know, I'm at a loss.
Alternative way:
set.seed(123)
# sample data
my_data <- data.frame(Type = sample(c("W", "C"), 220, replace = TRUE),
Week = sample(paste0("Week ", 1:4), 220, replace = TRUE),
Stage = sample(paste0('S', 1:4), 220, replace = TRUE))
head(my_data)
library(ggplot2)
ggplot(my_data, aes(x = Type, fill = Stage)) +
geom_bar(aes(y = (..count..)/sum(..count..)), position = "fill") +
facet_grid(. ~ Week, switch="both") +
scale_y_continuous(labels = scales::percent) +
ylab("Stage [%]") +
theme(strip.background = element_blank(),
strip.placement = "outside",
panel.spacing = unit(0, "lines"))
Alternatively we could use base graphics. First, what you're probably most interested in, we should reshape the data.
For this we could split the data per week and run a dcast() over it.
L <- lapply(split(d, d$week), function(x)
data.table::dcast(x, type ~ stage, value.var="stage", fun=length))
d2 <- do.call(rbind, L) # transform back into a data frame
Now – with credits to #alemol – we want the proportions.
d2[-1] <- t(apply(d2[-1], 1, prop.table))
Then we are able to plot relatively simply. Note, that barplot() additionally gives us a vector of bar coordinates which we can use later for the axis() labels.
cols <- c("#ed1c24", "#ff7f27", "#00a2e8", "#fff200") # define stage colors
par(mar=c(5, 5, 3, 5) + .1, xpd=TRUE) # set plot margins
p <- barplot(t(d2[-1]), col=cols, border="white", space=rep(c(.2, 0), 5),
font.axis=2, xaxt="n", yaxt="n", xlab="Week")
axis(1, at=p, labels=rep(c("C", "W"), 5), tick=FALSE, line=0)
axis(1, at=apply(matrix(p, , 2, byrow=TRUE), 1, mean), labels=1:5, tick=FALSE, line=1)
axis(2, at=0:10/10, labels=paste0(seq(0, 100, 10), "%"), line=0, las=2)
legend(12, .5, legend=rev(names(d2[-1])), col=rev(cols), pch=15, title="Stage")
Result:
Data:
d <- structure(list(type = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L), .Label = c("C", "W"), class = "factor"), week = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), stage = c(3L,
1L, 1L, 2L, 2L, 2L, 1L, 3L, 2L, 4L, 1L, 1L, 2L, 2L, 3L, 4L, 3L,
2L, 4L, 1L, 1L, 3L, 1L, 2L, 3L, 1L, 4L, 1L, 2L, 4L, 2L, 3L, 4L,
4L, 2L, 4L, 4L, 2L, 3L, 1L, 1L, 4L, 4L, 1L, 4L, 3L, 3L, 3L, 2L,
1L, 3L, 4L, 2L, 4L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 1L, 3L, 2L,
1L, 1L, 1L, 4L, 2L, 4L, 1L, 4L, 3L, 4L, 4L, 4L, 2L, 2L, 2L, 2L,
2L, 1L, 3L, 4L, 2L, 4L, 4L, 2L, 2L, 3L, 4L, 4L, 3L, 3L, 1L, 1L,
1L, 2L, 4L, 3L, 1L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 4L, 2L, 1L,
2L, 1L, 3L, 3L, 2L, 4L, 3L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 2L, 2L,
2L, 1L, 3L, 4L, 3L, 4L, 3L, 4L, 4L, 3L, 1L, 1L, 2L, 1L, 2L, 3L,
2L, 2L, 1L, 4L, 3L, 4L, 2L, 2L, 3L, 1L, 2L, 3L, 3L, 3L, 3L, 2L,
1L, 2L, 2L, 1L, 1L, 3L, 4L, 3L, 4L, 2L, 4L, 1L, 1L, 2L, 1L, 3L,
2L, 1L, 3L, 3L, 2L, 2L, 1L, 3L, 2L, 2L, 2L, 1L, 4L, 2L, 4L, 2L,
4L, 3L, 3L, 1L, 3L, 4L, 3L, 2L, 1L, 2L, 4L, 1L, 2L, 4L, 2L, 1L,
2L, 1L, 2L, 2L, 3L, 1L, 3L, 3L, 3L, 2L, 2L, 1L, 2L, 3L, 2L, 2L,
1L, 2L, 1L, 3L, 3L, 2L, 1L, 3L, 4L, 2L, 1L, 2L, 4L, 3L, 4L, 2L,
3L, 2L, 4L, 1L, 4L, 4L, 2L, 1L, 2L)), row.names = c(NA, -250L
), class = "data.frame")
Is this what you're looking for:
set.seed(123)
# sample data
my_data <- data.frame(Type = sample(paste0('T', 1:4), 220, replace = TRUE),
Week = sample(paste0('W', 1:4), 220, replace = TRUE),
Stage = sample(paste0('S', 1:4), 220, replace = TRUE))
ggplot(my_data, aes(x=Week:Type, fill = Stage)) + geom_bar()

Removing specific strips in a double-strip plot

I'm trying to remove the redundant "pro/retro" labels on the second row of panels on my plot. However, I still want to keep the top row of panel labels intact. I've tried for the past hour to selectively remove the 1st strip on the 2nd panel row and I was wondering if anyone here knows how to do this. See below for technical details.
I have the following plot:
It was generated from the following data:
absBtwnDat <- structure(list(setSize = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L,
6L, 7L), .Label = c("2", "3", "4", "5", "6", "7", "8"), class = "factor"),
Measure = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Actual", "Predicted"), class = "factor"),
Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("fix", "forced"), class = "factor"),
JudgementType = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("pro", "retro"), class = "factor"),
Accuracy = c(1.91388888888889, 2.95555555555556, 3.74861111111111,
4.37777777777778, 4.21527777777778, 3.0875, 2.85277777777778,
2, 2.99444444444444, 4, 4.77222222222222, 5.24444444444444,
5.18472222222222, 5.20277777777778, 1.98888888888889, 3,
3.97222222222222, 4.85972222222222, 5.70555555555556, 6.56944444444444,
7.27222222222222, 2, 3, 3.99444444444444, 4.99444444444444,
5.86944444444444, 6.75555555555556, 7.57777777777778, 1.96111111111111,
2.97777777777778, 3.78333333333333, 3.97222222222222, 4.22361111111111,
3.64722222222222, 3.68888888888889, 2, 3, 3.97222222222222,
4.67777777777778, 5.26944444444444, 5.4625, 5.8, 2, 3, 3.98333333333333,
4.87777777777778, 5.73055555555556, 6.48333333333333, 7.62916666666667,
2, 3, 3.98333333333333, 4.96666666666667, 5.96944444444444,
6.94444444444444, 7.93333333333333), LL = c(1.85, 2.87777777777778,
3.59861111111111, 4.15555555555556, 3.78888888888889, 2.73055555555556,
2.55555555555556, 2, 2.96111111111111, 4, 4.64444444444444,
5.01666666666667, 4.88333333333333, 4.88611111111111, 1.91111111111111,
3, 3.89444444444444, 4.73611111111111, 5.47777777777778,
6.20277777777778, 6.71666666666667, 2, 3, 3.96666666666667,
4.95555555555556, 5.65096686319131, 6.48333333333333, 7.17222222222222,
1.86637442123568, 2.92222222222222, 3.65, 3.61666666666667,
3.88333333333333, 3.17092476055122, 3.18888888888889, 2,
3, 3.92222222222222, 4.49444444444444, 5.0375, 5.09444444444444,
5.40555555555556, 2, 3, 3.92777777777778, 4.72222222222222,
5.52777777777778, 6.24444444444444, 7.37361111111111, 2,
3, 3.95, 4.88888888888889, 5.93333333333333, 6.88333333333333,
7.73065763697428), UL = c(1.95555555555556, 2.98333333333333,
3.84444444444444, 4.56666666666667, 4.6, 3.43611111111111,
3.17916666666667, 2, 3, 4, 4.86111111111111, 5.42777777777778,
5.48656054159421, 5.58611111111111, 2, 3, 4, 4.93888888888889,
5.83888888888889, 6.76944444444444, 7.6, 2, 3, 4, 5, 5.94166666666667,
6.88888888888889, 7.78888888888889, 1.98888888888889, 2.99444444444444,
3.87777777777778, 4.22777777777778, 4.53611111111111, 4.19722222222222,
4.20555555555556, 2, 3, 3.98888888888889, 4.78333333333333,
5.45555555555556, 5.79583333333333, 6.16666666666667, 2,
3, 3.99444444444444, 4.95, 5.85972222222222, 6.67222222222222,
7.80138888888889, 2, 3, 3.99444444444444, 4.98888888888889,
5.9875, 6.97222222222222, 7.98333333333333)), .Names = c("setSize",
"Measure", "Location", "JudgementType", "Accuracy", "LL", "UL"
), row.names = c(NA, -56L), class = "data.frame")
I visualized it using using the following code:
library(ggplot2)
p1 <- ggplot(data = absBtwnDat, aes(x = as.numeric(as.character(setSize)),
y = Accuracy, group = Measure,
colour = Measure))+
geom_point()+
geom_line(aes(linetype = Measure))+
scale_x_continuous("Trial Set Size", breaks = 2:8)+
scale_y_continuous("Accuracy (# Correct)", breaks = 0:8, limits = c(0, 8))+
geom_errorbar(aes(ymin = LL, ymax = UL), width = .1, size = .75)+
scale_colour_grey(start = .8, end = .4)+
facet_wrap(~JudgementType+Location, dir = "v")+
theme(legend.position = "top")
Just to be certain, I've highlighted unwanted strip in the following image:
With this you'll only have one row of labels per panel, but they still include both words.
p1 <- ggplot(data = absBtwnDat,
aes(x = as.numeric(as.character(setSize)), y = Accuracy,
group = Measure,
colour = Measure))+
geom_point()+
geom_line(aes(linetype = Measure))+
scale_x_continuous("Trial Set Size", breaks = 2:8)+
scale_y_continuous("Accuracy (# Correct)",
breaks = 0:8, limits = c(0, 8))+
geom_errorbar(aes(ymin = LL, ymax = UL),
width = .1, size = .75)+
scale_colour_grey(start = .8, end = .4)+
facet_wrap(~JudgementType + Location,
dir = "v",
labeller = label_wrap_gen(multi_line=FALSE)) +
theme(legend.position = "top")
p1
Here is a possible solution:
g1 <- ggplotGrob(p1)
k <- which(g1$layout$name=="strip-t-1-2")
g1$grobs[[k]]$grobs[[1]]$children[[2]]$children[[1]]$label <- ""
g1$grobs[[k]]$grobs[[1]]$children[[1]]$gp$fill <- NA
k <- which(g1$layout$name=="strip-t-2-2")
g1$grobs[[k]]$grobs[[1]]$children[[2]]$children[[1]]$label <- ""
g1$grobs[[k]]$grobs[[1]]$children[[1]]$gp$fill <- NA
library(grid)
grid.draw(g1)

Resources