R multiple choice questionnaire data to ggplot

I have a Qualtrics multiple choice question that I want to use to create graphs in R. My data is organized so that respondents can select multiple answers for each question. For example, participant 1 selected answer choices 1 (Q1_1) and 3 (Q1_3). I want to collapse all answer choices into one bar graph, with one bar for each response option (Q1_1:Q1_3), each divided by the number of respondents who answered this question (in this case, 3).
df <- structure(list(Participant = 1:3, A = c("a", "a", ""), B = c("", "b", "b"), C = c("c", "c", "c")), .Names = c("Participant", "Q1_1", "Q1_2", "Q1_3"), row.names = c(NA, -3L), class = "data.frame")
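With this data, for example, Q1_1 and Q1_2 were each selected by 2 of the 3 respondents and Q1_3 by all 3, so the bars should have heights 2/3, 2/3, and 1.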
I want to use ggplot2, and maybe some sort of loop through Q1_1:Q1_3?

Perhaps this is what you want
f <- structure(
  list(
    Participant = 1:3,
    A = c("a", "a", ""),
    B = c("", "b", "b"),
    C = c("c", "c", "c")),
  .Names = c("Participant", "Q1_1", "Q1_2", "Q1_3"),
  row.names = c(NA, -3L),
  class = "data.frame"
)
library(tidyr)
library(dplyr)
library(ggplot2)

nparticipant <- nrow(f)

f %>%
  ## Reformat the data
  gather(question, response, starts_with("Q")) %>%
  filter(response != "") %>%
  ## calculate the height of the bars
  group_by(question) %>%
  summarise(score = length(response) / nparticipant) %>%
  ## Plot
  ggplot(aes(x = question, y = score)) +
  geom_bar(stat = "identity")

Here is a solution using ddply from the plyr package.
library(plyr)
library(ggplot2)

# I needed to increase the number of participants to make sure it works in every case
df <- data.frame(Participant = 1:100,
                 Q1_1 = sample(c("a", ""), 100, replace = TRUE, prob = c(1/2, 1/2)),
                 Q1_2 = sample(c("b", ""), 100, replace = TRUE, prob = c(2/3, 1/3)),
                 Q1_3 = sample(c("c", ""), 100, replace = TRUE, prob = c(1/3, 2/3)))

# Combine the selected options into a single answer string per participant
df$answer <- paste0(df$Q1_1, df$Q1_2, df$Q1_3)

# Frequency of each answer combination
summ <- ddply(df, c("answer"), summarize, freq = length(answer) / nrow(df))

## Re-ordering of the factor levels of summ$answer
summ$answer <- factor(summ$answer, levels = c("", "a", "b", "c", "ab", "ac", "bc", "abc"))

# Plot
ggplot(summ, aes(answer, freq, fill = answer)) +
  geom_bar(stat = "identity") +
  theme_bw()
Note: it might be more complicated if you have more columns relating to other questions ("Q2_1", "Q2_2", ...). In this case, melting the data for each question could be a solution; one possible sketch follows.
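A minimal sketch of that idea, reshaping all Q* columns at once and faceting by question rather than melting each question separately. It assumes hypothetical columns Q2_1 and Q2_2 (not in the original data) alongside the Q1_* columns, plus the tidyr/dplyr approach from the first answer:
library(tidyr)
library(dplyr)
library(ggplot2)

nparticipant <- nrow(df)

df %>%
  ## One row per (participant, option) pair, dropping blanks
  gather(option, response, starts_with("Q")) %>%
  filter(response != "") %>%
  ## Split "Q1_2" into question ("Q1") and option number ("2")
  separate(option, into = c("question", "choice"), sep = "_") %>%
  ## Proportion of participants selecting each option
  group_by(question, choice) %>%
  summarise(score = n() / nparticipant) %>%
  ## One panel per question
  ggplot(aes(x = choice, y = score)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ question)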

I think you want something like this (proportion with a stacked bar chart):
Participant Q1_1 Q1_2 Q1_3
1 1 a c
2 2 a a c
3 3 c b c
4 4 b d
library(reshape2)
library(ggplot2)

# ensure that all question columns have the same factor levels; blanks fall outside the levels and are ignored
for (i in 2:4) {
  df[, i] <- factor(df[, i], levels = letters[1:4])
}

# proportion of each choice within each question
tdf <- as.data.frame(sapply(df[2:4], function(x) table(x) / sum(table(x))))
tdf$choice <- rownames(tdf)
tdf <- melt(tdf, id = 'choice')

ggplot(tdf, aes(variable, value, fill = choice)) +
  geom_bar(stat = 'identity') +
  xlab('Questions') +
  ylab('Proportion of Choice')

Related

How to iterate over flextable columns

I want to iterate over several columns of a flextable using the mk_par function. Consider the following example:
library(dplyr)
library(flextable)

tt <- tibble(a = 1:10,
             b1 = letters[1:10],
             b2 = LETTERS[1:10],
             c1 = paste0("new_", letters[1:10]),
             c2 = paste0(LETTERS[1:10], "_new")) %>%
  flextable(col_keys = c("a", "b", "c")) %>%
  mk_par(j = "b", value = as_paragraph(b1, b2)) %>%
  mk_par(j = "c", value = as_paragraph(c1, c2))
tt
I would like to replace the two mk_par statements with a single expression that takes the arguments c("b", "c") and renders the same output. I have succeeded in rewriting this with a for loop
for (pref in c("b", "c")) {
  tt <- tt %>%
    mk_par(j = pref,
           value = as_paragraph(.data[[paste0(pref, 1)]],
                                .data[[paste0(pref, 2)]]))
}
but I wonder if there is a one-line expression that does the same and integrates smoothly into dplyr pipe syntax?
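One possibility (a sketch, not from the original thread) is to fold over the prefixes with purrr::reduce, assuming pref is picked up from the enclosing environment just as it is in the for loop above:
library(purrr)

# tt is the flextable built above
tt <- reduce(c("b", "c"), function(ft, pref) {
  mk_par(ft, j = pref,
         value = as_paragraph(.data[[paste0(pref, 1)]],
                              .data[[paste0(pref, 2)]]))
}, .init = tt)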

Creating random ratios that add up to 1 by group

I have a dataset as follows:
panelID= c(1:50)
year= c(2005, 2010)
country = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
urban = c("A", "B", "C")
indust = c("D", "E", "F")
sizes = c(1,2,3,4,5)
n <- 2
library(AER)
library(data.table)
library(dplyr)
set.seed(123)
DT <- data.table(country = rep(sample(country, length(panelID), replace = T), each = n),
                 year = c(replicate(length(panelID), sample(year, n))),
                 sales = round(rnorm(10, 10, 10), 2),
                 industry = rep(sample(indust, length(panelID), replace = T), each = n),
                 urbanisation = rep(sample(urban, length(panelID), replace = T), each = n),
                 size = rep(sample(sizes, length(panelID), replace = T), each = n))
DT <- DT %>%
  group_by(country) %>%
  mutate(base_rate = as.integer(runif(1, 12.5, 37.5))) %>%
  group_by(country, year) %>%
  mutate(taxrate = base_rate + as.integer(runif(1, -2.5, +2.5)))

DT <- DT %>%
  group_by(country, year) %>%
  mutate(vote = sample(c(0, 1), 1),
         votewon = ifelse(vote == 1, sample(c(0, 1), 1), 0))
I would like to add a variable to this dataset called ratio. I want ratio to be a random number between 0 and 1, and I want the sum of these ratios by country to be 1.
How would I go about creating such a column? The only thing I could think of is manually creating vectors which add up to one and then sampling from those vectors.
EDIT: The countries do not have equal entries:
> table(DT$country)
A B C D E F G H I J
6 10 14 6 14 10 10 8 10 12
ratio_sample_6 <- c(0.1, 0.2, 0.3, 0.05, 0.15, 0.2)
DT[,ratio:=sample(ratio_sample_6, replace = FALSE), by="country"]
But even that I could not get to work. Any suggestions?
Pick random numbers and normalize by country:
## data.table version
DT[, ratio := runif(.N)][, ratio := ratio / sum(ratio), by = "country"]
## dplyr version
DT %>%
  group_by(country) %>%
  mutate(ratio = runif(n()),
         ratio = ratio / sum(ratio))
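A quick sanity check that the ratios sum to 1 within each country (assuming the result of either version above has been assigned back to DT; this check is not part of the original answer):
DT %>%
  group_by(country) %>%
  summarise(total = sum(ratio))
# total should be 1 (up to floating-point error) for every country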

Insert specific elements in specific locations of a list

I have two unequal-sized lists, and I want to append specific items from one list to specific positions in another list.
First list:
dat <- structure(list(supergrp = c("D", "A", "P", "B"),
clusters = c("1", "2", "3", "1"),
items = structure(list(`1.2` = c("a", "c", "d"),
`2.1` = "b", `3` = "e", `4` = c("e", "b")),
.Names = c("1.2", "2.1", "3", "4"))),
.Names = c("supergrp", "clusters", "items"),
row.names = c(NA, 4L), class = "data.frame")
Second list:
val_to_append <- structure(list(supergrp = c("D", "A"),
clusters = c(1, 2),
items = structure(list(`1.2` = c("c", "f"),
`2.1` = c("c", "d", "e")),
.Names = c("1.2", "2.1"))),
.Names = c("supergrp", "clusters", "items"),
row.names = c(NA, -2L), class = "data.frame")
I want to append val_to_append$items[[1]] to dat$items[[3]].
Similarly, I want to append val_to_append$items[[2]] to dat$items[[1]].
The required output is
supergrp clusters items
1 D 1 a, c, d, e
2 A 2 b
3 P 3 e, c, f
4 B 1 e, b
I can do this in a loop:
dat_indx <- c(3, 1)
val_indx <- c(1, 2)
fin_result <- dat
for (i in seq_along(dat_indx)) {
  out_put_indx <- dat_indx[[i]]
  fin_result$items[[dat_indx[[i]]]] <- unique(c(fin_result$items[[dat_indx[[i]]]],
                                                val_to_append$items[[val_indx[[i]]]]))
}
I tried normal vector indexing such as
append(fin_result$items[[dat_indx]], val_to_append$items[[val_indx]])
without success. Is there an efficient way to do this? My list (i.e. data frame) is very large, with hundreds of thousands of samples.
I am thinking of sapply but don't have a concrete idea.
We can use mapply to achieve this. We append the values from val_to_append$items to dat$items using the index values, which are known beforehand.
dat_indx <- c(3,1)
val_indx <- c(1,2)
dat$items[dat_indx] <- mapply(function(x, y)
unique(c(dat$items[[x]], val_to_append$items[[y]])), dat_indx, val_indx)
dat
# supergrp clusters items
#1 D 1 a, c, d, e
#2 A 2 b
#3 P 3 e, c, f
#4 B 1 e, b
Although this is another way of solving the problem, I doubt how efficient it is going to be.

discard last or first group after group_by by referencing group directly

Data:
df <- data.frame(A = c(rep(letters[1], 3), rep(letters[2], 3), rep(letters[3], 3)),
                 B = rnorm(9),
                 stringsAsFactors = F)
I don't know if there's a way to do this, but what I'd like to know is whether there's a way to discard the last group by directly referencing the groups after group_by(A), to get the desired output:
A B
1 a -0.4900863
2 a 1.4106594
3 a -0.2245738
4 b -0.2124955
5 b 0.6963785
6 b 0.9151825
I AM INTERESTED IN SOLUTIONS THAT DIRECTLY WORK AT THE GROUPS LEVEL
For instance, something like:
df %>% group_by(A) %>% head(.Groups,-1)
or
df %>% group_by(A) %>% Groups[1:2]
I AM NOT INTERESTED IN THE FOLLOWING KINDS OF SOLUTIONS
df %>% filter(!(A == max(A)))
df %>% filter(!(A %in% max(A)))
OR OTHER SOLUTIONS THAT DO NOT REQUIRE group_by TO WORK
I assume we are not supposed to know in advance how many groups there might be. Try using the labels attribute:
all_but_last <- df %>% group_by(A) %>% attr("labels") %>% head(-1)
A
1 a
2 b
... to extract desired rows
> df %>% filter(A %in% all_but_last[[1]])
A B
1 a -0.799026840
2 a -0.712402478
3 a 0.685320094
4 b 0.971492883
5 b -0.001479117
6 b -0.817766296
It helps to use dput to look at the actual contents of a "grouped_df":
dput( df %>% group_by(A) )
structure(list(A = c("a", "a", "a", "b", "b", "b", "c", "c",
"c"), B = c(-0.799026840397576, -0.712402478350695, 0.685320094252465,
0.971492883452258, -0.00147911717469651, -0.817766295631676,
-1.00112471676908, 1.88145909873596, -0.305560178617216)), .Names = c("A",
"B"), row.names = c(NA, -9L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = "A", drop = TRUE, indices = list(
0:2, 3:5, 6:8), group_sizes = c(3L, 3L, 3L), biggest_group_size = 3L,
labels = structure(list(
A = c("a", "b", "c")),
row.names = c(NA, -3L),
class = "data.frame",
vars = "A", drop = TRUE, .Names = "A"))
Note that labels is a data.frame, so you could further apply unlist to the result that became all_but_last; then you would not need to extract its value with "[[".
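For example (a small variation on the code above, relying on the same older grouped_df internals shown in the dput):
all_but_last <- df %>% group_by(A) %>% attr("labels") %>% head(-1) %>% unlist()
df %>% filter(A %in% all_but_last)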
Perhaps this helps
library(dplyr)
df %>%
  group_by(A) %>%
  group_indices(.) %in% 1:2 %>%
  df[., ]
Or with data.table
library(data.table)
setDT(df)[, grp := .GRP, A][grp %in% unique(grp)[1:2]][, grp := NULL][]

How to melt a dataframe with measured variables and associated standard deviations in two columns

I have a premade dataframe, in which each measured variable features an adjacent column with the standard deviations:
df <-
structure(list(Factor = structure(1:3, .Label = c("K", "L", "M"
), class = "factor"), A = c(52127802.82, 63410325.61, 76455661.87
), SD = c(9124562.98, 21975533.21, 9864019.36), B = c(63752980.62,
68303447.17, 73250794.15), SD.1 = c(34800000, 22600000, 6090000
), C = c(103512032.04, 65074190.8, 92686982.97), SD.2 = c(23900000,
20800000, 38300000), D = c(100006463.22, NA, 37406494.3)), .Names = c("Factor",
"A", "SD", "B", "SD.1", "C", "SD.2", "D"), class = "data.frame", row.names = c(NA,
-3L))
(SD.1, SD.2 were auto-renamed; originally they were all called "SD").
I want to melt into long format by factor:
library(reshape)
df.melt <- melt(df, id.vars = "Factor")
However, I would like the melted object to keep each SD attached to its associated measurement:
Factor Variable value value.sd
K A 52127802.82 9124562
So I can call geom_errorbar(ymin = value.sd, ymax = value.sd) in ggplot(df.melt, aes(Factor, value)) + geom_bar(stat = "identity") + facet_wrap(~variable).
Is that possible, even with the different column names for the SD columns (SD, SD.1, SD.2)?
First, I would drop df$D from the dataset via df$D <- NULL, because I think that column is an error (it has no matching SD column):
# Factor A SD B SD.1 C SD.2
# 1 K 52127803 9124563 63752981 34800000 103512032 23900000
# 2 L 63410326 21975533 68303447 22600000 65074191 20800000
# 3 M 76455662 9864019 73250794 6090000 92686983 38300000
Then, I would rename the columns. This looks more complicated than it is, and I encourage feedback/suggestions that would make this part more straightforward; the reason I am renaming the columns is so that I can use separate and spread from the tidyr package:
# rename the SD columns: "SD", "SD.1", "SD.2" -> "A-SD", "B-SD", "C-SD"
names(df)[-1][seq(2, length(names(df)) - 1, 2)] <- paste0(names(df)[-1][seq(1, length(names(df)) - 1, 2)], "-SD")
# rename the measurement columns: "A", "B", "C" -> "A-measure", "B-measure", "C-measure"
names(df)[-1][seq(1, length(names(df)) - 1, 2)] <- paste0(names(df)[-1][seq(1, length(names(df)) - 1, 2)], "-measure")
df
# Factor A-measure A-SD B-measure B-SD C-measure C-SD
# 1 K 52127803 9124563 63752981 34800000 103512032 23900000
# 2 L 63410326 21975533 68303447 22600000 65074191 20800000
# 3 M 76455662 9864019 73250794 6090000 92686983 38300000
This enables me to make df_clean:
library(dplyr)
library(tidyr)

df_clean <- df %>%
  gather(measure, value, -Factor) %>%
  separate(measure, c("measure_letter", "temp_var")) %>%
  spread(temp_var, value)
df_clean
# Factor measure_letter measure SD
# 1 K A 52127803 9124563
# 2 K B 63752981 34800000
# 3 K C 103512032 23900000
# 4 L A 63410326 21975533
# 5 L B 68303447 22600000
# 6 L C 65074191 20800000
# 7 M A 76455662 9864019
# 8 M B 73250794 6090000
# 9 M C 92686983 38300000
Now that our dataset is clean/tidy, we can plot accordingly:
library(ggplot2)
ggplot(df_clean, aes(x = Factor, y = measure, fill = Factor)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = measure - SD, ymax = measure + SD)) +
  facet_wrap(~ measure_letter)
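As a side note, with the columns renamed as above, newer tidyr (version 1.0 or later) can collapse the gather/separate/spread steps into a single pivot_longer call; this is a sketch, not part of the original answer:
df_clean <- df %>%
  pivot_longer(-Factor,
               names_to = c("measure_letter", ".value"),
               names_sep = "-")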
