How to create ggplot box plot which add data over time - r

I have a dataframe, which gives values for different courses over a series of weeks.
Course Week m
1 UGS200H 1 44.33333
2 CMSE201 1 73.66667
3 CMSE201 2 88.16667
4 CMSE201 2 88.16667
5 PHY215 2 73.66667
6 PHY215 3 86.33333
7 CMSE201 3 84.00000
8 UGS200H 4 60.66667
9 UGS200H 4 76.66667
I would like to create a series of box plots which plot m values over the weeks for each course. I would like for the box plots to build off of each other though, such that week 1 contains only the data from Week = 1, but week 2 contains data including data from Week = 1 and 2, and week 3 includes data from Week = 1,2,3 and etc. I have create the following code which creates the box plots but without the building up over the weeks.
d <- subset(data_manual)
a <- ggplot(data=d, aes(x=(Week), fill = Course, y=(m), group=interaction(Course, Week)))
geom_boxplot()+
scale_y_continuous(limits = c(-2, 100), breaks = seq(0, 100, by = 20))+
xlab('Week') +
ylab('Course-Level SE') +
print(a) #show us the plot!!
}
This gives plots like this
But these are just individual weeks, not the summed version that I would like. Is there a way to have them build and plot the multiple weeks on one plot?

How about this:
# dat <- tibble::tribble(
# ~Course, ~Week, ~m,
# "UGS200H", 1, 44.33333,
# "CMSE201", 1, 73.66667,
# "CMSE201", 2, 88.16667,
# "CMSE201", 2, 88.16667,
# "PHY215", 2, 73.66667,
# "PHY215", 3, 86.33333,
# "CMSE201", 3, 84.00000,
# "UGS200H", 4, 60.66667,
# "UGS200H", 4, 76.66667)
dat <- data.frame(
Course = rep(c("A", "B", "C"), each=1000),
Week = rep(rep(1:10, each=100), 3),
m = runif(3000, 50, 100)
)
library(ggplot2)
dats <- lapply(1:max(dat$Week), \(i){
tmp <- subset(dat, Week <= i)
tmp$plot_week <- i
tmp})
dats <- do.call(rbind, dats)
table(dat$Week)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 300 300 300 300 300 300 300 300 300 300
table(dats$plot_week)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 300 600 900 1200 1500 1800 2100 2400 2700 3000
ggplot(data=dats, aes(x=as.factor(plot_week), fill = Course, y=(m), group=interaction(Course, plot_week))) +
geom_boxplot()+
scale_y_continuous(limits = c(-2, 100), breaks = seq(0, 100, by = 20))+
xlab('Week') +
ylab('Course-Level SE')
Created on 2022-10-18 by the reprex package (v2.0.1)

Basically the same idea as by #DaveArmstrong but using lapply with multiple geom_boxplots.
Note 1: To make the example a bit more realistic I use some random fake example data.
Note 2: I added an additional geom_point just to check that the number of obs. is actually increasing for each week.
set.seed(123)
d <- data.frame(
Course = rep(c("UGS200H", "CMSE201", "PHY215"), each = 40),
Week = rep(1:4, 30),
m = runif(120, 40, 100)
)
library(ggplot2)
ggplot(data=d, aes(x = factor(Week), fill = Course, y=m)) +
lapply(unique(d$Week), function(x) {
list(
geom_boxplot(data = subset(d, Week <= x) |> transform(Week = x), position = "dodge"),
geom_point(data = subset(d, Week <= x) |> transform(Week = x), position = position_dodge(.9), alpha = .2)
)
}) +
labs(x = 'Week', y = 'Course-Level SE')

Related

How to plot data from two data frames together as a stacked bar chart in R?

I have two dataframes called dbDateSubMSortNacional and dbDateSubMSortNacional, which contain a series of dates and values of covid infections for males and females in a certain region for the corresponding dates. I have plotted them individually as
contagiosH=barplot(dbDateSubHSortNacional$total[1:5], names.arg = dbDateSubHSortNacional$fecha[1:5], cex.names=0.759, , cex.axis=1, main = "Masculino", xlab="Fechas yy-mm-dd" , ylab="Número de contagios", col = rainbow(5), border = "white", ylim=c(0, 5000))
text(contagiosH, dbDateSubHSortNacional$total[1:5], dbDateSubHSortNacional$total[1:5])
contagiosM=barplot(dbDateSubMSortNacional$total[1:5], names.arg = dbDateSubMSortNacional$fecha[1:5], cex.names=0.759, , cex.axis=1, main = "Femenino", xlab="Fechas yy-mm-dd" , ylab="Número de contagios", col = rainbow(5), border = "white", ylim=c(0, 5000))
text(contagiosM, dbDateSubMSortNacional$total[1:5], dbDateSubMSortNacional$total[1:5])
I was wondering if it were possible to plot them as the graph in the attached figure.
I have managed to do it independently for a single dataframe called datosas
barplot(datos, beside=F, col=rainbow(5), legend.text = row.names(datos),
args.legend=list(title ="RRSS preferidas", x="top", inset=c(0, -0.45)))
I have tried to merge both approaches as
barplot(dbDateSubMSortNacional$total[1:5], dbDateSubHSortNacional$total[1:5], names.arg = dbDateSubMSortNacional$fecha[1:5], cex.names=0.759, , cex.axis=1, col = rainbow(5), border = "white", ylim=c(0, 4000))
Unfortunately, I get a single graph. Can someone please tell me how to achieve my goal?
EDIT: To generate a reproducible output, use
fecha <- c("2022-01-01","2022-01-02","2022-01-03", "2022-01-04", "2022-01-05", "2022-01-06", "2022-01-07")
total <- c(1, 3, 5, 7, 9, 11, 13)
dbDateSubMSortNacional <- data.frame(fecha, total);
total <- c(2, 4, 6, 8, 10, 12, 14)
dbDateSubHSortNacional <- data.frame(fecha, total);
Problem with the suggested solution by #jdobres:
Returns the error Error in cbind(total.2022 - 1 - 10, total.2022 - 1 - 11, total.2022 - : object 'total.2022' not found in
barplot(cbind(total.2022-01-10, total.2022-01-11, total.2022-01-04, total.2022-01-05, total.2022-01-12) ~ set, data = combined_wide, col = rainbow(5))
A dplyr and ggplot solution, followed by a base R barplot option:
library(dplyr, warn = FALSE)
library(tidyr)
library(ggplot2)
df1 <- bind_rows(dbDateSubHSortNacional, dbDateSubMSortNacional)
ggplot(df1)+
geom_col(aes(x = set, y = total, fill = fecha))
#get data into wide format:
df2 <-
df1 |>
pivot_wider(names_from = fecha, values_from = total)
barplot(cbind(`2022-01-10`, `2022-01-11`, `2022-01-04`,`2022-01-05`, `2022-01-12`, `2022-01-17`, `2021-12-19`)~set,
data = df2 ,
xlim = c(0,1),
width = 0.30,
col = rainbow(7),
xlab = "Sex",
ylab = "Frequency",
legend.text = colnames(df2)[-1],
args.legend = list(x = "topright"))
Created on 2022-10-15 with reprex v2.0.2
data
fecha <- c("2022-01-10","2022-01-11","2022-01-04","2022-01-05", "2022-01-12","2022-01-17", "2021-12-19")
total <- c(1, 3, 5, 7, 9, 11, 13)
dbDateSubMSortNacional <- data.frame(fecha, total)
total <- c(2, 4, 6, 8, 10, 12, 14)
dbDateSubHSortNacional <- data.frame(fecha, total);
dbDateSubHSortNacional$set <- 'H'
dbDateSubMSortNacional$set <- 'M'
First you will need to combine your two data frames into one:
dbDateSubHSortNacional$set <- 'H'
dbDateSubMSortNacional$set <- 'M'
combined <- rbind(dbDateSubHSortNacional, dbDateSubMSortNacional)
fecha total set
1 Monday 2 H
2 Tuesday 4 H
3 Wendesday 6 H
4 Thuersday 8 H
5 Friday 10 H
6 Saturday 12 H
7 Sunday 14 H
8 Monday 1 M
9 Tuesday 3 M
10 Wendesday 5 M
11 Thuersday 7 M
12 Friday 9 M
13 Saturday 11 M
14 Sunday 13 M
If you want to plot this data with barplot, it would be easier to first reshape the data to "wide" format:
combined_wide <- reshape(combined, direction = 'wide', idvar = 'set', timevar = 'fecha')
set total.Monday total.Tuesday total.Wendesday total.Thuersday total.Friday total.Saturday total.Sunday
1 H 2 4 6 8 10 12 14
8 M 1 3 5 7 9 11 13
And then the barplot command becomes:
barplot(cbind(total.Monday, total.Tuesday, total.Wendesday, total.Thuersday, total.Friday) ~ set, data = combined_wide, col = rainbow(5))
You could also use ggplot2 to create a similar plot, without having to widen the data set:
library(ggplot2)
ggplot(data = subset(combined, !(fecha %in% c('Saturday', 'Sunday'))), aes(x = set, y = total, fill = fecha)) +
geom_col()

Create a Loop in R on participants; per participant 1 boxplot .tiff is created

Dear Stack overflow community,
I would like to perform a loop of 4 participants with different .csv files (1 per participant).
For each subject I would like to produce a bar plot relying "sex(male, female)" on the x axis with the "Proportion of correct answer" on the y axis).
Here is showed how each .csv file appear when opened:
I want to add names to the factors c("observernumber","trial","numstim","memo","sex","corResp","curresp","success")
1 1 284 high female a a 1
2 2 190 low male l l 1
3 3 224 low male l l 1
4 4 218 high male l l 1
5 5 137 high male l l 1
6 6 45 high female a a 1
7 7 87 high female a a 1
8 8 249 high female a a 1
9 9 27 low male l l 1
10 10 53 low male l l 1
11 11 92 low female a a 1
12 12 283 low male l l 1
I start by giving the .csv names per participant:
obsNames = c("1_gender","2_gender","3_gender","4_gender")
nsubj <- length(obsNames)
I preallocate a matrix reading the obsNames:
allData = c()
for (i in 1:obsNames) {
dat = read.csv(sprintf("%s_gender.csv", obsNames[i]), header = FALSE, sep = ";")
obsIndex = rep(i,nrow(dat))
allData = rbind(allData, cbind(obsIndex,dat))
}
I name the total factors I have:
names(allData) = c("subj", "trial", "sex", "success")
I want to plot for each of the 4 current subjects a .tiff file that has to show a)b)c):
a) Name of the current observer as title
paste(obsNames[curSubj], ".tiff", sep = "")
b) "sex(male, female)" factor on the x axis
"sex"
c)with the "Proportion of correct answer" factor on the y axis
"success"
I would like to produce a boxplot for each of the participants, how should I finish this script?
How do I name x and y values?:
for(curSubj in 1:nsubj){
tiff(file = paste(obsNames[curSubj], ".tiff", sep = ""))
barCenters = barplot(
main = paste(obsNames[curSubj],".tiff",sep = ""),
yaxt = "n",
ylab = "Proportion Correct",
xlab = "Gender"
)
mgp.axis(labels = c("Female","Male"))
dev.off()
}
Is there an error using???
curSubj
The following creates one TIFF file with a bar plot per CSV file whose names match the pattern "gender.csv".
It uses ggplot2 graphics.
First, read in the files.
csv_files <- list.files(pattern = "gender\\.csv")
df_list <- lapply(csv_files, read.csv)
obsNames <- sub("\\.csv", "", csv_files)
df_list <- lapply(seq_along(df_list), function(i){
Y <- cbind.data.frame(obsIndex = obsNames[i], df_list[[i]])
})
Now the bar plots, saved in TIFF files.
library(ggplot2)
library(scales)
for(i in seq_along(df_list)){
g <- ggplot(df_list[[i]], aes(x = sex)) +
geom_bar(aes(y = (..count..)/sum(..count..)),
stat = "count", position = "dodge") +
scale_y_continuous(labels = percent) +
ylab(label = "Proportion of correct answer") +
ggtitle(obsNames[i])
outfile <- paste0(obsNames[i], ".tiff")
tiff(filename = outfile)
print(g)
dev.off()
}
Data creation code.
The following code creates 4 csv files in thee working directory, with the structure of the table posted in the question.
f <- function(i, n){
sex <- sample(c("male", "female"), n, TRUE)
success <- rbinom(n, 1, prob = 0.5)
df <- data.frame(trial = seq_len(n), sex, success)
write.csv(df, paste0(i, "_gender.csv"), quote = FALSE, row.names = FALSE)
}
set.seed(1234)
n <- sample(20:50, 4)
lapply(1:4, function(i) f(i, n = n[i]))

How to order boxes in boxplots by the medians of a numerical variable in a dataframe in base R

I have a dataframe with three variables; one ("group") is a factor with two levels, one ("word") is a character vector, and one ("duration") is numeric. For example:
DATA <- data.frame(
group = c(rep("prefinal",10), rep("final", 10)),
word = c(sample(LETTERS[1:5], 10, replace = T), sample(LETTERS[1:5], 10, replace = T)),
duration = rnorm(20)
)
DATA
group word duration
1 prefinal C 0.16378771
2 prefinal E 0.13370196
3 prefinal A 0.69112398
4 prefinal B 0.21499187
5 prefinal D -0.28998279
6 prefinal D -2.00353522
7 prefinal A 0.37842555
8 prefinal E 1.62326170
9 prefinal A -0.26294929
10 prefinal B -0.54276322
11 final D 1.32772171
12 final E -1.84902285
13 final C 0.01058158
14 final E 1.49529743
15 final B 0.55291290
16 final A -0.35484820
17 final D -0.16822110
18 final A 0.88667458
19 final E 0.70889916
20 final B 1.12217332
I'd like to depict the durations of the words by group in boxplots:
boxplot(DATA$duration ~ DATA$group + DATA$word,
xaxt="n",
col = rep(c("blue", "red"), 5))
axis(1, at = seq(from=1.5, to= 10.5, by=2), labels = sort(unique(DATA$word)), cex.axis = 0.9)
R seems to order the boxes in alphabetical order (of the "word" variable) by default.
EDIT:
However I'd prefer that the boxes be sorted by the median durations (in descending order) the items in the "word" variable have in the "prefinal" group. How can that be achieved?
You can reorder the levels of DATA$wordaccording to their median. The - before DATA$duration is to sort it in descending order.
DATA$word <- reorder(DATA$word, -DATA$duration, FUN = median)
boxplot(DATA$duration ~ DATA$group + DATA$word,
xaxt="n",
col = rep(c("blue", "red"), 5))
axis(1, at = seq(from=1.5, to= 10.5, by=2), labels = levels(DATA$word), cex.axis = 0.9)
You can do the same for the subgroup of prefinal. But it requires an additional step:
ordered_levels <- levels(with(DATA[DATA$group == "prefinal",], reorder(word, -duration, FUN = median)))
DATA$word <- factor(DATA$word, levels = ordered_levels)

plot r two categorical variables

I am using below command to plot two categorical variables in R
gender has 2 levels and Income has 9 levels.
spineplot(main$Gender,main$Income, xlab="Gender", ylab="Income levels: 1 is lowest",xaxlabels=c("Male","Female"))
It produces chart like below
How can i plot this chart in color?
How can i show % of each income level within each box? for example female income level 1 has 21% of data. How can i show 21% within the dark colored area?
################update 1
Adding reproducible example
fail <- factor(c(2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1,
1, 1, 1, 2, 1, 1, 1, 1, 1,2,2,2,2),
levels = c(1, 2), labels = c("male", "female"))
gender <- factor(rep(c(1:9),3))
spineplot(fail,gender)
I think it may be easier to do this with a barplot since spineplot doesn't return anything useful.
The default would be the following, but you can adjust the widths of the bars to some other variable (you can see the x-axis coordinates are returned):
par(mfrow = 1:2)
(barplot(table(gender, fail)))
# [1] 0.7 1.9
(barplot(table(gender, fail), width = table(fail)))
# [1] 10.7 26.9
With some final touches we get
tbl <- table(gender, fail)
prp <- prop.table(tbl, 2L)
yat <- prp / 2 + apply(rbind(0, prp[-nrow(prp), ]), 2L, cumsum)
bp <- barplot(prp, width = table(fail), axes = FALSE, col = rainbow(nrow(prp)))
axis(2L, at = yat[, 1L], labels = levels(gender), lwd = 0)
axis(4L)
text(rep(bp, each = nrow(prp)), yat, sprintf('%0.f%%', prp * 100), col = 0)
Compare to
spineplot(fail, gender, col = rainbow(nlevels(gender)))
An alternative to the interesting solution of #rawr is:
fail <- factor(c(2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1,
1, 1, 1, 2, 1, 1, 1, 1, 1,2,2,2,2),
levels = c(1, 2), labels = c("male", "female"))
gender <- factor(rep(c(1:9),3))
mypalette <- colorRampPalette(c("lightblue","darkblue"))
tbl <- spineplot(fail, gender, xlab="Gender", ylab="Income levels: 1 is lowest",
xaxlabels=c("Male","Female"), col=mypalette(nlevels(gender)) )
print(tbl)
# Income levels: 1 is lowest
# Gender 1 2 3 4 5 6 7 8 9
# male 2 1 2 1 3 2 2 2 1
# female 1 2 1 2 0 1 1 1 2
print.perc <- function(k, tbl, ndigits=2, str.pct="%") {
# These lines of codes are the same used by from spineplot
# for the calculation of the x-position of the stacked bars
nx <- nrow(tbl)
off <- 0.02
xat <- c(0, cumsum(prop.table(margin.table(tbl, 1)) + off))
posx <- (xat[1L:nx] + xat[2L:(nx + 1L)] - off)/2
# Proportions by row (gender)
ptbl <- prop.table(tbl,1)
# Define labels as strings with a given format
lbl <- paste(format(round(100*ptbl[k,], ndigits), nsmall=ndigits), str.pct, sep="")
# Print labels
# cumsum(ptbl[k,])-ptbl[k,]/2 is the vector of y-positions
# for the centers of each stacked bar
text(posx[k], cumsum(ptbl[k,])-ptbl[k,]/2, lbl)
}
# Print income levels for males and females
strsPct <- c("%","%")
for (k in 1:nrow(tbl)) print.perc(k, tbl, ndigits=2, str.pct=strsPct[k])
Hope it can help you.

Creating a cumulative step graph in R

Say I have this example data frame
set.seed(12345)
n1 <- 3
n2 <- 10
n3 <- 60
times <- seq(0, 100, 0.5)
individual <- c(rep(1, n1),
rep(2, n2),
rep(3, n3))
events <- c(sort(sample(times, n1)),
sort(sample(times, n2)),
sort(sample(times, n3)))
df <- data.frame(individual = individual, events = events)
Which gives
> head(df, 10)
individual events
1 1 72.0
2 1 75.5
3 1 87.5
4 2 3.0
5 2 14.5
6 2 16.5
7 2 32.0
8 2 45.5
9 2 50.0
10 2 70.5
I would like to plot a cumulative step graph of the events so that I get one line per individual which goes up by 1 each time an event is "encountered".
So, for instance individual 1 will be 0 up to 72.0, then go up to 1, until 75.5 when it becomes 2 and up to 3 at 87.5 to the end of the graph.
What would be the easiest way to do that?
df$step <- 1
library(plyr)
df <- ddply(df,.(individual),transform,step=cumsum(step))
plot(step~events,data=df[df$individual==1,],type="s",xlim=c(0,max(df$events)),ylim=c(0,max(df$step)),xlab="time",ylab="step")
lines(step~events,data=df[df$individual==2,],type="s",col=2)
lines(step~events,data=df[df$individual==3,],type="s",col=3)
There is also the stepfun function in the stats package. Using that, you could use the plot method for that object class:
sdf <- split(df, individual)
plot(1, 1, type = "n", xlim = c(0, max(events)), ylim = c(0, max(table(individual))),
ylab = "step", xlab = "time")
sfun <- lapply(sdf, function(x){
sf <- stepfun(sort(x$events), seq_len(nrow(x) + 1) - 1)
plot(sf, add = TRUE, col = unique(x$individual), do.points = FALSE)
})
Use ggplot2:
library(ggplot2)
# Add step height information with sequence and rle
df$step <- sequence(rle(df$individual)$lengths)
# plot
df$individual <- factor(df$individual)
ggplot(df, aes(x=events, group=individual, colour=individual, y=step)) +
geom_step()

Resources