Create multi-factor geom_bar with distributions assoicated with each other - r

Take the following sample data:
set.seed(123456)
#
Male_alive <- rbinom(100,1,0.6)
Male_age <- sample(20:80, 100, T)
#
Female_alive <- rbinom(100,1,0.7)
Female_age <- sample(20:80, 100, T)
#
Alive <- c(Male_alive, Female_alive)
Age <- c(Male_age, Female_age)
Sex <- c(rep('Male', length(Male_alive)),rep('Female', length(Female_alive)))
#
Patients <- data.frame(Alive, Age, Sex)
I can create a simple bar plot with the following code:
ggplot(Patients, aes(Sex, fill = factor(Alive))) +
geom_bar(position = "fill")
But I want to extend on this, by creating a multi-factored bar plot (for Sex and AgeGr) that looks like the image (colours are not important):
#
Patients$AgeGr <- cut(Patients$Age, 6)
using
ggplot(Patients, aes(..., fill = factor(Alive))) +
geom_bar(position = "fill") + geom_wrap(~Sex)
Where the AgeGr is only fills up to the corresponding height of Alive

Perhaps this could be a terrible way to do it:
#to get plots side by side
library(gridExtra)
#plot count of males
pmales <-ggplot(Patients[Patients$Sex=='Male',], aes(Sex, fill =factor(Alive))) + geom_bar(position='fill')
#plot grage males0
pagegrmales0 <-ggplot(Patients[Patients$Sex=='Male' & Patients$Alive==0,], aes(Sex, fill =factor(AgeGr))) + geom_bar(position='fill') + ylab(NULL) +xlab(NULL) + theme(legend.position="none", plot.margin=unit(c(1,1,-0.5,1), "cm"))
#plot grage males1
pagegrmales1 <-ggplot(Patients[Patients$Sex=='Male' & Patients$Alive==1,], aes(Sex, fill =factor(AgeGr))) + geom_bar(position='fill') + ylab(NULL) +xlab(NULL) + theme(legend.position="none", plot.margin=unit(c(-0.5,1,1,1), "cm"))
factorsmale <- grid.arrange(pagegrmales0, pagegrmales1, heights=c(prop.table(table(Patients[Patients$Sex=='Male',]$Alive))[[1]], prop.table(table(Patients[Patients$Sex=='Male',]$Alive))[[2]]), nrow=2)
males <- grid.arrange(pmales, factorsmale, ncol =2, nrow= 2)
########
#plot count of females
pfemales <-ggplot(Patients[Patients$Sex=='Female',], aes(Sex, fill =factor(Alive))) + geom_bar(position='fill')
#plot grage females0
pagegrfemales0 <-ggplot(Patients[Patients$Sex=='Female' & Patients$Alive==0,], aes(Sex, fill =factor(AgeGr))) + geom_bar(position='fill') + ylab(NULL) +xlab(NULL) + theme(legend.position="none", plot.margin=unit(c(1,1,-0.5,1), "cm"))
#plot grage females1
pagegrfemales1 <-ggplot(Patients[Patients$Sex=='Female' & Patients$Alive==1,], aes(Sex, fill =factor(AgeGr))) + geom_bar(position='fill') + ylab(NULL) +xlab(NULL) + theme(legend.position="none", plot.margin=unit(c(-0.5,1,1,1), "cm"))
factorsfemale <- grid.arrange(pagegrfemales0, pagegrfemales1, heights=c(prop.table(table(Patients[Patients$Sex=='Female',]$Alive))[[1]], prop.table(table(Patients[Patients$Sex=='Female',]$Alive))[[2]]), nrow=2)
females <- grid.arrange(pfemales, factorsfemale, ncol =2, nrow= 2)
grid.arrange(males, females, ncol = 2, nrow = 1)

Combine the alive and agegroup columns
Patients$Alive_AgeGr <- paste(Patients$Alive, Patients$AgeGr, sep="_")
Plot
ggplot(Patients, aes(x = factor(Alive), fill = factor(AgeGr))) +
geom_bar(position = "fill") + facet_wrap(~Sex)

First, you need to create long format dataset with only required columns and then plot the stacked-bar plot with wrap, as below:
library("reshape2")
mPatients <- melt(Patients[,-2], id.vars = "Sex")
ggplot(mPatients, aes(x=variable, fill = factor(value))) +
geom_bar(position = "fill") + facet_wrap(~Sex)

Related

Y axis values different from actual column in dataset in R

I am currently working with a dataset of "world bank islands". In that, I am trying to plot the population Vs country graph for each year. Below is the code that I have done.
library(ggplot2)
options(scipen = 999)
bank <- read.csv("C:/Users/True Gamer/OneDrive/Desktop/world_bank_international_arrivals_islands.csv")
bank[bank == "" | bank == "."] <- NA
bank$country <- as.numeric(bank$country)
bank$year <- as.numeric(bank$year)
bank$areakm2 <- as.numeric(bank$areakm2)
bank$pop <- as.numeric(bank$pop)
bank$gdpnom <- as.numeric(bank$gdpnom)
bank$flights...WB <- as.numeric(bank$flights...WB)
bank$hotels <- as.numeric(bank$hotels)
bank$hotrooms <- as.numeric(bank$hotrooms)
bank$receipt <- as.numeric(bank$receipt)
bank$ovnarriv <- as.numeric(bank$ovnarriv)
bank$dayvisit <- as.numeric(bank$dayvisit)
bank$arram <- as.numeric(bank$arram)
bank$arreur <- as.numeric(bank$arreur)
bank$arraus <- as.numeric(bank$arraus)
str(bank)
plot1 <- ggplot(bank, aes(x=country,y=pop)) + geom_bar(stat = "identity",aes(fill=year)) + ggtitle("Population of each country yearwise") + xlab("Countries") + ylab("Population")
plot1
However, when I do this, the y values shown on the graph are different from the actual y values. This is the link to the dataset
The problem is that you are stacking the bars (this is default behaviour). Also, geom_bar(stat = "identity") is just a long way of writing geom_col. One further point to note is that since all your columns are numeric, the single line:
bank <- as.data.frame(lapply(bank, as.numeric))
replaces all your individual numeric conversions.
The plot you are trying to create would be something like this:
ggplot(bank, aes(x = country, y = pop)) +
geom_col(aes(fill = factor(year)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Countries") +
ylab("Population") +
labs(fill = "Year") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = 1:27)
However, it would probably be best to present your data in a different way. Perhaps, if you are comparing population growth, something like this would be better:
ggplot(bank, aes(x = year, y = pop)) +
geom_line(aes(color = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
Or with bars:
ggplot(bank, aes(x = year, y = pop)) +
geom_col(aes(fill = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")

How to fit a lot of x labels or space out x labels in ggplot2

Is there a way to space out so that each x label is more distinguishable?
I feel like this question has been asked before but I can't seem to find an answer. I believe the graph needs to be bigger for this to work, is their a way to make the graph bigger within Rstudio? or make the text smaller
My code so far :
bar_plt = ggplot(data, aes(fct_infreq(Event))) + geom_bar(fill = "dodgerblue", width = .4) +
xlab("Event Names") + ylab("Number of Observations") + coord_flip()
TIA
what about working with labels in this way (sorry for the fake data, but I have not got a sample of your):
library(ggplot2)
# numbers
set.seed(1)
y<-sample(1:30, 500, TRUE)
# very long and numerous labels
x <- paste(sample(letters[1:22], 500, TRUE),sample(letters[1:2], 500, TRUE),'abcdefghijklmnopqrstuvwxyz')
data <- data.frame(x,y)
# simple ggplot barplot
p <- ggplot(data, aes(x = x, y = y)) + geom_bar(stat = "identity") + coord_flip()
# play with the size to have a fitting dimension
p <- p + theme(axis.text.y = element_text(face="bold", color="black", size=8))
# you can also abbreviate the labels if necessary
p <- p + scale_x_discrete(labels = abbreviate)
p
Your plot could be something like:
library(forcats)
library(ggplot2)
# data
set.seed(1)
Events <- paste(sample(letters[1:22], 500, TRUE),sample(letters[1:2], 500, TRUE),'abcdefghijklmnopqrstuvwxyz')
data <- data.frame(Events)
bar_plt <- ggplot(data, aes(fct_infreq(Events))) + geom_bar(fill = "dodgerblue", width = .4) + coord_flip()
bar_plt <- bar_plt + xlab("Event Names") + ylab("Number of Observations")
bar_plt <- bar_plt + theme(axis.text.y = element_text(face="bold", color="black", size=8))
bar_plt <- bar_plt + scale_x_discrete(labels = abbreviate)
bar_plt

Deleting an entire row of facets of unused factor level combination

I want to remove the 2nd row of facets from my plot below because there is no data for that factor combination.
library(ggplot2)
library(grid)
set.seed(5000)
# generate first df
df1 = data.frame(x=rep(rep(seq(2,8,2),4),6),
y=rep(rep(seq(2,8,2),each=4),6),
v1=c(rep("x1",32),rep("x2",64)),
v2=c(rep("y1",64),rep("y2",32)),
v3=rep(rep(c("t1","t2"),each=16),3),
v4=rbinom(96,1,0.5))
# generate second df
df2 = data.frame(x=runif(20)*10, y=runif(20)*10,
v1=sample(c("x1","x2"),20,T))
# plot
ggplot() +
geom_point(data=df1, aes(x=x, y=y, colour = factor(v4)), shape=15, size=5) +
scale_colour_manual(values = c(NA,"black")) + facet_grid(v1+v2~v3, drop = T) +
geom_point(data=df2, aes(x=x,y=y), shape=23 , colour="black", fill="white", size=4) +
coord_equal(ratio=1) + xlim(0, 10) + ylim(0, 10)
I tried to use the idea from this post..
g=ggplotGrob(y)
pos=which(g$layout$t==5 | g$layout$t==6)
g$layout=g$layout[-c(pos),]
g$grobs=g$grobs[-c(pos)]
grid.newpage()
grid.draw(g)
..but got this.
How do I eliminate the white space? Also, is there a straightforward solution to this, without having to manipulate the grobs, etc?
Just modify the data:
df2 <- rbind(cbind(df2, v2 = "y1"),
cbind(df2, v2 = "y2"))
df2 <- df2[!(df2$v1 == "x1" & df2$v2 == "y2"),]
# plot
ggplot() +
geom_point(data=df1, aes(x=x, y=y, colour = factor(v4)), shape=15, size=5) +
scale_colour_manual(values = c(NA,"black")) + facet_grid(v1+v2~v3, drop = T) +
geom_point(data=df2, aes(x=x,y=y), shape=23 , colour="black", fill="white", size=4) +
coord_equal(ratio=1) + xlim(0, 10) + ylim(0, 10)

How to find Percent Frequency gg plot

There have been a few questions on here asking how to plot percent frequency. I have tried implementing the suggestions but am still having trouble.
I have the following vector:
var <- c(2,2,1,0,1,1,1,1,1,3,2,3,3,5,1,4,4,0,3,4,1,0,3,3,0,0,
1,3,2,6,2,2,2,1,0,2,3,2,0,0,0,0,3,2,2,4,3,2,2,0,4,1,0,1,3,1,4,3,1,2,
6,7,6,1,2,2,4,5,3,0,6,5,2,0,7,1,7,3,1,4,1,1,2,1,1,2,1,1,4,2,0,3,3,2,2,2,5,3,2,5,2,5)
I plotted a histogram using the following code:
df <- data.table(x = var)
df <- df[, .N, by=x]
df$x <- factor(df$x, levels=c(0:25))
p <- ggplot(df, aes(x=x, y= N)) +
geom_bar(
stat="identity", width=1.0,
colour = "darkgreen",
fill = 'paleturquoise4'
)
p <- p + labs(scale_x_discrete(drop=FALSE) )
p = p + coord_cartesian(ylim=c(0, 50)) +
scale_y_continuous(breaks=seq(0, 50, 2))
print(p)
I tried using the following but it does not work.
p <- ggplot(df, aes(x=x, y= N)) +
geom_bar(
aes(y = (..count..)/sum(..count..)),
stat="identity", width=1.0,
colour = "darkgreen",
fill = 'paleturquoise4'
)
One thing you can do is that you can do the calculation before you draw the graphic. But, if I follow your approach, you would want something like this.
ggplot(df, aes(x=x)) +
geom_bar(aes(y = N/sum(N)), stat="identity", width=1.0,
colour = "dark green", fill = 'paleturquoise4') +
ylab("y")

ggplot: relative frequencies of two groups

I want a plot like this except that each facet sums to 100%. Right now group M is 0.05+0.25=0.30 instead of 0.20+0.80=1.00.
df <- rbind(
data.frame(gender=c(rep('M',5)), outcome=c(rep('1',4),'0')),
data.frame(gender=c(rep('F',10)), outcome=c(rep('1',7),rep('0',3)))
)
df
ggplot(df, aes(outcome)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
facet_wrap(~gender, nrow=2, ncol=1)
(Using y = ..density.. gives worse results.)
here's another way
ggplot(df, aes(outcome)) +
geom_bar(aes(y = ..count.. / sapply(PANEL, FUN=function(x) sum(count[PANEL == x])))) +
facet_wrap(~gender, nrow=2, ncol=1)
I usually do this by simply precalculating the values outside of ggplot2 and using stat = "identity":
df1 <- melt(ddply(df,.(gender),function(x){prop.table(table(x$outcome))}),id.vars = 1)
ggplot(df1, aes(x = variable,y = value)) +
facet_wrap(~gender, nrow=2, ncol=1) +
geom_bar(stat = "identity")

Resources