ggplot grouped barchart based on marginal proportions - r

I am trying to create a grouped barplot that uses marginal (row) proportions rather than cell proportions and can't figure out how to change:
y = (..count..)/sum(..count..)
in ggplot to do this.
Using the mtcars dataset as an example, consider two categorical variables, cyl and am (purely for the sake of the example, take cyl as the response and am as the explanatory variable). Can anyone help me do this:
data(mtcars)
# Get Proportions
mtcars_xtab <- table(mtcars$cyl,mtcars$am)
mtcars_xtab
margin.table(mtcars_xtab, 1) # A frequencies (summed over B)
margin.table(mtcars_xtab, 2) # B frequencies (summed over A)
prop.table(mtcars_xtab) # cell percentages - THIS IS WHAT'S USED IN THE PLOT
prop.table(mtcars_xtab, 1) # row percentages - THESE ARE WHAT I WANT TO USE IN THE PLOT
# Make Plot
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)
ggplot(mtcars, aes(x=am, fill=cyl)) +
  geom_bar(aes(y = (..count..)/sum(..count..)), position = "dodge") +
  scale_fill_brewer(palette="Set2")
Thank you.
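One possible way to get these row proportions onto the plot (a sketch of my own, not part of the original question): pre-compute them with prop.table(..., margin = 1) and plot the result with geom_col() instead of relying on the computed ..count.. variable.
# Sketch, not from the original post: plot pre-computed row proportions
library(ggplot2)
mtcars$cyl <- as.factor(mtcars$cyl)
mtcars$am <- as.factor(mtcars$am)
# Row proportions: within each cyl level, the proportions across am
prop_df <- as.data.frame(prop.table(table(mtcars$cyl, mtcars$am), margin = 1))
names(prop_df) <- c("cyl", "am", "prop")
ggplot(prop_df, aes(x = am, y = prop, fill = cyl)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Set2")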

Related

How to adjust ggplot with facet to just show one or two variables per Plot view

I am having trouble adjusting a boxplot of numerical variables using ggplot: I would like to adjust the console Plot view so that it shows only one or two variables at a time.
Here's the code I am using:
##################################################################################
# Data
##################################################################################
data("diamonds")
basePlot <- diamonds[ names(diamonds)[!names(diamonds) %in% c("color", "clarity")] ]
##################################################################################
library(ggplot2)
library(tidyr) # pivot_longer
ggplot(pivot_longer(basePlot, -cut, names_to="var", values_to="val"),
       aes(cut, val, color=cut)) +
  geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1, notch=FALSE) +
  xlab("Diamond Cut") +
  facet_wrap(~var, nrow=2, scales="free") +
  scale_x_discrete(guide=guide_axis(n.dodge=2))
The expected output would show just one or two variables per view: for example, carat and depth in one view, price and table in another, and so on.
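One possible approach (a sketch of my own, not from the original post): keep the faceting as is, but filter the long data down to the one or two variables you want before plotting; the plot_vars helper below is hypothetical. ggforce::facet_wrap_paginate() would be another option, splitting the facets across several pages.
# Sketch, not from the original post: one view per chosen pair of variables
library(ggplot2)
library(dplyr)
library(tidyr)
long_df <- pivot_longer(basePlot, -cut, names_to="var", values_to="val")
# Plot only the requested variables in a single view
plot_vars <- function(keep) {
  ggplot(filter(long_df, var %in% keep), aes(cut, val, color=cut)) +
    geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1, notch=FALSE) +
    xlab("Diamond Cut") +
    facet_wrap(~var, nrow=1, scales="free") +
    scale_x_discrete(guide=guide_axis(n.dodge=2))
}
plot_vars(c("carat", "depth")) # first view
plot_vars(c("price", "table")) # second view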

Best Way to Plot three vectors in R?

I have a vector of length 10k for each of the variables x and z. For each of the 10k, I have also estimated propensity scores using logit and other methods. So I have another vector that contains the predicted propensity scores.
I want to plot the predicted propensity vector as the height of a 3D graph, as a function of the x and z vectors (I want something like a surface). What is the best way to go about doing this? I tried using scatter3d() from the plot3d library and it looks very bad.
Sample data: https://www.dropbox.com/s/1lf36dpxvebd7kw/mydata2.csv?dl=0
Updated Answer
Using the data you provided, we can bin the data, get the average propensity score by bin and plot using geom_tile. I provide code for that below. A better option would be to fit the propensity score model using the x and z vectors (and the binary treatment variable that you're predicting). Then, create a new data frame of predicted pz_p values on a complete grid of x and z values and plot that. I don't have your binary treatment variable with which to fit the model, so I haven't produced an actual plot, but the code would look something like this:
# Propensity score model
m1 = glm(treat ~ x + z, data=dat, family=binomial)
# Get propensity scores on full grid of x and z values
n = 100 # Number of grid points. Adjust as needed.
pred.dat = expand.grid(x=seq(min(dat$x), max(dat$x), length=n),
                       z=seq(min(dat$z), max(dat$z), length=n))
pred.dat$pz_p = predict(m1, newdata=pred.dat, type="response")
ggplot(pred.dat, aes(x, z, fill=pz_p)) +
  geom_tile() +
  scale_fill_gradient2(low="red", mid="white", high="blue", midpoint=0.5, limits=c(0,1))
Code for tile plot with binned data:
library(tidyverse)
theme_set(theme_classic())
dat = read_csv("mydata2.csv")
# Bin by x and z
dat = dat %>%
  mutate(xbin = cut(x, breaks=seq(round(min(x),1)-0.05, round(max(x),1)+0.05, 0.1),
                    labels=seq(round(min(x),1), round(max(x),1), 0.1)),
         xbin = as.numeric(as.character(xbin)),
         zbin = cut(z, breaks=seq(round(min(z),1)-0.1, round(max(z),1)+0.1, 0.2),
                    labels=seq(round(min(z),1), round(max(z),1), 0.2)),
         zbin = as.numeric(as.character(zbin)))
# Calculate average pz_p by bin and then plot
ggplot(dat %>% group_by(xbin, zbin) %>%
         summarise(pz_p=mean(pz_p)),
       aes(xbin, zbin, fill=pz_p)) +
  geom_tile() +
  scale_fill_gradient2(low="red", mid="white", high="blue", midpoint=0.5, limits=c(0,1))
Original Answer
A heat map might work well here. For example:
library(ggplot2)
# Fake data
set.seed(2)
dat = expand.grid(x=seq(0, 10, length=100),
                  z=seq(0, 10, length=100))
dat$ps = 1/(1 + exp(0.3 + 0.2*dat$x - 0.5*dat$z))
ggplot(dat, aes(x, z, fill=ps)) +
  geom_tile() +
  scale_fill_gradient2(low="red", mid="white", high="blue", midpoint=0.5, limits=c(0,1)) +
  coord_equal()
Or in 3D with rgl::persp3d:
library(rgl)
library(tidyverse)
# Build the inputs persp3d() expects: sorted unique x and z values plus a
# matrix of ps values with one row per x value and one column per z value
x = unique(sort(dat$x))
z = unique(sort(dat$z))
ps = dat %>% spread(z, ps) %>% select(-1) %>% as.matrix
persp3d(x, z, ps, col="lightblue")

r: Blank graph when plotting multiple lines on scatterplot

My goal is to produce a graph showing the differences between regression lines that treat a predictor as continuous vs categorical. I'm using the "SleepStudy" dataset from Lock5Data, and I want to show the regression lines predicting GPA from ClassYear treated either as continuous or as categorical. The code is below:
library(Lock5Data)
data("SleepStudy")
fit2 <- lm(GPA ~ factor(ClassYear), data = SleepStudy)
fit2_line <- aggregate(fit2$fitted.values ~ SleepStudy$ClassYear, FUN = mean)
colnames(fit2_line) <- c('ClassYear','GPA')
options(repr.plot.width=5, repr.plot.height=5)
library(ggplot2)
ggplot() +
  geom_line(data=fit2_line, aes(x=ClassYear, y=GPA)) + # Fit line, ClassYear factor
  geom_smooth(data=SleepStudy, method='lm', formula=GPA~ClassYear) + # Fit line, ClassYear continuous
  geom_point(data=SleepStudy, aes(x=ClassYear, y=GPA)) # Data points as dots
What is producing the blank graph? What am I missing here?
You have to define the aesthetic mappings (x and y) for geom_smooth in ggplot(), and the smoothing formula has to be written in terms of y and x rather than the variable names. This code works:
ggplot(data=SleepStudy, aes(y = GPA, x = ClassYear)) +
  geom_smooth(data=SleepStudy, method='lm', formula=y~x) +
  geom_line(data=fit2_line, aes(x=ClassYear, y=GPA)) +
  geom_point(data=SleepStudy, aes(x=ClassYear, y=GPA))

Creating barplot with standard errors plotted in R

I am trying to find the best way to create barplots in R with standard errors displayed. I have seen other articles, but I cannot figure out the code to use with my own data (I have not used ggplot before, it seems to be the most common approach, and barplot does not cooperate well with data frames). I need to use this in two cases, for which I have created two example data frames:
Plot df1 so that the x-axis has sites a-c, with the y-axis displaying the mean value of V1 and the standard errors highlighted, similar to this example with a grey colour. Here, plant biomass should be the mean V1 value and the treatments should be each of my sites.
Plot df2 in the same way, but with the before and after bars located next to each other, in a similar way to this, where pre-test and post-test equate to before and after in my example.
x <- factor(LETTERS[1:3])
site <- rep(x, each = 8)
values <- as.data.frame(matrix(sample(0:10, 3*8, replace=TRUE), ncol=1))
df1 <- cbind(site,values)
z <- factor(c("Before","After"))
when <- rep(z, each = 4)
df2 <- data.frame(when,df1)
Apologies for the simplicity for more experienced R users, particularly those who use ggplot, but I cannot apply the snippets of code I have found elsewhere to my data. I cannot even get enough code together to produce the start of a graph, so I hope my descriptions are sufficient. Thank you in advance.
Something like this?
library(ggplot2)
get.se <- function(y) {
  se <- sd(y)/sqrt(length(y))
  mu <- mean(y)
  c(ymin=mu-se, ymax=mu+se)
}
ggplot(df1, aes(x=site, y=V1)) +
  stat_summary(fun.y=mean, geom="bar", fill="lightgreen", color="grey70") +
  stat_summary(fun.data=get.se, geom="errorbar", width=0.1)
ggplot(df2, aes(x=site, y=V1, fill=when)) +
  stat_summary(fun.y=mean, geom="bar", position="dodge", color="grey70") +
  stat_summary(fun.data=get.se, geom="errorbar", width=0.1, position=position_dodge(width=0.9))
So this takes advantage of the stat_summary(...) function in ggplot to, first, summarize y for given x using mean(...) (for the bars), and then to summarize y for given x using the get.se(...) function for the error-bars. Another option would be to summarize your data prior to using ggplot, and then use geom_bar(...) and geom_errorbar(...).
Also, plotting +/- 1 se is not a great practice (although it's used often enough). You'd be better served plotting legitimate confidence limits, which you could do, for instance, using the built-in mean_cl_normal function instead of the contrived get.se(...). mean_cl_normal returns the 95% confidence limits based on the assumption that the data is normally distributed (or you can set the CL to something else; read the documentation).
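For illustration, a minimal sketch of that suggestion applied to df1 (mean_cl_normal is exported by ggplot2 but relies on the Hmisc package being installed):
# Sketch of the mean_cl_normal suggestion above, applied to df1
ggplot(df1, aes(x=site, y=V1)) +
  stat_summary(fun.y=mean, geom="bar", fill="lightgreen", color="grey70") +
  stat_summary(fun.data=mean_cl_normal, geom="errorbar", width=0.1)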
I used the group_by and summarise_each functions from dplyr for this, together with the std.error function from the plotrix package.
library(plotrix) # for std error function
library(dplyr) # for group_by and summarise_each function
library(ggplot2) # for creating ggplot
For df1 plot
# Group data by site
grouped_df1<-group_by(df1,site)
#summarise grouped data and calculate mean and standard error using function mean and std.error(from plotrix)
summarised_df1<-summarise_each(grouped_df1,funs(mean=mean,std_error=std.error))
# Define the top and bottom of the errorbars
limits <- aes(ymax = mean + std_error, ymin=mean-std_error)
#Begin your ggplot
#Here we are plotting site vs mean
g<-ggplot(summarised_df1,aes(site,mean))
#Creating the bars (position_dodge has no effect here since there is no fill grouping)
g<-g+geom_bar(stat = "identity",position = position_dodge())
#creation of error bar
g<-g+geom_errorbar(limits,width=0.25,position = position_dodge(width = 0.9))
#print graph
g
For df2 plot
# Group data by when and site
grouped_df2<-group_by(df2,when,site)
#summarise grouped data and calculate mean and standard error using function mean and std.error
summarised_df2<-summarise_each(grouped_df2,funs(mean=mean,std_error=std.error))
# Define the top and bottom of the errorbars
limits <- aes(ymax = mean + std_error, ymin=mean-std_error)
#Begin your ggplot
#Here we are plotting site vs mean and filling by another factor variable when
g<-ggplot(summarised_df2,aes(site,mean,fill=when))
#Creating bar to show the factor variable position_dodge
#ensures side by side creation of factor bars
g<-g+geom_bar(stat = "identity",position = position_dodge())
#creation of error bar
g<-g+geom_errorbar(limits,width=0.25,position = position_dodge(width = 0.9))
#print graph
g

Force bars to start from a lower value than 0 in ggplot geom_bar in R

I want to plot bar plots using ggplot. I used scale_y_log10 to rescale the y-axis. When I use the script below, I get bars that go in both directions (upward and downward), and the bars start from 1. I understand that this is because log10(1) = 0.
ggplot(data=dat, aes(x=Col2, y=Col4, fill=Col3, group=Col1)) +
  geom_bar(stat="identity") +
  facet_grid(.~Col1, scales="free_x", space="fixed") +
  ylab("") + xlab("") + scale_fill_discrete("") +
  scale_y_log10("#Counts", breaks=c(.1, 1, 10, 100, 1000), expand=c(0,0)) +
  theme_bw() +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
Now, I want to force the bars to start from a lower value (say 0.1 or 0.01) and make all the bars in upward direction. How can I do that? Tried to find related posts and info but had no luck. Can someone direct me if there are such info already?
Edit: I want to use scale_y_log10 because most of the bars are low and some are extremely high. If I use scale_y_continuous, the shorter bars are not visible properly.
You can define your own scale, instead of using scale_y_continuous(trans="log10"). In the example below, you will have to change the argument from=-2 to suit your specific data.
# defining example data (since I don't have your data)
data(mtcars)
mtcars <- rbind(mtcars, mtcars)
mtcars <- rbind(mtcars, mtcars)
mtcars <- rbind(mtcars, mtcars)
mtcars <- rbind(mtcars, mtcars)
mtcars[1, "cyl"] <- 2
# sample plot
c <- ggplot(mtcars, aes(factor(cyl))) + geom_bar()
c + scale_y_log10() # this starts from 1
# defining the scale change
require(scales)
mylog_trans <- function(base=exp(1), from=0)
{
  trans <- function(x) log(x, base) - from
  inv <- function(x) base^(x + from)
  trans_new("mylog", trans, inv, log_breaks(base=base),
            domain = c(base^from, Inf))
}
#
c + scale_y_continuous(trans = mylog_trans(base=10, from=-2)) # starts from 1e-2
c + scale_y_continuous(trans = mylog_trans(base=10, from=-5)) # starts from 1e-5
As you can see in the above example, this plot can be very misleading. The two plots display the same data, but look very different, so be careful when using this scale-change.
