Boxplot with multiple x variables - r

I'm new to R and having a few issues with using ggplot2.
This is an example of my data (subset of larger data set) :
df <-
structure(list(logpvalue = c(22.36, 6.93, 16.78, 1.78, 17.75,
20.99, 21.03, 9.19, 15.01, 22.25, 13.4, 6.47, 1.34, 13.4, 3.21,
0.37, 0.5, 0.12, 1.8, 0.71, 1.15, 6.73, 0.12, 6.97, 0.64, 9.85,
1.45, 1.67, 2.6, 1.8, 1.35, 4.69, 0.37, 1.91, 0.31, 0, 2.45,
1.68, 2.31, 1.35, 6.48, 4.68), SNP = structure(c(1L, 7L, 6L,
5L, 11L, 1L, 9L, 5L, 8L, 11L, 7L, 5L, 8L, 11L, 1L, 7L, 1L, 4L,
2L, 3L, 10L, 7L, 1L, 4L, 2L, 3L, 10L, 4L, 2L, 3L, 10L, 4L, 2L,
3L, 10L, 4L, 2L, 3L, 7L, 9L, 5L, 1L), .Label = c("rs10244", "rs10891244",
"rs10891245", "rs11213821", "rs12296076", "rs138567267", "rs45615536",
"rs6589218", "rs7103178", "rs7127721", "rs7944895"), class = "factor"),
X173 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("het", "hom"), class = "factor")), .Names = c("logpvalue",
"SNP", "X173"), class = "data.frame", row.names = c(NA, -42L))
I want to plot a boxplot of logpvalue on y axis, with SNP on the x-axis but with each SNP also categorized by whether the patient is het or hom for X173. So from this data I'd imagine 4 boxes on my boxplot.
If possible I'd also like to incorporate the individual data points (dotplot-boxplot overlay) with jitter.
This is the usual code I'd use for a boxplot of logpvalue vs SNP:
qplot(logpvalue, SNP, data = mydata, geom="boxplot")
+ geom_jitter(position=position_jitter(w=0.1, h=0.1)) + theme_bw()
How do I add the extra x variable into this code?

Try this:
# Base-graphics version: the grouping variable is the SNP and X173 values
# pasted together, so each SNP/X173 combination present in df gets its own box.
boxplot(df$logpvalue~paste(df$SNP,df$X173))
Or using ggplot2:
# ggplot2 version of the same plot.
library(ggplot2)
# SNP on the x axis, logpvalue on the y axis, coloured by SNP.
ggplot(data=df,aes(SNP,logpvalue,colour=SNP)) +
geom_boxplot() +
# Overlay the individual observations, jittered with the default settings.
geom_jitter() +
# One panel per X173 level ("het" / "hom"), laid out as columns.
facet_grid(.~X173)

Related

Error in eigen(x) : infinite or missing values in 'x' > in robumeta

When I try to perform a meta-analysis with hierarchical weights in robumeta I get
Error in eigen(x) : infinite or missing values in 'x',
using the same data that does not produce any errors with correlational weights.
My data matrix does not have any NA or missing values. The cluster variable contains whole numbers between 1 and 4.
Does anyone know why I get the Eigen(x) error?
Code needed to reproduce error:
#load data, you need to adjust read.table depending on where the file is saved.
mydata <- read.table ("H:/Desktop/Max_R_Dataset_Meta_Analysis.csv", header = TRUE, sep = ",")
#install & load packages
library (robumeta)
library (devtools)
install_github("jepusto/clubSandwich")
library (clubSandwich)
#fit moderator model with CORR
res_2 <- robu (formula = effect_size ~ pathway, var.eff.size = effect_size_variance, studynum = Study_ID, modelweights = "CORR", rho = 0.8, small = TRUE, data = mydata)
print (res_2)
#fit moderator model with HIER
hier1 <- robu (formula = effect_size ~ pathway, var.eff.size = effect_size_variance, studynum = cluster, modelweights = "HIER", small = TRUE, data = mydata)
print (hier1)
dput (head(mydata,35))
structure(list(Study_ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L), effect_size = c(-0.05,
-0.09, -4.44, 0.28, 0.25, 0.91, 0.31, 0.31, 0.33, 0.27, 0.13,
0.71, -0.1, -0.09, -0.28, 0.2, 0.23, 1.23, 0.21, 0.22, 0.29,
-0.18, -0.16, -0.75, 0.2, 0.24, 2.47, 0.37, 0.36, 2.34, 0.17,
0.15, 0.85, 0.04, 0), effect_size_variance = c(0.010737802, 0.008056791,
30.135452, 0.010478163, 0.011260784, 0.093962475, 0.006933061,
0.008891908, 0.007840352, 0.006092875, 0.007411207, 0.040583305,
0.021610499, 0.019590468, 0.104406625, 0.012783255, 0.011467534,
0.333023923, 0.004151044, 0.008464275, 0.006936499, 0.012797742,
0.007904113, 0.307592997, 0.001625522, 0.002084078, 0.230050467,
0.009038613, 0.00895868, 0.34524772, 0.004019923, 0.002854116,
0.078314231, 0.007680706, 0), pathway = c(2L, 4L, 6L, 2L, 4L,
6L, 2L, 4L, 6L, 2L, 4L, 6L, 2L, 4L, 6L, 2L, 4L, 6L, 2L, 4L, 6L,
1L, 3L, 5L, 1L, 3L, 5L, 1L, 3L, 5L, 1L, 3L, 5L, 1L, 3L), cluster = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L), Study_Name = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L,
1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L), .Label = c("Desiree Thesis Arab",
"Desiree Thesis White", "Gijs Direct Replication", "Gijs Indirect Replication",
"Irina Africa Black", "Irina Africa White", "Irina Thesis", "Max Thesis",
"Stein Race", "Yuan Exp1"), class = "factor")), row.names = c(NA,
35L), class = "data.frame")
The HIER version works with the example data provided by the robumeta authors.
Thanks to NelsonGon I got an answer:
My dataset included an effect size of 0 whose variance was also 0, which produces infinite values in the hierarchical model.
That seems to be caused by differences in the calculation of CORR and HIER models:
CORR uses: dframe$weights <- 1 / (dframe$k * dframe$avg.var.eff.size) while HIER uses dframe$weights <- 1 / dframe$var.eff.size. Both could theoretically produce 0s; you can check the source here: github.com/zackfisher/robumeta/blob/master/R/robu.R
The eigen values are actually calculated later in the source code, the Inside Matrix part.
Because HIER divides by var.eff.size, a var.eff.size of 0 produces an error.

Multi level pie chart ggplot: Label overlap and legend

I have the following dataset:
data <- structure(list(Year = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 3L, 1L, 3L, 1L, 2L, 2L, 3L, 2L, 3L,
1L, 3L, 2L, 2L, 2L, 3L, 1L, 2L, 3L, 3L, 3L, 2L, 1L, 3L, 1L,
1L, 2L, 1L, 2L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 3L,
3L, 2L, 3L, 2L, 1L, 1L, 2L, 2L, 1L),
.Label = c("2013", "2014", "2015"),
class = "factor"),
Place = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L),
.Label = c("Inside", "Outside"),
class = "factor"),
Variable = structure(c(6L, 4L, 8L, 13L, 16L, 11L, 12L, 13L, 4L, 10L, 10L, 11L,
1L, 3L, 13L, 7L, 11L, 7L, 6L, 2L, 6L, 1L, 1L, 7L, 5L,
3L, 14L, 3L, 14L, 2L, 9L, 6L, 6L, 9L, 2L, 5L, 9L, 5L,
9L, 9L, 15L, 1L, 13L, 3L, 6L, 3L, 3L, 9L, 15L, 1L, 13L,
1L, 13L, 15L),
.Label = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8",
"Y1", "Y2", "Y3", "Y4", "Y5", "Y6", "Y7", "Y8"),
class = "factor"),
Group = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L),
.Label = c("Var1", "Var2"),
class = "factor"),
Percent = c(0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.7, 0.7, 1.3, 1.6, 1.9, 1.9, 2.3, 2.7,
2.9, 3.4, 3.7, 4.5, 4.7, 5.3, 5.7, 6.2, 7.6, 7.9, 10.6, 10.7, 12.5,
13.3, 14.4, 15.4, 15.8, 16.9, 17.7, 19.6, 20.5, 24.8, 25.3, 30.4, 31,
36.8, 41.6, 43.9, 43.9, 44.2, 45.4, 51.8, 52.8, 56.1, 57.4, 68.9, 68.9,
80.4, 80.4, 81.5)),
class = "data.frame", row.names = c(NA, -54L))
I would really like to display the data in a multilevel pie chart like this:
I tried it by doing:
library(ggplot2)
ggplot(data, aes(x = Group, y = Percent, fill = Variable)) +
geom_bar(stat = "identity", position = "fill") +
facet_grid(Year ~ Place) +
geom_text(aes(label= paste(Percent, "%", Variable)) ,
position = position_fill(0.9), size = 3)+
coord_polar(theta = "y")
But because some percentages are very low, the labels overlap. I would like to place the labels outside the chart, as in the example, if that's possible.
I have looked at the other forum topics, but because my data is structured in a different way I wasn't able to adapt those solutions. The other problem is that this is just an example; my actual data input depends on the Shiny input, so fixing specific angles for this example doesn't work either.
I would be very grateful if anyone could help me.

Drop unused level using facet_grid and coord_flip in ggplot2

I have a dataframe df
df<- structure(list(Categorie = structure(c(1L, 1L, 2L, 2L, 4L, 4L,
3L, 1L, 1L, 2L, 2L, 4L, 4L, 3L, 1L, 1L, 2L, 2L, 4L, 4L, 3L, 1L,
1L, 2L, 2L, 4L, 4L, 3L, 1L, 1L, 2L, 2L, 4L, 4L, 3L, 1L, 1L, 2L,
2L, 4L, 4L, 3L, 1L, 1L, 2L, 2L, 4L, 4L, 3L, 1L, 1L, 2L, 2L, 4L,
4L, 3L), .Label = c("Age classes", "Climate", "Nutrient availability",
"PFT"), class = "factor"), Sub_categories = structure(c(7L, 4L,
6L, 1L, 3L, 2L, 5L, 7L, 4L, 6L, 1L, 3L, 2L, 5L, 7L, 4L, 6L, 1L,
3L, 2L, 5L, 7L, 4L, 6L, 1L, 3L, 2L, 5L, 7L, 4L, 6L, 1L, 3L, 2L,
5L, 7L, 4L, 6L, 1L, 3L, 2L, 5L, 7L, 4L, 6L, 1L, 3L, 2L, 5L, 7L,
4L, 6L, 1L, 3L, 2L, 5L), .Label = c("Continental", "DBF", "ENF",
"Intermediate-Old", "Low-High", "Temperate", "Young"), class = "factor"),
Variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L), .Label = c("Age", "Clay content", "GPP", "GPP*Age",
"GPP*P", "GPP*P trend", "N availability", "SPI"), class = "factor"),
Importance = c(19.2, 23.7, 45.2, 45.7, 39, 66.8, 34.8, 10.9,
16.2, 9.2, 6.3, 15.7, 2.1, 10, 13.2, 7.1, 6.1, 3.8, 2.4,
7.3, 5.2, 6.4, 10, 6.6, 3.7, 2.9, 5.8, 8.4, 17.7, 0, 6.1,
5.2, 8.4, 2.8, 6.7, 11.8, 21.1, 9.8, 21.9, 20, 6.3, 13.5,
2.3, 7.6, 3.9, 1.3, 3.9, 0.4, 3.8, 10.9, 7.5, 4.5, 5.8, 0.3,
2.5, 9.4)), .Names = c("Categorie", "Sub_categories", "Variable",
"Importance"), class = "data.frame", row.names = c(NA, -56L))
I want to plot my data using facet_grid with the Categorie and Sub_categories variables. I run the command below:
library(ggplot2)
ggplot(data = var_Imp) +
geom_bar(mapping = aes(x = Variable, y = Importance, fill=Variable), width = 1, stat= "identity", position = "stack") +
coord_flip() +
facet_grid(Categorie~Sub_categories, scales="free", space="free", shrink=TRUE, drop=TRUE)+
theme_bw(base_size = 14, base_family = "Helvetica")+
theme(axis.ticks.length=unit(-0.25, "cm"),
legend.position="none",
legend.box="horizontal",
legend.key = element_blank(),
legend.text=element_text(size=14),
axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.ticks.y=element_blank())+
xlab("") +
ylab("Relative contribution [%]")+
scale_fill_brewer(type = "div")
However, the levels with no information in some of the facets are still plotted, although they should not be because they contain no data. I thought the scales="free" and space="free" parameters would do the job, but apparently not. Does anyone know how I can plot my data without the unused levels? Thanks
I don't know about a specific command in ggplot2. Two possible options:
1) Any of the options proposed in this post ggplot2: How to force the number of facets with too few plots?
2) Export the plot in .svg (or other vectorial format), open it with inkscape or any other program and delete the empty facets

Faceting bars in ggplot2

I have this problem: I want to build a stacked bar plot with the faceting capabilities, so I can compare the distribution of frequencies for five common categories, within two different objects, separated according to three groups. I have six objects, five categories and three groups. The problem is that each group has only two different and exclusive objects to plot, but so far I can only produce a plot in which the six objects are plotted across the three groups. This is not optimal, since for each group I have four objects with no data.
Is it possible to plot just two objects for each group with the faceting capabilities?
EDITED
This is my data:
structure(list(Face = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L), .Label = c("LGH002", "LGH003", "LGM009",
"SCM018", "VAH022", "VAM028"), class = "factor"), Race = structure(c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L),
.Label = c("1. Amerindian", "2. White", "3. Mestizo", "4. Other races",
"5. Cannot tell"), class = "factor"), Count = c(19L, 0L, 13L, 8L, 0L, 2L,
7L, 23L, 6L, 2L, 1L, 1L, 29L, 6L, 3L, 29L, 0L, 11L, 0L, 0L, 0L, 38L, 1L, 0L,
1L, 0L, 30L, 9L, 0L, 1L), Density = c(0.475, 0, 0.325, 0.2, 0,
0.05, 0.175, 0.575, 0.15, 0.05, 0.025, 0.025, 0.725, 0.15,
0.075, 0.725, 0, 0.275, 0, 0, 0, 0.95, 0.025, 0, 0.025, 0,
0.75, 0.225, 0, 0.025), School = structure(c(1L, 1L, 1L,
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Municipal",
"Private Fee-Paying", "Private-Voucher"), class = "factor")),
.Names =c("Face", "Race", "Count", "Density", "School"),
class = "data.frame", row.names = c(NA, -30L))
This is the code I'm using to build the plot:
P <- ggplot(data = races.df, aes(x = Face, y = Density, fill = Race)) +
geom_bar(stat="identity") +
scale_y_continuous(labels=percent)
P + facet_grid(School ~ ., scales="free") + coord_flip()
As you can imagine, I only want to see the x-values "SCM018" and "LGH002" in "Municipal"; "LGM009" and "LGH003" in "Private-Voucher"; and "VAH022" and "VAM028" in "Private Fee-Paying" (only two objects per group). Is it possible? Any help?
All the best,
Mauricio.

How to display date format (Sep-12) in bar chart and set y axis limit?

I have two questions on building a bar plot by using ggplot().
How to display date format (Sep-12)?
I would like to display the date in the format of Sep-12. My data is a quarterly summary. I would like to show Mar, Jun, Sep and Dec quarters. However, I used the as.Date(YearQuarter) within the ggplot() function. It shows a different sequence of Apr, July, Oct, Jan.
How to increase y axis limit?
The y axis is set at 70%, and one of the value labels is out of the picture. I have added ylim(0,1) to increase the y limit to 1. However, I lost the percentage format, as the y axis no longer displays percentages.
x4.can.t.m <- structure(list(NR_CAT = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L), .Label = c("0%", "1 to 84%", "85% +"
), class = "factor"), TYPE = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("PM BUSINESS", "PM CONSUMER",
"PREPAY"), class = "factor"), YearQuarter = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("2011-09-01",
"2011-12-01", "2012-03-01", "2012-06-01", "2012-09-01"), class = "factor"),
value = c(0.5, 0, 0.5, 0.35, 0, 0.65, 0.28, 0.02, 0.7, 0.4,
0, 0.6, 0.38, 0, 0.62, 0.43, 0.01, 0.56, 0.57, 0, 0.43, 0.35,
0, 0.65, 0.39, 0.01, 0.6, 0.55, 0, 0.45, 0.4, 0.02, 0.58,
0.35, 0.02, 0.63, 0.35, 0, 0.65, 0.55, 0.01, 0.44, 0.47,
0, 0.53)), .Names = c("NR_CAT", "TYPE", "YearQuarter", "value"
), row.names = c(NA, -45L), class = "data.frame")
This is my plot code:
x4.can.t.m$YearQuarter <- as.Date(x4.can.t.m$YearQuarter)
x4.can.t.d.bar <- ggplot(data=x4.can.t.m, aes(x=YearQuarter, y=value,fill=NR_CAT)) +
geom_bar(stat="identity",position = "dodge",ymax=NR_CAT+0.2) +
facet_wrap(~TYPE,ncol=1) +
geom_text(aes(label =paste(round(value*100,0),"%",sep="")),
position=position_dodge(width=0.9),
vjust=-0.25,size=3) +
scale_y_continuous(formatter='percent',ylim=1) +
labs(y="Percentage",x="Year Quarter") +
ylim(0,100%)
x4.can.t.d.bar +scale_fill_manual("Canopy Indicators",values=tourism.cols(c(6,9,8)))+
opts(title="Canopy Indicator: All Customers portout for Network
Issues",size=4)
It looks like you have an older version of ggplot2; the following is for ggplot2 0.9.2.1. I had to fix several things to make your plot work. Starting from your original definition of x4.can.t.m:
# Re-express each quarter date as an abbreviated "Mon-yy" label (e.g. "Sep-12").
# The result is a character vector, not a Date.
# NOTE(review): a character x variable goes on a discrete axis, which is
# presumably ordered alphabetically rather than chronologically; a factor with
# chronologically ordered levels would pin the order -- confirm on the plot.
# NOTE(review): "%b" month abbreviations depend on the session locale.
x4.can.t.m$YearQuarter <- format(as.Date(x4.can.t.m$YearQuarter),"%b-%y")
# scales supplies the `percent` label formatter used on the y axis below.
library("scales")
ggplot(data=x4.can.t.m, aes(x=YearQuarter, y=value, fill=NR_CAT)) +
# Side-by-side bars, one per NR_CAT within each quarter.
geom_bar(stat="identity", position = "dodge") +
# Percentage labels above the bars; group=NR_CAT plus the same dodge width
# lines each label up with its own bar.
geom_text(aes(label = paste(round(value*100,0),"%",sep=""), group=NR_CAT),
position=position_dodge(width=0.9),
vjust=-0.25, size=3) +
# Axis title, percent-formatted tick labels and the 0-1 limits are all set in
# one scale call, so setting the limits does not drop the formatting.
scale_y_continuous("Percentage", labels=percent, limits=c(0,1)) +
labs(x="Year Quarter") +
scale_fill_discrete("Canopy Indicators") +
# One panel per TYPE, stacked in a single column.
facet_wrap(~TYPE,ncol=1) +
ggtitle("Canopy Indicator: All Customers portout for Network Issues") +
theme(plot.title = element_text(size=rel(1.2)))
The first part of the question is just achieved by formatting YearQuarter into the format you wanted, leaving it as a string.
The second part specifies the limits in scale_y_continuous and uses the labels argument to specify the formatting function. Note that library("scales") is needed for this part to work.

Resources