ggplot2 boxplot for each variable with unequal distance - r

I have the following data frame:
date DGS1MO DGS3MO DGS6MO DGS1 DGS2 DGS3 DGS5 DGS7 DGS10 DGS20 DGS30
1 2006-02-28 4.47 4.62 4.74 4.73 4.69 4.67 4.61 4.57 4.55 4.70 4.51
2 2006-03-31 4.65 4.63 4.81 4.82 4.82 4.83 4.82 4.83 4.86 5.07 4.90
3 2006-04-28 4.60 4.77 4.91 4.90 4.87 4.87 4.92 4.98 5.07 5.31 5.17
4 2006-05-31 4.75 4.86 5.08 5.07 5.04 5.03 5.04 5.06 5.12 5.35 5.21
5 2006-06-30 4.54 5.01 5.24 5.21 5.16 5.13 5.10 5.11 5.15 5.31 5.19
6 2006-07-31 5.02 5.10 5.18 5.11 4.97 4.93 4.91 4.93 4.99 5.17 5.07
Using melt (from reshape2) I got this data frame:
date variable value
1 2006-02-28 DGS1MO 4.47
2 2006-03-31 DGS1MO 4.65
3 2006-04-28 DGS1MO 4.60
4 2006-05-31 DGS1MO 4.75
5 2006-06-30 DGS1MO 4.54
6 2006-07-31 DGS1MO 5.02
As you can see I have 1, 3, 6 month, along with 10, 20, 30 year time horizons. I would like to plot box-and-whisker plot for each of these columns and have the following code:
# Box-and-whisker plot of `value` for each `variable`; with a discrete x
# aesthetic every box is drawn at an equally spaced position.
bwplot <- ggplot(df, aes(x = variable, y = value, color = variable)) +
  stat_boxplot(geom = "errorbar") +
  geom_boxplot()
bwplot
However, the issue is the distance (space) between the boxplots for each variable is the same. Ideally, there should be very small distance between the boxplots for 1 month and 3 month. And the gap between the boxplots for 10 year and 20 year should be wide. To remedy, I have tried to convert the variables into numbers (1/12, 3/12, 6/12, 1, 2, etc.) and then tried this code:
# Relabel the maturities with their length in years
# (1, 3 and 6 months are roughly 0.083, 0.25 and 0.5 of a year).
levels(df$variable) <- c(0.083, 0.25, 0.5, 1, 2, 3, 5, 7, 10, 20, 30)
# Convert the relabelled factor to numbers so x is on a continuous scale.
bwplot <- ggplot(df, aes(x = as.numeric(as.character(df$variable)), y = value, color = variable)) +
  stat_boxplot(geom = "errorbar") +
  geom_boxplot()
bwplot
But what I am getting is only one huge boxplot for the entire time horizon followed by this warning msg:
Warning messages:
1: Continuous x aesthetic -- did you forget aes(group=...)?
If I try
group = variable
I get
Error: Continuous value supplied to discrete scale
What is the right way of doing this?
Thanks.

library(ggplot2)
library(reshape2)

# Simulated example: six monthly observations per maturity
# (column M<n> = n months, column Y<n> = n years).
s <- data.frame(
  date = seq(as.Date("2006-02-01"), by = "month", length.out = 6),
  M1 = rnorm(6, 5, 0.5),
  M3 = rnorm(6, 5, 0.5),
  M6 = rnorm(6, 5, 0.5),
  Y1 = rnorm(6, 5, 0.5),
  Y2 = rnorm(6, 5, 0.5),
  Y3 = rnorm(6, 5, 0.5),
  Y10 = rnorm(6, 5, 0.5),
  Y20 = rnorm(6, 5, 0.5),
  Y30 = rnorm(6, 5, 0.5)
)
s.melted <- melt(s, id.vars = "date")
# Create an axis where the numbers represent the number of months elapsed:
# split each column name into its unit letter (M or Y) and its count, then
# multiply the count by the number of months in that unit.
maturity <- as.character(s.melted$variable)
months_per_unit <- c(M = 1, Y = 12)[sub("(M|Y)([0-9]+)", "\\1", maturity)]
unit_count <- as.numeric(sub("(M|Y)([0-9]+)", "\\2", maturity))
# unname() drops the "M"/"Y" names picked up by the lookup above.
s.melted$xaxis <- unname(months_per_unit) * unit_count
# Show six randomly chosen rows of the melted data.
s.melted[sample(nrow(s.melted), 6), ]
date variable value xaxis
23 2006-06-01 Y1 4.645595 12
38 2006-03-01 Y10 5.190710 120
25 2006-02-01 Y2 4.831788 24
50 2006-03-01 Y30 3.892580 360
39 2006-04-01 Y10 4.513831 120
31 2006-02-01 Y3 4.357127 36
# Only show the ticks for variable.
# The melted data repeats every (xaxis, variable) pair once per date, so
# reduce it to one break/label pair per maturity before building the scale.
axis_ticks <- unique(s.melted[, c("xaxis", "variable")])
# group = variable is required because x is continuous: without it the
# grouping of the boxes is left to ggplot2's defaults.
bwplot <- ggplot(
  s.melted,
  aes(x = xaxis, y = value, color = variable, group = variable)
) +
  stat_boxplot(geom = "errorbar") +
  geom_boxplot() +
  scale_x_continuous(
    breaks = axis_ticks$xaxis,
    labels = as.character(axis_ticks$variable)
  )
bwplot

Related

standardize a variable values differently based on another categorical variable in R (Using R Base)

I have a large dataset that has a continuous variable "Cholesterol" for two visits for each participant (each participant has two rows: first visit = Before & second visit = After). I'd like to standardise cholesterol, but the Before and After visits are merged in one column, so standardising them together would not be accurate, because the standardisation is calculated using the mean and the SD.
Using base R, how can I create a new cholesterol variable standardised by Visit in the same data set? The standardisation should be done twice (once for Before and once for After), but the output (the standardised values) should end up in a single variable that follows the same structure as this DF:
# Cholesterol readings, 30 values in total.
DF$Cholesterol<- c( 0.9861551,2.9154158, 3.9302373,2.9453085, 4.2248018,2.4789901, 0.9972635, 0.3879830, 1.1782336, 1.4065341, 1.0495609,1.2750138, 2.8515144, 0.4369885, 2.2410429, 0.7566147, 3.0395565,1.7335131, 1.9242212, 2.4539439, 2.8528908, 0.8432039,1.7002653, 2.3952744,2.6522959, 1.2178764, 2.3426695, 1.9030782,1.1708246,2.7267124)
# NOTE(review): as written this line is not an assignment. `< -` parses as the
# comparison DF$Visit < -c(...), and Before/After are unquoted, so R looks
# them up as objects instead of using them as labels.
DF$Visit< -c(Before,After,Before,After,Before,After,Before,After,Before,After,Before,After,Before,After,Before,After,Before,After,Before,After,Before,After,Before,After,Before, After,Before,After,Before,After)
# the standardisation function I want to apply
# Subtracts the minimum of x (not the mean) and divides by the standard
# deviation of x; NA values are ignored in both summaries.
standardise <- function(x) {return((x-min(x,na.rm = T))/sd(x,na.rm = T))}
thank you in advance
Let's make your data, fix the df$visit assignment, fix the standardise function to be mean rather than min, and then assume each new occasion of before is the next person, pivot to wide format, then mutate our before and after standardised variables:
library(dplyr)
library(tidyr)

# 30 cholesterol readings; rows alternate between "before" and "after".
df <- data.frame(
  cholesterol = c(
    0.9861551, 2.9154158, 3.9302373, 2.9453085, 4.2248018, 2.4789901,
    0.9972635, 0.3879830, 1.1782336, 1.4065341, 1.0495609, 1.2750138,
    2.8515144, 0.4369885, 2.2410429, 0.7566147, 3.0395565, 1.7335131,
    1.9242212, 2.4539439, 2.8528908, 0.8432039, 1.7002653, 2.3952744,
    2.6522959, 1.2178764, 2.3426695, 1.9030782, 1.1708246, 2.7267124
  ),
  visit = rep(c("before", "after"), 15)
)

# z-score: centre x on its mean and scale by its standard deviation,
# ignoring NA values in both summaries.
standardise <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

df <- df %>%
  # Every "before" row starts a new person.
  mutate(person = cumsum(visit == "before")) %>%
  # One row per person with a `before` and an `after` column.
  pivot_wider(id_cols = person, names_from = visit, values_from = cholesterol) %>%
  # Standardise each visit separately.
  mutate(
    before_std = standardise(before),
    after_std = standardise(after)
  )
gives:
person before after before_std after_std
<int> <dbl> <dbl> <dbl> <dbl>
1 1 0.986 2.92 -1.16 1.33
2 2 3.93 2.95 1.63 1.36
3 3 4.22 2.48 1.91 0.842
4 4 0.997 0.388 -1.15 -1.49
5 5 1.18 1.41 -0.979 -0.356
6 6 1.05 1.28 -1.10 -0.503
7 7 2.85 0.437 0.609 -1.44
8 8 2.24 0.757 0.0300 -1.08
9 9 3.04 1.73 0.788 0.00940
10 10 1.92 2.45 -0.271 0.814
11 11 2.85 0.843 0.611 -0.985
12 12 1.70 2.40 -0.483 0.749
13 13 2.65 1.22 0.420 -0.567
14 14 2.34 1.90 0.126 0.199
15 15 1.17 2.73 -0.986 1.12
If you actually want min in your standardise function rather than mean, editing it should be simple enough.
Edited to add a base R solution, with the caveat that there's probably a much neater one:
# Two rows per participant id (1..15): the "before" visit, then the "after".
df <- data.frame(id = rep(seq_len(15), each = 2))
df$cholesterol <- c(
  0.9861551, 2.9154158, 3.9302373, 2.9453085, 4.2248018, 2.4789901,
  0.9972635, 0.3879830, 1.1782336, 1.4065341, 1.0495609, 1.2750138,
  2.8515144, 0.4369885, 2.2410429, 0.7566147, 3.0395565, 1.7335131,
  1.9242212, 2.4539439, 2.8528908, 0.8432039, 1.7002653, 2.3952744,
  2.6522959, 1.2178764, 2.3426695, 1.9030782, 1.1708246, 2.7267124
)
df$visit <- rep(c("before", "after"), 15)
# Reshape to one row per id with columns cholesterol.before and
# cholesterol.after.
df <- reshape(df, direction = "wide", idvar = "id", timevar = "visit")

# z-score: centre x on its mean and scale by its standard deviation,
# ignoring NA values in both summaries.
standardise <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

# Standardise each visit separately, rounded to two decimals.
df$before_std <- round(standardise(df$cholesterol.before), 2)
df$after_std <- round(standardise(df$cholesterol.after), 2)
gives:
i id cholesterol.before cholesterol.after before_std after_std
1 1 0.9861551 2.9154158 -1.16 1.33
3 2 3.9302373 2.9453085 1.63 1.36
5 3 4.2248018 2.4789901 1.91 0.84
7 4 0.9972635 0.3879830 -1.15 -1.49
9 5 1.1782336 1.4065341 -0.98 -0.36
11 6 1.0495609 1.2750138 -1.10 -0.50
13 7 2.8515144 0.4369885 0.61 -1.44
15 8 2.2410429 0.7566147 0.03 -1.08
17 9 3.0395565 1.7335131 0.79 0.01
19 10 1.9242212 2.4539439 -0.27 0.81
21 11 2.8528908 0.8432039 0.61 -0.99
23 12 1.7002653 2.3952744 -0.48 0.75
25 13 2.6522959 1.2178764 0.42 -0.57
27 14 2.3426695 1.9030782 0.13 0.20
29 15 1.1708246 2.7267124 -0.99 1.12

ggplot - order of y axis is not like the one of the dataset

I got the following data set:
Skalen
Werte
1
Allgemeine Beanspruchung
1.55
2
Emotionale Beanspruchung
1.59
3
Soziale Beanspruchung
1.79
4
Konflikte/Leistungsdruck
1.76
5
Übermüdung
1.79
6
Energielosigkeit
2.13
7
Somatische Beanspruchung
1.52
8
Erfolg
2.74
9
Soziale Erholung
3.26
10
Somatische Erholung
3.41
11
Allgemeine Erholung
3.84
12
Schlaf
4.29
13
Gestörte Pause
1.07
14
Emotionale Erschöpfung
1.36
15
Verletzungsanfälligkeit
1.59
16
In-Form-sein
3.28
17
Persönliche Verwirklichung
2.42
18
Selbstwirksamkeitsüberzeugung
3.29
19
Selbstregulation
3.41
And I used this code to plot it as a vertical line graph
# Line-and-point plot of Werte for each Skalen entry. group="" puts all rows
# into a single group so geom_line() connects them, and coord_flip() swaps
# the axes so the scale names run along the vertical axis.
ggplot(data=df_ebf, aes(x=Skalen, y=Werte,group="")) +
geom_line() +
geom_point() +
coord_flip()
The result is this:
The values are matched to the right descriptions on the y-axis, but the order is reverse alphabetical. I want the order to be the same as in the dataset.
I found a different approach to solve this problem.
I changed the order of the variables in the dataframe with
# Reverse the row order of the data frame (last row first).
df <- df[rev(seq_len(nrow(df))), ]
and then used the first function that Jon Spring suggested in the ggplot command
# fct_inorder() orders the factor levels by first appearance in the data
# instead of alphabetically, so the axis follows the row order of df.
ggplot(data=df, aes(x=forcats::fct_inorder(Skalen), y=Werte, group="")) +
geom_line() +
geom_point() +
coord_flip()
Now I got the right order in the plot.
Thanks for the support!

Heatmap using ggplot2 in r

My Data looks like this:
# Example data: one row per District/Block with five numeric columns
# (IE, FE, AOE, CLE, CS); the first line of the text is the header.
data1 <- read.table(text = "District Block IE FE AOE CLE CS
A A1 4.87 17.54 13.85 9.01 45.27
B B1 8.19 20.83 14.59 7.04 50.65
C C1 8.71 19.16 16.54 8.24 52.65
D D1 2.43 11.77 11.51 6.96 32.67
E E1 6.85 13.54 14.54 5.7 40.63
F F1 7.02 19.96 13.96 3.82 44.76
G G1 2.55 11.64 8.74 5.06 27.99
H H1 9.81 20.2 12.62 5.95 48.58
I I1 6.56 15.49 12.32 8.08 42.45
J J1 9.47 22.86 25 22.73 80.06
K K1 10.2 20.18 20.14 20.06 70.58
L L1 9.52 14.86 16.95 18.23 59.56", header = TRUE)
I have created a data matrix from the data frame. My initial code looks like this:
# Use the first column (District) as row names, then drop it so it is not
# treated as data.
row.names(data1) <- data1[[1]]
data1 <- data1[-1]
# Convert the remaining columns to a numeric matrix for heatmap(); the
# character column Block is turned into integer codes by data.matrix().
data1 <- data.matrix(data1)
# No row/column reordering (Rowv/Colv = NA) and no rescaling of the values.
data1_heatmap <- heatmap(
  data1,
  Rowv = NA, Colv = NA,
  col = heat.colors(256),
  scale = "none",
  margins = c(12, 3)
)
Whenever I am using the above code,it plots for the whole sheet.
I have 2 issues:
I need to show the cell values which are present in the data matrix.
Also, I need to specify a color range for each column from IE to CS. For example, in the IE column, less than 4.87 is red, 6.56 to 6.85 is orange, and greater than 8.17 is green. So basically a user-defined range for each column.
Try this with ggplot2 (starting with the original data1):
library(ggplot2)
library(reshape2)

# Drop District (column 1) and reshape to long format: one row per
# Block/variable pair, with the measurement in `value`.
data1 <- melt(data1[-1], id.vars = "Block")
# Bin the values of all five variables with one shared set of breaks. The
# first breaks follow the IE ranges given in the question (< 4.87,
# 6.56 to 6.85, > 8.17). right = FALSE makes the intervals closed on the left.
data1$value <- cut(
  data1$value,
  breaks = c(-Inf, 4.87, 6.56, 6.85, 8.17, 14, 19, 21, Inf),
  right = FALSE
)
# One tile per Block/variable pair, coloured by the bin of its value.
ggplot(data = data1, aes(x = Block, y = variable)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_brewer(palette = "PRGn")

Different colors in ggplot2 using groups

I have a problem trying to use different colors in my plot for two groups. I created a plot with odds ratios (including 95% CI) over a period of several years for 2 groups (mfin and ffin). When using the syntax below, all points and lines are black, and my attempts to adjust them, e.g. geom_linerange(colour=c("red","blue")), have failed (Error: Incompatible lengths for set aesthetics: colour).
Can anyone help me with this?
# Stack the two data sets and label every row with the group it came from
# (the labels are strings, not the data frames themselves).
ggplot(rbind(data.frame(mfin, group = "mfin"), data.frame(ffin, group = "ffin")),
       aes(x = JAAR, y = ror, ymin = llror, ymax = ulror)) +
  geom_linerange() +
  geom_point() +
  # Horizontal reference line at an odds ratio of 1.
  geom_hline(yintercept = 1) +
  ylab("Odds ratio & 95% CI") +
  xlab("") +
  geom_errorbar(width = 0.2)
Below are some sample data (1st group = mfin, 2nd group = ffin)
JAAR ror llror ulror
2008 2.00 1.49 2.51
2009 2.01 1.57 2.59
2010 2.06 1.55 2.56
2011 2.07 1.56 2.58
2012 2.19 1.70 2.69
2013 2.23 1.73 2.72
2014 2.20 1.71 2.69
2015 2.31 1.84 2.78
2016 2.30 1.83 2.76
JAAR ror llror ulror
2008 1.36 0.88 1.84
2009 1.20 0.73 1.68
2010 1.16 0.68 1.64
2011 1.23 0.77 1.69
2012 1.43 1.00 1.86
2013 1.46 1.04 1.88
2014 1.49 1.07 1.90
2015 1.30 0.89 1.70
2016 1.29 0.89 1.70
You need to map the group membership variable to the color aesthetic (in the long version of the data):
library(readr)
library(dplyr)
library(ggplot2)
# simulate some data: one estimate per year and group
year_min <- 1985
year_max <- 2016
num_years <- year_max - year_min + 1
num_groups <- 2
num_estimates <- num_years * num_groups
# tibble() evaluates its arguments in order, so later columns can refer to
# earlier ones (lower_limit uses upper_limit, point_estimate uses both).
df_foo <- tibble(
  upper_limit = runif(n = num_estimates, min = -20, max = 20),
  lower_limit = upper_limit - runif(n = num_estimates, min = 0, max = 5),
  point_estimate = runif(num_estimates, min = lower_limit, max = upper_limit),
  year = rep(seq(year_min, year_max), num_groups),
  group = rep(c("mfin", "ffin"), each = num_years)
)
# Plot the point estimates with their confidence intervals; mapping `group`
# to color gives each group its own colour and a legend entry.
ggplot(
  df_foo,
  aes(
    x = year, y = point_estimate,
    ymin = lower_limit, ymax = upper_limit,
    color = group
  )
) +
  geom_point() +
  geom_errorbar() +
  theme_bw() +
  labs(x = "Year", y = "Odds Ratio & 95% CI", color = "Group")
This produces what I think you are looking for, except the simulated data makes it look somewhat messy:

How to match legend colors and plot colors in overlapping area plots in ggplot2

I have the following data:
head(MP_rates_dateformat)
Month repo revrepo bankrate CRR Callrate WPI GDP FED
1 2001-04-01 9.00 6.75 7 8.0 7.49 5.41 4.6 4.50
2 2001-05-01 8.75 6.50 7 7.5 8.03 5.60 4.6 4.00
3 2001-06-01 8.50 6.50 7 7.5 7.24 5.30 4.6 3.75
4 2001-07-01 8.50 6.50 7 7.5 7.19 5.23 5.3 3.75
5 2001-08-01 8.50 6.50 7 7.5 6.94 5.41 5.3 3.50
6 2001-09-01 8.50 6.50 7 7.5 7.30 4.52 5.3 3.00
I am trying to plot timeseries overlapping area plots for variables repo and revrepo using ggplot2.
# Two area layers drawn on top of each other. Only `color` is mapped (and so
# appears in the legend); both fills are fixed constants.
p2 <- ggplot(MP_rates_dateformat, aes(x = Month)) +
  geom_area(aes(y = repo, color = "repo"), fill = "yellowgreen") +
  geom_area(
    aes(y = revrepo, color = "revrepo"),
    fill = "dodgerblue", alpha = 0.7, linetype = "dotted"
  ) +
  labs(color = "", title = "Overlapping - Repo & Reverse Repo")
p2
As we can see the legend is showing with same colors fills in legend boxes for both variables. I want it to show the correct corresponding colors i.e. yellowgreen for repo and dodgerblue for revrepo.
Suppose I melt the data as:
# Long format: one row per Month and rate (repo / revrepo).
df <- reshape2::melt(
  MP_rates_dateformat[, c("Month", "repo", "revrepo")],
  id.vars = "Month"
)
head(df, 3)
Month variable value
1 2001-04-01 repo 9.00
2 2001-05-01 repo 8.75
3 2001-06-01 repo 8.50
# Area plot with fill mapped to the melted `variable` column.
p1 <- ggplot(df, aes(x = Month, y = value, fill = variable)) +
  geom_area() +
  labs(title = "Non-Overlapping - Repo & Reverse Repo")
But this is giving me non-overlapping area plots with correct legend....BUT I am looking for overlapping area plot.
Here is a solution based on the idea of melting/gathering the data, using fill in ggplot combined with position = "identity". Note that the order of the columns is important, as the smaller series, revrepo, should be plotted after the first one, repo.
library("tidyr")
library("ggplot2")
# Keep Month, repo and revrepo (columns 1-3) and stack the two rate columns
# into `variable` (series name) and `value` (rate).
df_gather <- pivot_longer(
  MP_rates_dateformat[, 1:3],
  cols = -Month,
  names_to = "variable",
  values_to = "value"
)
# position = "identity" draws the areas on top of each other instead of
# stacking them.
ggplot(data = df_gather, aes(x = Month)) +
  geom_area(aes(y = value, fill = variable), position = "identity")
Finally Got it!
library("tidyr")
# Keep Month, repo and revrepo (columns 1-3) and stack the two rate columns.
long_DF <- gather(MP_rates_dateformat[, 1:3], key = "variable", value = "value", -Month)
head(long_DF)
Month variable value
1 2001-04-01 repo 9.00
2 2001-05-01 repo 8.75
3 2001-06-01 repo 8.50
4 2001-07-01 repo 8.50
5 2001-08-01 repo 8.50
6 2001-09-01 repo 8.50
library("ggplot2")
# Overlapping (non-stacked) areas, one fill colour per series; the legend
# title is blanked and the axis/title labels are padded with newlines.
ggplot(long_DF, aes(x = Month, y = value, fill = variable)) +
  geom_area(position = "identity") +
  labs(
    fill = "",
    x = "\nYears",
    y = "LAF Rates (%)\n",
    title = "Overlapping - Repo & Reverse Repo\n"
  )

Resources