Stacked bar chart with percentage labels - r

I created a stacked bar chart where the bars represent a percentage of the population. I would like to add labels to the 65+ category (or for all 3 categories if it is not possible to do it just for 1 category) showing the % value for each year. If I add geom_text(label = datm$value), the bars become extremely small because the labels represent absolute values instead of percentages. This is my code:
# Population share (%) by age group (rows) and year (columns).
# The header line has one field fewer than the data lines, so read.table()
# takes the first field of each data line (the age group) as the row name.
dat <- read.table(
  header = TRUE,
  sep = " ",
  text = "2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
0-20 24.0 23.9 23.7 23.5 23.3 23.1 22.9 22.7 22.5 22.3 22.2
20-65 61.3 61.2 61.0 60.9 60.5 60.1 59.8 59.6 59.3 59.1 59.0
65+ 14.8 15.0 15.3 15.6 16.2 16.8 17.4 17.7 18.2 18.5 18.8"
)

library(reshape)
library(scales)
library(ggplot2)

# Long format: one row per (age group, year) pair; the age group is carried
# in the `ind` column so it survives the reshape.
datm <- melt(cbind(dat, ind = rownames(dat)), id.vars = "ind")

# Axis labels for the eleven year columns, 2008 through 2018.
year_labels <- as.character(2008:2018)

# Stacked bars rescaled to fill the full height, with a percent y axis.
ggplot(datm, aes(x = variable, y = value, fill = ind)) +
  geom_col(position = "fill") +
  scale_x_discrete(labels = year_labels) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    x = "Year",
    y = "% of population",
    title = "Demographic trend in the Netherlands"
  ) +
  scale_fill_manual(values = c("green", "blue", "darkgray"))

You can try this. Explanations in comments below:
library(dplyr)

# Add `p`: each value's share of its year's total (shares within a year sum
# to 1). Grouping is dropped afterwards so later steps see a plain tibble.
datm2 <- datm %>%
  group_by(variable) %>%
  mutate(p = prop.table(value)) %>%
  ungroup()
> head(datm2)
# A tibble: 6 x 4
ind variable value p
<fct> <fct> <dbl> <dbl>
1 0-20 X2008 24 0.240
2 20-65 X2008 61.3 0.612
3 65+ X2008 14.8 0.148
4 0-20 X2009 23.9 0.239
5 20-65 X2009 61.2 0.611
6 65+ X2009 15 0.150
# Axis labels for the eleven year columns, 2008 through 2018.
year_labels <- as.character(2008:2018)

ggplot(datm2, aes(x = variable, y = value, fill = ind)) +
  # geom_col() is equivalent to geom_bar(stat = "identity")
  geom_col(position = "fill") +
  # Percentage labels: drawn for every row, but alpha is 0 (invisible)
  # everywhere except the 65+ category. position_fill() makes the labels
  # follow the bars' stacking, centred vertically in each segment.
  geom_text(
    aes(
      label = scales::percent(p),
      alpha = ifelse(ind == "65+", 1, 0)
    ),
    position = position_fill(vjust = 0.5)
  ) +
  scale_x_discrete(labels = year_labels) +
  scale_y_continuous(labels = percent_format()) +
  # Use the 0/1 alpha values as-is instead of mapping them through a scale.
  scale_alpha_identity() +
  labs(
    x = "Year",
    y = "% of population",
    title = "Demographic trend in the Netherlands"
  ) +
  scale_fill_manual(values = c("green", "blue", "darkgray"))

Related

overlapping plot with emmeans

I have the following emmeans tables:
emm_1
$emmeans
Order rate SE df asymp.LCL asymp.UCL
1 19.3 1.51 Inf 16.5 22.5
2 26.0 2.33 Inf 21.8 31.0
emm_2
$emmeans
Order rate SE df asymp.LCL asymp.UCL
1 25.6 1.62 Inf 22.6 28.9
2 18.8 2.34 Inf 14.8 24.0
And I'm trying to plot them both together in the same plot:
# Layers shared by both plots: black-and-white theme, centred title, and a
# common x axis running from 0 to 33 with breaks every 5.
common_layers <- list(
  theme_bw(),
  theme(plot.title = element_text(hjust = 0.5)),
  scale_x_continuous(breaks = seq(0, 33, 5), limits = c(0, 33))
)

plot(emm_1, col = "steelblue4") +
  labs(
    title = "Choice1",
    x = "Estimated marginal mean",
    y = "Order"
  ) +
  common_layers

# NOTE(review): par() configures base graphics. The plots here are composed
# with `+`, i.e. they appear to be ggplot objects, so this call presumably
# does not overlay them -- confirm.
par(new = TRUE)

plot(emm_2, col = "green") +
  labs(
    title = "Choice2",
    x = "Estimated marginal mean",
    y = "Order"
  ) +
  common_layers
This runs, although only the second plot is plotted. Is it possible to do this? What do I need to fix?

How to add manually pvalue on geom_barplot with group?

I have this dataframe
# Example data: four groups (bat) observed at three locations (loc).
# `value` is the number of positives out of `tot` observations.
bat <- rep(c("A", "B", "C", "D"), 3)
loc <- c(rep("ab", 4), rep("te", 4), rep("po", 4))
value <- c(17, 8, 14, 20, 2, 9, 11, 18, 5, 17, 7, 14)
tot <- rep(c(30, 45, 36, 58), 3)

# tibble() replaces the deprecated data_frame(). The derived columns are
# computed in one mutate(); later expressions may use earlier ones.
dt <- tibble(bat, loc, value, tot) %>%
  mutate(
    pct = value / tot * 100,              # percentage of positives
    se = sqrt((pct * (100 - pct)) / tot), # standard error of pct
    value2 = tot - value                  # number of negatives
  )
> dt
# A tibble: 12 x 7
bat loc value tot pct se value2
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A ab 17 30 56.7 9.05 13
2 B ab 8 45 17.8 5.70 37
3 C ab 14 36 38.9 8.12 22
4 D ab 20 58 34.5 6.24 38
5 A te 2 30 6.67 4.55 28
6 B te 9 45 20 5.96 36
7 C te 11 36 30.6 7.68 25
8 D te 18 58 31.0 6.07 40
9 A po 5 30 16.7 6.80 25
10 B po 17 45 37.8 7.23 28
11 C po 7 36 19.4 6.60 29
12 D po 14 58 24.1 5.62 44
I ran Fisher's exact test between each pair of bat within each loc, using the following code.
# One subset of the data per location.
dt_ab <- dt %>% filter(loc == "ab")
dt_po <- dt %>% filter(loc == "po")
dt_te <- dt %>% filter(loc == "te")

#' Pairwise Fisher exact tests between the rows of one location's data.
#'
#' @param d A data frame with columns `bat`, `loc`, `value` and `value2`.
#'   Every pair of rows forms a 2x2 table from `value` and `value2`.
#' @return A data frame with one row per pair and the columns `group1`,
#'   `group2`, `loc`, `odds_ratio` and `p`. It has zero rows when `d` has
#'   fewer than two rows.
pairwise_fisher <- function(d) {
  n <- nrow(d)
  if (n < 2) {
    # No pair can be formed; combn() would fail (or, for a single index,
    # silently treat it as a count), so return an empty result instead.
    return(data.frame(
      group1 = character(0),
      group2 = character(0),
      loc = character(0),
      odds_ratio = numeric(0),
      p = numeric(0)
    ))
  }

  # Each row of `pairs` holds the two row indices of one comparison.
  pairs <- t(combn(seq_len(n), 2))

  rows <- lapply(seq_len(nrow(pairs)), function(i) {
    pair <- pairs[i, ]
    test <- fisher.test(d[pair, c("value", "value2")])
    data.frame(
      group1 = d$bat[pair[1]],
      group2 = d$bat[pair[2]],
      loc = d$loc[pair[1]],
      odds_ratio = as.numeric(test$estimate),
      p = as.numeric(test$p.value)
    )
  })

  do.call(rbind, rows)
}

# ab
res_ab <- pairwise_fisher(dt_ab)
res_ab

# po
res_po <- pairwise_fisher(dt_po)
res_po

# te
res_te <- pairwise_fisher(dt_te)
res_te

# All locations stacked in the order ab, po, te.
res <- rbind(res_ab, res_po, res_te)
res
group1 group2 loc odds_ratio p
1 A B ab 5.8817685 0.0009288055
2 A C ab 2.0321378 0.2157927844
3 A D ab 2.4578341 0.0678462682
4 B C ab 0.3445393 0.0451456088
5 B D ab 0.4143084 0.0750566107
6 C D ab 1.2066231 0.6665182345
7 A B po 0.3341536 0.0700086821
8 A C po 0.8309230 1.0000000000
9 A D po 0.6317547 0.5859688210
10 B C po 2.4870606 0.0896596469
11 B D po 1.8959538 0.1934540389
12 C D po 0.7608147 0.7994419705
13 A B te 0.2899040 0.1823689517
14 A C te 0.1664441 0.0270616991
15 A D te 0.1614577 0.0141330017
16 B C te 0.5722244 0.3084931936
17 B D te 0.5586957 0.2609751992
18 C D te 0.9780082 1.0000000000
Now I would like to manually add the p-value and the odds ratio for each pair to the following barplot.
I saw that the stat_pvalue_manual function can be used for this, but I am having difficulty using it. Could someone please have a look at my code and tell me what is wrong? Below are the code and the plot I obtained.
# Grouped bar chart of pct per location, one bar per bat, with error bars,
# value labels and manually supplied p-value brackets.
# The fill aesthetic uses `bat`: `dt` has no `cat` column.
bp_tab_pct <- ggplot(dt, aes(x = loc, y = pct, fill = bat)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    values = c("A" = "#5977FF", "B" = "#FF7F50", "C" = "#00CED1", "D" = "#FFFF33")
  ) +
  labs(fill = "") +
  # NOTE(review): anything positioned above 100 falls outside this limit; the
  # author's follow-up note in this thread reports that it hid part of the
  # p-value annotations.
  ylim(c(0, 100)) +
  ggtitle(label = "") +
  # Label placed just above the top of the error bar: "<pct> % (value / tot)".
  geom_text(
    aes(
      y = pct + se + 2,
      label = paste(round(pct, 2), "%", "\n", "(", value, "/", tot, ")")
    ),
    color = "black",
    size = 5,
    position = position_dodge(width = 0.9)
  ) +
  # All theme tweaks in one call (the duplicated plot.title setting and the
  # stray trailing comma of the original are gone).
  theme(
    plot.title = element_text(color = "black", size = 15, face = "bold.italic"),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_blank()
  ) +
  geom_errorbar(
    aes(ymin = pct - se, ymax = pct + se),
    width = .2,
    position = position_dodge(.9)
  ) +
  stat_pvalue_manual(
    res,
    y.position = 35,
    step.increase = 0.1,
    label = "p",
    inherit.aes = FALSE
  )
enter image description here
I also tried with facets instead of groups, but I still don't get the plot I want.
It seems that the stat_pvalue_manual function considers only the first 10 rows, and I don't know why.
# Faceted variant: one panel per location, one bar per bat, with error bars,
# value labels and manually supplied p-value brackets.
bp_tab_pct <- ggplot(dt, aes(x = bat, y = pct, fill = bat)) +
  facet_wrap(~loc) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    values = c("A" = "#5977FF", "B" = "#FF7F50", "C" = "#00CED1", "D" = "#FFFF33")
  ) +
  labs(fill = "") +
  # NOTE(review): anything positioned above 100 falls outside this limit; the
  # author's follow-up note in this thread reports that it hid part of the
  # p-value annotations.
  ylim(c(0, 100)) +
  ggtitle(label = "") +
  # Label placed just above the top of the error bar: "<pct> % (value / tot)".
  geom_text(
    aes(
      y = pct + se + 2,
      label = paste(round(pct, 2), "%", "\n", "(", value, "/", tot, ")")
    ),
    color = "black",
    size = 5,
    position = position_dodge(width = 0.9)
  ) +
  # All theme tweaks in one call (the duplicated plot.title setting and the
  # stray trailing comma of the original are gone).
  theme(
    plot.title = element_text(color = "black", size = 15, face = "bold.italic"),
    axis.text.x = element_text(size = 16),
    axis.text.y = element_text(size = 14),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_blank()
  ) +
  geom_errorbar(
    aes(ymin = pct - se, ymax = pct + se),
    width = .2,
    position = position_dodge(.9)
  ) +
  stat_pvalue_manual(
    res,
    y.position = 35,
    step.increase = 0.1,
    label = "p",
    inherit.aes = FALSE
  )
enter image description here
I understood the problem. It was due to the ylim masking the highest values on the y axis. Now I would like to be able to adjust the y position of each p-value, as I did for the pct labels shown above the bars, but I will post another question for that.
enter image description here

Creating a bar graph with several variables in the x axis

I have the following dataset (graph_data):
# A tibble: 18 x 7
construction phase group mean se se_top se_bottom
<chr> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 hacer pre-test heritage 7.67 3.67 11.3 4
2 hacer treatment heritage 15.5 3.00 18.5 12.5
3 hacer post-test heritage 9.83 4.25 14.1 5.58
4 acc pre-test heritage 0.166 0.166 0.332 0
5 acc treatment heritage 4.33 2.67 7.00 1.67
6 acc post-test heritage 0.166 0.166 0.332 0
7 spe pre-test heritage 2.33 1.36 3.69 0.975
8 spe treatment heritage 6.67 2.69 9.36 3.98
9 spe post-test heritage 0.833 0.477 1.31 0.356
10 hacer pre-test monolingual 1 0.707 1.71 0.293
11 hacer treatment monolingual 1 0.577 1.58 0.423
12 hacer post-test monolingual 0.25 0.25 0.5 0
13 acc pre-test monolingual 0 0 0 0
14 acc treatment monolingual 1 0.577 1.58 0.423
15 acc post-test monolingual 0 0 0 0
16 spe pre-test monolingual 4 3.37 7.37 0.634
17 spe treatment monolingual 15.8 2.36 18.1 13.4
18 spe post-test monolingual 3.5 3.18 6.68 0.325
I want to create a bar graph using ggplot2 with the following conditions:
y axis: mean
x axis: phase + construction
facet: group
error bars: standard error (se_top, se_bottom)
I have used this code:
# Bar chart of the mean per phase, one panel per group, with error bars
# spanning se_bottom to se_top.
graph_data %>%
  # The data arrives through the pipe; aes() takes only aesthetic mappings,
  # so the data frame is not repeated inside it.
  ggplot(aes(x = phase, y = mean, fill = phase)) +
  geom_bar(stat = "identity", color = "black", position = "dodge") +
  scale_y_continuous(limits = c(0, 20)) +
  labs(x = "Phase", y = "Average number of targets produced") +
  facet_wrap(~group) +
  geom_errorbar(
    aes(ymin = se_bottom, ymax = se_top),
    width = .2,
    position = position_dodge(.9)
  ) +
  scale_fill_manual(values = c("#90EE90", "#3CB371", "#2E8B57")) +
  # theme_classic() is a complete theme and replaces every earlier theme()
  # setting, so it has to come before the individual tweaks.
  theme_classic() +
  theme(
    text = element_text(size = 20),
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 15, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
However, in the graph that I get, columns are stacked on top of each other, even though I have used position = "dodge" in my code:
What should I change in order to get the graph I want?
Maybe this can be useful:
library(ggplot2)
#Code
# One bar per phase/construction combination on the x axis, one panel per
# group, with error bars spanning se_bottom to se_top.
graph_data %>%
  # The data arrives through the pipe; aes() takes only aesthetic mappings,
  # so the data frame is not repeated inside it.
  ggplot(aes(
    x = interaction(phase, construction),
    y = mean,
    fill = phase,
    group = group
  )) +
  geom_bar(stat = "identity", color = "black", position = position_dodge(0.9)) +
  scale_y_continuous(limits = c(0, 20)) +
  labs(x = "Phase", y = "Average number of targets produced") +
  facet_wrap(~group) +
  geom_errorbar(
    aes(ymin = se_bottom, ymax = se_top),
    width = .2,
    position = position_dodge(.9)
  ) +
  scale_fill_manual(values = c("#90EE90", "#3CB371", "#2E8B57")) +
  # theme_classic() is a complete theme and replaces every earlier theme()
  # setting, so it has to come before the individual tweaks.
  theme_classic() +
  theme(
    text = element_text(size = 20),
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 15, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
Output:
Update: In order to get some order, try this:
#Code 2
# Same chart, with the x axis ordered by phase (pre-test, treatment,
# post-test) and then by construction.
graph_data %>%
  mutate(
    phase = factor(
      phase,
      levels = c("pre-test", "treatment", "post-test"),
      ordered = TRUE
    )
  ) %>%
  arrange(phase) %>%
  # `conc` combines phase and construction; its levels follow the row order
  # produced by arrange(), which fixes the bar order on the x axis.
  mutate(
    conc = paste(as.character(phase), construction),
    conc = factor(conc, levels = unique(conc), ordered = TRUE)
  ) %>%
  # The data arrives through the pipe; aes() takes only aesthetic mappings,
  # so the (unmodified) global data frame is not repeated inside it.
  ggplot(aes(x = conc, y = mean, fill = phase, group = group)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge(0.9)) +
  scale_y_continuous(limits = c(0, 20)) +
  labs(x = "Phase", y = "Average number of targets produced") +
  facet_wrap(~group) +
  geom_errorbar(
    aes(ymin = se_bottom, ymax = se_top),
    width = .2,
    position = position_dodge(.9)
  ) +
  scale_fill_manual(values = c("#90EE90", "#3CB371", "#2E8B57")) +
  # theme_classic() is a complete theme and replaces every earlier theme()
  # setting, so it has to come before the individual tweaks.
  theme_classic() +
  theme(
    text = element_text(size = 20),
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 15, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
Output:

Correlation Coefficients between population and suicide in Rstudio

Percentage Change
# Yearly totals for Japan: keep only the Japanese rows and the three columns
# needed, then sum suicides and population within each year.
japan_rows <- filter(dataset, country == "Japan")
japan_cols <- select(japan_rows, population, suicides_no, year)
selectedCountry <- summarise(
  group_by(japan_cols, year),
  s_count = sum(suicides_no),
  p_count = sum(population)
)
year s_count p_count
<dbl> <dbl> <dbl>
1 1979 20711 107268500
2 1980 20416 108473500
3 1981 19976 109674700
4 1982 20535 110722900
5 1983 24853 111070000
6 1984 24221 111950000
I want to find the correlation between population and suicide after I aggregate population and suicide like this
# Percentage change of `x` relative to the previous element. The first
# element is compared with itself, so its change is 0.
pct_change <- function(x) {
  previous <- lag(x, default = first(x))
  100 * (x - previous) / previous
}

# Year-over-year percentage changes of suicides and population, plus their
# correlation repeated on every row.
percentage <- selectedCountry %>%
  arrange(year) %>%
  mutate(
    pct.chg.s = pct_change(s_count),
    pct.chg.p = pct_change(p_count),
    correlation = cor(pct.chg.s, pct.chg.p)
  )
head(percentage)
I end up with a result
year s_count p_count pct.chg.s pct.chg.p correlation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1979 20711 107268500 0 0 0.00789
2 1980 20416 108473500 -1.42 1.12 0.00789
3 1981 19976 109674700 -2.16 1.11 0.00789
4 1982 20535 110722900 2.80 0.956 0.00789
5 1983 24853 111070000 21.0 0.313 0.00789
6 1984 24221 111950000 -2.54 0.792 0.00789
I then plot the correlation between the two variables in two ways (I am not sure whether this is correct):
# Scatter plot with regression line, confidence band and Pearson coefficient.
# Here suicides are on the x axis and population on the y axis.
ggscatter(
  percentage,
  x = "pct.chg.s",
  y = "pct.chg.p",
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "pearson",
  xlab = "Percentage Change in Suicide",
  ylab = "Percentage Change in Population",
  color = "blue",
  shape = 19,
  palette = c("#00AFBB", "#E7B800", "#FC4E07"),
  ellipse = TRUE,
  mean.point = TRUE,
  star.plot = TRUE
)

# Same relationship with plain ggplot2 and a linear fit. Here the axes are
# the other way round (population on x, suicides on y), and the labels
# follow the mapped variables.
ggplot(percentage, aes(x = pct.chg.p, y = pct.chg.s)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(
    title = "Correlation between Population and Suicides",
    x = "Percentage Change in Population",
    y = "Percentage Change in Suicide"
  )

How to change the linetype of one of two lines

I would like to change the linetype of one of my two lines in the plot, only making "line1" into a dashed one.
My plot:
My data looks like this:
Year Sex value Rate Group
<dbl> <chr> <dbl> <dbl> <chr>
1 1912 Female 18 1.14 A
2 1912 Male 52 0.893 L
3 1913 Female 25 1.02 A
4 1913 Male 42 1.05 L
5 1914 Female 14 1.26 A
6 1914 Male 67 1.29 L
7 1915 Female 25 1.32 A
8 1915 Male 61 1.45 L
9 1916 Female 32 1.52 A
10 1916 Male 71 1.64 L
11 1917 Female 42 2.01 A
12 1917 Male 92 1.87 L
My code:
# Bars show `value` per year, filled by sex; lines show Rate * 100 per year,
# coloured by group. Both layers share Year on the x axis.
data %>%
  ggplot(aes(x = Year)) +
  geom_col(
    aes(y = value, fill = Sex),
    width = 0.8,
    alpha = 0.8
  ) +
  geom_line(
    aes(y = Rate * 100, colour = Group),
    size = 1.0
  ) +
  scale_colour_manual(
    labels = c("line1", "line2"),
    values = c("red", "blue")
  ) +
  scale_fill_manual(values = c("Female" = "green", "Male" = "black")) +
  # Legend titles: "Number" for the bar fill, "Ratio" for the line colour.
  guides(
    fill = guide_legend(title = "Number"),
    color = guide_legend(title = "Ratio")
  )
Could anyone help? I tried for quite a while but failed. Thanks in advance.
As suggested in comments, and using the "name" term in each scale_*_manual to specify the legend names:
# Bars show `value` per year, filled by sex; lines show Rate * 100 per year.
# Group is mapped to both colour and linetype. The two scales use the same
# name and labels.
data %>%
  ggplot(aes(x = Year)) +
  geom_col(
    aes(y = value, fill = Sex),
    width = 0.8,
    alpha = 0.8
  ) +
  geom_line(
    aes(y = Rate * 100, colour = Group, linetype = Group),
    size = 1.0
  ) +
  scale_colour_manual(
    name = "Ratio",
    labels = c("line1", "line2"),
    values = c("red", "blue")
  ) +
  # First group dashed, second group solid.
  scale_linetype_manual(
    name = "Ratio",
    labels = c("line1", "line2"),
    values = c("dashed", "solid")
  ) +
  scale_fill_manual(
    name = "Number",
    values = c("Female" = "green", "Male" = "black")
  )

Resources