Y axis values different from actual column in dataset in R - r

I am currently working with the "world bank islands" dataset, and I am trying to plot population against country for each year. Below is the code I have written so far.
# Load ggplot2 for plotting.
library(ggplot2)
# Strongly prefer fixed notation over scientific notation when printing numbers.
options(scipen = 999)
# Read the raw data from a local CSV file.
bank <- read.csv("C:/Users/True Gamer/OneDrive/Desktop/world_bank_international_arrivals_islands.csv")
# Treat empty strings and "." placeholders as missing values.
bank[bank == "" | bank == "."] <- NA
# Convert every column to numeric, one column at a time.
bank$country <- as.numeric(bank$country)
bank$year <- as.numeric(bank$year)
bank$areakm2 <- as.numeric(bank$areakm2)
bank$pop <- as.numeric(bank$pop)
bank$gdpnom <- as.numeric(bank$gdpnom)
bank$flights...WB <- as.numeric(bank$flights...WB)
bank$hotels <- as.numeric(bank$hotels)
bank$hotrooms <- as.numeric(bank$hotrooms)
bank$receipt <- as.numeric(bank$receipt)
bank$ovnarriv <- as.numeric(bank$ovnarriv)
bank$dayvisit <- as.numeric(bank$dayvisit)
bank$arram <- as.numeric(bank$arram)
bank$arreur <- as.numeric(bank$arreur)
bank$arraus <- as.numeric(bank$arraus)
# Inspect the resulting column types.
str(bank)
# Bar chart of pop by country with fill mapped to the numeric year column.
# No position is given, so bars sharing a country are stacked (the default),
# which makes each bar's height the sum of pop over the years.
plot1 <- ggplot(bank, aes(x=country,y=pop)) + geom_bar(stat = "identity",aes(fill=year)) + ggtitle("Population of each country yearwise") + xlab("Countries") + ylab("Population")
# Draw the plot.
plot1
However, when I do this, the y values shown on the graph are different from the actual y values. This is the link to the dataset

The problem is that you are stacking the bars (this is default behaviour). Also, geom_bar(stat = "identity") is just a long way of writing geom_col. One further point to note is that since all your columns are numeric, the single line:
bank <- as.data.frame(lapply(bank, as.numeric))
replaces all your individual numeric conversions.
The plot you are trying to create would be something like this:
# Dodged bars: one bar per year inside each country, so every bar's height is
# that year's population instead of a stacked total.
ggplot(bank, aes(x = country, y = pop, fill = factor(year))) +
  geom_col(position = "dodge") +
  # One tick per country code, and thousands separators on the population axis.
  scale_x_continuous(breaks = 1:27) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Population of each country yearwise",
    x = "Countries",
    y = "Population",
    fill = "Year"
  )
However, it would probably be best to present your data in a different way. Perhaps, if you are comparing population growth, something like this would be better:
# Population over time, one panel per country, each panel with its own y range.
ggplot(bank, aes(x = year, y = pop)) +
  # Plain lines with no position adjustment: dodging places bars side by side
  # and serves no purpose for a single line per panel.
  geom_line(aes(color = factor(country))) +
  ggtitle("Population of each country yearwise") +
  xlab("Year") +
  ylab("Population") +
  facet_wrap(. ~ country, scales = "free_y", nrow = 6) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = c(0, 5, 10)) +
  theme_minimal() +
  # Colour repeats what the panel strip already shows, so the legend is hidden.
  theme(legend.position = "none")
Or with bars:
# Same faceted view as the line chart, drawn with bars: one panel per country,
# each with its own y range.
ggplot(bank, aes(x = year, y = pop, fill = factor(country))) +
  geom_col(position = "dodge") +
  facet_wrap(. ~ country, nrow = 6, scales = "free_y") +
  scale_x_continuous(breaks = c(0, 5, 10)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Population of each country yearwise",
    x = "Year",
    y = "Population"
  ) +
  theme_minimal() +
  # Fill repeats what the panel strip already shows, so the legend is hidden.
  theme(legend.position = "none")

Related

geom_text in barplot to show frequency over bars using R

I have this bar chart, and I want to show the count above each bar; however, my code shows the number 1 inside the bars.
What I have:
What I am trying to make:
# Load ggplot2 for plotting.
library(ggplot2)
# 1. Read data (semicolon separated, which is what read.csv2 expects)
df = read.csv2(text = "Id;Date
1;2021-06-09
2;2021-06-08
3;2021-06-08
4;2021-06-09
5;2021-06-09")
# 2. Build and print the frequency table: one row per distinct Date, with its
#    count in the Freq column.
df_date <- df[, "Date"]
df_date <- as.data.frame(table(df_date))
colnames(df_date)[which(names(df_date) == "df_date")] <- "Date" # Set column name to Date
df_date
# 3. Plot bar chart
ggplot(df_date, aes(x = Date, y = Freq)) +
geom_bar(stat = "identity") +
theme_classic() +
ggtitle("Date") +
xlab("Date") +
ylab("Frequency") +
# stat = "count" counts the rows of df_date again; each Date occurs in exactly
# one row there, which is why every label reads 1.
geom_text(stat= "count", aes(label = ..count.., y= ..prop..), vjust = -1)
Since you have already calculated the frequency, use geom_col and label the bars with the precomputed Freq column.
library(ggplot2)
# df_date already holds one row per date with its count in Freq, so both the
# bar heights and the labels read that column directly.
ggplot(df_date, aes(x = Date, y = Freq, label = Freq)) +
  geom_col() +
  # vjust = -1 lifts each label above the top of its bar.
  geom_text(vjust = -1) +
  labs(title = "Date", x = "Date", y = "Frequency") +
  theme_classic()
If you use the original df instead, you can let geom_bar do the counting:
# Bar chart straight from the raw rows: geom_bar() counts the rows per Date.
ggplot(df, aes(x = Date)) +
  geom_bar() +
  theme_classic() +
  ggtitle("Date") +
  xlab("Date") +
  ylab("Frequency") +
  # Label each bar with the count computed by the stat. after_stat(count)
  # replaces the deprecated ..count.. notation.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1)

Faceted bar plot with observation name adjacent to bar for each group and space=free

Using ggplot I'm trying to make something like a faceted barplot where
bars representing the same value are the same size (sort of like space = "free")
names are adjacent to bars (sort of like scales = "free_y")
graphs are generated with code - no trial and error adjustment of size or scale or stuff
I'm open to a multi-plot solution with something like cowplot::plot_grid
Here's a sample dataset.
df <- data.frame(name = c('A very long name','A short name','A really truly long big name that is very long','One shorter name'),
value =c(100,50,10,10),
group = c(2022,2022,2022,2021))
What I'm aiming for would look something like this:
Two things I've tried and rejected:
ggplot(df,
aes(x = name, y = value)) +
geom_col(aes(fill = -value)) +
coord_flip() +
facet_grid(~group, space = "free", scales = "free_x") +
theme(legend.position = "none")
ggplot(df,
aes(x = name, y = value)) +
geom_col(aes(fill = -value)) +
coord_flip() +
facet_wrap(~group, scales = "free_y") +
theme(legend.position = "none")
Here is a solution using vanilla ggplot2, taken from a related ggplot2 issue. You can use the fact that breaks and limits arguments accept functions. Below, we use that to pad limits with dummy names, and then use the breaks function to censor the dummy names. It requires you to know the maximum number of categories on a facet beforehand though.
library(ggplot2)
df <- data.frame(
  name = c(
    "A very long name", "A short name",
    "A really truly long big name that is very long", "One shorter name"
  ),
  value = c(100, 50, 10, 10),
  group = c(2022, 2022, 2022, 2021)
)
# Largest number of distinct names within any one group. It is derived from
# the data so the padding still fits when rows are added or removed.
max_categories <- max(
  vapply(
    split(df$name, df$group),
    function(x) length(unique(x)),
    integer(1)
  )
)
ggplot(df, aes(y = name, x = value)) +
  geom_col(aes(fill = -value)) +
  scale_y_discrete(
    # Pad each panel's limits with dummy names up to max_categories entries,
    # so every panel has the same number of slots and equal bar thickness.
    limits = function(x) {
      y <- paste0("dummy", seq_len(max_categories))
      c(y[seq_len(max_categories - length(x))], x)
    },
    # Drop the dummy names from the breaks so they are not labelled.
    breaks = function(x) {
      x[!startsWith(x, "dummy")]
    }
  ) +
  facet_wrap(~group, scales = "free_y") +
  theme(legend.position = "none")
Created on 2021-05-09 by the reprex package (v0.3.0)
A few side notes: I switched the x and y aesthetics to make coord_flip() unnecessary. Also, you can set scales = "free" and space = "free_x" if you want the panels to adjust their width in response to the data.
With patchwork you could try:
# ggplot2 for the plots, dplyr for filter(), patchwork for combining plots.
library(ggplot2)
library(dplyr)
library(patchwork)
# Sample data: three names in group 2022 and one name in group 2021.
df <- data.frame(name = c('A very long name','A short name','A really truly long big name that is very long','One shorter name'),
value =c(100,50,10,10),
group = c(2022,2022,2022,2021))
# plots could be simplified with a function and appearance edited to suit your needs
# Flipped bar chart for the 2022 rows only.
p2022 <-
ggplot(data = filter(df, group == 2022), aes(x = name, y = value)) +
geom_col(aes(fill = -value)) +
coord_flip() +
labs(x = NULL) +
facet_grid(~group) +
theme(legend.position = "none")
# Flipped bar chart for the 2021 rows only. The value axis runs from 0 to the
# overall maximum of df$value, so the 2021 bar is drawn against the full value
# range and not just its own.
p2021 <-
ggplot(data = filter(df, group == 2021), aes(x = name, y = value)) +
geom_col(aes(fill = -value)) +
coord_flip() +
scale_y_continuous(limits = c(0, max(df$value)))+
labs(x = NULL) +
facet_grid(~group) +
theme(legend.position = "none")
# define the plotting layout
# Per patchwork's textual design syntax, each line of the string is a layout
# row: "1" is the first plot added, "2" the second, and "#" an empty cell. The
# first plot gets one row and the second spans three rows.
design <- "
12
#2
#2"
# plot
p2021 + p2022 + plot_layout(design = design)
Created on 2021-05-09 by the reprex package (v2.0.0)
Another approach could be:
# Simplest variant: flipped bars with one facet per group, leaving every
# facet_wrap() option other than the faceting variable at its default.
ggplot(df,
aes(x = name, y = value)) +
geom_col(aes(fill = -value)) +
coord_flip() +
facet_wrap(~group)+
theme(legend.position = "none")
This could be an alternative approach:
# Groups stacked as facet rows (group ~ .), with both scales and panel space
# set to "free". The plot is only stored in p here; print p to draw it.
p <- ggplot(df,
aes(x = name, y = value)) +
geom_col(aes(fill = -value)) +
coord_flip() +
facet_grid(group~., space = "free", scales = "free") +
theme(legend.position = "none")

Changing facet labels in facet_wrap() ggplot2

So the code below is working w/out errors, and I am trying to fix the following issue.
First, I am trying to change the group name shown on each graph so that it says, for instance, "< 1500 dollars" for the group of workers earning $1500 or less, and so on.
I tried this solution: to change the underlying factor level names but I keep getting this error:
"Error: unexpected ',' in ""< 1500 Dollars",""
outflows <- Wage_Outflows
levels(outflows$wage_group)
"< 1500", "1501 ~ 2999", "3000",
levels(outflows$wage_group) <- c("< 1500 Dollars", "1501 ~ 2999 Dollars", "3000 Dollars")
text.on.each.panel <-"Dollars"
p1 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="< 1500",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(4000000, 6500000, by = 400000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p2 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="1501 ~ 2999",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(800000, 1100000, by = 20000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p3 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="3000",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(50000, 120000, by = 5000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
grid.arrange(p1, p2,p3, ncol=1)
For your first question have a look at the labeller argument in the facet_wrap function.
And for your second question the labs function might be the solution.
# Panel for the "< 1500" wage group with a relabelled facet strip and a
# descriptive y axis title.
p1 <- ggplot(
  Wage_Outflows[Wage_Outflows$wage_group == "< 1500", ],
  aes(x = year, y = labor)
) +
  geom_point() +
  scale_y_continuous(breaks = seq(4000000, 6500000, by = 400000)) +
  labs(y = "Number of workers") +
  # labeller maps the factor level (backtick-quoted because it is not a
  # syntactic name) to the text shown in the strip. The label is kept on one
  # line so that no line break ends up inside the strip text.
  facet_wrap(
    ~ wage_group,
    labeller = labeller(wage_group = c(`< 1500` = "< 1500 dollars"))
  ) +
  theme(axis.title.x = element_blank())
Maybe you can shorten your code like this:
# Example dataset:
# Example dataset: three wage groups, ten years each.
df <- data.frame(
  wage_group = rep(c("A", "B", "C"), each = 10),
  year = rep(2001:2010, times = 3),
  labor = seq(5000, 34000, by = 1000)
)
# One stacked panel per wage group, each with free scales; the labeller swaps
# the raw group codes for readable strip titles.
ggplot(df, aes(x = factor(year), y = labor)) +
  geom_point() +
  facet_wrap(
    ~wage_group,
    ncol = 1,
    scales = "free",
    labeller = labeller(
      wage_group = c(
        `A` = "less than 1500 dollars",
        `B` = "1500-2999 dollars",
        `C` = "more than 3000 dollars"
      )
    )
  ) +
  labs(y = "# of workers") +
  theme(axis.title.x = element_blank())

Make overlapping histograms with geom_histogram

I am trying to make an overlapping histogram like this:
ggplot(histogram, aes = (x), mapping = aes(x = value)) +
geom_histogram(data = melt(tpm_18_L_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
geom_histogram(data = melt(tpm_18_S_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
geom_histogram(data = melt(tpm_18_N_SD), breaks = seq(1,10,by = 1),
aes(y = 100*(..count../sum(..count..))), alpha=0.2) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
My code can only make them plot side by side and I would like to also make them overlap. Thank you! I based mine off of the original post where this came from but it did not work for me. It was originally 3 separate graphs which I combined with grid and ggarrange. It looks like this right now.
Here is the code of the three separate graphs.
SD_18_L <- ggplot(data = melt(tpm_18_L_SD), mapping = aes(x = value)) +
geom_histogram(aes(y = 100*(..count../sum(..count..))), breaks = seq(1, 10, by = 1)) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
SD_18_S <- ggplot(data = melt(tpm_18_S_SD), mapping = aes(x = value)) +
geom_histogram(aes(y = 100*(..count../sum(..count..))), breaks = seq(1, 10, by = 1)) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
SD_18_N <- ggplot(data = melt(tpm_18_N_SD), mapping = aes(x = value)) +
geom_histogram(aes(y = 100*(..count../sum(..count..))), breaks = seq(1, 10, by = 1)) +
facet_wrap(~variable, scales = 'free_x') +
ylim(0, 20) +
ylab("Percentage of Genes") +
xlab("Standard Deviation")
What my graphs look like now:
ggplot expects dataframes in a long format. I'm not sure what your data looks like, but you shouldn't have to call geom_histogram for each category. Instead, get all your data into a single dataframe (you can use rbind for this) in long format (what you're doing already with melt) first, then feed it into ggplot and map fill to whatever your categorical variable is.
Your call to facet_wrap is what puts them in 3 different plots. If you want them all on the same plot, take that line out.
An example using the iris data:
# One histogram layer for all species: fill is mapped to the categorical
# column, so each species gets its own colour within the same panel.
ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
# position = "identity" overlays the species instead of stacking them, and the
# partial transparency (alpha = 0.6) keeps the overlapping regions visible.
geom_histogram(alpha = 0.6, position = "identity")
I decreased alpha in geom_histogram so you can see where colors overlap, and added position = "identity" so observations aren't being stacked. Hope that helps!

How can I add a line to one of the facets?

ggplot(all, aes(x=area, y=nq)) +
geom_point(size=0.5) +
geom_abline(data = levelnew, aes(intercept=log10(exp(interceptmax)), slope=fslope)) + #shifted regression line
scale_y_log10(labels = function(y) format(y, scientific = FALSE)) +
scale_x_log10(labels = function(x) format(x, scientific = FALSE)) +
facet_wrap(~levels) +
theme_bw() +
theme(panel.grid.major = element_line(colour = "#808080"))
And I get this figure
Now I want to add a line to just one of the facets: a dotted vertical line (say, at x = 10,000) in the "major" panel only. How can I do this?
I don't have your data, so I made some up:
# Made-up data: 100 random points split into four groups (a-d) of 25 rows each.
# No seed is set, so the points differ from run to run.
df <- data.frame(x=rnorm(100),y=rnorm(100),z=rep(letters[1:4],each=25))
# Base scatter plot with one panel per group.
ggplot(df,aes(x,y)) +
geom_point() +
theme_bw() +
facet_wrap(~z)
To add a vertical line at x = 1 we can use geom_vline() with a dataframe that has the same faceting variable (in my case z='b', but yours will be levels='major'):
# Scatter plot per group, plus a dotted vertical line at x = 1 in panel "b"
# only. The line's one-row data frame carries the faceting variable z, which
# is what restricts it to that panel.
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_vline(
    data = data.frame(z = "b", xint = 1),
    mapping = aes(xintercept = xint),
    linetype = "dotted"
  ) +
  facet_wrap(~z) +
  theme_bw()
Another way to express this which is possibly easier to generalize (and formatting stuff left out):
# Same idea, but the line's data is a subset of the original data frame, so
# the faceting variable comes along automatically.
ggplot(df, aes(x,y)) +
geom_point() +
facet_wrap(~ z) +
# NOTE(review): the subset keeps every row with z == "b", so the same line is
# presumably drawn once per row, on top of itself.
geom_vline(data = subset(df, z == "b"), aes(xintercept = 1))
The key things being: facet first, then decorate facets by subsetting the original data frame, and put the details in a new aes if possible. Other examples of a similar idea:
# Decorate individual panels by giving each extra layer a subset of df that
# contains only the rows of the target panel.
ggplot(df, aes(x, y)) +
  geom_point() +
  facet_wrap(~ z) +
  # Vertical line at x = 1 in panel "b".
  geom_vline(data = subset(df, z == "b"), aes(xintercept = 1)) +
  # Linear fit, without a confidence band, in panel "c".
  geom_smooth(
    data = subset(df, z == "c"),
    aes(x, y),
    method = "lm",
    se = FALSE
  ) +
  # Text label in panel "d". The subset has one row per point, so the label
  # would be drawn once per row on top of itself; check_overlap = TRUE keeps
  # only the first copy.
  geom_text(
    data = subset(df, z == "d"),
    aes(x = -2, y = 0, label = "Foobar"),
    check_overlap = TRUE
  )

Resources