Related
I am currently working with a dataset of "world bank islands". In that, I am trying to plot the population Vs country graph for each year. Below is the code that I have done.
library(ggplot2)
options(scipen = 999)
bank <- read.csv("C:/Users/True Gamer/OneDrive/Desktop/world_bank_international_arrivals_islands.csv")
bank[bank == "" | bank == "."] <- NA
bank$country <- as.numeric(bank$country)
bank$year <- as.numeric(bank$year)
bank$areakm2 <- as.numeric(bank$areakm2)
bank$pop <- as.numeric(bank$pop)
bank$gdpnom <- as.numeric(bank$gdpnom)
bank$flights...WB <- as.numeric(bank$flights...WB)
bank$hotels <- as.numeric(bank$hotels)
bank$hotrooms <- as.numeric(bank$hotrooms)
bank$receipt <- as.numeric(bank$receipt)
bank$ovnarriv <- as.numeric(bank$ovnarriv)
bank$dayvisit <- as.numeric(bank$dayvisit)
bank$arram <- as.numeric(bank$arram)
bank$arreur <- as.numeric(bank$arreur)
bank$arraus <- as.numeric(bank$arraus)
str(bank)
plot1 <- ggplot(bank, aes(x=country,y=pop)) + geom_bar(stat = "identity",aes(fill=year)) + ggtitle("Population of each country yearwise") + xlab("Countries") + ylab("Population")
plot1
However, when I do this, the y values shown on the graph are different from the actual y values. This is the link to the dataset
The problem is that you are stacking the bars (this is default behaviour). Also, geom_bar(stat = "identity") is just a long way of writing geom_col. One further point to note is that since all your columns are numeric, the single line:
bank <- as.data.frame(lapply(bank, as.numeric))
replaces all your individual numeric conversions.
The plot you are trying to create would be something like this:
ggplot(bank, aes(x = country, y = pop)) +
geom_col(aes(fill = factor(year)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Countries") +
ylab("Population") +
labs(fill = "Year") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = 1:27)
However, it would probably be best to present your data in a different way. Perhaps, if you are comparing population growth, something like this would be better:
ggplot(bank, aes(x = year, y = pop)) +
geom_line(aes(color = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
Or with bars:
ggplot(bank, aes(x = year, y = pop)) +
geom_col(aes(fill = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
I used R language to perform regression analysis/smoothing on the known data. The red line is the fitted regression curve, and the green background is the 95% confidence interval.
Now I want to set different colors for the red line and green confidence interval in this picture by time period (red solid line and gray confidence interval before 2020, green dotted line and blue confidence interval from 2020 to 2030), but I don't know how to set the code.
Here is my code.
data <- data.frame(
year = c(2003:2030),
number = c(40.84,49.2354,51.5988,53.9622,56.3256,64.79,61.0524,63.4158,65.7792,68.1426,76.4,72.8694,75.2328,77.5962,79.9596,76.36,71.99236,71.10268,70.39136,69.80928,69.31588,68.89523,68.53132,68.21586,67.93846,67.69439,67.47674,67.28291),
stringsAsFactors = FALSE
)
p3 <- ggplot(data, aes(x=year, y=number)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), color="red", fill="#69b3a2", se=TRUE) +
theme_ipsum()
p3
This figure is what I have drawn so far.
This figure is what I want in the end.
Thanks!!
This is another try:
library(tidyverse)
library(hrbrthemes)
data %>%
mutate(year_dich = ifelse(year<2015, "<2020", ">=2020")) %>%
ggplot(aes(x=year, y=number, group=year_dich)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se=TRUE,
aes(color = ifelse(data$year<2015, "red", "green"),
fill = ifelse(data$year<2015, "blue", "black"),
linetype = ifelse(data$year<2015, "dashed", "solid"))
)+
theme_ipsum()+
theme(legend.position = "none")
Often people try to do too much inside ggplot - when you have very specific requirements it is best to just work out what you want to plot, then draw it:
library(ggplot2)
library(hrbrthemes)
mod <- lm(formula = number ~ splines::bs(year, 3), data = data)
pred <- as.data.frame(predict(mod, se = TRUE)[1:2])
pred$year <- 2003:2030
pred$number <- pred$fit
pred$pre <- pred$year < 2020
ggplot(pred, aes(x=year, y=number)) +
geom_point(data = data) +
geom_line(aes(color = pre, linetype = pre), size = 1.2) +
geom_ribbon(aes(ymin = fit - 1.96 * se.fit, ymax = fit + 1.96 * se.fit,
fill = pre), alpha = 0.2) +
scale_fill_manual(values = c("dodgerblue", "gray")) +
scale_color_manual(values = c("forestgreen", "red")) +
scale_linetype_manual(values = c(2, 1)) +
theme_ipsum() +
theme(legend.position = "none")
If you want an interpretable formula for the line (since the actual formula for a spline regression is complicated and not easy to recover), you could try a simple polynomial regression, which seems to fit your data about the same as a spline regression:
mod <- lm(number ~ poly(year, 3), data = data)
pred <- as.data.frame(predict(mod, se.fit = TRUE)[1:2])
pred$year <- 2003:2030
pred$number <- pred$fit
pred$pre <- pred$year < 2020
ggplot(pred, aes(x=year, y=number)) +
geom_point(data = data) +
geom_line(aes(color = pre, linetype = pre), size = 1.2) +
geom_ribbon(aes(ymin = fit - 1.96 * se.fit, ymax = fit + 1.96 * se.fit,
fill = pre), alpha = 0.2) +
scale_fill_manual(values = c("dodgerblue", "gray")) +
scale_color_manual(values = c("forestgreen", "red")) +
scale_linetype_manual(values = c(2, 1)) +
theme_ipsum() +
theme(legend.position = "none") +
annotate("text", x = 2020, y = 45,
label = paste("number =",
paste0(format(coef(mod), digits = 3),
c("", " * year", " * year\u00b2", " * year\u00b3"),
collapse = " + ")), fontface = "bold")
Where we can see the equation added as an annotation, retrieved from the regression equation with:
paste("number =",
paste0(format(coef(mod), digits = 3),
c("", " * year", " * year\u00b2", " * year\u00b3"),
collapse = " + "))
#> [1] "number = 66.5 + 27.4 * year + -34.3 * year² + 10.0 * year³"
I want to customize the formula used in geom_smooth like this:
library(MASS)
library(ggplot2)
data("Cars93", package = "MASS")
str(Cars93)
Cars93.log <- transform(Cars93, log.price = log(Price))
log.model <- lm(log.price ~ Horsepower*Origin, data = Cars93.log)
summary(log.model)
plot(log.model)
p <- ggplot(data = Cars93.log, aes(x = Horsepower, y = log.price, colour = Origin)) +
geom_point(aes(shape = Origin, color = Origin)) + # Punkte
facet_grid(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
lm.mod <- function(df) {
y ~ x*Cars93.log$Origin
}
p_smooth <- by(Cars93.log, Cars93.log$Origin,
function(x) geom_smooth(data=x, method = lm, formula = lm.mod(x)))
p + p_smooth
However, I receive the error that the computation failed because of different lengths of my used variables.
length(Cars93.log$log.price)
length(Cars93.log$Origin)
length(Cars93.log$Horsepower)
But when I check the length for each variable they're all the same... Any ideas, what's wrong?
Thanks a lot, Martina
I agree with #Rui Barradas, seems like the issue is the lines for lm.mod and p_smooth and the by function
Once you are making a distinction by Origin (e.g., by doing either facet_wrap or color = Origin) then geom_smooth will automatically run different models for those facets.
p <- ggplot(data = Cars93.log,
aes(x = Horsepower, y = log.price, color = Origin)) +
geom_point(aes(shape = Origin)) +
facet_wrap(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
p + geom_smooth(method = lm, formula = y ~ x)
you can convince yourself that this is the same as the output of log.model by extending the x-axis limits to see where the geom_smooth line would cross the y axis (e.g., + coord_cartesian(xlim = c(0, 300)))
You can also see the difference in the graph if you don't pass color = Origin to the geom_smooth function (essentially what is happening if you comment this out from the first ggplot() initialization):
p <- ggplot(data = Cars93.log,
aes(x = Horsepower, y = log.price)) + # color = Origin)) +
geom_point(aes(shape = Origin)) +
#facet_wrap(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
p + geom_smooth(method = lm, formula = y ~ x)
The colors are added to the ggplot scatter plot based on interaction of two variables : choice and flag (each has two values, therefore, total four combinations). I used faceting based on z value.
library(tidyverse)
x <- runif(10000)
y <- runif(10000)
z <- c(rep(0, 5000), rep(1, 5000))
flag <- c(rep(0, 500), rep(1, 4500), rep(0, 4500), rep(1, 500))
choice <- rep(c(0, 1), 5000)
tbl <- tibble(x, y, z, flag, choice)
scatterplot <- ggplot(tbl,
aes(x = x,
y = y,
color = factor(interaction(choice, flag)))
) +
geom_point(alpha = 0.7,
size = 2) +
scale_color_manual(values = c("blue3", "cyan1", "red3", "orange")) +
facet_grid(z ~ .) +
theme_bw() +
theme(legend.position = "right") +
theme(aspect.ratio = 1) +
ggtitle("Scatter plot")
scatterplot
But I have the following requirement -
z is used for facetting. For z = i, I want points with flag = i to be above, i.e. in the figure below,
for z = 0, blue points (flag = 0) should be over red/orange points.
for z = 1, red/orange points (flag = 1) should be over blue points (as shown)
If I understand you correctly, you are happy with the lower panel, but you need the blue dots in the top panel to be overlaid on the orange dots (at the moment the orange dots are overlaid on the blue dots in both panels).
If this is the case, then calling geom_point a second time with a subsetted data frame where z == 0 & flag == 0 will overlay the appropriate blue points on the top panel without affecting the lower panel.
tbl <- tbl %>%
mutate(col = interaction(choice, flag))
ggplot(tbl, aes(x, y, color = col)) +
geom_point(alpha = 0.7, size = 2) +
geom_point(data = subset(tbl, z == 0 & flag == 0),
alpha = 0.7, size = 2) +
scale_color_manual(values = c("blue3", "cyan1", "red3", "orange")) +
facet_grid(z ~ .) +
theme_bw() +
theme(legend.position = "right") +
theme(aspect.ratio = 1) +
ggtitle("Scatter plot")
Consider this as an option for you. With facets it was complex to set specific order but you can do the same plot using patchwork:
library(tidyverse)
library(patchwork)
#Data
x <- runif(10000)
y <- runif(10000)
z <- c(rep(0, 5000), rep(1, 5000))
flag <- c(rep(0, 500), rep(1, 4500), rep(0, 4500), rep(1, 500))
choice <- rep(c(0, 1), 5000)
tbl <- tibble(x, y, z, flag, choice)
Plots:
#Plot
G1 <- ggplot(subset(tbl,z==0),aes(x = x,y = y,
color = factor(interaction(choice, flag),
levels = rev(unique(interaction(choice, flag))),
ordered = T))) +
geom_point(alpha = 0.7,
size = 2) +
scale_color_manual(values = c("blue3", "cyan1", "red3", "orange")) +
facet_grid(z ~ .) +
theme_bw() +
theme(legend.position = "right") +
theme(aspect.ratio = 1) +
ggtitle("Scatter plot")+
labs(color='Color',x='')+theme(legend.position = 'none')
#Plot 2
G2 <- ggplot(subset(tbl,z==1),aes(x = x,y = y,
color = factor(interaction(choice, flag)))) +
geom_point(alpha = 0.7,
size = 2) +
scale_color_manual(values = c("blue3", "cyan1", "red3", "orange")) +
facet_grid(z ~ .) +
theme_bw() +
theme(legend.position = "right") +
theme(aspect.ratio = 1) +
labs(color='Color')
Final arrange:
#Final plot
G <- G1/G2
G <- G+plot_layout(guides = 'collect')
Output:
So the code below is working w/out errors, and I am trying to fix the following issue.
First, I am trying to change the group name for each graph to say, for instance, "< 1500 dollars" to refer to the group of workers earnings $1500 or less etc...
I tried this solution: to change the underlying factor level names but I keep getting this error:
"Error: unexpected ',' in ""< 1500 Dollars",""
outflows <- Wage_Outflows
levels(outflows$wage_group)
"< 1500", "1501 ~ 2999", "3000",
levels(outflows$wage_group) <- c("< 1500 Dollars", "1501 ~ 2999 Dollars", "3000 Dollars")
text.on.each.panel <-"Dollars"
p1 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="< 1500",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(4000000, 6500000, by = 400000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p2 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="1501 ~ 2999",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(800000, 1100000, by = 20000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p3 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="3000",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(50000, 120000, by = 5000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
grid.arrange(p1, p2,p3, ncol=1)
For your first question have a look at the labeller argument in the facet_wrap function.
And for your second question the labs function might be the solution.
p1 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="< 1500",],
aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(4000000, 6500000, by = 400000)) +
labs(y = "Number of workers") +
facet_wrap(~ wage_group, labeller = labeller(wage_group = c(`< 1500` = "< 1500
dollars"))) +
theme(axis.title.x = element_blank())
Maybe you can shorten your code like this:
# Example dataset:
df <- data.frame(wage_group = rep(c("A","B","C"), each = 10),
year = 2001:2010,
labor = seq(5000,34000, 1000))
ggplot(df , aes(x = factor(year), y = labor)) +
geom_point() +
labs(y = "# of workers") +
facet_wrap(~wage_group, ncol = 1, scales = "free",
labeller = labeller(wage_group = c(`A` = "less than 1500 dollars",
`B` = "1500-2999 dollars", `C` = "more than 3000 dollars"))) +
theme(axis.title.x = element_blank())