So the code below is working w/out errors, and I am trying to fix the following issue.
First, I am trying to change the group name for each graph to say, for instance, "< 1500 dollars" to refer to the group of workers earnings $1500 or less etc...
I tried this solution: to change the underlying factor level names but I keep getting this error:
"Error: unexpected ',' in ""< 1500 Dollars",""
outflows <- Wage_Outflows
levels(outflows$wage_group)
"< 1500", "1501 ~ 2999", "3000",
levels(outflows$wage_group) <- c("< 1500 Dollars", "1501 ~ 2999 Dollars", "3000 Dollars")
text.on.each.panel <-"Dollars"
p1 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="< 1500",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(4000000, 6500000, by = 400000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p2 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="1501 ~ 2999",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(800000, 1100000, by = 20000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p3 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="3000",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(50000, 120000, by = 5000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
grid.arrange(p1, p2,p3, ncol=1)
For your first question have a look at the labeller argument in the facet_wrap function.
And for your second question the labs function might be the solution.
p1 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="< 1500",],
aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(4000000, 6500000, by = 400000)) +
labs(y = "Number of workers") +
facet_wrap(~ wage_group, labeller = labeller(wage_group = c(`< 1500` = "< 1500
dollars"))) +
theme(axis.title.x = element_blank())
Maybe you can shorten your code like this:
# Example dataset:
df <- data.frame(wage_group = rep(c("A","B","C"), each = 10),
year = 2001:2010,
labor = seq(5000,34000, 1000))
ggplot(df , aes(x = factor(year), y = labor)) +
geom_point() +
labs(y = "# of workers") +
facet_wrap(~wage_group, ncol = 1, scales = "free",
labeller = labeller(wage_group = c(`A` = "less than 1500 dollars",
`B` = "1500-2999 dollars", `C` = "more than 3000 dollars"))) +
theme(axis.title.x = element_blank())
Related
I am currently working with a dataset of "world bank islands". In that, I am trying to plot the population Vs country graph for each year. Below is the code that I have done.
library(ggplot2)
options(scipen = 999)
bank <- read.csv("C:/Users/True Gamer/OneDrive/Desktop/world_bank_international_arrivals_islands.csv")
bank[bank == "" | bank == "."] <- NA
bank$country <- as.numeric(bank$country)
bank$year <- as.numeric(bank$year)
bank$areakm2 <- as.numeric(bank$areakm2)
bank$pop <- as.numeric(bank$pop)
bank$gdpnom <- as.numeric(bank$gdpnom)
bank$flights...WB <- as.numeric(bank$flights...WB)
bank$hotels <- as.numeric(bank$hotels)
bank$hotrooms <- as.numeric(bank$hotrooms)
bank$receipt <- as.numeric(bank$receipt)
bank$ovnarriv <- as.numeric(bank$ovnarriv)
bank$dayvisit <- as.numeric(bank$dayvisit)
bank$arram <- as.numeric(bank$arram)
bank$arreur <- as.numeric(bank$arreur)
bank$arraus <- as.numeric(bank$arraus)
str(bank)
plot1 <- ggplot(bank, aes(x=country,y=pop)) + geom_bar(stat = "identity",aes(fill=year)) + ggtitle("Population of each country yearwise") + xlab("Countries") + ylab("Population")
plot1
However, when I do this, the y values shown on the graph are different from the actual y values. This is the link to the dataset
The problem is that you are stacking the bars (this is default behaviour). Also, geom_bar(stat = "identity") is just a long way of writing geom_col. One further point to note is that since all your columns are numeric, the single line:
bank <- as.data.frame(lapply(bank, as.numeric))
replaces all your individual numeric conversions.
The plot you are trying to create would be something like this:
ggplot(bank, aes(x = country, y = pop)) +
geom_col(aes(fill = factor(year)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Countries") +
ylab("Population") +
labs(fill = "Year") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = 1:27)
However, it would probably be best to present your data in a different way. Perhaps, if you are comparing population growth, something like this would be better:
ggplot(bank, aes(x = year, y = pop)) +
geom_line(aes(color = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
Or with bars:
ggplot(bank, aes(x = year, y = pop)) +
geom_col(aes(fill = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
I have data similar to the one I've created below:
set.seed(42)
dates <- seq.Date(as.Date("2012-08-01"), as.Date("2014-08-30"), "day")
n <- length(dates)
dat <- data.frame(date = dates,
category = rep(LETTERS[1:4], n/2),
daily_count = sample(18:100, n, replace=TRUE))
#following to be used for creating dotted lines; highlighting a certain point for each category
point_dates <- sample(seq.Date(as.Date("2012-08-01"), as.Date("2014-08-30"), "month"),4)
category_name <- list("A", "B", "C", "D")
I am creating a boxplot for each category using facet_wrap, and point_dates are important for me as they shows the point of interest in each boxplot. This is how I am creating the plot:
ggplot(dat) +
geom_boxplot(aes(y = daily_count,
x = yearmonth(date),
group = paste(yearmonth(date), category),
fill = category)) +
labs(x = 'Month & Year',
y = 'Count',
fill = "Category") +
theme_bw() +
theme(axis.text=element_text(size=10),
axis.title=element_text(size=10),
legend.position="none") +
geom_vline(xintercept = lubridate::ymd("2013-08-23"), linetype=1, colour="red", size = 0.5)+
sapply(point_dates[[1]], function(xint) geom_vline(data=filter(dat,
category==category_name[[1]]),aes(xintercept = xint),
linetype=3, colour="black", size = 1))+
sapply(point_dates[[2]], function(xint) geom_vline(data=filter(dat,
category==category_name[[2]]),aes(xintercept = xint),
linetype=3, colour="black", size = 1))+
sapply(point_dates[[3]], function(xint) geom_vline(data=filter(dat,
category==category_name[[3]]),aes(xintercept = xint),
linetype=3, colour="black", size = 1))+
sapply(point_dates[[4]], function(xint) geom_vline(data=filter(dat,
category==category_name[[4]]),aes(xintercept = xint),
linetype=3, colour="black", size = 1))+
facet_wrap(~category, nrow = 2)
And this is the output of the code:
The plot is being created just fine. My question is, is there any better way (loop may be?) that would help me get rid of writing sapply multiple times. Because the number of categories may change (increase/decrease), that would be to change the code everytime.
Any guidance please?
I'm not sure that this is the best way, but you could do all of them in one go using map2 from tidyr. This would save you time from having to write out individual sapply.
library(tidyverse)
ggplot(dat) +
geom_boxplot(aes(y = daily_count,
x = yearmonth(date),
group = paste(yearmonth(date), category),
fill = category)) +
labs(x = 'Month & Year',
y = 'Count',
fill = "Category") +
theme_bw() +
theme(axis.text=element_text(size=10),
axis.title=element_text(size=10),
legend.position="none") +
geom_vline(xintercept = lubridate::ymd("2013-08-23"),
linetype=1, colour="red", size = 0.5)+
map2(point_dates, category_name,
~geom_vline(data=filter(dat, category==.y),
aes(xintercept = .x),
linetype=3, colour="black", size = 1))+
facet_wrap(~category, nrow = 2)
You can use map() to iterate the calls to sapply():
ggplot(dat) +
geom_boxplot(aes(y = daily_count,
x = yearmonth(date),
group = paste(yearmonth(date), category),
fill = category)) +
labs(x = 'Month & Year',
y = 'Count',
fill = "Category") +
theme_bw() +
theme(axis.text=element_text(size=10),
axis.title=element_text(size=10),
legend.position="none") +
geom_vline(xintercept = lubridate::ymd("2013-08-23"), linetype=1, colour="red", size = 0.5)+
map(seq_along(unique(dat$category)), ~sapply(point_dates[[.]], function(xint) geom_vline(data=filter(dat,
category==category_name[[.]]),aes(xintercept = xint),
linetype=3, colour="black", size = 1))) +
facet_wrap(~category, nrow = 2)
If i got it correct, you have already defined the dates for each group. So make the first plot:
library(ggplot2)
library(tsibble)
g = ggplot(dat) +
geom_boxplot(aes(y = daily_count,
x = yearmonth(date),
group = paste(yearmonth(date), category),
fill = category)) +
labs(x = 'Month & Year',
y = 'Count',
fill = "Category") +
theme_bw() +
theme(axis.text=element_text(size=10),
axis.title=element_text(size=10),
legend.position="none") +
geom_vline(xintercept = lubridate::ymd("2013-08-23"), linetype=1, colour="red", size = 0.5)+
facet_wrap(~category, nrow = 2)
You just need to provide a new data frame and call geom_vline:
tmp = data.frame(category=unlist(category_name),date=point_dates)
g + geom_vline(data=tmp,aes(xintercept = date),
linetype=3, colour="black", size = 1)
I want to customize the formula used in geom_smooth like this:
library(MASS)
library(ggplot2)
data("Cars93", package = "MASS")
str(Cars93)
Cars93.log <- transform(Cars93, log.price = log(Price))
log.model <- lm(log.price ~ Horsepower*Origin, data = Cars93.log)
summary(log.model)
plot(log.model)
p <- ggplot(data = Cars93.log, aes(x = Horsepower, y = log.price, colour = Origin)) +
geom_point(aes(shape = Origin, color = Origin)) + # Punkte
facet_grid(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
lm.mod <- function(df) {
y ~ x*Cars93.log$Origin
}
p_smooth <- by(Cars93.log, Cars93.log$Origin,
function(x) geom_smooth(data=x, method = lm, formula = lm.mod(x)))
p + p_smooth
However, I receive the error that the computation failed because of different lengths of my used variables.
length(Cars93.log$log.price)
length(Cars93.log$Origin)
length(Cars93.log$Horsepower)
But when I check the length for each variable they're all the same... Any ideas, what's wrong?
Thanks a lot, Martina
I agree with #Rui Barradas, seems like the issue is the lines for lm.mod and p_smooth and the by function
Once you are making a distinction by Origin (e.g., by doing either facet_wrap or color = Origin) then geom_smooth will automatically run different models for those facets.
p <- ggplot(data = Cars93.log,
aes(x = Horsepower, y = log.price, color = Origin)) +
geom_point(aes(shape = Origin)) +
facet_wrap(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
p + geom_smooth(method = lm, formula = y ~ x)
you can convince yourself that this is the same as the output of log.model by extending the x-axis limits to see where the geom_smooth line would cross the y axis (e.g., + coord_cartesian(xlim = c(0, 300)))
You can also see the difference in the graph if you don't pass color = Origin to the geom_smooth function (essentially what is happening if you comment this out from the first ggplot() initialization):
p <- ggplot(data = Cars93.log,
aes(x = Horsepower, y = log.price)) + # color = Origin)) +
geom_point(aes(shape = Origin)) +
#facet_wrap(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
p + geom_smooth(method = lm, formula = y ~ x)
I'm using ggplot() to build two scatter plots that plot health assessment score for (1) male and (2) female patients vs. # weeks in treatment, plus I'm using geom_line() to plot regression line overlays for both the males and females on each graph.
My question: How do I match the colors of the line overlays with the colors of the scatter plot points ('steelblue2' and 'pink3') while still retaining the legend?
I've found if I move color outside of the aes() argument in geom_line(), the colors of the lines and scatterplot points match, but then the legend disappears.
My code & a sample from my data:
mean_behav_by_numweeks = data.frame(
numweeks_round = c(1:10),
Mean_Behavior_Score_Male = c(3.32,4.18,3.82,4.06,3.33, 3.80,3.64,3.66,3.37,3.82),
nrow_male = c(396,323,293,259,226,217,202,190,170,167),
lm_results_predict_male = c(3.82,3.80,3.78,3.76,3.74, 3.72,3.70,3.68,3.66,3.64),
Mean_Behavior_Score_Female = c(2.91,3.79,3.65,3.41, 2.88,2.88,3.78,2.98,3.67,3.93),
nrow_female = c(109,82,72,74,66,60,58,56,52,50),
lm_results_predict_female=c(3.44,3.44,3.45,3.45, 3.46,3.47,3.47,3.48,3.48,3.49))
gg_plot1 <- ggplot(mean_behav_by_numweeks,
aes(numweeks_round,
Mean_Behavior_Score_Male,
size = mean_behav_by_numweeks$nrow_male)) +
geom_point(colour='steelblue2') +
ggtitle(paste("Scatter plot of mean behavior assessment score by member by # weeks \n since 1st assessment for",
as.character(var),
"among Male Medi-Cal plan members")) +
theme(plot.title = element_text(size=10.9, hjust = 0.5)) +
theme(axis.text = element_text(size=8)) +
scale_size_continuous(range = c(1, 7)) +
xlab("Number of weeks since 1st assessment") +
ylab("Mean behavior assessment score") +
theme(legend.position="bottom") +
labs(size="# members") +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_male, color='steelblue2'),
size=1) +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_female, color='pink3'),
size=1) +
scale_color_discrete(name = "GenderCode", labels = c("Female", "Male")) +
theme(legend.position="bottom") +
guides(color = guide_legend(order=1, direction="vertical"))
gg_plot1
gg_plot2 <- ggplot(mean_behav_by_numweeks,
aes(numweeks_round,
Mean_Behavior_Score_Female,
size = mean_behav_by_numweeks$nrow_female)) +
geom_point(colour='pink3') +
ggtitle(paste("Scatter plot of mean behavior assessment score by member by # weeks \n since 1st assessment for",
as.character(var),
"among Female Medi-Cal plan members")) +
theme(plot.title = element_text(size=10.9, hjust = 0.5)) +
theme(axis.text = element_text(size=8)) +
scale_size_continuous(range = c(1, 7)) +
xlab("Number of weeks since 1st assessment") +
ylab("Mean behavior assessment score") +
theme(legend.position="bottom") +
labs(size="# members") +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_male, color='steelblue2'),
size=1) +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_female, color='pink3'), size=1) +
scale_color_discrete(name = "GenderCode", labels = c("Female", "Male")) +
theme(legend.position="bottom") +
guides(color = guide_legend(order=1, direction="vertical"))
windows()
gg_plot2
You will want to reshape your data into long format, although you don't have to use melt or gather if you don't want to -- you can stack your data manually, like
library(dplyr)
library(ggplot2)
new_df <- bind_rows(
male = select(mean_behav_by_numweeks,
numweeks_round,
Mean_Behavior_Score = Mean_Behavior_Score_Male,
nrow = nrow_male,
lm_results_predict = lm_results_predict_male),
female = select(mean_behav_by_numweeks,
numweeks_round,
Mean_Behavior_Score = Mean_Behavior_Score_Female,
nrow = nrow_female,
lm_results_predict = lm_results_predict_female),
.id = "gender"
)
Then you can just do
ggplot(new_df, aes(numweeks_round, Mean_Behavior_Score, size = nrow, colour = gender)) +
geom_point() +
theme(plot.title = element_text(size=10.9, hjust = 0.5),
axis.text = element_text(size=8),
legend.position="bottom") +
scale_size_continuous(range = c(1, 7)) +
labs(x = "Number of weeks since 1st assessment",
y = "Mean behavior assessment score",
size="# members") +
geom_line(aes(y = lm_results_predict), size = 1) +
scale_color_manual(name = "GenderCode", labels = c("Female", "Male"), values = c("pink3", "steelblue2")) +
guides(color = guide_legend(order=1, direction="vertical")) +
facet_wrap("gender")
which gives you
One can use gather/separate to first convert data in long format and then plot.
# A simple capitalization function to convert first letter in Caps
# This function is used to convert male/female to Male/Female
.simpleCap <- function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
sep = "", collapse = " ")
}
library(tidyverse)
df <- mean_behav_by_numweeks %>%
gather(key, value, - numweeks_round) %>%
separate(key, c("key", "GenderCode"), sep = "_(?=[^_]*?$)") %>% #separates on last _
mutate(GenderCode = mapply(.simpleCap,GenderCode)) %>%
spread(key, value)
Plot the graph:
ggplot(df, aes(numweeks_round, Mean_Behavior_Score, size = nrow, color = GenderCode )) +
geom_point() +
geom_line(aes(y = lm_results_predict, color = GenderCode), size = 1) +
theme(plot.title = element_text(size=10.9, hjust = 0.5),
axis.text = element_text(size=8),
legend.position="bottom") +
labs(x = "Number of weeks since 1st assessment",
y = "Mean behavior assessment score",
size="# members") +
guides(color = guide_legend(order=1, direction="vertical"))
Data:
mean_behav_by_numweeks = data.frame(
numweeks_round = c(1:10),
Mean_Behavior_Score_Male = c(3.32,4.18,3.82,4.06,3.33, 3.80,3.64,3.66,3.37,3.82),
nrow_male = c(396,323,293,259,226,217,202,190,170,167),
lm_results_predict_male = c(3.82,3.80,3.78,3.76,3.74, 3.72,3.70,3.68,3.66,3.64),
Mean_Behavior_Score_Female = c(2.91,3.79,3.65,3.41, 2.88,2.88,3.78,2.98,3.67,3.93),
nrow_female = c(109,82,72,74,66,60,58,56,52,50),
lm_results_predict_female=c(3.44,3.44,3.45,3.45, 3.46,3.47,3.47,3.48,3.48,3.49))
In the following plot, time is on the x-axis but tick marks do not show for every year:
ggplot(mm, aes(x = time, y = value)) +
geom_point(aes(color = variable)) +
geom_line(stat = "smooth", method = "lm", alpha = 0.5) +
facet_grid(variable ~ ., scales = "free_y") +
theme(legend.position="none") +
coord_fixed(ratio = 10)
In order to fix this, I have converted the time variable to a factor, which works but then the linear regression disappears:
ggplot(mm, aes(x = factor(time), y = value)) +
geom_point(aes(color = variable)) +
geom_line(stat = "smooth", method = "lm", alpha = 0.5) +
facet_grid(variable ~ ., scales = "free_y") +
theme(legend.position = "none") +
coord_fixed(ratio = 10)
Is there a workaround for this with geom_line?
I think that scale_x_date is what you are looking for.
First, some reproducible data:
df <-
data.frame(
y = 99:117
, x = seq(as.Date("1999-01-01")
, as.Date("2017-01-01")
, "year")
)
Then, this is the way you can set to some "pretty" break points while still getting a tick at each year. If you want every year labelled, then use date_breaks = "1 year" instead of the breaks and date_minor_breaks arguments I have now
ggplot(df, aes(x = x, y = y) ) +
geom_smooth(method = "lm") +
geom_point() +
scale_x_date(breaks = pretty(df$x)
, date_minor_breaks = "1 year"
, date_labels = "%Y")
gives
Or, if your years are just numeric (and not dates), you can use scale_x_continuous for a similar effect:
df <-
data.frame(
y = 99:117
, x = 1999:2017
)
ggplot(df, aes(x = x, y = y) ) +
geom_smooth(method = "lm") +
geom_point() +
scale_x_continuous(breaks = pretty(df$x)
, minor_breaks = unique(df$x)) +
theme_gray()
Gives a plot that is indistinguishable from above.