Regression line lost after factor conversion - r

In the following plot, time is on the x-axis but tick marks do not show for every year:
ggplot(mm, aes(x = time, y = value)) +
geom_point(aes(color = variable)) +
geom_line(stat = "smooth", method = "lm", alpha = 0.5) +
facet_grid(variable ~ ., scales = "free_y") +
theme(legend.position="none") +
coord_fixed(ratio = 10)
In order to fix this, I have converted the time variable to a factor, which works but then the linear regression disappears:
ggplot(mm, aes(x = factor(time), y = value)) +
geom_point(aes(color = variable)) +
geom_line(stat = "smooth", method = "lm", alpha = 0.5) +
facet_grid(variable ~ ., scales = "free_y") +
theme(legend.position = "none") +
coord_fixed(ratio = 10)
Is there a workaround for this with geom_line?

I think that scale_x_date is what you are looking for.
First, some reproducible data:
df <-
data.frame(
y = 99:117
, x = seq(as.Date("1999-01-01")
, as.Date("2017-01-01")
, "year")
)
Then, this is the way you can set to some "pretty" break points while still getting a tick at each year. If you want every year labelled, then use date_breaks = "1 year" instead of the breaks and date_minor_breaks arguments I have now
ggplot(df, aes(x = x, y = y) ) +
geom_smooth(method = "lm") +
geom_point() +
scale_x_date(breaks = pretty(df$x)
, date_minor_breaks = "1 year"
, date_labels = "%Y")
gives
Or, if your years are just numeric (and not dates), you can use scale_x_continuous for a similar effect:
df <-
data.frame(
y = 99:117
, x = 1999:2017
)
ggplot(df, aes(x = x, y = y) ) +
geom_smooth(method = "lm") +
geom_point() +
scale_x_continuous(breaks = pretty(df$x)
, minor_breaks = unique(df$x)) +
theme_gray()
Gives a plot that is indistinguishable from above.

Related

Adding labels to scatter plot points in ggplot2

I am having trouble adding labels to points on a scatter plot using ggplot. Instead of adding the country name, it is adding the row number. What changes to geom_text do I need to make to fix this?
ggplot(data = World, aes(x = pop_age, y = peace_index_score, label = country)) + geom_point() + labs(title = "Youth Buldge and Instability", x = "Median Age in Country",y = "Overall Peacefulness of Country") + theme_economist() + ylim(0,4) + xlim(15,45) + geom_smooth(method = lm, color = "red") + geom_text(aes(label=country))
Is this working for you:
ggplot() +
geom_point(data = World, aes(x = pop_age, y = peace_index_score)) +
labs(title = "Youth Buldge and Instability", x = "Median Age in Country",y = "Overall Peacefulness of Country") +
theme_economist() +
ylim(0,4) +
xlim(15,45) +
geom_smooth(data = World, aes(x = pop_age, y = peace_index_score), method = lm, color = "red") +
geom_text(data = World, aes(x = pop_age, y = peace_index_score, label = country))

Y axis values different from actual column in dataset in R

I am currently working with a dataset of "world bank islands". In that, I am trying to plot the population Vs country graph for each year. Below is the code that I have done.
library(ggplot2)
options(scipen = 999)
bank <- read.csv("C:/Users/True Gamer/OneDrive/Desktop/world_bank_international_arrivals_islands.csv")
bank[bank == "" | bank == "."] <- NA
bank$country <- as.numeric(bank$country)
bank$year <- as.numeric(bank$year)
bank$areakm2 <- as.numeric(bank$areakm2)
bank$pop <- as.numeric(bank$pop)
bank$gdpnom <- as.numeric(bank$gdpnom)
bank$flights...WB <- as.numeric(bank$flights...WB)
bank$hotels <- as.numeric(bank$hotels)
bank$hotrooms <- as.numeric(bank$hotrooms)
bank$receipt <- as.numeric(bank$receipt)
bank$ovnarriv <- as.numeric(bank$ovnarriv)
bank$dayvisit <- as.numeric(bank$dayvisit)
bank$arram <- as.numeric(bank$arram)
bank$arreur <- as.numeric(bank$arreur)
bank$arraus <- as.numeric(bank$arraus)
str(bank)
plot1 <- ggplot(bank, aes(x=country,y=pop)) + geom_bar(stat = "identity",aes(fill=year)) + ggtitle("Population of each country yearwise") + xlab("Countries") + ylab("Population")
plot1
However, when I do this, the y values shown on the graph are different from the actual y values. This is the link to the dataset
The problem is that you are stacking the bars (this is default behaviour). Also, geom_bar(stat = "identity") is just a long way of writing geom_col. One further point to note is that since all your columns are numeric, the single line:
bank <- as.data.frame(lapply(bank, as.numeric))
replaces all your individual numeric conversions.
The plot you are trying to create would be something like this:
ggplot(bank, aes(x = country, y = pop)) +
geom_col(aes(fill = factor(year)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Countries") +
ylab("Population") +
labs(fill = "Year") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = 1:27)
However, it would probably be best to present your data in a different way. Perhaps, if you are comparing population growth, something like this would be better:
ggplot(bank, aes(x = year, y = pop)) +
geom_line(aes(color = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")
Or with bars:
ggplot(bank, aes(x = year, y = pop)) +
geom_col(aes(fill = factor(country)), position = "dodge") +
ggtitle("Population of each country yearwise") +
xlab("Year") +
ylab("Population") +
facet_wrap(.~country, scales = "free_y", nrow = 6) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = c(0, 5, 10)) +
theme_minimal() +
theme(legend.position = "none")

Customize formula in geom-smooth / ggplot2 / R

I want to customize the formula used in geom_smooth like this:
library(MASS)
library(ggplot2)
data("Cars93", package = "MASS")
str(Cars93)
Cars93.log <- transform(Cars93, log.price = log(Price))
log.model <- lm(log.price ~ Horsepower*Origin, data = Cars93.log)
summary(log.model)
plot(log.model)
p <- ggplot(data = Cars93.log, aes(x = Horsepower, y = log.price, colour = Origin)) +
geom_point(aes(shape = Origin, color = Origin)) + # Punkte
facet_grid(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
lm.mod <- function(df) {
y ~ x*Cars93.log$Origin
}
p_smooth <- by(Cars93.log, Cars93.log$Origin,
function(x) geom_smooth(data=x, method = lm, formula = lm.mod(x)))
p + p_smooth
However, I receive the error that the computation failed because of different lengths of my used variables.
length(Cars93.log$log.price)
length(Cars93.log$Origin)
length(Cars93.log$Horsepower)
But when I check the length for each variable they're all the same... Any ideas, what's wrong?
Thanks a lot, Martina
I agree with #Rui Barradas, seems like the issue is the lines for lm.mod and p_smooth and the by function
Once you are making a distinction by Origin (e.g., by doing either facet_wrap or color = Origin) then geom_smooth will automatically run different models for those facets.
p <- ggplot(data = Cars93.log,
aes(x = Horsepower, y = log.price, color = Origin)) +
geom_point(aes(shape = Origin)) +
facet_wrap(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
p + geom_smooth(method = lm, formula = y ~ x)
you can convince yourself that this is the same as the output of log.model by extending the x-axis limits to see where the geom_smooth line would cross the y axis (e.g., + coord_cartesian(xlim = c(0, 300)))
You can also see the difference in the graph if you don't pass color = Origin to the geom_smooth function (essentially what is happening if you comment this out from the first ggplot() initialization):
p <- ggplot(data = Cars93.log,
aes(x = Horsepower, y = log.price)) + # color = Origin)) +
geom_point(aes(shape = Origin)) +
#facet_wrap(~ Origin) +
theme(axis.title.x = element_text(margin=margin(15,0,0,0)),
axis.title.y = element_text(margin=margin(0,15,0,0))) +
scale_y_continuous(n.breaks = 7) +
scale_colour_manual(values = c("USA" = "red","non-USA" = "black")) +
scale_shape_manual(values = c(16,16)) +
ylab("Price(log)")
p + geom_smooth(method = lm, formula = y ~ x)

Changing facet labels in face_wrap() ggplot2

So the code below is working w/out errors, and I am trying to fix the following issue.
First, I am trying to change the group name for each graph to say, for instance, "< 1500 dollars" to refer to the group of workers earnings $1500 or less etc...
I tried this solution: to change the underlying factor level names but I keep getting this error:
"Error: unexpected ',' in ""< 1500 Dollars",""
outflows <- Wage_Outflows
levels(outflows$wage_group)
"< 1500", "1501 ~ 2999", "3000",
levels(outflows$wage_group) <- c("< 1500 Dollars", "1501 ~ 2999 Dollars", "3000 Dollars")
text.on.each.panel <-"Dollars"
p1 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="< 1500",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(4000000, 6500000, by = 400000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p2 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="1501 ~ 2999",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(800000, 1100000, by = 20000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
p3 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="3000",], aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(50000, 120000, by = 5000)) +
facet_wrap(~ wage_group) + theme(axis.title.x = element_blank())
grid.arrange(p1, p2,p3, ncol=1)
For your first question have a look at the labeller argument in the facet_wrap function.
And for your second question the labs function might be the solution.
p1 = ggplot(Wage_Outflows[Wage_Outflows$wage_group=="< 1500",],
aes(x = year, y = labor)) +
geom_point() +
scale_y_continuous(breaks=seq(4000000, 6500000, by = 400000)) +
labs(y = "Number of workers") +
facet_wrap(~ wage_group, labeller = labeller(wage_group = c(`< 1500` = "< 1500
dollars"))) +
theme(axis.title.x = element_blank())
Maybe you can shorten your code like this:
# Example dataset:
df <- data.frame(wage_group = rep(c("A","B","C"), each = 10),
year = 2001:2010,
labor = seq(5000,34000, 1000))
ggplot(df , aes(x = factor(year), y = labor)) +
geom_point() +
labs(y = "# of workers") +
facet_wrap(~wage_group, ncol = 1, scales = "free",
labeller = labeller(wage_group = c(`A` = "less than 1500 dollars",
`B` = "1500-2999 dollars", `C` = "more than 3000 dollars"))) +
theme(axis.title.x = element_blank())

ggplot error bar legend

I am having difficulties adding a legend to my error bar plot. I tried several command that I've seen in other subject, but unfortunately it doesn't work (I am sure I'm missing something but I can't figure out what)
library(ggplot2)
errors=matrix(c(-3.800904,-3.803444,-3.805985,-3.731204,-3.743969,
-3.756735,-3.742510,-3.764961,-3.787413,-3.731204,-3.743969,-3.756735,
-3.711420,-3.721589,-3.731758,-3.731204,-3.743969,-3.756735,-3.636346,
-3.675159,-3.713971,-3.731204,-3.743969,-3.756735),nrow=4,byrow=TRUE)
modelName=c("model 1","model 2","model 3","model 0")
boxdata=data.frame(errors,modelName)
colnames(boxdata)=c("icp","pred","icm","icp_obs","obs","icm_obs","model")
qplot(boxdata$model,boxdata$pred,
main = paste("confidance level 95% for age ", age_bp + start_age - 1,sep="")) +
geom_errorbar(aes(x=boxdata$model, ymin=boxdata$icm, ymax=boxdata$icp), width=0.20,col='deepskyblue') +
geom_point(aes(x=boxdata$model,y=boxdata$obs),shape=4,col="orange") +
geom_errorbar(aes(x=boxdata$model, ymin=boxdata$icm_obs, ymax=boxdata$icp_obs), width=0.20,col='red') +
scale_shape_manual(name="legend", values=c(19,4)) +
scale_color_manual(name="legend", values = c("black","orange")) +
xlab("models") +
ylab("confidence level")
The problem is that you are using wide form data rather than long form data. You need to convert the data from wide to long before plotting if you want to get a legend.
library(ggplot2)
errors=matrix(c(-3.800904,-3.803444,-3.805985,-3.731204,-3.743969,
-3.756735,-3.742510,-3.764961,-3.787413,-3.731204,-3.743969,-3.756735,
-3.711420,-3.721589,-3.731758,-3.731204,-3.743969,-3.756735,-3.636346,
-3.675159,-3.713971,-3.731204,-3.743969,-3.756735),nrow=4,byrow=TRUE)
errors = rbind(errors[, 1:3], errors[,4:6]) # manually reshaping the data
modelName=c("model 1","model 2","model 3","model 0")
type = rep(c("model", "obs"), each = 4)
boxdata=data.frame(errors,modelName, type)
colnames(boxdata)=c("icp","pred","icm","model", "type")
ggplot(boxdata, aes(x = model, y = pred, ymax = icp, ymin = icm,
group = type, colour = type, shape = type)) +
geom_errorbar(width=0.20) +
geom_point() +
scale_shape_manual(values=c(19, 4)) +
scale_color_manual(values = c("black","orange")) +
xlab("models") +
ylab("confidence level")
The output looks closer to your output can be generated by:
ggplot(boxdata, aes(x = model, y = pred, ymax = icp, ymin = icm,
group = type, colour = type, shape = type)) +
geom_errorbar(width=0.20) +
geom_point(colour = rep(c("black","orange"), each = 4)) +
scale_shape_manual(values=c(19, 4)) +
scale_color_manual(values = c("deepskyblue", "red")) +
xlab("models") +
ylab("confidence level")

Resources