match colours with specific variables - r

I have a colour data frame that I join with my input data to match the colours to categories. The issue is that when using fill=mycolors the legend displays the colour names and not the names of my categories.
I would like fill to be name_assigned while still matching the colours in mycolors.
df %>%
dplyr::left_join(colors.variable, by="name_assigned") %>%
ggplot(aes(reorder(chr,chr,function(x)-length(x)),y=name_assigned, fill=mycolors)) +
geom_bar(aes(y = (..count..))) +
scale_fill_identity()

You don't need the join: create a named colour vector from the colours data set and use it in scale_fill_manual.
Also, you seem to have swapped x and y coordinates.
library(ggplot2)

# Reproducible toy data: a binary `chr` column and a three-level category.
set.seed(2022)
df <- data.frame(
  chr = rbinom(1000, 1, 0.5),
  name_assigned = sample(c("a", "b", "c"), 1000, replace = TRUE)
)

# Lookup table pairing each category with the colour it should be drawn in.
colors.variable <- data.frame(
  name_assigned = c("a", "b", "c"),
  mycolors = c("pink", "purple", "seagreen")
)

# Turn the lookup table into a named vector (names = categories,
# values = colours) so the scale can match colours to categories by name.
mycolors <- setNames(colors.variable$mycolors, colors.variable$name_assigned)

# Fill by category; the legend then shows the category names, while the
# manual scale supplies the matching colours.
ggplot(data = df, mapping = aes(x = name_assigned, fill = name_assigned)) +
  geom_bar() +
  scale_fill_manual(values = mycolors)

Related

How to change the colour for missing values in geom_miss_point (with two different color scales)

I'm struggling to modify the colour/shape/... of the points based on whether a value is missing or not.
library(ggplot2)
library(naniar)
ggplot(data = airquality,
aes(x = Ozone,
y = Solar.R)) +
geom_miss_point()
What I have
airquality_no_na <-airquality[!(is.na(airquality$Ozone) | is.na(airquality$Solar.R)) ,]
airquality_na <-airquality[(is.na(airquality$Ozone) | is.na(airquality$Solar.R)),]
ggplot() +
geom_point(data = airquality_no_na,
aes(x = Ozone,
y = Solar.R, colour = "NoMissing")) +
geom_miss_point(data = airquality_na,
aes(x = Ozone,
y = Solar.R, colour = "Missing")) +
scale_colour_manual(name = 'Legende',
values =c('NoMissing'='green',
'Missing'='blue'))
What I would like to have
I don't know how to show the missing values in green and the non-missing values in blue without splitting the data into two data frames.
EDIT :
My issue was a bit more complex. I want to be able to choose the colours for the first data set (missing in blue, not missing in green) and for the second data set (missing in red, not missing in yellow).
#Create dataframes
df1=as.data.frame(matrix(data=runif(n=200, 0,1),ncol=2))
df2=as.data.frame(matrix(data=runif(n=100, 0,1),ncol=2))
#Add missing values
df1[rbinom(n=100,size=1,prob = 0.1) ==1,1] <- NA
df1[rbinom(n=100,size=1,prob = 0.1) ==1,2] <- NA
df2[rbinom(n=50,size=1,prob = 0.1) ==1,1] <- NA
df2[rbinom(n=50,size=1,prob = 0.1) ==1,2] <- NA
#This doesnt work. It only print in blue (missing) and green (not missing)
ggplot() +
geom_miss_point(data = df1,
aes(x = V1,
y = V2)) +
geom_miss_point(data = df2,
aes(x = V1,
y = V2)) +
scale_colour_manual(values = c("blue", "green", "yellow","red"))
I am not sure if this is a good idea, but for the sake of showing how to do this in theory: from what I understand from a quick look into the naniar package, the color aesthetic is mapped to ..missing.. by default. You would need to dig quite deep into the actual geom to change that behaviour, but there is a simple workaround for it.
Create a second color scale with ggnewscale.
You will not get around subsetting your data first, but this is not a bad thing. Don't fear to subset your data, that's a very normal thing to do.
library(tidyverse)
library(naniar)
library(ggnewscale)
# Give each data set its own colour scale: new_scale_color() closes the first
# colour scale so the second geom_miss_point() layer can be coloured
# independently of the first.
#
# The `values` vectors are unnamed, so they are matched in the order of the
# scale's limits. The two-colour output described in the question (blue =
# missing, green = not missing) shows that the missing level comes first.
# Hence: df1 -> missing blue, not missing green;
#        df2 -> missing red,  not missing yellow (as requested).
ggplot() +
  geom_miss_point(data = df1, aes(V1, V2)) +
  scale_colour_manual(name = "df1", values = c("blue", "green")) +
  new_scale_color() +
  geom_miss_point(data = df2, aes(V1, V2)) +
  scale_colour_manual(name = "df2", values = c("red", "yellow"))
With some trial and error I came up with a solution using the group aesthetic:
Row bind your datasets and add an identifier
Map the dataset identifier on group
Map the interaction of ..group.. and naniars ..missing.. on color. (I first tried by using dataset directly but that did not work. ): )
# Colour missing / non-missing points differently per data set using a single
# colour scale: stack both data sets, tag each row with its origin, and colour
# by the combination of origin and missingness.
library(ggplot2)
library(naniar)
# Fix the RNG seed so the random values and the NA positions are reproducible.
set.seed(42)
# Create two data frames of uniform random numbers on [0, 1]:
# df1 is 100 x 2 and df2 is 50 x 2 (columns V1 and V2).
df1=as.data.frame(matrix(data=runif(n=200, 0,1),ncol=2))
df2=as.data.frame(matrix(data=runif(n=100, 0,1),ncol=2))
# Add missing values: in each column, every row is set to NA with
# probability 0.1 (one Bernoulli draw per row).
df1[rbinom(n=100,size=1,prob = 0.1) ==1,1] <- NA
df1[rbinom(n=100,size=1,prob = 0.1) ==1,2] <- NA
df2[rbinom(n=50,size=1,prob = 0.1) ==1,1] <- NA
df2[rbinom(n=50,size=1,prob = 0.1) ==1,2] <- NA
# Row-bind the data sets; `.id = "dataset"` adds a column recording which
# input each row came from.
dplyr::bind_rows(df1, df2, .id = "dataset") %>%
ggplot() +
# `dataset` is mapped to `group`; the colour is then computed from the group
# index and naniar's missingness flag, both taken from the stat's output.
geom_miss_point(aes(x = V1,
y = V2,
group = dataset,
colour = interaction(..group.., ..missing..))) +
# NOTE(review): presumably the four colours apply, in order, to
# df1 missing, df2 missing, df1 not missing, df2 not missing
# (interaction() varies its first factor fastest) - confirm against the legend.
scale_colour_manual(values = c("blue", "red", "green", "yellow"))

R: connect points on a graph (ggplot2)

Suppose I have data in the following form:
library(ggplot2)
Data <- data.frame(
"ID" = c("ABC111", "ABC111", "ABC111", "ABC111", "ABC112", "ABC112", "ABC112", "ABC113", "ABC113", "ABC114", "ABC115"),
"color" = c("red", "red", "red", "red", "blue", "blue", "blue", "green", "green", "black", "yellow"),
"start_date" = c("2005/01/01", "2006/01/01", "2007/01/01", "2008/01/01", "2009/01/01", "2010/01/01", "2011/01/01", "2012/01/01", "2013/01/01", "2014/01/01", "2015/01/01"),
"end_date" = c("2005/09/01", "2006/06/01", "2007/04/01", "2008/05/07", "2009/06/01", "2010/10/01", "2011/12/12", "2013/05/01", "2013/06/08", "2015/01/01", "2016/08/09")
)
Data$ID = as.factor(Data$ID)
Data$color = as.factor(Data$color)
Now what I want to do is for each row, plot the start_date and the end_date ... and then connect them with a straight line. I believe this can be done with geom_line() in ggplot2.
I want something that looks like this:
I tried using the following code:
q <- qplot(start_date, end_date, data=Data)
q <- q + geom_line(aes(group = ID))
q
But the graph looks completely different than what I expected.
Can anyone please show me what I am doing wrong?
Thanks
Does the following work for you?
ggplot(data = Data, aes(start_date, end_date, color = ID))+
geom_line(aes(group = ID))+
geom_point()
or maybe geom_segment ?
# Draw one horizontal segment per row, running from its start date to its
# end date.
# The date strings are parsed with as.Date() so the x axis is a real time
# axis rather than a set of discrete labels.
Data$x <- as.Date(Data$start_date)
Data$xend <- as.Date(Data$end_date)
# y is just the row number: it has no meaning beyond giving every row its own
# line so that the segments do not overlap.
Data$y <- seq_len(nrow(Data))
# Both ends of a segment share the same y, so only x/xend carry the dates.
ggplot(data = Data, aes(x, y, colour = ID)) +
  geom_segment(aes(xend = xend, yend = y))
Here's a solution using the tidyverse package. I used the number of each row in the original data as the y-axis values in the plot. As these values are meaningless, I removed the y-axis title, labels and ticks from the plot.
library(tidyverse)
Data %>%
# Number each row in its order of appearance,
# save this numbers in a new column named order
rowid_to_column("order") %>%
# Change data from wide to long format
pivot_longer(cols = c(start_date, end_date),
names_to = "date_type",
values_to = "date") %>%
# Ggplot, use date as x, order as y, ID as col and order as group
ggplot(aes(x = date,
y = order,
col = ID,
group = order)) +
# Draw points
geom_point()+
# Draw lines
geom_line() +
# Maybe you want to remove the y axis title, text and ticks
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
# I added a vertical format to the x axis labels
# it might easier to read this way
axis.text.x = element_text(angle = 90, vjust = 0.5))

Force ggplot2 to use the same labels and colors on the legends for two plots

I'm trying to create several plots with ggplot2, and I'd like it to assign always the same color to the each factor type.
Here you have a toy example.
mydata <- data.frame(from=rep(c("b","c"), each=15),
to=rep(c("a","b","c"), each=10),
value=c(rep(1:5,5:1),rep(1:5,1:5)) )
I first convert the categories to factors in order to assign always the same value and because it worked on other plots I did.
I want to create two density plots (or similar) from the value grouped by categories in two ways, one using the categories of the "from" column, another one with the categories of the "to" column, using the same legend.
niv <- c("a", "b", "c")
colo <- c("black", "red", "blue")
mydata$from <- factor(mydata$from, levels=niv)
mydata$to <- factor(mydata$to, levels=niv)
And now I generate the plots.
First with grouping by the "from" column.
ggplot(mydata) + stat_density(geom="line",size=0.8,
position = "identity", aes(x=value, color=from)) +
scale_colour_manual(name="Type",labels = niv,
values=colo) + theme_bw()
As you can see it doesn't produce the desired plot because it shows an "a" factor but it doesn't exist on the "from" column.
And now grouping by the "to" column.
ggplot(mydata) + stat_density(geom="line",size=0.8,
position = "identity", aes(x=value, color=to)) +
scale_colour_manual(name="Type",labels = niv,
values=colo) + theme_bw()
It works as expected.
Now I try to produce again the first plot without the labels parameter.
ggplot(mydata) + stat_density(geom="line",size=0.8,
position = "identity", aes(x=value, color=from)) +
scale_colour_manual(name="Type", values=colo) +
theme_bw()
Now it labels the categories properly, but the colors don't match those of the second plot.
How can I do it?
The real problem has more categories and many values.
You could use a named vector in scale_color_manual to map Type explicitly to colors:
color_map <- c("a" = "black", "b" = "red", "c" = "blue")
scale_colour_manual(values=color_map)
From the help(scale_color_manual):
values
a set of aesthetic values to map data values to. If this is a
named vector, then the values will be matched based on the names. If
unnamed, values will be matched in order (usually alphabetical) with
the limits of the scale. Any data values that don't match will be
given na.value.
Here is the full code that, I believe, produces the output that you want:
library(tidyverse)
mydata <- data.frame(
from = rep(c("b", "c"), each = 15),
to = rep(c("a", "b", "c"), each = 10),
value = c(rep(1:5, 5:1), rep(1:5, 1:5))
)
niv <- c("a", "b", "c")
colo <- c("black", "red", "blue")
color_map <- set_names(colo, niv)
mydata$from <- factor(mydata$from, levels = niv)
mydata$to <- factor(mydata$to, levels = niv)
ggplot(mydata) + stat_density(
geom = "line", size = 0.8,
position = "identity", aes(x = value, color = from)
) +
scale_colour_manual(name = "Type", values = color_map) + theme_bw()
ggplot(mydata) + stat_density(
geom = "line", size = 0.8,
position = "identity", aes(x = value, color = to)
) +
scale_colour_manual(
name = "Type",
values = color_map
) + theme_bw()

Plot different parts of a vector with different colors on the same graph

As from the title suppose this vector and plot:
plot(rnorm(200,5,2),type="l")
This returns this plot
What I would like to know is whether there is a way to draw the first half of it in blue (col="blue") and the rest of it in red (col="red").
Similar question BUT in Matlab not R: Here
You could simply use lines for the second half:
# Plot a series of 200 values as a single line whose first half is blue and
# whose second half is red.
dat <- rnorm(200, 5, 2)
# Draw the first half, but size both axes for the full series so the second
# half fits into the same plot region.
plot(1:100, dat[1:100], col = "blue", type = "l",
     xlim = c(0, 200), ylim = range(dat))
# Start the red part at index 100 (the last blue point) so the segment between
# observations 100 and 101 is drawn and the line has no gap.
lines(100:200, dat[100:200], col = "red")
Not a base R solution, but I think this is how to plot it using ggplot2. It is necessary to prepare a data frame to plot the data.
set.seed(1234)
vec <- rnorm(200,5,2)
dat <- data.frame(Value = vec)
dat$Group <- as.character(rep(c(1, 2), each = 100))
dat$Index <- 1:200
library(ggplot2)
ggplot(dat, aes(x = Index, y = Value)) +
geom_line(aes(color = Group)) +
scale_color_manual(values = c("blue", "red")) +
theme_classic()
We can also use the lattice package with the same data frame.
library(lattice)
xyplot(Value ~ Index, data = dat, type = 'l', groups = Group, col = c("blue", "red"))
Notice that the blue line and red line are disconnected. Not sure if this is important, but if you want to plot a continuous line, here is a workaround in ggplot2. The idea is to subset the data frame for the second half, plot the entire data frame with color as blue, and then plot the second data frame with color as red.
dat2 <- dat[dat$Index %in% 101:200, ]
ggplot(dat, aes(x = Index, y = Value)) +
geom_line(color = "blue") +
geom_line(data = dat2, aes(x = Index, y = Value), color = "red") +
theme_classic()

how to customize names of legend labels in ggplot

I have the following code:
# Empty base plot with a fixed aspect ratio and no axis titles.
p <- ggplot() + coord_fixed() + xlab("") + ylab("")
# World map outline with one point per record in `busxy`, coloured by the
# colour stored for that record.
base_world <- p +
  geom_polygon(data = world_map, aes(x = long, y = lat, group = group),
               colour = "green", fill = "whitesmoke") +
  # The coordinates are read from the `coords` slot of `busxy`; slots are
  # accessed with `@` (a `#` here would start a comment and truncate the call).
  geom_point(data = as.data.frame(coordinates(busxy)), size = 1,
             mapping = aes(x = busxy@coords[,1], y = busxy@coords[,2],
                           color = busxy$color)) +
  labs(title = "Cities\n", color = "States\n") +
  # This is the call the question is about: `values` is unnamed, so colours
  # are matched to the scale's limits in order, independently of `labels`.
  scale_color_manual(labels = col2state$s, values = col2state$c)
It prints this:
The problem is that the colors on the map don't correspond to those in the legend.
When I delete scale_color_manual(labels = col2state$s, values = col2state$c) from the plot it's all right but 'States' have names of colors from the data.
My question is: How to leave labels names like in the image but also assign proper colors to those labels as in the map?
In col2state$c are 29 color names (like #29A934)
In col2state$s are 29 state labels like in legend.
Data frame busxy contains 144k records with 29 unique values of states.
Data is from:
library(maps)
world_map <- map_data("world")
busxy <- data.frame(x=bus[[1]]$latitude, y=bus[[1]]$longitude, city=bus[[1]]$city, state=bus[[1]]$state)
bus <- llply(as.list(jfile5), function(x) jsonlite::stream_in(file(x), pagesize = 10000))
and jfile5 is the path to the JSON file that contains all the data.
scale_color_manual can do without the labels parameter. A way to solve this is scale_color_manual(values = my_colors), where my_colors is the mapping of states to their colors, organized in a named character vector, e.g. c(AZ = "blue", NV = "red", ...)
An example:
df <- data.frame(x=1:3, y = 2:4, f = as.factor(1:3))
my_colors <- c('1'= "blue", '2' = "red", '3' = "yellow")
ggplot(df) + geom_point(aes(x = x, y = y, color = f)) + scale_color_manual(values = my_colors)
Instead of calling the columns, save vectors of the unique values for color and for state labels. Ensure the vectors are ordered to match.
# Colour codes and the state label that belongs to each of them.
vec_c <- as.character(unique(col2state$c))
vec_s <- unique(col2state$s) # must be in the same order as vec_c; re-order or build manually if not
# In the question the colour aesthetic holds the colour codes themselves, so
# every code is mapped to itself (`values` named by the code) and only the
# legend text is replaced by the state names: `values` takes the colours,
# `labels` takes the states, and `breaks` pins the order the two are paired in.
scale_color_manual(
  values = setNames(vec_c, vec_c),
  breaks = vec_c,
  labels = vec_s
)

Resources