r ggplot show cluster labels on the plot - r

I am new to R and I am trying to generate a series of figures for my clustering algorithm. Right now I am using the following code:
ggplot(df,aes(x=V1,y=V2)) +
geom_point(aes(colour = factor(cluster)),alpha=0.7) +
scale_colour_manual(values=c("purple", "green","orange","black")) +
ggtitle("Visualizing users and their K-Means Euclidean Clusters")
As you can see I have four clusters which are results of k-means. Now I want to show some text over my plot. For example in the following image:
I need the mean of each cluster (or any text like cluster labels) shown over it in this figure (for example 0.5 over the green area). I guess I should geom_text for this purpose but unfortunately I have no idea how. Any help is much appreciated.
Thanks

Try this
library(ggplot2)
cl <- kmeans(iris[, 1:2], 3, nstart = 25)
ggplot(transform(iris[, 1:2], cl = factor(cl$cluster)),
aes(x = Sepal.Length, y = Sepal.Width, colour = cl)) +
geom_point() +
scale_colour_manual(values=c("purple", "green","orange")) +
annotate("point", x = cl$centers[, 1], y = cl$centers[, 2], size = 5, colour = c("purple", "green","orange")) +
annotate("text", x = cl$centers[, 1], y = cl$centers[, 2], font = 2, size = 10,
label = apply(cl$centers, 1, function(x) paste(sprintf('%02.2f', x), collapse = ",") ),
colour = c("purple", "green","orange") )

library(dplyr);library(purrr);library(ggplot2)
data.plot %>%
group_by(Class) %>%
do(model = kmeans(.[c('x', 'y')], 1)) %>%
ungroup() %>% group_by(Class) %>% do(map_df(.$model, broom::tidy)) %>% ungroup() %>%
select(Class,x,y ) %>% data.frame() %>% dplyr::rename(x.center=x,y.center=y,Class=Class) ->label.data

Related

Adding a power curve to scatterplot

I want to add a power curve with confidence intervals to my diamter-weight relationship, which clearly follows a y=a*x^b regression. So far, I used the geom_smooth "loess" version, but this is not yet quite right and perfect. Any suggestion how to add a power regression line would be much appreciated. Below is the used code:
p2<-ggplot(Data,aes(x=Diameter,y=Wet_weight,colour=Site))+
geom_point(size=3.5,alpha=0.3)+
geom_smooth(aes(group=Species),method=loess,colour="black")+
labs(x="\nUmbrella diamter (mm)",y="Wet weight (mg)\n")+theme_classic()+
scale_colour_manual(values=c("black","dark blue","blue","dark green","green"))+
theme(axis.title.x=element_text(size=20),
axis.text.x=element_text(size=18,colour="black"),
axis.title.y=element_text(size=20),
axis.text.y=element_text(size=18,colour="black"),
axis.ticks=element_line(colour="black",size=1),
axis.line=element_line(colour="black",size=1,linetype="solid"),
legend.position=c(0.18,0.75),
legend.text=element_text(colour="black",size=17),
legend.title=element_text(colour="black",size=18))
p2
Thank you!
I used this to get many equations, R2, and plots.
df= #change your data frame so it fits the current code
variables=c("group","year") #if you have multiple groups/seasons/years/elements add them here
df$y= #which variable will be your y
df$x= #which variable will be your x
#No changes get the equations
text=df %>%
group_by(across(all_of(variables))) %>% #your grouping variables
do(broom::tidy(lm(log(y) ~ log(x), data = .))) %>%
ungroup() %>%
mutate(y = round(ifelse(term=='(Intercept)',exp(estimate),estimate),digits = 2)) %>% #your equation values rounded to 2
select(-estimate,-std.error,-statistic ,-p.value) %>%
pivot_wider(names_from = term,values_from = y) %>%
rename(.,a=`(Intercept)`,b=`log(x)`)
#CHANGE before running!! add your grouping variables
rsq=df %>%
split(list(.$group,.$year)) %>% #---- HERE add the names after $
map(~lm(log(y) ~ log(x), data = .)) %>%
map(summary) %>%
map_dbl("r.squared") %>%
data.frame()
#Join the R2 and y results for the plot in a single data frame and write the equations
labels.df=mutate(rsq,groups=row.names(rsq)) %>%
separate(col = groups,into = c(variables),sep = "[.]",
convert = TRUE, remove = T, fill = "right") %>%
rename("R"='.') %>%
left_join(text,.) %>%
mutate(R=round(R,digits = 4), #round your R2 digits
eq= paste('y==',a,"~x^(",b,")", sep = ""),
rsql=paste("R^2==",R),
full= paste('y==',a,"~x^(",b,")","~~R^2==",R, sep = ""))
# plot
ggplot(df,aes(x = x,y = y)) +
geom_point(size=4,mapping = aes(
colour=factor(ifelse(is.na(get(variables[2])),"",(get(variables[2])))), #points colour
shape=get(variables[1]))) + # different shapes
facet_wrap(get(variables[1])~ifelse(is.na(get(variables[2])),"",get(variables[2])),
scales = "free",labeller = labeller(.multi_line = F))+ #for multiple groups; join text in one line
stat_smooth(mapping=aes(colour=get(variables[1])), #colours for our trend
method = 'nls', formula = 'y~a*x^b',
method.args = list(start=c(a=1,b=1)),se=FALSE) +
geom_text(labels.df,x = Inf, y = Inf,size=5, mapping = aes(label = (eq)), parse = T,vjust=1, hjust=1)+
geom_text(labels.df,x = Inf,y = Inf,size=5, mapping = aes(label = (rsql)), parse = T,vjust=2.5, hjust=1)+
#scale_y_log10() + #add this to avoid problems with big y values
labs(x="Your x label",y="your y label")+
theme_bw(base_size = 16) +
theme(legend.position = "none",
strip.background = element_rect(fill="#b2d6e2"))

Simulate a two-dimensional random walk in a grid in R and plot with ggplot

I was looking for a simple code that could simulate a two-dimensional random walk in a grid (using R), and then plot the data using ggplot.
In particular, I was interested to a random walk from few position (5 points) in a 2D grid to the center of the square grid. It is just for visualisation purposes.
And my idea was then to plot the results with ggplot on a discrete grid (as the one simulated), may be using the function geom_tile.
Do you have any suggestion for a pre-existing code that I could easily manipulate?
Here is a small example with a for loop. From here, you can simply adjust how X_t and Y_t are defined:
Xt = 0; Yt = 0
for (i in 2:1000)
{
Xt[i] = Xt[i-1] + rnorm(1,0,1)
Yt[i] = Yt[i-1] + rnorm(1,0,1)
}
df <- data.frame(x = Xt, y = Yt)
ggplot(df, aes(x=x, y=y)) + geom_path() + theme_classic() + coord_fixed(1)
EDIT ----
After chatting with OP I've revised the code to include a step probability. This may result in the walk being stationary much more frequently. In higher dimensions, you will need to scale your prob factor lower in order to compensate for more options.
finally, my function does not account for an absolute distance, it only considers points on the grid that are within a certain step size in all dimensions. For example, hypothetically, at position c(0,0) you could go to c(1,1) with this function. But I guess this is relative to the grid's connectiveness.
If the OP wants to only consider nodes that are within 1 (by distance) of the current position, then use the following version of move_step()
move_step <- function(cur_pos, grid, prob = 0.04, size = 1){
opts <- grid %>%
rowwise() %>%
mutate(across(.fns = ~(.x-.env$cur_pos[[cur_column()]])^2,
.names = '{.col}_square_diff')) %>%
filter(sqrt(sum(c_across(ends_with("_square_diff"))))<=.env$size) %>%
select(-ends_with("_square_diff")) %>%
left_join(y = mutate(cur_pos, current = TRUE), by = names(grid))
new_pos <- opts %>%
mutate(weight = case_when(current ~ 1-(prob*(n()-1)), #calculate chance to move,
TRUE ~ prob), #in higher dimensions, we may have more places to move
weight = if_else(weight<0, 0, weight)) %>% #thus depending on prob, we may always move.
sample_n(size = 1, weight = weight) %>%
select(-weight, -current)
new_pos
}
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
library(gganimate)
move_step <- function(cur_pos, grid, prob = 0.04, size = 1){
opts <- grid %>%
filter(across(.fns = ~ between(.x, .env$cur_pos[[cur_column()]]-.env$size, .env$cur_pos[[cur_column()]]+.env$size))) %>%
left_join(y = mutate(cur_pos, current = TRUE), by = names(grid))
new_pos <- opts %>%
mutate(weight = case_when(current ~ 1-(prob*(n()-1)), #calculate chance to move,
TRUE ~ prob), #in higher dimensions, we may have more places to move
weight = if_else(weight<0, 0, weight)) %>% #thus depending on prob, we may always move.
sample_n(size = 1, weight = weight) %>%
select(-weight, -current)
new_pos
}
sim_walk <- function(cur_pos, grid, grid_prob = 0.04, steps = 50, size = 1){
iterations <- cur_pos
for(i in seq_len(steps)){
cur_pos <- move_step(cur_pos, grid, prob = grid_prob, size = size)
iterations <- bind_rows(iterations, cur_pos)
}
iterations$i <- 1:nrow(iterations)
iterations
}
origin <- data.frame(x = 0, y =0)
small_grid <- expand.grid(x = -1:1, y = -1:1)
small_walk <- sim_walk(cur_pos = origin,
grid = small_grid)
ggplot(small_walk, aes(x, y)) +
geom_path() +
geom_point(color = "red") +
transition_reveal(i) +
labs(title = "Step {frame_along}") +
coord_fixed()
large_grid <- expand.grid(x = -10:10, y = -10:10)
large_walk <- sim_walk(cur_pos = origin,
grid = large_grid,
steps = 100)
ggplot(large_walk, aes(x,y)) +
geom_path() +
geom_point(color = "red") +
transition_reveal(i) +
labs(title = "Step {frame_along}") +
xlim(c(-10,10)) + ylim(c(-10,10))+
coord_fixed()
large_walk %>%
count(x, y) %>%
right_join(y = expand.grid(x = -10:10, y = -10:10), by = c("x","y")) %>%
mutate(n = if_else(is.na(n), 0L, n)) %>%
ggplot(aes(x,y)) +
geom_tile(aes(fill = n)) +
coord_fixed()
multi_dim_walk <- sim_walk(cur_pos = data.frame(x = 0, y = 0, z = 0),
grid = expand.grid(x = -20:20, y = -20:20, z = -20:20),
steps = 100, size = 2)
library(cowplot)
plot_grid(
ggplot(multi_dim_walk, aes(x, y)) + geom_path(),
ggplot(multi_dim_walk, aes(x, z)) + geom_path(),
ggplot(multi_dim_walk, aes(y, z)) + geom_path())
Created on 2021-05-06 by the reprex package (v1.0.0)
Here is a base R option using Reduce + replicate + plot for 2D random walk process
set.seed(0)
plot(
setNames(
data.frame(replicate(
2,
Reduce(`+`, rnorm(99), init = 0, accumulate = TRUE)
)),
c("X", "Y")
),
type = "o"
)

Plotting multiple plots with two discrete variables - how to include all discrete variables in both axes

I have a dataset that looks like this:
test<-data.frame("M"=c("a","b","c","a","b","b","c","a","b","c"),
"N"=c(1,3,4,6,6,7,7,8,8,8),
"X"=c(0,1,0,1,1,0,1,0,1,1),
"Y"=c(1,1,0,0,1,0,1,1,1,0))
I'm making a simple plot where I want X and Y on the y axis, M on the x axis, each grid colored if the value of X or Y is 1 and empty if the value of X or Y is 0. I'm repeating this for each categories in N (the categories of N are 1 to 5, 6, 7, 8), then stacking all plots together. Right now, I'm doing this with the following code.
test <- test[order(test$N),]
test1 <- test[c(1:3),]
test2 <- test[c(4:5),]
test3 <- test[c(6:7),]
test4 <- test[c(8:10),] # I'm doing this to "separate" categories of `N` manually
p1 <- test1[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'red'))
p2 <- test2[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'yellow'))
p3 <- test3[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'green'))
p4 <- test4[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'blue'))
grid.arrange(p1, p2, p3, p4, ncol = 1)
I'm attaching an image of what I have right now. I want to fix these plots so that I would have the same factors of M for all four plots (right now, only p1 and p4 have all three factors (a, b and c) in the x axis but I want to add factor c to p2 and a to p3 so that all x axes are identical to each other. Can anyone give me suggestions on how to do this?
(Also, I'm suspecting that the current way I'm plotting things is probably not the most quickest/easiest way to go, if anyone has suggestions on how to improve things it'd be really helpful!)
To continue using grid.arrange(), instead of facet_wrap(), do the following:
Make M a factor:
test$M <- factor(test$M)
Add the following to each of your plots:
scale_x_discrete(limits = levels(test$M))
Maybe one approach I can suggest you is using facets after applying a smart trick to group your values and avoid splitting in different dataframes. Here the code as an option for you (The colors will be the same across the facets in base of TRUE/FALSE values):
library(tidyverse)
#Code
test %>% mutate(Var=lead(N)) %>%
mutate(Diff=Var-N,Diff=ifelse(row_number()==1,0,Diff)) %>%
mutate(Group=ifelse(Diff==0,N,NA)) %>%
fill(Group) %>% select(-c(N,Var,Diff)) %>%
group_by(Group) %>% mutate(NG=paste0('p',cur_group_id())) %>% ungroup() %>%
select(-Group) %>%
pivot_longer(cols = -c(NG,M)) %>%
ggplot(aes(factor(M), name, fill = value == 1,group=value))+
geom_tile(colour = 'black')+
facet_wrap(.~NG,ncol = 1)+
scale_fill_manual('value',values=c('tomato','cyan3'))+
xlab('M')
Output:
The othe option would be patchwork with a customized function:
library(tidyverse)
library(patchwork)
#Code
data <- test %>% mutate(Var=lead(N)) %>%
mutate(Diff=Var-N,Diff=ifelse(row_number()==1,0,Diff)) %>%
mutate(Group=ifelse(Diff==0,N,NA)) %>%
fill(Group) %>% select(-c(N,Var,Diff)) %>%
group_by(Group) %>% mutate(NG=paste0('p',cur_group_id())) %>% ungroup() %>%
select(-Group) %>%
mutate(M=factor(M,levels = unique(M),ordered = T)) %>%
pivot_longer(cols = -c(NG,M))
#List
List <- split(data,data$NG)
#Function
myfun <- function(x)
{
#Test for color
val <- unique(x$NG)
#Conditioning for color
if(val=='p1') {vcolor=c('FALSE' = 'white', 'TRUE' = 'red')} else
if(val=='p2') {vcolor=c('FALSE' = 'white', 'TRUE' = 'yellow')} else
if(val=='p3') {vcolor=c('FALSE' = 'white', 'TRUE' = 'green')} else
{vcolor=c('FALSE' = 'white', 'TRUE' = 'blue')}
#Update data
x <- x %>% mutate(M=factor(M,levels = c('a','b','c'),ordered = T)) %>% complete(M=M)
#Plot
G <- ggplot(x,aes(factor(M), name, fill = (value == 1 & !is.na(value))))+
geom_tile(colour = 'black')+
scale_fill_manual('value',values=vcolor)+
xlab('M')+
scale_y_discrete(limits=c('X','Y'))+
theme_bw()+
ggtitle(val)
return(G)
}
#Apply
Lplot <- lapply(List,myfun)
#Wrap
GF <- wrap_plots(Lplot,ncol = 1)
Output:
Something like this?
test<-data.frame("M"=c("a","b","c","a","b","b","c","a","b","c"),
"N"=c(1,3,4,6,6,7,7,8,8,8),
"X"=c(0,1,0,1,1,0,1,0,1,1),
"Y"=c(1,1,0,0,1,0,1,1,1,0))
library(tidyverse)
test = mutate(test, N2 = cut(N, breaks = c(0,5:100)))
m = pivot_longer(test, c(X, Y))
ggplot(m, aes(M, name,fill=factor(value))) +
geom_tile(colour = 'black') +
facet_wrap(~N2, scales = 'free') +
scale_fill_manual(values = c(`0` = 'white', `1` = 'red'))

Point colors and facets in ggplot2

Hadley Wickham's "ggplot2: Elegant Graphics for Data Analysis" book has this example graph in chapter 7, where mean points for all three colors of z appear on each facet:
Here's the code provided by the book to make the graph:
df <- data.frame(
x = rnorm(120, c(0, 2, 4)),
y = rnorm(120, c(1, 2, 1)),
z = letters[1:3]
)
df_sum <- df %>%
group_by(z) %>%
summarise(x = mean(x), y = mean(y)) %>%
rename(z2 = z)
ggplot(df, aes(x, y)) +
geom_point() +
geom_point(data = df_sum, aes(colour = z2), size = 4) +
facet_wrap(~z)
Ideally, keeping the color name as z as opposed to renaming it to z2 should produce the same result:
df_sum <- df %>%
group_by(z) %>%
summarise(x = mean(x), y = mean(y))
df_sum <- df %>%
group_by(z) %>%
summarise(x = mean(x), y = mean(y))
ggplot(df, aes(x, y)) +
geom_point() +
geom_point(data = df_sum, aes(colour = z), size = 4) +
facet_wrap(~z)
But doing so makes it so that only one colored point appears on each facet:
Is there a reason for this?
No, it should not produce the same result. As you use facet_wrap by variable z, you split the data for each facet panel, only plotting what belongs to a, b or c (your "incorrect" graph). By renaming z to z2 you can keep the facet_wrap by z for the black points but still plot each coloured point in each panel (as in df_sum there is no variable z that facet_wrap would use).

ggplot/GGally - Parallel Coordinates - y-axis labels

Does anyone know if there is a way to add variable labels to the ggparcoord function in GGally? I've tried numerous ways with geom_text, but nothing is yielding results.
To be more explicit, I am looking to pass the row.names(mtcars) through geom_text. The only way that I can distinguish the car is passing row.names(mtcars) through the groupColumn argument, but I don't like the way this looks.
Doesn't work:
mtcars$carName <- row.names(mtcars) # This becomes column 12
library(GGally)
# Attempt 1
ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1) +
geom_text(aes(label = carName))
# Attempt 2
ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1,
mapping = aes(label = carName))
Any ideas would be appreciated!
Solution 1: If you want to stick close to your original attempt, you can calculate the appropriate y coordinates for the car names, & add that as a separate data source. Use inherit.aes = FALSE so that this geom_text layer doesn't inherit anything from the ggplot object created using ggparcoord():
library(dplyr)
p1 <- ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1) +
geom_text(data = mtcars %>%
select(carName) %>%
mutate(x = 1,
y = scale(as.integer(factor(carName)))),
aes(x = x, y = y, label = carName),
hjust = 1.1,
inherit.aes = FALSE) +
# optional: remove "carName" from x-axis labels
scale_x_discrete(labels = function(x) c("", x[-1])) +
# also optional: hide legend, which doesn't really seem relevant here
theme(legend.position = "none")
p1
Solution 2: This alternative uses carName as the group column, & doesn't pass it as one of the parallel coordinate columns. (which I think this might be closer to the use cases intended by this function...) Specifying carName as the group column allows the car name values to be captured in the data slot of the ggplot object created by ggparcoord() this time, so our geom_text label can inherit it directly, & even filter only for rows corresponding to variable == "mpg" (or whatever the first of the parallel coordinate columns is named, in the actual use case). The y coordinates are not as evenly spread out as above, but geom_text_repel from the ggrepel package does a decent job at shifting overlapping text labels away from one another.
library(dplyr)
library(ggrepel)
p2 <- ggparcoord(mtcars,
columns = c(1, 6),
groupColumn = "carName") +
geom_text_repel(data = . %>%
filter(variable == "mpg"),
aes(x = variable, y = value, label = carName),
xlim = c(NA, 1)) + # limit repel region to the left of the 1st column
theme(legend.position = "none") # as before, hide legend since the labels
# are already in the plot
p2
Solution 3 / 4: You can actually plot the same with ggplot(), without relying on extensions that may do unexpected stuff behind the scenes:
library(dplyr)
library(tidyr)
library(ggrepel)
# similar output to solution 1
p3 <- mtcars %>%
select(carName, mpg, wt) %>%
mutate(carName.column = as.integer(factor(carName))) %>%
gather(variable, value, -carName) %>%
group_by(variable) %>%
mutate(value = scale(value)) %>%
ungroup() %>%
ggplot(aes(x = variable, y = value, label = carName, group = carName)) +
geom_line() +
geom_text(data = . %>% filter(variable == "carName.column"),
hjust = 1.1) +
scale_x_discrete(labels = function(x) c("", x[-1]))
p3
# similar output to solution 2
p4 <- mtcars %>%
select(carName, mpg, wt) %>%
gather(variable, value, -carName) %>%
group_by(variable) %>%
mutate(value = scale(value)) %>%
ungroup() %>%
ggplot(aes(x = variable, y = value, label = carName, group = carName)) +
geom_line() +
geom_text_repel(data = . %>% filter(variable == "mpg"),
xlim = c(NA, 1))
p4
Edit
You can add text labels on the right as well, for each of the above. Do note that the location for labels may not be nicely spaced out, since they are positioned according to wt's scaled values:
p1 +
geom_text(data = mtcars %>%
select(carName, wt) %>%
mutate(x = 3,
y = scale(wt)),
aes(x = x, y = y, label = carName),
hjust = -0.1,
inherit.aes = FALSE)
p2 +
geom_text_repel(data = . %>%
filter(variable == "wt"),
aes(x = variable, y = value, label = carName),
xlim = c(2, NA))
p3 +
geom_text(data = . %>% filter(variable == "wt"),
hjust = -0.1)
p4 +
geom_text_repel(data = . %>% filter(variable == "wt"),
xlim = c(2, NA))

Resources