I have some data for which I would like to circle some different subsets. I am using ggplot2 and ggforce to plot the data and draw an ellipse (geom_mark_ellipse) around the data.
I have an issue in that the positions of the connectors on the ellipses (for my data) are in ambiguous positions (at the conjunction of two ellipses, on the border of two ellipses that graze each other).
How can I manually set the position of the connector to the ellipse? Or at least influence them into a particular region?
I have some code below which captures the spirit in which I'm plotting my data. For the purpose of the example, how could I make all of the labels appear in the top left of the plot, or all join the ellipses at x == 0, -2, -4 for each of the factors?
library(tidyverse)
library(ggforce)
x <- c(-1,0,1,-3,-2,2,3,-5,-4,4,5)
t <- c(1,1,1,2,2,2,2,3,3,3,3)
tmp <- as_tibble_col(x, column_name = "x")
tmp <- tmp %>% mutate(t = t)
#How do I move the position of the label connectors on the ellipses?
tmp %>%
ggplot(aes(x=x, y=x)) +
geom_mark_ellipse(aes(label = t, group=t),con.cap = 0) +
geom_point()
Created on 2020-05-05 by the reprex package (v0.3.0)
I've managed to do it for my contrived example, yet to try on my real data, but there is hope.
As shown in the code below, I created data to fill the area (top left) that I didn't want to have labels in, and gave it a factor of "". I manually set the colour of the connectors to NA for that factor, and got rid of the label background for everything. Because the factor is "", the label is an empty string, and nothing shows up. I also set scale_colour_manual to give the colour NA to the ellipse I didn't want to see.
I also filtered the geom_point to not show the data with a factor of "". Finally, I deleted the legend.
library(tidyverse)
library(ggforce)
x <- c(-1,0,1,-3,-2,2,3,-5,-4,4,5)
t <- c(1,1,1,2,2,2,2,3,3,3,3)
tmp <- as_tibble_col(x, column_name = "x")
tmp <- tmp %>% mutate(y=x)
tmp <- tmp %>% mutate(t = t)
#now lets add some dodging data
tmp <- tmp %>% mutate(t = as.character(t))
tmp <- tmp %>% add_row(x=c(-5,2.5,-2.5), y=c(-2.5,5,2.5),t="")
tmp %>%
ggplot(aes(x=x, y=y)) +
geom_mark_ellipse(aes(label = t, group=t, colour=factor(t)),
con.cap = 0, con.colour = c(NA, "black","black","black"),
label.fill=NA) +
scale_colour_manual(values=c(NA, "black", "black", "black")) +
geom_point(data = subset(tmp, t != "")) +
theme(legend.position = "none")
Created on 2020-05-06 by the reprex package (v0.3.0)
Related
I am working on 3-way interaction effect plotting using my own data. But my code creates too many (continuous) shapes along the lines.
How can I leave the points only at the ends of the lines instead of the figure attached above?
I will deeply appreciate if anybody helps.
g1=ggplot(mygrid,aes(x=control,y=pred,color=factor(nknowledge),
lty=factor(nknowledge),shape=factor(nknowledge)))+
geom_line(size=1.5)+
geom_point(size=2.5)+
labs(x="control", y="attitudes",lty = "inc level")+
scale_linetype_manual("know level",breaks=1:3,values=c("longdash", "dotted","solid"),label=c("M-SD","M","M+SD"))+
scale_color_manual("know level",breaks=1:3,values=c("red", "blue","grey"),label=c("M-SD","M","M+SD"))+
scale_shape_manual("know level",breaks=1:3,values=c(6,5,4),label=c("M-SD","M","M+SD"))+
theme_classic()
This could be achieved by making use of a second dataset which filters the data for the endpoints by group using e.g. a group_by and range and passing the filtered dataset as data to geom_point:
Using some random example data try this:
set.seed(42)
mygrid <- data.frame(
control = runif(30, 1, 7),
pred = runif(30, 1, 3),
nknowledge = sample(1:3, 30, replace = TRUE)
)
library(ggplot2)
library(dplyr)
mygrid_pt <- mygrid %>%
group_by(nknowledge) %>%
filter(control %in% range(control))
ggplot(mygrid,aes(x=control,y=pred,color=factor(nknowledge),
lty=factor(nknowledge),shape=factor(nknowledge)))+
geom_line(size=1.5)+
geom_point(data = mygrid_pt, size=2.5)+
labs(x="control", y="attitudes",lty = "inc level")+
scale_linetype_manual("know level",breaks=1:3,values=c("longdash", "dotted","solid"),label=c("M-SD","M","M+SD"))+
scale_color_manual("know level",breaks=1:3,values=c("red", "blue","grey"),label=c("M-SD","M","M+SD"))+
scale_shape_manual("know level",breaks=1:3,values=c(6,5,4),label=c("M-SD","M","M+SD"))+
theme_classic()
If you use geom_point, then you'll get points for all rows in your data frame. If you want specific points and shapes plotted at the ends of your lines, you'll want to create a filtered data frame for the only points you want to have plotted.
library(ggplot2); library(dplyr)
g1 <- ggplot()+
geom_line(data = mtcars,
mapping = aes(x=hp,y=mpg,color=factor(cyl),lty=factor(cyl)),
size=1.5)+
geom_point(data = mtcars %>% group_by(cyl) %>% filter(hp == max(hp) | hp == min(hp)),
mapping = aes(x=hp,y=mpg,color=factor(cyl),shape=factor(cyl)),
size=2.5)
g1
Created on 2021-01-28 by the reprex package (v0.3.0)
This is my R-script, I've been trying to include a legend onto the line plot but it isn't working? Any guidance? I also can't seem to get the geom_point() working either (I've taken the code for it out below).
library(ggsignif)
library(readxl)
library(svglite)
library(tidyverse)
library(ggplot2)
library(tidyr)
library(dplyr)
url <-'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2850-3/MediaObjects/41586_2020_2850_MOESM10_ESM.xlsx'
temp <-tempfile()
download.file(url, temp, mode='wb')
myData <- read_excel(path=temp, sheet = "ExFig.5f")
names(myData) <- NULL
view(myData)
Time_post_inj <- (myData[1])
Time_post_inj <- Time_post_inj[-c(1),]
dose_450_ug <- (myData[2])
dose_450_ug <- dose_450_ug[-c(1),]
dose_150_ug <- (myData[4])
dose_150_ug <- dose_150_ug[-c(1),]
dose_100_ug <- (myData[6])
dose_100_ug <- dose_100_ug[-c(1),]
dose_50_ug <- (myData[8])
dose_50_ug <- dose_50_ug[-c(1),]
colnames(Time_post_inj) <-c("Time_Post_Injection")
colnames(dose_450_ug) <-c("dose_450_µg")
colnames(dose_150_ug) <-c("dose_150_µg")
colnames(dose_100_ug) <-c("dose_100_µg")
colnames(dose_50_ug) <-c("dose_50_µg")
Newdata <-data.frame(Time_post_inj, dose_450_ug, dose_150_ug, dose_100_ug, dose_50_ug)
Newdata$Time_Post_Injection <-as.numeric(Newdata$Time_Post_Injection)
Newdata$dose_450_µg <-as.numeric(Newdata$dose_450_µg)
Newdata$dose_150_µg <-as.numeric(Newdata$dose_150_µg)
Newdata$dose_100_µg <-as.numeric(Newdata$dose_100_µg)
Newdata$dose_50_µg <-as.numeric(Newdata$dose_50_µg)
str(Newdata)
ggplot(data=Newdata, aes(x=Time_Post_Injection, y=hCD4_occupancy, group = 1)) + geom_line(aes(y=dose_450_µg)) + geom_line(aes(y=dose_150_µg)) + geom_line(aes(y=dose_100_µg)) + geom_line(aes(y=dose_50_µg))
Newdata
tidyr::pivot_longer(Time_Post_Injection, names_to = "DOSE", values_to = "VALUE") %>%
ggplot2::ggplot(aes(Time_Post_Injection, VALUE, group = DOSE, color = DOSE)) + ggplot2::geom_line()
The following is a full reprex, meaning that if you copy and paste, it will reproduce the plot exactly as below. You can see I have simplified your parsing considerably too; this starts with the url and produces the plot with a lot less data wrangling:
library(ggplot2) # Only load packages you really need
# This format is a handy way of keeping a long string on a single page
url <- paste0("https://static-content.springer.com/esm/art%3A10.",
"1038%2Fs41586-020-2850-3/MediaObjects/41586_2020",
"_2850_MOESM10_ESM.xlsx")
temp <- tempfile()
download.file(url, temp, mode = 'wb')
# Instead of loading an entire library to use one function, we can
# access read_excel by doing readxl::read_excel
myData <- readxl::read_excel(temp, sheet = "ExFig.5f")
# This single line subsets the data frame to chop out the first row
# and the empty columns. It also converts all columns to numeric
NewData <- as.data.frame(lapply(myData[-1, -c(3, 5, 7)], as.numeric))
names(NewData) <-c("Time_Post_Injection", "dose_450_ug",
"dose_150_ug", "dose_100_ug", "dose_50_ug")
# This switches your data to long format, which helps ggplot to work
# We put all the values in one column and have the dosages as labels
# in another column instead of having multiple columns. This allows us
# to map Color to the dosages.
NewData <- cbind(NewData[1], stack(NewData[-1]))
# Now we just tell ggplot to map colours to ind
ggplot(NewData, aes(x = Time_Post_Injection, y = values, color = ind)) +
geom_line() +
geom_point() +
scale_color_discrete(name = "Dose") +
labs(x = "Time Pist Injection") +
theme_bw()
Created on 2020-11-11 by the reprex package (v0.3.0)
Hi the main problem is that you did not get your data into a easy to handle format
library(dplyr)
library(tidyr)
library(ggplot2)
Newdata %>%
# get data in easy to handle format
tidyr::pivot_longer(-Time_Post_Injection, names_to = "DOSE", values_to = "VALUE") %>%
# plot and use the new DOSE column as group and color so you do not need one geom per line! (you can change geom_line() to geom_point also())
ggplot2::ggplot(aes(Time_Post_Injection, VALUE, group = DOSE, color = DOSE)) +
ggplot2::geom_line()
I have a set of points on a map, each with a given parameter value. I would like to:
Cluster them spatially and ignore any clusters having fewer than
10 points. My df should have a column (Clust) for the cluster each point belongs to [DONE]
Sub-cluster the parameter values within each cluster; add a column to my df (subClust) used to categorize each point by sub-cluster.
I don't know how to do the second part, except maybe with loops.
The image shows the set of spatially distributed points (top left) colour coded by cluster and sorted by parameter value in the top right plot. The bottom row shows clusters with >10 points (left) and facets for each cluster sorted by parameter value (right). It's these facets that I'd like to be able to colour code by sub-cluster according to a minimum cluster separation distance (d=1)
Any pointers/help appreciated. My reproducible code is below.
# TESTING
library(tidyverse)
library(gridExtra)
# Create a random (X, Y, Value) dataset
set.seed(36)
x_ex <- round(rnorm(200,50,20))
y_ex <- round(runif(200,0,85))
values <- rexp(200, 0.2)
df_ex <- data.frame(ID=1:length(y_ex),x=x_ex,y=y_ex,Test_Param=values)
# Cluster data by (X,Y) location
d = 4
chc <- hclust(dist(df_ex[,2:3]), method="single")
# Distance with a d threshold - used d=40 at one time but that changes...
chc.d40 <- cutree(chc, h=d)
# max(chc.d40)
# Join results
xy_df <- data.frame(df_ex, Clust=chc.d40)
# Plot results
breaks = max(chc.d40)
xy_df_filt <- xy_df %>% dplyr::group_by(Clust) %>% dplyr::mutate(n=n()) %>% dplyr::filter(n>10)# %>% nrow
p1 <- ggplot() +
geom_point(data=xy_df, aes(x=x, y=y, colour = Clust)) +
scale_color_gradientn(colours = rainbow(breaks)) +
xlim(0,100) + ylim(0,100)
p2 <- xy_df %>% dplyr::arrange(Test_Param) %>%
ggplot() +
geom_point(aes(x=1:length(Test_Param),y=Test_Param, colour = Test_Param)) +
scale_colour_gradient(low="red", high="green")
p3 <- ggplot() +
geom_point(data=xy_df_filt, aes(x=x, y=y, colour = Clust)) +
scale_color_gradientn(colours = rainbow(breaks)) +
xlim(0,100) + ylim(0,100)
p4 <- xy_df_filt %>% dplyr::arrange(Test_Param) %>%
ggplot() +
geom_point(aes(x=1:length(Test_Param),y=Test_Param, colour = Test_Param)) +
scale_colour_gradient(low="red", high="green") +
facet_wrap(~Clust, scales="free")
grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)
THIS SNIPPET DOES NOT WORK - can't pipe within dplyr mutate() ...
# Second Hierarchical Clustering: Try to sub-cluster by Test_Param within the individual clusters I've already defined above
xy_df_filt %>% # This part does not work
dplyr::group_by(Clust) %>%
dplyr::mutate(subClust = hclust(dist(.$Test_Param), method="single") %>%
cutree(, h=1))
Below is a way around it using a loop - but I'd really rather learn how to do this using dplyr or some other non-loop method. An updated image showing the sub-clustered facets follows.
sub_df <- data.frame()
for (i in unique(xy_df_filt$Clust)) {
temp_df <- xy_df_filt %>% dplyr::filter(Clust == i)
# Cluster data by (X,Y) location
a_d = 1
a_chc <- hclust(dist(temp_df$Test_Param), method="single")
# Distance with a d threshold - used d=40 at one time but that changes...
a_chc.d40 <- cutree(a_chc, h=a_d)
# max(chc.d40)
# Join results to main df
sub_df <- bind_rows(sub_df, data.frame(temp_df, subClust=a_chc.d40)) %>% dplyr::select(ID, subClust)
}
xy_df_filt_2 <- left_join(xy_df_filt,sub_df, by=c("ID"="ID"))
p4 <- xy_df_filt_2 %>% dplyr::arrange(Test_Param) %>%
ggplot() +
geom_point(aes(x=1:length(Test_Param),y=Test_Param, colour = subClust)) +
scale_colour_gradient(low="red", high="green") +
facet_wrap(~Clust, scales="free")
grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)
There should be a way to do it using a combination of do and tidy, but I always have a hard time getting things to line up the way I want using do. Instead, what I usually do is combine split from base R and map_dfr from purrr. split will split the dataframe by Clust and give you a list of dataframes that you can then map over. map_dfr maps over each of those dataframes and returns a single dataframe.
I started from your xy_df_filt and generated what I believe should be the same as the xy_df_filt_2 that you got from the for loop. I made two plots, although the two sets of clusters are a little hard to see.
xy_df_filt_2 <- xy_df_filt %>%
split(.$Clust) %>%
map_dfr(function(df) {
subClust <- hclust(dist(df$Test_Param), method = "single") %>% cutree(., h = 1)
bind_cols(df, subClust = subClust)
})
ggplot(xy_df_filt_2, aes(x = x, y = y, color = as.factor(subClust), shape = as.factor(Clust))) +
geom_point() +
scale_color_brewer(palette = "Set2")
Clearer with faceting
ggplot(xy_df_filt_2, aes(x = x, y = y, color = as.factor(subClust), shape = as.factor(Clust))) +
geom_point() +
scale_color_brewer(palette = "Set2") +
facet_wrap(~ Clust)
Created on 2018-04-14 by the reprex package (v0.2.0).
You could do this for your subclusters...
xy_df_filt_2 <- xy_df_filt %>%
group_by(Clust) %>%
mutate(subClust = tibble(Test_Param) %>%
dist() %>%
hclust(method="single") %>%
cutree(h=1))
Nested pipes are fine. I think the problem with your version was that you were not passing the right sort of object to dist.
The tibble term is not needed if you are only passing a single column to dist, but I have left it in in case you want to use several columns as you do for the main clustering.
You could use the same sort of formula, but without the group_by, to calculate xy_df from df_ex.
I would like to label points in a scatterplot, but only those within the facet_zoom panel. Here is an example:
library(ggplot2)
library(ggforce)
library(ggrepel)
library(magrittr)
labels <- letters
example_values_x <- rnorm(26)
example_values_y <- rnorm(26)
df <- data.frame(labels,
example_values_x,
example_values_y)
df %>% ggplot(aes(y = example_values_y,
x = example_values_x)) +
geom_point() +
facet_zoom(x = example_values_x > 0.5) +
geom_label_repel(data = filter(df, example_values_x > 0.5), aes(label = labels))
Any idea how to make it so the labels don't also appear on the non-zoomed panel?
NOTE: The following answer works with the GitHub version of ggforce. As of writing this, the version that's on CRAN appears to have a different interface for facet_zoom(), even though the package version is the same.
First, take your subset of data being labeled and add a zoom column, specifying whether the data should be rendered in the zoomed panel (TRUE), the original panel (FALSE), or both (NA):
dftxt <- dplyr::filter(df, example_values_x > 0.5) %>%
dplyr::mutate( zoom = TRUE ) ## All entries to appear in the zoom panel only
You can now pass this new data frame to geom_label_repel, while telling facet_zoom() to use the zoom column to determine where the data should be drawn:
df %>% ggplot(aes(y = example_values_y,
x = example_values_x)) +
geom_point() +
facet_zoom(x = example_values_x > 0.5, zoom.data=zoom) + # Note the zoom.data argument
geom_label_repel(data = dftxt, aes(label = labels))
Note that because the original df doesn't have a zoom column, facet_zoom() will treat it as NA and draw geom_point() in both panels, as desired:
Is there a way to jitter the lines in geom_line()? I know it kinda defies the purpose of this plot, but if you have a plot with few lines and would like them all to show it could be handy. Maybe some other solution to this visibility problem.
Please see below for code,
A <- c(1,2,3,5,1)
B <- c(3,4,1,2,3)
id <- 1:5
df <- data.frame(id, A, B)
# install.packages(reshape2)
require(reshape2) # for melt
dfm <- melt(df, id=c("id"))
# install.packages(ggplot2)
require(ggplot2)
p1 <- ggplot(data = dfm, aes(x = variable, y = value, group = id,
color= as.factor(id))) + geom_line() + labs(x = "id # 1 is hardly
visible as it is covered by id # 5") + scale_colour_manual(values =
c('red','blue', 'green', 'yellow', 'black'))
p2 <- ggplot(subset(dfm, id != 5), aes(x = variable, y = value,
group = id, color= as.factor(id))) + geom_line() + labs(x = "id #
5 removed, id # 1 is visible") + scale_colour_manual(values =
c('red','blue', 'green', 'yellow', 'black'))
# install.packages(RODBC)
require(gridExtra)
grid.arrange(p1, p2)
You can try
geom_line(position=position_jitter(w=0.02, h=0))
and see if that works well.
If you just want to prevent two lines from overlapping exactly, there is now a better way: position_dodge(), which "adjusts position by dodging overlaps to the side". This is nicer than adding jitter to any line, even when it's not needed.
Avoid ggplot2 lines overlapping exactly using position_dodge()
Code example:
df<-data.frame(x=1:10,y=1:10,z=1:10);
df.m <- melt(df, id.vars = "x");
ggplot(df.m, aes(x=x,y=value,group=variable,colour=variable))
+ geom_line(position=position_dodge(width=0.2));
Thanks to position_dodge(), we can now see that there are two lines in the plot, which just happen to co-incide exactly:
I tend to use different linestyles, so that, say, a solid blue line "peeks through" a dashed red line on top of it.
Then again, it does depend on what you want to impart to the reader. Keep in mind first and foremost that data should be points and theory lines unless this makes things cluttered. Unless the y and x values are identical, it'll be easier to see the points. (or you could apply the existing jitter function to the x-values)
Next, if you just want to show which runs are in the "bundle" and which are outliers, overlap doesn't matter because it's very unlikely that two outliers will be near-equal.
If you want to show a bunch of near-equal runs, you may prefer (which is to say, your readers will understand better) to plot the deltas against a mean rather than the actual values.
I would like to suggest a solution to a different problem than described, in which the Y axis is a factor, so position_dodge does nothing.
code:
library(tidyverse)
time_raw <- tibble(year=1900:1909,
person_A=c(rep("Rome",2),rep("Jerusalem",8)),
person_B=c(rep("Jerusalem",5),rep("Rome",5)))
achievements <- tribble(~year,~who,~what,
1900,"person_A","born",
1900,"person_B","born",
1909,"person_A","died",
1909,"person_B","died",
1905,"person_A","super star",
1905,"person_B","super star")
SCALE=0.5
jitter_locations <- time_raw %>%
pivot_longer(-year,names_to="who",values_to="place") %>%
distinct(place)%>%
filter(!is.na(place)) %>%
mutate(y_place=seq_along(place))
jitter_lines <- time_raw %>%
pivot_longer(-year,names_to="who",values_to="place") %>%
distinct(who) %>%
mutate(y_jitter=scale(seq_along(who))*0.015)
data_for_plot <- time_raw %>%
pivot_longer(-year,names_to="who",values_to="place") %>%
filter(!is.na(place)) %>%
left_join(achievements) %>%
left_join(jitter_locations) %>%
left_join(jitter_lines)
data_for_plot %>%
ggplot(aes(x=year,y=y_place+y_jitter,color=who,group=who))+
geom_line(size=2)+
geom_hline(aes(yintercept=y_place),size=50,alpha=0.1)+
geom_point(data = . %>% filter(!is.na(what)),size=5)+
geom_label(aes(label=what),size=3,nudge_y = -0.025)+
theme_bw()+
coord_cartesian(ylim = c(min(jitter_locations$y_place)-0.5*SCALE,
max(jitter_locations$y_place)+0.5*SCALE))+
scale_y_continuous(breaks =
min(jitter_locations$y_place):max(jitter_locations$y_place),
labels = jitter_locations$place)+
scale_x_continuous(breaks =
min(data_for_plot$year):max(data_for_plot$year))+
ylab("Place")