Specification curve "choices" plot using ggplot2 - r

I have a small dataset of estimates from many regressions of an outcome variable on a main treatment variable and then various sets of control variables (in fact, all possible combinations of those controls variables). The table of estimates is as follows:
df <-
structure(list(control_set = c("cen21_hindu_pct", "cen83_urban_pct",
"cen21_hindu_pct + cen83_urban_pct", "NONE"), xest = c(0.0124513609978549,
0.00427174623249021, 0.006447506098051, 0.0137107176362076),
xest_conf_low = c(0.00750677700140716, -0.00436301983024899,
-0.0013089334064237, 0.00925185534519074), xest_conf_high = c(0.0173959449943027,
0.0129065122952294, 0.0142039456025257, 0.0181695799272245
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
I want to make the two plots for the classic "specification curve analysis." The top plot is simply the set of estimates ordered by the magnitude of the estimate on the main treatment variable (no issue here):
df %>%
arrange(xest) %>%
mutate(specifications = 1:nrow(.)) %>%
ggplot(aes(x = specifications, y = xest, ymin = xest_conf_low, ymax = xest_conf_high)) +
geom_pointrange(alpha = 0.1, size = 0.6, fatten = 1) +
labs(x = "", y = "Estimate\n") +
theme_bw()
My problem is with the aligned plot underneath that describes the control-set choices. Directly underneath each coefficient dot and whisker from the plot just made I want a plot that indicates the set of corresponding control variables that were included in that model (i.e. the list of controls in the control_set column in the df data frame row). So the plot I need in this example would look just like this:
This is a (failed) sketch of what I tried to get there, by modifying the earlier estimation dataset in long form, but I couldn't get multiple ticks to show vertically: (Note, this bit of code won't run)
# forplot %>%
# arrange(xest) %>%
# mutate(specifications = 1:nrow(.)) %>%
# mutate(value = "|") %>%
# ggplot(aes(specifications, term)) +
# geom_text(aes(label = value)) +
# scale_color_manual(values = c("lightblue")) +
# labs(x = "\nSpecification number", y = "") +
# theme_bw()
How can I use ggplot2 to make the plot-figure shown above from the information in the data frame, df?

If we define your plot as -> a...
library(patchwork)
b <- tibble(specifications = c(1,2,2,3),
control_set = rep(c("cen83_urban_pct", "cen21_hindu_pct"), each = 2)) %>%
ggplot(aes(specifications, control_set)) +
geom_text(aes(label = "|"), size = 5) +
coord_cartesian(xlim = c(1,4)) +
labs(x = NULL, y = NULL) +
theme_bw()+
theme(axis.ticks = element_blank(),
axis.text.x = element_blank())
a/b + plot_layout(heights = c(3,1))
If you want to generate the key automatically, you might use something like this:
library(dplyr)
df %>%
select(control_set) %>%
mutate(specifications = 1:4) %>%
separate_rows(control_set, sep = "\\+") %>%
mutate(control_set = trimws(control_set)) %>% # b/c my regex not good enough to trim spaces in line above
...

If you want to relabel the numbers in the y-axis with the control_set labels you can add
+ scale_y_continuous(breaks = df$xest, labels = df$control_set)

Related

Adding a power curve to scatterplot

I want to add a power curve with confidence intervals to my diamter-weight relationship, which clearly follows a y=a*x^b regression. So far, I used the geom_smooth "loess" version, but this is not yet quite right and perfect. Any suggestion how to add a power regression line would be much appreciated. Below is the used code:
p2<-ggplot(Data,aes(x=Diameter,y=Wet_weight,colour=Site))+
geom_point(size=3.5,alpha=0.3)+
geom_smooth(aes(group=Species),method=loess,colour="black")+
labs(x="\nUmbrella diamter (mm)",y="Wet weight (mg)\n")+theme_classic()+
scale_colour_manual(values=c("black","dark blue","blue","dark green","green"))+
theme(axis.title.x=element_text(size=20),
axis.text.x=element_text(size=18,colour="black"),
axis.title.y=element_text(size=20),
axis.text.y=element_text(size=18,colour="black"),
axis.ticks=element_line(colour="black",size=1),
axis.line=element_line(colour="black",size=1,linetype="solid"),
legend.position=c(0.18,0.75),
legend.text=element_text(colour="black",size=17),
legend.title=element_text(colour="black",size=18))
p2
Thank you!
I used this to get many equations, R2, and plots.
df= #change your data frame so it fits the current code
variables=c("group","year") #if you have multiple groups/seasons/years/elements add them here
df$y= #which variable will be your y
df$x= #which variable will be your x
#No changes get the equations
text=df %>%
group_by(across(all_of(variables))) %>% #your grouping variables
do(broom::tidy(lm(log(y) ~ log(x), data = .))) %>%
ungroup() %>%
mutate(y = round(ifelse(term=='(Intercept)',exp(estimate),estimate),digits = 2)) %>% #your equation values rounded to 2
select(-estimate,-std.error,-statistic ,-p.value) %>%
pivot_wider(names_from = term,values_from = y) %>%
rename(.,a=`(Intercept)`,b=`log(x)`)
#CHANGE before running!! add your grouping variables
rsq=df %>%
split(list(.$group,.$year)) %>% #---- HERE add the names after $
map(~lm(log(y) ~ log(x), data = .)) %>%
map(summary) %>%
map_dbl("r.squared") %>%
data.frame()
#Join the R2 and y results for the plot in a single data frame and write the equations
labels.df=mutate(rsq,groups=row.names(rsq)) %>%
separate(col = groups,into = c(variables),sep = "[.]",
convert = TRUE, remove = T, fill = "right") %>%
rename("R"='.') %>%
left_join(text,.) %>%
mutate(R=round(R,digits = 4), #round your R2 digits
eq= paste('y==',a,"~x^(",b,")", sep = ""),
rsql=paste("R^2==",R),
full= paste('y==',a,"~x^(",b,")","~~R^2==",R, sep = ""))
# plot
ggplot(df,aes(x = x,y = y)) +
geom_point(size=4,mapping = aes(
colour=factor(ifelse(is.na(get(variables[2])),"",(get(variables[2])))), #points colour
shape=get(variables[1]))) + # different shapes
facet_wrap(get(variables[1])~ifelse(is.na(get(variables[2])),"",get(variables[2])),
scales = "free",labeller = labeller(.multi_line = F))+ #for multiple groups; join text in one line
stat_smooth(mapping=aes(colour=get(variables[1])), #colours for our trend
method = 'nls', formula = 'y~a*x^b',
method.args = list(start=c(a=1,b=1)),se=FALSE) +
geom_text(labels.df,x = Inf, y = Inf,size=5, mapping = aes(label = (eq)), parse = T,vjust=1, hjust=1)+
geom_text(labels.df,x = Inf,y = Inf,size=5, mapping = aes(label = (rsql)), parse = T,vjust=2.5, hjust=1)+
#scale_y_log10() + #add this to avoid problems with big y values
labs(x="Your x label",y="your y label")+
theme_bw(base_size = 16) +
theme(legend.position = "none",
strip.background = element_rect(fill="#b2d6e2"))

Using different data for positioning and display of labels in plots

TL;DR: with plot labels using geom_label etc., is it possible to use different data for the calculation of positions of using position_stack or similar functions, than for the display of the label itself? Or, less generally, is it possible to subset the label data after positions have been calculated?
I have some time series data for many different subjects. Observations took place at multiple time points, which are the same for each subject. I would like to plot this data as a stacked area plot, where the height of a subject's curve at each time point corresponds to the observed value for that subject at that time point. Crucially, I also need to add labels to identify each subject.
However, the trivial solution of adding one label at each observation makes the plot unreadable, so I would like to limit the displayed labels to the "most important" subjects (the ones that have the highest peak), as well as only display a label at the respective peak. This subsetting of the labels themselves is not a problem either, but I cannot figure out how to then position the (subset of) labels correctly so they match with the stacked area chart.
Here is some example code, which should work out of the box with tidyverse installed, to illustrate my issue. First, we generate some data which has the same structure as mine:
library(tidyverse)
set.seed(0)
# Generate some data
num_subjects = 50
num_timepoints = 10
labels = paste(sample(words, num_subjects), sample(fruit, num_subjects), sep = "_")
col_names = c("name", paste0("timepoint_", c(1:num_timepoints)))
df = bind_rows(map(labels,
~c(., cumsum(rnorm(num_timepoints))) %>%
set_names(col_names))) %>%
pivot_longer(starts_with("timepoint_"), names_to = "timepoint", names_prefix = "timepoint_") %>%
mutate(across(all_of(c("timepoint", "value")), as.numeric)) %>%
mutate(value = if_else(value < 0, 0, value)) %>%
group_by(name) %>% mutate(peak = max(value)) %>% ungroup()
Now, we can trivially make a simple stacked area plot without labels:
# Plot (without labels)
ggplot(df,
mapping = aes(x = factor(timepoint), y = value, group = name, fill = factor(peak))) +
geom_area(show.legend = FALSE, position = "stack", colour = "gray25") +
scale_fill_viridis_d()
Plot without labels (it appears that I currently cannot embed images, which is very unfortunate as they are extremely illustrative here...)
It is also not too hard to add non-specific labels to this data. They can easily be made to appear at the correct position — so the center of the label is at the middle of the area for each time point and subject — using position_stack:
# Plot (all labels, positions are correct but the plot is basically unreadable)
ggplot(df,
mapping = aes(x = factor(timepoint), y = value, group = name, fill = factor(peak))) +
geom_area(show.legend = FALSE, position = "stack", colour = "gray25") +
geom_label(mapping = aes(label = name), show.legend = FALSE, position = position_stack(vjust = 0.5)) +
scale_fill_viridis_d()
Plot with a label at each observation
However, as noted before, the labels almost entirely obscure the plot itself. So my approach would be to only show labels at the peaks, and only for the 10 subjects with the highest peaks:
# Plot (only show labels at the peak for the 10 highest peaks, readable but positions are wrong)
max_labels = 10 # how many labels to show
df_labels = df %>%
group_by(name) %>% slice_max(value, n = 1) %>% ungroup() %>%
slice_max(value, n = max_labels)
ggplot(df,
mapping = aes(x = factor(timepoint), y = value, group = name, fill = factor(peak))) +
geom_area(show.legend = FALSE, position = "stack", colour = "gray25") +
geom_label(data = df_labels, mapping = aes(label = name), show.legend = FALSE, position = position_stack(vjust = 0.5)) +
scale_fill_viridis_d()
Plot with only a subset of labels
This code also works fine, but it is apparent that the labels no longer show up at the correct positions, but are instead too low on the plot, especially for the subjects which would otherwise be higher up. (The only subject where the position is correct is work_eggplant.) This makes perfect sense, as the data used for calculation of position_stack are now only a subset of the original data, so the observations which would receive no labels are not considered when stacking. This can be illustrated by zeroing out all the observations which would not receive a label:
df_zeroed = anti_join(df %>% mutate(value = 0),
df_labels,
by = c("name", "timepoint")) %>% bind_rows(df_labels)
ggplot(df_zeroed,
mapping = aes(x = factor(timepoint), y = value, group = name, fill = factor(peak))) +
geom_area(show.legend = FALSE, position = "stack", colour = "gray25") +
geom_label(data = df_labels, mapping = aes(label = name), show.legend = FALSE, position = position_stack(vjust = 0.5)) +
scale_fill_viridis_d()
Plot with unlabeled observations zeroed out
So now my question is, how can this problem be solved? Is there a way to use the original data for the positioning, but the subset data for the actual display of the labels?
Maybe this is what you are looking for. To achieve the desired result you could
use the whole dataset for plotting the labels to get the right positions,
use an empty string "" for the non-desired labels ,
set the fill and color of non-desired labels to "transparent"
# Plot (only show labels at the peak for the 10 highest peaks, readable but positions are wrong)
max_labels = 10 # how many labels to show
df_labels = df %>%
group_by(name) %>%
slice_max(value, n = 1) %>%
ungroup() %>%
slice_max(value, n = max_labels) %>%
mutate(label = name)
df1 <- df %>%
left_join(df_labels) %>%
replace_na(list(label = ""))
#> Joining, by = c("name", "timepoint", "value", "peak")
ggplot(df1,
mapping = aes(x = factor(timepoint), y = value, group = name, fill = as.character(peak))) +
geom_area(show.legend = FALSE, position = "stack", colour = "gray25") +
geom_label(mapping = aes(
label = label,
fill = ifelse(label != "", as.character(peak), NA_character_),
color = ifelse(label != "", "black", NA_character_)),
show.legend = FALSE, position = position_stack(vjust = 0.5)) +
scale_fill_viridis_d(na.value = "transparent") +
scale_color_manual(values = c("black" = "black"), na.value = "transparent")
EDIT If you want the fill colors to correspond to the value of peak then
a simple solution would be to map peak on fill instead of factor(peak) and make use of fill = ifelse(label != "", peak, NA_real_) in geom_label. However, in that case you have to switch to a continuous fill scale.
as I guess that you had a good reason to make use of discrete scale an other option would be to make peak an orderd factor. This approach however is not that simple. To make this work I first reorder factor(peak) according to peak, add an additional NA level and make us of an auxilliary variable peak1 to fill the labels. However, as we have two different variables to be mapped on fill I would suggest to make use of a second fill scale using ggnewscale::new_scale_fill to achieve the desired result:
library(tidyverse)
set.seed(0)
#cumsum(rnorm(num_timepoints)) * 3
# Generate some data
num_subjects = 50
num_timepoints = 10
labels = paste(sample(words, num_subjects), sample(fruit, num_subjects), sep = "_")
col_names = c("name", paste0("timepoint_", c(1:num_timepoints)))
df = bind_rows(map(labels,
~c(., cumsum(rnorm(num_timepoints)) * 3) %>%
set_names(col_names))) %>%
pivot_longer(starts_with("timepoint_"), names_to = "timepoint", names_prefix = "timepoint_") %>%
mutate(across(all_of(c("timepoint", "value")), as.numeric)) %>%
mutate(value = if_else(value < 0, 0, value)) %>%
group_by(name) %>% mutate(peak = max(value)) %>% ungroup()
# Plot (only show labels at the peak for the 10 highest peaks, readable but positions are wrong)
max_labels = 10 # how many labels to show
df_labels = df %>%
group_by(name) %>%
slice_max(value, n = 1) %>%
ungroup() %>%
slice_max(value, n = max_labels) %>%
mutate(label = name)
df1 <- df %>%
left_join(df_labels) %>%
replace_na(list(label = ""))
#> Joining, by = c("name", "timepoint", "value", "peak")
df2 <- df1 %>%
mutate(
# Make ordered factor
peak = fct_reorder(factor(peak), peak),
# Add NA level to peak
peak = fct_expand(peak, NA),
# Auxilliary variable to set the fill to NA for non-desired labels
peak1 = if_else(label != "", peak, factor(NA)))
ggplot(df2, mapping = aes(x = factor(timepoint), y = value, group = name, fill = peak)) +
geom_area(show.legend = TRUE, position = "stack", colour = "gray25") +
scale_fill_viridis_d(na.value = "transparent") +
# Add a second fill scale
ggnewscale::new_scale_fill() +
geom_label(mapping = aes(
label = label,
fill = peak1,
color = ifelse(label != "", "black", NA_character_)),
show.legend = FALSE, position = position_stack(vjust = 0.5)) +
scale_fill_viridis_d(na.value = "transparent") +
scale_color_manual(values = c("black" = "black"), na.value = "transparent")

How to clean my data and create a graph with ggplot2?

So I want to create a graph using data from Wikipedia, I created a data frame out of table that I have found. It contains two columns - style of beer and range of bitternes (IBU) like "20-50". Both are character, so I can't make a graph out of it that makes sense. I managed to change IBU column to two separate ones that are both numeric (min and max) but it created second data frame inside my first data frame, tried to find similar case but I couldn't, I'm now stuck and don't know what to do next :(
Sorry in advance for pasting so much code, I just want someone to read the data and see it's structure.
library(xml2)
library(rvest)
library(ggplot2)
library(tidyr)
file_html <- read_html(
"https://pl.wikipedia.org/wiki/International_Bittering_Units",
encoding = "UTF-8")
table_html <- html_node(file_html, "#mw-content-text > div > table")
table_IBU <- html_table(table_html, fill = TRUE)
table_IBU$IBU2 <- str_replace(table_IBU$`Stopień IBU`, "\\+", "")
table_IBU$IBU3 <- tidyr::separate(table_IBU, IBU2, into = c("min", "max"), sep = " – ")
table_IBU <- subset(table_IBU, select = -c(IBU2,
`Stopień IBU`,
`Gatunek piwa`))
table_IBU$IBU3$min2 <- as.numeric(table_IBU$IBU3$min)
table_IBU$IBU3$max2 <- as.numeric(table_IBU$IBU3$max)
#graph that I can come up with on my own
IBUgraph <- ggplot(table_IBU$IBU3, aes(reorder(`Gatunek piwa`, + max2),
max2)) +
geom_point(width = 0.5, color = "darkolivegreen",
fill = "darkseagreen4") +
theme(text=element_text(size = 9))
IBUgraph = IBUgraph +
labs(y = "Międzynarodowe Jednostki Goryczy (IBU)",
x = "Gatunek",
title = "Skala IBU - International Bitterness Units,
czyli międzynarodowe jednostki goryczy")
IBUgraph <- IBUgraph + theme(axis.text.x=element_text(angle=45, hjust=1.1))
IBUgraph
In the end I want to create a graph using ggplot() showcasing style of beer on x axis, and two points for each style showcasing minimum vaule, maximum value.
You can do this for example, it's called a dumbbell chart
ggplot(table_IBU$IBU3,aes(x=`Gatunek piwa`)) +
geom_point(aes(y=min2)) + # add point for min
geom_point(aes(y=max2)) + # add point for max
geom_segment(aes(xend=`Gatunek piwa`,y=min2,yend=max2)) + # create segment between min and max
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # rotate x axis
So, are you looking for something like this?
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(rvest)
#Acquire table
table_IBU <- read_html("https://pl.wikipedia.org/wiki/International_Bittering_Units", encoding = "UTF-8") %>%
html_node(., "#mw-content-text > div > table") %>%
html_table(., fill = TRUE)
#Extract scores into min and max values
table_IBU$IBU2 <- str_replace(table_IBU$`Stopień IBU`, "\\+", "")
table_IBU %<>% separate(., IBU2, into = c("min", "max"), sep = " – ") %>% select(-c(`Stopień IBU`))
table_IBU$min <- as.integer(table_IBU$min)
table_IBU$max <- as.integer(table_IBU$max)
table_IBU %<>% gather(data = ., key = "Limit", value = "Value", min, max)
#Plot
table_IBU %>% ggplot(data = ., aes(x = `Gatunek piwa`)) +
geom_point(aes(y = Value, col = Limit)) +
xlab("Type of beer") +
ylab("Score (0-120)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Quite an odd way to display this data.

Faceting a proportional half-area plot made with ggforce?

I am using ggforce to create a plot like this. .
My goal is to facet this type of plot.
For background on how the chart was made, check out update 3 on this question. The only modification that I have made was adding a geom_segment between the x axis and the Y value positions.
The reason why I believe faceting this graph is either difficult, or even impossible, is because continuous value x coordinates are used to determine where the geom_arc_bar is positioned in space.
My only idea for getting this to work has been supplying each "characteristic" that I want to facet with a set of x coordinates (1,2,3). Initially, as I will demonstrate in my code, I worked with set of highly curated data. Ideally, I would like to scale this to a dataset with many variables.
In the example graph that I have provided, the Y value is from table8, filtered for rows with "DFT". The area of the half-circles is proportional to the values of DDFS and FDFS from table9. Ideally, I would like to be able to create a function allowing for the easy creation of these graphs, with perhaps 3 parameters, the data for the y value, and for both half circles.
Here is my data.
Here is the code that I have written thus far.
For making a single plot
#Filter desired Age and Measurement
table9 %>%
filter(Age == "6-11" & Measurement != 'DFS' ) %>%
select( SurveyYear, Total , Measurement ) %>%
arrange(SurveyYear) %>%
dplyr::rename(Percent = Total) -> table9
#Do the same for table 8.
table8 %>%
filter(Age == "6-11" & Measurement != "DS" & Measurement != "FS") %>%
select(SurveyYear, Total) %>%
dplyr::rename(Y = Total)-> table8
table8 <- table8 %>%
bind_rows(table8) %>%
arrange(Y) %>%
add_column(start = rep(c(-pi/2, pi/2), 3), x = c(1,1,2,2,3,3))
table8_9 <- bind_cols(table8,table9) %>%
select(-SurveyYear1)
#Create the plot
ggplot(table8_9) + geom_segment( aes(x=x, xend=x, y=0, yend=Y), size = 0.5, linetype="solid") +
geom_arc_bar(aes(x0 = x, y0 = Y, r0 = 0, r = sqrt((Percent*2)/pi)/20,
start = start, end = start + pi, fill = Measurement),
color = "black") + guides(fill = guide_legend(title = "Type", reverse = T)) +
guides(fill = guide_legend(title = "Measurement", reverse = F)) +
xlab("Survey Year") + ylab("Mean dfs") + coord_fixed() + theme_pubr() +
scale_y_continuous(expand = c(0, 0), limits = c(0, 5.5)) +
scale_x_continuous(breaks = 1:3, labels = paste0(c("1988-1994", "1999-2004", "2011-2014"))) +
scale_fill_discrete(labels = c("ds/dfs", "fs/dfs")) -> lolliPlot
lolliPlot
Attempt at many plots
#Filter for "DFS"
table8 <- table8 %>%
filter(Measurement=="DFS")
#Duplicate DF vertically, and add column specifying the start point for the arcs.
table8 <- table8 %>%
bind_rows(table8) %>%
add_column(start = rep(c(-pi/2, pi/2), length(.$SurveyYear)/2), x = rep(x = c(1,2,3),length(.$SurveyYear)/3)) %>%
arrange(Age, x)
#Bind two tables today, removing all of the characteristic columns from table 8.
table8_9 <- bind_cols(table8,table9) %>%
select(-Age1, -SurveyYear1, -Measurement) %>%
gather(key = Variable, value = Y, -x,-start,-Age, -SurveyYear, -Measurement1, -Total1, -Male1, -Female1, -'White, non-Hispanic1', -'Black, non-hispanic1', -'Mexican American1', -'Less than 100% FPG1', -'100-199% FPG1', -'Greater than 200% FPG1')
This is where I get stuck. I can't figure out a way to format the data so that I can facet the graph. If anybody has any ideas or advice, I would greatly appreciate it.

R: PCA ggplot Error "arguments imply differing number of rows"

I have a dataset:
https://docs.google.com/spreadsheets/d/1ZgyRQ2uTw-MjjkJgWCIiZ1vpnxKmF3o15a5awndttgo/edit?usp=sharing
that I'm trying to apply PCA analysis and to achieve a graph based on graph provided in this post:
https://stats.stackexchange.com/questions/61215/how-to-interpret-this-pca-biplot-coming-from-a-survey-of-what-areas-people-are-i
However, an error doesn't seem to go away:
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names =
TRUE, :
arguments imply differing number of rows: 0, 1006
Following is my code that I have trouble finding the source of error. Would like to have some help for error detection. Any hints?
The goal is to produced a PCA graph grouped by levels of Happiness.in.life. I modified the original code to fit with my dataset. Originally, group is determined by Genders, which has 2 levels. What I'm attempting to do is to build a graph based on 5 levels of Happiness.in.life. However, it doesn't seem I can use the old code...
Thanks!
library(magrittr)
library(dplyr)
library(tidyr)
df <- happiness_reduced %>% dplyr::select(Happiness.in.life:Internet.usage, Happiness.in.life)
head(df)
vars_on_hap <- df %>% dplyr::select(-Happiness.in.life)
head(vars_on_hap)
group<-df$Happiness.in.life
fit <- prcomp(vars_on_hap)
pcData <- data.frame(fit$x)
vPCs <- fit$rotation[, c("PC1", "PC2")] %>% as.data.frame()
multiple <- min(
(max(pcData[,"PC1"]) - min(pcData[,"PC1"]))/(max(vPCs[,"PC1"])-
min(vPCs[,"PC1"])),
(max(pcData[,"PC2"]) - min(pcData[,"PC2"]))/(max(vPCs[,"PC2"])-
min(vPCs[,"PC2"]))
)
ggplot(pcData, aes(x=PC1, y=PC2)) +
geom_point(aes(colour=groups)) +
coord_equal() +
geom_text(data=vPCs,
aes(x = fit$rotation[, "PC1"]*multiple*0.82,
y = fit$rotation[,"PC2"]*multiple*0.82,
label=rownames(fit$rotation)),
size = 2, vjust=1, color="black") +
geom_segment(data=vPCs,
aes(x = 0,
y = 0,
xend = fit$rotation[,"PC1"]*multiple*0.8,
yend = fit$rotation[,"PC2"]*multiple*0.8),
arrow = arrow(length = unit(.2, 'cm')),
color = "grey30")
Here is an approach on how to plot the result of PCA in ggplot2:
library(tidyverse)
library(ggrepel)
A good idea (not in all cases for instance if they are all in the same units) is to scale the variables prior to PCA
hapiness %>% #this is the data from google drive. In the future try not top post such links on SO because they tend to be unusable after some time has passed
select(-Happiness.in.life) %>%
prcomp(center = TRUE, scale. = TRUE) -> fit
Now we can proceed to plotting the fit:
fit$x %>% #coordinates of the points are in x element
as.data.frame()%>% #convert matrix to data frame
select(PC1, PC2) %>% #select the first two PC
bind_cols(hapiness = as.factor(hapiness$Happiness.in.life)) %>% #add the coloring variable
ggplot() +
geom_point(aes(x = PC1, y = PC2, colour = hapiness)) + #plot points and color
geom_segment(data = fit$rotation %>% #data we want plotted by geom_segment is in rotation element
as.data.frame()%>%
select(PC1, PC2) %>%
rownames_to_column(), #get to row names so you can label after
aes(x = 0, y = 0, xend = PC1 * 7, yend = PC2* 7, group = rowname), #I scaled the rotation by 7 so it fits in the plot nicely
arrow = arrow(angle = 20, type = "closed", ends = "last",length = unit(0.2,"cm")),
color = "grey30") +
geom_text_repel(data = fit$rotation %>%
as.data.frame()%>%
select(PC1, PC2) %>%
rownames_to_column(),
aes(x = PC1*7,
y = PC2*7,
label = rowname)) +
coord_equal(ratio = fit$sdev[2]^2 / fit$sdev[1]^2) + #I like setting the ratio to the ratio of eigen values
xlab(paste("PC1", round(fit$sdev[1]^2/ sum(fit$sdev^2) *100, 2), "%")) +
ylab(paste("PC2", round(fit$sdev[2]^2/ sum(fit$sdev^2) *100, 2), "%")) +
theme_bw()
Look at all them happy people on the left (well it is hard to notice because of the colors used, I suggest using the palette jco from ggpubr library) get_palette('jco', 5) ie scale_color_manual(values = get_palette('jco', 5))
quite a similar plot can be achieved with library ggord:
library(ggord)
ggord(fit, grp_in = as.factor(hapiness$Happiness.in.life),
size = 1, ellipse = F, ext = 1.2, vec_ext = 5)
the major difference is ggord uses equal scaling for axes. Also I scaled the rotation by 5 instead of 7 as in the first plot.
As you can see I do not like many intermediate data frames.

Resources