How to display the ID of outliers on a boxplot - r

I want to display the IDs that have extreme values on a boxplot but I have no idea how to do it.
For example the IDs corresponding to the values 10, 98 and 120
Poids<-c(round(rnorm(100,65,10),1),10,53,120,98)
ID<-c(paste("A",1:26,sep = ""),paste("B",1:26,sep = ""),paste("C",1:26,sep = ""),
paste("D",1:26,sep = ""))
mydata<-data.frame(ID=ID,Poids=Poids)

Using tidyverse packages you can create a subset inside geom_text, here how:
Data
Poids <- c(round(rnorm(100,65,10),1),10,53,120,98)
ID <- c(paste("A",1:26,sep = ""),paste("B",1:26,sep = ""),paste("C",1:26,sep = ""),
paste("D",1:26,sep = ""))
mydata <- data.frame(ID=ID,Poids=Poids)
Setting values manually
Code
library(dplyr)
library(ggplot2)
mydata %>%
ggplot(aes(x = Poids))+
geom_boxplot()+
geom_text(
data = mydata %>% filter(Poids %in% c(10,98,120)),
mapping = aes(y = 0,label = ID),
nudge_y = .05
)
Output
Using boxplot outlier criteria
Code
# remotes::install_github("vbfelix/relper")
library(relper)
mydata %>%
ggplot(aes(x = Poids))+
geom_boxplot()+
geom_text(
data = mydata %>% filter(is_outlier(Poids)),
mapping = aes(y = 0,label = ID),
nudge_y = .05
)
Output

Related

Adding a power curve to scatterplot

I want to add a power curve with confidence intervals to my diamter-weight relationship, which clearly follows a y=a*x^b regression. So far, I used the geom_smooth "loess" version, but this is not yet quite right and perfect. Any suggestion how to add a power regression line would be much appreciated. Below is the used code:
p2<-ggplot(Data,aes(x=Diameter,y=Wet_weight,colour=Site))+
geom_point(size=3.5,alpha=0.3)+
geom_smooth(aes(group=Species),method=loess,colour="black")+
labs(x="\nUmbrella diamter (mm)",y="Wet weight (mg)\n")+theme_classic()+
scale_colour_manual(values=c("black","dark blue","blue","dark green","green"))+
theme(axis.title.x=element_text(size=20),
axis.text.x=element_text(size=18,colour="black"),
axis.title.y=element_text(size=20),
axis.text.y=element_text(size=18,colour="black"),
axis.ticks=element_line(colour="black",size=1),
axis.line=element_line(colour="black",size=1,linetype="solid"),
legend.position=c(0.18,0.75),
legend.text=element_text(colour="black",size=17),
legend.title=element_text(colour="black",size=18))
p2
Thank you!
I used this to get many equations, R2, and plots.
df= #change your data frame so it fits the current code
variables=c("group","year") #if you have multiple groups/seasons/years/elements add them here
df$y= #which variable will be your y
df$x= #which variable will be your x
#No changes get the equations
text=df %>%
group_by(across(all_of(variables))) %>% #your grouping variables
do(broom::tidy(lm(log(y) ~ log(x), data = .))) %>%
ungroup() %>%
mutate(y = round(ifelse(term=='(Intercept)',exp(estimate),estimate),digits = 2)) %>% #your equation values rounded to 2
select(-estimate,-std.error,-statistic ,-p.value) %>%
pivot_wider(names_from = term,values_from = y) %>%
rename(.,a=`(Intercept)`,b=`log(x)`)
#CHANGE before running!! add your grouping variables
rsq=df %>%
split(list(.$group,.$year)) %>% #---- HERE add the names after $
map(~lm(log(y) ~ log(x), data = .)) %>%
map(summary) %>%
map_dbl("r.squared") %>%
data.frame()
#Join the R2 and y results for the plot in a single data frame and write the equations
labels.df=mutate(rsq,groups=row.names(rsq)) %>%
separate(col = groups,into = c(variables),sep = "[.]",
convert = TRUE, remove = T, fill = "right") %>%
rename("R"='.') %>%
left_join(text,.) %>%
mutate(R=round(R,digits = 4), #round your R2 digits
eq= paste('y==',a,"~x^(",b,")", sep = ""),
rsql=paste("R^2==",R),
full= paste('y==',a,"~x^(",b,")","~~R^2==",R, sep = ""))
# plot
ggplot(df,aes(x = x,y = y)) +
geom_point(size=4,mapping = aes(
colour=factor(ifelse(is.na(get(variables[2])),"",(get(variables[2])))), #points colour
shape=get(variables[1]))) + # different shapes
facet_wrap(get(variables[1])~ifelse(is.na(get(variables[2])),"",get(variables[2])),
scales = "free",labeller = labeller(.multi_line = F))+ #for multiple groups; join text in one line
stat_smooth(mapping=aes(colour=get(variables[1])), #colours for our trend
method = 'nls', formula = 'y~a*x^b',
method.args = list(start=c(a=1,b=1)),se=FALSE) +
geom_text(labels.df,x = Inf, y = Inf,size=5, mapping = aes(label = (eq)), parse = T,vjust=1, hjust=1)+
geom_text(labels.df,x = Inf,y = Inf,size=5, mapping = aes(label = (rsql)), parse = T,vjust=2.5, hjust=1)+
#scale_y_log10() + #add this to avoid problems with big y values
labs(x="Your x label",y="your y label")+
theme_bw(base_size = 16) +
theme(legend.position = "none",
strip.background = element_rect(fill="#b2d6e2"))

Multigroup frequency with ggplot

I'm trying to replicate this histogram in R.
Here is how to mock my dataset:
dft <- data.frame(
menutype = sample(c(1,2,4,5,6,8,12), 120, replace = T),
Belief = sample(c(0,1), 120, replace = T),
Choice = sample(c(0,1), 120, replace = T)
)
Here is my code :
library(ggplot2)
library(dplyr)
library(tidyr)
library(MASS)
df <- data.frame(
menutype = factor(df$menutype, labels = c("GUILT" , "SSB0", "SSB1", "FLEX0", "FLEX1", "STD", "FLEX01"),
levels = c(1,2,4,5,6,8,12)),
Belief = factor(df$belieflearn, levels = c(1), labels= c("Believe Learn")), #Interested only in this condition
Choice = factor(df$learned, levels = c(1), labels= c("Learn")) #Same here
)
df1 <- rbind(na.omit(df %>%
count(Belief, menutype) %>%
group_by(menutype) %>%
mutate(prop = n / sum(n))),
na.omit(df %>%
count(Choice, menutype) %>%
group_by(menutype) %>%
mutate(prop = n / sum(n))))
test <- paste(df1$Belief[1:6],paste(df1$Choice[7:13]))
test[1:6] <- paste(df1$Belief[1:6])
test[7:13] <- paste(df1$Choice[7:13])
df1$combine <- paste(test)
ggplot(data = df1, aes(menutype, prop, fill = combine)) +
labs(title = "Classification based on rank ordering\n", x = "", y = "Fraction of subjects", fill = "\n") +
geom_bar(stat = "identity", position = "dodge")+
theme_bw() +
theme(legend.position="bottom", plot.title = element_text(hjust = 0.5)) #Centering of the main title+
#geom_text(aes(label="ok"), vjust=-0.3, size=3.5)+
The problem is that it's more or less working, I'm almost getting the graph that I want but it is a workaround and there is still some errors. Indeed, I've for example the same value for STD (0.10), while it should be 0 and 0.10 like in the original graph.
What I would like to do optimally is to have two different dataframe, one with menutype and Belief, the other one with menutype and Choice, then as I did, compute the proportion of a specific modality in each latter variables on menutype, and finally to plot it as histograms, much as the graph in the original study. Additionally, I'd like to have the proportions as fractions above each bar, but that is optional.
Could someone help me on this matter? I'm really struggling to get it working.
Thanks in advance!
EDIT: I think the issue is with the fill =. I would like to specify for each bar the variable I want (e.g, fill = df2$Belief & df2$Choice) but I don't know how to proceed.
library(tidyverse)
set.seed(10)
# example data frame
df <- data.frame(
menutype = sample(c(1,2,4,5,6,8,12), 120, replace = T),
Belief = sample(c(0,1), 120, replace = T),
Choice = sample(c(0,1), 120, replace = T)
)
# calculate all metrics based on all variables you want to plot in a tidy way
df_plot = df %>%
group_by(Choice) %>%
count(menutype, Belief) %>%
mutate(prop = n / sum(n),
prop_text = paste0(n, "/", sum(n))) %>%
ungroup()
# barplots using one variable and split plots using another variable
df_plot %>%
mutate(Belief = factor(Belief),
menutype = factor(menutype)) %>%
ggplot(aes(menutype, prop, fill = Belief))+
geom_col(position = "dodge")+
facet_wrap(~Choice, ncol=1)+
geom_text(aes(label=prop_text), position = position_dodge(1), vjust = -0.5)+
ylim(0,0.2)

ggplot2::geom_text(): how to display all factor levels, but suppress specific values like '0':

Here is code to give context to my question:
set.seed(1); tibble(x=factor(sample(LETTERS[1:7],7,replace = T),levels = LETTERS[1:7])) %>% group_by_all() %>% count(x,.drop = F) %>%
ggplot(mapping = aes(x=x,y=n))+geom_bar(stat="identity")+geom_text(
aes(label = n, y = n + 0.05),
position = position_dodge(1),
vjust = 0)
I want ALL of the levels of the variable x to be displayed on the x-axis (LETTERS[1:7]). For each Level with n>0, I want the value to display atop the bar for that level. For each level with n==0, I want the value label to NOT be displayed. Currently, the plot displays the 0 for 'empty' factor levels c("C","F"), and I want to suppress the display of '0's for those levels, but still display "C", and "F" on the x-axis.
I hope someone might be able to help me.
Thanks.
A simple ifelse() will do it. You can enter any text you like for example ifelse( n>0, n , "No Data")
library( tidyr)
library( ggplot2)
library( dplyr )
set.seed(1); tibble(x=factor(sample(LETTERS[1:7],7,replace = T),levels = LETTERS[1:7])) %>% group_by_all() %>% count(x,.drop = F) %>%
ggplot(mapping = aes(x=x,y=n))+geom_bar(stat="identity")+
geom_text(
aes(label = ifelse( n>0, n , ""), y = n + 0.05),
position = position_dodge(1),
vjust = 0)
You pass a function to the data argument inside geom_test, for this example you can do a subset on the piped data (referred as .x):
set.seed(1);
tibble(x=factor(sample(LETTERS[1:7],7,replace = T),levels = LETTERS[1:7])) %>% group_by_all() %>% count(x,.drop = F) %>%
ggplot(mapping = aes(x=x,y=n))+geom_bar(stat="identity")+
geom_text(data=~subset(.x,n>0),
aes(label = n, y = n + 0.05),
position = position_dodge(1),
vjust = 0)

Plot NA counts in a histogram

I have a question related to the histograms in R using ggplot2. I have been working trying to represent some values in a histogram from two different variables. After trying and looking for some solutions in Stackoverflow I got it but...does somebody know how to print NAs count as a new column just to compare the missings in the two variables?
Here is the R code:
i<-"ADL_1_bathing"
j<-"ADL_1_T2_bathing"
t1<-data.frame(datosMedicos[,i])
colnames(t1)<-"datos"
t2<-data.frame(datosMedicos[,j])
colnames(t2)<-"datos"
t1$time<-"t1"
t2$time<-"t2"
juntarParaGrafico<-rbind(t1,t2)
ggplot(juntarParaGrafico, aes(datos, fill = time) ) +
geom_histogram(col="darkblue",alpha = 0.5, aes(y = ..count..), binwidth = 0.2, position = 'dodge', na.rm = F) +
theme(legend.justification = c(1, 1), legend.position=c(1, 1))+
labs(title=paste0("Distribution of ",i), x=i, y="Count")
And this is the output:
Image about the two variables values but without the missing bars:
you could try to summarise the number of NAs b4 plotting. How about this?
library(ggplot2)
library(dplyr)
df1 = data.frame(a = rnorm(1:20))
df1[sample(1:20, 5),] = NA
df2 = data.frame(a = rnorm(1:20))
df2[sample(1:20, 3),] = NA
df2$time = "t2"
df1$time = "t1"
df = rbind(df1, df2)
df %>% group_by(time) %>% summarise(numNAs = sum(is.na(a)))
histogramDF= df %>% group_by(time) %>% summarise(numNAs = sum(is.na(a)))
qplot(x=time, y = numNAs, fill=time, data = histogramDF, stat='identity', geom="histogram")

ggplot/GGally - Parallel Coordinates - y-axis labels

Does anyone know if there is a way to add variable labels to the ggparcoord function in GGally? I've tried numerous ways with geom_text, but nothing is yielding results.
To be more explicit, I am looking to pass the row.names(mtcars) through geom_text. The only way that I can distinguish the car is passing row.names(mtcars) through the groupColumn argument, but I don't like the way this looks.
Doesn't work:
mtcars$carName <- row.names(mtcars) # This becomes column 12
library(GGally)
# Attempt 1
ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1) +
geom_text(aes(label = carName))
# Attempt 2
ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1,
mapping = aes(label = carName))
Any ideas would be appreciated!
Solution 1: If you want to stick close to your original attempt, you can calculate the appropriate y coordinates for the car names, & add that as a separate data source. Use inherit.aes = FALSE so that this geom_text layer doesn't inherit anything from the ggplot object created using ggparcoord():
library(dplyr)
p1 <- ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1) +
geom_text(data = mtcars %>%
select(carName) %>%
mutate(x = 1,
y = scale(as.integer(factor(carName)))),
aes(x = x, y = y, label = carName),
hjust = 1.1,
inherit.aes = FALSE) +
# optional: remove "carName" from x-axis labels
scale_x_discrete(labels = function(x) c("", x[-1])) +
# also optional: hide legend, which doesn't really seem relevant here
theme(legend.position = "none")
p1
Solution 2: This alternative uses carName as the group column, & doesn't pass it as one of the parallel coordinate columns. (which I think this might be closer to the use cases intended by this function...) Specifying carName as the group column allows the car name values to be captured in the data slot of the ggplot object created by ggparcoord() this time, so our geom_text label can inherit it directly, & even filter only for rows corresponding to variable == "mpg" (or whatever the first of the parallel coordinate columns is named, in the actual use case). The y coordinates are not as evenly spread out as above, but geom_text_repel from the ggrepel package does a decent job at shifting overlapping text labels away from one another.
library(dplyr)
library(ggrepel)
p2 <- ggparcoord(mtcars,
columns = c(1, 6),
groupColumn = "carName") +
geom_text_repel(data = . %>%
filter(variable == "mpg"),
aes(x = variable, y = value, label = carName),
xlim = c(NA, 1)) + # limit repel region to the left of the 1st column
theme(legend.position = "none") # as before, hide legend since the labels
# are already in the plot
p2
Solution 3 / 4: You can actually plot the same with ggplot(), without relying on extensions that may do unexpected stuff behind the scenes:
library(dplyr)
library(tidyr)
library(ggrepel)
# similar output to solution 1
p3 <- mtcars %>%
select(carName, mpg, wt) %>%
mutate(carName.column = as.integer(factor(carName))) %>%
gather(variable, value, -carName) %>%
group_by(variable) %>%
mutate(value = scale(value)) %>%
ungroup() %>%
ggplot(aes(x = variable, y = value, label = carName, group = carName)) +
geom_line() +
geom_text(data = . %>% filter(variable == "carName.column"),
hjust = 1.1) +
scale_x_discrete(labels = function(x) c("", x[-1]))
p3
# similar output to solution 2
p4 <- mtcars %>%
select(carName, mpg, wt) %>%
gather(variable, value, -carName) %>%
group_by(variable) %>%
mutate(value = scale(value)) %>%
ungroup() %>%
ggplot(aes(x = variable, y = value, label = carName, group = carName)) +
geom_line() +
geom_text_repel(data = . %>% filter(variable == "mpg"),
xlim = c(NA, 1))
p4
Edit
You can add text labels on the right as well, for each of the above. Do note that the location for labels may not be nicely spaced out, since they are positioned according to wt's scaled values:
p1 +
geom_text(data = mtcars %>%
select(carName, wt) %>%
mutate(x = 3,
y = scale(wt)),
aes(x = x, y = y, label = carName),
hjust = -0.1,
inherit.aes = FALSE)
p2 +
geom_text_repel(data = . %>%
filter(variable == "wt"),
aes(x = variable, y = value, label = carName),
xlim = c(2, NA))
p3 +
geom_text(data = . %>% filter(variable == "wt"),
hjust = -0.1)
p4 +
geom_text_repel(data = . %>% filter(variable == "wt"),
xlim = c(2, NA))

Resources