Select observations near their neighbors in a PCA cloud - r

I have a dataset ind with two populations, fr2100 and nr, where each individual has a unique number. Each individual has coordinates: a Dim.1 and a Dim.2 value. As you can see here:
> ind <- get_pca_ind(res_acp)
> ind
Principal Component Analysis Results for individuals
===================================================
Name Description
1 "$coord" "Coordinates for the individuals"
2 "$cos2" "Cos2 for the individuals"
3 "$contrib" "contributions of the individuals"
# isolate the population 'fr2100'
> fr2100 <- ind$coord[substr(rownames(ind$coord), 1, 7) == 'fr2100_', ]
> str(fr2100)
'data.frame': 6873 obs. of 3 variables:
$ rowname: chr "fr2100_72" "fr2100_73" "fr2100_74" "fr2100_75" ...
$ Dim.1 : num 1.37 1.3 1.25 1.25 1.18 ...
$ Dim.2 : num -1.249 -1.028 -0.835 -0.624 -0.483 ...
# isolate the population 'nr'
> nr <- ind$coord[substr(rownames(ind$coord), 1, 3) == 'nr_', ]
> str(nr)
'data.frame': 4897 obs. of 3 variables:
$ rowname: chr "nr_174" "nr_175" "nr_176" "nr_177" ...
$ Dim.1 : num -3.74 -3.44 -3.26 -2.97 -3.88 ...
$ Dim.2 : num 1.26 1.55 1.7 1.91 1.3 ...
My question: among the 6873 fr2100 individuals, how can I select only those whose Dim.1 AND Dim.2 values place them within a distance of roughly 0.01 of one of the 4897 nr individuals, represented in this cloud of points?
In other words, each fr2100 individual that falls within the perimeter (radius 0.01) of an nr individual, as theoretically represented here.
I'm interested in any answers. I can provide more information if needed. Thank you in advance.

I guess distance_semi_join() from the fuzzyjoin package would be a rather straightforward and compact way to filter by Euclidean distance. Other variants like distance_left_join() are also worth considering, as those provide an optional distance variable in the resulting data frame.
library(fuzzyjoin)
library(ggplot2)
# example datasets
set.seed(1)
nr <- data.frame(rowname = paste0("nr_", 1:100), Dim.1 = rnorm(100, -0.05, 0.03), Dim.2 = rnorm(100, 0, 0.02))
fr <- data.frame(rowname = paste0("fr_", 1:100), Dim.1 = rnorm(100, 0.05, 0.03), Dim.2 = rnorm(100, 0, 0.02))
# fr points within distance of closest nr point:
fr_in_dist <- distance_semi_join(fr, nr,
                                 by = c("Dim.1", "Dim.2"),
                                 max_dist = 0.01)
fr_in_dist
#> rowname Dim.1 Dim.2
#> 5 fr_5 -0.018557066 3.308291e-02
#> 14 fr_14 0.008893764 -1.311564e-02
#> 18 fr_18 0.012401307 -2.420202e-03
#> 25 fr_25 0.015302829 9.640590e-03
#> 28 fr_28 0.001834598 3.409789e-03
#> 32 fr_32 -0.036667620 -3.138164e-02
#> 38 fr_38 0.014406241 8.797409e-05
#> 46 fr_46 -0.010004948 -2.817701e-02
#> 57 fr_57 -0.022092886 -2.347154e-02
#> 68 fr_68 0.014326601 1.135904e-02
#> 77 fr_77 -0.018673719 2.577108e-03
#> 79 fr_79 0.010512645 -3.278219e-03
#> 84 fr_84 0.028963050 3.286837e-03
#> 86 fr_86 0.019967835 -1.130428e-03
#> 94 fr_94 0.007212280 6.132097e-03
ggplot() +
geom_point(data = nr, aes(x = Dim.1, y = Dim.2, color = "nr"))+
geom_point(data = fr, aes(x = Dim.1, y = Dim.2, color = "fr"))+
geom_point(data = fr_in_dist, aes(x = Dim.1, y = Dim.2), shape = 1, size = 5 )+
coord_fixed() +
theme_bw()
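As mentioned above, distance_left_join() can also report the actual distances. A minimal sketch on the same example data (the distance_col name "dist" is my choice):
# keep all fr rows, join matching nr rows and record the euclidean distance;
# fr points with no nr point within 0.01 get NA in the joined columns
fr_matched <- distance_left_join(fr, nr,
                                 by = c("Dim.1", "Dim.2"),
                                 max_dist = 0.01,
                                 distance_col = "dist")
head(fr_matched)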
The original answer addressed a single reference point vs a point cloud; in that case dist() from base R is also quite straightforward:
library(ggplot2)
# sample data, add point fr2100_xx that would fall outside of the perimeter
df <- read.csv(text = "rowname, Dim.1, Dim.2
fr2100_72, 0.003810163, 0.006935450
fr2100_73, 0.003433946, 0.004698691
fr2100_74, 0.003168248, 0.003097222
fr2100_xx, 0.015, 0.015")
# nr and threshold distance
nr <- c(0.0035, 0.005)
thr_dist <- 0.01
# insert the nr point in the first position so it is part of the distance matrix
dist_m <- rbind(c(0.0035, 0.005), df[, c("Dim.1", "Dim.2")]) |> dist() |> as.matrix()
# distances:
as.dist(dist_m)
#> 1 2 3 4
#> 2 0.0019601448
#> 3 0.0003084643 0.0022681777
#> 4 0.0019314822 0.0038915356 0.0016233602
#> 5 0.0152397507 0.0137930932 0.0154884012 0.0167829223
# extract the first column: distances from point "nr" ([1,1] = 0)
df$dist <- dist_m[-1, 1]
# flag points that fall within the perimeter
df$in_dist <- df$dist <= thr_dist
df
#> rowname Dim.1 Dim.2 dist in_dist
#> 1 fr2100_72 0.003810163 0.006935450 0.0019601448 TRUE
#> 2 fr2100_73 0.003433946 0.004698691 0.0003084643 TRUE
#> 3 fr2100_74 0.003168248 0.003097222 0.0019314822 TRUE
#> 4 fr2100_xx 0.015000000 0.015000000 0.0152397507 FALSE
Viz - https://i.imgur.com/jiqHmXn.png
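For the full two-cloud case without extra dependencies, a base-R sketch (reusing the fr and nr example data frames from the fuzzyjoin example above): compute each fr point's distance to its nearest nr point, then filter on the threshold.
# minimum euclidean distance from each fr point to the nr cloud
nn_dist <- apply(fr[, c("Dim.1", "Dim.2")], 1, function(p) {
  min(sqrt((nr$Dim.1 - p[1])^2 + (nr$Dim.2 - p[2])^2))
})
fr_in_dist_base <- fr[nn_dist <= 0.01, ]
This brute-force approach costs one distance per fr/nr pair, which is fine at the scale in the question (6873 x 4897) but worth swapping for a nearest-neighbour lookup on much larger clouds.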

Related

How can I convert my R code into a loop and reproducible script?

I have put together an R script for split plot analysis by exploring the internet (thanks to data scientists for making our lives easier). However, the problem now is that I have more than 100 variables to test. Inside this code there are a few instances where I need to change the column/variable name each time, the axis names for each figure, and the output figure name. Also, when the ANOVA results are printed on the screen, I manually copy the p-values and LSD scores before moving forward. Is there a way to just automate everything and save many hours of manual editing? The code I am using is below:
data <- read.csv("file.csv", header = TRUE)
data$Genotype <- as.factor(data$Genotype)
data$N.level <- as.factor(data$N.level)
library(agricolae)
attach(data)
Here, before running the model, I change the response variable name each time; the model output also prints the ANOVA results, which I copy manually before moving on:
model <- sp.plot(block = Block,
pplot = Genotype,
splot = N.level,
Y = GPC.30DAP)
Edf_a <- model$gl.a
Edf_b <- model$gl.b
EMS_a <- model$Ea
EMS_b <- model$Eb
Again I change Y for the LSD test each time, and from the LSD output I manually copy the results of each of tests 1, 2 and 3:
out1 <- LSD.test(y = GPC.30DAP,
trt = Genotype,
DFerror = Edf_a,
MSerror = EMS_a,
alpha = 0.05,
#p.adj = "bonferroni",
group = TRUE,
console = TRUE)
out2 <- LSD.test(y = GPC.30DAP,
trt = N.level,
DFerror = Edf_b,
MSerror = EMS_b,
alpha = 0.05,
#p.adj = "bonferroni",
group = TRUE,
console = TRUE)
out3 <- LSD.test(y = GPC.30DAP,
trt = Genotype:N.level,
DFerror = Edf_b,
MSerror = EMS_b,
alpha = 0.05,
#p.adj = "bonferroni",
group = TRUE,
console = TRUE)
library(dplyr)
ascend_AB = out3$groups %>%
group_by(rownames(out3$groups)) %>%
arrange(rownames(out3$groups))
print(ascend_AB)
CHANGE VARIABLE NAME AGAIN
MeanSD_AB = data %>%
group_by(Genotype, N.level) %>%
summarise(avg_AB = mean(GPC.30DAP),
sd = sd(GPC.30DAP))
print(MeanSD_AB)
attach(MeanSD_AB)
##------ Plotting "G*N interaction"
library(ggplot2)
## Create plotting object
p3 = ggplot(MeanSD_AB, aes(x = N.level,
y = avg_AB,
fill = factor(Genotype)))
print(p3)
## Plotting bars
plotA = p3 +
geom_bar(stat = "identity",
color = "black",
position = position_dodge(width=0.9))
print(plotA)
## Adding error bars
plotC = plotA +
geom_errorbar(aes(ymax = avg_AB + sd,
ymin = avg_AB - sd),
position = position_dodge(width=0.9),
width = 0.25)
print(plotC)
Changing main title, X and Y labels and legend title
plotD = plotC +
labs(title = "",
x = "",
y = "GPC (%) 30 days after anthesis",
fill = "Genotype")
print(plotD)
## Adding lettering from test applied
plotE = plotD +
geom_text(aes(x = N.level,
y = avg_AB + sd,
label = as.matrix(ascend_AB$groups)),
position = position_dodge(width = 0.9),
vjust = -(0.5))
print(plotE)
The figure file name is changed each time to avoid overwriting:
ggsave("GPC 30 days after anthesis.jpeg", width = 10, height = 10, units = c("cm"), dpi = 300)
There are a few things going on here which I think would be useful to go over in some detail. Before diving in, let’s load the example data directly from pastebin:
library(agricolae)
data <- read.delim("https://pastebin.com/raw/kkjZVxsW")
data$Genotype <- as.factor(data$Genotype)
data$N.level <- as.factor(data$N.level)
data
#> Block Genotype N.level GPC.15DAP GPC.23DAP GPC.30DAP
#> 1 1 1290H Low N 4.9 2.7 2.6
#> 2 2 1290H Low N 4.6 3.3 2.3
#> 3 3 1290H Low N 5.1 2.7 3.7
#> 4 1 1290W Low N 5.0 2.4 1.9
#> 5 2 1290W Low N 4.4 3.2 2.7
#> 6 3 1290W Low N 4.5 3.0 2.4
#> 7 1 1290H High N 8.2 3.6 4.1
#> 8 2 1290H High N 7.6 4.4 3.4
#> 9 3 1290H High N 7.3 4.3 4.5
#> 10 1 1290W High N 5.5 3.9 2.8
#> 11 2 1290W High N 6.1 3.3 2.0
#> 12 3 1290W High N 5.8 3.8 2.4
Looping
You might hope to loop over the variable names along these lines:
for (variable in c("GPC.15DAP", "GPC.30DAP")) {
sp.plot(Block, Genotype, N.level, variable)
}
This won’t work because variable now contains the name of the variable in
a string, rather than the actual value of the variable. Instead, you can
get() the value given a name:
variable <- "GPC.30DAP"
with(data, {
sp.plot(Block, Genotype, N.level, get(variable))
})
#>
#> ANALYSIS SPLIT PLOT: get(variable)
#> Class level information
#>
#> Genotype : 1290H 1290W
#> N.level : Low N High N
#> Block : 1 2 3
#>
#> Number of observations: 12
#>
#> Analysis of Variance Table
#>
#> Response: get(variable)
#> Df Sum Sq Mean Sq F value Pr(>F)
#> Block 2 0.8600 0.4300 NaN NaN
#> Genotype 1 3.4133 3.4133 9.3945 0.09199 .
#> Ea 2 0.7267 0.3633 NaN NaN
#> N.level 1 1.0800 1.0800 5.6348 0.07651 .
#> Genotype:N.level 1 0.8533 0.8533 4.4522 0.10249
#> Eb 4 0.7667 0.1917 NaN NaN
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> cv(a) = 20.8 %, cv(b) = 15.1 %, Mean = 2.9
This gives the correct result, but there’s something not quite right.
The name of the response variable given in the output is less than useful:
get(variable). That happens because sp.plot() captures the code that was
used to pass the arguments, and extracts the name to use for the response variable from that.
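To see the mechanism in isolation, here is a toy function (mine, not part of agricolae) that captures its argument the same way: it deparses whatever expression you wrote, not the value behind it:
capture_name <- function(Y) deparse(substitute(Y))
capture_name(GPC.30DAP)
#> [1] "GPC.30DAP"
variable <- "GPC.30DAP"
capture_name(get(variable))
#> [1] "get(variable)"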
To get the name of the response variable to show up,
we need some metaprogramming to modify our code before
running it. I’ll use a toy example to introduce the
concepts, and then we’ll see how they apply to the problem at hand.
The first component is str2lang(), which takes a string and turns
it into a “language object” – some unevaluated code:
code <- str2lang("1 + 1")
code
#> 1 + 1
Next, bquote() can insert our piece of code into another, using
a special .() syntax:
bquote(2 * .(code))
#> 2 * (1 + 1)
Finally we have eval(), which takes a piece of code and runs it:
eval(bquote(2 * .(code)))
#> [1] 4
Let’s put the pieces together:
first create a language object from the variable name in a string with
str2lang(), then insert that into the sp.plot() code with bquote(),
and finally eval() the result to run the analysis:
variable <- str2lang("GPC.30DAP")
code <- bquote({
sp.plot(Block, Genotype, N.level, .(variable))
})
with(data, eval(code))
#>
#> ANALYSIS SPLIT PLOT: GPC.30DAP
#> Class level information
#>
#> Genotype : 1290H 1290W
#> N.level : Low N High N
#> Block : 1 2 3
#>
#> Number of observations: 12
#>
#> Analysis of Variance Table
#>
#> Response: GPC.30DAP
#> Df Sum Sq Mean Sq F value Pr(>F)
#> Block 2 0.8600 0.4300 NaN NaN
#> Genotype 1 3.4133 3.4133 9.3945 0.09199 .
#> Ea 2 0.7267 0.3633 NaN NaN
#> N.level 1 1.0800 1.0800 5.6348 0.07651 .
#> Genotype:N.level 1 0.8533 0.8533 4.4522 0.10249
#> Eb 4 0.7667 0.1917 NaN NaN
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> cv(a) = 20.8 %, cv(b) = 15.1 %, Mean = 2.9
You could use this directly, but it might be helpful to create a little
helper function to encapsulate the process. Then you won’t have to think
about what’s happening in this fairly complex block of code when you
actually call it:
fit_model <- function(variable) {
variable <- str2lang(variable)
code <- bquote({
sp.plot(Block, Genotype, N.level, .(variable))
})
eval(code, parent.frame())
}
with(data, fit_model("GPC.30DAP"))
Now we have the tools to execute the loop we envisioned in the beginning:
for (variable in c("GPC.15DAP", "GPC.30DAP")) {
with(data, fit_model(variable))
}
#>
#> ANALYSIS SPLIT PLOT: GPC.15DAP
#> Class level information
#>
#> Genotype : 1290H 1290W
#> N.level : Low N High N
#> Block : 1 2 3
#>
#> Number of observations: 12
#>
#> Analysis of Variance Table
#>
#> Response: GPC.15DAP
#> Df Sum Sq Mean Sq F value Pr(>F)
#> Block 2 0.1350 0.0675 NaN NaN
#> Genotype 1 3.4133 3.4133 67.147 0.01457 *
#> Ea 2 0.1017 0.0508 NaN NaN
#> N.level 1 12.0000 12.0000 68.900 0.00115 **
#> Genotype:N.level 1 2.0833 2.0833 11.962 0.02585 *
#> Eb 4 0.6967 0.1742 NaN NaN
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> cv(a) = 3.9 %, cv(b) = 7.3 %, Mean = 5.75
#>
#>
#> ANALYSIS SPLIT PLOT: GPC.30DAP
#> Class level information
#>
#> Genotype : 1290H 1290W
#> N.level : Low N High N
#> Block : 1 2 3
#>
#> Number of observations: 12
#>
#> Analysis of Variance Table
#>
#> Response: GPC.30DAP
#> Df Sum Sq Mean Sq F value Pr(>F)
#> Block 2 0.8600 0.4300 NaN NaN
#> Genotype 1 3.4133 3.4133 9.3945 0.09199 .
#> Ea 2 0.7267 0.3633 NaN NaN
#> N.level 1 1.0800 1.0800 5.6348 0.07651 .
#> Genotype:N.level 1 0.8533 0.8533 4.4522 0.10249
#> Eb 4 0.7667 0.1917 NaN NaN
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> cv(a) = 20.8 %, cv(b) = 15.1 %, Mean = 2.9
Capturing results
Unfortunately sp.plot() both fits the model and prints the results.
This is not ideal: when fitting hundreds of models, you don’t want the output
to overwhelm your console. We need some additional tools to work around that:
# Helper to discard printed output when evaluating `expr`
silence.output <- function(expr) {
sink(nullfile())
on.exit(sink())
expr
}
Now lapply() is a convenient way to loop and collect the results in a list:
variables <- c("GPC.15DAP", "GPC.30DAP")
models <- lapply(variables, \(variable) {
with(data, fit_model(variable)) |> silence.output()
})
Inspecting the result, we see we’ve saved all the relevant components:
str(models, 2)
#> List of 2
#> $ :List of 5
#> ..$ ANOVA:Classes 'anova' and 'data.frame': 6 obs. of 5 variables:
#> .. ..- attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Response: GPC.15DAP"
#> ..$ gl.a : int 2
#> ..$ gl.b : int 4
#> ..$ Ea : num 0.0508
#> ..$ Eb : num 0.174
#> $ :List of 5
#> ..$ ANOVA:Classes 'anova' and 'data.frame': 6 obs. of 5 variables:
#> .. ..- attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Response: GPC.30DAP"
#> ..$ gl.a : int 2
#> ..$ gl.b : int 4
#> ..$ Ea : num 0.363
#> ..$ Eb : num 0.192
The ANOVA table is also accessible, which you can use directly instead of
copying the results manually:
models[[1]]$ANOVA
#> Analysis of Variance Table
#>
#> Response: GPC.15DAP
#> Df Sum Sq Mean Sq F value Pr(>F)
#> Block 2 0.1350 0.0675 NaN NaN
#> Genotype 1 3.4133 3.4133 67.147 0.01457 *
#> Ea 2 0.1017 0.0508 NaN NaN
#> N.level 1 12.0000 12.0000 68.900 0.00115 **
#> Genotype:N.level 1 2.0833 2.0833 11.962 0.02585 *
#> Eb 4 0.6967 0.1742 NaN NaN
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data.frame(
term = rownames(models[[1]]$ANOVA),
pval = models[[1]]$ANOVA$`Pr(>F)`
)
#> term pval
#> 1 Block NaN
#> 2 Genotype 0.014567943
#> 3 Ea NaN
#> 4 N.level 0.001150332
#> 5 Genotype:N.level 0.025851402
#> 6 Eb NaN
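Because these are plain data frames, the same extraction scales over the whole list of fitted models. For example, a small sketch (using the models and variables objects from above) pulling the interaction p-value for every response:
# one row per response variable, indexing each ANOVA table by row name
data.frame(
  variable = variables,
  interaction_pval = sapply(models, \(m) m$ANOVA["Genotype:N.level", "Pr(>F)"])
)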
Let’s give LSD.test() the same treatment. An additional complication here
is that we need to pass more arguments than just the variable name now. To
handle that with ..., we swap bquote() for substitute():
lsd_test <- function(variable, ...) {
variable <- str2lang(variable)
code <- substitute({
LSD.test(variable, ...)
})
eval(code, parent.frame())
}
with(data, lsd_test("GPC.30DAP", Genotype, 2, 0.050833, console = TRUE))
#>
#> Study: GPC.30DAP ~ Genotype
#>
#> LSD t Test for GPC.30DAP
#>
#> Mean Square Error: 0.050833
#>
#> Genotype, means and individual ( 95 %) CI
#>
#> GPC.30DAP std r LCL UCL Min Max
#> 1290H 3.433333 0.8524475 6 3.037298 3.829368 2.3 4.5
#> 1290W 2.366667 0.3614784 6 1.970632 2.762702 1.9 2.8
#>
#> Alpha: 0.05 ; DF Error: 2
#> Critical Value of t: 4.302653
#>
#> least Significant Difference: 0.560078
#>
#> Treatments with the same letter are not significantly different.
#>
#> GPC.30DAP groups
#> 1290H 3.433333 a
#> 1290W 2.366667 b
Now the code to analyse one variable would be:
with(data, {
model <- fit_model("GPC.30DAP") |> silence.output()
test1 <- lsd_test("GPC.30DAP", Genotype, model$gl.a, model$Ea)
test2 <- lsd_test("GPC.30DAP", N.level, model$gl.b, model$Eb)
test3 <- lsd_test("GPC.30DAP", Genotype:N.level, model$gl.b, model$Eb)
})
Again, a function to encapsulate this would be helpful:
analyze_variable <- function(data, variable, ...) {
with(data, {
model <- silence.output(fit_model(variable))
test1 <- lsd_test(variable, Genotype, model$gl.a, model$Ea, ...)
test2 <- lsd_test(variable, N.level, model$gl.b, model$Eb, ...)
test3 <- lsd_test(variable, Genotype:N.level, model$gl.b, model$Eb, ...)
list(variable = variable, model = model, tests = list(test1, test2, test3))
})
}
analysis <- analyze_variable(data, "GPC.30DAP")
str(analysis, 2)
#> List of 3
#> $ variable: chr "GPC.30DAP"
#> $ model :List of 5
#> ..$ ANOVA:Classes 'anova' and 'data.frame': 6 obs. of 5 variables:
#> .. ..- attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Response: GPC.30DAP"
#> ..$ gl.a : int 2
#> ..$ gl.b : int 4
#> ..$ Ea : num 0.363
#> ..$ Eb : num 0.192
#> $ tests :List of 3
#> ..$ :List of 5
#> .. ..- attr(*, "class")= chr "group"
#> ..$ :List of 5
#> .. ..- attr(*, "class")= chr "group"
#> ..$ :List of 5
#> .. ..- attr(*, "class")= chr "group"
Processing results
The analysis object that we return contains all the relevant information
for further processing. We could for example produce a similar interaction
plot to what you had from this result:
library(tidyverse)
plot_analysis <- function(analysis) {
means <- analysis$tests[[3]]$means
groups <- analysis$tests[[3]]$groups
means |>
cbind(groups = groups[, 2]) |>
rename(mean = 1) |>
rownames_to_column("term") |>
separate(term, into = c("Genotype", "N.level"), sep = ":") |>
ggplot(aes(N.level, mean, color = Genotype, group = Genotype)) +
geom_errorbar(
aes(ymin = LCL, ymax = UCL),
position = position_dodge(0.1),
width = 0.1
) +
geom_line(position = position_dodge(0.1)) +
geom_point(position = position_dodge(0.1), size = 3) +
geom_text(
aes(y = UCL, label = groups),
position = position_dodge(0.1),
vjust = -0.5,
show.legend = FALSE
) +
labs(y = analysis$variable, x = NULL)
}
plot_analysis(analysis)
Or we can make a helper to print the result summaries:
print_analysis <- function(analysis) {
cat("Split plot ANOVA of", analysis$variable, "\n")
print(analysis$model$ANOVA)
test_rows <- list()
for (test in analysis$tests) {
row <- cbind(test$parameters, test$statistics)
test_rows <- c(test_rows, list(row))
}
cat("\n")
print(do.call("rbind", test_rows))
invisible(analysis)
}
print_analysis(analysis)
#> Split plot ANOVA of GPC.30DAP
#> Analysis of Variance Table
#>
#> Response: GPC.30DAP
#> Df Sum Sq Mean Sq F value Pr(>F)
#> Block 2 0.8600 0.4300 NaN NaN
#> Genotype 1 3.4133 3.4133 9.3945 0.09199 .
#> Ea 2 0.7267 0.3633 NaN NaN
#> N.level 1 1.0800 1.0800 5.6348 0.07651 .
#> Genotype:N.level 1 0.8533 0.8533 4.4522 0.10249
#> Eb 4 0.7667 0.1917 NaN NaN
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> test p.ajusted name.t ntr alpha MSerror Df Mean CV
#> Fisher-LSD none Genotype 2 0.05 0.3633333 2 2.9 20.78522
#> 1 Fisher-LSD none N.level 2 0.05 0.1916667 4 2.9 15.09647
#> 2 Fisher-LSD none Genotype:N.level 4 0.05 0.1916667 4 2.9 15.09647
#> t.value LSD
#> 4.302653 1.4973671
#> 1 2.776445 0.7017812
#> 2 2.776445 0.9924686
Now you can write one loop to do all the analyses, and then continue working
with the list of results:
# Analyze all but the first 3 variables in data
variables <- tail(names(data), -3)
variables
#> [1] "GPC.15DAP" "GPC.23DAP" "GPC.30DAP"
analyses <- lapply(variables, \(variable) {
analyze_variable(data, variable)
})
For example, we can save the results of the plots and summaries using the
functions we created:
output_dir <- tempdir()
for (analysis in analyses) {
# Save summaries
output_file <- file.path(output_dir, paste0(analysis$variable, ".txt"))
sink(output_file)
print_analysis(analysis)
sink()
# Save plots
output_file <- file.path(output_dir, paste0(analysis$variable, ".jpeg"))
jpeg(output_file, width = 10, height = 10, units = "cm", res = 300)
plot_analysis(analysis) |> print()
dev.off()
}
list.files(output_dir, pattern = "^GPC")
#> [1] "GPC.15DAP.jpeg" "GPC.15DAP.txt" "GPC.23DAP.jpeg" "GPC.23DAP.txt"
#> [5] "GPC.30DAP.jpeg" "GPC.30DAP.txt"

How to add stars for significance level with odds ratio for polr?

My question is at the very bottom of this post.
Here is an example of code that is very similar to the method I use:
url <- "http://peopleanalytics-regression-book.org/data/soccer.csv"
soccer <- read.csv(url)
head(soccer)
## discipline n_yellow_25 n_red_25 position result country level
## 1 None 4 1 S D England 1
## 2 None 2 2 D W England 2
## 3 None 2 1 M D England 1
## 4 None 2 1 M L Germany 1
## 5 None 2 0 S W Germany 1
## 6 None 3 2 M W England 1
str(soccer)
## 'data.frame': 2291 obs. of 7 variables:
## $ discipline : chr "None" "None" "None" "None" ...
## $ n_yellow_25: int 4 2 2 2 2 3 4 3 4 3 ...
## $ n_red_25 : int 1 2 1 1 0 2 2 0 3 3 ...
## $ position : chr "S" "D" "M" "M" ...
## $ result : chr "D" "W" "D" "L" ...
## $ country : chr "England" "England" "England" "Germany" ...
## $ level : int 1 2 1 1 1 1 2 1 1 1 ...
# convert discipline to ordered factor
soccer$discipline <- ordered(soccer$discipline,
levels = c("None", "Yellow", "Red"))
# check conversion
str(soccer)
# apply as.factor to four columns
cats <- c("position", "country", "result", "level")
soccer[ ,cats] <- lapply(soccer[ ,cats], as.factor)
# check again
str(soccer)
# run proportional odds model
library(MASS)
model <- polr(
formula = discipline ~ n_yellow_25 + n_red_25 + position +
country + level + result,
data = soccer
)
# get summary
summary(model)
## Call:
## polr(formula = discipline ~ n_yellow_25 + n_red_25 + position +
## country + level + result, data = soccer)
##
## Coefficients:
## Value Std. Error t value
## n_yellow_25 0.32236 0.03308 9.7456
## n_red_25 0.38324 0.04051 9.4616
## positionM 0.19685 0.11649 1.6899
## positionS -0.68534 0.15011 -4.5655
## countryGermany 0.13297 0.09360 1.4206
## level2 0.09097 0.09355 0.9724
## resultL 0.48303 0.11195 4.3147
## resultW -0.73947 0.12129 -6.0966
##
## Intercepts:
## Value Std. Error t value
## None|Yellow 2.5085 0.1918 13.0770
## Yellow|Red 3.9257 0.2057 19.0834
##
## Residual Deviance: 3444.534
## AIC: 3464.534
# get coefficients (it's in matrix form)
coefficients <- summary(model)$coefficients
# calculate p-values
p_value <- (1 - pnorm(abs(coefficients[ ,"t value"]), 0, 1))*2
# bind back to coefficients
coefficients <- cbind(coefficients, p_value)
# calculate odds ratios
odds_ratio <- exp(coefficients[ ,"Value"])
# combine with coefficient and p_value
(coefficients <- cbind(
coefficients[ ,c("Value", "p_value")],
odds_ratio
))
Doing this I get the following output:
## Value p_value odds_ratio
## n_yellow_25 0.32236030 0.000000e+00 1.3803820
## n_red_25 0.38324333 0.000000e+00 1.4670350
## positionM 0.19684666 9.105456e-02 1.2175573
## positionS -0.68533697 4.982908e-06 0.5039204
## countryGermany 0.13297173 1.554196e-01 1.1422177
## level2 0.09096627 3.308462e-01 1.0952321
## resultL 0.48303227 1.598459e-05 1.6209822
## resultW -0.73947295 1.083595e-09 0.4773654
## None|Yellow 2.50850778 0.000000e+00 12.2865822
## Yellow|Red 3.92572124 0.000000e+00 50.6896241
My question
However, I want stars with the odds ratios. How is this possible with THIS method? If possible, I would also like to add the standard error.
I do not want to use modelsummary() or gtsummary()
How about this:
url <- "http://peopleanalytics-regression-book.org/data/soccer.csv"
soccer <- read.csv(url)
soccer$discipline <- ordered(soccer$discipline,
levels = c("None", "Yellow", "Red"))
cats <- c("position", "country", "result", "level")
soccer[ ,cats] <- lapply(soccer[ ,cats], as.factor)
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked _by_ '.GlobalEnv':
#>
#> cats
model <- polr(
formula = discipline ~ n_yellow_25 + n_red_25 + position +
country + level + result,
data = soccer
)
coefficients <- summary(model)$coefficients
#>
#> Re-fitting to get Hessian
# calculate p-values
p_value <- (1 - pnorm(abs(coefficients[ ,"t value"]), 0, 1))*2
# bind back to coefficients
coefficients <- cbind(coefficients, p_value)
# calculate odds ratios
coefficients <- cbind(coefficients, odds_ratio = exp(coefficients[ ,"Value"]))
# combine with coefficient and p_value
printCoefmat(coefficients[ ,c("Value", "Std. Error", "odds_ratio", "p_value")],
P.values=TRUE,
has.Pvalue=TRUE)
#> Value Std. Error odds_ratio p_value
#> n_yellow_25 0.322360 0.033078 1.3804 < 2.2e-16 ***
#> n_red_25 0.383243 0.040505 1.4670 < 2.2e-16 ***
#> positionM 0.196847 0.116487 1.2176 0.09105 .
#> positionS -0.685337 0.150112 0.5039 4.983e-06 ***
#> countryGermany 0.132972 0.093599 1.1422 0.15542
#> level2 0.090966 0.093547 1.0952 0.33085
#> resultL 0.483032 0.111951 1.6210 1.598e-05 ***
#> resultW -0.739473 0.121293 0.4774 1.084e-09 ***
#> None|Yellow 2.508508 0.191826 12.2866 < 2.2e-16 ***
#> Yellow|Red 3.925721 0.205714 50.6896 < 2.2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Created on 2022-12-06 by the reprex package (v2.0.1)
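If you would rather have the stars as an explicit column (to combine with other output yourself), symnum() from base R generates the same significance codes that printCoefmat() uses. A sketch built on the coefficients matrix from above:
# map p-values to the usual significance symbols
stars <- symnum(coefficients[, "p_value"], corr = FALSE, na = FALSE,
                cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                symbols = c("***", "**", "*", ".", " "))
data.frame(coefficients[, c("Value", "Std. Error", "odds_ratio", "p_value")],
           signif = format(stars), check.names = FALSE)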

Inflection point for binomial mixed GLM model

I'd like to explore some possibilities and compare approaches for calculating the inflection point of a binomial mixed GLM model. I found the inflection package, which uses the Extremum Surface Estimator (ESE) and the Extremum Distance Estimator (EDE). I tried:
library(inflection)
library(dplyr)
library(glmmTMB)
library(DHARMa)
library(ggplot2)
library(ggeffects)
# My binomial data set
binom.ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/mort_binon.csv")
str(binom.ds)
# 'data.frame': 400 obs. of 4 variables:
# $ temp : num 0 0 0 0 0 0 0 0 0 0 ...
# $ days : int 5 5 5 5 5 5 5 5 5 5 ...
# $ rep : chr "r1" "r2" "r3" "r4" ...
# $ mortality: int 0 1 1 1 1 1 1 1 0 1 ...
# Fit a binomial mixed GLM model
m_F <- glmmTMB(mortality ~ temp + days +
(1 | days ), data = binom.ds,
family = "binomial")
# Check the fitted model using DHARMa
plot(s1 <- simulateResiduals(m_F))
# All look likes OK
# Find a inflection point
# for temp
ds_F <- cbind(x=binom.ds$temp,y=exp(predict(m_F)))
ds_F<-as.data.frame(ds_F)
bb=bede(ds_F$x,ds_F$y,0);bb
bb$iplast
# [1] 12.5
# $iters
# n a b EDE
# 1 400 0 25 12.5
# Vizualize the inflection point for temp
ggpredict(m_F, terms = "temp [all]") %>% plot(add.data = TRUE) + geom_vline(xintercept = bb$iplast, colour="red", linetype = "longdash")
#for days
ds_F <- cbind(x=binom.ds$days,y=exp(predict(m_F)))
ds_F<-as.data.frame(ds_F)
bb2=bede(ds_F$x,ds_F$y,0);bb2
bb2$iplast
# [1] 22.5
# $iters
# n a b EDE
# 1 400 5 30 17.5
# 2 221 5 30 17.5
# 3 181 15 5 10.0
# 4 61 15 30 22.5
# Vizualize the inflection point for days
ggpredict(m_F, terms = "days [all]") %>% plot(add.data = TRUE) + geom_vline(xintercept = bb2$iplast, colour="red", linetype = "longdash")
My question: are there other approaches/packages for this calculation?
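One closed-form point of comparison (my suggestion, not part of the inflection package): for a logistic model, the fitted probability curve has its inflection where the linear predictor crosses 0, i.e. p = 0.5. A hedged sketch for temp, fixing days at its mean (an arbitrary but common choice):
# fixed effects of a glmmTMB fit live in the $cond component
fe <- fixef(m_F)$cond
ip_temp <- -(fe["(Intercept)"] + fe["days"] * mean(binom.ds$days)) / fe["temp"]
unname(ip_temp)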

R Flexsurv and time-dependent covariates

I read that the R flexsurv package can also be used for modeling time-dependent covariates, according to Christopher Jackson (2016), "flexsurv: A Platform for Parametric Survival Modeling in R", Journal of Statistical Software, 70(8).
However, I was not able to figure out how, even after several adjustments and searches in online forums.
Before turning to the estimation of time-dependent covariates I tried to create a simple model with only time-independent covariates to test whether I specified the Surv object correctly. Here is a small example.
library(splitstackshape)
library(flexsurv)
## create sample data
n=50
set.seed(2)
t <- rpois(n,15)+1
x <- rnorm(n,t,5)
df <- data.frame(t,x)
df$id <- 1:n
df$rep <- df$t-1
Which looks like this:
t x id rep
1 12 17.696149 1 11
2 12 20.358094 2 11
3 11 2.058789 3 10
4 16 26.156213 4 15
5 13 9.484278 5 12
6 15 15.790824 6 14
...
And the long data:
long.df <- expandRows(df, "rep")
rep.vec<-c()
for(i in 1:n){
rep.vec <- c(rep.vec,1:(df[i,"t"]-1))
}
long.df$start <- rep.vec
long.df$stop <- rep.vec +1
long.df$censrec <- 0
long.df$censrec<-ifelse(long.df$stop==long.df$t,1,long.df$censrec)
Which looks like this:
t x id start stop censrec
1 12 17.69615 1 1 2 0
1.1 12 17.69615 1 2 3 0
1.2 12 17.69615 1 3 4 0
1.3 12 17.69615 1 4 5 0
1.4 12 17.69615 1 5 6 0
1.5 12 17.69615 1 6 7 0
1.6 12 17.69615 1 7 8 0
1.7 12 17.69615 1 8 9 0
1.8 12 17.69615 1 9 10 0
1.9 12 17.69615 1 10 11 0
1.10 12 17.69615 1 11 12 1
2 12 20.35809 2 1 2 0
...
Now I can estimate a simple Cox model to see whether it works:
coxph(Surv(t)~x,data=df)
This yields:
coef exp(coef) se(coef) z p
x -0.0588 0.9429 0.0260 -2.26 0.024
And in the long format:
coxph(Surv(start,stop,censrec)~x,data=long.df)
I get:
coef exp(coef) se(coef) z p
x -0.0588 0.9429 0.0260 -2.26 0.024
Taken together I conclude that my transformation into the long format was correct. Now, turning to the flexsurv framework:
flexsurvreg(Surv(time=t)~x,data=df, dist="weibull")
yields:
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 5.00086 4.05569 6.16631 0.53452 NA NA NA
scale NA 13.17215 11.27876 15.38338 1.04293 NA NA NA
x 15.13380 0.01522 0.00567 0.02477 0.00487 1.01534 1.00569 1.02508
But
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull")
causes an error:
Error in flexsurvreg(Surv(start, stop, censrec) ~ x, data = long.df, dist = "weibull") :
Initial value for parameter 1 out of range
Would anyone happen to know the correct syntax for the latter Surv object? If you use the correct syntax, do you get the same estimates?
Thank you very much,
best,
David
===============
EDIT AFTER FEEDBACK FROM 42
===============
library(splitstackshape)
library(flexsurv)
x<-c(8.136527, 7.626712, 9.809122, 12.125973, 12.031536, 11.238394, 4.208863, 8.809854, 9.723636)
t<-c(2, 3, 13, 5, 7, 37 ,37, 9, 4)
df <- data.frame(t,x)
#transform into long format for time-dependent covariates
df$id <- 1:length(df$t)
df$rep <- df$t-1
long.df <- expandRows(df, "rep")
rep.vec<-c()
for(i in 1:length(df$t)){
rep.vec <- c(rep.vec,1:(df[i,"t"]-1))
}
long.df$start <- rep.vec
long.df$stop <- rep.vec +1
long.df$censrec <- 0
long.df$censrec<-ifelse(long.df$stop==long.df$t,1,long.df$censrec)
coxph(Surv(t)~x,data=df)
coxph(Surv(start,stop,censrec)~x,data=long.df)
flexsurvreg(Surv(time=t)~x,data=df, dist="weibull")
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull",inits=c(shape=.1, scale=1))
Which yields the same estimates for both coxph models but
Call:
flexsurvreg(formula = Surv(time = t) ~ x, data = df, dist = "weibull")
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 1.0783 0.6608 1.7594 0.2694 NA NA NA
scale NA 27.7731 3.5548 216.9901 29.1309 NA NA NA
x 9.3012 -0.0813 -0.2922 0.1295 0.1076 0.9219 0.7466 1.1383
N = 9, Events: 9, Censored: 0
Total time at risk: 117
Log-likelihood = -31.77307, df = 3
AIC = 69.54614
and
Call:
flexsurvreg(formula = Surv(start, stop, censrec) ~ x, data = long.df,
dist = "weibull", inits = c(shape = 0.1, scale = 1))
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 0.8660 0.4054 1.8498 0.3353 NA NA NA
scale NA 24.0596 1.7628 328.3853 32.0840 NA NA NA
x 8.4958 -0.0912 -0.3563 0.1739 0.1353 0.9128 0.7003 1.1899
N = 108, Events: 9, Censored: 99
Total time at risk: 108
Log-likelihood = -30.97986, df = 3
AIC = 67.95973
Reading the error message:
Error in flexsurvreg(Surv(start, stop, censrec) ~ x, data = long.df, dist = "weibull", :
initial values must be a numeric vector
And then reading the help page, ?flexsurvreg, it seemed as though an attempt at setting values for inits to a named numeric vector should be attempted:
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull", inits=c(shape=.1, scale=1))
Call:
flexsurvreg(formula = Surv(start, stop, censrec) ~ x, data = long.df,
dist = "weibull", inits = c(shape = 0.1, scale = 1))
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 5.00082 4.05560 6.16633 0.53454 NA NA NA
scale NA 13.17213 11.27871 15.38341 1.04294 NA NA NA
x 15.66145 0.01522 0.00567 0.02477 0.00487 1.01534 1.00569 1.02508
N = 715, Events: 50, Censored: 665
Total time at risk: 715
Log-likelihood = -131.5721, df = 3
AIC = 269.1443
Extremely similar results. My guess was basically a stab in the dark, so I have no guidance on how to make a choice if this had not succeeded other than to "expand the search."
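Had the stab in the dark not worked, one less arbitrary way to seed the search (my suggestion, not from the original exchange) is to fit the time-fixed model first and reuse its baseline estimates as inits:
# the $res matrix of a flexsurvreg fit holds the estimates by row name
fit0 <- flexsurvreg(Surv(time = t) ~ x, data = df, dist = "weibull")
init_vals <- fit0$res[c("shape", "scale"), "est"]
flexsurvreg(Surv(start, stop, censrec) ~ x, data = long.df,
            dist = "weibull", inits = init_vals)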
I just want to mention that in flexsurv v1.1.1, running this code:
flexsurvreg(Surv(start,stop,censrec) ~ x ,data=long.df, dist="weibull")
doesn't return any errors. It also gives the same estimates as the non-time-varying command
flexsurvreg(Surv(time=t)~x,data=df, dist="weibull")

Adding Different Percentiles in boxplots in R

I am fairly new to R and recently used it to make some boxplots. I also added the mean and standard deviation to my boxplot. I was wondering if I could add some kind of tick mark or circle at different percentiles as well. Let's say I want to mark the 85th and 90th percentiles in each HOUR boxplot; is there a way to do this? My data consist of a year's worth of loads in MW for each hour, and my output consists of 24 boxplots, one per hour, for each month. I am doing each month at a time because I am not sure if there is a way to run all 96 (each month, weekday/weekend, for 4 different zones) boxplots at once. Thanks in advance for the help.
JANWD <-read.csv("C:\\My Directory\\MWBox2.csv")
JANWD.df<-data.frame(JANWD)
JANWD.sub <-subset(JANWD.df, MONTH < 2 & weekend == "NO")
KeepCols <-c("Hour" , "Houston_Load")
HWD <- JANWD.sub[ ,KeepCols]
sd <-tapply(HWD$Houston_Load, HWD$Hour, sd)
means <-tapply(HWD$Houston_Load, HWD$Hour, mean)
boxplot(Houston_Load ~ Hour, data=HWD, xlab="WEEKDAY HOURS", ylab="MW Differnce", ylim= c(-10, 20), smooth=TRUE ,col ="bisque", range=0)
points(sd, pch = 22, col= "blue")
points(means, pch=23, col ="red")
#Output of the subset of data used to run boxplot for month january in Houston
str(HWD)
'data.frame': 504 obs. of 2 variables:
 $ Hour        : int 1 2 3 4 5 6 7 8 9 10 ...
 $ Houston_Load: num 1.922 2.747 -2.389 0.515 1.922 ...
#OUTPUT of the original data
str(JANWD)
'data.frame': 8783 obs. of 9 variables:
$ Date : Factor w/ 366 levels "1/1/2012","1/10/2012",..: 306 306 306 306 306 306 306 306 306 306 ...
 $ Hour        : int 1 2 3 4 5 6 7 8 9 10 ...
 $ MONTH       : int 8 8 8 8 8 8 8 8 8 8 ...
 $ weekend     : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
 $ TOTAL_LOAD  : num 0.607 5.111 6.252 7.607 0.607 ...
 $ Houston_Load: num -2.389 0.515 1.922 2.747 -2.389 ...
 $ North_Load  : num 2.95 4.14 3.55 3.91 2.95 ...
 $ South_Load  : num -0.108 0.267 0.54 0.638 -0.108 ...
 $ West_Load   : num 0.154 0.193 0.236 0.311 0.154 ...
Here is one way, using quantile() to compute the relevant percentiles for you. I add the marks using rug().
set.seed(1)
X <- rnorm(200)
boxplot(X, yaxt = "n")
## compute the required quantiles
qntl <- quantile(X, probs = c(0.85, 0.90))
## add them as a rug plot on the left-hand side
rug(qntl, side = 2, col = "blue", lwd = 2)
## add the box and axes
axis(2)
box()
Update: In response to the OP providing str() output, here is an example similar to the data that the OP has to hand:
set.seed(1) ## make reproducible
HWD <- data.frame(Hour = rep(0:23, 10),
Houston_Load = rnorm(24*10))
Now, I presume you want ticks at the 85th and 90th percentiles for each Hour? If so, we need to split the data by Hour and compute via quantile() as I showed earlier:
quants <- sapply(split(HWD$Houston_Load, list(HWD$Hour)),
quantile, probs = c(0.85, 0.9))
which gives:
R> quants
0 1 2 3 4 5 6
85% 0.3576510 0.8633506 1.581443 0.2264709 0.4164411 0.2864026 1.053742
90% 0.6116363 0.9273008 2.109248 0.4218297 0.5554147 0.4474140 1.366114
7 8 9 10 11 12 13 14
85% 0.5352211 0.5175485 1.790593 1.394988 0.7280584 0.8578999 1.437778 1.087101
90% 0.8625322 0.5969672 1.830352 1.519262 0.9399476 1.1401877 1.763725 1.102516
15 16 17 18 19 20 21
85% 0.6855288 0.4874499 0.5493679 0.9754414 1.095362 0.7936225 1.824002
90% 0.8737872 0.6121487 0.6078405 1.0990935 1.233637 0.9431199 2.175961
22 23
85% 1.058648 0.6950166
90% 1.145783 0.8436541
Now we can draw marks at the x locations of the boxplots
boxplot(Houston_Load ~ Hour, data = HWD, axes = FALSE)
xlocs <- 1:24 ## where to draw marks
tickl <- 0.15 ## length of marks used
for(i in seq_len(ncol(quants))) {
  segments(x0 = rep(xlocs[i] - tickl, 2), y0 = quants[, i],
           x1 = rep(xlocs[i] + tickl, 2), y1 = quants[, i],
           col = c("red", "blue"), lwd = 2)
}
title(xlab = "Hour", ylab = "Houston Load")
axis(1, at = xlocs, labels = xlocs - 1)
axis(2)
box()
legend("bottomleft", legend = paste(c("0.85", "0.90"), "quantile"),
bty = "n", lty = "solid", lwd = 2, col = c("red", "blue"))
The resulting figure shows the 24 hourly boxplots with red and blue tick marks at the 85th and 90th percentiles.
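As for producing all 96 plots at once, a hedged sketch: loop over month, the weekend flag, and the zone columns (names taken from your str(JANWD) output), writing one file per combination.
zones <- c("Houston_Load", "North_Load", "South_Load", "West_Load")
for (m in 1:12) {
  for (we in c("NO", "YES")) {
    for (z in zones) {
      sub <- subset(JANWD.df, MONTH == m & weekend == we)
      if (nrow(sub) == 0) next  # skip empty month/weekend combinations
      jpeg(sprintf("box_month%02d_weekend%s_%s.jpeg", m, we, z))
      boxplot(sub[[z]] ~ sub$Hour, xlab = "Hour", ylab = z,
              main = sprintf("Month %d, weekend = %s", m, we))
      dev.off()
    }
  }
}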
