How does R calculate the PCA ellipses? - r

How does R know where to place the confidence ellipse for a PCA plot?
I have a minimal code using the iris dataset:
library(factoextra)
# Keep only the numeric measurements (column 5 of iris is dropped)
a <- data.matrix(iris[-5])
# PCA on centred, unit-variance variables
b <- prcomp(a, center = TRUE, scale. = TRUE)
# Individuals plot, coloured by species, with one ellipse per group
fviz_pca_ind(
  b,
  addEllipses = TRUE,
  col.ind = iris$Species
)
I know that I can find the plot coordinate with b$x. I also know that I can find the cluster centers with b$center. How do I re-derive the ellipses from the data?

If you are talking about the how, it eventually calls ggplot2::stat_ellipse.
If you want the coordinates, like with other ggplot objects, you can extract the data with ggplot_build
library(factoextra)
# PCA of the numeric iris columns, centred and scaled
a <- data.matrix(iris[-5])
b <- prcomp(a, center = TRUE, scale. = TRUE)
p <- fviz_pca_ind(
  b,
  addEllipses = TRUE,
  col.ind = iris$Species
)
# Build the plot and take the computed data of its second layer, which in
# this plot holds the ellipse outline (see the x/y/group columns below)
built <- ggplot2::ggplot_build(p)
ell <- built$data[[2]]
head(ell)
# colour fill x y group PANEL size linetype alpha
# 1 #F8766D #F8766D -1.697756 -0.06395559 1 1 0.5 1 0.1
# 2 #F8766D #F8766D -1.701694 0.22197334 1 1 0.5 1 0.1
# 3 #F8766D #F8766D -1.713449 0.50017215 1 1 0.5 1 0.1
# 4 #F8766D #F8766D -1.732842 0.76642364 1 1 0.5 1 0.1
# 5 #F8766D #F8766D -1.759579 1.01669171 1 1 0.5 1 0.1
# 6 #F8766D #F8766D -1.793255 1.24718254 1 1 0.5 1 0.1
# Overlay the extracted coordinates as points to confirm they trace the ellipses
p + geom_point(data = ell, aes(x, y, color = factor(group)), size = 4)

If you trace the code all the way through, you find that the ellipses are simply geom_polygons created with stat = "ellipse", i.e. they are calculated by stat_ellipse in ggplot.
We can show this by recreating the plot using only base R and ggplot. The following is a fully reproducible example:
library(ggplot2)
# PCA on the centred, unit-variance iris measurements
b <- prcomp(iris[-5], center = TRUE, scale. = TRUE)
# Scores on the first two components plus the species label used for grouping
# (b$x is what predict(b) returns when no new data is supplied)
df <- data.frame(b$x[, 1:2], Species = iris$Species)
ggplot(df, aes(x = PC1, y = PC2, colour = Species)) +
  geom_point() +
  geom_polygon(aes(fill = Species), stat = "ellipse", alpha = 0.3) +
  theme_bw()
Ultimately, stat_ellipse calculates its ellipses with a method adapted from car::dataEllipse, so if you want the raw co-ordinates of the ellipses, you can do:
e <- car::dataEllipse(df$PC1, df$PC2, df$Species)
and obtain the 95th centile normal data ellipse co-ordinates like this:
e$setosa$`0.95`
#> x y
#> [1,] -2.167825 2.06328716
#> [2,] -2.104642 2.04546589
#> [3,] -2.043166 1.99227221
#> [4,] -1.984331 1.90451250
#> [5,] -1.929028 1.78351710
#> [6,] -1.878095 1.63112017
#> [7,] -1.832305 1.44963190
#> [8,] -1.792351 1.24180347
#> [9,] -1.758839 1.01078534
#> [10,] -1.732278 0.76007952
#> [11,] -1.713069 0.49348644
#> [12,] -1.701504 0.21504739
#> [13,] -1.697759 -0.07101678
#> [14,] -1.701889 -0.36036963
#> [15,] -1.713833 -0.64862486
#> [16,] -1.733410 -0.93141283
#> [17,] -1.760322 -1.20444675
#> [18,] -1.794162 -1.46358770
#> [19,] -1.834417 -1.70490738
#> [20,] -1.880476 -1.92474763
#> [21,] -1.931641 -2.11977588
#> [22,] -1.987137 -2.28703571
#> [23,] -2.046123 -2.42399164
#> [24,] -2.107703 -2.52856754
#> [25,] -2.170946 -2.59917816
#> [26,] -2.234892 -2.63475311
#> [27,] -2.298571 -2.63475311
#> [28,] -2.361018 -2.59917816
#> [29,] -2.421288 -2.52856754
#> [30,] -2.478465 -2.42399164
#> [31,] -2.531684 -2.28703571
#> [32,] -2.580138 -2.11977588
#> [33,] -2.623091 -1.92474763
#> [34,] -2.659894 -1.70490738
#> [35,] -2.689988 -1.46358770
#> [36,] -2.712917 -1.20444675
#> [37,] -2.728333 -0.93141283
#> [38,] -2.736002 -0.64862486
#> [39,] -2.735809 -0.36036963
#> [40,] -2.727757 -0.07101678
#> [41,] -2.711966 0.21504739
#> [42,] -2.688678 0.49348644
#> [43,] -2.658244 0.76007952
#> [44,] -2.621126 1.01078534
#> [45,] -2.577888 1.24180347
#> [46,] -2.529183 1.44963190
#> [47,] -2.475751 1.63112017
#> [48,] -2.418401 1.78351710
#> [49,] -2.358004 1.90451250
#> [50,] -2.295473 1.99227221
#> [51,] -2.231758 2.04546589
#> [52,] -2.167825 2.06328716
Created on 2021-11-05 by the reprex package (v2.0.0)

Related

How to obtain some points located around a points with GPS coordinates in R?

I am quite new to cartography operations with R. I would like to know if there is a way to find some points located around a given GPS point.
Imagine that I have a data.table as follows:
Reference_Coordinates <- data.table(Latitude=c(1,2,3), Longitude=c(4,5,6), Point_ID=c("Point1","Point2","Point3")).
If I assume that each point is the centre of a circle, how could I get some points located at equal distance from the origin? For instance, 10 points forming a 1-km radius circle around point 1, then another 10 points around point 2, et cetera? I wanted to use a WGS 84 standard ellipsoid.
Thank you very much.
I've read some answers in this forum and I suppose I could use some packages such as sf or sp, but I couldn't find the solution to my question.
Using the R package terra, you can create 3 points from the lat / lon co-ordinates like this:
library(terra)
# Coordinates are passed as (x, y), i.e. longitude first, then latitude
lon_lat <- cbind(
  Reference_Coordinates$Longitude,
  Reference_Coordinates$Latitude
)
v <- vect(lon_lat, type = "points", crs = "WGS84")
To get a collection of points 10km from each starting point, you can do:
p <- as.points(buffer(v, 10000, quadsegs = 3))
Which looks like this:
plot(p, xlab = 'longitude', ylab = 'latitude')
To get the co-ordinates of each of these points, you can simply do
crds(p)
#> x y
#> [1,] 4.000000 1.0904366
#> [2,] 4.044924 1.0783201
#> [3,] 4.077809 1.0452174
#> [4,] 4.089845 0.9999988
#> [5,] 4.077807 0.9547807
#> [6,] 4.044922 0.9216792
#> [7,] 4.000000 0.9095633
#> [8,] 3.955078 0.9216792
#> [9,] 3.922193 0.9547807
#> [10,] 3.910155 0.9999988
#> [11,] 3.922191 1.0452174
#> [12,] 3.955076 1.0783201
#> [13,] 5.000000 2.0904358
#> [14,] 5.044945 2.0783191
#> [15,] 5.077846 2.0452160
#> [16,] 5.089886 1.9999975
#> [17,] 5.077841 1.9547802
#> [18,] 5.044941 1.9216796
#> [19,] 5.000000 1.9095641
#> [20,] 4.955059 1.9216796
#> [21,] 4.922159 1.9547802
#> [22,] 4.910114 1.9999975
#> [23,] 4.922154 2.0452160
#> [24,] 4.955055 2.0783191
#> [25,] 6.000000 3.0904344
#> [26,] 6.044980 3.0783175
#> [27,] 6.077906 3.0452144
#> [28,] 6.089954 2.9999963
#> [29,] 6.077899 2.9547800
#> [30,] 6.044974 2.9216805
#> [31,] 6.000000 2.9095655
#> [32,] 5.955026 2.9216805
#> [33,] 5.922101 2.9547800
#> [34,] 5.910046 2.9999963
#> [35,] 5.922094 3.0452144
#> [36,] 5.955020 3.0783175
Created on 2023-01-31 with reprex v2.0.2

Local Polynomial Regression: Reading nearest neighbor smoothing constants from R code

I'm studying from a textbook on data mining and I can't figure out how the author reads the nn values from the gcv output. The code and output are below:
## cv
# Candidate nearest-neighbour fractions: 0.20, 0.21, ..., 1.00
alpha <- seq(0.20, 1, by = 0.01)
n1 <- length(alpha)
# One row per candidate value; row k is filled with the result of gcv()
# for nn = alpha[k]
g <- matrix(nrow = n1, ncol = 4)
for (k in seq_len(n1)) {
  g[k, ] <- gcv(NOx ~ lp(EquivRatio, nn = alpha[k]), data = ethanol)
}
g
the csv file is here:
https://github.com/jgscott/ECO395M/blob/master/data/ethanol.csv
I'm using the locfit library in R.
How do you find the nn values from the given output?
The nn values are not read from the output - they are given in the input. In the loop, nn is assigned as the kth value of the object alpha.
Let's look at the output of the first 16 rows of g, which is the same as the picture you included in your question:
g[1:16,]
#> [,1] [,2] [,3] [,4]
#> [1,] -3.220084 18.81266 16.426487 0.1183932
#> [2,] -3.249601 17.61614 15.436227 0.1154507
#> [3,] -3.319650 16.77004 14.752039 0.1151542
#> [4,] -3.336464 15.44404 13.889209 0.1115457
#> [5,] -3.373011 14.52391 13.115430 0.1099609
#> [6,] -3.408908 13.96789 12.634934 0.1094681
#> [7,] -3.408908 13.96789 12.634934 0.1094681
#> [8,] -3.469254 12.99316 11.830996 0.1085293
#> [9,] -3.504310 12.38808 11.283837 0.1078784
#> [10,] -3.529167 11.93838 10.928859 0.1073628
#> [11,] -3.546728 11.46960 10.516520 0.1065792
#> [12,] -3.552238 11.26372 10.322329 0.1061728
#> [13,] -3.576083 11.03575 10.135243 0.1062533
#> [14,] -3.679128 10.54096 9.662613 0.1079229
#> [15,] -3.679128 10.54096 9.662613 0.1079229
#> [16,] -3.699044 10.46534 9.578396 0.1082955
Note that rows 11, 12 and 13 were created inside your loop using alpha[11], alpha[12] and alpha[13]. These values were passed to the nn argument of lp. If you want the nn values included in your table, all you need to do is:
cbind(g, nn = alpha)
#> nn
#> [1,] -3.220084 18.812657 16.426487 0.1183932 0.20
#> [2,] -3.249601 17.616143 15.436227 0.1154507 0.21
#> [3,] -3.319650 16.770041 14.752039 0.1151542 0.22
#> [4,] -3.336464 15.444040 13.889209 0.1115457 0.23
#> [5,] -3.373011 14.523910 13.115430 0.1099609 0.24
#> [6,] -3.408908 13.967891 12.634934 0.1094681 0.25
#> [7,] -3.408908 13.967891 12.634934 0.1094681 0.26
#> [8,] -3.469254 12.993165 11.830996 0.1085293 0.27
#> [9,] -3.504310 12.388077 11.283837 0.1078784 0.28
#> [10,] -3.529167 11.938379 10.928859 0.1073628 0.29
#> [11,] -3.546728 11.469598 10.516520 0.1065792 0.30
#> [12,] -3.552238 11.263716 10.322329 0.1061728 0.31
#> [13,] -3.576083 11.035752 10.135243 0.1062533 0.32
#> [14,] -3.679128 10.540964 9.662613 0.1079229 0.33
#> [15,] -3.679128 10.540964 9.662613 0.1079229 0.34
#> [16,] -3.699044 10.465337 9.578396 0.1082955 0.35

How to generate random numbers with normal distribution and uniform distribution

I am a newbie in R. Now, I want to create a matrix, and then extract 20 random Numbers from each of these three uniform distributions: U(0.6,0.8), U(0.0001,0.0003), U(100,110), and place them in the first three columns of the matrix, with each column corresponding to a uniform distribution. Then 20 random Numbers are extracted from each of the two normal distributions: N(7750,0.01), N(12,0.4), and placed in the last two columns of the matrix. My program is as follows, but can only achieve uniform distribution of random numbers, cannot achieve the first three columns are uniform distribution, the last two columns are the normal distribution of random numbers, How can I change it?
input <- 5   # number of variables (columns of the result)
xinput <- 20 # number of samples per variable (rows of the result)
# One row per variable: lower and upper bound of its uniform distribution.
# There must be no trailing comma after the last pair: c() rejects an empty
# argument ("argument 11 is empty").
range <- matrix(
  c(
    0.60, 0.80,
    0.0001, 0.0003,
    100, 110,
    7700, 8000,
    10, 15
  ),
  nrow = input, ncol = 2, byrow = TRUE
)
range
# Result matrix, filled column by column in the loop below
rangeresult <- matrix(0, nrow = xinput, ncol = input)
rangeresult
## uniform distribution
for (i in seq_len(input)) {
  set.seed(456 + i) # make results reproducible
  rangeresult[, i] <- runif(xinput, range[i, 1], range[i, 2])
}
Perhaps try this
cbind(
u1 = runif(20L, 0.6, 0.8),
u2 = runif(20L, 0.0001, 0.0003),
u3 = runif(20L, 100, 110),
n1 = rnorm(20L, 7750, 0.01),
n2 = rnorm(20L, 12, 0.4)
)
Output
u1 u2 u3 n1 n2
[1,] 0.7558480 0.0002851074 101.7209 7749.988 11.75270
[2,] 0.7807589 0.0002600877 104.9278 7749.998 11.67970
[3,] 0.7480385 0.0001562960 109.5744 7749.979 11.84603
[4,] 0.6283492 0.0001408027 108.9455 7749.999 12.00459
[5,] 0.7666862 0.0002485003 106.4735 7750.002 12.58783
[6,] 0.6354397 0.0001042544 107.0999 7749.982 12.36555
[7,] 0.7340912 0.0002507386 109.7052 7749.994 11.75111
[8,] 0.7220797 0.0001173221 105.7116 7749.995 11.35322
[9,] 0.6956138 0.0001478050 104.6444 7750.004 11.68879
[10,] 0.6146491 0.0001238944 108.5946 7750.006 12.78417
[11,] 0.7436676 0.0002492057 107.6073 7750.003 11.80814
[12,] 0.7916866 0.0001927277 100.1949 7750.016 12.16362
[13,] 0.7701075 0.0002236796 103.9207 7750.007 11.82555
[14,] 0.7151522 0.0001528767 101.0997 7749.996 11.75938
[15,] 0.6866158 0.0002872521 100.7036 7750.018 11.36261
[16,] 0.6106267 0.0001278512 105.8946 7749.986 11.81682
[17,] 0.6537794 0.0002875799 104.2015 7750.007 11.56224
[18,] 0.6095022 0.0001534366 108.9352 7749.993 12.22691
[19,] 0.7156714 0.0001303851 107.7274 7749.995 12.01923
[20,] 0.6397735 0.0002706792 109.6200 7749.986 12.01927
matrix(
c(runif(20, .6, .8),
runif(20, .0001, .0003),
runif(20, 100, 110),
rnorm(20, 7750, .01),
rnorm(20, 12, .4)),
ncol=5)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.6303004 0.0002700728 102.6577 7750.008 12.10271
#> [2,] 0.7611678 0.0001594420 106.2736 7750.001 11.95071
#> [3,] 0.7217263 0.0002726162 105.9933 7749.993 12.16880
#> [4,] 0.7873636 0.0001409666 109.9674 7750.016 11.58212
#> [5,] 0.7329912 0.0002504620 105.8886 7750.005 11.62768
#> [6,] 0.6775068 0.0002546660 109.9630 7750.000 11.75542
#> [7,] 0.6927353 0.0001217041 105.5130 7750.004 12.46987
#> [8,] 0.7889347 0.0001849753 105.8204 7750.002 11.96011
#> [9,] 0.7555766 0.0001712631 104.6053 7750.013 12.77534
#> [10,] 0.6225500 0.0001441519 101.4559 7750.011 11.62323
#> [11,] 0.6004412 0.0002862156 100.7426 7750.015 12.34398
#> [12,] 0.7896445 0.0001871342 103.5566 7750.002 11.18040
#> [13,] 0.7995510 0.0002998966 101.2008 7750.005 11.79095
#> [14,] 0.7271423 0.0001385434 108.3129 7750.006 11.85577
#> [15,] 0.7990341 0.0001868429 102.3255 7749.974 12.00426
#> [16,] 0.7711383 0.0001362412 108.1071 7749.995 11.62242
#> [17,] 0.7168780 0.0001821163 103.0949 7750.021 12.35856
#> [18,] 0.7197489 0.0002015831 109.4623 7749.981 11.46613
#> [19,] 0.7006335 0.0001257633 100.9744 7750.001 12.03066
#> [20,] 0.7503335 0.0002953110 102.1582 7749.989 12.54394

r igraph - how does plot() read the layout matrix?

My question is related to this one here, which unfortunately has not been answered. I'm trying to automatically annotate text next to highlighted communities on a plot. An intermediate step is to understand how nodes are placed on a plot.
G <- make_graph('zachary')
l <- layout_with_fr(G)
l
A layout is a matrix with rows representing nodes and columns representing the x and y plot parameters.
[,1] [,2]
[1,] 2.8510654 -2.2404898
[2,] 2.7183497 -1.1815130
[3,] 3.1429205 0.1117099
[4,] 1.5585372 -1.0743325
[5,] 2.2808632 -4.2035479
[6,] 2.1698198 -5.0526766
[7,] 1.4938068 -4.6975884
[8,] 1.9710816 -1.4672218
[9,] 3.5407035 0.5407852
[10,] 2.2222909 1.9079805
[11,] 3.0784642 -4.5828448
[12,] 4.4115351 -4.1057462
[13,] 0.6002378 -2.2432049
[14,] 2.5010525 -0.1563341
[15,] 4.8914673 4.1417759
[16,] 3.2053338 3.9212694
[17,] 1.1825200 -6.4099021
[18,] 3.7155897 -2.8354432
[19,] 3.8272351 4.2660906
[20,] 3.8636487 -0.5671906
[21,] 2.7302411 3.3998888
[22,] 1.6084374 -2.7407388
[23,] 4.3432855 3.8101278
[24,] 5.9392042 2.2364929
[25,] 6.9980077 0.2389222
[26,] 7.1608499 1.1360134
[27,] 6.0171481 4.0279067
[28,] 5.4996627 1.0367163
[29,] 4.4961257 0.9434659
[30,] 5.5987563 3.2314488
[31,] 2.9958404 1.2022317
[32,] 5.1188900 0.2919268
[33,] 4.1088296 2.5032294
[34,] 4.1686534 2.1339884
But the x, y coordinates of the plot go from -1 to 1, unlike the min-max coordinates in the layout matrix. So how is plot(G, layout = l) reading the layout matrix?
According to the source, the plot method for objects of class igraph simply rescales the matrix from -1 to 1.
library(igraph)
set.seed(3)
l <- layout_with_fr(G)
[,1] [,2]
[1,] -2.283 0.658
[2,] -1.289 -0.108
[3,] 0.146 1.012
[4,] -1.523 1.601
#... with 30 more rows.
plot(G,layout = l)
# Per-column minimum (row 1) and maximum (row 2) of the layout matrix
col_range <- apply(l, 2, range)
mid_point <- (col_range[2, ] + col_range[1, ]) / 2
half_span <- (col_range[2, ] - col_range[1, ]) / 2
# Centre each column on its midpoint and divide by half its span,
# which maps every column onto [-1, 1]
ll <- scale(l, center = mid_point, scale = half_span)
ll
[,1] [,2]
[1,] -0.2422 -0.1051
[2,] -0.0704 -0.3821
[3,] 0.1775 0.0228
[4,] -0.1108 0.2357
#... with 30 more rows.
plot(G,layout = ll)
Note that the actual rescaling is performed with igraph::norm_coords:
igraph::norm_coords(l)
[,1] [,2]
[1,] -0.2422 -0.1051
[2,] -0.0704 -0.3821
[3,] 0.1775 0.0228
[4,] -0.1108 0.2357
#... with 30 more rows.

How plot multiple ggplots curves on the same Graph

I'm trying to plot multiple curves on the same graph with ggplot2. I'm using a function "draw.data" with some parameters, so that I can produce multiple plots. My problem is that the first call gives me a plot, but when I call the function a second time I get an error. I want the curves to be drawn on the same plot.
Here is my code:
# f.probit
# Probit curve: the standard normal CDF of the linear predictor
# beta1 + beta2 * x, rescaled so that it runs between minv and maxv.
f.probit <- function(x, beta1, beta2, minv, maxv) {
  eta <- beta1 + beta2 * x
  span <- maxv - minv
  pnorm(eta) * span + minv
}
# f.logit
# Logistic curve: 1 / (1 + exp(-(beta1 + beta2 * x))), rescaled so that it
# runs between minv and maxv.
f.logit <- function(x, beta1, beta2, minv, maxv) {
  eta <- beta1 + beta2 * x
  sigmoid <- 1 / (1 + exp(-eta))
  sigmoid * (maxv - minv) + minv
}
# draw.data
# Draw a dose-response curve for one model, or build just the curve layer so
# that it can be added to a plot created by an earlier call.
#
# Arguments:
#   xy   Matrix or data frame with columns `x` and `y`.
#   mod  Model label; stored in the `model` column used for grouping/colour.
#   add  FALSE (default): open a new graphics device and return a complete
#        ggplot object.
#        TRUE: return only the curve layer. A layer draws nothing by itself
#        (printing it only shows its description); add it to an existing
#        plot with `+`, e.g.
#          p <- draw.data(xy1, "probit", FUN = f.probit)
#          p + draw.data(xy2, "logit", add = TRUE, FUN = f.logit)
#   FUN  Curve function, called as FUN(x, beta1, beta2, minv = 0, maxv = 1),
#        e.g. f.probit or f.logit.
#
# Dependencies: library("ggplot2") must be attached; the percent formatter is
# taken from the scales package.
# NOTE(review): beta1 and beta2 are not arguments of this function; they are
# looked up in the enclosing environment and must be defined before calling.
draw.data <- function(xy, mod, add = FALSE, FUN = NULL) {
  # Fail early with a clear message instead of a late error at render time.
  if (!is.function(FUN)) {
    stop("`FUN` must be a function, e.g. f.probit or f.logit", call. = FALSE)
  }
  x.lab <- "concentration [M]"
  y.lab <- "normalised luminescence [%]"
  x.breaks <- seq(-10, -4, 1)
  # Tick labels "1E-10" ... "1E-4", parsed into expressions
  my_labels <- parse(text = paste0("1E", x.breaks))
  my.data <- data.frame(xy, model = mod)
  # The curve layer is the same in both modes; only the stand-alone layer
  # carries its own data, the in-plot layer inherits the plot's data.
  curve.layer <- function(data = NULL) {
    stat_function(
      mapping = aes(group = model, colour = model), data = data,
      fun = FUN, args = list(beta1, beta2, minv = 0, maxv = 1), geom = "line"
    )
  }
  if (add) {
    return(curve.layer(my.data))
  }
  quartz() # use windows() on MS Windows
  ggplot(my.data, aes(x, y, group = model, color = model)) +
    curve.layer() +
    # Dotted horizontal reference lines at 50% and 90%
    geom_hline(yintercept = c(0.5, 0.9), linetype = "dotted") +
    scale_x_continuous(x.lab, breaks = x.breaks, labels = my_labels) +
    scale_y_continuous(labels = scales::percent) +
    labs(title = "Graph", x = x.lab, y = y.lab)
}
Calling the function "draw.data" a second time with the argument 'add = F' gives me this error:
mapping: group = model, colour = model
geom_line:
stat_function: fun = function (x, beta1, beta2, minv, maxv)
{
return((1/(1 + exp(-(beta1 + beta2 * x)))) * (maxv - minv) + minv)
}, n = 101, args = list(-4.827511, -0.8401166, minv = 0, maxv = 1)
position_identity: (width = NULL, height = NULL)
What's wrong with this call? And how am I supposed to implement draw.data so that I can get into the else part and call stat_function to draw the 2nd curve on the same graph?
Update:
Here is the result:
> xy
x y
[1,] -10 1.14259527
[2,] -9 1.15024188
[3,] -8 1.10517450
[4,] -7 1.00961311
[5,] -6 0.71238360
[6,] -5 0.20355333
[7,] -4 0.04061895
[8,] -10 1.11022461
[9,] -9 1.11083317
[10,] -8 1.07867942
[11,] -7 0.98422000
[12,] -6 0.73539660
[13,] -5 0.36134577
[14,] -4 0.18124645
[15,] -10 2.13212408
[16,] -9 1.14529425
[17,] -8 1.25102307
[18,] -7 1.16045169
[19,] -6 0.50321380
[20,] -5 0.15422609
[21,] -4 0.10198811
[22,] -10 1.16539392
[23,] -9 1.15855333
[24,] -8 1.11766975
[25,] -7 0.97204379
[26,] -6 0.53504417
[27,] -5 0.17431435
[28,] -4 0.29470416
[29,] -10 1.03683145
[30,] -9 1.07524250
[31,] -8 1.07761291
[32,] -7 0.96401682
[33,] -6 0.78346457
[34,] -5 0.32783725
[35,] -4 0.08103084
[36,] -10 0.81372339
[37,] -9 0.85402909
[38,] -8 0.86584396
[39,] -7 0.80705470
[40,] -6 0.53086151
[41,] -5 0.15711034
[42,] -4 0.11496499

Resources