I have three classes with the following means:
# Class means, each stored as a 2 x 1 column matrix.
mu1 <- cbind(c(3, 1))
mu2 <- cbind(c(4, 3))
mu3 <- cbind(c(8, 2))
and covariance
# 2 x 2 covariance matrix: variance 0.5 on the diagonal, covariance 0.3 off it.
cov <- rbind(c(0.5, 0.3), c(0.3, 0.5))
I would like to simulate about 100 observations from each class and perform LDA.
First, I made three matrices with 100 observations each.
# Draw 100 bivariate normal observations per class and append the class label
# as a third column. Binding numbers with a character label coerces the whole
# matrix to character.
x1 <- cbind(rmvnorm(100, mean = mu1, sigma = cov), rep("x1", 100))
x2 <- cbind(rmvnorm(100, mean = mu2, sigma = cov), rep("x2", 100))
x3 <- cbind(rmvnorm(100, mean = mu3, sigma = cov), rep("x3", 100))
and converted them to data frames and bound them together.
# Turn each class matrix into a data frame, then stack the three row-wise.
d1 <- data.frame(x1)
d2 <- data.frame(x2)
d3 <- data.frame(x3)
alld <- do.call(rbind, list(d1, d2, d3))
Now I would like to perform LDA with the following code:
# Fit a discriminant model with column 3 of alld as the class label and
# columns 1 and 2 as predictors.
# NOTE(review): lda() is not defined or loaded in this snippet; it presumably
# comes from the MASS package — confirm it is attached.
lda.x1 <- lda(alld[,3]~alld[,1]+alld[,2], data=alld)
Here I got a warning message and a weird result.
Please help me out.
Thank you.
Your groups are on a line, which is tripping up lda (see plot(alld[, 1], alld[, 2], col = alld[, 3])). I've modified your code a bit and added some noise to the means.
# Simulate three bivariate normal classes with randomly drawn means, then fit
# and plot a linear discriminant analysis.
library(mvtnorm)  # rmvnorm()
library(MASS)     # lda()

set.seed(357)  # make the sampled means and observations reproducible

# One mean vector per class: two distinct integers drawn from 1..10.
mu1 <- sample(1:10, 2)
mu2 <- sample(1:10, 2)
mu3 <- sample(1:10, 2)

# Covariance matrix used for every class.
cov <- matrix(c(.5, .3, .3, .5), nrow = 2, ncol = 2)

# 100 observations per class.
x1 <- rmvnorm(100, mean = mu1, sigma = cov)
x2 <- rmvnorm(100, mean = mu2, sigma = cov)
x3 <- rmvnorm(100, mean = mu3, sigma = cov)

# Stack the classes and add an integer class label (1, 2, 3), which also
# serves as the plotting colour.
alld <- data.frame(rbind(x1, x2, x3))
alld$col <- rep(1:3, each = 100)
names(alld) <- c("a", "b", "col")

plot(b ~ a, data = alld, col = alld$col)

# Fit the discriminant model and overlay the discriminant scores on its plot.
mdl <- lda(col ~ a + b, data = alld)
plot(mdl)
points(predict(mdl)$x, cex = 0.5, pch = "+")
Related
I am attempting to add a smoother to a plot of a regression model I have. I was just using base R to plot my X and Y vectors and add a smoother using plot() and then lines(). I've done this before, and it worked, but today I am given a plot with multiple lines connecting the points as opposed to one smooth line through all the data. I can't figure out what is different about this piece of code I have written, so I am hoping someone here could help me identify the issue.
Here is my code. I am using data I randomly generated to practice something else:
X and random variable vectors to create 'Y':
# Simulate the predictor and the noise term, 100 draws each.
X <- rnorm(n = 100, mean = 10, sd = 1)
epsilon <- rnorm(n = 100, mean = 0, sd = 1)
Y:
# Coefficients used to generate the response.
b0 <- 0.27
b1 <- 0.49
b2 <- 0.62
b3 <- 0.8
# NOTE(review): the b2 term multiplies the constant 2^2 rather than X^2, so it
# only shifts Y by a fixed amount — possibly a typo for X^2; confirm intent.
Y <- b0 + b1*X + b2*2^2 + b3*X^3 + epsilon
Creating df and reg model/Yhat:
# Collect the simulated data, regress Y on the cubic term only, and append the
# fitted values as an extra column.
df <- data.frame(Y, X, epsilon)
reg <- lm(Y ~ I(X^3), data = df)
Yhat <- fitted(reg)
df <- cbind(df, Yhat)
plot:
# Scatter plot of the simulated data.
plot(X, Y)
# lines() joins the points in the order they are supplied; X is not sorted
# here, so consecutive points are connected in their original random order.
lines(X, Yhat, col = "blue", lwd = 0.5)
For this to work, the X values have to be sorted and the Y values sorted according to their corresponding X values:
# Simulate a response with a cubic dependence on X, fit it, and overlay the
# fitted curve on the scatter plot.
X <- rnorm(100, mean = 10, sd = 1)
epsilon <- rnorm(100, 0, 1)

b0 <- 0.27
b1 <- 0.49
b2 <- 0.62
b3 <- 0.8
# NOTE(review): the b2 term multiplies the constant 2^2 rather than X^2 —
# possibly a typo for X^2; kept as written, confirm intent.
Y <- b0 + b1 * X + b2 * 2^2 + b3 * X^3 + epsilon

df <- data.frame(Y, X, epsilon)
reg <- lm(Y ~ I(X^3), data = df)
Yhat <- fitted.values(reg)
df <- cbind(df, Yhat)

plot(X, Y)
# lines() connects points in the order given, so compute the sort permutation
# of X once and apply it to both coordinates.
ord <- order(X)
lines(X[ord], Yhat[ord], col = "blue", lwd = 0.5)
There is no parameter for mean unlike mvrnorm() for example. How would you include this? I just added + mean to the end of my code rmvt(200, sigma = sigma1, df = 6) + mean1 but I'm not sure if that's correct.
Edit: I'm using library(mvtnorm)
Edit2: I found this listed in the documentation. Unsure if correct.
# X ~ t_3(mu, Sigma): rmvt() has no location argument, so the centred draws
# are shifted column-wise by mu afterwards.
n <- 1000
mu <- 1:2
Sigma <- rbind(c(4, 2), c(2, 3))
set.seed(271)
x <- sweep(rmvt(n, sigma = Sigma, df = 3), 2, mu, "+")
plot(x)
I have plotted a density function in base R and I would like to replicate the plot in ggplot2.
This is the plot in base R:
library(tidyverse)
library(mvtnorm)

# Base quantity and its square, used below to build the covariance matrix.
sd <- 1 / 2
s1 <- sd^2

# Evaluation grid along each axis.
x.points <- seq(-3, 3, length.out = 100)
y.points <- seq(-3, 3, length.out = 100)

# Bivariate normal centred at the origin with a diagonal covariance matrix.
mu1 <- c(0, 0)
sigma1 <- diag(s1^2, 2)

# Density at every grid point: rows index x.points, columns index y.points.
z <- matrix(0, nrow = 100, ncol = 100)
for (i in seq_along(x.points)) {
  z[i, ] <- vapply(
    y.points,
    function(py) dmvnorm(c(x.points[i], py), mean = mu1, sigma = sigma1),
    numeric(1)
  )
}

contour(
  x.points, y.points, z,
  xlim = c(-3, 3), ylim = c(-3, 3),
  nlevels = 5, drawlabels = TRUE
)
To obtain the same result in ggplot2, I am following this example:
library(ggplot2)
library(reshape2) # for melt

# Long-format copy of the volcano matrix with columns renamed to x, y, z.
volcano3d <- setNames(melt(volcano), c("x", "y", "z"))

# Basic contour plot
v <- ggplot(volcano3d, aes(x, y, z = z))
v + stat_contour()
But in my case vector z has a different length than x.points and y.points. From the errors I get below, it looks like the three vectors should have the same length. How can I transform the dataset presented above so that it can be run through ggplot2?
# Attempt that reproduces the error: data1 holds one row per axis position
# (100 rows), while z is a 100 x 100 matrix, so the z aesthetic does not match
# the data length. The "#>" lines are the recorded output.
data1 <- as.data.frame(cbind(x.points, y.points))
p <- ggplot(data = data1, mapping = aes(x.points, y.points, z=z))
p + geom_contour()
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
p + stat_contour()
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
p + stat_function(fun = contour) + xlim(-3,3)
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
Created on 2021-04-08 by the reprex package (v0.3.0)
The problem is likely that your data isn't in long format: for every value of the z matrix, you need the x and y position, which is different from the base R approach, wherein you just need these positions for every row/column.
We can transform the matrix z to a long format using reshape2::melt and then grab the correct positions from your vectors.
library(tidyverse)
library(mvtnorm)

sd <- 1 / 2
# sigma
s1 <- sd^2

# first two vectors
x.points <- seq(-3, 3, length.out = 100)
y.points <- seq(-3, 3, length.out = 100)

mu1 <- c(0, 0)
sigma1 <- matrix(c(s1^2, 0, 0, s1^2), nrow = 2)

# the third vector is a density
# dmvnorm() accepts a matrix with one point per row, so the whole grid is
# evaluated in a single call instead of one call per cell. expand.grid()
# varies x fastest, matching the column-major fill of matrix(): z[i, j] is the
# density at (x.points[i], y.points[j]).
grid_points <- as.matrix(expand.grid(x = x.points, y = y.points))
z <- matrix(
  dmvnorm(grid_points, mean = mu1, sigma = sigma1),
  nrow = length(x.points),
  ncol = length(y.points)
)

# Here be the reshaping bit
# melt() yields one row per matrix cell with its row index (Var1), column
# index (Var2) and value; the indices are then mapped back to coordinates.
df <- reshape2::melt(z)
df <- transform(
  df,
  x = x.points[Var1],
  y = y.points[Var2]
)

ggplot(df, aes(x, y)) +
  geom_contour(aes(z = value))
Created on 2021-04-08 by the reprex package (v1.0.0)
# but cannot handle categorical variables
#' Ordinary least squares via the normal equations
#'
#' Appends an intercept column to the predictors and solves
#' (X'X) beta = X'y for beta.
#'
#' @param explanatory_matrix Predictors, one row per observation: a numeric
#'   vector, matrix or data frame. A data frame may also contain factor or
#'   character columns, which are expanded to treatment-coded dummy columns
#'   with `model.matrix()`.
#' @param response_vec Numeric response with one value per observation.
#' @return A one-column matrix of coefficients: the predictor columns in
#'   order, followed by the intercept (row name "intercept").
my_lm <- function(explanatory_matrix, response_vec) {
  is_categorical <- function(col) is.factor(col) || is.character(col)

  if (is.data.frame(explanatory_matrix) &&
      any(vapply(explanatory_matrix, is_categorical, logical(1)))) {
    # as.matrix() would turn the whole frame into a character matrix here.
    # Expand the categorical columns instead, and drop model.matrix()'s own
    # intercept so that the intercept is appended last as in the numeric case.
    design <- stats::model.matrix(~ ., data = explanatory_matrix)
    exp_mat <- design[, colnames(design) != "(Intercept)", drop = FALSE]
  } else {
    exp_mat <- as.matrix(explanatory_matrix)
  }

  if (nrow(exp_mat) != NROW(response_vec)) {
    stop(
      "`explanatory_matrix` and `response_vec` must have the same number of ",
      "observations (rows with missing values are dropped when categorical ",
      "columns are expanded).",
      call. = FALSE
    )
  }

  exp_mat <- cbind(exp_mat, intercept = rep(1, nrow(exp_mat)))

  # Solve the linear system directly rather than forming the explicit inverse.
  solve(crossprod(exp_mat), crossprod(exp_mat, response_vec))
}
The above code will not work when there are categorical variables in the explanatory_matrix.
How can I implement that?
Here is an example for a data set with one categorical variable:
# Worked example: least squares with one numeric and one categorical predictor.
set.seed(123)
x <- 1:10
a <- 2
b <- 3
y <- a * x + b + rnorm(10)
# categorical variable
x2 <- sample(c("A", "B"), 10, replace = TRUE)
# one-hot encoding: 1 for level "A", 0 for level "B". The encoding reads the
# vector x2 itself, because no data frame `d` exists at this point.
x2 <- as.integer(x2 == "A")
# Design matrix with columns x, x2 and the intercept (in that order).
xm <- matrix(c(x, x2, rep(1, length(x))), ncol = 3, nrow = 10)
ym <- matrix(y, ncol = 1, nrow = 10)
# Normal-equation solution using the generalised inverse of X'X.
beta_hat <- MASS::ginv(t(xm) %*% xm) %*% t(xm) %*% ym
beta_hat
This gives (note the order of coefficients - it matches the order of the predictor columns):
[,1]
[1,] 1.9916754
[2,] -0.7594809
[3,] 3.2723071
which is identical to the output of lm:
# Cross-check with lm(): regress y on every other column of d.
d <- data.frame(x, x2, y)
lm(y ~ ., data = d)
Output
# Call:
# lm(formula = y ~ ., data = d)
#
# Coefficients:
# (Intercept) x x2
# 3.2723 1.9917 -0.7595
For categorical handling you should use one-hot encoding.
Do something like
# Sketch only: dep_var and indep_var are placeholders for the real column
# names, and explanatory_matrix / response_vec are not defined in this snippet.
formula <- dep_var ~ indep_var
# Build the design matrix from the formula.
# NOTE(review): model.matrix() presumably expands factor columns into dummy
# columns and adds the intercept — confirm against its documentation.
exp_mat <- model.matrix(formula, explanatory_matrix)
# Normal-equation solution for the coefficients.
solve(t(exp_mat) %*% exp_mat) %*% (t(exp_mat) %*% response_vec)
I have the following data structure.
library(MASS)

# Means of the two classes.
mu1 <- c(2, -3)
mu2 <- c(2, 5)

# Covariance matrix built from the correlation and the two standard deviations.
rho <- 0.5
s1 <- 1
s2 <- 3
Sigma <- rbind(
  c(s1^2, rho * s1 * s2),
  c(rho * s1 * s2, s2^2)
)

# n draws per class, stacked with a 0/1 class label.
n <- 50
X1 <- mvrnorm(n, mu = mu1, Sigma = Sigma)
X2 <- mvrnorm(n, mu = mu2, Sigma = Sigma)
y <- rep(c(0, 1), each = n)
X <- data.frame(rbind(X1, X2), class = y)
I wonder how to properly melt (produce long form) this data frame to produce the following scatter plot.