How to generate data for gaussian distributions in these 2 scenarios in R? - r

In "Elements of Statistical Learning" by Tibshirani, when comparing least squares/linear models and knn these 2 scenarios are stated:
Scenario 1: The training data in each class were generated from bivariate Gaussian distributions with uncorrelated components and different means.
Scenario 2: The training data in each class came from a mixture of 10
low- variance Gaussian distributions, with individual means themselves
distributed as Gaussian.
The idea is that the first is better suited for least squares/linear models and the second for knn like models (those with higher variance from what i understand since knn takes into account the closest points and not all points).
In R, how would I simulate data for both scenarios?
The end goal is to be able to reproduce both scenarios in order to prove that effectively the 1st one is better explained by the linear model than the 2nd one.
Thanks!

This could be scenario 1
library(mvtnorm)
N1 = 50
N2 = 50
K = 2
mu1 = c(-1,3)
mu2 = c(2,0)
cov1 = 0
v11 = 2
v12 = 2
Sigma1 = matrix(c(v11,cov1,cov1,v12),nrow=2)
cov2 = 0
v21 = 2
v22 = 2
Sigma2 = matrix(c(v21,cov2,cov2,v22),nrow=2)
x1 = rmvnorm(N1,mu1,Sigma1)
x2 = rmvnorm(N2,mu2,Sigma2)
This could be a candidate for simulating from a Gaussian mixture:
BartSimpson <- function(x,n = 100){
means <- as.matrix(sort(rnorm(10)))
dens <- .1*rowSums(apply(means,1,dnorm,x=x,sd=.1))
rBartSimpson <- c(apply(means,1,rnorm,n=n/10,sd=.1))
return(list("thedensity" = dens,"draws" = rBartSimpson))
}
x <- seq(-5,5,by=.01)
plot(x,BartSimpson(x)$thedensity,type="l",lwd=4,col="yellow2",xlim=c(-4,4),ylim=c(0,0.6))

In the code below, I first create the 10 different means of the classes, and then use the means to draw random values from those means. The code is identical for the two scenarios, but you'll have to adjust the variance within and between classes to get the results you want.
Scenario 1:
Here you want to generate 10 classes with different means (I assume the means follow a bivariate gaussian distribution). The difference between classes is much less than the difference within classes.
library(MASS)
n <- 20
# subjects per class
classes <- 10
# number of classes
mean <- 100
# mean value for all classes
var.between <- 25
# variation between classes
var.within <- 225
# variation within classes
covmatrix1 <- matrix(c(var.between,0,0,var.between), nrow=2)
# covariance matrix for the classes
means <- mvrnorm(classes, c(100,100), Sigma=covmatrix1)
# creates the means for the two variables for each class using variance between classes
covmatrix2 <- matrix(c(var.within,0,0,var.within), nrow=2)
# creates a covariance matrix for the subjects
class <- NULL
values <- NULL
for (i in 1:10) {
temp <- mvrnorm(n, c(means[i], means[i+classes]), Sigma=covmatrix2)
class <- c(class, rep(i, n))
values <- c(values, temp)
}
# this loop uses generates data for each class based on the class means and variance within classes
valuematrix <- matrix(values, nrow=(n*classes))
data <- data.frame (class, valuematrix)
plot(data$X1, data$X2)
Alternatively, if you don't care about specifying the variance between the classes, and you don't want any correlation within classes, you can just do this:
covmatrix <- matrix(c(225, 0, 0, 225), nrow=2)
# specifies that the variance in both groups is 225 and no covariance
values <- matrix(mvrnorm(200, c(100,100), Sigma=covmatrix), nrow=200)
# creates a matrix of 200 individuals with two values each.
Scenario 2:
Here the only difference is that the variation between classes is larger than the variation within classes. Try exchanging the value of the variable var.between to around 500 and the variable var.within to 25 and you'll see a clear clustering in the scatterplot:
n <- 20
# subjects per class
classes <- 10
# number of classes
mean <- 100
# mean value for all classes
var.between <- 500
# variation between classes
var.within <- 25
# variation within classes
covmatrix1 <- matrix(c(var.between,0,0,var.between), nrow=2)
# covariance matrix for the classes
means <- mvrnorm(classes, c(100,100), Sigma=covmatrix1)
# creates the means for the two variables for each class using variance between classes
covmatrix2 <- matrix(c(var.within,0,0,var.within), nrow=2)
# creates a covariance matrix for the subjects
class <- NULL
values <- NULL
for (i in 1:10) {
temp <- mvrnorm(n, c(means[i], means[i+classes]), Sigma=covmatrix2)
class <- c(class, rep(i, n))
values <- c(values, temp)
}
# this loop uses generates data for each class based on the class means and variance within classes
valuematrix <- matrix(values, nrow=(n*classes))
data <- data.frame (class, valuematrix)
plot(data$X1, data$X2)
The plot should confirm that the data are clustered.
Hope this helps!

With the help from both answers here I ended up using this:
mixed_dists = function(n, n_means, var=0.2) {
means = rnorm(n_means, mean=1, sd=2)
values <- NULL
class <- NULL
for (i in 1:n_means) {
temp <- rnorm(n/n_means, mean=means[i], sd=0.2)
class <- c(class, rep(i, n/n_means))
values <- c(values, temp)
}
return(list(values, class));
}
N = 100
#Scenario 1: The training data in each class were generated from bivariate Gaussian distributions
#with uncorrelated components and different means.
scenario1 = function () {
var = 0.5
n_groups = 2
m = mixed_dists(N, n_groups, var=var)
x = m[[1]]
group = m[[2]]
y = mixed_dists(N, n_groups, var=var)[[1]]
data = matrix(c(x,y, group), nrow=N, ncol=3)
colnames(data) = c("x", "y", "group")
data = data.frame(data)
plot(x=data$x,y=data$y, col=data$group)
model = lm(y~x, data=data)
summary(model)
}
#Scenario 2: The training data in each class came from a mixture of 10
#low-variance Gaussian distributions, with individual means themselves
#distributed as Gaussian.
scenario2 = function () {
var = 0.2 # low variance
n_groups = 10
m = mixed_dists(N, n_groups, var=var)
x = m[[1]]
group = m[[2]]
y = mixed_dists(N, n_groups, var=var)[[1]]
data = matrix(c(x,y, group), nrow=N, ncol=3)
colnames(data) = c("x", "y", "group")
data = data.frame(data)
plot(x=data$x,y=data$y, col=data$group)
model = lm(y~x, data=data)
summary(model)
}
# scenario1()
# scenario2()
So basically the data in scenario 1 is cleanly separated in 2 classes and the data in scenario 2 has about 10 clusters and can't be cleanly separated using a straight line. Indeed, running the linear model on both scenarios it can be seen that on average it will apply better to scenario 1 than to scenario 2.

Related

Plotting Forecast and Real values in one plot using a Rolling Window

I have a code which takes the input as the Yield Spread (dependent var.) and Forward Rates(independent var.) and operate an auto.arima to get the orders. Afterwards, I am forecasting the next 25 dates (forc.horizon). My training data are the first 600 (training). Then I am moving the time window 25 dates, meaning using the data from 26 to 625, estimating the auto.arima and then forecasting the data from 626 to 650 and so on. My data sets are 2298 rows (date) and 30 columns (maturity).
I want to store all of the forecasts and then plot the forecasted and real values in the same plot.
This is the code I have, but it doesn't store the forecasts in a way to plot later.
forecast.func <- function(NS.spread, ind.v, maturity, training, forc.horizon){
NS.spread <- NS.spread/100
forc <- c()
j <- 0
for(i in 1:floor((nrow(NS.spread)-training)/forc.horizon)){
# test data
y <- NS.spread[(1+j):(training+j) , maturity]
f <- ind.v[(1+j):(training+j) , maturity]
# auto- arima
c <- auto.arima(y, xreg = f, test= "adf")
# forecast
e <- ind.v[(training+j+1):(training+j+forc.horizon) , maturity]
h <- forecast(c, xreg = lagmatrix(e, -1))
forc <- c(forc, list(h))
j <- j + forc.horizon
}
return(forc)
}
a <- forecast.func(spread.NS.JPM, Forward.rate.JPM, 10, 600, 25)
lapply(a, plot)
Here's a link to my two datasets:
https://drive.google.com/drive/folders/1goCxllYHQo3QJ0IdidKbdmfR-DZgrezN?usp=sharing
LOOK AT THE END for a full functional example on how to handle AUTO.ARIMA MODEL with DAILY DATA using XREG and FOURIER SERIES with ROLLING STARTING TIMES and cross validated training and test.
Without a reproducible example no one can help you, because they can't run your code. You need to provide data. :-(
Even if it's not part of StackOverflow to discuss statistics matters, why don't you do an auto.arima with xreg instead of lm + auto.arima on residuals? Especially, considering how you forecast at the end, that training method looks really wrong. Consider using:
fit <- auto.arima(y, xreg = lagmatrix(f, -1))
h <- forecast(fit, xreg = lagmatrix(e, -1))
auto.arima will automatically calculate the best parameters by max likelihood.
On your coding question..
forc <- c() should be outside of the for loop, otherwise at every run you delete your previous results.
Same for j <- 0: at every run you're setting it back to 0. Put it outside if you need to change its value at every run.
The output of forecast is an object of class forecast, which is actually a type of list. Therefore, you can't use cbind effectively.
I'm my opinion, you should create forc in this way: forc <- list()
And create a list of your final results in this way:
forc <- c(forc, list(h)) # instead of forc <- cbind(forc, h)
This will create a list of objects of class forecast.
You can then plot them with a for loop by getting access at every object or with a lapply.
lapply(output_of_your_function, plot)
This is as far as I can go without a reproducible example.
FINAL EDIT
FULL FUNCTIONAL EXAMPLE
Here I try to sum up a conclusion out of the million comments we wrote.
With the data you provided, I built a code that can handle everything you need.
From training and test to model, till forecast and finally plotting which have the X axis with the time as required in one of your comments.
I removed the for loop. lapply is much better for your case.
You can leave the fourier series if you want to. That's how Professor Hyndman suggests to handle daily time series.
Functions and libraries needed:
# libraries ---------------------------
library(forecast)
library(lubridate)
# run model -------------------------------------
.daily_arima_forecast <- function(init, training, horizon, tt, ..., K = 10){
# create training and test
tt_trn <- window(tt, start = time(tt)[init] , end = time(tt)[init + training - 1])
tt_tst <- window(tt, start = time(tt)[init + training], end = time(tt)[init + training + horizon - 1])
# add fourier series [if you want to. Otherwise, cancel this part]
fr <- fourier(tt_trn[,1], K = K)
frf <- fourier(tt_trn[,1], K = K, h = horizon)
tsp(fr) <- tsp(tt_trn)
tsp(frf) <- tsp(tt_tst)
tt_trn <- ts.intersect(tt_trn, fr)
tt_tst <- ts.intersect(tt_tst, frf)
colnames(tt_tst) <- colnames(tt_trn) <- c("y", "s", paste0("k", seq_len(ncol(fr))))
# run model and forecast
aa <- auto.arima(tt_trn[,1], xreg = tt_trn[,-1])
fcst <- forecast(aa, xreg = tt_tst[,-1])
# add actual values to plot them later!
fcst$test.values <- tt_tst[,1]
# NOTE: since I modified the structure of the class forecast I should create a new class,
# but I didnt want to complicate your code
fcst
}
daily_arima_forecast <- function(y, x, training, horizon, ...){
# set up x and y together
tt <- ts.intersect(y, x)
# set up all starting point of the training set [give it a name to recognize them later]
inits <- setNames(nm = seq(1, length(y) - training, by = horizon))
# remove last one because you wouldnt have enough data in front of it
inits <- inits[-length(inits)]
# run model and return a list of all your models
lapply(inits, .daily_arima_forecast, training = training, horizon = horizon, tt = tt, ...)
}
# plot ------------------------------------------
plot_daily_forecast <- function(x){
autoplot(x) + autolayer(x$test.values)
}
Reproducible Example on how to use the previous functions
# create a sample data
tsp(EuStockMarkets) <- c(1991, 1991 + (1860-1)/365.25, 365.25)
# model
models <- daily_arima_forecast(y = EuStockMarkets[,1],
x = EuStockMarkets[,2],
training = 600,
horizon = 25,
K = 5)
# plot
plots <- lapply(models, plot_daily_forecast)
plots[[1]]
Example for the author of the post
# your data
load("BVIS0157_Forward.rda")
load("BVIS0157_NS.spread.rda")
spread.NS.JPM <- spread.NS.JPM / 100
# pre-work [out of function!!!]
set_up_ts <- function(m){
start <- min(row.names(m))
end <- max(row.names(m))
# daily sequence
inds <- seq(as.Date(start), as.Date(end), by = "day")
ts(m, start = c(year(start), as.numeric(format(inds[1], "%j"))), frequency = 365.25)
}
mts_spread.NS.JPM <- set_up_ts(spread.NS.JPM)
mts_Forward.rate.JPM <- set_up_ts(Forward.rate.JPM)
# model
col <- 10
models <- daily_arima_forecast(y = mts_spread.NS.JPM[, col],
x = stats::lag(mts_Forward.rate.JPM[, col], -1),
training = 600,
horizon = 25,
K = 5) # notice that K falls between ... that goes directly to the inner function
# plot
plots <- lapply(models, plot_daily_forecast)
plots[[5]]

How to generate spatially correlated random fields of very high dimension with R

This is an extended question I found from here (Method #1: http://santiago.begueria.es/2010/10/generating-spatially-correlated-random-fields-with-r/) and here (Method #2: https://gist.github.com/brentp/1306786). I know these two sites covered very well (Thanks!) with relatively small size of dimension (e.g., 1000x1). I am trying to generate spatially clustered binary data with large size of dimension like >=100000x1 dimension, for example, c(1,1,1,1,0,1,0,0,0,0, …, 0,0,0,0,0,0,0,0,0,0,0,0) with 1000 times / case study. Here are slightly modified codes from the sites.
# Method #1
dim1 <- 1000
dim2 <- 1
xy <- expand.grid(seq_len(dim1), seq_len(dim2))
colnames(xy) <- c("x", "y")
geo.model <- gstat(formula = z~x+y, locations = ~x+y, dummy = TRUE, beta = 0,
model = vgm(psill = 1,"Exp",
range = dim1), # Range parameter!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
nmax = 30) # Spatial correlation model
sim.mat <- predict(geo.model, newdata = xy, nsim = 1)
sim.mat[,3] <- ifelse(sim.mat[,3] > quantile(sim.mat[,3], .1), 0, 1)
plot(sim.mat[, 3])
# Method #2
# generate autocorrelated data.
nLags = 1000 # number of lags (size of region)
# fake, uncorrelated observations
X = rnorm(nLags)
# fake sigma... correlated decreases distance.
sigma = diag(nLags)
corr = .999
sigma <- corr ^ abs(row(sigma)-col(sigma))
#sigma
# Y is autocorrelated...
Y <- t(X %*% chol(sigma))
y <- ifelse(Y >= quantile(Y, probs=.9), 1, 0)[, 1]
plot(y)
Both methods work very well to generate binary data when dim1 is less than 10000. However, when I tried several hundred thousand (e.g., >= 100,000), it seems to take a long time or memory issue.
For example, when I used “nLags = 50000” in Method #2, I got an error message (“Error: cannot allocate vector of size 9.3 Gb”) after the code “sigma <- corr ^ abs(row(sigma)-col(sigma))”.
I would like to find an efficient (time- and memory-saving) way to generate such a spatially clustered binary data 1000 times (especially, with dim1 >= 100000) per each case study (about 200 cases).
I have thought about applying multiple probabilities in "sample" function or probability distribution. I am not sure how to and beyond my scope.

Is it possible to analyse a spatial point pattern given another, underlying, spatial point pattern in R

I want to analyse the type of spatial pattern shown by an animal (i.e. random, clustered, uniform) taking into consideration the underlying spatial pattern of it's available habitat. The animals in question roost in trees, so a standard analysis of the animal spp will always show a clustered distribution (i.e. clustering around trees), but I want to test whether there is clustering between trees vs whether they distribute randomly throughout trees. To provide a visual, I want to be able to differentiate between the following scenarios in the image:
https://imgur.com/a/iE3nAoh (image not allowed because I'm new to stack overflow, but it's available through the link)
Here is a reproducible data frame. The scenario here is of uniform habitat (25 areas of habitat) and uniform animals (16 animals per habitat):
library(spatstat)
data <- data.frame(matrix(ncol = 4, nrow = 25))
x <- c("habitat", "x", "y", "animalcount")
colnames(data) <- x
data$habitat <- 1:25
data$x <- seq(from=2, to=20, by=4)
data$y[1:5] <- 2
data$y[6:10] <- 6
data$y[11:15] <- 10
data$y[16:20] <- 14
data$y[21:25] <- 18
data$animalcount <- 16
Set up conditions for the spatial analysis:
plot.win <- owin(c(0,20), c(0,20)) # set the plot window as 20x20m
nS <- 499 # number of simulations
cd <- 5 # cluster distance
ed <- 50 # envelope distance
incr.dist <- 0.5 # increment distance for envelopes
Create the point pattern for the habitat:
habitat <- ppp(x = data$x, y = data$y, window = plot.win)
Create the point pattern for the animals. To do this, first make a new dataframe with repeated rows for the number in animal count, so that points are individual animals. Jitter x/y so that x/y coordinates are not exactly the same:
data <-data[which(data$animalcount>0),]
duplicate_rows <- function(habitat, x, y, animalcount) {
expanded <- paste0("animal-", 1:animalcount)
repeated_rows <- data.frame("habitat" = habitat, "x" = x, "y" = y, "animalcount" = expanded)
repeated_rows
}
expanded_rows <- Map(f = duplicate_rows, data$habitat, data$x, data$y, data$animalcount)
animal_data <- do.call(rbind, expanded_rows)
animal_data$xan <- jitter(animal_data$x)
animal_data$yan <- jitter(animal_data$y)
animal <- ppp(x = animal_data$xan, y = animal_data$yan, window = plot.win)
Now test Complete Spatial Randomness of animals regardless of habitat. This should come out as clustered:
an.csr <- envelope(animal, Kest, nsims = nS, savepatterns = TRUE, r = seq(0, ed, incr.dist), correction=c("Ripley"), verbose = FALSE) #CSR fit and determine the number of simulations
an.dclf <- dclf.test(an.csr, rinterval = c(0,cd), verbose = FALSE) #calculate the summary statistics of the CSR null model fit (dclf.test)
plot(an.csr, sqrt(./pi)-r~r, ylab="L(r)-r", xlab="r (meters)", xlim=c(0,ed), legend="NULL", main=paste("Animal - CSR", sep = "")) #plot 0-centered fit with the confidence bounds
clarkevans(animal)[2] #R > 1 suggests ordering, < 1 suggests clustering
clarkevans.test(animal, "Donnelly")$p
Now test Complete Spatial Randomness of animals, given the available habitat. This should come out not clustered. But simply adding habitat as a covariate clearly isn't the appropriate way to do it:
an.csr <- envelope(animal, covariates = animal_data[,2:3], Kest, nsims = nS, savepatterns = TRUE, r = seq(0, ed, incr.dist), correction=c("Ripley"), verbose = FALSE)
an.dclf <- dclf.test(an.csr, rinterval = c(0,cd), verbose = FALSE)
plot(an.csr, sqrt(./pi)-r~r, ylab="L(r)-r", xlab="r (meters)", xlim=c(0,ed), legend="NULL", main=paste("Animal - CSR", sep = ""))
clarkevans(animal)[2]
clarkevans.test(animal, "Donnelly")$p
I also tried running the test of Complete Spatial Randomness on a fitted Point Process Model, where the animal point pattern could be predicted by x&y, but this also did not change outcomes:
animalppm<-ppm(animal~x+y)
an.csr <- envelope(animalppm, Kest, nsims = nS, savepatterns = TRUE, r = seq(0, ed, incr.dist), correction=c("Ripley"), verbose = FALSE)
an.dclf <- dclf.test(an.csr, rinterval = c(0,cd), verbose = FALSE)
plot(an.csr, sqrt(./pi)-r~r, ylab="L(r)-r", xlab="r (meters)", xlim=c(0,ed), legend="NULL", main=paste("Animal - CSR", sep = ""))
clarkevans(animalppm)[2] #R > 1 suggests ordering, < 1 suggests clustering
clarkevans.test(animalppm, "Donnelly")$p
From there I would run tests of aggregation models, but the logic of adding the second point pattern should be similar.
I would appreciate any suggestions on ways to deal with this. I cannot think of an effective way to google this, and am coming up short on clever coding solutions in R. Thanks in advance!
You can model the intensity as depending on the distance to the
habitat pattern. Here is a simple example where the animals follow a Poisson
point process with intensity function which decays log-linearly with distance
to the habitat:
library(spatstat)
data <- expand.grid(x = seq(2, 18, by=4), y = seq(2, 18, by=4))
data$animalcount <- 16
plot.win <- owin(c(0,20), c(0,20)) # set the plot window as 20x20m
habitat <- ppp(x = data$x, y = data$y, window = plot.win)
d <- distmap(habitat)
plot(d)
lam <- exp(3-2*d)
plot(lam)
animal <- rpoispp(lam)
plot(animal)
fit <- ppm(animal ~ d)
fit
#> Nonstationary Poisson process
#>
#> Log intensity: ~d
#>
#> Fitted trend coefficients:
#> (Intercept) d
#> 2.952048 -1.974381
#>
#> Estimate S.E. CI95.lo CI95.hi Ztest Zval
#> (Intercept) 2.952048 0.07265533 2.809646 3.094450 *** 40.63085
#> d -1.974381 0.07055831 -2.112673 -1.836089 *** -27.98226
Taking the underlying non-homogeneous intensity into account
there is no sign of departure from the Poisson model in the
(inhomogeneous) K-function:
plot(Kinhom(animal, lambda = fit))
#> Warning: The behaviour of Kinhom when lambda is a ppm object has changed
#> (in spatstat 1.37-0 and later). See help(Kinhom)
You don't have to have simple log-linear dependence on distance. You could also make a threshold model where you have one intensity with e.g. distance 1 of the habitat and another intensity outside this distance. You can make all kinds of derived covariates from e.g. the distance for use in your model.
If animals is the point pattern of animals, and trees is the point pattern of trees (both objects of class "ppp" in spatstat) then you could do
d <- distfun(trees)
f <- rhohat(animals, d)
plot(f)
to get an idea of how the concentration of animals depends on distance to nearest tree. You can use
berman.test(animals, d)
to perform a hypothesis test of dependence on the trees.

Simulate mixture data with different mix dependecies structure between each two variables?

I would like to simulate a mixture data, say 3 dimensional data. I would like to have 2 different components between each two variables.
That is, simulate mixture data (V1 and V2) where the dependencies between them is two different normal components. Then, between V2 and V3 another two normal components. So, I will have 3d data, the dependence between first and a second variable are a mixture of two normals. And the dependence between the second and third variable are mixture of another two different components.
Another way to explain my question:
Suppose I would like to generate a mixture data as follows:
1- 0.3 normal(0.5,1) + 0.7 normal(2,4) # hence here I will get a bivariate mixture data generated from two different normal (two components of the mixture model), the sum of mixuter weight is 1.
Then, I would like to get another variable as follows:
2- 0.5 normal(2,4) # this is the second variable on the first simulate + 0.5 normal(2,6)
so here, I get 3d simulated mixture data, where V1 and V2 are generated by two different mixture components, and the V2 and V3 are generated by another different mixture components.
This is how to generate the data in r: ( I belive it is not generate a bivariate data)
N <- 100000
#Sample N random uniforms U
U <- runif(N)
#Variable to store the samples from the mixture distribution
rand.samples <- rep(NA,N)
#Sampling from the mixture
for(i in 1:N) {
if(U[i]<.3) {
rand.samples[i] <- rnorm(1,1,3)
} else {
rand.samples[i] <- rnorm(1,2,5)
}
}
so if we generate mixture bivariate data (two variables) then how can extend this to have 4 or 5 variables, where V1 and V2 are generated from two different normals (the dependencies structures between them is a mixture of two normals) and then V3 will generated from another another differetn normal and then compine with V2. That is when we plot the V2 ~ V3 we will find that the dependencies structures between them is a mixutre of two normals and so on.
I am not really sure I have correctly understood the question but I will give it a try. You have 3 distributions D1, D2 and D3. From these three distributions you would like to create variables that use 2 out of those 3 but not the same ones.
Since I do not know how the distributions should be combined I used the flags using the binomial distribution (its a vector of length equal to 200 with 0s and 1s) to determine from which distribution each value will be picked (You can change that if that is not how you want it done).
D1 = rnorm(200,2,1)
D2 = rnorm(200,3,1)
D3= rnorm(200,1.5,2)
In order to created the mixed distribution we can use the rbinom function to create a vector of 1s and 0s according to a selected probability. This is a way to have some values from both distributions.
var_1_flag <- rbinom(200, size=1, prob = 0.3)
var_1 <- var_1_flag*D1 + (1 - var_1_flag)*D2
var_2_flag <- rbinom(200, size=1, prob = 0.7)
var_2 <- var_2_flag*D2 + (1 - var_2_flag)*D3
var_3_flag <- rbinom(200, size=1, prob = 0.6)
var_3 <- var_3_flag*D1 + (1 - var_3_flag)*D3
In order to see which values come from which distribution you can do the following:
var_1[var_1_flag] #This gives you the values in the mixed distribution that come from the first distribution (D1)
var1[!var_1_flag] #This gives you the values in the mixed distribution that come from the second distribution (D2)
Since I found this a bit manual and I am guessing you might want to change the variables, you might want to use the function below to get the same results
create_distr <- function(observations, mean1, sd1, mean2, sd2, flag_prob) {
flag <- rbinom(observations, size=1, prob = flag_prob)
my_distribution <- flag * rnorm(observations, mean1, sd1) + (1 - flag) * rnorm(observations, mean2, sd2)
}
var_1 <- create_distr(200, 2, 1, 3, 1, 0.5)
var_2 <- create_distr(200, 3, 1, 1.5, 2, 0.7)
var_3 <- create_distr(200, 2, 1, 1.5, 2, 0.6)
If you would like to have more than two variables (distributions) to the mix you could extend the code you have provided as follows:
N <- 100000
#Sample N random uniforms U
U <- runif(N)
#Variable to store the samples from the mixture distribution
rand.samples <- rep(NA,N)
for(i in 1:N) {
if(U[i] < 0.3) {
rand.samples[i] <- rnorm(1,1,3)
} else if (U[i] < 0.5){
rand.samples[i] <- rnorm(1,2,5)
} else if (U[i] < 0.8) {
rand.samples[i] <- rnorm(1,5,2)
} else {
rand.samples[i] <- rt(1, 2)
}
}
This way every element is taken one at a time from each distribution. If you want to have the same result but without taking each element one at a time you can do the following:
N <- 100000
#Sample N random uniforms U
U <- runif(N)
#Variable to store the samples from the mixture distribution
rand.samples <- rep(NA,N)
D1 = rnorm(N,1,3)
D2 = rnorm(N,2,5)
D3= rnorm(N,5,2)
D4 = rt(N, 2)
rand.samples <- c(D1[U < 0.3], D2[U >= 0.3 & U < 0.5], D3[U >= 0.5 & U < 0.8], D4[U >= 0.8])
Which corresponds to 0.3*normal(1,3) + 0.2*normal(2,5) + 0.3*normal(5,2) + 0.2*student(2 degrees of freedom)
If you want to create two mixtures, but in the second keep the same values from the normal distribution you can do the following:
mixture_1 <- c(D1[U < 0.3], D2[U >= 0.3 ])
mixture_2 <- c(D1[U < 0.3], D3[U >= 0.3])
This will use the exact same elements from normal(1,3) in both mixtures. The trick is to not recalculate the rnorm(N,1,3) every time you use it. And in both cases the samples are composed from 30% roughly coming from the first normal (D1) and 70% roughly from the second distribution. For example:
set.seed(1)
N <- 100000
U <- runif(N)
> prop.table(table(U < 0.3))
FALSE TRUE
0.6985 0.3015
30% of the values in the U vector is below 0.3.

How to solve equations symbolically in a loop in R?

I need to conduct Gaussian Maximum Likelihood Classification for 1000 data sets of two classes of bivariate Gaussian distributions with each 100 data points.
Here is the code to create the data sets:
# mean vector for two classes
mean1<-c(70,130) ; mean2<-c(148,160)
# covariance matrix for two classes
cov1<-matrix(c(784,-546,-546,900),nrow=2,ncol=2,byrow=TRUE)
cov2<-matrix(c(484,285.1,285.1,324),nrow=2,ncol=2,byrow=TRUE)
library(MASS)
# Number of samples
nrs <- 1000
# sample size
ss <- 100
# number of dimensions
d <- length(mean1)
set.seed(1)
# generation of bivariate normal random variables based on mean vector and covariance matrix for each class
refdata_1 <- replicate(nrs,matrix(mvrnorm(ss, mu = mean1, Sigma = cov1 ),ncol = d,nrow = ss),simplify=FALSE)
refdata_2 <- replicate(nrs,matrix(mvrnorm(ss, mu = mean2, Sigma = cov2 ),ncol = d,nrow = ss),simplify=FALSE)
# calculation of mean vector for each sample of random reference data
mean_refdata_1 <- lapply(refdata_1,colMeans)
mean_refdata_2 <- lapply(refdata_2,colMeans)
# calculation of covariance matrix for each sample of random reference data
cov_refdata_1 <- lapply(refdata_1,cov)
cov_refdata_2 <- lapply(refdata_2,cov)
Now, I need to plot the decision boundary between the two classes for each of the 1000 data sets (thus 1000 decision boundaries).
Here is the decision equation (if you wonder where the ln p(class) part is, both classes have same probability and thus cancel each other out):
This is the vector of the data points:
x = vector(SR,var('a,b'))
Here is the decision equation (if you wonder where the ln p(class) part is, both classes have same probability and thus cancel each other out):
decision1 =-0.5*log(det(cov1))-0.5*((x-mean1)*cov1.inverse()*(x-mean1))
decision2 =-0.5*log(det(cov2))-0.5*((x-mean2)*cov2.inverse()*(x-mean2))
If decision1(data point) > decision2(data point), then the data point belongs to class 1. In order to get the decision boundary, decision1 - decision2 == 0. The data points are RBG images. Thus, a in the data vector x is 0:255. I solve the equation for b:
solve(decision1-decision2==0,b)
In R, that looks for the original data set like this:
m_1<-c(70,130) ; m_2<-c(148,160)
covma_1<-matrix(c(784,-546,-546,900),nrow=2,ncol=2,byrow=TRUE)
covma_2<-matrix(c(484,285.1,285.1,324),nrow=2,ncol=2,byrow=TRUE)
library(rSymPy)
c11 <- Var("c11")
c12 <- Var("c12")
c13 <- Var("c13")
c14 <- Var("c14")
sympy("covma_1 = Matrix([[c11,c12], [c13,c14]])")
a <- Var("a")
b <- Var("b")
sympy("x = Matrix([a,b])")
m11 <- Var("m11")
m12 <- Var("m12")
sympy("m_1 = Matrix([m11,m12])")
sympy("covma_1=covma_1.subs(c11,784)")
sympy("covma_1=covma_1.subs(c12,-546)")
sympy("covma_1=covma_1.subs(c13,-546)")
sympy("covma_1=covma_1.subs(c14,900)")
sympy("m_1=m_1 .subs(m11,70)")
sympy("m_1=m_1 .subs(m12,130)")
first <-sympy("-0.5*log(covma_1.det())")
second <-sympy("-0.5*((x-m_1).T*covma_1.inv()*(x-m_1))")
second<-gsub("\\[","",second)
second<-gsub("\\]","",second)
c21 <- Var("c21")
c22 <- Var("c22")
c23 <- Var("c23")
c24 <- Var("c24")
sympy("covma_2 = Matrix([[c21,c22], [c23,c24]])")
m21 <- Var("m21")
m22 <- Var("m22")
sympy("m_2 = Matrix([m21,m22])")
sympy("covma_2=covma_2.subs(c21,484)")
sympy("covma_2=covma_2.subs(c22,285.1)")
sympy("covma_2=covma_2.subs(c23,285.1)")
sympy("covma_2=covma_2.subs(c24,324)")
sympy("m_2=m_2.subs(m21,148)")
sympy("m_2=m_2.subs(m22,160)")
third <-sympy("-0.5*log(covma_2.det())")
fourth <-sympy("-0.5*((x-m_2).T*covma_2.inv()*(x-m_2))")
fourth<-gsub("\\[","",fourth)
fourth<-gsub("\\]","",fourth)
class1 <- paste(c(first,second),collapse="")
class2 <- paste(c(third,fourth),collapse="")
sympy(paste(c("hm=solve(",class2,"-","(",class1,")",",b)"), collapse = ""))
As you can see, I use very nasty string operations to parse into sympy. Anyway, after solving for b in sympy, I stuck and don't know how to get numeric values. Can somebody tell me how to solve symbolically for b and plot it in a loop for 1000 data sets? I m also open for non-symbolic approaches.
Any help is appreciated!

Resources