How to find the range of kmeans clusters? - r

If I have this:
x = c(0.5,0.1,0.3,6,5,2,1,4,2,1,0.9,3,6,99,22,11,44,55)
apply kmeans
kmeans(x, centers=5, iter.max = 10, nstart = 1)
this gives:
Cluster means:
[,1]
1 99.00
2 49.50
3 22.00
4 7.00
5 1.48
Clustering vector:
[1] 5 5 5 4 4 5 5 5 5 5 5 5 4 1 3 4 2 2
Now I want to classify the values in x2 based on the clusters of x (1, 2, 3, 4, 5). How can I do this?
x2 = c(0.3,1,3,0.66,0.5,0.2,0.1,64,92,21,0.93,93,6,99,22,11,44,55)

Here is a naive approach based on the distance to the centroid of each cluster:
km <- kmeans(x, centers=5, iter.max = 10, nstart = 1)
group <- sapply(x2, function(xx) which.min(abs(km$centers - xx)))
plot(x = x, y = rep(1, length(x)), col = km$cluster)
points(x = km$centers, y = rep(1, length(km$centers)), col = "purple", pch = "*")
points(x = x2, y = rep(1, length(x2)), col = group, pch = "+")
Please check the link provided by @neilfws about making predictions with kmeans.
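If your data had more than one column, the same nearest-centroid idea carries over by computing each new row's Euclidean distance to every centre. A minimal sketch only, where assign_cluster and newdata are hypothetical names (newdata is a data frame with the same columns the model was fitted on):
assign_cluster <- function(newdata, centers) {
  # for each row, pick the centre with the smallest squared Euclidean distance
  apply(newdata, 1, function(r) which.min(colSums((t(centers) - r)^2)))
}
# group <- assign_cluster(newdata, km$centers)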

There is a predict method for kmeans clusters in the fdm2id package. Load the package and type ?KMEANS and ?predict.kmeans for more information.
library(data.table)
library(fdm2id)
#
x <- c(0.5,0.1,0.3,6,5,2,1,4,2,1,0.9,3,6,99,22,11,44,55)
dt <- data.table(x2 = c(0.3,1,3,0.66,0.5,0.2,0.1,64,92,21,0.93,93,6,99,22,11,44,55))
dt[, pred:=predict(KMEANS(x, k = 5), newdata=x2)]
Note that the cluster numbers are arbitrary: they merely distinguish between clusters, so the numbering might not match, but the cluster membership should align.
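As a quick sanity check that the memberships line up even when the labels differ, you can cross-tabulate the two labelings. A small sketch, re-fitting km as in the first answer (note that with nstart = 1 the two fits are not guaranteed to converge to exactly the same partition):
km <- kmeans(x, centers = 5, iter.max = 10, nstart = 1)
# if the partitions agree, each row of the table has a single non-zero cell
table(original = km$cluster, fdm2id = predict(KMEANS(x, k = 5), newdata = x))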

Related

Calculate y value for distribution fitting functions R

I am plotting curves for different distribution functions and I need to know the highest y-value of each curve. Later I will plot only the one curve that is selected as the best fit.
This is the function (it is a bit hard-coded, I am working on it):
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (any(TLT <= 0)) {TLT[TLT <= 0] <- 0.001} # replace values <= 0 with 0.001 so the log-based fits work
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white'
# ylim= c(0,max_y)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2]),
add=TRUE, col='blue', lwd=2)
curve(dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2]),
add=TRUE, col='darkgreen', lwd=2)
curve(dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2]),
add=TRUE, col='purple', lwd=2)
curve(dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2]),
add=TRUE, col='Gold', lwd=2)
curve(dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3]),
add=TRUE, col='red', lwd=2)
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[['lnorm']],
norm = Loglike[['norm']],
weibul = Loglike[['weibull']],
gamma = Loglike[['gamma']],
GEV = Loglike[['gev']]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
I was trying to play with max_y and then use it in ylim. I could do it only for the normal density, not for the rest of the curves.
Currently the plot looks like this (some curves are cut off):
If I set ylim = c(0, 2), we can see that the lognormal and gamma densities go beyond 1:
I need to know the max value of each curve so that, when I choose which curve will be printed, I can set the correct ylim.
You could use purrr::map_dbl to map optimize over your densities, provided you rearrange your code slightly and have an idea of the input interval over which each maximum should be searched.
You can parameterize your densities ahead of time; that way you can find their peak values using optimize and also pass them to the curve function.
As a small reproducible example:
library(purrr)
# parameterize your densities
mynorm <- function(x) dnorm(x, mean = 0, sd = 1)
mygamma <- function(x) dgamma(x, rate = .5, shape = 1)
# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))
# 0.4999811
# plot data
curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
curve(mygamma, col = "red", lwd = 2, add = T)
Using your code, I've implemented the above solution and adjusted the x grid of the curve function. Following our discussion in the comments, this should make things clearer and show you what you should actually be plotting:
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (any(TLT <= 0)) {TLT[TLT <= 0] <- 0.001} # replace values <= 0 with 0.001 so the log-based fits work
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
# parameterize your densities before plotting
mynorm <- function(x) {
dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2])
}
mylnorm <- function(x){
dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2])
}
myweibull <- function(x) {
dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2])
}
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
}
mygev <- function(x){
dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3])
}
distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
# get the max of each density
y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)
# find the max (excluding infinity)
ymax <- max(y[abs(y) < Inf])
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white',
ylim= c(0, ymax)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(mynorm,
add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
curve(mylnorm,
add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
curve(myweibull,
add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
curve(mygamma,
add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
curve(mygev,
add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[['lnorm']],
norm = Loglike[['norm']],
weibul = Loglike[['weibull']],
gamma = Loglike[['gamma']],
GEV = Loglike[['gev']]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
Why your plot height isn't matching the solution output
To illustrate further what's going on with your plot, and to clear up some of the confusion, you need to understand how the curve function plots your data. By default curve takes 101 x-values, evaluates your function to get the y-values, and then plots those points as a line. Because the peaks of some of your densities are so sharp, the curve function isn't evaluating enough x-values to capture the peaks. To show you what I mean I will focus on your gamma density. Don't worry about the code as much as the output. Below are the first few (x, y) coordinates for different values of n.
library(purrr)
library(dplyr) # for the pipe and the verbs below
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1], # 0.6225622
rate=fit[['gamma']]$estimate[2]) # 0.3568242
}
number_of_x <- c(5, 10, 101, 75000)
purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>%
dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>%
dplyr::mutate(n = factor(n, unique(n))) %>%
dplyr::filter(x > 0) %>%
dplyr::group_by(n) %>%
dplyr::slice_min(order_by = x, n = 5)
n x y
<fct> <dbl> <dbl>
1 n = 5 1.5 0.184
2 n = 5 3 0.0828
3 n = 5 4.5 0.0416
4 n = 5 6 0.0219
5 n = 10 0.667 0.336
6 n = 10 1.33 0.204
7 n = 10 2 0.138
8 n = 10 2.67 0.0975
9 n = 10 3.33 0.0707
10 n = 101 0.06 1.04
11 n = 101 0.12 0.780
12 n = 101 0.18 0.655
13 n = 101 0.24 0.575
14 n = 101 0.3 0.518
15 n = 75000 0.0000800 12.9
16 n = 75000 0.000160 9.90
17 n = 75000 0.000240 8.50
18 n = 75000 0.000320 7.62
19 n = 75000 0.000400 7.01
Notice that when n = 5 you have very few values plotted. As n increases, the distance between the x-values gets smaller. Since these functions are continuous, there is an infinite number of points to plot, but that cannot be done computationally, so a subset of x-values is plotted as an approximation. The more x-values, the better the approximation. Normally the default n = 101 works fine, but because the gamma and log-normal densities have such sharp peaks, the curve function is stepping over the maximum value. Below is a full plot of the data for n = 5, 10, 101, 75000 with points added.
Finally, I used this solution, found here:
mygamma <- function(x) dgamma(x, shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
get_curve_values <- function(fn, x_data){
res <- curve(fn, from=0, to=x_data) # draw the curve and keep the evaluated (x, y) points
dev.off() # close the device opened by curve(); only the values are needed
res
}
curve_val <- get_curve_values(mygamma, x_data)
ylim <- max(curve_val$y,na.rm = TRUE)
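If you'd rather not open and close a graphics device just to harvest the y-values, the same maximum can be computed directly on the grid that curve() would use. A small sketch, assuming mygamma and x_data from above; non-finite values (e.g. a gamma density with shape < 1 evaluated at x = 0) are dropped:
xs <- seq(0, x_data, length.out = 101) # the grid curve() evaluates by default
ys <- mygamma(xs)
ylim <- max(ys[is.finite(ys)])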

Creating stimuli in R with ggplot

I am trying to generate my own stimuli for an experiment using R. Below is the code that creates my (x, y) coordinates using rnorm() with a sample size of 100 and different means and SDs. I also create another variable to represent the size of the circles, which is determined by runif().
dt <- data.frame(x = NA,
y = NA,
size = NA,
M = NA,
sd = NA,
col = NA,
iter = NA)
sa<-0
mySD<-c(5, 15)
myMeans<-c(35, 45)
colors<-c("Blues", "Reds")
for(i in 1:10){
for(s in mySD){
for(m in myMeans){
x = abs(rnorm(n=1, mean=m, sd=s))
y = abs(rnorm(n=1, mean=m, sd=s))
size = runif(1, 1, 25) # select a random size between 1 and 25
sa<-sa+1
dt[sa,] <- NA
dt$x[sa]<-x
dt$y[sa]<-y
dt$M[sa]<-m
dt$sd[sa]<-s
dt$size[sa]<-size
dt$iter[sa]<-i
}
}
}
Next, I want to use ggplot(dt, aes(x, y, size = size)) to plot. I want to randomly select 4 (x, y) values to plot for one graph, then 8 for another, then 16 for another, etc. Basically, I want to plot different graphs with a different number of data points. For example, some graphs would have 4 data points that vary by size and color, others would have 32 data points that vary in size and color. I'm not sure how to select a set of unique data points from the data frame that I created. Any help would be great. I'm pretty new to R.
Here are two ways, depending on whether you want each group to exclude points that appear in any other group.
I'll just use a dummy data frame that just has columns x, y, and size.
library(tidyverse)
dt <- tibble(x = runif(100), y = runif(100), size = runif(100))
Allowing groups to share the same points
Create a vector for the size of each group.
sample_sizes <- 2^(seq_len(4) + 1)
sample_sizes
#> [1] 4 8 16 32
Randomly sample the data frame and add a group column.
sampled <- map_dfr(
sample_sizes,
~sample_n(dt, .),
.id = "group"
)
Plot using facets.
ggplot(sampled, aes(x, y, size = size)) +
geom_point() +
facet_wrap(~group)
Requiring groups to have different points
First, we need a way to generate four 1s, eight 2s etc. This can be done using log2 and some tricks.
groups <- floor(log2(seq_len(nrow(dt)) + 3)) - 1
groups
#> [1] 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4
#> [36] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5
#> [71] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
Shuffle this vector and add it as a column.
dt$group <- sample(groups)
Facet using this new column to generate the desired plots.
ggplot(dt, aes(x, y, size = size)) +
geom_point() +
facet_wrap(~group)
First of all, the question's data creation code can be greatly simplified, rewritten with no loops at all. R is a vectorized language and the following will create a data frame with the same structure.
Don't forget to set the RNG seed, in order to make the results reproducible.
library(ggplot2)
set.seed(2020) # make the results reproducible
mySD <- c(5, 15)     # from the question
myMeans <- c(35, 45) # from the question
sd <- rep(rep(mySD, each = 2), 10)
M <- rep(myMeans, 2*10)
x <- abs(rnorm(n = 40, mean = M, sd = sd))
y <- abs(rnorm(n = 40, mean = M, sd = sd))
size <- runif(40, 1, 25)
iter <- seq_along(x)
dt2 <- data.frame(x, y, size, M, sd, iter)
dt2$col <- c("blue", "red")
Now the plots. The following function accepts a data frame X as its first argument and a number of points to draw as the second. It then plots n points chosen at random, colored by col and sized by the continuous variable size.
plot_fun <- function(X, n){
Colors <- unique(X[["col"]])
Colors <- setNames(Colors, Colors)
i <- sample(nrow(X), n)
g <- ggplot(X[i,], aes(x, y, size = size, color = col)) +
geom_point() +
scale_color_manual(values = Colors) +
theme_bw()
g
}
plot_fun(dt2, 8)
To plot several values for n, produce the plots with lapply then use grid.arrange from package gridExtra.
plot_list <- lapply(c(4,8,16,32), function(n) plot_fun(dt2, n))
gridExtra::grid.arrange(grobs = plot_list)
Individual plots are still possible with
plot_list[[1]]
plot_list[[2]]
and so on.
Another way is to use faceting. Write another function, plot_fun_facets, that assigns the number of points to a new variable n in each sampled data frame and uses that variable as the faceting variable.
plot_fun_facets <- function(X, n){
Colors <- unique(X[["col"]])
Colors <- setNames(Colors, Colors)
X_list <- lapply(n, function(.n){
i <- sample(nrow(X), .n)
Y <- X[i,]
Y$n <- .n
Y
})
X <- do.call(rbind, X_list)
g <- ggplot(X, aes(x, y, size = size, color = col)) +
geom_point() +
scale_color_manual(values = Colors) +
facet_wrap(~ n) +
theme_bw()
g
}
plot_fun_facets(dt2, c(4,8,16,32))

Kmeans: Wrong size of clusters

I am running the k-means algorithm in R on the Heart Disease UCI dataset. I am supposed to get 2 clusters of sizes 138 and 165, matching the two classes in the data set.
Steps:
Store dataset in a data frame:
df <- read.csv(".../heart.csv",fileEncoding = "UTF-8-BOM")
Extract the features:
features = subset(df, select = -target)
Normalize it:
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
features = data.frame(sapply(features, normalize))
Run the algorithm:
set.seed(0)
cluster = kmeans(features, 2)
cluster$size
Output:
[1] 99 204
Why?
It seems like you're focusing on the size of the clusters rather than the accuracy of your predictions. You may well get two clusters of size (138, 165) but not necessarily the same clusters as the 'target' column in the data.
A better way of judging performance is the accuracy of your predictions. In your case, your model accuracy is 72%. You can see this by:
library(caret) # for confusionMatrix()
df$label <- cluster$cluster - 1
confusionMatrix(table(df$target, df$label))
#Confusion Matrix and Statistics
#
# 0 1
# 0 76 62
# 1 23 142
#
# Accuracy : 0.7195
# ...
I was able to get better accuracy by standardizing the data rather than normalizing it, possibly because standardization is more robust to outliers.
I also dummy-coded the categorical-looking variables, which seems to have improved the accuracy. We now have 85% accuracy and the cluster sizes are closer to what we expect (143, 160), although, as discussed, cluster size on its own is meaningless.
library(dplyr)
library(fastDummies)
library(caret)
standardize <- function(x){
num <- x - mean(x, na.rm=T)
denom <- sd(x, na.rm=T)
num/denom
}
# dummy-code and standardize
features <- select(df, -target) %>%
dummy_cols(select_columns = c('cp','thal', 'ca'),
remove_selected_columns = T,remove_first_dummy = T) %>%
mutate_all(standardize)
set.seed(0)
cluster <- kmeans(features, centers = 2, nstart = 50)
cluster$size
# 143 160
# check predictions vs actual labels
df$label <- cluster$cluster -1
confusionMatrix(table(df$target, df$label))
#Confusion Matrix and Statistics
#
#
# 0 1
# 0 117 21
# 1 26 139
#
# Accuracy : 0.8449
Of course, there are other accuracy metrics worth considering, such as out-of-sample accuracy (split your data into training and test sets, then calculate the accuracy of predictions on the test set) and the F1 score.
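As a rough illustration of the out-of-sample idea, you could hold out part of the data, fit k-means on the rest, and assign each held-out row to its nearest centroid. A sketch only, reusing features and df from above; the 70/30 split is arbitrary and the 0/1 labels may need flipping, since k-means labels are arbitrary:
set.seed(0)
train_idx <- sample(nrow(features), size = round(0.7 * nrow(features)))
train <- features[train_idx, ]
test <- features[-train_idx, ]
km <- kmeans(train, centers = 2, nstart = 50)
# assign each test row to the nearest training centroid
nearest <- apply(test, 1, function(r) which.min(colSums((t(km$centers) - r)^2)))
test_pred <- nearest - 1
mean(test_pred == df$target[-train_idx]) # out-of-sample accuracy (up to label flipping)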
Here is an example that should help you get things straightened out.
library(tidyverse) # data manipulation
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
df <- USArrests
df <- na.omit(df)
df <- scale(df)
distance <- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)
fviz_cluster(k2, data = df)
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
set.seed(123)
# function to compute total within-cluster sum of square
wss <- function(k) {
kmeans(df, k, nstart = 10 )$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
# extract wss for 1-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
# Compute k-means clustering with k = 4
set.seed(123)
final <- kmeans(df, 4, nstart = 25)
print(final)
fviz_cluster(final, data = df)
https://uc-r.github.io/kmeans_clustering

How to add a mean point to a scatter plot for each cluster individually?

I have plotted the following data in a scatter plot:
X Y
90 3
3 0
90 4
30 0
14 0
70 1
20 3
90 5
21 2
15 0
82 2
1 0
1 0
and performed k-means cluster analysis with k = 2
results = kmeans(data,2)
plot(data$X , data$Y,
xlab = "X" , ylab = "Y",
col = results$cluster , pch = 19)
This gives me the following scatter plot.
I have tried the following line of code, but it only shows me the mean of the entire dataset.
points(mean(data$X), mean(data$Y))
I want to know how to show the mean of each separate cluster. Thank you.
Something like this should work for any given number of clusters:
## Your code:
results = kmeans(data,2)
plot(data$X , data$Y,
xlab = "X" , ylab = "Y",
col = results$cluster , pch = 19)
## Add a cluster info in the data:
data$cluster <- results$cluster
## Compute the center of each cluster:
clusters <- split(data[, 1:2], data$cluster)
centers <- lapply(clusters, FUN = colMeans)
## Plot those centers:
for (k in 1:length(clusters)) {
points(x = centers[[k]][1],
y = centers[[k]][2],
col = k,
pch = 1
)
}
(This solution only uses functions from "base R".)
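Alternatively, kmeans() already stores the centroids in results$centers, so (assuming the columns of data are named X and Y as in the question) you can plot them directly without recomputing:
points(x = results$centers[, "X"],
y = results$centers[, "Y"],
col = seq_len(nrow(results$centers)), # same colour coding as the clusters
pch = 8, cex = 2)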

Convert 3 Matrices to 1 plot using ggplot2

I'm trying to combine 3 matrices into one plot.
I'm trying to simulate a mark-recapture scenario. However, instead of having 1 population, there are 3 (which are contained in each of their matrices).
Because I want to sample from each population once, the x-axis will range from 0 to 300: 1-100 on the x-axis will correspond to the samples collected from population 1, 101-200 from population 2, and 201-300 from population 3. The only deviation from the picture is that I'd like a continuous line from 0 to 300.
I have the code to create these matrices and made each matrix the same size, but I don't know how to 1) convert and plot them using ggplot2 and 2) put all three on one graph.
## Population size
N <- 400
N
## Vector labeling each item in the population
pop <- c(1:N)
pop
## Lower and upper bounds of sample size
lower.bound <- round(x = .05 * N, digits = 0)
lower.bound ## Smallest possible sample size
upper.bound <- round(x = .15 * N, digits = 0)
upper.bound ## Largest possible sample size
## Length of sample size interval
length.ss.interval <- length(c(lower.bound:upper.bound))
length.ss.interval ## total possible sample sizes, ranging form lower.bound to upper.bound
## Determine a sample size randomly (not a global variable...simply for test purposes)
## Between lower and upper bounds set previously
## Give equal weight to each possible sample size in this interval
sample(x = c(lower.bound:upper.bound),
size = 1,
prob = c(rep(1/length.ss.interval, length.ss.interval)))
## Specify number of samples to take
n.samples <- 100
## Initiate empty matrix
## 1st column is population (item 1 through item 400)
## 2nd through nth column are all rounds of sampling
dat <- matrix(data = NA,
nrow = length(pop),
ncol = n.samples + 1)
dat[,1] <- pop
## Take samples of random sizes
## Record results in columns 2 through n
## 1 = sampled (marked)
## 0 = not sampled (not marked)
for(i in 2:ncol(dat)) {
a.sample <- sample(x = pop,
size = sample(x = c(lower.bound:upper.bound),
size = 1,
prob = c(rep(1/length.ss.interval, length.ss.interval))),
replace = FALSE)
dat[,i] <- dat[,1] %in% a.sample
}
## How large was each sample size?
apply(X = dat, MARGIN = 2, FUN = sum)
## 1st element is irrelevant
## 2nd element through nth element: sample size for each of the 100 samples
schnabel.comp <- data.frame(sample = 1:n.samples,
n.sampled = apply(X = dat, MARGIN = 2, FUN = sum)[2:length(apply(X = dat, MARGIN = 2, FUN = sum))]
)
## First column: which sample, 1-100
## Second column: number selected in that sample
## How many items were previously sampled?
## For 1st sample, it's 0
## For 2nd sample, the code is different than for the remaining samples
n.prev.sampled <- c(0, rep(NA, n.samples-1))
n.prev.sampled
n.prev.sampled[2] <- sum(ifelse(test = dat[,3] == 1 & dat[,2] == 1,
yes = 1,
no = 0))
n.prev.sampled
for(i in 4:ncol(dat)) {
n.prev.sampled[i-1] <- sum(ifelse(test = dat[,i] == 1 & rowSums(dat[,2:(i-1)]) > 0,
yes = 1,
no = 0))
}
schnabel.comp$n.prev.sampled <- n.prev.sampled
## n.newly.sampled: in each sample, how many items were newly sampled?
## i.e., never seen before?
schnabel.comp$n.newly.sampled <- with(schnabel.comp,
n.sampled - n.prev.sampled)
## cum.sampled: how many total items have you seen?
schnabel.comp$cum.sampled <- c(0, cumsum(schnabel.comp$n.newly.sampled)[2:n.samples-1])
## numerator of schnabel formula
schnabel.comp$numerator <- with(schnabel.comp,
n.sampled * cum.sampled)
## denominator of Schnabel formula is n.prev.sampled
## pop.estimate -- after each sample (starting with 2nd -- need at least two samples)
schnabel.comp$pop.estimate <- NA
for(i in 1:length(schnabel.comp$pop.estimate)) {
schnabel.comp$pop.estimate[i] <- sum(schnabel.comp$numerator[1:i]) / sum(schnabel.comp$n.prev.sampled[1:i])
}
## Plot population estimate after each sample
if (!require("ggplot2")) {install.packages("ggplot2"); require("ggplot2")}
if (!require("scales")) {install.packages("scales"); require("scales")}
small.sample.dat <- schnabel.comp
small.sample <- ggplot(data = small.sample.dat,
mapping = aes(x = sample, y = pop.estimate)) +
geom_point(size = 2) +
geom_line() +
geom_hline(yintercept = N, col = "red", lwd = 1) +
coord_cartesian(xlim = c(0:100), ylim = c(300:500)) +
scale_x_continuous(breaks = pretty_breaks(11)) +
scale_y_continuous(breaks = pretty_breaks(11)) +
labs(x = "\nSample", y = "Population estimate\n",
title = "Sample sizes are between 5% and 15%\nof the population") +
theme_bw(base_size = 12) +
theme(aspect.ratio = 1)
small.sample
It seems that what you want to do is...
Given three data frames like this:
> d1
x y
1 1 0.899683096
2 2 0.604513234
3 3 0.005824789
4 4 0.442692758
5 5 0.103125175
> d2
x y
1 1 0.35260029
2 2 0.06248654
3 3 0.79272047
> d3
x y
1 1 0.4791399
2 2 0.2583674
3 3 0.1283629
4 4 0.7133847
Construct d:
> d = rbind(d1,d2,d3)
> d$x = 1:nrow(d)
> d
x y
1 1 0.899683096
2 2 0.604513234
3 3 0.005824789
4 4 0.442692758
5 5 0.103125175
6 6 0.352600287
7 7 0.062486543
8 8 0.792720473
9 9 0.479139947
10 10 0.258367356
11 11 0.128362933
12 12 0.713384651
And then plot x against y as normal.
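For completeness, a minimal sketch of that plotting step with ggplot2 (assuming d built as above):
library(ggplot2)
ggplot(d, aes(x = x, y = y)) +
geom_line() +
geom_point()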
