Convert 3 Matrices to 1 plot using ggplot2 - r

I'm trying to combine 3 matrices to one plot.
I'm trying to simulate a mark-recapture scenario. However, instead of having 1 population, there are 3 (which are contained in each of their matrices).
Because I want to sample from each population once, the x-axis will range from 0-300. However, 1-100 on the x-axis will correspond to the samples collected from population:
101-200 from population 2
201-300 from population 3. The only deviation from the picture is that I'd like a continuous line, from 0-300.
I have the code to create these matrices and made each matrix the same size, but I don't know how to 1) convert and plot them using ggplot2 2) put all three on one graph
## Population size
N <- 400
N
## Vector labeling each item in the population
pop <- c(1:N)
pop
## Lower and upper bounds of sample size
lower.bound <- round(x = .05 * N, digits = 0)
lower.bound ## Smallest possible sample size
upper.bound <- round(x = .15 * N, digits = 0)
upper.bound ## Largest possible sample size
## Length of sample size interval
length.ss.interval <- length(c(lower.bound:upper.bound))
length.ss.interval ## total possible sample sizes, ranging form lower.bound to upper.bound
## Determine a sample size randomly (not a global variable...simply for test purposes)
## Between lower and upper bounds set previously
## Give equal weight to each possible sample size in this interval
sample(x = c(lower.bound:upper.bound),
size = 1,
prob = c(rep(1/length.ss.interval, length.ss.interval)))
## Specify number of samples to take
n.samples <- 100
## Initiate empty matrix
## 1st column is population (item 1 thorugh item 400)
## 2nd through nth column are all rounds of sampling
dat <- matrix(data = NA,
nrow = length(pop),
ncol = n.samples + 1)
dat[,1] <- pop
## Take samples of random sizes
## Record results in columns 2 through n
## 1 = sampled (marked)
## 0 = not sampled (not marked)
for(i in 2:ncol(dat)) {
a.sample <- sample(x = pop,
size = sample(x = c(lower.bound:upper.bound),
size = 1,
prob = c(rep(1/length.ss.interval, length.ss.interval))),
replace = FALSE)
dat[,i] <- dat[,1] %in% a.sample
}
## How large was each sample size?
apply(X = dat, MARGIN = 2, FUN = sum)
## 1st element is irrelevant
## 2nd element through nth element: sample size for each of the 100 samples
schnabel.comp <- data.frame(sample = 1:n.samples,
n.sampled = apply(X = dat, MARGIN = 2, FUN = sum)[2:length(apply(X = dat, MARGIN = 2, FUN = sum))]
)
## First column: which sample, 1-100
## Second column: number selected in that sample
## How many items were previously sampled?
## For 1st sample, it's 0
## For 2nd sample, code is different than for remaning samples
n.prev.sampled <- c(0, rep(NA, n.samples-1))
n.prev.sampled
n.prev.sampled[2] <- sum(ifelse(test = dat[,3] == 1 & dat[,2] == 1,
yes = 1,
no = 0))
n.prev.sampled
for(i in 4:ncol(dat)) {
n.prev.sampled[i-1] <- sum(ifelse(test = dat[,i] == 1 & rowSums(dat[,2:(i-1)]) > 0,
yes = 1,
no = 0))
}
schnabel.comp$n.prev.sampled <- n.prev.sampled
## n.newly.sampled: in each sample, how many items were newly sampled?
## i.e., never seen before?
schnabel.comp$n.newly.sampled <- with(schnabel.comp,
n.sampled - n.prev.sampled)
## cum.sampled: how many total items have you seen?
schnabel.comp$cum.sampled <- c(0, cumsum(schnabel.comp$n.newly.sampled)[2:n.samples-1])
## numerator of schnabel formula
schnabel.comp$numerator <- with(schnabel.comp,
n.sampled * cum.sampled)
## denominator of schnable formula is n.prev.sampled
## pop.estimate -- after each sample (starting with 2nd -- need at least two samples)
schnabel.comp$pop.estimate <- NA
for(i in 1:length(schnabel.comp$pop.estimate)) {
schnabel.comp$pop.estimate[i] <- sum(schnabel.comp$numerator[1:i]) / sum(schnabel.comp$n.prev.sampled[1:i])
}
## Plot population estimate after each sample
if (!require("ggplot2")) {install.packages("ggplot2"); require("ggplot2")}
if (!require("scales")) {install.packages("scales"); require("scales")}
small.sample.dat <- schnabel.comp
small.sample <- ggplot(data = small.sample.dat,
mapping = aes(x = sample, y = pop.estimate)) +
geom_point(size = 2) +
geom_line() +
geom_hline(yintercept = N, col = "red", lwd = 1) +
coord_cartesian(xlim = c(0:100), ylim = c(300:500)) +
scale_x_continuous(breaks = pretty_breaks(11)) +
scale_y_continuous(breaks = pretty_breaks(11)) +
labs(x = "\nSample", y = "Population estimate\n",
title = "Sample sizes are between 5% and 15%\nof the population") +
theme_bw(base_size = 12) +
theme(aspect.ratio = 1)
small.sample

It seems that what you want to do is...
Given three data frames like this:
> d1
x y
1 1 0.899683096
2 2 0.604513234
3 3 0.005824789
4 4 0.442692758
5 5 0.103125175
> d2
x y
1 1 0.35260029
2 2 0.06248654
3 3 0.79272047
> d3
x y
1 1 0.4791399
2 2 0.2583674
3 3 0.1283629
4 4 0.7133847
Construct d:
> d = rbind(d1,d2,d3)
> d$x = 1:nrow(d)
> d
x y
1 1 0.899683096
2 2 0.604513234
3 3 0.005824789
4 4 0.442692758
5 5 0.103125175
6 6 0.352600287
7 7 0.062486543
8 8 0.792720473
9 9 0.479139947
10 10 0.258367356
11 11 0.128362933
12 12 0.713384651
And then plot x against y as normal.

Related

How to find the range of kmeans clusters?

if I have this
x = c(0.5,0.1,0.3,6,5,2,1,4,2,1,0.9,3,6,99,22,11,44,55)
apply kmeans
kmeans(x, centers=5, iter.max = 10, nstart = 1)
this gives:
Cluster means:
[,1]
1 99.00
2 49.50
3 22.00
4 7.00
5 1.48
Clustering vector:
[1] 5 5 5 4 4 5 5 5 5 5 5 5 4 1 3 4 2 2
Now I want to classify my values in x2 based on the clusters of x (1,2,3,4,5). How to do this?
x2 = c(0.3,1,3,0.66,0.5,0.2,0.1,64,92,21,0.93,93,6,99,22,11,44,55)
Here is a naive approch based on distance to centroids of each cluster:
km <- kmeans(x, centers=5, iter.max = 10, nstart = 1)
group <- sapply(x2, function(xx) which.min(abs(km$centers - xx)))
plot(x = x, y = rep(1, length(x)), col = km$cluster)
points(x = km$centers, y = rep(1, length(km$centers)), col = "purple", pch = "*")
points(x = x2, y = rep(1, length(x2)), col = group, pch = "+")
Please check the link provided by #neilfws about doing predictions with kmeans.
There is a predict method for kmeans clusters in the fdm2id package. Load the package and type ?KMEANS and ?predict.kmeans for more information.
library(data.table)
library(fdm2id)
#
x <- c(0.5,0.1,0.3,6,5,2,1,4,2,1,0.9,3,6,99,22,11,44,55)
dt <- data.table(x2 = c(0.3,1,3,0.66,0.5,0.2,0.1,64,92,21,0.93,93,6,99,22,11,44,55))
dt[, pred:=predict(KMEANS(x, k = 5), newdata=x2)]
Note that the cluster numbers are arbitrary in the sense that they merely distinguish between clusters, so the numbering might not be the same but the cluster membership should align.

Creating stimuli in R with ggplot

I am trying to generate my own stimuli for an experiment using R. Below is the code that creates my (x,y) coordinates using the rnorm() with different a sample size of 100, different means and sd. I also create another variable to represent the size of the circles, which are determined by the runif().
dt <- data.frame(x = NA,
y = NA,
size = NA,
M = NA,
sd = NA,
col = NA,
iter = NA)
sa<-0
mySD<-c(5, 15)
myMeans<-c(35, 45)
colors<-c("Blues", "Reds")
for(i in 1:10){
for(s in mySD){
for(m in myMeans){
x = abs(rnorm(n=1, mean=m, sd=s))
y = abs(rnorm(n=1, mean=m, sd=s))
size = runif(1, 1, 25) #select a random x speed between [25,35]
sa<-sa+1
dt[sa,] <- NA
dt$x[sa]<-x
dt$y[sa]<-y
dt$M[sa]<-m
dt$sd[sa]<-s
dt$size[sa]<-size
dt$iter[sa]<-i
}
}
}
}
Next, I want to use ggplot(dt, aes(x, y, size=size) to plot. I want to randomly select 4 (x,y) values to plot for one graph, then 8 for another, then 16 for another, etc. Basically, I want to plot different graphs with a different number of data points. For example, some graphs that you would see would have 4 data points that vary by size and color, others would have 32 data points that vary in size and color. I m not sure how to select a set of unique data points from the data frame that I created. Any help would be great. I'm pretty new to R.
Here are two ways - depending if you wanted each group to not contain points from any other group.
I'll just use a dummy data frame that just has columns x, y, and size.
library(tidyverse)
dt <- tibble(x = runif(100), y = runif(100), size = runif(100))
Allowing groups to share the same points
Create a vector for the size of each group.
sample_sizes <- 2^(seq_len(4) + 1)
sample_sizes
#> [1] 4 8 16 32
Randomly sample the data frame and add a group column.
sampled <- map_dfr(
sample_sizes,
~sample_n(dt, .),
.id = "group"
)
Plot using facets.
ggplot(sampled, aes(x, y, size = size)) +
geom_point() +
facet_wrap(~group)
Requiring groups to have different points
First, we need a way to generate four 1s, eight 2s etc. This can be done using log2 and some tricks.
groups <- floor(log2(seq_len(nrow(dt)) + 3)) - 1
groups
#> [1] 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4
#> [36] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5
#> [71] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
Shuffle this vector and add it as a column.
dt$group <- sample(groups)
Facet using this new column to generate the desired plots.
ggplot(dt, aes(x, y, size = size)) +
geom_point() +
facet_wrap(~group)
First of all, the question's data creation code can be greatly simplified, rewritten with no loops at all. R is a vectorized language and the following will create a data frame with the same structure.
Don't forget to set the RNG seed, in order to make the results reproducible.
library(ggplot2)
set.seed(2020) # make the results reproducible
sd <- rep(rep(mySD, each = 2), 10)
M <- rep(myMeans, 2*10)
x <- abs(rnorm(n = 40, mean = M, sd = sd))
y <- abs(rnorm(n = 40, mean = M, sd = sd))
size <- runif(40, 1, 25)
iter <- seq_along(x)
dt2 <- data.frame(x, y, size, M, sd, iter)
dt2$col <- c("blue", "red")
Now the plots. The following function accepts a data frame X as its first argument and a number of points to draw as the second one. Then plots n points chosen at random with color col and size (a continuous variable) size.
plot_fun <- function(X, n){
Colors <- unique(X[["col"]])
Colors <- setNames(Colors, Colors)
i <- sample(nrow(X), n)
g <- ggplot(X[i,], aes(x, y, size = size, color = col)) +
geom_point() +
scale_color_manual(values = Colors) +
theme_bw()
g
}
plot_fun(dt2, 8)
To plot several values for n, produce the plots with lapply then use grid.arrange from package gridExtra.
plot_list <- lapply(c(4,8,16,32), function(n) plot_fun(dt2, n))
gridExtra::grid.arrange(grobs = plot_list)
Individual plots are still possible with
plot_list[[1]]
plot_list[[2]]
and so on.
Another way is to use faceting. Write another function, plot_fun_facets assigning the number of points to a new variable in the sample data frames, n, and use that variable as a faceting variable.
plot_fun_facets <- function(X, n){
Colors <- unique(X[["col"]])
Colors <- setNames(Colors, Colors)
X_list <- lapply(n, function(.n){
i <- sample(nrow(X), .n)
Y <- X[i,]
Y$n <- .n
Y
})
X <- do.call(rbind, X_list)
g <- ggplot(X, aes(x, y, size = size, color = col)) +
geom_point() +
scale_color_manual(values = Colors) +
facet_wrap(~ n) +
theme_bw()
g
}
plot_fun_facets(dt2, c(4,8,16,32))

How to mimic geom_boxplot() with outliers using geom_boxplot(stat = "identity")

I would like to pre-compute by-variable summaries of data (with plyr and passing a quantile function) and then plot with geom_boxplot(stat = "identity"). This works great except it (a) does not plot outliers as points and (b) extends the "whiskers" to the max and min of the data being plotted.
Example:
library(plyr)
library(ggplot2)
set.seed(4)
df <- data.frame(fact = sample(letters[1:2], 12, replace = TRUE),
val = c(1:10, 100, 101))
df
# fact val
# 1 b 1
# 2 a 2
# 3 a 3
# 4 a 4
# 5 b 5
# 6 a 6
# 7 b 7
# 8 b 8
# 9 b 9
# 10 a 10
# 11 b 100
# 12 a 101
by.fact.df <- ddply(df, c("fact"), function(x) quantile(x$val))
by.fact.df
# fact 0% 25% 50% 75% 100%
# 1 a 2 3.25 5.0 9.00 101
# 2 b 1 5.50 7.5 8.75 100
# What I can do...with faults (a) and (b) above
ggplot(by.fact.df,
aes(x = fact, ymin = `0%`, lower = `25%`, middle = `50%`,
upper = `75%`, ymax = `100%`)) +
geom_boxplot(stat = "identity")
# What I want...
ggplot(df, aes(x = fact, y = val)) +
geom_boxplot()
What I can do...with faults (a) and (b) mentioned above:
What I would like to obtain, but still leverage pre-computation via plyr (or other method):
Initial Thoughts: Perhaps there is some way to pre-compute the true end-points of the whiskers without the outliers? Then, subset the data for outliers and pass them as geom_point()?
Motivation: When working with larger datasets, I have found it faster and more practical to leverage plyr, dplyr, and/or data.table to pre-compute the stats and then plot them rather than having ggplot2 to the calculations.
UPDATE
I am able to extract what I need with the following mix of dplyr and plyr code, but I'm not sure if this is the most efficient way:
df %>%
group_by(fact) %>%
do(ldply(boxplot.stats(.$val), data.frame))
Source: local data frame [6 x 3]
Groups: fact
fact .id X..i..
1 a stats 2
2 a stats 4
3 a stats 10
4 a stats 13
5 a stats 16
6 a n 9
Here's my answer, using built-in functions quantile and boxplot.stats.
geom_boxplot does the calcualtions for boxplot slightly differently than boxplot.stats. Read ?geom_boxplot and ?boxplot.stats to understand my implementation below
#Function to calculate boxplot stats to match ggplot's implemention as in geom_boxplot.
my_boxplot.stats <-function(x){
quantiles <-quantile(x, c(0, 0.25, 0.5, 0.75, 1))
labels <-names(quantile(x))
#replacing the upper whisker to geom_boxplot
quantiles[5] <-boxplot.stats(x)$stats[5]
res <-data.frame(rbind(quantiles))
names(res) <-labels
res$out <-boxplot.stats(x)$out
return(res)
}
Code to calculate the stats and plot it
library(dplyr)
df %>% group_by(fact) %>% do(my_boxplot.stats(.$val)) %>%
ggplot(aes(x=fact, y=out, ymin = `0%`, lower = `25%`, middle = `50%`,
upper = `75%`, ymax = `100%`)) +
geom_boxplot(stat = "identity") + geom_point()
To get the correct statistics, you have to do some more calculations than just finding the quantiles. The geom_boxplot function with stat = "identity" does not draw the outliers. So you have to calculate the statistics without the outliers and then use geom_point to draw the outliers seperately. The following function (basically a simplified version of stat_boxplot) is probably not the most efficient, but it gives the desired result:
box.df <- df %>% group_by(fact) %>% do({
stats <- as.numeric(quantile(.$val, c(0, 0.25, 0.5, 0.75, 1)))
iqr <- diff(stats[c(2, 4)])
coef <- 1.5
outliers <- .$val < (stats[2] - coef * iqr) | .$val > (stats[4] + coef * iqr)
if (any(outliers)) {
stats[c(1, 5)] <- range(c(stats[2:4], .$val[!outliers]), na.rm=TRUE)
}
outlier_values = .$val[outliers]
if (length(outlier_values) == 0) outlier_values <- NA_real_
res <- as.list(t(stats))
names(res) <- c("lower.whisker", "lower.hinge", "median", "upper.hinge", "upper.whisker")
res$out <- outlier_values
as.data.frame(res)
})
box.df
## Source: local data frame [2 x 7]
## Groups: fact
##
## fact lower.whisker lower.hinge median upper.hinge upper.whisker out
## 1 a 2 3.25 5.0 9.00 10 101
## 2 b 1 5.50 7.5 8.75 9 100
ggplot(box.df, aes(x = fact, y = out, middle = median,
ymin = lower.whisker, ymax = upper.whisker,
lower = lower.hinge, upper = upper.hinge)) +
geom_boxplot(stat = "identity") +
geom_point()

how to detect peaks in a particular range in R

I have a time series and I would like to detect (and identify them) some peaks but only for a particular range in R.
here is an example
## generate test data with 3 peaks
set.seed(123)
x <- seq(0, 360, length = 20)
y <- abs(rnorm(20, mean = 1, sd = 0.1))
y[5:10] <- c(2, 4, 7, 3, 4, 2)
y <- c(y, 0.8 * y, 1.2 * y)
x <- seq(0, 360, along = y)
y[6] <- y[7] # test case with 2 neighbouring equal points
plot(x, y, type="b")
#
In that example, let says, I want to select peaks (y) only between 6 and 9 (2 peaks) or only between 2 and 4 (also 2 peaks).
I am aware of several packages in R detecting peaks (e.g. Peaks, pastecs, quantmod, pracma, splus2R) but none seems to have this feature, usually only having a minimum threshold.
Any advice would be appreciated.
thank you
Martin
Edit: The code provided by Eric works perfectly. But with my own datasets I have a small problem. What would you do to detect only one peak if same values twice in a certain window (x). Basically I would like to create a conditional statement that would say, you need a certain number of points (x) between peaks to be considered as two distinctive peaks.
Something like this gets close (not sure if you care about detecting the peak with two values twice).
# Reproduce your data
set.seed(123)
x <- seq(0, 360, length = 20)
y <- abs(rnorm(20, mean = 1, sd = 0.1))
y[5:10] <- c(2, 4, 7, 3, 4, 2)
y <- c(y, 0.8 * y, 1.2 * y)
x <- seq(0, 360, along = y)
y[6] <- y[7] # test case with 2 neighbouring equal points
plot(x, y, type="b")
# shift y up and down a position (for peak identification)
yu <- c(tail(y, -1), NA)
yd <- c(NA, head(y, -1))
# identify peaks that are in the correct range
# where y is higher than the point before and after
high <- which(y - yu >= 0 & y - yd >= 0 & y > 6 & y < 9)
low <- which(y - yu >= 0 & y - yd >= 0 & y >= 2 & y <= 4) # one peak is at 4
# plot lines at peaks
abline(v = x[high], col = 'blue')
abline(v = x[low], col = 'red')

plot linear discriminant analysis in R

I would like to plot two matrices in scatter plot diagram. How can I do that? I like that this plot looks like
I'm calculating linear disciminant analysis on two classes with Fischer's method. This is what I calculate:
XM1 <- matrix(data=c(4,2, 2,4, 2,3, 3,6, 4,4), ncol = 2, byrow = TRUE)
XM2 <- matrix(data=c(9,10, 6,8, 9,5, 8,7, 10,8), ncol = 2, byrow = TRUE)
mi1 <- apply(XM1, MARGIN = 2, FUN = "mean")
mi2 <- apply(XM2, MARGIN = 2, FUN = "mean")
Sb <- (mi1-mi2)%*%t(mi1-mi2)
sum.cov <- (cov(XM1)+cov(XM2))
SwSb <- solve(sum.cov)%*%Sb
eg <- eigen(SwSb)
How do I plot these two matrices (one with circles, second with squares) with abline (using eigenval result)?
Here is a ggplot2 solution. First you have to bring your data in a appropriate form:
mdf <- as.data.frame( rbind(XM1, XM2) )
names(mdf) <- c("x1", "x2")
mdf$f <- c( rep( "a", nrow(XM1) ), rep( "b", nrow(XM2) ) )
head(mdf)
x1 x2 f
1 4 2 a
2 2 4 a
3 2 3 a
4 3 6 a
5 4 4 a
6 9 10 b
And than this produces a plot similar to the one you showed:
library(ggplot2)
ggplot( mdf, aes(x=x1, y=x2, col=f) ) +
geom_point( size = 4, aes(shape = f) ) +
geom_abline( slope = eg$vectors[2,1] / eg$vectors[1,1], colour = "green" ) +
scale_shape_manual(values=c(16,15)) +
expand_limits( y = 0, x = 0) +
labs( title = paste("LDA projection vector with highest eigen value =", round(eg$values[1], 2)) ) +
theme_bw()

Resources