Specifying spatial coordinates in a data.frame - r

Pretty silly but I can't figure out what I'm doing wrong here:
I have a data.frame with 2 columns:
df = data.frame(x = rep(1, 20), y = runif(20, 10,20))
I then want to set x and y as spatial coordinates so I can plot df in a bubble plot. So I try:
coordinates(df) = c("x","y")
But then:
bubble(df)
gives this error:
Error in data.frame(x#data, x#coords) :
arguments imply differing number of rows: 0, 20

For bubble plot to be meaningful, you should probably create a SpatialPointsDataFrame.
library(sp)
df <- data.frame(x = rep(1, 20), y = runif(20, 10,20))
data <- data.frame(variable = runif(20))
coordinates(df) <- ~ x + y
out <- SpatialPointsDataFrame(df, data)
bubble(out)

library(sp)
set.seed(1)
df = data.frame(x = rep(1, 20), y = runif(20, 10, 20), dummy = rep(0, 20))
coordinates(df) = c("x","y")
bubble(df)

Related

Binning two columns of data frame together in R

I would like to bin two columns of a dataset simultaneously to create one common binned column. The simple code is as follows
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
Any help is appreciated!
Not sure if this is what you are looking for
library(tidyverse)
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data <- data%>%
dplyr::mutate(
x_binned = cut(x, breaks = seq(0,100,10)),
y_binned = cut(y, breaks = seq(0,100,10))
)
data %>%
ggplot() +
geom_bin_2d(
aes(x = x_binned, y = y_binned), binwidth = c(10,10), colour = "red") +
theme_minimal()
After asking in the comments I am still not quite shure, what the desired answer would look like but I hope, that one of the two answers in the below code will work for you:
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data$xbin <- cut(data$x, breaks = xbin, ordered = TRUE)
data$ybin <- cut(data$y, breaks = ybin, ordered = TRUE)
data$commonbin1 <- paste0(data$xbin, data$ybin)
data$commonbin2 <- paste0("(",as.numeric(data$xbin),";", as.numeric(data$ybin),")")
head(data, 20)
This will construct a common binning variable commonbin1 that includes the bin-limits in the names of the bins and commonbin2 which will be easier to compare to the plot mentioned in the comment.

Countor plot of bivariate normal functions

Could someone explain me why such function doesn't produce a countor plot as I expected.
I've a bivariate normal function whit:
means = c(5,1)
var_cov = matrix(c(2,1,1,1),2)
I'd like to plot its contour plot; I'm able to reach the result but I'd like to ask why in one case I don't get expected result.
Working Example:
library(MASS)
library(ggplot2)
N <- 100
set.seed(123)
var_cov_matrix <- matrix(c(2,1,1,1),2)
mean <- c(5,1)
bivariate_points <- expand.grid(s.1 = seq(-25, 25, length.out=N), s.2 = seq(-25, 25, length.out=N))
z <- mvtnorm::dmvnorm(bivariate_points, mean = mean, sigma = var_cov_matrix)
data <- cbind(bivariate_points,z)
colnames(data) <- c("X1","X2","Z")
data.df <- as.data.frame(data)
ggplot() +
geom_contour(data=data.df,aes(x=X1,y=X2,z=Z))
Non Working Example:
library(MASS)
library(ggplot2)
N <- 100
set.seed(123)
var_cov_matrix <- matrix(c(2,1,1,1),2)
mean <- c(5,1)
bivariate_points <- mvrnorm(N, mu = mean, Sigma = var_cov_matrix ) # <---- EDITED
z <- mvtnorm::dmvnorm(bivariate_points, mean = mean, sigma = var_cov_matrix)
data <- cbind(bivariate_points,z)
colnames(data) <- c("X1","X2","Z")
data.df <- as.data.frame(data)
ggplot() +
geom_contour(data=data.df,aes(x=X1,y=X2,z=Z))
In your non-working example, since you don't have regular grid for contour plot, you can use stat_density2d instead, i.e.,
ggplot(data.df, aes(x = X1, y = X2, z = Z)) +
geom_point(aes(colour = z)) +
stat_density2d()

Differences between plotting contour() function in base R and using geom_contour() or stat_contour() in ggplot2

I have plotted a density function in base R and I would like to replicate the plot in ggplot2.
This is the plot in base R:
library(tidyverse)
library(mvtnorm)
sd <- 1 / 2
# sigma
s1 <- sd^2
# first two vectors
x.points <- seq(-3, 3, length.out = 100)
y.points <- seq(-3, 3, length.out = 100)
# the third vector is a density
z <- matrix(0, nrow = 100, ncol = 100)
mu1 <- c(0, 0)
sigma1 <- matrix(c(s1^2, 0, 0, s1^2), nrow = 2)
for (i in 1:100) {
for (j in 1:100) {
z[i, j] <- dmvnorm(c(x.points[i], y.points[j]),
mean = mu1, sigma = sigma1
)
}
}
contour(x.points, y.points, z, xlim = range(-3, 3), ylim = c(-3, 3), nlevels = 5, drawlabels = TRUE)
To obtain the same result in ggplot2, I am following this example:
library(ggplot2)
library(reshape2) # for melt
volcano3d <- melt(volcano)
names(volcano3d) <- c("x", "y", "z")
# Basic plot
v <- ggplot(volcano3d, aes(x, y, z = z))
v + stat_contour()
But in my case vector z has a different length than x.points and y.points. From the errors I get below, it looks like the three vectors should have the same length. How can I transform the dataset presented above so that it can be run through ggplot2?
data1 <- as.data.frame(cbind(x.points, y.points))
p <- ggplot(data = data1, mapping = aes(x.points, y.points, z=z))
p + geom_contour()
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
p + stat_contour()
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
p + stat_function(fun = contour) + xlim(-3,3)
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
Created on 2021-04-08 by the reprex package (v0.3.0)
The problem is likely that your data isn't in long format: for every value of the z matrix, you need the x and y position, which is different from the base R approach, wherein you just need these positions for every row/column.
We can transform the matrix z to a long format using reshape2::melt and then grab the correct positions from your vectors.
library(tidyverse)
library(mvtnorm)
sd <- 1 / 2
# sigma
s1 <- sd^2
# first two vectors
x.points <- seq(-3, 3, length.out = 100)
y.points <- seq(-3, 3, length.out = 100)
# the third vector is a density
z <- matrix(0, nrow = 100, ncol = 100)
mu1 <- c(0, 0)
sigma1 <- matrix(c(s1^2, 0, 0, s1^2), nrow = 2)
for (i in 1:100) {
for (j in 1:100) {
z[i, j] <- dmvnorm(c(x.points[i], y.points[j]),
mean = mu1, sigma = sigma1
)
}
}
# Here be the reshaping bit
df <- reshape2::melt(z)
df <- transform(
df,
x = x.points[Var1],
y = y.points[Var2]
)
ggplot(df, aes(x, y)) +
geom_contour(aes(z = value))
Created on 2021-04-08 by the reprex package (v1.0.0)

Kmean clustering in ggplot

I am using K-mean alg. in R in order to separe variables. I would like to plot results in ggplot witch I was able to manage,
however results seem to be different in ggplot and in cluster::clusplot
So I wanted to ask what I am missing: for example I know that scaling in different but I was wondering Whz when using clustplot all variables are inside the bounds and when using ggplot it is not.
Is it just because of the scaling?
So are two below result exatly the same?
library(cluster)
library(ggfortify)
x <- rbind(matrix(rnorm(2000, sd = 123), ncol = 2),
matrix(rnorm(2000, mean = 800, sd = 123), ncol = 2))
colnames(x) <- c("x", "y")
x <- data.frame(x)
A <- kmeans(x, centers = 3, nstart = 50, iter.max = 500)
cluster::clusplot(cbind(x$x, x$y), A$cluster, color = T, shade = T)
autoplot(kmeans(x, centers = 3, nstart = 50, iter.max = 500), data = x, frame.type = 'norm')
For me, I get the same plot using either clusplot or ggplot. But for using ggplot, you have to first make a PCA on your data in order to get the same plot as clustplot. Maybe it's where you have an issue.
Here, with your example, I did:
x <- rbind(matrix(rnorm(2000, sd = 123), ncol = 2),
matrix(rnorm(2000, mean = 800, sd = 123), ncol = 2))
colnames(x) <- c("x", "y")
x <- data.frame(x)
A <- kmeans(x, centers = 3, nstart = 50, iter.max = 500)
cluster::clusplot(cbind(x$x, x$y), A$cluster, color = T, shade = T)
pca_x = princomp(x)
x_cluster = data.frame(pca_x$scores,A$cluster)
ggplot(test, aes(x = Comp.1, y = Comp.2, color = as.factor(A.cluster), fill = as.factor(A.cluster))) + geom_point() +
stat_ellipse(type = "t",geom = "polygon",alpha = 0.4)
The plot using clusplot
And the one using ggplot:
Hope it helps you to figure out the reason of your different plots

Sampling from columns of ys stacked over values of x in R (visual provided)

Background
I have a two variables called x and y (please see R code below the picture). When I plot(x, y), I obtain the top-row plot (see below). y values are stacked over the top of each x value.
Question
I am wondering WHY when I sample from y values that are separately stacked over the top of each x value (e.g., y-values stacked over the top of x value of "0"), I get some sampled y values that are outside their range of their mother sample!? (please see the bottom-row table to see this).
HERE IS MY R CODE:
############# Input Values ###################
each.sub.pop.n = 150;
sub.pop.means = 20:10;
predict.range = 0:10;
sub.pop.sd = .75;
n.sample = 2;
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y)
## Unsuccessfull Sampling ##
x <- rep(predict.range, each = n.sample)
y <- sample(y , length(x), replace = TRUE)
plot(x, y)
It seems to me that your sample is not conditional on x in your unsuccessful sampling piece. In the below, I split the y data by x and then sampled two cases from each. The result seems to work.
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
plot(sample$x, sample$y)
You can use the stratified sampling implemented in the sampling package with the strata function:
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y)
library(sampling)
df <- data.frame(x,y)
set.seed(123)
stratif_sampl <- strata(df,"x",rep(2,11))
idx <- stratif_sampl$ID_unit
plot(x[idx], y[idx])

Resources