Binning two columns of data frame together in R - r

I would like to bin two columns of a dataset simultaneously to create one common binned column. The simple code is as follows
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
Any help is appreciated!

Not sure if this is what you are looking for
library(tidyverse)
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data <- data%>%
dplyr::mutate(
x_binned = cut(x, breaks = seq(0,100,10)),
y_binned = cut(y, breaks = seq(0,100,10))
)
data %>%
ggplot() +
geom_bin_2d(
aes(x = x_binned, y = y_binned), binwidth = c(10,10), colour = "red") +
theme_minimal()

After asking in the comments I am still not quite shure, what the desired answer would look like but I hope, that one of the two answers in the below code will work for you:
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data$xbin <- cut(data$x, breaks = xbin, ordered = TRUE)
data$ybin <- cut(data$y, breaks = ybin, ordered = TRUE)
data$commonbin1 <- paste0(data$xbin, data$ybin)
data$commonbin2 <- paste0("(",as.numeric(data$xbin),";", as.numeric(data$ybin),")")
head(data, 20)
This will construct a common binning variable commonbin1 that includes the bin-limits in the names of the bins and commonbin2 which will be easier to compare to the plot mentioned in the comment.

Related

How to retrieve isocontour coordinates?

I have data interpolated on a grid and I need to retrieve the iso-contour coordinates:
require(akima)
require(pracma)
require(ggplot2)
require(RColorBrewer)
r <- seq(0.1, 1, length.out = 20)
theta <- seq(0, 90)
my.df <- expand.grid(r = r, theta = theta)
my.df$value <- 1/my.df$r^2 * sin(deg2rad(my.df$theta))
# Interpolating data on rectangular grid
data.interp <-
interp(
x = my.df$r * cos(deg2rad(my.df$theta)),
y = my.df$r * sin(deg2rad(my.df$theta)),
z = my.df$value,
nx = 200,
ny = 200,
duplicate = "strip"
)
data.xyz <- as.data.frame(interp2xyz(data.interp))
data.xyz <- setNames(data.xyz, c("x", "y", "value"))
data.xyz <- na.omit(data.xyz)
my.breaks <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
brks <- cut(data.xyz$value,
breaks = my.breaks,
ordered_result = TRUE)
levels(brks) <- gsub(",", " - ", levels(brks), fixed = TRUE)
levels(brks) <- gsub("\\(|\\]","",levels(brks))
data.xyz$brks <- brks
ggplot(data.xyz, aes(x = x, y = y, fill = brks)) +
geom_tile() +
scale_fill_manual("Value",
values = rev(colorRampPalette(brewer.pal(11, "Spectral"))(length(my.breaks))))
Here is what the result looks like:
What I need is to retrieve the coordinates of my iso-contours.
The purpose of to create a 3D model of those contours assuming the data is axisymmetric. But before I do that, I need to find the coordinates of the line separating the colors.
Using contourLines, here is how to do this:
r <- seq(0.1, 1, length.out = 20)
theta <- seq(0, 90)
my.df <- expand.grid(r = r, theta = theta)
my.df$value <- 1/my.df$r^2 * sin(deg2rad(my.df$theta))
my.matrix <- acast(my.df, r ~ theta, value.var = "value")
contour.lines <- contourLines(x = r,
y = theta,
z = my.matrix,
levels = seq(0, 100, by = 10))
contour.df <- data.frame()
for(level in contour.lines) {
contour.df <- rbind(contour.df, data.frame(x = level$x * cos(deg2rad(level$y)),
y = level$x * sin(deg2rad(level$y)),
level = as.factor(level$level)))
}
ggplot(contour.df, aes(x, y, color = level)) + geom_path() + scale_x_continuous(limits = c(0, 1)) + scale_y_continuous(limits = c(0, 1))

geom_vlines multiple vlines per plot

How can I get ggplot to produce something similar like
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/path/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[2]))
}
this is the interesting part about the plot:
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed')
So far my result is: https://www.dropbox.com/s/mysadkruo946oox/changePoint.pdf which means that there is something wrong with my array passed to the geom_vlines.
Could you point me in the right direction why I only get vlines in the first 2 plots?
This is the solution:
library(ggplot2)
library(reshape2)
library(ecp)
synthetic_control.data <- read.table("/Users/geoHeil/Dropbox/6.Semester/BachelorThesis/rResearch/data/synthetic_control.data.txt", quote="\"", comment.char="")
n <- 2
s <- sample(1:100, n)
idx <- c(s, 100+s, 200+s, 300+s, 400+s, 500+s)
sample2 <- synthetic_control.data[idx,]
df = as.data.frame(t(as.matrix(sample2)))
#calculate the change points
changeP <- e.divisive(as.matrix(df[1]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = data.frame(changeP,variable=colnames(df)[1])
for(series in 2:ncol(df)){
changeP <- e.divisive(as.matrix(df[series]), k=8, R = 400, alpha = 2, min.size = 3)
changeP = changeP$estimates
changeP = changeP[-c(1,length(changeP))]
changePoints = rbind(changePoints, data.frame(changeP,variable=colnames(df)[series]))
}
# plot
df$id = 1:nrow(df)
dfMelt <- reshape2::melt(df, id.vars = "id")
p = ggplot(dfMelt,aes(x=id,y=value))+geom_line(color = "steelblue")+ facet_grid(variable ~ ., scales = 'free_y')
p + geom_vline(aes(xintercept=changeP), data=changePoints, linetype='dashed', colour='darkgreen')

Subset of data for specific plot layer

I want to
plot all data on some layers (here: geom_point)
plot only a subset on some other layers (here: geom_text for type "range")
However, I'm getting the text labels for the whole data, while they should only be added for the turquoise points.
I tried subsetting the data, but the output is not the desired. Still, the object sub_data holds only the wanted data.
Any suggestions?
R code:
library(ggplot2)
N <- 10
# create 20 = 2*10 data points
test_data <- data.frame(
idx <- c( 1:N, 1:N ),
vals <- c( runif(N, 0, 1),
rep( 0.5, N)),
type <- c( rep("range", N),
rep("const", N))
)
# this subsets to the 10 data points of type "range"
sub_data <- subset( test_data, type == "range")
ggplot( test_data, aes( x = idx, y = vals)) +
geom_point( aes( colour = type)) +
geom_text( data = sub_data, aes( x = idx + 0.1, label = idx ), size = 3.5)
output:
Change the <- to = inside your data.frame command, like this:
test_data <- data.frame(
idx = c(1:N, 1:N),
vals = c(runif(N, 0, 1), rep( 0.5, N)),
type = c(rep("range", N), rep("const", N))
)
Then execute your plot code and you should get the desired result.
An alternative to creating a dataframe in a correct way is:
idx <- c(1:N, 1:N),
vals <- c(runif(N, 0, 1), rep( 0.5, N)),
type <- c(rep("range", N), rep("const", N))
test_data <- data.frame(idx, vals, type)
For more background on the difference between the <- and the = assignment operators, see the answers to this question

Specifying spatial coordinates in a data.frame

Pretty silly but I can't figure out what I'm doing wrong here:
I have a data.frame with 2 columns:
df = data.frame(x = rep(1, 20), y = runif(20, 10,20))
I then want to set x and y as spatial coordinates so I can plot df in a bubble plot. So I try:
coordinates(df) = c("x","y")
But then:
bubble(df)
gives this error:
Error in data.frame(x#data, x#coords) :
arguments imply differing number of rows: 0, 20
For bubble plot to be meaningful, you should probably create a SpatialPointsDataFrame.
library(sp)
df <- data.frame(x = rep(1, 20), y = runif(20, 10,20))
data <- data.frame(variable = runif(20))
coordinates(df) <- ~ x + y
out <- SpatialPointsDataFrame(df, data)
bubble(out)
library(sp)
set.seed(1)
df = data.frame(x = rep(1, 20), y = runif(20, 10, 20), dummy = rep(0, 20))
coordinates(df) = c("x","y")
bubble(df)

Multiple density graphs different groups (based on factor level) using plyr

I am trying to output multiple density plot from a function, by dividing the dataframe into pieces such that separate density for each level of a factor for corresponding yvar.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(gen, yvar)
minyvar <- min(yvar)
maxyvar <- max(yvar)
par(mfrow = c(length(levels(mydf$gen)),1))
plotdensity <- function (xf, minyvar, maxyvar){
plot(density(xf), xlim=c(minyvar, maxyvar), main = paste (names(xf),
"distribution", sep = ""))
dens <- density(xf)
x1 <- min(which(dens$x >= quantile(xf, .80)))
x2 <- max(which(dens$x < max(dens$x)))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="blu4"))
abline(v= mean(xf), col = "black", lty = 1, lwd =2)
}
require(plyr)
ddply(mydf, .(mydf$gen), plotdensity, yvar, minyvar, maxyvar)
Error in .fun(piece, ...) : unused argument(s) (111.544494112914)
My specific expectation are each plot is named by name of level for example Aa, Bb, Cc, Dd
Arrangement of the graphs see the parameter set, so that we compare density changes and means. compact - Low space between the graphs.
Help appreciated.
Edits:
The following graphs are individually produced, although I want to develop a function that can be applicable to x level for a factor.
I see that #Andrie just beat me to most of this. I'm still going to post my answer, since filling only certain quantiles of the distribution requires a slightly different approach.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
ggplot(mydf,aes(x = x)) +
facet_wrap(~grp) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0, fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),colour = "black")
Here is a way of doing it in ggplot:
set.seed(1234)
mydf <- rbind(
data.frame(gen="Aa", yvar= rnorm(40000, 50, 10)),
data.frame(gen="Bb", yvar=rnorm(4000, 70, 10)),
data.frame(gen="Cc", yvar=rnorm(400, 75, 10)),
data.frame(gen="Dd", yvar=rnorm(40, 80, 10))
)
labels <- ddply(mydf, .(gen), nrow)
means <- ddply(mydf, .(gen), summarize, mean=mean(yvar))
ggplot(mydf, aes(x=yvar)) +
stat_density(fill="blue") +
facet_grid(gen~.) +
theme_bw() +
geom_vline(data=means, aes(xintercept=mean), colour="red") +
geom_text(data=labels, aes(label=paste("n =", V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution")
With sincere thanks to joran and Andrie, the following is just compilation of my favorite from above two posts, just some of readers might want to see.
require(ggplot2)
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
mydf1 <- mydf
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
labels <- ddply(mydf1, .(grp), nrow)
ggplot(mydf,aes(x = x)) +
facet_grid(grp~.) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0,
fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),
colour = "black") + geom_text(data=labels,
aes(label=paste("n =", labels$V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution") + theme_bw()

Resources