I would like to bin two columns of a dataset simultaneously to create one common binned column. The simple code is as follows
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
Any help is appreciated!
Not sure if this is what you are looking for
library(tidyverse)
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data <- data%>%
dplyr::mutate(
x_binned = cut(x, breaks = seq(0,100,10)),
y_binned = cut(y, breaks = seq(0,100,10))
)
data %>%
ggplot() +
geom_bin_2d(
aes(x = x_binned, y = y_binned), binwidth = c(10,10), colour = "red") +
theme_minimal()
After asking in the comments I am still not quite shure, what the desired answer would look like but I hope, that one of the two answers in the below code will work for you:
x <- sample(100)
y <- sample(100)
data <- data.frame(x, y)
xbin <- seq(from = 0, to = 100, by = 10)
ybin <- seq(from = 0, to = 100, by = 10)
data$xbin <- cut(data$x, breaks = xbin, ordered = TRUE)
data$ybin <- cut(data$y, breaks = ybin, ordered = TRUE)
data$commonbin1 <- paste0(data$xbin, data$ybin)
data$commonbin2 <- paste0("(",as.numeric(data$xbin),";", as.numeric(data$ybin),")")
head(data, 20)
This will construct a common binning variable commonbin1 that includes the bin-limits in the names of the bins and commonbin2 which will be easier to compare to the plot mentioned in the comment.
Suppose I have the following data:
coef <- list(c(47, 2, 0, 0),
c(7, 42, -8, 0),
c(78, -71, 43, -7))
my_data <- data.frame("x" = rep(1:4, times = 20))
cols <- c("1", "2", "3")
I would like to plot a function for each vector of coefficients in the object coef.
However, using a for loop does not work.
library(ggplot2)
g1 <- ggplot(data = my_data, aes(x = x))
for(i in 1:3){
my_fun <- function(x){
sum(as.vector(outer(x, 0:3, FUN="^")) * coef[[i]])
}
my_fun <- Vectorize(my_fun)
g1 <- (g1 + stat_function(fun = my_fun, aes(col = cols[i]), lwd = 1.2))
print(g1)
}
The result is supposed to look (somehow) like this:
Is there a way to use lapply instead of the loop? Or can I modify the for-loop to fix this problem?
I wanted to make a graph using facet_wrap and plot it in different pages in a pdf file. I've read son many options, and this works:
R + ggplot: plotting over multiple pages
but only when you have the same rows in each page.
I have this demo data to try explain my case:
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
So, I have ID with different number of observations. I tried to make a vector with the number of observation in each ID (I want an ID per page), but it doesn't work.
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
slice2 = slice[!(slice > nrow(A2))]
A3 = A2[slice2,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
Could you help me?
Thanks in advances,
Nacho
I think you were overthinking trying to calculate your "slices". Maybe you want this?
Not entirely sure. If you only want one ID per page you don't need facet_wrap, and you will probably need to set the scale explicitly to keep it the same from page to page.
library(plyr)
A <- data.frame(TIME = rep(c(0, 5, 10, 15, 30, 45, 60), 5))
A$C <- (1 - exp(-0.2*A$TIME))
A$ID <- rep(1:5, each = 7)
A$R <- rnorm(35, mean = 1, sd = 0.01)
A$C2 <- A$C*A$R
Pages <- 5
A2 <- A[c(1,4:8,10:22,24:35),]
nrws <- ddply(A2, .(ID), "nrow")
nsamp <- nrws[,2]
pdf("Test.pdf")
for (i in seq(Pages))
{
# slice = seq(((i-1)*nsamp[i]),(i*nsamp[i]))
# slice2 = slice[!(slice > nrow(A2))]
# A3 = A2[slice2,]
A3 = A2[A2$ID==i,]
p1 <- ggplot(A3, aes(x = TIME, y = C2)) +
geom_line(size = 0.5) +
geom_point(size = 1) +
facet_wrap(~ID)
print(p1)
}
dev.off()
I'm producing a plot like this:
library(ggplot2)
data.dist = matrix(
c(10, -10, 10, -10, 10, -10, 10, -10, 10),
nrow=3,
ncol=3,
byrow = TRUE)
hc <- agnes(dist(data.dist), method = "ward", diss = TRUE)
cluster <- cutree(hc, k=2)
xy <- data.frame(cmdscale(dist(data.dist)), factor(cluster))
names(xy) <- c("x", "y", "cluster")
xy$model <- rownames(xy)
ggplot(xy, aes(x, y)) + geom_point(aes(colour=cluster), size=3)
Which gives me:
However, let's say I want to attach another covariate, say a binary variable c(1, 0, 1) to the data and display all 1 using one symbol (say an X) and all 0 using another symbol (say a dot). How can I accomplish this?
xy<-data.frame(x=rnorm(3),y=rnorm(3),cluster=as.factor(c(1,0,1)),another=as.factor(c(1,1,0)) )
ggplot(xy, aes(x, y,shape=another)) + geom_point(aes(colour=cluster), size=3)
I am trying to output multiple density plot from a function, by dividing the dataframe into pieces such that separate density for each level of a factor for corresponding yvar.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(gen, yvar)
minyvar <- min(yvar)
maxyvar <- max(yvar)
par(mfrow = c(length(levels(mydf$gen)),1))
plotdensity <- function (xf, minyvar, maxyvar){
plot(density(xf), xlim=c(minyvar, maxyvar), main = paste (names(xf),
"distribution", sep = ""))
dens <- density(xf)
x1 <- min(which(dens$x >= quantile(xf, .80)))
x2 <- max(which(dens$x < max(dens$x)))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="blu4"))
abline(v= mean(xf), col = "black", lty = 1, lwd =2)
}
require(plyr)
ddply(mydf, .(mydf$gen), plotdensity, yvar, minyvar, maxyvar)
Error in .fun(piece, ...) : unused argument(s) (111.544494112914)
My specific expectation are each plot is named by name of level for example Aa, Bb, Cc, Dd
Arrangement of the graphs see the parameter set, so that we compare density changes and means. compact - Low space between the graphs.
Help appreciated.
Edits:
The following graphs are individually produced, although I want to develop a function that can be applicable to x level for a factor.
I see that #Andrie just beat me to most of this. I'm still going to post my answer, since filling only certain quantiles of the distribution requires a slightly different approach.
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
ggplot(mydf,aes(x = x)) +
facet_wrap(~grp) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0, fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),colour = "black")
Here is a way of doing it in ggplot:
set.seed(1234)
mydf <- rbind(
data.frame(gen="Aa", yvar= rnorm(40000, 50, 10)),
data.frame(gen="Bb", yvar=rnorm(4000, 70, 10)),
data.frame(gen="Cc", yvar=rnorm(400, 75, 10)),
data.frame(gen="Dd", yvar=rnorm(40, 80, 10))
)
labels <- ddply(mydf, .(gen), nrow)
means <- ddply(mydf, .(gen), summarize, mean=mean(yvar))
ggplot(mydf, aes(x=yvar)) +
stat_density(fill="blue") +
facet_grid(gen~.) +
theme_bw() +
geom_vline(data=means, aes(xintercept=mean), colour="red") +
geom_text(data=labels, aes(label=paste("n =", V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution")
With sincere thanks to joran and Andrie, the following is just compilation of my favorite from above two posts, just some of readers might want to see.
require(ggplot2)
set.seed(1234)
Aa = c(rnorm(40000, 50, 10))
Bb = c(rnorm(4000, 70, 10))
Cc = c(rnorm(400, 75, 10))
Dd = c(rnorm(40, 80, 10))
yvar = c(Aa, Bb, Cc, Dd)
gen <- c(rep("Aa", length(Aa)),rep("Bb", length(Bb)), rep("Cc", length(Cc)),
rep("Dd", length(Dd)))
mydf <- data.frame(grp = gen,x = c(Aa,Bb,Cc,Dd))
mydf1 <- mydf
#Calculate the densities and an indicator for the desire quantile
# for later use in subsetting
mydf <- ddply(mydf,.(grp),.fun = function(x){
tmp <- density(x$x)
x1 <- tmp$x
y1 <- tmp$y
q80 <- x1 >= quantile(x$x,0.8)
data.frame(x=x1,y=y1,q80=q80)
})
#Separate data frame for the means
mydfMean <- ddply(mydf,.(grp),summarise,mn = mean(x))
labels <- ddply(mydf1, .(grp), nrow)
ggplot(mydf,aes(x = x)) +
facet_grid(grp~.) +
geom_line(aes(y = y)) +
geom_ribbon(data = subset(mydf,q80),aes(ymax = y),ymin = 0,
fill = "black") +
geom_vline(data = mydfMean,aes(xintercept = mn),
colour = "black") + geom_text(data=labels,
aes(label=paste("n =", labels$V1)), x=5, y=0,
hjust=0, vjust=0) +
opts(title="Distribution") + theme_bw()