Creating a cumulative step graph in R - r

Say I have this example data frame
set.seed(12345)
n1 <- 3
n2 <- 10
n3 <- 60
times <- seq(0, 100, 0.5)
individual <- c(rep(1, n1),
rep(2, n2),
rep(3, n3))
events <- c(sort(sample(times, n1)),
sort(sample(times, n2)),
sort(sample(times, n3)))
df <- data.frame(individual = individual, events = events)
Which gives
> head(df, 10)
individual events
1 1 72.0
2 1 75.5
3 1 87.5
4 2 3.0
5 2 14.5
6 2 16.5
7 2 32.0
8 2 45.5
9 2 50.0
10 2 70.5
I would like to plot a cumulative step graph of the events so that I get one line per individual which goes up by 1 each time an event is "encountered".
So, for instance individual 1 will be 0 up to 72.0, then go up to 1, until 75.5 when it becomes 2 and up to 3 at 87.5 to the end of the graph.
What would be the easiest way to do that?

df$step <- 1
library(plyr)
df <- ddply(df,.(individual),transform,step=cumsum(step))
plot(step~events,data=df[df$individual==1,],type="s",xlim=c(0,max(df$events)),ylim=c(0,max(df$step)),xlab="time",ylab="step")
lines(step~events,data=df[df$individual==2,],type="s",col=2)
lines(step~events,data=df[df$individual==3,],type="s",col=3)

There is also the stepfun function in the stats package. Using that, you could use the plot method for that object class:
sdf <- split(df, individual)
plot(1, 1, type = "n", xlim = c(0, max(events)), ylim = c(0, max(table(individual))),
ylab = "step", xlab = "time")
sfun <- lapply(sdf, function(x){
sf <- stepfun(sort(x$events), seq_len(nrow(x) + 1) - 1)
plot(sf, add = TRUE, col = unique(x$individual), do.points = FALSE)
})

Use ggplot2:
library(ggplot2)
# Add step height information with sequence and rle
df$step <- sequence(rle(df$individual)$lengths)
# plot
df$individual <- factor(df$individual)
ggplot(df, aes(x=events, group=individual, colour=individual, y=step)) +
geom_step()

Related

How to create ggplot box plot which add data over time

I have a dataframe, which gives values for different courses over a series of weeks.
Course Week m
1 UGS200H 1 44.33333
2 CMSE201 1 73.66667
3 CMSE201 2 88.16667
4 CMSE201 2 88.16667
5 PHY215 2 73.66667
6 PHY215 3 86.33333
7 CMSE201 3 84.00000
8 UGS200H 4 60.66667
9 UGS200H 4 76.66667
I would like to create a series of box plots which plot m values over the weeks for each course. I would like for the box plots to build off of each other though, such that week 1 contains only the data from Week = 1, but week 2 contains data including data from Week = 1 and 2, and week 3 includes data from Week = 1,2,3 and etc. I have create the following code which creates the box plots but without the building up over the weeks.
d <- subset(data_manual)
a <- ggplot(data=d, aes(x=(Week), fill = Course, y=(m), group=interaction(Course, Week)))
geom_boxplot()+
scale_y_continuous(limits = c(-2, 100), breaks = seq(0, 100, by = 20))+
xlab('Week') +
ylab('Course-Level SE') +
print(a) #show us the plot!!
}
This gives plots like this
But these are just individual weeks, not the summed version that I would like. Is there a way to have them build and plot the multiple weeks on one plot?
How about this:
# dat <- tibble::tribble(
# ~Course, ~Week, ~m,
# "UGS200H", 1, 44.33333,
# "CMSE201", 1, 73.66667,
# "CMSE201", 2, 88.16667,
# "CMSE201", 2, 88.16667,
# "PHY215", 2, 73.66667,
# "PHY215", 3, 86.33333,
# "CMSE201", 3, 84.00000,
# "UGS200H", 4, 60.66667,
# "UGS200H", 4, 76.66667)
dat <- data.frame(
Course = rep(c("A", "B", "C"), each=1000),
Week = rep(rep(1:10, each=100), 3),
m = runif(3000, 50, 100)
)
library(ggplot2)
dats <- lapply(1:max(dat$Week), \(i){
tmp <- subset(dat, Week <= i)
tmp$plot_week <- i
tmp})
dats <- do.call(rbind, dats)
table(dat$Week)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 300 300 300 300 300 300 300 300 300 300
table(dats$plot_week)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 300 600 900 1200 1500 1800 2100 2400 2700 3000
ggplot(data=dats, aes(x=as.factor(plot_week), fill = Course, y=(m), group=interaction(Course, plot_week))) +
geom_boxplot()+
scale_y_continuous(limits = c(-2, 100), breaks = seq(0, 100, by = 20))+
xlab('Week') +
ylab('Course-Level SE')
Created on 2022-10-18 by the reprex package (v2.0.1)
Basically the same idea as by #DaveArmstrong but using lapply with multiple geom_boxplots.
Note 1: To make the example a bit more realistic I use some random fake example data.
Note 2: I added an additional geom_point just to check that the number of obs. is actually increasing for each week.
set.seed(123)
d <- data.frame(
Course = rep(c("UGS200H", "CMSE201", "PHY215"), each = 40),
Week = rep(1:4, 30),
m = runif(120, 40, 100)
)
library(ggplot2)
ggplot(data=d, aes(x = factor(Week), fill = Course, y=m)) +
lapply(unique(d$Week), function(x) {
list(
geom_boxplot(data = subset(d, Week <= x) |> transform(Week = x), position = "dodge"),
geom_point(data = subset(d, Week <= x) |> transform(Week = x), position = position_dodge(.9), alpha = .2)
)
}) +
labs(x = 'Week', y = 'Course-Level SE')

Calculate triangle areas in a loop - R

I have a 20 points (X1, Y1,…. Xn, Yn) on a pyramid and a random base point (Xbase, Ybase). I wish to calculate the triangle area between (Xi, Yi; Xi+1, Yi+1; Xbase, Ybase). Therefore, I did a loop that calculate the area but I can not store the area result area in a the data.frame (myDF). Furthermore, is there another elegant way to calculate the area?
Script:
library(ggplot2)
myDF <- data.frame(area=double())
nElem <- 100
xData <- as.data.frame(seq(1,nElem,5))
yData1 <- seq(5,nElem/2,5)
yData2 <- rev(yData1-4)
yData<- as.data.frame((c(yData1, yData2)))
xyDATA<- cbind(xData,yData)
colnames(xyDATA) <- c("xCoord","yCoord")
Xbase <-runif(1, 90, 91)
Ybase <-runif(1, 1.0, 1.5)
for(i in 1:19)
{
x1 <- Xbase
y1 <- Ybase
x2 <- xyDATA[i,1]
y2 <- xyDATA[i,2]
x3 <- xyDATA[i+1,1]
y3 <- xyDATA[i+1,2]
s <- 0.5*sqrt((x2*x3-x3*y2)^2+(x3*y1-x1*y3)^2+(x1*y2-x2*y1)^2)
myDF[i] <-s
}
P1 <- ggplot(xyDATA) + geom_point(aes(x = xCoord, y = yCoord))
P2 <- P1 + geom_point(aes(x = x1, y = y1),colour="red",size=4)
P2
Thanks a lot.
As written you are assigning the value of s to an entire column in the dataframe. You probably want to specify an area column and then assign into a row of that col.
# before the loop, create the column:
DF['area'] <- NA
# Inside the loop
....
myDF[i, "area"] <-s
Here is a solution using the dplyr package:
nElem <- 100
xData <- as.data.frame(seq(1,nElem,5))
yData1 <- seq(5,nElem/2,5)
yData2 <- rev(yData1-4)
yData<- as.data.frame((c(yData1, yData2)))
xyDATA<- cbind(xData,yData)
colnames(xyDATA) <- c("xCoord","yCoord")
Xbase <-runif(1, 90, 91)
Ybase <-runif(1, 1.0, 1.5)
library(dplyr)
myDF <- xyDATA %>%
mutate("s" = 0.5*sqrt(
(xCoord*lead(xCoord)-lead(xCoord)*yCoord)^2+
(lead(xCoord)*Ybase-Xbase*lead(yCoord))^2+
(Xbase*yCoord-xCoord*Ybase)^2
))
head(myDF)
xCoord yCoord s
1 1 5 502.2731
2 6 10 807.6995
3 11 15 1118.5987
4 16 20 1431.4092
5 21 25 1745.1034
6 26 30 2059.2776

Interpolate missing values of a data frame

I have a dataset like this:
x y z
1 1 0.954
1 3 0.134
1 30 0.123
2 1 0.425
2 3 0.123
2 30 0.865
5 1 0.247
5 3 0.654
5 30 0.178
Let's think of this as the height of a surface sampled at 9 points over a 4x29 field. Suppose I want to fill in the missing values by interpolating (linear is fine), so that I end up with a z value for every (integer) x in [1,5] and every y in [1,30]. I want the result to still be a data frame with the same structure.
How can I do this in R?
I'll take the previous lack of answer as a gift :)
#akima_0.5-12
library(akima)
my_df <- data.frame(
x = c(rep(1, 3), rep(2, 3), rep(5, 3)),
y = rep(c(1, 3, 30), 3),
z = c(0.954, 0.134, 0.123, 0.425, 0.123, 0.865, 0.247, 0.654, 0.178)
)
my_op <- interp(
x = my_df$x,
y = my_df$y,
z = my_df$z,
xo = 1:5, # vector of x coordinates to use in interpolation
yo = 1:30, # vector of y coordinates to use in interpolation
linear = TRUE # default interpolation method
)
my_op$z # matrix of interpolated z coordinates, (row, col) correspond to (x, y)
ind <- which(!is.nan(my_op$z), arr.ind = TRUE)
desired_output <- data.frame(
x = ind[, 1],
y = ind[, 2],
z = as.vector(my_op$z) # data are organized column-by-column
)

Matrix version of rasterToPoints?

Anyone know of a non-raster method to achieve the following?
require(raster)
d = data.frame(rasterToPoints(raster(volcano)))
head(d)
x y layer
1 0.008196721 0.9942529 100
2 0.024590164 0.9942529 100
3 0.040983607 0.9942529 101
4 0.057377049 0.9942529 101
5 0.073770492 0.9942529 101
6 0.090163934 0.9942529 101
Cheers.
One way would be to use the row and col command:
library(raster)
data(volcano)
df <- data.frame(
x = as.vector(col(volcano)),
y = (yy <- as.vector(row(volcano)))[length(yy):1],
val = as.vector(volcano)
)
raster rescales the range to 0 - 1, if not specified differently, so we would to have to do this too:
## rescale
df$x <- with(df, (x - min(x)) / (max(x) - min(x)))
df$y <- with(df, (y - min(x)) / (max(y) - min(y)))
Finally lets check, that the results are the same:
## Using raster df1 <- data.frame(rasterToPoints(raster(volcano)))
cols <- colorRampPalette(c('white', "blue",'red')) df$col <-
cols(20)[as.numeric(cut(df$val, breaks = 20))] df1$col <-
cols(20)[as.numeric(cut(df1$layer, breaks = 20))]
par(mfrow = c(1, 2)) plot(df[, 1:2], col = df$col, pch = 20, main =
"matrix")
plot(df1[, 1:2], col = df1$col, pch = 20, main = "raster")
Note:
While the results appear the same visually, they are not. The resolution of the raster command is most likely different, and hence there are different nrows for df and df1.
Faster for large matrices:
data.frame(
x = rep(1:ncol(m), each=nrow(m)),
y = rep(nrow(m):1, ncol(m)),
val = as.vector(m)
)

How to plot a bipartite graph in R

How do I plot a network of type bipartite in R? Similar to this:
I have similar data but with weights for both genes and diseases and SARS. This network is an example. I have different kind of attributes. I followed a link here. But due to my little knowledge in this topic, I could not get much out of it. Thanks in advance for any help.
From the ?bipartite_graph help:
Bipartite graphs have a type vertex attribute in igraph, this is boolean and FALSE for the vertices of the first kind and TRUE for vertices of the second kind.
So you could do something like this (igraph 1.0.1):
library(igraph)
set.seed(123)
# generate random bipartite graph.
g <- sample_bipartite(10, 5, p=.4)
# check the type attribute:
V(g)$type
# define color and shape mappings.
col <- c("steelblue", "orange")
shape <- c("circle", "square")
plot(g,
vertex.color = col[as.numeric(V(g)$type)+1],
vertex.shape = shape[as.numeric(V(g)$type)+1]
)
Check also ?bipartite.
Using the example provided by the OP in the comments. Since the graph is multipartite and given the provided data format, I would first create a bipartite graph, then add the additional edges. Note that although the resulting graph returns TRUE for is_bipartite() the type argument is specified as numeric instead of logical and may not work properly with other bipartite functions.
set.seed(123)
V1 <- sample(LETTERS[1:10], size = 10, replace = TRUE)
V2 <- sample(1:10, size = 10, replace = TRUE)
d <- data.frame(V1 = V1, V2 = V2, weights = runif(10))
d
> d
V1 V2 weights
1 C 10 0.8895393
2 H 5 0.6928034
3 E 7 0.6405068
4 I 6 0.9942698
5 J 2 0.6557058
6 A 9 0.7085305
7 F 3 0.5440660
8 I 1 0.5941420
9 F 4 0.2891597
10 E 10 0.1471136
g <- graph_from_data_frame(d, directed = FALSE)
V(g)$label <- V(g)$name # set labels.
# create a graph connecting central node FOO to each V2.
e <- expand.grid(V2 = unique(d$V2), V2 = "FOO")
> e
V2 V2
1 10 FOO
2 5 FOO
3 7 FOO
4 6 FOO
5 2 FOO
6 9 FOO
7 3 FOO
8 1 FOO
9 4 FOO
g2 <- graph.data.frame(e, directed = FALSE)
# join the two graphs.
g <- g + g2
# set type.
V(g)$type <- 1
V(g)[name %in% 1:10]$type <- 2
V(g)[name %in% "FOO"]$type <- 3
V(g)$type
> V(g)$type
[1] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3
col <- c("steelblue", "orange", "green")
shape <- c("circle", "square", "circle")
library(rTRM) # Bioconductor package containing layout.concentric()
# the fist element in the list for concentric is the central node.
l <- layout.concentric(g, concentric = list("FOO", 1:10, LETTERS[1:10]))
plot(g,
layout = l,
vertex.color = col[V(g)$type],
vertex.shape = shape[V(g)$type],
edge.width = E(g)$weights * 5 # optional, plot edges width proportional to weights.
)
The function layout.concentric() is in (my) package rTRM, available from Bioconductor. It is really a simple implementation I wrote to do exactly what you want. I am not completely sure whether the latest igraph version has the same functionality though (it may be).
For the example you provided, I would recommend using the x and y attributes for visualizing a bipartite graph. E.g.:
V(g)$x <- c(1, 1, 1, 2, 2, 2, 2)
V(g)$y <- c(3, 2, 1, 3.5, 2.5, 1.5, 0.5)
V(g)$shape <- shape[as.numeric(V(g)$type) + 1]
V(g)$color <- c('red', 'blue', 'green', 'steelblue', 'steelblue', 'steelblue', 'steelblue')
E(g)$color <- 'gray'
E(g)$color[E(g)['A' %--% V(g)]] <- 'red'
E(g)$color[E(g)['B' %--% V(g)]] <- 'blue'
E(g)$color[E(g)['C' %--% V(g)]] <- 'green'
plot(g)
EDIT: added code to give the vertices and edges different colors for clarity.
Or you can use the multigraph package.
swomen <- read.dl(file = "http://moreno.ss.uci.edu/davis.dat")
bmgraph(swomen, layout = "force", seed = 1, cex = 3, tcex = .8, pch = c(19, 15), lwd = 2,
+ vcol = 2:3, ecol = 8, rot = 65)
that can produce the binomial projection of the two-mode data set

Resources