facet grid for huge data in list - r

I would like to make a facet grid plot of quite long data which is in a list of different dataframes. Here is some example data:
library(tidyverse)
library(ggplot2)
L <- list()
for (i in 1:10) {
DATE <- seq( as.POSIXct("2016-01-01 21:00:00", tz="UTC"),
as.POSIXct("2016-02-01 21:00:00", tz="UTC"),
by = 600
)
V1 <- round(runif(length(DATE)) - 0.3,0)
L[[i]] <- tibble(DATE, V1)
}
names(L) <- LETTERS[1:10]
What I acctually need is a facet_grid() with each dataframe in one row showing V1 is either 1 or 0, like a barcode or something similar.
My only success was to bind all list elements to one dataframe with the name of each list element as a new column. This seems to be being kind of circuitous.
for (i in 1:10) {
L[[i]] <- mutate(L[[i]], list.name = names(L)[i] )
}
DF <- do.call(rbind.data.frame, L)
ggplot(DF) +
geom_step(aes(DATE, V1), size = 0.1, ) +
facet_grid(list.name ~ .)
I would like to have the downsite of the geom_step() plot filled like it is at the geom_area() (I know it is hard to see in the example). Is there another geom that fits better? Since my data is way longer it takes hours to create the plot. Is there a better way to make a facet_grid() on list elements?

How about using geom_raster?
DF <- map_df(L, I) %>% mutate(V1 = as.factor(V1))
ggplot(DF, aes(x = DATE, y = list.name, fill = V1)) +
geom_raster() +
scale_fill_manual(values = c("white", "black"))

Related

Slope Chart - ggplot2

I am testing some templates of ggplot2, and I am interesting to the slope chart, available from here:
Slope Chart - Link
It works perfectly.
I am only trying to make a similar one but with only a group defined, like the following code:
library(dplyr)
library(ggplot2)
theme_set(theme_classic())
source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv")
source_df <- filter(source_df, group == "Thyroid")
then I copy the remaining code from the example:
# Define functions. Source: https://github.com/jkeirstead/r-slopegraph
tufte_sort <- function(df, x="year", y="value", group="group", method="tufte", min.space=0.05) {
## First rename the columns for consistency
ids <- match(c(x, y, group), names(df))
df <- df[,ids]
names(df) <- c("x", "y", "group")
## Expand grid to ensure every combination has a defined value
tmp <- expand.grid(x=unique(df$x), group=unique(df$group))
tmp <- merge(df, tmp, all.y=TRUE)
df <- mutate(tmp, y=ifelse(is.na(y), 0, y))
## Cast into a matrix shape and arrange by first column
require(reshape2)
tmp <- dcast(df, group ~ x, value.var="y")
ord <- order(tmp[,2])
tmp <- tmp[ord,]
min.space <- min.space*diff(range(tmp[,-1]))
yshift <- numeric(nrow(tmp))
## Start at "bottom" row
## Repeat for rest of the rows until you hit the top
for (i in 2:nrow(tmp)) {
## Shift subsequent row up by equal space so gap between
## two entries is >= minimum
mat <- as.matrix(tmp[(i-1):i, -1])
d.min <- min(diff(mat))
yshift[i] <- ifelse(d.min < min.space, min.space - d.min, 0)
}
tmp <- cbind(tmp, yshift=cumsum(yshift))
scale <- 1
tmp <- melt(tmp, id=c("group", "yshift"), variable.name="x", value.name="y")
## Store these gaps in a separate variable so that they can be scaled ypos = a*yshift + y
tmp <- transform(tmp, ypos=y + scale*yshift)
return(tmp)
}
plot_slopegraph <- function(df) {
ylabs <- subset(df, x==head(x,1))$group
yvals <- subset(df, x==head(x,1))$ypos
fontSize <- 3
gg <- ggplot(df,aes(x=x,y=ypos)) +
geom_line(aes(group=group),colour="grey80") +
geom_point(colour="white",size=8) +
geom_text(aes(label=y), size=fontSize, family="American Typewriter") +
scale_y_continuous(name="", breaks=yvals, labels=ylabs)
return(gg)
}
## Prepare data
df <- tufte_sort(source_df,
x="year",
y="value",
group="group",
method="tufte",
min.space=0.05)
df <- transform(df,
x=factor(x, levels=c(5,10,15,20),
labels=c("5 years","10 years","15 years","20 years")),
y=round(y))
## Plot
plot_slopegraph(df) + labs(title="Estimates of % survival rates") +
theme(axis.title=element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust=0.5,
family = "American Typewriter",
face="bold"),
axis.text = element_text(family = "American Typewriter",
face="bold"))
The problem I find is that the connecting lines disappears if I take only one object in source_df$group ()in this case Thyroid, like the followings:
If I add only another item in the same column everything is fine and the connecting line appears.
Is there a way to have the lines also in this situation? I have tried on a lot of ways, removing the lines containing NA values without success, therefore I do not know how to fix this problem, if ... it is possible to be fixed.
Thank you in advance for every eventual reply!
May I suggest a much easier way, with the {ggh4x} package, which has a base R type = "b" like geom. You can remove the points, and plot text instead.
You'll get your result in three lines of code :)
library(tidyverse)
library(ggh4x)
source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv")
source_df <- filter(source_df, group == "Thyroid")
ggplot(source_df, aes(year, value)) +
## set shape to NA
geom_pointpath(aes(group = group, mult = 1), shape = NA) +
geom_text(aes(label = value))
Created on 2021-12-30 by the reprex package (v2.0.1)
I fixed simply adding the line:
df <- df[complete.cases(df), ]
Before the graphing instructions. The problem was the generation of many lines with NA values and this line removes lines with null value.

Get multiple density plots of specifics columns for every data frame in a list

I have 60 data frames which have the same columns names store in a list which i got by using this code:
setwd("C:/Users/Visitor/Desktop/Unesco/")
temp = list.files(pattern="*.csv")
myfiles = lapply(temp, read.csv)
A part of one of my csv files :
"","PRO","TRA","MEN","ENF","COU","TOI","REP","SOM","TEL","LOI"
"HAU","610","140","60","10","120","95","115","760","175","315"
"FAU","475","90","250","30","140","120","100","775","115","305"
"FNU","105","0","495","110","170","110","130","785","160","430"
"HMU","616","141","65","10","115","90","115","765","180","305"
"FMU","179","29","421","87","161","112","119","776","143","373"
I did a ggplot of the columns PRO and TRA of all my dataframes merge in one dataframe with this :
library(dplyr)
library(reshape2)
library(ggplot2)
cols <-lapply(myfiles,function(x)select(x,PRO,TRA))
big_df2 <- do.call(rbind,cols)
df.m2 <- melt(big_df2)
ggplot(df.m2) + geom_freqpoly(aes(x = value,
+ y = ..density.., colour = variable))
I have this output:
But I want the same plot for all my data frames separately but i can't seem to find how to do that.
Maybe I can do something with this loop over my list :
for(i in 1:length(myfiles)){
myfiles[[i]]$df_num <- i
}
Try this example:
library(ggplot2)
library(dplyr)
library(reshape2)
# dummy data
set.seed(1)
df1 <- cars[sample(seq(nrow(cars)), 10), ]
df2 <- cars[sample(seq(nrow(cars)), 10), ]
df3 <- cars[sample(seq(nrow(cars)), 10), ]
mylist <- list(df1, df2, df3) #this is similar example of your "myfiles"
mylist <- lapply(mylist, melt)
# merge them with an id column
df_merged <- bind_rows(mylist, .id = "df_id")
# Option 1: One plot as facets, 1 file 1 page
ggsave("option1.PDF",
ggplot(df_merged, aes(value, col = variable)) + geom_freqpoly() +
facet_grid(.~df_id)
)
# Option 2: plot into files - output 3 PDF files
for(i in unique(df_merged$df_id)){
filePDF <- paste0("option2_", i, ".PDF")
myPlot <- df_merged %>%
filter(df_id == i) %>%
ggplot(aes(value, col = variable)) + geom_freqpoly()
ggsave(filePDF, myPlot)
}
# Option 3: plot into one file 3 pages
pdf("option3.PDF")
for(i in seq(length(df_merged))){
plotDat <- df_merged %>%
filter(df_id == i)
myPlot <- ggplot(plotDat, aes(value, col = variable)) + geom_freqpoly()
print(myPlot)
}
dev.off()

Combining uneven numbers of plots with a for loop and grid.arrange

I have hundreds of plots of that I have made with a previous for loop that are named in a consistent manner. The plots are the results of several tasks nested within domains.
I would like to use grid.arrange to plot all of the tasks of a defined domain on the same plot.
Dummy data that explains the structure of my data and plots:
domain_key <- data.frame(domain = c("soe", "soe", "soe", "elit", "elit"),
tasks = c("personal", "size", "shapeid", "onetoone", "puzzle"))
dummy <- ggplot(data.frame()) + geom_point() + xlim(0, 10) + ylim(0, 100)
plot.personalpct <- dummy + ggtitle("plot.personalpct")
plot.sizepct <- dummy + ggtitle("plot.sizepct")
plot.shapeidpct <- dummy + ggtitle("plot.shapeidpct")
plot.onetoonepct <- dummy + ggtitle("plot.onetoonepct")
plot.puzzlepct <- dummy + ggtitle("plot.puzzlepct")
And here is my basic idea of how to do it:
for(j in domain_key$domain){
tasks <- unique(with(domain_key, tasks[domain == j])) #Get a list of the unique tasks for the domain
plots <- paste("plot.", tasks, "pct", sep ="") #Get the name of the plots as a character vector
grid.arrange(eval(parse(text = plots))) #evaluate the expression with grid arrange to display all the plots
}
My problem is that the final argument only displays the first plot of each domain. This is because my character vector doesn't parse as mulitple objects, probably because they are not separated by a comma. I've tried a bunch of workarounds, but can't figure out a way around this. Or maybe my approach is totally off.
Much appreciate any assistance.
Maybe this helps
library(ggplot2)
library(gridExtra)
dummy_plot <- function(id, fill) ggplot() + ggtitle(id) +
theme(panel.background = element_rect(fill = fill))
pl = list(`domain 1` = lapply(1:3, dummy_plot, fill = "#FBB4AE"),
`domain 2` = lapply(1:2, dummy_plot, fill = "#B3CDE3"),
`domain 3` = lapply(1:4, dummy_plot, fill = "#CCEBC5"),
`domain 4` = lapply(1:5, dummy_plot, fill = "#DECBE4"))
dummy_group <- function(domain) arrangeGrob(grobs = pl[[domain]], top = domain)
grid.arrange(grobs = lapply(names(pl), dummy_group))
Thanks for the help of #baptiste and #lukeA. Both of their suggestions didn't quite answer my question, but helped put me on the right path.
I realized that I really needed to feed grid.arrange a list, so I found a reasonable, but not terribly elegant solution. I embedded the domain name in the plot name for each task. Then I fed grid.arrage and list of plots using grep with the domain name. This works reasonably well.
This has a bit too much detail but illustrates my solution:
### Graph means of tasks by treatment group with error bars
pltList <- list() #Create and empty list to store task plots
for(outcome in unique(domain_key$task) ){
df <- data.frame(Intervention.Group = factor(unique(children$treatment)),
Percent.Correct = eval(parse(text = paste0("meansbytreatment$", outcome, "$estimates"))),
SE = eval(parse(text = paste0("meansbytreatment$", outcome, "$se"))))
df$upper <- df$Percent.Correct + 2*df$SE
df$lower <- df$Percent.Correct - 2*df$SE #Make a temp df with with the estimates and errors for each treatment group
domain <- unique( domain_key$domain[domain_key$task == outcome] ) #extract the domain that the task belongs to
pltName <- paste( "plot", outcome, domain, sep = "." ) #Make a unique plot name with task and domain
pltList[[ pltName ]] <- ggplot(df, aes(Intervention.Group, Percent.Correct, fill = Intervention.Group)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.4) +
ggtitle(paste(outcome, "by Intervention Group") )+
theme(legend.position="none") #Save the plot to the plot list
}
### Graph domain subtasks together
Domainplt <- lst() #Make an empty list to store plots of all the taks in a domain
for(j in unique(domain_key$domain)){
plots <- grep(j, names(pltList)) #get all of the plots from the domain
nplots <- length(plots) # get the n of plots for the domain
Domainplt[[j]] <- grid.arrange(grobs = pltList[plots],
ncol = nplots,
top = j) #Arrange all of the plots horizontally
}

Custom scatterplot matrix using facet_grid in ggplot2

I'm trying to write a custom scatterplot matrix function in ggplot2 using facet_grid. My data have two categorical variables and one numeric variable.
I'd like to facet (make the scatterplot rows/cols) according to one of the categorical variables and change the plotting symbol according to the other categorical.
I do so by first constructing a larger dataset that includes all combinations (combs) of the categorical variable from which I'm creating the scatterplot panels.
My questions are:
How to use geom_rect to white-out the diagonal and upper panels in facet_grid (I can only make the middle ones black so far)?
How can you move the titles of the facets to the bottom and left hand sides respectively?
How does one remove tick axes and labels for the top left and bottom right facets?
Thanks in advance.
require(ggplot2)
# Data
nC <- 5
nM <- 4
dat <- data.frame(
Control = rep(LETTERS[1:nC], nM),
measure = rep(letters[1:nM], each = nC),
value = runif(nC*nM))
# Change factors to characters
dat <- within(dat, {
Control <- as.character(Control)
measure <- as.character(measure)
})
# Check, lapply(dat, class)
# Define scatterplot() function
scatterplotmatrix <- function(data,...){
controls <- with(data, unique(Control))
measures <- with(data, unique(measure))
combs <- expand.grid(1:length(controls), 1:length(measures), 1:length(measures))
# Add columns for values
combs$value1 = 1
combs$value2 = 0
for ( i in 1:NROW(combs)){
combs[i, "value1"] <- subset(data, subset = Control==controls[combs[i,1]] & measure == measures[combs[i,2]], select = value)
combs[i, "value2"] <- subset(data, subset = Control==controls[combs[i,1]] & measure == measures[combs[i,3]], select = value)
}
for ( i in 1:NROW(combs)){
combs[i,"Control"] <- controls[combs[i,1]]
combs[i,"Measure1"] <- measures[combs[i,2]]
combs[i,"Measure2"] <- measures[combs[i,3]]
}
# Final pairs plot
plt <- ggplot(combs, aes(x = value1, y = value2, shape = Control)) +
geom_point(size = 8, colour = "#F8766D") +
facet_grid(Measure2 ~ Measure1) +
ylab("") +
xlab("") +
scale_x_continuous(breaks = c(0,0.5,1), labels = c("0", "0.5", "1"), limits = c(-0.05, 1.05)) +
scale_y_continuous(breaks = c(0,0.5,1), labels = c("0", "0.5", "1"), limits = c(-0.05, 1.05)) +
geom_rect(data = subset(combs, subset = Measure1 == Measure2), colour='white', xmin = -Inf, xmax = Inf,ymin = -Inf,ymax = Inf)
return(plt)
}
# Call
plt1 <- scatterplotmatrix(dat)
plt1
I'm not aware of a way to move the panel strips (the labels) to the bottom or left. Also, it's not possible to format the individual panels separately (e.g., turn off the tick marks for just one facet). So if you really need these features, you will probably have to use something other than, or in addition to ggplot. You should really look into GGally, although I've never had much success with it.
As far as leaving some of the panels blank, here is a way.
nC <- 5; nM <- 4
set.seed(1) # for reproducible example
dat <- data.frame(Control = rep(LETTERS[1:nC], nM),
measure = rep(letters[1:nM], each = nC),
value = runif(nC*nM))
scatterplotmatrix <- function(data,...){
require(ggplot2)
require(data.table)
require(plyr) # for .(...)
DT <- data.table(data,key="Control")
gg <- DT[DT,allow.cartesian=T]
setnames(gg,c("Control","H","x","V","y"))
fmt <- function(x) format(x,nsmall=1)
plt <- ggplot(gg, aes(x,y,shape = Control)) +
geom_point(subset=.(as.numeric(H)<as.numeric(V)),size=5, colour="#F8766D") +
facet_grid(V ~ H) +
ylab("") + xlab("") +
scale_x_continuous(breaks=c(0,0.5,1), labels=fmt, limits=c(-0.05, 1.05)) +
scale_y_continuous(breaks=c(0,0.5,1), labels=fmt, limits=c(-0.05, 1.05))
return(plt)
}
scatterplotmatrix(dat)
The main feature of this is the use of subset=.(as.numeric(H)<as.numeric(V)) in the call to geom_point(...). This subsets the dataset so you only get a point layer when the condition is met, e.g. in facets where is.numeric(H)<is.numeric(V). This works because I've left the H and V columns as factors and is.numeric(...) operating on a factor returns the levels, not the names.
The rest is just a more compact (and much faster) way of creating what you called comb.

Error in strsplit(filename, "\\.") : non-character argument in ggplot

I am trying to make the plot with horizontal lines where the data2 and data3 points should be within data1 range. This will give an overlapping lines in different colors but I am getting an error which says:
Error in strsplit(filename, "\\.") : non-character argument
Here is the data and code. Please give me some suggestion.
data1 <- data.frame(Start=c(10),End=c(19))
data2 <- data.frame(Start=c(5),End=c(15))
data3 <- data.frame(Start=c(6),End=c(18))
filter_data2 <- data2[data2$Start >= (data1$Start-(data1$Start/2)) & data2$End <= (data1$End+(data1$End/2)), ]
filter_data3 <- data3[data3$Start >= (data1$Start-(data1$Start/2)) & data3$End <= (data1$End+(data1$End/2)), ]
data1 <- data.frame(rep(1,nrow(data1)),data1)
colnames(data1) <- c("ID","start","end")
data2 <- data.frame(rep(2,nrow(filter_data2)),filter_data2)
colnames(data2) <- c("ID","start","end")
data3 <- data.frame(rep(3,nrow(filter_data3)),filter_data3)
colnames(data3) <- c("ID","start","end")
dat1 <- rbind(data1,data2,data3)
pdf("overlap.pdf")
p <- ggplot(dat1, aes(x=(max(start)-max(start)/2), y = ID, colour=ID))
p <- p + geom_segment(aes(xend =(max(end)+max(end)/2), ystart = ID, yend = ID))
p <- p + scale_colour_brewer(palette = "Set1")
ggsave(p)
There are two problems in your code. If you want to use scale_colour_brewer() then ID values should be set as factor
p <- ggplot(dat1, aes(x=(max(start)-max(start)/2), y = ID, colour=as.factor(ID)))
Next, to save the ggplot2 plot you have two possibilities.
Using ggsave() function you should provide file name and format. In this case function pdf() is unnecessary.
ggsave(plot=p,file="plot.pdf")
Using function pdf(), you should add print(p) and then dev.off(). In this case you don't need ggsave() function.
pdf("overlap.pdf")
print(p)
dev.off()

Resources