ggplot facet grid within a factor - r

Consider data that looks like this
# Simulated example data: 16 items x 3 subjects, measured at two time points.
fitem <- rep(rep(1:16, each = 3), 2)
fsubs <- factor(rep(rep(paste('sub', 1:3, sep = ''), 16), 2))
ftime <- factor(as.character(rep(c('a', 'b'), each = 48)))
fcounts <- as.numeric(round(runif(96, 1, 10)))
fdf <- data.frame(fsubs, fitem, fcounts, ftime)
# Inspect the data frame built above (`df` on its own is a function in stats).
head(fdf)
fsubs fitem fcounts ftime
1 sub1 1 8 a
2 sub2 1 10 a
3 sub3 1 4 a
4 sub1 2 4 a
5 sub2 2 1 a
6 sub3 2 6 a
I would like to plot a facet grid that shows the counts for the two time points ('a','b'), subject-wise. I can't seem to figure out how to plot this in ggplot
here is my ugly attempt to do it
# Build a wide table holding, for every subject, its counts at time 'a' and
# its counts at time 'b' as two separate columns.
fdf_counts<-data.frame()
for (i in unique(fdf$fsubs)){
# Append this subject's 'a' counts and 'b' counts as two further columns.
fdf_counts<-append(fdf_counts,cbind(fdf%>%filter(fsubs==i,ftime=='a')%>%dplyr::select(fcounts),
fdf%>%filter(fsubs==i,ftime=='b')%>%dplyr::select(fcounts)))
# Convert back to a data frame after appending; the plots below refer to the
# resulting columns as fcounts, fcounts.1, ..., fcounts.5.
fdf_counts<-data.frame(fdf_counts)
}
# One scatter plot per subject (time 'a' on x, time 'b' on y) with a linear fit.
s1<-ggplot(fdf_counts,aes(x=fcounts,y=fcounts.1))+geom_point()+geom_smooth(method='lm')+labs(x='a',y='b',title='sub1')
s2<-ggplot(fdf_counts,aes(x=fcounts.2,y=fcounts.3))+geom_point()+geom_smooth(method='lm')+labs(x='a',y='b',title='sub2')
s3<-ggplot(fdf_counts,aes(x=fcounts.4,y=fcounts.5))+geom_point()+geom_smooth(method='lm')+labs(x='a',y='b',title='sub3')
# Arrange the three per-subject plots in a single figure.
plot_grid(s1,s2,s3)#from 'cowplot' package
How can I do this with using the original fdf data.frame? Especially as the # of subs increase
Or, for example, what if I wanted a single scatter plot across all of the subs, with the fcounts plotted against each other: ftime 'a' on the x axis and ftime 'b' on the y axis?

Consider merging the data frame with itself on fsubs and fitem (fitem being the sequential item number within each fsubs and ftime grouping). This approach lets you keep your long, tidy data format, which is the ideal format for ggplot, since you can then use facet_grid on fsubs without any iteration.
# Self-join: pair every subject/item count at time "a" with the count of the
# same subject/item at time "b" (the "b" columns receive the "_" suffix).
mdf <- merge(
  x = subset(fdf, ftime == "a"),
  y = subset(fdf, ftime == "b"),
  by = c("fsubs", "fitem"),
  suffixes = c("", "_")
)

# Scatter of a vs. b with a linear fit, one panel per subject.
ggplot(data = mdf, mapping = aes(x = fcounts, y = fcounts_)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  labs(x = 'a', y = 'b') +
  facet_grid(~fsubs)

This should get you close:
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)

# Fix the seed so the simulated counts, and therefore the plot, are reproducible.
set.seed(1)
fitem <- rep(rep(1:16, each = 3), 2)
fsubs <- factor(rep(rep(paste('sub', 1:3, sep = ''), 16), 2))
ftime <- factor(as.character(rep(c('a', 'b'), each = 48)))
fcounts <- as.numeric(round(runif(96, 1, 10)))
fdf <- tibble(fsubs, fitem, fcounts, ftime)

# Number the rows within each time point so that matching observations share a
# row_id, drop the grouping again, then spread the counts into columns a and b.
# The wide result gets its own name so the long input stays available.
fdf_wide <- fdf %>%
  group_by(ftime) %>%
  mutate(row_id = row_number()) %>%
  ungroup() %>%
  pivot_wider(values_from = fcounts,
              names_from = ftime)

# Scatter of a vs. b with a linear fit, one panel per subject, stacked vertically.
ggplot(data = fdf_wide, aes(x = a, y = b)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(fsubs ~ ., ncol = 1)
The tidyr function pivot_wider allows us to create the shape of the data we need without explicit loops: create new columns a and b with values from fcounts. We do need to create a unique row id to make this work.
By the way, when I run your code the plots look different from what you posted in the question.
With this output:

Just tried to create a visualization that would analyze all 4 variables. Got a geom_histogram
```{r}
library(ggplot2)

# Simulated example data: 16 items x 3 subjects, measured at two time points.
fitem <- rep(rep(1:16, each = 3), 2)
fsubs <- factor(rep(rep(paste('sub', 1:3, sep = ''), 16), 2))
ftime <- factor(as.character(rep(c('a', 'b'), each = 48)))
fcounts <- as.numeric(round(runif(96, 1, 10)))
fdf <- data.frame(fsubs, fitem, fcounts, ftime)

# The wide fdf_counts table built in the question is not needed for this plot,
# so it is not recreated here.

# Dodged bars of counts per subject, coloured by item, one panel per time point.
# Columns are referenced by bare name inside aes(): using `fdf$...` hands ggplot
# the full vectors, which are not split correctly across facets. fitem is
# numeric, so it is also given as the group to make position = "dodge" separate
# the bars by item.
ggplot(
  data = fdf,
  mapping = aes(x = fsubs, y = fcounts, fill = fitem, group = fitem)
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(cols = vars(ftime))
```

Related

How to plot data from two files?

I am trying to plot the 5 first values and the 5 last of my data.
I have two files A and B.
A is ordered, which is why I need the first 5 and the last 5 rows.
I want to plot the intensity of each replicate which are in another file B.
If my names in A are found in B to plot all the values...
File B has the same row names as file A (and more), which is why I want only those in common between A and B; where the names match, I want to plot all the values for each replicate (column) from my file B.
So at the end I'll have 5 plots for 5first and 5 plots for 5last.
I already try something like this but like I said I am not very good with R programming.
Just to begin, I want to see if I can make the link between the two files. This doesn't work.
# Attempt from the question (reported there as not working), kept as posted.
# NOTE(review): `5fisrt` / `5last` begin with a digit, which is not a valid R
# name unless backquoted.
5fisrt <- A[1:5,]
# NOTE(review): `RT` is not defined in this snippet - presumably `A` was meant.
5last <- A[(nrow(RT)-5+1):nrow(A), ]
i <- 0
for (i in 5fisrt)
{
# NOTE(review): `if` needs its condition wrapped in parentheses.
if row.names(5fisrt[i]) == row.names(B[i])
plot <- boxplot(B,aes(B[i]))
print (plot)
}
I'll do the same for 5last afterwards.
How do I solve the problem?
If I understand, you want to make a barplot for each row in B, if the name of that row is the same as the name of one of the first 5 rows or last 5 rows of A.
If you want all 10 plots in one graphic, you can use facet_grid:
library(tidyverse)
n <- nrow(A)
# Rows of B named like the first 5 and last 5 rows of A. Indexing row.names(A)
# directly (rather than subsetting A first) also works when A has one column.
C <- B[row.names(A)[c(1:5, (n - 4):n)], ]
C %>%
  mutate(rowName = row_number()) %>%
  # Long format: one row per (selected row, replicate) pair.
  pivot_longer(-rowName, names_to = 'replicates', values_to = 'intensity') %>%
  ggplot(aes(x = replicates, y = intensity)) +
  geom_col() +
  facet_grid(rowName ~ .)
Note that my answer assumes that there are at least 10 rows in A. The second line of code subsets B as you want, and assigns it to a new object C, just for clarity in the code. I could have just piped that subset of B right into the mutate without defining C.
UPDATE:
Since you don't want to use facet, you could try one of these:
library(tidyverse)
n <- nrow(A)
# Rows of B named like the first 5 and last 5 rows of A.
C <- B[row.names(A)[c(1:5, (n - 4):n)], ]
# Long format: one row per (selected row, replicate) pair.
D <- C %>%
  mutate(rowName = row_number()) %>%
  pivot_longer(-rowName, names_to = 'replicates', values_to = 'intensity')

# Bar chart of the replicate intensities for one selected row of D.
plot_row <- function(i) {
  D %>%
    filter(rowName == i) %>%
    ggplot(aes(x = replicates, y = intensity)) +
    geom_col()
}

# Plot to the RStudio viewer
for (i in unique(D$rowName)) {
  print(plot_row(i))
}

# Save the plots to files
for (i in unique(D$rowName)) {
  ggsave(paste0("myPlot", i, ".png"), plot = plot_row(i))
}

Combine multiple plots, generated using the "by" R function, in one figure

I have a data frame containing multiple numeric columns and one column with different factors. I'd like to produce a unique image containing the plots of the numeric columns, by factor. I tried the following:
# Example data: three numeric variables and a four-level grouping column.
varA <- runif(40)
varB <- runif(40)
varC <- runif(40)
mainVar <- rep(c('cat', 'dof', 'mouse', 'frog'), each = 10)
plotData <- data.frame(varA, varB, varC, mainVar)

pdf('asd.pdf')
par(mfrow = c(2, 2))
# Called once per level of mainVar. The inner par(mfrow = ...) replaces the
# outer 2x2 layout, which is why every group ends up on its own page.
by(plotData, plotData$mainVar, function(x) {
  par(mfrow = c(1, 3))
  boxplot(x$varA)
  boxplot(x$varB)
  boxplot(x$varC)
})
dev.off()
It produces a unique pdf, but with a page for every factor.
Instead, I'd like to get something like that (without the red lines):
First, both techniques shown here prefer data in a "tall" format. There are several tools that will reshape it for you, I'll use
# tidyr is called through its namespace, so no library() call is required.
# Stack every column except mainVar into key/value columns k and v.
plotDataTall <- tidyr::gather(plotData, key = "k", value = "v", -mainVar)
head(plotDataTall)
# mainVar k v
# 1 cat varA 0.4023846
# 2 cat varA 0.3406813
# 3 cat varA 0.7990530
# 4 cat varA 0.3706167
# 5 cat varA 0.5986029
# 6 cat varA 0.1626782
Other tools include the reshape2 package or the stats function reshape, both of which are increasingly less-intuitive to use for first time users.
ggplot2
library(ggplot2)
# One boxplot per variable (k), one panel per mainVar level in two rows,
# with both axis titles removed.
ggplot(data = plotDataTall, mapping = aes(x = k, y = v)) +
  geom_boxplot() +
  facet_wrap(~ mainVar, nrow = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
Base R
Because you cannot nest uses of par(mfrow=...) (they replace, not nest), you can stick with the over-arching 2x2 layout and handle the per-variable boxplots within a single boxplot call. This can be done with the wide data:
# Single 2x2 layout; each group draws its three variables in one boxplot() call.
# The result is assigned to `ign` so the `by` object is not printed.
par(mfrow = c(2, 2), mar = c(4, 3, 3, 0.1))
ign <- by(
  plotData,
  plotData$mainVar,
  function(grp) {
    boxplot(grp$varA, grp$varB, grp$varC, main = grp$mainVar[1])
  }
)
or the tall format:
# Same 2x2 layout, using the tall data: the formula v ~ k gives one box per variable.
# The result is assigned to `ign` so the `by` object is not printed.
par(mfrow = c(2, 2), mar = c(4, 3, 3, 0.1))
ign <- by(
  plotDataTall,
  plotDataTall$mainVar,
  function(grp) {
    boxplot(v ~ k, data = grp, main = grp$mainVar[1])
  }
)
(I took the liberty of adjusting the margins, primarily for a shrunken combined plot here. Not required for production.)
As r2evans already points out, I doubt this is possible with the base plot function. Using ggplot2 (part of the tidyverse) you can get a one-page plot using:
library(tidyverse)
# Stack all columns except mainVar into (var, y), then draw one boxplot per
# variable with one panel per mainVar level.
plotData %>%
  gather(key = "var", value = "y", -mainVar) %>%
  ggplot(mapping = aes(x = var, y = y)) +
  geom_boxplot() +
  facet_wrap(~mainVar)
note that this also uses the pipe operator (dplyr), and gather (tidyr) both part of tidyverse

Stacked bar graph with only top/bottom results

I have an Excel file and am trying to create a bar chart that groups categories and shows the average rating of the category. Because there are a lot of categories, I'd also like to only show either the top 10 or bottom 10 in the resulting horizontal bar chart.
category rating
A 10
A 8
A 9
B 1
B 4
B 9
C 6
C 7
D 9
Something like this (representative bar instead of the numbers):
A 9
D 9
...
C 6.5
B 4.66
I know this seems super simple to do, but I can't seem to be able to get anything working after trying various answers around here. Using ggplot2 seems to be the most promising so far. Closest I've gotten is showing the number of ratings for each category...
Edit: didn't save the work I did earlier as it wasn't the result I wanted, but it was something like this (didn't use ggplot)
# Sketch of the earlier attempt (no ggplot): tabulate ratings by category and
# draw the counts as horizontal bars.
dat[, c(1, 12)]
category <- dat[, 1] # selecting column from sheet
rating <- dat[, 12] # selecting column from sheet
rating <- as.numeric(unlist(dat[, 12]))
dat <- table(dat$rating, dat$category)
# legend.text is spelled out in full rather than relying on partial matching.
barplot(dat, main = "Overall Ratings",
        xlab = "Ratings", col = c("skyblue", "red"), horiz = TRUE,
        legend.text = rownames(dat))
Here's a chaining solution using dplyr and tidyr. First, we need to load the data.
library(dplyr)
library(tidyr)
library(ggplot2)
# Example data from the question, parsed from an inline comma-separated string
# whose first line is the header.
df <- read.table(
  header = TRUE,
  sep = ",",
  text = "category,rating
A,10
A,8
A,9
B,1
B,4
B,9
C,6
C,7
D,9
"
)
Now to the solution. After grouping the data by category, we calculate each category's mean rating.
# Mean rating per category.
by_category <- group_by(df, category)
means.df <- summarise(by_category, mean = mean(rating))
top_n selects the top (positive number) or bottom (negative number) n rows from a dataset. We apply this to our dataset with means. In your real data, adjust the 2 to 10 for the top and to -10 for the bottom 10 categories.
# Keep the 2 categories with the highest mean rating and draw them as bars.
means.df %>%
  top_n(n = 2, wt = mean) %>%
  ggplot(mapping = aes(x = category, y = mean)) +
  geom_col()
The following code plots the top/bottom cutoff_number categories into one plot. Adjust the variable cutoff_number as needed.
# Number of categories to show at each end of the ranking.
cutoff_number <- 2
means.df %>%
  arrange(desc(mean)) %>%
  mutate(
    # "bottom" is tested first so it wins if the two ranges overlap, as before.
    topbottom = case_when(
      row_number() > n() - cutoff_number ~ "bottom",
      row_number() <= cutoff_number ~ "top"
    )
  ) %>%
  # Categories in neither group have topbottom = NA; drop them so that they do
  # not show up as an extra "NA" panel.
  filter(!is.na(topbottom)) %>%
  ggplot(aes(x = category, y = mean)) +
  geom_col() +
  facet_wrap(~topbottom, scales = 'free_x')
This solution uses data.table to summarize the data, then delivers the result to ggplot:
library(data.table)
library(ggplot2)

category <- c("A", "A", "A", "B", "B", "B", "C", "C", "D")
rating <- c(10, 9, 8, 1, 4, 9, 6, 7, 9)
# Build the table column-wise: cbind() would turn both columns into character
# and force a conversion back to numeric before averaging.
dt <- data.table(category, rating)
# Mean rating per category, then one bar per category.
mean_by_category <- dt[, .(V1 = mean(rating)), by = category]
ggplot(mean_by_category, aes(category, V1)) +
  geom_col() +
  ylab("Mean")

stop printing na in plot

I am plotting a categorical variable. There are no NAs, but the plot comes up with an NA bar with no observations in it. How do I stop this NA bar from being printed in my plot?
# Bar chart of cat.hour with the categories ordered from "1 min" to "10 min".
ggplot(
  data = data.frame(cat.hour),
  mapping = aes(
    x = factor(
      cat.hour,
      levels = c("1 min", "2 min", "3 min", "4 min", "5 min",
                 "6 min", "7 min", "8 min", "9 min", "10 min")
    )
  )
) +
  geom_bar(fill = "lightgreen")
As you can see in call unique(cat.hour) there is an NA in your data.
Filtering for non-NA values would be a simple workaround for your plot. However, I encourage you to perform your calculations outside of the plot.
# library() stops with an error if a package is missing; require() only warns.
library(ggplot2)
library(dplyr)
library(magrittr)

# Convert to a factor first and drop missing values afterwards: any value that
# is not listed in `levels` becomes NA during the conversion, so removing NAs
# beforehand would leave those behind and the NA bar would still be drawn.
cat.hour <- cat.hour %>%
  mutate(
    variable = factor(
      variable,
      levels = c("1 min", "2 min", "3 min", "4 min", "5 min",
                 "6 min", "7 min", "8 min", "9 min", "10 min")
    )
  ) %>%
  filter(!is.na(variable))

g <- ggplot(cat.hour, aes(x = variable))
g <- g + geom_bar(fill = "lightgreen")
g

R stacked bar charts including "other" (using ggplot2)

I want to make a stacked barchart that describes abundances of taxa at two locations in three different seasons. I'm using ggplot2. Making the plot is ok, but I have 48 taxa so I end up with a lot of different colours in the bar. There are only eight taxa that occur frequently and abundantly, so I'd like to group the others into "Other" for the plot.
My data looks like this:
SampleID TransectID SampleYear Season Location Taxa1 Taxa2 Taxa3 .... Taxa48
BW15001 1 2015 fall SiteA 25 0 0 0
BW15001 2 2015 fall SiteA 32 0 0 2
BW15001 2 2015 fall SiteA 6 0 45 0
BW15001 3 2015 fall SiteA 78 1 2 0
This is what I have tried (modified from here):
# Per-sample totals over the taxa columns, then counts converted to proportions.
# NOTE(review): 6:54 selects 49 columns - confirm it matches the taxa columns of invert.
y <- rowSums(invert[6:54])
x <- invert[6:54] / y
# Order the taxa by decreasing total proportion. The ordering is computed from
# the columns of x, so it has to be applied to x (not to invert, whose column
# positions are shifted by the leading metadata columns).
x <- x[, order(-colSums(x)), drop = FALSE]
#Extract list of top N Taxa
N <- 8
taxa_list <- colnames(x)[1:N]
#remove "__Unknown__" and add it to others
taxa_list <- taxa_list[!grepl("Unknown", taxa_list)]
N <- length(taxa_list)
#Generate a new table with everything added to Others
# drop = FALSE keeps both selections as data frames even if only one column matches.
is_top <- colnames(x) %in% taxa_list
new_x <- data.frame(x[, is_top, drop = FALSE],
                    Others = rowSums(x[, !is_top, drop = FALSE]))
# Stack the columns of new_x into long format (one row per sample and taxon),
# building all pieces first and binding them once.
# NOTE(review): grouping_info is not defined in this snippet.
long_parts <- lapply(seq_len(ncol(new_x)), function(i) {
  data.frame(row.names = NULL, Sample = rownames(new_x),
             Taxa = rep(colnames(new_x)[i], nrow(new_x)),
             Value = new_x[, i], Type = grouping_info[, 1])
})
df <- do.call(rbind, long_parts)
To plot the graph:
# Palette; only the first N + 1 entries are used (top taxa plus "Others").
colours <- c("#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#2BCE48", "#FFCC99",
             "#808080", "#94FFB5", "#8F7C00", "#9DCC00", "#C20088", "#003380",
             "#FFA405", "#FFA8BB", "#426600", "#FF0010", "#5EF1F2", "#00998F",
             "#740AFF", "#990000", "#FFFF00")
library(ggplot2)
# Stacked bars of proportions per sample, one panel per Type.
# `scales` is spelled out in full instead of relying on partial matching of `scale`.
p <- ggplot(df, aes(Sample, Value, fill = Taxa)) +
  geom_bar(stat = "identity") +
  facet_grid(. ~ Type, drop = TRUE, scales = "free", space = "free_x") +
  scale_fill_manual(values = colours[1:(N + 1)]) +
  theme_bw() +
  ylab("Proportions") +
  scale_y_continuous(expand = c(0, 0)) +
  theme(
    strip.background = element_rect(fill = "gray85"),
    panel.spacing = unit(0.3, "lines"),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )
p
The main problem that I would like help with today is pulling out the main taxa and lumping the rest as "Other". I think I can figure out how to group the graph by Season and Location using facet_grid() later...
Thanks!
Expanding on my comment. Take a look at the forcats package. Without a full example, it's hard to say, but the following should work:
library(tidyverse)
library(forcats)
# Long format: every column after the first five becomes a (taxa, amount) pair.
temp <- df %>%
  pivot_longer(-c(1:5), names_to = "taxa", values_to = "amount")
# Reshape the data so that there is one record per unit of amount;
# uncount() repeats each row `amount` times and then drops the amount column.
tidy_df <- temp %>%
  uncount(amount)
tidy_df %>%
  mutate(taxa = fct_lump(taxa, n = 2)) %>% # Check out this line
  ggplot(., aes(x = SampleID, fill = taxa)) +
  geom_bar()
You can change fct_lump(taxa, n = 2) to fct_lump(taxa, n = 8) to group the top 8 categories. Alternatively, you can use fct_lump(taxa, prop = 0.9) to lump things up by proportions.
If you are simply going after the "presence" of the taxa in a sample (and not the value or amount), things are a bit simpler and can likely be handled in one pipe:
df %>%
  pivot_longer(-c(1:5), names_to = "taxa", values_to = "amount") %>%
  # Keep only taxa that are present in a sample. filter() also drops rows whose
  # amount is NA, without discarding rows that have an NA in some other column.
  filter(amount != 0) %>%
  mutate(taxa = fct_lump(taxa, n = 2)) %>%
  ggplot(., aes(x = SampleID, fill = taxa)) +
  geom_bar()
One way of doing it:
library(plyr)
library(reshape2) # provides melt()
library(ggplot2)
d <- data.frame(SampleID = rep('BW15001', 4),
                TransectID = c(1, 2, 2, 3),
                SampleYear = rep(2015, 4),
                Taxa1 = c(25, 32, 6, 78),
                Taxa2 = c(0, 0, 0, 1),
                Taxa3 = c(0, 0, 45, 3))
#Reshape the df so that all taxa columns are melted into two
d <- melt(d, id.vars = colnames(d[, 1:3]))
d$variable <- as.character(d$variable)
# rename all uninteresting taxa as 'other'
`%ni%` <- Negate(`%in%`) # Here I decided to select the ones to keep, but the other way around is fine as well of course
d[d$variable %ni% c('Taxa1', 'Taxa2'), 'variable'] <- 'Other' # here you could add a function to automatically determine which taxa you want to keep, as you already did
# aggregate all data for 'other'
d <- ddply(d, colnames(d[, 1:4]), summarise, value = sum(value))
#make your plot, this one is just a bad example
# d has no Type column, so the facet over Type used in the question is left out
# here; add facet_grid() back once the data carries such a grouping column.
ggplot(d, aes(SampleID, value, fill = variable)) +
  geom_bar(stat = "identity")

Resources