plot pearson correlation for the two datasets - r

I want to plot a correlation plot between two datasets. y axis is frequency of genes and x axis is correlation value or R2 value.
gvds_counts = read.delim("GVDS_normalized_counts_new.txt", header = TRUE, sep="\t",row.names = 1)
gvds_elastic = (read.delim("GVDS_PrediXcan_Test_2021_new.txt", header = TRUE, sep="\t",row.names = 1))
gvds_elastic= gvds_elastic[,-1]
gvds_elastic1=t(gvds_elastic)
This is how my two datasets look like:
dput(gvds_elastic1[1:5,1:5])
structure(c(0.117057915232398, 0, 0.16739607781207, -0.00582814885591799,
-0.00522232324665859, 0.196331738115648, 0, -0.00712521269403638,
0, -0.00522232324665859, 0.196331738115648, -0.021754984214482,
-0.00356260634701819, -0.0210845683319449, -0.00522232324665859,
0.177439691941073, -0.021754984214482, 0, -0.0210845683319449,
-0.00522232324665859, 0.158547645766499, -0.0435099684289639,
0, -0.00582814885591799, -0.00522232324665859), .Dim = c(5L,
5L), .Dimnames = list(c("ENSG00000107959.15", "ENSG00000057608.16",
"ENSG00000281186.1", "ENSG00000151632.17", "ENSG00000134452.19"
), c("GTEX-1117F", "GTEX-111CU", "GTEX-111FC", "GTEX-111VG",
"GTEX-111YS")))
dput(gvds_counts[1:5,1:5])
structure(list(GTEX.1117F.3226.SM.5N9CT = c(1.07050611836238,
319.010823271988, 0, 0, 0), GTEX.111FC.3126.SM.5GZZ2 = c(0, 137.627503211918,
0.81921132864237, 1.63842265728474, 0), GTEX.1128S.2726.SM.5H12C = c(0.93125973749023,
98.7135321739644, 0, 0.93125973749023, 0.93125973749023), GTEX.117XS.3026.SM.5N9CA = c(0,
140.966661860149, 0, 0.766123162283421, 0.766123162283421), GTEX.1192X.3126.SM.5N9BY = c(0.937426181007344,
139.676500970094, 0, 0.937426181007344, 0)), row.names = c("ENSG00000223972.5",
"ENSG00000227232.5", "ENSG00000278267.1", "ENSG00000243485.5",
"ENSG00000237613.2"), class = "data.frame")
I want to first rename the ID in gvds_counts similar to gvds_elastic1 which is to edit it from
GTEX.1128S.2726.SM.5H12C to GTEX-1128S
and then plot a correlation between these two datasets with x axis as R2 value and y axis as frequency of genes.

Related

How to add environmental variable to DCA plot using ggplot2

I would like to plot an environmental variable on a ggplot2 version of a DCA plot.
I have some code where I extract species and data scores from vegan and then plot them up in ggplot2. I am having trouble trying to work out how I can get my environmental variable SWLI to plot as an arrow - something like this RDA's plots with ggvegan: How can I change text position for arrows text? (or see PCA example here https://www.rpubs.com/an-bui/vegan-cheat-sheet)
Can anybody help?
#DCA Plot
library(plyr)
library(vegan)
library(ggplot2)
library(cluster)
library(ggfortify)
library(factoextra)
#read in csv and remove variables you don't want to go through analysis
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
swli<-read_csv("DCAenv.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
regforamcountsall$Sample = NULL
regforamcountsall$Site=NULL
regforamcountsall$SWLI=NULL
#check csv
regforamcountsall
#run ordination
ord<-decorana(regforamcountsall)
#get species scores
summary(ord)
#get DCA values of environmental variable
ord.fit <- envfit(ord ~ SWLI, data=swli, perm=999)
ord.fit
plot(ord, dis="site")
plot(ord.fit)
#use this summary code to get species scores for DCA1 and DCA2
#put species scores values in from ord plot summary stats
species.scores<-read.csv("speciescores.csv")
species.scores$species <- row.names(species.scores)
#Using the scores function from vegan to extract the sample scores and convert to a data.frame
data.scores <- as.data.frame(scores(ord))
# create a column of groupings/clusters, from the rownames of data.scores
data.scores$endgroup <- as.factor(pam(regforamcountsall, 3)$clustering)
#getting the convex hull of each unique point set
find_hull <- function(df) df[chull(data.scores$DCA1, data.scores$DCA2), ]
hulls <- NULL
for(i in 1:length(unique(data.scores$endgroup))){
endgroup_coords <- data.scores[data.scores$endgroup == i,]
hull_coords <- data.frame(
endgroup_coords[chull(endgroup_coords[endgroup_coords$endgroup == i,]$DCA1,
endgroup_coords[endgroup_coords$endgroup == i,]$DCA2),])
hulls <- rbind(hulls,hull_coords)
}
data.scores$numbers <- 1:length(data.scores$endgroup)
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
data.scores$Site<-regforamcountsall$Site
data.scores$SWLI<-regforamcountsall$SWLI
data.scores
#DCA with species
data.scores$Site <- as.character(data.scores$Site)
library(scico)
dca <- ggplot() +
# add the point markers
geom_point(data=data.scores,aes(x=DCA1,y=DCA2,colour=SWLI,pch=Site),size=4) + geom_point(data=species.scores,aes(x=DCA1,y=DCA2),size=3,pch=3,alpha=0.8,colour="grey22") +
# add the hulls and labels - numbers position labels
geom_polygon(data = hulls,aes(x=DCA1,y=DCA2,fill=endgroup), alpha = 0.25) +
#geom_text(data=data.scores,aes(x=DCA1-0.03,y=DCA2,colour=endgroup, label = numbers))+
geom_text(data=species.scores,aes(x=DCA1+0.1,y=DCA2+0.1, label = species))+
#look this up
geom_segment(data=ord.fit,aes(x = 0, y = 0, xend=DCA1,yend=DCA2), arrow = arrow(length = unit(0.3, "cm")))+
theme_classic()+
scale_color_scico(palette = "lapaz")+
coord_fixed()
dca
#regforamcountsall data
structure(list(Sample = c("T3LB7.008", "T3LB7.18", "T3LB7.303",
"WAP 0 ST-2", "T3LB7.5", "LG120"), T.salsa = c(86.63793102, 68.5897436,
70.39274924, 5.199999999, 79.15057916, 44.40000001), H.wilberti = c(0,
0, 0, 0, 0.386100386, 9.399999998), Textularia = c(0, 0, 0, 0,
0, 0.4), T.irregularis = c(2.155172414, 10.25641026, 7.854984897,
0, 2.702702703, 0), P.ipohalina = c(0, 0, 0, 0, 0, 0), J.macrescens = c(4.741379311,
5.769230769, 4.833836859, 5.800000001, 8.108108107, 5.400000001
), T.inflata = c(6.465517244, 15.38461538, 16.918429, 83.2, 5.791505794,
40.4), S.lobata = c(0, 0, 0, 2.300000001, 0, 0), M.fusca = c(0,
0, 0, 3.499999999, 3.861003862, 0), A.agglutinans = c(0, 0, 0,
0, 0, 0), A.exiguus = c(0, 0, 0, 0, 0, 0), A.subcatenulatus = c(0,
0, 0, 0, 0, 0), P.hyperhalina = c(0, 0, 0, 0, 0, 0), SWLI = c(200,
197.799175, 194.497937, 192.034776, 191.746905, 190.397351),
Site = c("LSP", "LSP", "LSP", "WAP", "LSP", "LG")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#data.scores
structure(list(DCA1 = c(-1.88587476921648, -1.58550534382589,
-1.59816311314591, -0.0851161831632892, -1.69080448670088, -1.14488987340879
), DCA2 = c(0.320139736602921, 0.226662031865046, 0.230912045301637,
-0.0531232712001122, 0.272143119753744, 0.0696939776869396),
DCA3 = c(-0.755595015095353, -0.721144380683279, -0.675071834919103,
0.402339366526422, -0.731006052784081, 0.00474996849420783
), DCA4 = c(-1.10780013276303, -0.924265835490466, -0.957711953532202,
-0.434438970032073, -0.957873836258657, -0.508347000558056
), endgroup = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), numbers = 1:6, Site = c("LSP",
"LSP", "LSP", "WAP", "LSP", "LG"), SWLI = c(200, 197.799175,
194.497937, 192.034776, 191.746905, 190.397351)), row.names = c(NA,
6L), class = "data.frame")
#species.scores
structure(list(species = c("1", "2", "3", "4", "5", "6"), DCA1 = c(-2.13,
-1.6996, -2.0172, -0.9689, 1.0372, -0.3224), DCA2 = c(0.342,
-0.8114, 0.3467, -0.3454, 2.0007, 0.9147)), row.names = c(NA,
6L), class = "data.frame")

Create faceted xy scatters using vectors of column names in R

I have two character vectors of equal length; where position one in vector.x matches position one in vector.y and so on. The elements refer to column names in a data frame (wide format). I would like to somehow loop through these vectors to produce xy scatter graphs for each pair in the vector, preferably in a faceted plot. Here is a (hopefully) reproducible example. To be clear, with this example, I would end up with 10 scatter graphs.
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
structure(list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176,
0.0108639420589757), Marinobacter = c(0, 0.00219023779724656,
0, 0.00137867647058824, 0.00310398344542162), Neptuniibacter = c(0.00945829750644884,
0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393
), Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856
), Pseudomonas = c(0.00466773123694878, 0.00782227784730914,
0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856
), Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529,
0.147697878944646), unclassified_GpIIa = c(0, 0.000730079265748853,
0, 0.003125, 0.00103466114847387), unclassified_Porticoccus = c(0,
0, 0, 0.00119485294117647, 0.00569063631660631), Aplanochytrium = c(0,
0, 0, 0.000700770847932726, 0.0315839846865529), Bathycoccus = c(0.000388802488335925,
0, 0, 0.0227750525578136, 0.00526399744775881), Brockmanniella = c(0,
0.00383141762452107, 0, 0.000875963559915907, 0), Caecitellus_paraparvulus = c(0,
0, 0, 0.000875963559915907, 0.00797575370872547)), row.names = c("B11",
"B13", "B22", "DI5", "FF6"), class = "data.frame")
As Rui Barradas shows, it's possible to get a very nice plot from ggplot and gridExta. If you wanted to stick to base R, here's how you'd do that (assuming your data set is called df1):
# set plot sizes
par(mfcol = c(floor(sqrt(length(vector.x))), ceiling(sqrt(length(vector.x)))))
# loop through plots
for (i in 1:length(vector.x)) {
plot(df1[[vector.x[i]]], df1[[vector.y[i]]], xlab = vector.x[i], ylab = vector.y[i])
}
# reset plot size
par(mfcol = c(1,1))
This is a bit long and convoluted but it works.
library(tidyverse)
library(gridExtra)
df_list <- apply(data.frame(vector.x, vector.y), 1, function(x){
DF <- df1[which(names(df1) %in% x)]
i <- which(names(DF) %in% vector.x)
if(i == 2) DF[2:1] else DF
})
gg_list <- lapply(df_list, function(DF){
ggplot(DF, aes(x = get(names(DF)[1]), y = get(names(DF)[2]))) +
geom_point() +
xlab(label = names(DF)[1]) +
ylab(label = names(DF)[2])
})
g <- do.call(grid.arrange, gg_list)
g
Not too elegant, but should get you going:
vector.x <- c("Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Aplanochytrium", "Bathycoccus", "Brockmanniella", "Brockmanniella", "Caecitellus_paraparvulus", "Caecitellus_paraparvulus")
vector.y <- c("Aliiroseovarius", "Neptuniibacter", "Pseudofulvibacter", "Thalassobius", "unclassified_Porticoccus", "Tenacibaculum", "Pseudomonas", "unclassified_GpIIa", "Marinobacter", "Thalassobius")
df1 = structure(
list(Aliiroseovarius = c(0, 0, 0, 0.00487132352941176, 0.0108639420589757),
Marinobacter = c(0, 0.00219023779724656, 0, 0.00137867647058824, 0.00310398344542162),
Neptuniibacter = c(0.00945829750644884, 0.00959532749269921, 0.0171310629514964, 0.2796875, 0.345835488877393),
Pseudofulvibacter = c(0, 0, 0, 0.00284926470588235, 0.00362131401965856),
Pseudomonas = c(0.00466773123694878, 0.00782227784730914, 0.0282765737874097, 0.00707720588235294, 0.00400931195033627),
Tenacibaculum = c(0, 0, 0, 0.00505514705882353, 0.00362131401965856),
Thalassobius = c(0, 0.00166875260742595, 0, 0.0633272058823529, 0.147697878944646),
unclassified_GpIIa = c(0, 0.000730079265748853, 0, 0.003125, 0.00103466114847387),
unclassified_Porticoccus = c(0, 0, 0, 0.00119485294117647, 0.00569063631660631),
Aplanochytrium = c(0, 0, 0, 0.000700770847932726, 0.0315839846865529),
Bathycoccus = c(0.000388802488335925, 0, 0, 0.0227750525578136, 0.00526399744775881),
Brockmanniella = c(0, 0.00383141762452107, 0, 0.000875963559915907, 0),
Caecitellus_paraparvulus = c(0, 0, 0, 0.000875963559915907, 0.00797575370872547)),
row.names = c("B11", "B13", "B22", "DI5", "FF6"),
class = "data.frame"
)
df2 = NULL
for(i in 1:10) {
df.tmp = data.frame(
plot = paste0(vector.x[i], ":", vector.y[i]),
x = df1[[vector.x[i]]],
y = df1[[vector.y[i]]]
)
if(is.null(df2)) df2=df.tmp else df2 = rbind(df2, df.tmp)
}
ggplot(data=df2, aes(x, y)) +
geom_point() +
facet_grid(cols = vars(plot))

Is there a way to count occurrences of a specific value for unique columns in a dataframe in R?

I am relatively new to R and have a dataframe (cn_data2) with several duplicated columns. It looks something like this:
Gene breast_cancer breast_cancer breast_cancer lung_cancer lung_cancer
myc 1 0 1 1 2
ARID1A 0 2 1 1 0
Essentially, the rows are genes and the columns are different types of cancers. What I want is to find for each gene the number of times, a value (0,1,or 2) occurs for each unique cancer type.
I have tried several things but haven't been able to achieve what I want. For example, cn_data2$count1 <- rowSums(cn_data == '1') gives me a column with the number of "1" for each gene but what I want the number of "1" for each individual disease.
Hope my question is clear!I appreciate any help, thank you!
structure(list(gene1 = structure(1:6, .Label = c("ACAP3", "ACTRT2",
"AGRN", "ANKRD65", "ATAD3A", "ATAD3B"), class = "factor"), glioblastoma_multiforme_Primary_Tumor = c(0,
0, 0, 0, 0, 0), glioblastoma_multiforme_Primary_Tumor.1 = c(-1,
-1, -1, -1, -1, -1), glioblastoma_multiforme_Primary_Tumor.2 = c(0,
0, 0, 0, 0, 0), glioblastoma_multiforme_Primary_Tumor.3 = c(2,
2, 2, 2, 2, 2), glioblastoma_multiforme_Primary_Tumor.4 = c(0,
0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA, 6L))

Can I convert a certain string upon reading with fread in R data.table?

I am trying to use R's data.table package to read a large data set (~800k rows). The set contains results from a simulation of 1000 scenarios (+ a scenario "0" - so 1,001 scenarios in total) and one of the columns, "ScenId", contains the number of the scenario, e.g. 0,1,2,..
The problem is the program used to output this txt file cannot name scenario 1000 as '1000', but uses 'AAA' instead. The column 'ScenId' thus contains only numbers, apart from the value 'AAA'.
I am trying to find a solution to convert 'AAA' to 1000 preferably within the fread command.
My current workaround is using na.strings = "AAA" in fread and then replacing the NA's with 1000, after reading is complete. This works well because those are the only NA instances in the data set.
However, I was hoping for a quicker / more elegant solution, i.e. to do this within the fread command.
Any help / advice will be much appreciated.
Later edit: an attempt at posting sample data.
structure(list(ScenId = "AAA", SensId = "_", SystemProd = "ZCPP__",
AssumClass = "SPLPSV", ProjPer = 40L, ProjMode = "Annual",
VarName = "belLUL", Description = "(BEL)",
Module = "MLIAB", FormType = "inv", Group = "calc.BEL", Width = 12L,
Decimals = 2L, Scale = "Yes", Value000 = 0, Value001 = 0,
Value002 = 0, Value003 = 0, Value004 = 0, Value005 = 0, Value006 = 0,
Value007 = 0, Value008 = 0, Value009 = 0, Value010 = 0, Value011 = 0,
Value012 = 0, Value013 = 0, Value014 = 0, Value015 = 0, Value016 = 0,
Value017 = 0, Value018 = 0, Value019 = 0, Value020 = 0, Value021 = 0,
Value022 = 0, Value023 = 0, Value024 = 0, Value025 = 0, Value026 = 0,
Value027 = 0, Value028 = 0, Value029 = 0, Value030 = 0, Value031 = 0,
Value032 = 0, Value033 = 0, Value034 = 0, Value035 = 0, Value036 = 0,
Value037 = 0, Value038 = 0, Value039 = 0, Value040 = 0), .Names =("ScenId",
"SensId", "SystemProd", "AssumClass", "ProjPer", "ProjMode",
"VarName", "Description", "Module", "FormType", "Group", "Width",
"Decimals", "Scale", "Value000", "Value001", "Value002", "Value003",
"Value004", "Value005", "Value006", "Value007", "Value008", "Value009",
"Value010", "Value011", "Value012", "Value013", "Value014", "Value015",
"Value016", "Value017", "Value018", "Value019", "Value020", "Value021",
"Value022", "Value023", "Value024", "Value025", "Value026", "Value027",
"Value028", "Value029", "Value030", "Value031", "Value032", "Value033",
"Value034", "Value035", "Value036", "Value037", "Value038", "Value039",
"Value040"), class = c("data.table", "data.frame"), row.names = c(NA,
-1L), .internal.selfref = <pointer: 0x0000000000310788>)
This is just one line of my data set. Hope this makes sense.

Plot correlation matrix with R in specific data range

I have used corrplot package to plot my data-pairs. But all the relationships in my data are positive.
Mydata<-read.csv("./xxxx.csv")
M <-cor(Mydata)
corrplot(M,,col=rev(brewer.pal(n=8, name="RdYlBu")))
Using ggcorr, I also can't find any solution to deal with the issue.
How to generate a user-defined colormap with the corresponding range from 0 to 1?
If you are trying to map the entire range of the colormap to only the positive correlations, you could use col = rep(rev(brewer.pal(n=8, name="RdYlBu")), 2). This repeats the color sequence, and then cl.lim = c(0,1) forces corrplot to use only the 2nd half of the sequence, mapped to the range 0 to 1.
par(xpd=T)
corrplot(M,,'upper',
col = rep(rev(brewer.pal(n=8, name="RdYlBu")), 2),
cl.lim = c(0,1),
mar = c(1, 0, 1, 0))
Some reproducible data
set.seed(12)
x = (1:100)/100
Mydata = data.frame(a=x^runif(1, 0, 50),
b=x^runif(1, 0, 50),
c=x^runif(1, 0, 50),
d=x^runif(1, 0, 50),
e=x^runif(1, 0, 50),
f=x^runif(1, 0, 50),
g=x^runif(1, 0, 50),
h=x^runif(1, 0, 50),
i=x^runif(1, 0, 50))
M = cor(Mydata)

Resources