how to make two networks connected with selected members - r

I have a data like this
df<- structure(list(Core = c("Bestman", "Tetra"), member1 = c("Tera1",
"Brownie1"), member2 = c("Tera2", "Brownie2"), member3 = c("Tera3",
"Brownie3"), member4 = c("Tera4", "Brownie4"), member5 = c("Tera5",
"Brownie5"), member6 = c("", "Brownie6"), member7 = c("", "Brownie7"
)), class = "data.frame", row.names = c(NA, -2L))
I want to connect all the members to their Core. For example, if you look at the first row, you can see there are 5 members; I want to connect each of them to their Core.
The same for the second row
Then I connect both Core together
Here is what I have done
# Attempt so far: stack() puts the member columns in long form, cbind() pairs
# each value with its Core and [-3] drops the `ind` column. table() then counts
# Core x member, and crossprod() turns that into a member x member
# co-occurrence matrix.
# NOTE(review): this links members that share a Core to each other; the Core
# names themselves never become vertices of the graph.
# NOTE(review): graph_from_adjacency_matrix() and E() are igraph functions and
# %>% is the magrittr pipe; no library() call is shown in this snippet.
mydf <- crossprod(table(cbind(df[1], stack(df[-1]))[-3]))
graph_from_adjacency_matrix(mydf, diag = F, weighted = T, mode = "undirected") %>%
plot(edge.width = E(.)$weight)

If I understood correctly, what you want is:
library(igraph)

# One row per Core; empty strings mark unused member slots.
df <- data.frame(
  Core = c("Bestman", "Tetra"),
  member1 = c("Tera1", "Brownie1"),
  member2 = c("Tera2", "Brownie2"),
  member3 = c("Tera3", "Brownie3"),
  member4 = c("Tera4", "Brownie4"),
  member5 = c("Tera5", "Brownie5"),
  member6 = c("", "Brownie6"),
  member7 = c("", "Brownie7")
)

# Build one two-column (core, member) matrix per row. lapply() always returns
# a list, whereas apply() would simplify to a single matrix whenever every row
# has the same number of members, and do.call(rbind, ...) would then fail.
edge_list <- lapply(seq_len(nrow(df)), function(i) {
  values <- as.character(unlist(df[i, ], use.names = FALSE))
  members <- values[-1]
  members <- members[!is.na(members) & members != ""]
  # rep() keeps the pair matrix at zero rows when a core has no members.
  cbind(rep(values[1], length(members)), members, deparse.level = 0)
})

# make_graph() reads vertex names pairwise, so transpose before flattening:
# core, member, core, member, ...
edges <- t(do.call(rbind, edge_list))

# Link every pair of cores to each other (nothing to link with a single core).
core_edges <- if (nrow(df) > 1) combn(df$Core, 2) else c()

g <- make_graph(c(edges, core_edges), directed = FALSE)

# No edge weights are defined on g, so the default edge width is used.
plot(g)
EDIT
To colorize and resize nodes:
# Colour: each column of `in_row` flags the vertices that occur in one row of
# df; which() then gives every vertex the index of the row it belongs to.
in_row <- apply(df, 1, function(members) names(V(g)) %in% members)
V(g)$color <- apply(in_row, 1, which)

# Size: 15 for every vertex, and degree + 15 for the Core vertices.
V(g)$size <- 15
V(g)[df$Core]$size <- degree(g, V(g)[df$Core]) + 15

plot(g)

Related

How can I make the group-by code to call a function from another package faster?

I have below code to compute a meta value using meta package:
# Split `long` into one data frame per (ID, sample) pair and run a
# meta-analysis on each piece, using mclapply() with mc.cores = 10.
# NOTE(review): presumably parallel::mclapply and the dplyr verbs are already
# attached; no library() calls are shown in this snippet.
probMetaControl <- long %>% group_by(ID, sample) %>% group_split() %>% mclapply(mc.cores = 10 ,function(endf){
# Progress output: the ID of the group being processed.
message(endf$ID[1])
# TE, seTE and studlab are given as bare column names of `endf`.
res <- meta::metagen(data = endf, studlab = ID, TE = expression , seTE = sd, sm = "SMD",
n.e = rep(1,nrow(endf)),
method.tau = "REML",
hakn = TRUE,
control = list(maxiter=1000))
# One result row per group: the group keys plus the TE.fixed estimate.
data.frame(
ID = endf$ID[1],
sample = endf$sample[1],
meta.exprs = res$TE.fixed,
stringsAsFactors = F
)
# Stack the per-group rows into a single data frame.
}) %>% do.call(what = rbind) %>% as.data.frame()
the long dataframe has around 800,000 rows. The small part of long dataframe is as:
as.data.table(structure(list(ID = c("h:5982", "h:3310", "h:7849", "h:2978",
"h:7318"), pID = c("X1053_at", "X117_at", "X121_at", "X1255_g_at",
"X1294_at"), sd = c(0.228908614809978, 0.436455554523966, 0.210542866430305,
0.672545478318169, 0.26926204466525), sample = c("A", "B", "A",
"C", "A"), expression = c(6.53920197406645, 6.12380136266864,
8.01553257692446, 4.62636832157394, 7.58222133679378)), row.names = c(NA,
-5L), class = c("data.table", "data.frame")))
At the moment, this code takes 23 mins to run. Is there any way to make it faster?

Is it possible to make it more readable? treemap

I just wanna know how can I make it more readable.
# Market capitalisation per coin; the coin name is stored both in `id` and as
# the row name.
# NOTE(review): the trailing `; df` evaluates an object called `df`, which is
# not defined in this snippet -- `marketcap` was probably intended.
marketcap <- data.frame(Marketcap = c(641899161594, 30552518424, 271028619181,
9277626785, 3986737880, 1202315485,
6049985280, 30722840711),
id = c('Bitcoin', 'Dogecoin', 'Ethereum', 'Litecoin', 'Monero', 'Nem', 'Stellar', 'xrp'),
row.names = c('Bitcoin', 'Dogecoin', 'Ethereum', 'Litecoin', 'Monero', 'Nem',
'Stellar', 'xrp')); df
#install.packages('treemap')
library(treemap)
# Keep only the two columns the treemap needs and drop incomplete rows.
df1 <- na.omit(marketcap[,c('id','Marketcap')])
df1$Marketcap <- as.numeric(round(df1$Marketcap, 0))
# Tile label: coin name, a line break, then "$" and a hand-typed value.
# NOTE(review): the values given to formatC() are hard-coded strings listed in
# row order; they are not computed from the Marketcap column.
df1$formatted_market_cap = paste0(df1$id, '\n', '$', formatC(c("642","30.5","271","9.3","4","1.2","6.044","30.7"), format = "e", digits = 2))
# Tile area follows Marketcap; tiles are labelled with the text built above.
# NOTE(review): fontsize.labels has two values but `index` names one column --
# confirm which size is actually applied to the labels.
treemap(df1, index = 'formatted_market_cap', vSize = 'Marketcap', title = 'Cryptocurrency Market Cap (bn)', fontsize.labels=c(15, 4), palette='Set3')
For example, Nem is looking poor

Getting Duplicate Labeled Points on Scatterplot in R

I am trying to use kmeans to show what states have similar statistics with one another from the Lahman database, my code is as follows:
# Batting rows from 1999 onwards, joined to People on playerID.
battingInfo <- Batting %>% filter(yearID >= 1999)
total <- merge(battingInfo,People,by="playerID")
# Drop columns by position.
# NOTE(review): these positions depend on the column order merge() produced.
totalN <- total[,-c(24,25,28:47)]
# Row count per birthState. length(playerID) counts rows in the group, not
# distinct players.
filterByState <- totalN %>% group_by(birthState) %>% summarise(players = length(playerID))
newMerge <- merge(totalN, filterByState, by="birthState")
# Per-state totals of the batting columns, ignoring NA values.
# NOTE(review): newTest is not used by the rest of the code shown here.
newTest <- newMerge %>% group_by(birthState) %>% summarise_at(vars(G, AB, R, H, X2B, X3B, HR, RBI, SB, CS, BB,
SO, IBB, HBP, SH, SF, GIDP), sum, na.rm = TRUE)
# Same totals plus the number of distinct players. sum() is called without
# na.rm here, so a group containing an NA gets an NA total.
# NOTE(review): updateTest is not used by the rest of the code shown here.
updateTest <- newMerge %>% group_by(birthState) %>% summarise(Players = n_distinct(playerID), G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
# Final table: any birthState that is not in state.abb is grouped under
# "Other" before the same per-group summary is computed.
finalUpdate <- newMerge %>% group_by(birthState = case_when(!birthState %in% state.abb ~ "Other",
TRUE ~ birthState)) %>% summarise(Players = n_distinct(playerID),
G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
This gives me the data frame I want. Now my code for kmeans is:
# Columns 2 to 19 of finalUpdate (everything after birthState) are
# standardised, then used for the distance matrix and for kmeans.
subDat5 <- finalUpdate[, c(2:19)]
subDatSc5 <- scale(subDat5)
distDat5 <- dist(subDatSc5)
# Number of clusters.
k2<-5
km3new<-kmeans(subDatSc5, k2, nstart = 40)
fitNew <-cmdscale(distDat5) # cmdscale()'s k (number of dimensions to plot) is not set, so its default is used
# One point per row of finalUpdate, coloured by kmeans cluster.
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16, col=km3new$cluster)
# NOTE(review): finalUpdate[,1] is a one-column table rather than a vector, and
# as.character() turns it into a single string holding the text of a c(...)
# call (see the dput(birthState) output). text() then repeats that one string
# at every point.
birthState=as.character(finalUpdate[,1])
View(birthState)
# Labels are drawn at the point coordinates shifted by 0.1 in both dimensions.
text(fitNew+.1, labels = birthState, cex=.5)
Everything seems to work perfectly up until the last line, when I label all the points and it outputs a graph with each point being labeled 50 times.
Is there any fix to this?
dput(fitNew) =
structure(c(-1.65773726259238, -0.534080004429963, -1.25224081559503,
-0.77600324658737, 13.7591986092784, -1.48285027332317, -1.0685046710528,
-1.40697098882713, 4.45857203274176, 1.31053002832658, -1.35540549966184,
-1.29910272287957, -1.68908570162927, 0.480144496416969, -0.592812161743823,
-1.23667901504586, -0.844421560951474, -0.827147650450116, -1.22861495063773,
-1.09472770146309, -1.68944621276222, -1.04378183282088, -1.34915033496973,
-0.951660697104605, -0.45483103293441, -1.70655513856763, -0.0616193106609581,
-1.48510165062592, -1.46251714293967, -1.66524625215651, -0.302561452071198,
-1.56675666458699, -1.28344728331308, 0.864956587539308, 0.16173394975142,
-0.850595975621662, -0.756783746315003, 24.7256817273653, -0.427398940139082,
-1.39925870808987, -0.755785801532488, -1.51858748511865, -0.944152303255372,
2.99465893267538, -1.67729960185572, -0.428860890332761, -1.66997803522651,
-0.392867003697617, -1.30257694125332, -1.66036447381944, -1.6019072254532,
-0.0137738939595427, -0.296070047308066, -0.00473553953140588,
0.0641385777789144, 1.13842140049119, -0.0268651281540734, -0.128806499497676,
-0.00491611456401126, 0.364126276181306, -0.143046769591177,
-0.0283493696039194, -0.0485069239634975, -0.0287370449451863,
0.095714493198601, -0.124528071666917, -0.0332600735692987, 0.0352695212129851,
-0.119261467201306, -0.0381525968696119, 0.0551469698282207,
-0.0115458694920637, -0.0250933419027217, 0.0406395856647227,
0.12482265126378, -0.17954163594865, -0.0113245644618699, -0.0894498877336694,
0.0305207676977073, 0.0323710265810206, -0.0491296972494748,
-0.121635810491615, 0.0175346179372083, 0.0127983868546243, 0.21663582448027,
0.0803333481747664, -0.0309611163272855, 0.0201356804088859,
-0.696293053438086, 0.133550765173667, 0.108119095159391, -0.136003613852937,
0.00557290379285935, 0.0602630898597761, -0.196004062948666,
-0.0161895096280255, -0.178283625530885, -0.0170000868214074,
0.107232630021258, 0.0375464632562086, -0.00276496483054615,
0.0193363060673037), .Dim = c(51L, 2L), .Dimnames = list(NULL,
NULL))
and dput(birthState) =
"c(\"AK\", \"AL\", \"AR\", \"AZ\", \"CA\", \"CO\", \"CT\", \"DE\", \"FL\", \"GA\", \"HI\", \"IA\", \"ID\", \"IL\", \"IN\", \"KS\", \"KY\", \"LA\", \"MA\", \"MD\", \"ME\", \"MI\", \"MN\", \"MO\", \"MS\", \"MT\", \"NC\", \"ND\", \"NE\", \"NH\", \"NJ\", \"NM\", \"NV\", \"NY\", \"OH\", \"OK\", \"OR\", \"Other\", \"PA\", \"RI\", \"SC\", \"SD\", \"TN\", \"TX\", \"UT\", \"VA\", \"VT\", \"WA\", \"WI\", \"WV\", \"WY\")"
As I mentioned in my comment, your problem is probably due to the fact that birthState is a string of an R character vector and not the actual vector.
The following code
# birthState arrived as one string containing the text of a c(...) call, so
# parse and evaluate that text to recover the character vector of labels.
# NOTE(review): in the original script it is probably simpler to extract the
# column as a vector in the first place (e.g. finalUpdate[[1]]) than to
# round-trip through eval(parse()) -- confirm against the real data.
birthState <- eval(parse(text = birthState))
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16)
# pos = 4 places each label to the right of its point, so the coordinates no
# longer need to be shifted by a constant.
text(fitNew, labels = birthState, cex=.5, pos = 4)
Yielded this for me

HeatMap: how to cluster only the rows and keep order of the heatmap's column labels as same as in the df?

I want to plot a heatmap and cluster only the rows (i.e. the genes in tydf1).
I also want to keep the order of the heatmap's column labels the same as in the data frame (i.e. tydf1).
Sample data
df1 <- structure(list(Gene = c("AA", "PQ", "XY", "UBQ"), X_T0_R1 = c(1.46559502, 0.220140568, 0.304127515, 1.098842127), X_T0_R2 = c(1.087642983, 0.237500819, 0.319844338, 1.256624804), X_T0_R3 = c(1.424945196, 0.21066267, 0.256496284, 1.467120048), X_T1_R1 = c(1.289943948, 0.207778662, 0.277942721, 1.238400358), X_T1_R2 = c(1.376535013, 0.488774258, 0.362562315, 0.671502431), X_T1_R3 = c(1.833390311, 0.182798731, 0.332856558, 1.448757569), X_T2_R1 = c(1.450753714, 0.247576125, 0.274415259, 1.035410946), X_T2_R2 = c(1.3094609, 0.390028842, 0.352460646, 0.946426593), X_T2_R3 = c(0.5953716, 1.007079177, 1.912258811, 0.827119776), X_T3_R1 = c(0.7906009, 0.730242116, 1.235644748, 0.832287694), X_T3_R2 = c(1.215333041, 1.012914813, 1.086362205, 1.00918082), X_T3_R3 = c(1.069312467, 0.780421013, 1.002313082, 1.031761442), Y_T0_R1 = c(0.053317766, 3.316414959, 3.617213894, 0.788193798), Y_T0_R2 = c(0.506623748, 3.599442788, 1.734075583, 1.179462912), Y_T0_R3 = c(0.713670106, 2.516735845, 1.236204882, 1.075393433), Y_T1_R1 = c(0.740998252, 1.444496448, 1.077023349, 0.869258744), Y_T1_R2 = c(0.648231834, 0.097957459, 0.791438659, 0.428805547), Y_T1_R3 = c(0.780499252, 0.187840968, 0.820430227, 0.51636582), Y_T2_R1 = c(0.35344654, 1.190274584, 0.401845911, 1.223534348), Y_T2_R2 = c(0.220223951, 1.367784148, 0.362815405, 1.102117612), Y_T2_R3 = c(0.432856978, 1.403057729, 0.10802472, 1.304233845), Y_T3_R1 = c(0.234963735, 1.232129062, 0.072433381, 1.203096462), Y_T3_R2 = c(0.353770497, 0.885122768, 0.011662112, 1.188149743), Y_T3_R3 = c(0.396091395, 1.333921747, 0.192594116, 1.838029829), Z_T0_R1 = c(0.398000559, 1.286528398, 0.129147097, 1.452769794), Z_T0_R2 = c(0.384759325, 1.122251177, 0.119475721, 1.385513609), Z_T0_R3 = c(1.582230097, 0.697419716, 2.406671502, 0.477415567), Z_T1_R1 = c(1.136843842, 0.804552001, 2.13213228, 0.989075996), Z_T1_R2 = c(1.275683837, 1.227821594, 0.31900326, 0.835941568), Z_T1_R3 = c(0.963349308, 0.968589683, 1.706670339, 0.807060135), 
Z_T2_R1 = c(3.765036263, 0.477443352, 1.712841882, 0.469173869), Z_T2_R2 = c(1.901023385, 0.832736132, 2.223429427, 0.593558769), Z_T2_R3 = c(1.407713024, 0.911920317, 2.011259223, 0.692553388), Z_T3_R1 = c(0.988333629, 1.095130142, 1.648598854, 0.629915612), Z_T3_R2 = c(0.618606729, 0.497458337, 0.549147265, 1.249492088), Z_T3_R3 = c(0.429823986, 0.471389536, 0.977124788, 1.136635484)), row.names = c(NA, -4L ), class = c("data.table", "data.frame"))
Scripts used
library(dplyr)
library(stringr)
library(tidyr)
# Wide to long: one row per Gene and sample column.
gdf1 <- gather(df1, "group", "Expression", -Gene)
# Keep the first two "_"-separated parts of the column name (e.g. "X_T0"),
# which drops the replicate suffix.
gdf1$tgroup <- apply(str_split_fixed(gdf1$group, "_", 3)[, c(1, 2)],
1, paste, collapse ="_")
library(dplyr)
# Mean expression per Gene and tgroup, spread back to one column per tgroup.
tydf1 <- gdf1 %>%
group_by(Gene, tgroup) %>%
summarize(expression_mean = mean(Expression)) %>%
spread(., tgroup, expression_mean)
# 1) first heatmap script
library(tidyverse)
# Move the first column (Gene) into the row names.
tydf1 <- tydf1 %>%
as.data.frame() %>%
column_to_rownames(var=colnames(tydf1)[1])
library(gplots)
library(vegan)
randup.m <- as.matrix(tydf1)
# 30-step red -> yellow -> dark green palette.
scaleRYG <- colorRampPalette(c("red","yellow","darkgreen"),
space = "rgb")(30)
# Euclidean distances between rows, then hierarchical clustering of the rows.
# NOTE(review): "aver" presumably resolves to hclust's "average" method.
data.dist <- vegdist(randup.m, method = "euclidean")
row.clus <- hclust(data.dist, "aver")
# Rows follow the precomputed dendrogram and only the row dendrogram is drawn.
# Colv is not set here, so heatmap.2() uses its default for column ordering.
heatmap.2(randup.m, Rowv = as.dendrogram(row.clus),
dendrogram = "row", col = scaleRYG, margins = c(7,10),
density.info = "none", trace = "none", lhei = c(2,6),
colsep = 1:3, sepcolor = "black", sepwidth = c(0.001,0.0001),
xlab = "Identifier", ylab = "Rows")
# 2) second heatmap script
# NOTE(review): Gene was already moved to the row names above, so [, -1] drops
# the first data column here -- confirm this is intended.
df2 <- as.matrix(tydf1[, -1])
heatmap(df2)
Also, I want to add a color key.
It is still unclear to me, what the desired output is. There are some notes:
You don't need vegdist() to calculate the distance matrix for your hclust() call, because all(vegdist(randup.m, method = "euclidean") == dist(randup.m)) returns TRUE;
Specifying Colv = F in your heatmap.2() call will prevent reordering of the columns (default is TRUE);
Maybe it is better to scale your data by row (see the commented-out scale = "row" line in the call below);
Your call of heatmap.2() returns the heatmap with color key.
So summing it up - in your first script you just miss the Colv = F argument, and after a little adjustment it looks like this:
# Cluster the rows only: rows follow the precomputed dendrogram, and the
# columns keep the order they have in randup.m.
heatmap.2(
  randup.m,
  Rowv = as.dendrogram(row.clus),
  # Spelled out as FALSE because F is an ordinary variable that can be
  # reassigned; FALSE switches off column reordering.
  Colv = FALSE,
  dendrogram = "row",
  # scale = "row",  # uncomment to scale the data by row
  col = scaleRYG,
  density.info = "none",
  trace = "none",
  srtCol = -45,
  adjCol = c(.1, .5),
  xlab = "Identifier",
  ylab = "Rows"
)
However I am still not sure - is it what you need?

Rename list of lists using a named list

So I'm working with a list that contains other lists inside, with this structure:
library(graph)
library(RBGL)
library(Rgraphviz)
show(tree)
$`SO:0001968`
$`SO:0001968`$`SO:0001622`
$`SO:0001968`$`SO:0001622`$`SO:0001624`
$`SO:0001968`$`SO:0001622`$`SO:0001624`$`SO:0002090`
[1] 1
$`SO:0001968`$`SO:0001622`$`SO:0001623`
$`SO:0001968`$`SO:0001622`$`SO:0001623`$`SO:0002091`
[1] 1
$`SO:0001968`$`SO:0001969`
$`SO:0001968`$`SO:0001969`$`SO:0002090`
[1] 1
$`SO:0001968`$`SO:0001969`$`SO:0002091`
[1] 1
dput(tree)
list(`SO:0001968` = list(`SO:0001622` = list(`SO:0001624` = list(
`SO:0002090` = 1), `SO:0001623` = list(`SO:0002091` = 1)),
`SO:0001969` = list(`SO:0002090` = 1, `SO:0002091` = 1)))
The data I use to build the list comes from an object called g:
show(g)
A graphNEL graph with directed edges
Number of Nodes = 7
Number of Edges = 8
dput(g)
new("graphNEL",
nodes = c("SO:0001968", "SO:0001969", "SO:0001622",
"SO:0001623", "SO:0001624", "SO:0002090", "SO:0002091"), edgeL = list(
`SO:0001968` = list(edges = 3:2), `SO:0001969` = list(edges = 6:7),
`SO:0001622` = list(edges = 5:4), `SO:0001623` = list(edges = 7L),
`SO:0001624` = list(edges = 6L), `SO:0002090` = list(edges = integer(0)),
`SO:0002091` = list(edges = integer(0))), edgeData = new("attrData",
data = list(`SO:0001968|SO:0001622` = list(weight = 1), `SO:0001968|SO:0001969` = list(
weight = 1), `SO:0001969|SO:0002090` = list(weight = 1),
`SO:0001969|SO:0002091` = list(weight = 1), `SO:0001622|SO:0001624` = list(
weight = 1), `SO:0001622|SO:0001623` = list(weight = 1),
`SO:0001623|SO:0002091` = list(weight = 1), `SO:0001624|SO:0002090` = list(
weight = 1)), defaults = list(weight = 1)), nodeData = new("attrData",
data = list(`SO:0001968` = list(label = "coding_transcript_variant"),
`SO:0001969` = list(label = "coding_transcript_intron_variant"),
`SO:0001622` = list(label = "UTR_variant"), `SO:0001623` = list(
label = "5_prime_UTR_variant"), `SO:0001624` = list(
label = "3_prime_UTR_variant"), `SO:0002090` = list(
label = "3_prime_UTR_intron_variant"), `SO:0002091` = list(
label = "5_prime_UTR_intron_variant")), defaults = list(
label = NA_character_)), renderInfo = new("renderInfo",
nodes = list(), edges = list(), graph = list(), pars = list()),
graphData = list(edgemode = "directed"))
Each SO:000XXX corresponds to a name, and I can find the names using the function nodeData, that returns a named list:
nodeData(g, nodes(g), "label")
$`SO:0001968`
[1] "coding_transcript_variant"
$`SO:0001969`
[1] "coding_transcript_intron_variant"
$`SO:0001622`
[1] "UTR_variant"
$`SO:0001623`
[1] "5_prime_UTR_variant"
$`SO:0001624`
[1] "3_prime_UTR_variant"
$`SO:0002090`
[1] "3_prime_UTR_intron_variant"
$`SO:0002091`
[1] "5_prime_UTR_intron_variant"
What I need is to replace (or rename) the data in the tree list with the corresponding string of the nodeData function.
For example, replace the 'SO:0001968' in the tree list for coding_transcript_variant from the nodeData function.
This recursive function should do the trick :
# you will do this but I couldn't install your packages
# nodeD <- nodeData(g, nodes(g), "label")
nodeD <- list(`SO:0001968` = "coding_transcript_variant",
`SO:0001969` = "coding_transcript_intron_variant",
`SO:0001622` = "UTR_variant",
`SO:0001623` = "5_prime_UTR_variant",
`SO:0001624` = "3_prime_UTR_variant",
`SO:0002090` = "3_prime_UTR_intron_variant",
`SO:0002091` = "5_prime_UTR_intron_variant")
# Recursively rename the elements of a nested list using an ID -> label lookup.
#
# item:   a (possibly nested) list whose element names are IDs; anything that
#         is not a list is returned unchanged.
# labels: a named list or named character vector mapping ID -> label. It
#         defaults to the global `nodeD`, so existing calls such as
#         rename_items(tree) keep working.
#
# Returns `item` with every name that has a label replaced by that label.
# Names without a (non-NA) label are kept as they are, and unnamed lists stay
# unnamed.
rename_items <- function(item, labels = nodeD) {
  if (!is.list(item)) {
    return(item)
  }

  # Rename the children first, passing the same lookup table down.
  item <- lapply(item, rename_items, labels = labels)

  ids <- names(item)
  if (is.null(ids)) {
    return(item)
  }

  # match() works for both lists and atomic vectors and returns NA for IDs
  # that have no entry, so those can be left untouched.
  idx <- match(ids, names(labels))
  has_label <- !is.na(idx)
  found <- vapply(
    labels[idx[has_label]],
    function(label) as.character(label)[1L],
    character(1),
    USE.NAMES = FALSE
  )

  # An NA or empty label carries no information; fall back to the ID.
  keep_id <- is.na(found)
  found[keep_id] <- ids[has_label][keep_id]
  ids[has_label] <- found

  names(item) <- ids
  item
}
tree2 <- rename_items(tree)
Result
# $coding_transcript_variant
# $coding_transcript_variant$UTR_variant
# $coding_transcript_variant$UTR_variant$`3_prime_UTR_variant`
# $coding_transcript_variant$UTR_variant$`3_prime_UTR_variant`$`3_prime_UTR_intron_variant`
# [1] 1
#
#
# $coding_transcript_variant$UTR_variant$`5_prime_UTR_variant`
# $coding_transcript_variant$UTR_variant$`5_prime_UTR_variant`$`5_prime_UTR_intron_variant`
# [1] 1
#
#
#
# $coding_transcript_variant$coding_transcript_intron_variant
# $coding_transcript_variant$coding_transcript_intron_variant$`3_prime_UTR_intron_variant`
# [1] 1
#
# $coding_transcript_variant$coding_transcript_intron_variant$`5_prime_UTR_intron_variant`
# [1] 1
If you save the output from nodeData() to a vector, you can use the names() function to assign the names to a list().
An example of assigning names to list elements:
# Three vectors to store in the list
x <- 1:5
y <- 11:20
z <- 21:25
# One name per list element, in the same order as the elements
listNames <- c("element1", "element2", "element3")
# setNames() attaches the names as the list is built
theList <- setNames(list(x, y, z), listNames)
# access first element by name, using $ form of extract operator
theList$element1
...and the output:
> theList$element1
[1] 1 2 3 4 5
>
You may need to unlist() the output of nodeData() as follows:
# unlist() turns the named list returned by nodeData() into a named character
# vector, keyed by node ID.
theNames <- unlist(nodeData(g, nodes(g), "label"))
# Rename the list (`tree`), not the graph object `g`, and look each label up
# by ID so the result does not depend on element order. This renames the top
# level only; nested levels need the same lookup applied recursively.
names(tree) <- unname(theNames[names(tree)])

Resources