I am trying to use k-means to show which states have similar statistics to one another, using the Lahman database. My code is as follows:
# Batting rows from 1999 onwards, joined to the People table by player.
battingInfo <- filter(Batting, yearID >= 1999)
total <- merge(battingInfo, People, by = "playerID")
# Columns are dropped by position, so this depends on the column order of `total`.
totalN <- total[, -c(24, 25, 28:47)]

# Number of batting rows per birth state, attached back onto every row.
filterByState <- totalN %>%
  group_by(birthState) %>%
  summarise(players = n())
newMerge <- merge(totalN, filterByState, by = "birthState")

# Counting statistics that are totalled per state.
statCols <- c(
  "G", "AB", "R", "H", "X2B", "X3B", "HR", "RBI", "SB", "CS", "BB",
  "SO", "IBB", "HBP", "SH", "SF", "GIDP"
)

# Per-state totals, ignoring missing values.
newTest <- newMerge %>%
  group_by(birthState) %>%
  summarise_at(statCols, sum, na.rm = TRUE)

# Per-state totals plus the number of distinct players.
updateTest <- newMerge %>%
  group_by(birthState) %>%
  summarise(Players = n_distinct(playerID), across(all_of(statCols), sum))

# Same totals, with every birthState that is not a US state abbreviation
# collapsed into a single "Other" group.
finalUpdate <- newMerge %>%
  group_by(
    birthState = case_when(
      !birthState %in% state.abb ~ "Other",
      TRUE ~ birthState
    )
  ) %>%
  summarise(Players = n_distinct(playerID), across(all_of(statCols), sum))
This gives me the data frame I want. Now my code for kmeans is:
# Columns 2-19 are Players plus the 17 summed statistics; column 1 (birthState)
# is the label and is left out of the clustering input.
subDat5 <- finalUpdate[, c(2:19)]
# Standardise each column, then compute pairwise distances between states.
subDatSc5 <- scale(subDat5)
distDat5 <- dist(subDatSc5)
k2<-5
# k-means with k2 clusters on the scaled data, 40 random starts.
km3new<-kmeans(subDatSc5, k2, nstart = 40)
fitNew <-cmdscale(distDat5) # k is the number of dim to PLOT
# One point per state, coloured by its k-means cluster.
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16, col=km3new$cluster)
# NOTE(review): finalUpdate comes from summarise(), so it is presumably a
# tibble; `finalUpdate[,1]` then stays a one-column tibble and as.character()
# deparses it into ONE string (see the dput(birthState) output further down).
# That single string is recycled as the label of every point.
# `finalUpdate[[1]]` would return the 51-element vector instead - confirm.
birthState=as.character(finalUpdate[,1])
View(birthState)
text(fitNew+.1, labels = birthState, cex=.5)
Everything seems to work perfectly up until the last line, when I label all the points and it outputs a graph with each point being labeled 50 times.
Is there any fix to this?
dput(fitNew) =
structure(c(-1.65773726259238, -0.534080004429963, -1.25224081559503,
-0.77600324658737, 13.7591986092784, -1.48285027332317, -1.0685046710528,
-1.40697098882713, 4.45857203274176, 1.31053002832658, -1.35540549966184,
-1.29910272287957, -1.68908570162927, 0.480144496416969, -0.592812161743823,
-1.23667901504586, -0.844421560951474, -0.827147650450116, -1.22861495063773,
-1.09472770146309, -1.68944621276222, -1.04378183282088, -1.34915033496973,
-0.951660697104605, -0.45483103293441, -1.70655513856763, -0.0616193106609581,
-1.48510165062592, -1.46251714293967, -1.66524625215651, -0.302561452071198,
-1.56675666458699, -1.28344728331308, 0.864956587539308, 0.16173394975142,
-0.850595975621662, -0.756783746315003, 24.7256817273653, -0.427398940139082,
-1.39925870808987, -0.755785801532488, -1.51858748511865, -0.944152303255372,
2.99465893267538, -1.67729960185572, -0.428860890332761, -1.66997803522651,
-0.392867003697617, -1.30257694125332, -1.66036447381944, -1.6019072254532,
-0.0137738939595427, -0.296070047308066, -0.00473553953140588,
0.0641385777789144, 1.13842140049119, -0.0268651281540734, -0.128806499497676,
-0.00491611456401126, 0.364126276181306, -0.143046769591177,
-0.0283493696039194, -0.0485069239634975, -0.0287370449451863,
0.095714493198601, -0.124528071666917, -0.0332600735692987, 0.0352695212129851,
-0.119261467201306, -0.0381525968696119, 0.0551469698282207,
-0.0115458694920637, -0.0250933419027217, 0.0406395856647227,
0.12482265126378, -0.17954163594865, -0.0113245644618699, -0.0894498877336694,
0.0305207676977073, 0.0323710265810206, -0.0491296972494748,
-0.121635810491615, 0.0175346179372083, 0.0127983868546243, 0.21663582448027,
0.0803333481747664, -0.0309611163272855, 0.0201356804088859,
-0.696293053438086, 0.133550765173667, 0.108119095159391, -0.136003613852937,
0.00557290379285935, 0.0602630898597761, -0.196004062948666,
-0.0161895096280255, -0.178283625530885, -0.0170000868214074,
0.107232630021258, 0.0375464632562086, -0.00276496483054615,
0.0193363060673037), .Dim = c(51L, 2L), .Dimnames = list(NULL,
NULL))
and dput(birthState) =
"c(\"AK\", \"AL\", \"AR\", \"AZ\", \"CA\", \"CO\", \"CT\", \"DE\", \"FL\", \"GA\", \"HI\", \"IA\", \"ID\", \"IL\", \"IN\", \"KS\", \"KY\", \"LA\", \"MA\", \"MD\", \"ME\", \"MI\", \"MN\", \"MO\", \"MS\", \"MT\", \"NC\", \"ND\", \"NE\", \"NH\", \"NJ\", \"NM\", \"NV\", \"NY\", \"OH\", \"OK\", \"OR\", \"Other\", \"PA\", \"RI\", \"SC\", \"SD\", \"TN\", \"TX\", \"UT\", \"VA\", \"VT\", \"WA\", \"WI\", \"WV\", \"WY\")"
As I mentioned in my comment, your problem is probably due to the fact that birthState is a string of an R character vector and not the actual vector.
The following code
# `birthState` currently holds the text of an R expression, c("AK", "AL", ...);
# parsing and evaluating that text rebuilds the real character vector.
# Only do this with trusted text. Where the original data frame is available,
# extracting the column directly (e.g. finalUpdate$birthState) avoids the
# round trip altogether.
birthState <- eval(parse(text = birthState))
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16)
# pos = 4 places each label to the right of its point.
text(fitNew, labels = birthState, cex=.5, pos = 4)
Yielded this for me
Related
I have a data like this
df<- structure(list(Core = c("Bestman", "Tetra"), member1 = c("Tera1",
"Brownie1"), member2 = c("Tera2", "Brownie2"), member3 = c("Tera3",
"Brownie3"), member4 = c("Tera4", "Brownie4"), member5 = c("Tera5",
"Brownie5"), member6 = c("", "Brownie6"), member7 = c("", "Brownie7"
)), class = "data.frame", row.names = c(NA, -2L))
I want to connect all the members to their Core. For example, if you look at the first row, you can see there are 5 members; I want to connect them to their Core.
The same for the second row
Then I connect both Core together
Here is what I have done
# Cross-tabulate Core against the stacked member values (the `ind` column from
# stack() is dropped), then crossprod() turns that into a member-by-member
# matrix counting shared Cores.
mydf <- crossprod(table(cbind(df[1], stack(df[-1]))[-3]))
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be reassigned.
graph_from_adjacency_matrix(mydf, diag = FALSE, weighted = TRUE, mode = "undirected") %>%
  plot(edge.width = E(.)$weight)
If I understood correctly, what you want is:
library(igraph)
df <- data.frame(
  Core = c("Bestman", "Tetra"),
  member1 = c("Tera1", "Brownie1"),
  member2 = c("Tera2", "Brownie2"),
  member3 = c("Tera3", "Brownie3"),
  member4 = c("Tera4", "Brownie4"),
  member5 = c("Tera5", "Brownie5"),
  member6 = c("", "Brownie6"),
  member7 = c("", "Brownie7")
)

# Build a 2 x n matrix per row of df: row 1 repeats the core, row 2 holds its
# non-empty members, so each column is one (core, member) edge.
# lapply() is used rather than apply(df, 1, ...): apply() simplifies its result
# to a matrix whenever every row has the same number of members, and the edge
# list built from that matrix would be scrambled.
member_edges <- lapply(seq_len(nrow(df)), function(i) {
  members <- as.character(unlist(df[i, -1], use.names = FALSE))
  members <- members[!is.na(members) & members != ""]
  rbind(rep(as.character(df$Core[i]), length(members)), members)
})
edges <- do.call(cbind, member_edges)

# Link every pair of cores as well; with a single core there is nothing to add.
core_edges <- if (nrow(df) > 1) combn(as.character(df$Core), 2) else NULL

# The flattened vector is read as consecutive (from, to) vertex-name pairs.
g <- make_graph(c(edges, core_edges), directed = FALSE)
plot(g, edge.width = E(g)$weight)
EDIT
To colorize and resize nodes:
# Logical matrix: one row per vertex, one column per row of df, TRUE where the
# vertex name appears in that df row.
in_row <- apply(df, 1, \(x) names(V(g)) %in% x)
# Colour each vertex by the index of the df row it belongs to.
V(g)$color <- apply(in_row, 1, which)
# Base size for every vertex; core vertices are enlarged by their degree.
V(g)$size <- 15
V(g)[df$Core]$size <- degree(g, V(g)[df$Core]) + 15
plot(g)
I have a ggplot2 graph which plots two separate violin plots onto one graph, given by this example (thanks to @jared_mamrot for providing it):
library(tidyverse)
data("Puromycin")
head(Puromycin)
# One subset per treatment state; each gets its own violin layer below.
dat1 <- filter(Puromycin, state == "treated")
dat2 <- filter(Puromycin, state == "untreated")
# ggplot() is called without data or aesthetics: every layer carries its own.
mycp <- ggplot() +
  geom_violin(
    data = dat1,
    aes(x = state, y = conc, colour = "Puromycin (Treatment1)")
  ) +
  geom_violin(
    data = dat2,
    aes(x = state, y = conc, colour = "Puromycin (Treatment2)")
  )
mycp
I would like to add a boxplot or other summary statistics such as those in http://www.sthda.com/english/wiki/ggplot2-violin-plot-quick-start-guide-r-software-and-data-visualization and https://www.maths.usyd.edu.au/u/UG/SM/STAT3022/r/current/Misc/data-visualization-2.1.pdf, but trying the code suggested in those places does not change the original plot.
mycp + geom_boxplot()
Thanks for reading and hopefully this makes sense!
UPDATE ==========================================================================
So the above example does not reflect exactly my situation I realize now. Essentially, I want to apply statistics onto a combined ggplot2 graph that uses two separate objects as its variables (here TNBC_List1 and ER_List1) Here is an example that does (sorry for the longer example, I will admit I am having trouble creating a simpler reproducible example and I am very new to coding in general):
# Libraries -------------------------------------------------------------
library(BiocManager)
library(GEOquery)
library(plyr)
library(dplyr)
library(Matrix)
library(devtools)
library(Seurat)
library(ggplot2)
library(cowplot)
library(SAVER)
library(metap)
library(multtest)
# Loading Raw Data into RStudio ----------------------------------
# Fetch the supplementary files of GEO series GSE75688.
filePaths <- getGEOSuppFiles("GSE75688")
# list.files() treats `pattern` as a regular expression, not a shell glob, so
# the file extensions are matched explicitly and anchored at the end.
tarF <- list.files(path = "./GSE75688/", pattern = "\\.tar$", full.names = TRUE)
tarF
untar(tarF, exdir = "./GSE75688/")
gzipF <- list.files(path = "./GSE75688/", pattern = "\\.gz$", full.names = TRUE)
# Decompress every .gz file that was found.
ldply(.data = gzipF, .fun = gunzip)
# Show what is in the directory now, then only the .txt files.
list.files(path = "./GSE75688/", full.names = TRUE)
list.files(path = "./GSE75688/", pattern = "\\.txt$", full.names = TRUE)
# full matrix ----------------------------------------------------------
fullmat <- read.table(file = './GSE75688//GSE75688_GEO_processed_Breast_Cancer_raw_TPM_matrix.txt',
sep = '\t', header = FALSE, stringsAsFactors = FALSE)
fullmat <- data.frame(fullmat[,-1], row.names=fullmat[,1])
colnames(fullmat) <- as.character(fullmat[1, ])
fullmat <- fullmat[-1,]
fullmat <- as.matrix(fullmat)
# BC01 ER+ matrix -----------------------------------------------------------
# Keep the columns whose name starts with BC01, excluding the BC01_Pooled* and
# BC01_Tumor* columns. drop = FALSE keeps a matrix even if one column is left.
is_bc01 <- grepl("^BC01", colnames(fullmat)) &
  !grepl("^BC01_(Pooled|Tumor)", colnames(fullmat))
BC01mat <- fullmat[, is_bc01, drop = FALSE]
# Sample sheet for BC01: every retained column is labelled "ER+".
BC01pdat <- data.frame("samples" = colnames(BC01mat), "treatment" = "ER+")
# BC07 TNBC matrix -----------------------------------------------------------
# Same for BC07, also excluding every BC07LN* column.
is_bc07 <- grepl("^BC07", colnames(fullmat)) &
  !grepl("^BC07(_Pooled|_Tumor|LN)", colnames(fullmat))
BC07mat <- fullmat[, is_bc07, drop = FALSE]
BC07pdat <- data.frame("samples" = colnames(BC07mat), "treatment" = "TNBC")
#merge samples together =========================================================================
joined <- cbind(BC01mat, BC07mat)
pdat_joined <- rbind(BC01pdat, BC07pdat)
#fdat ___________________________________________________________________________________
# Feature table: the gene_name / gene_type columns of the full matrix, with the
# matrix row names moved into a first column.
is_gene_info <- grepl("gene_name|gene_type", colnames(fullmat))
fdat <- fullmat[, is_gene_info, drop = FALSE]
fdat <- as.data.frame(fdat, stringsAsFactors = FALSE)
fdat <- setNames(
  cbind(rownames(fdat), fdat, row.names = NULL),
  c("ensembl_id", "gene_short_name", "gene_type")
)
rownames(pdat_joined) <- pdat_joined$samples
# Gene names made syntactically valid and unique so they can serve as row names.
rownames(fdat) <- make.names(fdat$gene_short_name, unique = TRUE)
rownames(joined) <- rownames(fdat)
# Create Seurat Object __________________________________________________________________
joined <- as.data.frame(joined)
sobj_pre <- CreateSeuratObject(counts = joined)
sobj_pre <- AddMetaData(sobj_pre, metadata = pdat_joined)
# Slots are accessed with `@`. A `#` in its place starts a comment, which
# leaves the head() call unclosed and drops the assignment below.
head(sobj_pre@meta.data)
# gene name input: attach the feature table to the RNA assay
sobj_pre[["RNA"]]@meta.features <- fdat
head(sobj_pre[["RNA"]]@meta.features)
#Downstream analysis -------------------------------------------------------
# Continue on `sobj` so the object built above stays available as `sobj_pre`.
sobj <- sobj_pre
# Select 2000 variable features, scale all genes, then run PCA with 100 components.
sobj <- FindVariableFeatures(object = sobj, mean.function = ExpMean, dispersion.function = LogVMR, nfeatures = 2000)
sobj <- ScaleData(object = sobj, features = rownames(sobj), block.size = 2000)
sobj <- RunPCA(sobj, npcs = 100, ndims.print = 1:10, nfeatures.print = 5)
# Neighbour graph and clustering use only the first four principal components.
sobj <- FindNeighbors(sobj, reduction = "pca", dims = 1:4, nn.eps = 0.5)
sobj <- FindClusters(sobj, resolution = 1, n.start = 10)
# NOTE(review): the next two lines only create top-level variables named
# `umap.method` and `metric`; they are not passed to RunUMAP() below. If they
# were meant as RunUMAP arguments they need to go inside that call - confirm.
umap.method = 'umap-learn'
metric = 'correlation'
sobj <- RunUMAP(object = sobj, reduction = "pca", dims = 1:4,min.dist = 0.5, seed.use = 123)
# UMAP scatter plot with cluster labels drawn on it.
p0 <- DimPlot(sobj, reduction = "umap", pt.size = 0.1,label=TRUE) + ggtitle(label = "Title")
p0
# ER+ score computation -------------------
ERlist <- list(c("CPB1", "RP11-53O19.1", "TFF1", "MB", "ANKRD30B",
"LINC00173", "DSCAM-AS1", "IGHG1", "SERPINA5", "ESR1",
"ILRP2", "IGLC3", "CA12", "RP11-64B16.2", "SLC7A2",
"AFF3", "IGFBP4", "GSTM3", "ANKRD30A", "GSTT1", "GSTM1",
"AC026806.2", "C19ORF33", "STC2", "HSPB8", "RPL29P11",
"FBP1", "AGR3", "TCEAL1", "CYP4B1", "SYT1", "COX6C",
"MT1E", "SYTL2", "THSD4", "IFI6", "K1AA1467", "SLC39A6",
"ABCD3", "SERPINA3", "DEGS2", "ERLIN2", "HEBP1", "BCL2",
"TCEAL3", "PPT1", "SLC7A8", "RP11-96D1.10", "H4C8",
"PI15", "PLPP5", "PLAAT4", "GALNT6", "IL6ST", "MYC",
"BST2", "RP11-658F2.8", "MRPS30", "MAPT", "AMFR", "TCEAL4",
"MED13L", "ISG15", "NDUFC2", "TIMP3", "RP13-39P12.3", "PARD68"))
sobj <- AddModuleScore(object = sobj, features = ERlist, name = "ER_List")
#TNBC computation -------------------
tnbclist <- list(c("FABP7", "TSPAN8", "CYP4Z1", "HOXA10", "CLDN1",
"TMSB15A", "C10ORF10", "TRPV6", "HOXA9", "ATP13A4",
"GLYATL2", "RP11-48O20.4", "DYRK3", "MUCL1", "ID4", "FGFR2",
"SHOX2", "Z83851.1", "CD82", "COL6A1", "KRT23", "GCHFR",
"PRICKLE1", "GCNT2", "KHDRBS3", "SIPA1L2", "LMO4", "TFAP2B",
"SLC43A3", "FURIN", "ELF5", "C1ORF116", "ADD3", "EFNA3",
"EFCAB4A", "LTF", "LRRC31", "ARL4C", "GPNMB", "VIM",
"SDR16C5", "RHOV", "PXDC1", "MALL", "YAP1", "A2ML1",
"RP1-257A7.5", "RP11-353N4.6", "ZBTB18", "CTD-2314B22.3", "GALNT3",
"BCL11A", "CXADR", "SSFA2", "ADM", "GUCY1A3", "GSTP1",
"ADCK3", "SLC25A37", "SFRP1", "PRNP", "DEGS1", "RP11-110G21.2",
"AL589743.1", "ATF3", "SIVA1", "TACSTD2", "HEBP2"))
sobj <- AddModuleScore(object = sobj, features = tnbclist, name = "TNBC_List")
#ggplot2 issue ----------------------------------------------------------------------------
# Store the current cluster identities as a metadata column.
sobj[["ClusterName"]] <- Idents(object = sobj)
# One row per cell: the two module scores and the cluster label.
sobjlists <- FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName"))
library(reshape2)
# NOTE(review): the result of melt() is not assigned, and all three columns
# are listed as id.vars so there is nothing left to melt; `sobjlists` is
# unchanged when the plot below is built.
melt(sobjlists, id.vars = c("ER_List1", "TNBC_List1", "ClusterName"))
# Two violin layers over the same data frame, one per score column. No data or
# aesthetics are set in ggplot() itself, so a layer added later without its
# own data/aes has nothing to inherit.
p <- ggplot() + geom_violin(data = sobjlists, aes(x= ClusterName, y = ER_List1, fill = ER_List1, colour = "ER+ Signature"))+ geom_violin(data = sobjlists, aes(x= ClusterName, y = TNBC_List1, fill = TNBC_List1, colour="TNBC Signature"))
Extension ======================================================================
If you want to do this but with two objects (sobjlists1 and sobjlists2, for example) instead of what my example showed (two variables but one object), rbind the two and then do what @StupidWolf says
library(reshape2)
# Reshape each object to long format (one row per id / variable / value).
# NOTE(review): id.vars is "treatment" here, but the plot that follows maps
# x to ClusterName; ClusterName presumably has to be kept as an id variable
# as well - confirm against the columns of sobjlists1 / sobjlists2.
sobjlists1= melt(sobjlists1, id.vars = "treatment")
sobjlists2= melt(sobjlists2, id.vars = "treatment")
# Stack the two long tables into one.
combosobjlists <- rbind(sobjlists1, sobjlists2)
and then continue on with their code using combosobjlists:
ggplot(combosobjlists,aes(x= ClusterName, y = value)) +
geom_violin(aes(fill=variable)) +
geom_boxplot(aes(col=variable),
width = 0.2,position=position_dodge(0.9))
Hope this thread helps!
Try to include just the minimum code to show your problem. Like in your example, there's no need to start with the whole seurat processing. You can just provide the data.frame with dput() and we can see the issue with ggplot2 , see this post.
Create some example data:
library(Seurat)
library(ggplot2)
# All signature genes from both lists, flattened into one character vector.
genes <- unlist(c(ERlist, tnbclist))
# Random negative-binomial counts: one row per gene, 500 columns (cells),
# with gene and cell names set as dimnames.
mat <- matrix(
  rnbinom(500 * length(genes), mu = 500, size = 1),
  ncol = 500,
  dimnames = list(genes, paste0("cell", 1:500))
)
sobj <- CreateSeuratObject(mat)
sobj <- NormalizeData(sobj)
Add some made-up cluster:
sobj$ClusterName = factor(sample(0:1,ncol(sobj),replace=TRUE))
Add your module score:
sobj = AddModuleScore(object = sobj, features = tnbclist,
name = "TNBC_List",ctrl=5)
sobj = AddModuleScore(object = sobj, features = ERlist,
name = "ER_List",ctrl=5)
We get the data, what you need to do is to pivot it long correctly. Plotting it twice with ggplot2 is going to cause all kinds of problem:
sobjlists = FetchData(object = sobj, vars = c("ER_List1", "TNBC_List1", "ClusterName"))
head(sobjlists)
ER_List1 TNBC_List1 ClusterName
cell1 -0.05391108 -0.008736057 1
cell2 0.07074816 -0.039064126 1
cell3 0.08688374 -0.066967324 1
cell4 -0.12503649 0.120665057 0
cell5 0.05356685 -0.072293651 0
cell6 -0.20053804 0.178977042 1
Should look like this:
library(reshape2)
sobjlists = melt(sobjlists, id.vars = "ClusterName")
ClusterName variable value
1 1 ER_List1 -0.05391108
2 1 ER_List1 0.07074816
3 1 ER_List1 0.08688374
4 0 ER_List1 -0.12503649
5 0 ER_List1 0.05356685
6 1 ER_List1 -0.20053804
Now we plot:
ggplot(sobjlists,aes(x= ClusterName, y = value)) +
geom_violin(aes(fill=variable)) +
geom_boxplot(aes(col=variable),
width = 0.2,position=position_dodge(0.9))
For a layer such as geom_boxplot() to use the data without specifying it, you need to put the data in the ggplot() function call. The layers added afterwards are then able to inherit it.
You also do not need an extra violin plot per color
library(tidyverse)
data("Puromycin")
head(Puromycin)
# Data and aesthetics are set once in ggplot(), so every layer inherits them;
# mapping colour to `state` gives one violin per state.
mycp <- ggplot(Puromycin, aes(x = state, y = conc, colour = state)) +
  geom_violin()
# Narrow black boxplots on top of the violins, plus custom legend labels.
mycp +
  geom_boxplot(width = 0.1, color = "black") +
  scale_color_discrete(
    labels = c("Puromycin (Treatment1)", "Puromycin (Treatment2)")
  )
Result:
I have an R list that contains 2500 lists in it. Each of 2500 lists contain 1 vector and 2 values. For the sake of reproducibility, I subset a tiny version of the data so it looks something like this:
head(models, 1)
>$model_1
>$model_1$m
> [1] 0.01335775 0.01336487 0.01336805 0.01338025 0.01340532 0.01343117 0.01346120 0.01349530 0.01353788 > 0.01357654 0.01360668
>$model_1$Cab
>[1] 59.6774
>$model_1$LAI
>[1] 4.01739
>$model_2
>$model_2$m
> [1] 0.02367338 0.02360433 0.02352800 0.02346125 0.02339469 0.02333403 0.02325861 0.02317945 0.02310961 >0.02303802 0.02295710
>$model_2$Cab
>[1] 59.6774
>$model_2$LAI
>[1] 0.5523946
Now, I want to make a line plot (using ggplot2) whose x axis is values from 400 to 410 and y axis is the vector in each lists (models$model_1$m, models$model_2$m and so on.) Therefore, there will be a lot of lines in the plot. I also want to color (continuous coloring) each line with their respective models$model_2$Cab values and have a continuous legend showing each models$model_2$Cab value and its color.
For reproducibility (Please note that this is greatly simplified version of the original data):
> dput(head(models, 10))
list(model_1 = list(m = c(0.0133577497667816, 0.0133648693063468,
0.0133680481888036, 0.01338024983382, 0.0134053218864944, 0.0134311717034271,
0.0134612003419723, 0.0134953017438241, 0.0135378825635721, 0.0135765418166368,
0.0136066826886183), Cab = 59.6773970406502, LAI = 4.01739045299768),
model_2 = list(m = c(0.023673375903171, 0.0236043348551818,
0.0235280045196734, 0.0234612496831449, 0.0233946873132861,
0.0233340349230324, 0.0232586128971129, 0.0231794538902946,
0.0231096074536893, 0.023038021285693, 0.0229570982021948
), Cab = 59.6773970406502, LAI = 0.552394618641403), model_3 = list(
m = c(0.0138277418755234, 0.0138310132688916, 0.0138301891768216,
0.0138383905159343, 0.0138587906203227, 0.0138802253169266,
0.0139048786261519, 0.0139332011615252, 0.0139700189737812,
0.0140030367215791, 0.0140275202380309), Cab = 59.6773970406502,
LAI = 3.01987725977579), model_4 = list(m = c(0.017483089696901,
0.0174591709902523, 0.017429967081058, 0.0174099884420304,
0.0173976896061841, 0.0173882607103241, 0.0173752969257632,
0.0173632160871019, 0.0173599236031355, 0.0173536114293099,
0.0173384748063733), Cab = 59.6773970406502, LAI = 1.37503600459533),
model_5 = list(m = c(0.0182499047037402, 0.0182203724940146,
0.0181853063358603, 0.0181595102703982, 0.0181404648083386,
0.0181246681180869, 0.0181039409709977, 0.01808352264341,
0.0180719579429791, 0.018057532687598, 0.0180342187796566
), Cab = 59.6773970406502, LAI = 1.22529135635182), model_6 = list(
m = c(0.0158200567917405, 0.0158083674745268, 0.0157919331298277,
0.0157846269346119, 0.0157870246965916, 0.0157914665730281,
0.0157954117645301, 0.0158014906653224, 0.0158162176575737,
0.0158275775312257, 0.0158302513933357), Cab = 59.6773970406502,
LAI = 1.81571552453658), model_7 = list(m = c(0.0133628950691214,
0.0133699680411211, 0.0133730986417069, 0.0133852517083498,
0.0134102666346747, 0.0134360623898904, 0.0134660252680654,
0.0135000559061319, 0.0135425658393117, 0.013581155812944,
0.013611227528355), Cab = 59.6773970406502, LAI = 3.99643688124574),
model_8 = list(m = c(0.0183501671255408, 0.0183199017377111,
0.0182840698901064, 0.0182575139774255, 0.0182375872739662,
0.0182209588085648, 0.0181992175650369, 0.0181777101462036,
0.0181650648958527, 0.0181495798700031, 0.0181251977995322
), Cab = 59.6773970406502, LAI = 1.20735517669905), model_9 = list(
m = c(0.0143687162679524, 0.0143678440890305, 0.0143626995592654,
0.0143666036037224, 0.0143820089259476, 0.0143987279254991,
0.0144176359711743, 0.0144397860850458, 0.0144704682720878,
0.0144974726755733, 0.0145159061770205), Cab = 59.6773970406502,
LAI = 2.51320168699674), model_10 = list(m = c(0.0138736072820698,
0.0138765215672426, 0.0138753253418108, 0.0138831561248062,
0.0139031250366076, 0.0139241525443688, 0.0139483098566198,
0.0139760994306543, 0.0140123870383231, 0.0140448852992375,
0.0140688465774421), Cab = 59.6773970406502, LAI = 2.96397596054064))
What I want to achieve is something like this (but with a better-looking ggplot2):
This could be achieved like so:
Convert your list of lists to a list of dataframes.
Add a variable with your x-axis variable to each df
Bind the list of data frames by row
Plot, where I make use of scale_colour_gradientn(colors = rainbow(20)) to mimic your rainbow color scale.
library(dplyr)
library(ggplot2)
# Turn each model (vector m plus scalars Cab and LAI) into a data frame; the
# scalars are recycled along m. The x values 400:410 assume m has 11 entries.
models <- lapply(models, function(model) {
  model_df <- as.data.frame(model)
  model_df$x <- 400:410
  model_df
})
# Stack all models; the list names end up in the `id` column.
models <- bind_rows(models, .id = "id")
# One line per model, coloured on a continuous rainbow scale.
ggplot(models, aes(x = x, y = m, color = LAI, group = id)) +
  geom_line() +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  scale_colour_gradientn(colors = rainbow(20))
I'm working on a neural network but I have problems when I try to associate probabilities to the network. The dataset is the asian network
I've tried to change the node E dimensions, but I've still failed
data <- read.table('survey.txt',header = TRUE)
dag <- model2network("[A][S][E|A:S][O|E][R|E][T|O:R]")
options(repr.plot.width=6, repr.plot.height=6)
plot(dag)
enter image description here
colnames(data) <- c("Age", "Residence", "Education", "Occupation", "Sex", "Travel")
print("Age")
pA <- round(prop.table(table(data$A)),digits = 2)
print(pA)
print("Sex")
pS <- round(prop.table(table(data$S)),digits = 2)
print(pS)
print("Occupation given Education")
pO <- round(prop.table(table(data$O,data$E),1),digits = 2)
print(pO)
print("Residence given Education")
pR <- round(prop.table(table(data$R,data$E),1),digits = 2)
print(pR)
print("Education given age and sex")
pE <- prop.table(ftable(table(data$E,data$S,data$A),1),1)
print(pE)
print("Travel given occupation and residence")
pT <- round(prop.table(ftable(table(data$O, data$T, data$R)),1),digits = 2)
print(pT)
Then I convert the results into a matrix (or array) for each node, as shown below:
# Rebuild each probability table as a plain matrix/array with explicit
# dimnames, picking the entries of the tables computed above by position.
pA <- matrix(c(pA[1],pA[2],pA[3]),ncol=3,dimnames=list(NULL,c("ADULT","OLD", "YOUNG")))#
pS <- matrix(c(pS[1],pS[2]), ncol=1,dimnames=list(c("F", "M"),NULL))
pO <- matrix(c(pO[1],pO[3],pO[2],pO[4]), ncol=2,dimnames=list(c("EMP", "SELF"), c("HIGH", "UNI")))
pR <- matrix(c(pR[1],pR[3],pR[2],pR[4]), ncol=2,dimnames=list(c("BIG", "SMALL"), c("HIGH", "UNI")))
# NOTE(review): pT is laid out as R x O x T and pE (below) as A x S x E, i.e.
# the node's own variable is the LAST dimension. The custom.fit() examples in
# the bnlearn documentation put the node itself in the first dimension,
# followed by its parents - presumably the source of "wrong dimensions for
# node E"; confirm against the custom.fit documentation.
pT <- c(pT[1], pT[7],pT[4],pT[10],pT[3],pT[9],pT[6],pT[12], pT[2],pT[8],pT[5],pT[11])
dim(pT) <- c(2,2,3)
dimnames(pT) <- list("R"=c("BIG", "SMALL"),"O"=c("EMP", "SELF"), "T"=c("CAR", "TRAIN","OTHER")) #,
pE <- c(pE[1], pE[5],pE[9],pE[2],pE[6],pE[10],pE[3],pE[7],pE[11],pE[4],pE[8],pE[12])
dim(pE) <- c(3,2,2)
dimnames(pE) <- list("A"=c("ADULT", "OLD", "YOUNG"),"S"=c("F", "M"),"E"=c("HIGH", "UNI")) #
Finally I tried this, but I got the following error:
dag.disc = custom.fit(dag, dist=list(E=pE,S=pS, A=pA, O=pO, R = pR, T=pT))
Error in check.dnode.vs.parents(node, new = dist[[node]], parents = fitted[node.parents]): wrong dimensions for node E.
Traceback:
1. custom.fit(dag, dist = list(E = pE, S = pS, A = pA, O = pO, R = pR,
. T = pT))
2. custom.fit.backend(x = x, dist = dist, ordinal = ordinal, debug = debug)
3. check.dnode.vs.parents(node, new = dist[[node]], parents = fitted[node.parents])
4. stop("wrong dimensions for node ", node, ".")
I am trying to produce a Sankey diagram in R, which is also referred to as a river plot. I've seen the question "Sankey Diagrams in R?", where a broad variety of packages producing Sankey diagrams are listed. I have input data and know several tools/packages that can produce such a diagram, BUT my question is: how can I prepare the input data for them?
Let's assume we would like to present how users have migrated between various states over 10 days and have start data set like the one below:
# Example data: 100 users, each with a random state (letters a-h) on each of
# ten days. The ten sample() calls run in day order, one column per day.
dt <- data.frame(
  userID = 1:100,
  setNames(
    lapply(1:10, function(day) sample(letters[1:8], replace = TRUE, size = 100)),
    paste0("day", 1:10, "_state")
  )
)
Now, if one would like to create a Sankey diagram with the networkD3 package, how should one transform this dt data.frame into the required input
so that we would have input like from this example
library(networkD3)
URL <- paste0(
"https://cdn.rawgit.com/christophergandrud/networkD3/",
"master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
units = "TWh", fontSize = 12, nodeWidth = 30)
EDIT
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
Then this code generates a Sankey diagram for the data.frame mentioned in the question:
# Cross-tabulate the supplied vectors and, when the first row label and the
# first column label both start with "_", redistribute the counts held in that
# "_" category.
#
# Arguments:
#   ...  passed unchanged to table(); labels are expected to look like
#        "<name> <suffix>" (only the part before the first space is used to
#        match rows with columns).
# Returns:
#   The contingency table. If the "_" condition does not hold, it is returned
#   exactly as table() built it. Otherwise its counts are replaced by the
#   redistributed ones, while the original dimnames are kept.
#
# The redistribution branch indexes the table by the stripped labels, so it
# assumes rows and columns share those labels and that "niez." is among them.
fixtable <- function(...) {
  tab <- table(...)
  # TRUE only for a first label that exists and begins with "_"; missing
  # dimnames (e.g. a one-way table) simply count as "no".
  starts_with_underscore <- function(labels) {
    isTRUE(substr(labels[1], 1, 1) == "_")
  }
  if (starts_with_underscore(colnames(tab)) &&
      starts_with_underscore(rownames(tab))) {
    tab2 <- tab
    # Keep only the part of each label before the first space.
    colnames(tab2) <- vapply(strsplit(colnames(tab2), split = " "), `[`, character(1), 1)
    rownames(tab2) <- vapply(strsplit(rownames(tab2), split = " "), `[`, character(1), 1)
    tab2[1, 1] <- 0
    # Seat stays in the club (orig. "mandat w klubie"): flows into and out of
    # the "_" category for the same label cancel out and are moved onto that
    # label's diagonal cell.
    for (par in names(which(tab2[1, ] > 0))) {
      delta <- min(tab2[par, 1], tab2[1, par])
      tab2[par, par] <- tab2[par, par] + delta
      tab2[1, par] <- tab2[1, par] - delta
      tab2[par, 1] <- tab2[par, 1] - delta
    }
    # Passes through the "niez." category (orig. "przechodzi przez
    # niezalezy"): whatever is left in the "_" row is added to the "niez."
    # row, and whatever is left in the "_" column to the "niez." column.
    for (par in names(which(tab2[1, ] > 0))) {
      tab2["niez.", par] <- tab2["niez.", par] + tab2[1, par]
      tab2[1, par] <- 0
    }
    for (par in names(which(tab2[, 1] > 0))) {
      tab2[par, "niez."] <- tab2[par, "niez."] + tab2[par, 1]
      tab2[par, 1] <- 0
    }
    # Copy the counts back while keeping the original (unstripped) dimnames.
    tab[] <- tab2[]
  }
  tab
}
# One transition table per pair of consecutive days, stacked into a single
# data frame with columns z (from), do (to) and Freq (count). Labels carry the
# day as a suffix so the same state on different days is a different node.
# The data frame is `dt`, as created in the question.
n_days <- 10L
flow2 <- do.call(rbind, lapply(seq_len(n_days - 1L), function(day) {
  from <- paste0(dt[[paste0("day", day, "_state")]], " day", day)
  to <- paste0(dt[[paste0("day", day + 1L, "_state")]], " day", day + 1L)
  data.frame(fixtable(z = from, do = to))
}))
# Keep only the transitions that actually occur.
flow2 <- flow2[flow2[, 3] > 0, ]
# Node table: every label seen as a source or a target.
nodes2 <- data.frame(name = unique(c(levels(factor(flow2[, 1])), levels(factor(flow2[, 2])))))
# Lookup from node label to zero-based node index.
nam2 <- seq_along(nodes2[, 1]) - 1
names(nam2) <- nodes2[, 1]
links2 <- data.frame(source = nam2[as.character(flow2[, 1])],
  target = nam2[as.character(flow2[, 2])],
  value = flow2[, 3])
# Plot the objects built above (links2 / nodes2).
# NOTE(review): the colourScale string uses the d3 v3 API; newer networkD3
# releases may expect JS("d3.scaleOrdinal(d3.schemeCategory20);") instead -
# confirm against the installed networkD3 version.
sankeyNetwork(Links = links2, Nodes = nodes2,
  Source = "source", Target = "target",
  Value = "value", NodeID = "name",
  fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
  colourScale = "d3.scale.category20()")
I asked a similar question a while ago, and I guess I had better post here how it can be done with tidyverse magic.
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)
# The actual data preparation happens here
dt_new <- dt %>%
  # Long format: one row per user and day column
  gather(day, state, -userID) %>%
  mutate(
    # Pull the day number out of the column name and store it as an integer
    day = as.integer(str_match(day, "[0-9]+")[, 1]),
    state = as.factor(state)
  )
Here is what the data dt_new looks like:
userID day state
1 1 1 d
2 2 1 d
3 3 1 g
4 4 1 a
5 5 1 a
6 6 1 d
7 7 1 d
8 8 1 b
9 9 1 d
10 10 1 e
...
Now plotting the Sankey plot:
ggplot(dt_new,
aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
geom_stratum() +
geom_text(stat = "stratum") +
geom_flow()
Here is the output