error while i run circ <- circle_dat(BP1, e) in GOplot - r

hi i get error when i trying to merge two data frames with the circle_dat function in Oplot library and this is my whole code
e = read_excel("C:/Users/Amahd/Desktop/proposal/final.xlsx", col_names = F)
e = e[,1:4]
colnames(e)[1:4] = c("ID_REF", "adj.p.val", "logFC", "ID")
e1 = e[,-1:-3]
e = cbind(e1,e[,c(-1,-2,-4)])
e[,1] = toupper(e[,1])
read.delim("C:/Users/Amahd/Desktop/proposal/BP1.txt")
BP1 <- BP1[,-11:-14]
BP1$Category = "BP"
BP2 = as.data.frame(BP1[,1])
BP3 = as.data.frame(BP1[,2])
BP4 = as.data.frame(BP1[,5])
BP5 = as.data.frame(BP1[,10])
BP6 = as.data.frame(BP1[,11])
BP1 = cbind(BP6,BP3,BP2,BP5,BP4)
colnames(BP1)[1:5] = c("Category", "ID", "Term", "Genes","adj_pavl")
e1 = data.frame(words = unlist(e))
circ <- circle_dat(BP1,e)
and this is my error
Error in data.frame(category = rep(as.character(terms$category),
count), : arguments imply differing number of rows: 7195, 0

Related

binding three dataframes with rbind

I have three sets of data with the same variables and different observations. The variables all share the same name, but when I try to bind them using the rbind function I see this:
names do not match previous names.
Does anyone know how to fix the problem? My desired goal is to have one dataset with numerous observations of the same variables.
What I have tried so far is this:
Daten1
> attach(Daten1)
rel.Var.1 <- data.frame(Q35, Q37, Q38, Q42, Q46, Q47, Q50, Q51, Q52, Q55, Q60, Q61,
Q91_1, Q92_1, Q93_1, Q94_1, Q95_1, Q96_1, Q97_1, Q301_1, Q300_1, Q98_1,
Q99_1, Q100_1, Q101_1, Q102_1, Q103_1, Q104_1, Q105_1, Q106_1,
Q107_1, Q108_1, Q109_1, Q110_1, Q111_1, Q112_1, Q113_1, Q114_1,
Q115_1, Q116_1, Q117_1, Q118_1, Q119_1, Q121_1, Q122_1,
Q123_1, Q124_1, Q125_1, Q126_1, Q127_1, Q128_1, Q129_1, Q130_1,
Q131_1, Q132_1, Q133_1, Q134_1, Q135_1, Q136_1, Q137_1, Q138_1,
Q139_1, Q140_1, Q141_1, Q142_1, Q143_1, Q144_1, Q145_1,
Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15, Q16, Q17, Q18, Q19, Q20,
Q21, Q22, Q23, Q24, Q25, Q26, Q27, Q28, Q29, Q30, Q31, Q32, Q33,
Q176, Q177, Q178, Q175, VPN)
>detach(Daten1)
>rel.Var.1 <- rel.Var.1 %>% rename(
neo_01 = Q35, neo_03 = Q37,neo_04 = Q38, neo_08 = Q42,
neo_12 = Q46,neo_13 = Q47, neo_16 = Q50, neo_17 = Q51, neo_18 = Q52,
neo_21 = Q55, neo_26 = Q60, neo_27 = Q61, TICS_1 = Q91_1, TICS_2 = Q92_1,
TICS_3 = Q93_1, TICS_4 = Q94_1, TICS_5 = Q95_1, TICS_6 = Q96_1,
TICS_7 = Q97_1, TICS_8 = Q301_1, TICS_9 = Q300_1, TICS_10 = Q98_1,
TICS_11 = Q99_1, TICS_12= Q100_1, TICS_13 = Q101_1, TICS_14 = Q102_1,
TICS_15 = Q103_1,
ICS_16 = Q104_1, TICS_17 = Q105_1, TICS_18 = Q106_1, TICS_19 = Q107_1,
TICS_20 = Q108_1, TICS_21 = Q109_1, TICS_22 = Q110_1, TICS_24 = Q111_1,
TICS_25 = Q112_1,
TICS_26 = Q113_1, TICS_27 = Q114_1, TICS_28 = Q115_1, TICS_29 = Q116_1,
TICS_30 = Q117_1, TICS_31 = Q118_1, TICS_32 = Q119_1, TICS_33 = Q121_1,
TICS_34 = Q122_1, TICS_35 = Q123_1, TICS_36 = Q124_1, TICS_37 = Q125_1,
TICS_38 = Q126_1, TICS_39 = Q127_1, TICS_40 = Q128_1, TICS_41 = Q129_1,
TICS_42 = Q130_1,
TICS_43 = Q131_1, TICS_44 = Q132_1, TICS_45 = Q133_1, TICS_46 = Q134_1,
TICS_47 = Q135_1, TICS_48 = Q136_1, TICS_49 = Q137_1,
TICS_50 = Q138_1, TICS_51 = Q139_1, TICS_52 = Q140_1, TICS_53 = Q141_1,
TICS_54 = Q142_1, TICS_55 = Q143_1, TICS_56 = Q144_1, TICS_57 = Q145_1,
HSPS_1 = Q7, HSPS_2 = Q8, HSPS_3 = Q9, HSPS_4 = Q10, HSPS_5 = Q11,
HSPS_6 = Q12, HSPS_7 = Q13, HSPS_8 = Q14,
HSPS_9 = Q15, HSPS_10 = Q16, HSPS_11 = Q17, HSPS_12 = Q18, HSPS_13 = Q19,
HSPS_14 = Q20, HSPS_15 = Q21, HSPS_16 = Q22,
HSPS_17 = Q23, HSPS_18 = Q24, HSPS_19 = Q25, HSPS_20 = Q26,
HSPS_21 = Q27, HSPS_22 = Q28, HSPS_23 = Q29,
HSPS_24 = Q30, HSPS_25 = Q31, HSPS_26 = Q32, HSPS_27 = Q33,
sex = Q176, Bildung = Q177, Tat = Q178, age= Q175)
>rel.Var.1 <- na.omit(rel.Var.1)
Daten 2
> attach(Daten2)
rel.Var.2 <- data.frame(Q182, Q186, Q188, Q196, Q204, Q206, Q212, Q214,
Q216, Q222, Q232, Q234,
Q221, Q222.1, Q223, Q224.1, Q225, Q226.1, Q227, Q174, Q175, Q228.1, Q229,
Q230.1,Q231, Q232.1, Q233, Q234.1, Q235, Q236.1, Q237, Q238.1,
Q239,Q240.1, Q241, Q242, Q243, Q244, Q245, Q246, Q247, Q248, Q249, Q251,
Q252, Q253, Q254, Q255, Q256, Q257, Q258, Q259, Q260, Q261, Q262, Q263,
Q264, Q265, Q266, Q267, Q268,
Q269, Q270, Q271, Q272, Q273, Q274, Q275,
Q207, Q209, Q211, Q213, Q215, Q217, Q219, Q221.1, Q223.1, Q225.1,
Q227.1, Q229.1, Q231.1, Q233.1, Q235.1, Q237.1, Q239.1, Q241.1, Q243.1,
Q245.1, Q247.1, Q249.1, Q251.1, Q253.1, Q255.1, Q257.1, Q259.1,
Q6, Q7,Q8, Q5, VPN)
>detach(Daten2)
>rel.Var.2 <- rel.Var.2 %>% rename(
neo_01 = Q182, neo_03 = Q186, neo_04 = Q188, neo_08 = Q196, neo_12 =
Q204,
neo_13 = Q206, neo_16 = Q212, neo_17 = Q214, neo_18 = Q216, neo_21 =
Q222,
neo_26 = Q232, neo_27 = Q234, TICS_1 = Q221, TICS_2 = Q222.1, TICS_3 =
Q223,
TICS_4 = Q224.1, TICS_5 = Q225, TICS_6 = Q226.1, TICS_7 = Q227, TICS_8 =
Q174,
TICS_9 = Q175, TICS_10 = Q228.1, TICS_11 = Q229, TICS_12 = Q230.1,
TICS_13 = Q231,
TICS_14 = Q232.1, TICS_15 = Q233, TICS_16 = Q234.1, TICS_17 = Q235,
TICS_18 = Q236.1,
TICS_19 = Q237, TICS_20 = Q238.1, TICS_21 = Q239, TICS_22 = Q240.1,
TICS_24 = Q241,
TICS_25 = Q242, TICS_26 = Q243, TICS_27 = Q244, TICS_28 = Q245, TICS_29 =
Q246,
TICS_30 = Q247, TICS_31 = Q248, TICS_32 = Q249, TICS_33 = Q251, TICS_34 =
Q252,
TICS_35 = Q253, ICS_36 = Q254, TICS_37 = Q255, TICS_38 = Q256, TICS_39 =
Q257,
TICS_40 = Q258, TICS_41 = Q259, TICS_42 = Q260, TICS_43 = Q261, TICS_44 =
Q262,
TICS_45 = Q263, TICS_46 = Q264, TICS_47 = Q265, TICS_48 = Q266, TICS_49 =
Q267,
TICS_50 = Q268, TICS_51 = Q269, TICS_52 = Q270, TICS_53 = Q271, TICS_54 =
Q272,
TICS_55 = Q273, TICS_56 = Q274, TICS_57 = Q275, HSPS_1 = Q207, HSPS_2 =
Q209,
HSPS_3 = Q211, HSPS_4 = Q213, HSPS_5 = Q215, HSPS_6 = Q217, HSPS_7 =
Q219,
HSPS_8 = Q221.1, HSPS_9 = Q223.1, HSPS_10 = Q225.1, HSPS_11 = Q227.1,
HSPS_12 = Q229.1,
HSPS_13 = Q231.1, HSPS_14 = Q233.1, HSPS_15 = Q235.1, HSPS_16 = Q237.1,
HSPS_17 = Q239.1,
HSPS_18 = Q241, HSPS_19 = Q243, HSPS_20 = Q245, HSPS_21 = Q247, HSPS_22 =
Q249,
HSPS_23 = Q251.1, HSPS_24 = Q253.1, HSPS_25 = Q255.1, HSPS_26 = Q257.1,
HSPS_27 = Q259.1,
sex = Q6, Bildung = Q7, Tat = Q8, age= Q5)
>rel.Var.2 <- na.omit(rel.Var.2)
Daten3
> attach(Daten3)
rel.Var.3 <- data.frame(neo_03, neo_08, neo_12, neo_16, neo_21, neo_26,
neo_01, neo_04, neo_13,
neo_17, neo_18, neo_27, TICS_1, TICS_2, TICS_3, TICS_4, TICS_5, TICS_6,
TICS_7, TICS_8, TICS_9,
TICS_10, TICS_11, TICS_12, TICS_13, TICS_14, TICS_15, ICS_16, TICS_17,
TICS_18, TICS_19, TICS_20, TICS_21, TICS_22,
TICS_24, TICS_25, TICS_26, TICS_27, TICS_28, TICS_29, TICS_30, TICS_31,
TICS_32, TICS_33, TICS_34, TICS_35, TICS_36,
TICS_37, TICS_38, TICS_39, TICS_40, TICS_41, TICS_42, TICS_43, TICS_44,
TICS_45, TICS_46, TICS_47, TICS_48, TICS_49, TICS_50,
TICS_51, TICS_52, TICS_53, TICS_54, TICS_55, TICS_56, TICS_57,
HSPS_1, HSPS_2, HSPS_3, HSPS_4, HSPS_5, HSPS_6, HSPS_7, HSPS_8, HSPS_9,
HSPS_10, HSPS_11, HSPS_12, HSPS_13,
HSPS_14, HSPS_15, HSPS_16, HSPS_17, HSPS_18, HSPS_19, HSPS_20, HSPS_21,
HSPS_22, HSPS_23, HSPS_24, HSPS_25, HSPS_26, HSPS_27,
Geschlecht, Bildungsabschluss, derzeitige_Beschaeftigung, Alter, NR)
>detach(Daten3)
>rel.Var.3 <- rel.Var.3 %>% rename(
sex = Geschlecht, Bildung = Bildungsabschluss, Tat =
derzeitige_Beschaeftigung, age= Alter, VPN = NR)
>rel.Var.3 <- na.omit(rel.Var.3)
>View(rel.Var.1)
>View(rel.Var.2)
>View(rel.Var.3)
## Datensaetze zusammenfuegen ##bind data
data_gesamt <- rbind(rel.Var.1, rel.Var.2, rel.Var.3)
data_gesamt <- bind_rows(rel.Var.1, rel.Var.2, rel.Var.3)
With bind_rows I get this error:
Can't combine `..1$neo_01` <character> and `..3$neo_01` <integer>.
Backtrace:
1. dplyr::bind_rows(rel.Var.1, rel.Var.2, rel.Var.3)
2. vctrs::vec_rbind(!!!dots, .names_to = .id)
4. vctrs::vec_default_ptype2(...)
5. vctrs::stop_incompatible_type(...)
6. vctrs:::stop_incompatible(...)
7. vctrs:::stop_vctrs(...)

Getting Duplicate Labeled Points on Scatterplot in R

I am trying to use kmeans to show what states have similar statistics with one another from the Lahman database, my code is as follows:
battingInfo <- Batting %>% filter(yearID >= 1999)
total <- merge(battingInfo,People,by="playerID")
totalN <- total[,-c(24,25,28:47)]
filterByState <- totalN %>% group_by(birthState) %>% summarise(players = length(playerID))
newMerge <- merge(totalN, filterByState, by="birthState")
newTest <- newMerge %>% group_by(birthState) %>% summarise_at(vars(G, AB, R, H, X2B, X3B, HR, RBI, SB, CS, BB,
SO, IBB, HBP, SH, SF, GIDP), sum, na.rm = TRUE)
updateTest <- newMerge %>% group_by(birthState) %>% summarise(Players = n_distinct(playerID), G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
finalUpdate <- newMerge %>% group_by(birthState = case_when(!birthState %in% state.abb ~ "Other",
TRUE ~ birthState)) %>% summarise(Players = n_distinct(playerID),
G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
This gives me the data frame I want. Now my code for kmeans is:
subDat5 <- finalUpdate[, c(2:19)]
subDatSc5 <- scale(subDat5)
distDat5 <- dist(subDatSc5)
k2<-5
km3new<-kmeans(subDatSc5, k2, nstart = 40)
fitNew <-cmdscale(distDat5) # k is the number of dim to PLOT
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16, col=km3new$cluster)
birthState=as.character(finalUpdate[,1])
View(birthState)
text(fitNew+.1, labels = birthState, cex=.5)
Everything seems to work perfectly up until the last line, when I label all the points and it outputs a graph with each point being labeled 50 times.
Is there any fix to this?
dput(fitNew) =
structure(c(-1.65773726259238, -0.534080004429963, -1.25224081559503,
-0.77600324658737, 13.7591986092784, -1.48285027332317, -1.0685046710528,
-1.40697098882713, 4.45857203274176, 1.31053002832658, -1.35540549966184,
-1.29910272287957, -1.68908570162927, 0.480144496416969, -0.592812161743823,
-1.23667901504586, -0.844421560951474, -0.827147650450116, -1.22861495063773,
-1.09472770146309, -1.68944621276222, -1.04378183282088, -1.34915033496973,
-0.951660697104605, -0.45483103293441, -1.70655513856763, -0.0616193106609581,
-1.48510165062592, -1.46251714293967, -1.66524625215651, -0.302561452071198,
-1.56675666458699, -1.28344728331308, 0.864956587539308, 0.16173394975142,
-0.850595975621662, -0.756783746315003, 24.7256817273653, -0.427398940139082,
-1.39925870808987, -0.755785801532488, -1.51858748511865, -0.944152303255372,
2.99465893267538, -1.67729960185572, -0.428860890332761, -1.66997803522651,
-0.392867003697617, -1.30257694125332, -1.66036447381944, -1.6019072254532,
-0.0137738939595427, -0.296070047308066, -0.00473553953140588,
0.0641385777789144, 1.13842140049119, -0.0268651281540734, -0.128806499497676,
-0.00491611456401126, 0.364126276181306, -0.143046769591177,
-0.0283493696039194, -0.0485069239634975, -0.0287370449451863,
0.095714493198601, -0.124528071666917, -0.0332600735692987, 0.0352695212129851,
-0.119261467201306, -0.0381525968696119, 0.0551469698282207,
-0.0115458694920637, -0.0250933419027217, 0.0406395856647227,
0.12482265126378, -0.17954163594865, -0.0113245644618699, -0.0894498877336694,
0.0305207676977073, 0.0323710265810206, -0.0491296972494748,
-0.121635810491615, 0.0175346179372083, 0.0127983868546243, 0.21663582448027,
0.0803333481747664, -0.0309611163272855, 0.0201356804088859,
-0.696293053438086, 0.133550765173667, 0.108119095159391, -0.136003613852937,
0.00557290379285935, 0.0602630898597761, -0.196004062948666,
-0.0161895096280255, -0.178283625530885, -0.0170000868214074,
0.107232630021258, 0.0375464632562086, -0.00276496483054615,
0.0193363060673037), .Dim = c(51L, 2L), .Dimnames = list(NULL,
NULL))
and dput(birthState) =
"c(\"AK\", \"AL\", \"AR\", \"AZ\", \"CA\", \"CO\", \"CT\", \"DE\", \"FL\", \"GA\", \"HI\", \"IA\", \"ID\", \"IL\", \"IN\", \"KS\", \"KY\", \"LA\", \"MA\", \"MD\", \"ME\", \"MI\", \"MN\", \"MO\", \"MS\", \"MT\", \"NC\", \"ND\", \"NE\", \"NH\", \"NJ\", \"NM\", \"NV\", \"NY\", \"OH\", \"OK\", \"OR\", \"Other\", \"PA\", \"RI\", \"SC\", \"SD\", \"TN\", \"TX\", \"UT\", \"VA\", \"VT\", \"WA\", \"WI\", \"WV\", \"WY\")"
As I mentioned in my comment, your problem is probably due to the fact that birthState is a string of an R character vector and not the actual vector.
The following code
birthState <- eval(parse(text = birthState))
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16)
text(fitNew, labels = birthState, cex=.5, pos = 4)
Yielded this for me

Error in `V<-`(`*tmp*`, value = `*vtmp*`) : invalid indexing

I used the bibliometrix function in R, and want to plot some useful graphs.
library(bibliometrix)
??bibliometrix
D<-readFiles("E:\\RE\\savedrecs.txt")
M <- convert2df(D,dbsource = "isi", format= "plaintext")
results <- biblioAnalysis(M ,sep = ";" )
S<- summary(object=results,k=10, pause=FALSE)
plot(x=results,k=10,pause=FALSE)
options(width=100)
S <- summary(object = results, k = 10, pause = FALSE)
NetMatrix <- biblioNetwork(M1, analysis = "co-occurrences", network = "author_keywords", sep = ";")
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE)
res <- thematicMap(net, NetMatrix, S)
plot(res$map)
But in the net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE), it shows error
Error in V<-(*tmp*, value = *vtmp*) : invalid indexing
. Also I cannot do the CR, it always shows unlistCR. I cannot use the NetMatrix function neither.
Some help me plsssssssss
The problem is in the data itself not in the code you presented. When I downloaded the data from bibliometrix.com and changed M1 to M (typo?) in biblioNetwork function call everything worked perfectly. Please see the code below:
library(bibliometrix)
# Plot bibliometric analysis results
D <- readFiles("http://www.bibliometrix.org/datasets/savedrecs.txt")
M <- convert2df(D, dbsource = "isi", format= "plaintext")
results <- biblioAnalysis(M, sep = ";")
S <- summary(results)
plot(x = results, k = 10, pause = FALSE)
# Plot Bibliographic Network
options(width = 100)
S <- summary(object = results, k = 10, pause = FALSE)
NetMatrix <- biblioNetwork(M, analysis = "co-occurrences", network = "author_keywords", sep = ";")
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network", type = "fruchterman",
labelsize = 0.7, halo = FALSE, cluster = "walktrap",
remove.isolates = FALSE, remove.multiple = FALSE, noloops = TRUE, weighted = TRUE)
# Plot Thematic Map
res <- thematicMap(net, NetMatrix, S)
str(M)
plot(res$map)

Rename list of lists using a named list

So I'm working with a list that contains other lists inside, with this structure:
library(graph)
library(RBGL)
library(Rgraphviz)
show(tree)
$`SO:0001968`
$`SO:0001968`$`SO:0001622`
$`SO:0001968`$`SO:0001622`$`SO:0001624`
$`SO:0001968`$`SO:0001622`$`SO:0001624`$`SO:0002090`
[1] 1
$`SO:0001968`$`SO:0001622`$`SO:0001623`
$`SO:0001968`$`SO:0001622`$`SO:0001623`$`SO:0002091`
[1] 1
$`SO:0001968`$`SO:0001969`
$`SO:0001968`$`SO:0001969`$`SO:0002090`
[1] 1
$`SO:0001968`$`SO:0001969`$`SO:0002091`
[1] 1
dput(tree)
list(`SO:0001968` = list(`SO:0001622` = list(`SO:0001624` = list(
`SO:0002090` = 1), `SO:0001623` = list(`SO:0002091` = 1)),
`SO:0001969` = list(`SO:0002090` = 1, `SO:0002091` = 1)))
The data I use to build the list comes from an object called g:
show(g)
A graphNEL graph with directed edges
Number of Nodes = 7
Number of Edges = 8
dput(g)
new("graphNEL",
nodes = c("SO:0001968", "SO:0001969", "SO:0001622",
"SO:0001623", "SO:0001624", "SO:0002090", "SO:0002091"), edgeL = list(
`SO:0001968` = list(edges = 3:2), `SO:0001969` = list(edges = 6:7),
`SO:0001622` = list(edges = 5:4), `SO:0001623` = list(edges = 7L),
`SO:0001624` = list(edges = 6L), `SO:0002090` = list(edges = integer(0)),
`SO:0002091` = list(edges = integer(0))), edgeData = new("attrData",
data = list(`SO:0001968|SO:0001622` = list(weight = 1), `SO:0001968|SO:0001969` = list(
weight = 1), `SO:0001969|SO:0002090` = list(weight = 1),
`SO:0001969|SO:0002091` = list(weight = 1), `SO:0001622|SO:0001624` = list(
weight = 1), `SO:0001622|SO:0001623` = list(weight = 1),
`SO:0001623|SO:0002091` = list(weight = 1), `SO:0001624|SO:0002090` = list(
weight = 1)), defaults = list(weight = 1)), nodeData = new("attrData",
data = list(`SO:0001968` = list(label = "coding_transcript_variant"),
`SO:0001969` = list(label = "coding_transcript_intron_variant"),
`SO:0001622` = list(label = "UTR_variant"), `SO:0001623` = list(
label = "5_prime_UTR_variant"), `SO:0001624` = list(
label = "3_prime_UTR_variant"), `SO:0002090` = list(
label = "3_prime_UTR_intron_variant"), `SO:0002091` = list(
label = "5_prime_UTR_intron_variant")), defaults = list(
label = NA_character_)), renderInfo = new("renderInfo",
nodes = list(), edges = list(), graph = list(), pars = list()),
graphData = list(edgemode = "directed"))
Each SO:000XXX corresponds to a name, and I can find the names using the function nodeData, that returns a named list:
nodeData(g, nodes(g), "label")
$`SO:0001968`
[1] "coding_transcript_variant"
$`SO:0001969`
[1] "coding_transcript_intron_variant"
$`SO:0001622`
[1] "UTR_variant"
$`SO:0001623`
[1] "5_prime_UTR_variant"
$`SO:0001624`
[1] "3_prime_UTR_variant"
$`SO:0002090`
[1] "3_prime_UTR_intron_variant"
$`SO:0002091`
[1] "5_prime_UTR_intron_variant"
What I need is to replace (or rename) the data in the tree list with the corresponding string of the nodeData function.
For example, replace the 'SO:0001968' in the tree list for coding_transcript_variant from the nodeData function.
This recursive function should do the trick :
# you will do this but I couldn't install your packages
# nodeD <- nodeData(g, nodes(g), "label")
nodeD <- list(`SO:0001968` = "coding_transcript_variant",
`SO:0001969` = "coding_transcript_intron_variant",
`SO:0001622` = "UTR_variant",
`SO:0001623` = "5_prime_UTR_variant",
`SO:0001624` = "3_prime_UTR_variant",
`SO:0002090` = "3_prime_UTR_intron_variant",
`SO:0002091` = "5_prime_UTR_intron_variant")
rename_items <- function(item){
if (is.list(item)){
item <- lapply(item,rename_items)
names(item) <- unname(nodeD[names(item)])
}
item
}
tree2 <- rename_items(tree)
Result
# $coding_transcript_variant
# $coding_transcript_variant$UTR_variant
# $coding_transcript_variant$UTR_variant$`3_prime_UTR_variant`
# $coding_transcript_variant$UTR_variant$`3_prime_UTR_variant`$`3_prime_UTR_intron_variant`
# [1] 1
#
#
# $coding_transcript_variant$UTR_variant$`5_prime_UTR_variant`
# $coding_transcript_variant$UTR_variant$`5_prime_UTR_variant`$`5_prime_UTR_intron_variant`
# [1] 1
#
#
#
# $coding_transcript_variant$coding_transcript_intron_variant
# $coding_transcript_variant$coding_transcript_intron_variant$`3_prime_UTR_intron_variant`
# [1] 1
#
# $coding_transcript_variant$coding_transcript_intron_variant$`5_prime_UTR_intron_variant`
# [1] 1
If you save the output from nodeData() to a vector, you can use the names() function to assign the names to a list().
An example of assigning names to list elements:
x <- 1:5
y <- 11:20
z <- 21:25
theList <- list(x,y,z)
listNames <- c("element1","element2","element3")
names(theList) <- listNames
# access first element by name, using $ form of extract operator
theList$element1
...and the output:
> theList$element1
[1] 1 2 3 4 5
>
You may need to unlist() the output of nodeData() as follows:
theNames <- unlist(nodeData(g, nodes(g), "label"))
names(g) <- theNames

How to prepare input data for a sankey diagrams in R?

I am trying to produce a sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R? where a broad variaty of packages producing sankey diagrams are listed. Since I have input data and know different tools/packages I can produce such diagram BUT my euqestion is: how can I prepare input data for such?
Let's assume we would like to present how users have migrated between various states over 10 days and have start data set like the one below:
data.frame(userID = 1:100,
day1_state = sample(letters[1:8], replace = TRUE, size = 100),
day2_state = sample(letters[1:8], replace = TRUE, size = 100),
day3_state = sample(letters[1:8], replace = TRUE, size = 100),
day4_state = sample(letters[1:8], replace = TRUE, size = 100),
day5_state = sample(letters[1:8], replace = TRUE, size = 100),
day6_state = sample(letters[1:8], replace = TRUE, size = 100),
day7_state = sample(letters[1:8], replace = TRUE, size = 100),
day8_state = sample(letters[1:8], replace = TRUE, size = 100),
day9_state = sample(letters[1:8], replace = TRUE, size = 100),
day10_state = sample(letters[1:8], replace = TRUE, size = 100)
) -> dt
Now if one would like to create a sankey diagram with networkD3 package how should one tranform this dt data.frame into required input
so that we would have input like from this example
library(networkD3)
URL <- paste0(
"https://cdn.rawgit.com/christophergandrud/networkD3/",
"master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
units = "TWh", fontSize = 12, nodeWidth = 30)
EDIT
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
Then this code generates such sankey diagram for mentioned in question data.frame
fixtable <- function(...) {
tab <- table(...)
if (substr(colnames(tab)[1],1,1) == "_" &
substr(rownames(tab)[1],1,1) == "_") {
tab2 <- tab
colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
tab2[1,1] <- 0
# mandat w klubie
for (par in names(which(tab2[1,] > 0))) {
delta = min(tab2[par, 1], tab2[1, par])
tab2[par, par] = tab2[par, par] + delta
tab2[1, par] = tab2[1, par] - delta
tab2[par, 1] = tab2[par, 1] - delta
}
# przechodzi przez niezalezy
for (par in names(which(tab2[1,] > 0))) {
tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
tab2[1, par] = 0
}
for (par in names(which(tab2[,1] > 0))) {
tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
tab2[par, 1] = 0
}
tab[] <- tab2[]
}
tab
}
flow2 <- rbind(
data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))
flow2 <- flow2[flow2[,3] > 0,]
nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
nam2 <- seq_along(nodes2[,1])-1
names(nam2) <- nodes2[,1]
links2 <- data.frame(source = nam2[as.character(flow2[,1])],
target = nam2[as.character(flow2[,2])],
value = flow2[,3])
sankeyNetwork(Links = links, Nodes = nodes,
Source = "source", Target = "target",
Value = "value", NodeID = "name",
fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
colourScale = "d3.scale.category20()")
I asked a similar question while ago. And I guess I better post it here how it can be done with the tidyverse magic.
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)
# The actual data preperation happens here
dt_new <- dt %>%
gather(day, state, -userID) %>% # Long format
mutate(day = str_match(day, "[0-9]+")[,1]) %>% # Get the numbers
mutate(day = as.integer(day), # Convert to proper data types
state = as.factor(state))
Here is how the data dt_new looks like
userID day state
1 1 1 d
2 2 1 d
3 3 1 g
4 4 1 a
5 5 1 a
6 6 1 d
7 7 1 d
8 8 1 b
9 9 1 d
10 10 1 e
...
Now plotting the Sankey plot:
ggplot(dt_new,
aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
geom_stratum() +
geom_text(stat = "stratum") +
geom_flow()
Here is the output

Resources