Create Percentage Change Labels Barplot - r

I am trying to create a barchart which shows percentage change between the bars for each category of performance test (smallpt,compress etc)
Current Graph Example:
Data:
CG2400Host <- data.frame(
smallpt = c(38.934),
compress = c(58.036),
ffte = c(5629.20),
johntheripper = c(8067),
mtdgemm = c(2.043463),
stockfish = c(16746109),
streamCopy = c(83562.0),
streamScale = c(79536.7),
streamTriad = c(82708.4),
streamAdd = c(83041.6),
dbenchOneClient = c(579.090),
dbenchSixClient = c(2814.47),
dbenchTwelveClient = c(4141.33),
dbenchFortyEight = c(4044.82),
dbenchOneHundredTwentyEight = c(851.355),
dbenchTwoHundredFiftyEight = c(870.838)
)
CG2300Host <- data.frame(
smallpt = c(19.712),
compress = c(52.873),
ffte = c(4626.09),
johntheripper = c(8729),
mtdgemm = c(5.494281),
stockfish = c(17610837),
streamCopy = c(79427.8),
streamScale = c(60582.3),
streamTriad = c(69226.6),
streamAdd = c(67805.7),
dbenchOneClient = c(47.8331),
dbenchSixClient = c(67.661),
dbenchTwelveClient = c(82.4374),
dbenchFortyEight = c(109.27),
dbenchOneHundredTwentyEight = c(111.981),
dbenchTwoHundredFiftyEight = c(95.2279)
)
GB1UHost <- data.frame(
smallpt = c(17.530),
compress = c(44.628),
ffte = c(7365.97),
johntheripper = c(11684),
mtdgemm = c(1.161368),
stockfish = c(22878029),
streamCopy = c(44096.4),
streamScale = c(29866.3),
streamTriad = c(31804.6),
streamAdd = c(31796.5),
dbenchOneClient = c(755.644),
dbenchSixClient = c(3333.72),
dbenchTwelveClient = c(4497.29),
dbenchFortyEight = c(3510.50),
dbenchOneHundredTwentyEight = c(2092.10),
dbenchTwoHundredFiftyEight = c(1720.72)
)
DellHost <- data.frame(
smallpt = c(19.081),
compress = c(38.394),
ffte = c(8569.61),
johntheripper = c(13365),
mtdgemm = c(1.791839),
stockfish = c(22134688),
streamCopy = c(133314.5),
streamScale = c(89241.6),
streamTriad = c(94915.5),
streamAdd = c(93186.8),
dbenchOneClient = c(852.674),
dbenchSixClient = c(3369.59),
dbenchTwelveClient = c(4348.31),
dbenchFortyEight = c(1497.37),
dbenchOneHundredTwentyEight = c(1528.85),
dbenchTwoHundredFiftyEight = c(1505.47)
)
Current code:
createHostComparisonBarchart <- function(CG2300DF,CG2400DF,GB1UDF,DellDF){
BarChartNames<-c("Smallpt Barchart","Compressed G-Zip","FFTE","John The Ripper","Mt-dgemm","Stockfish","Stream - Copy",
"Stream - Scale","Stream - Triad","Stream - Add","Dbench 1 Client","Dbench 6 Client","Dbench 12 Client",
"Dbench 48 Client","Dbench 128 Client","Dbench 256 Client")
UnitNames <- c("seconds","seconds","MFLOPS/s","Crypts/s","MFLOPS/s","Nodes/s","MB/s","MB/s","MB/s","MB/s","MB/s","MB/s",
"MB/s","MB/s","MB/s","MB/s")
IndexNames <- c("smallpt","compress","ffte","johntheripper","mtdgemm","stockfish","streamCopy","streamScale","streamTriad",
"streamAdd","dbenchOneClient","dbenchSixClient","dbenchTwelveClient","dbenchFortyEight","dbenchOneHundredTwentyEight",
"dbenchTwoHundredFiftyEight")
for (i in 1:length(BarChartNames)){
values <- data.frame(
serverType <- c("CG2300","CG2400","GB1U","R6525 Dell"),
result <- c(CG2300DF[i],CG2400DF[i],GB1UDF[i],DellDF[i])
)
p<-ggplot(data=values, aes(x=serverType, y=result,fill=serverType)) +
geom_bar(stat="identity")+theme_minimal()+
xlab("Server Type")+
ylab(UnitNames[i])+
ggtitle(BarChartNames[i])
print(p)
}
}
I have a function for calculating the percentage change between the values:
percentageChangeCalc <-function(serverADF,serverBDF){
percentChange <-c()
for (i in 1:length(colnames(serverADF))){
val <- (serverBDF[i] - serverADF[i])/ serverADF[i]
percentChange <- append(percentChange,val)
}
percentChange
}
Since each "bar" in the chart is compared to the one next to it...
percentageChangeCalc(CG2300Host,CG2400Host)
percentageChangeCalc(CG400Host,GB1UHost)
percentageChangeCalc(GB1UHost,DellHost)
would work.
I have tried different iterations of implementing this from using geom_text to geom_label but I seem to keep getting Error: Discrete value supplied to continuous scale. This makes me think it is not possible to carry out this with my current data.
I am aiming for something like:
Any help appreciated.

Maybe this helps:
library(tidyverse)
set.seed(1337)
data <- tibble(year = seq(2014, 2019), value = rpois(6, lambda = 10))
data
#> # A tibble: 6 × 2
#> year value
#> <int> <int>
#> 1 2014 10
#> 2 2015 5
#> 3 2016 8
#> 4 2017 8
#> 5 2018 6
#> 6 2019 12
data %>%
mutate(
diff = dplyr::lead(value) - value,
label_y = value %>% map2_dbl(diff, ~ 1.1 * max(.x, .x + .y))
) %>%
ggplot(aes(year)) +
geom_col(aes(y = value)) +
geom_errorbar(aes(ymin = value, ymax = value + diff), color = "red", width = 0.3) +
geom_label(aes(y = label_y, label = diff), color = "red")
#> Warning: Removed 1 rows containing missing values (geom_label).
Created on 2022-02-22 by the reprex package (v2.0.0)

Related

Ifelse fails in render function

The following code works. The functional line of code is p1+p2+p3+p4+show. However, if I comment that line out and uncomment the commented lines, the app still works, but the associated map doesn't render. The error is argument is of length zero.
Why does this happen, and how do I fix it?
Thanks as always!
UI
tabPanel("US Map",
fluidRow(
sliderInput("daterange","Date Range",
min = lubridate::as_date("2022-05-01"),
max = lubridate::as_date("2022-12-31"),
value = c(lubridate::as_date("2022-05-01"),
lubridate::as_date("2022-12-31"))),
checkboxInput("tweets","Show Tweets",value = TRUE),
checkboxInput("actions","Show Actions",value = TRUE),
hr(),
tmapOutput("geo2")
))
#> Error in tabPanel("US Map", fluidRow(sliderInput("daterange", "Date Range", : could not find function "tabPanel"
Created on 2022-12-10 with reprex v2.0.2
Server
output$geo2 <- renderTmap({
tweets_sf %<>% filter(date>=input$daterange[1],date<=input$daterange[2])
actions_sf <- actions %>%
# filter(incident_date >= input$daterange[1],incident_date <= input$daterange[2]) %>%
st_as_sf(coords = c("lon","lat"),crs=4326) %>%
st_cast("POINT")
tmap_mode("view")
## tmap mode set to plotting
p1<-tm_shape(usa) +
tm_fill("#acc480",alpha = 0.4,id = "NAME")
p2<-tm_shape(usa) +
tm_borders("black", lwd = .5)
p3<-tm_shape(actions_sf) +
tm_text("target_type", size = 1,scale=1) +
tm_symbols(col = "brown1",shape = "target_type",shapes = rep(5,6),
size = 1,alpha=0.4, scale = .25) +
tm_legend(show = TRUE)
p4<-tm_shape(tweets_sf) +
tm_text("ngram", size = 1,scale=.7) +
tm_symbols(col = "deepskyblue1", id = "date",
size = 1,alpha=0.4, scale = 0.15,palette = "RdBu") +
tm_legend(show = TRUE)
show<-tm_view(set.view = c(-98.5795,39.8283,4))
# if(input$tweets==TRUE & input$action==TRUE){
# p1+p2+p3+p4+show
# }else if(input$tweets==TRUE & input$actions==FALSE){
# p1+p2+p4+show
# }else if(input$tweets==FALSE & input$actions==TRUE){
# p1+p2+p3+show
# }else{p1+p2+show}
p1+p2+p3+p4+show
})
#> Error in renderTmap({: could not find function "renderTmap"
Created on 2022-12-10 with reprex v2.0.2
argument is of length zero is because the left parameter of the condition is empty.
Use DescTools::Coalesce() (or other similar function)
if(DescTools::Coalesce(input$tweets),TRUE)==TRUE
& DescTools::Coalesce(input$actions,TRUE)==FALSE)
etc
https://www.rdocumentation.org/packages/DescTools/versions/0.99.47/topics/Coalesce

Getting Duplicate Labeled Points on Scatterplot in R

I am trying to use kmeans to show what states have similar statistics with one another from the Lahman database, my code is as follows:
battingInfo <- Batting %>% filter(yearID >= 1999)
total <- merge(battingInfo,People,by="playerID")
totalN <- total[,-c(24,25,28:47)]
filterByState <- totalN %>% group_by(birthState) %>% summarise(players = length(playerID))
newMerge <- merge(totalN, filterByState, by="birthState")
newTest <- newMerge %>% group_by(birthState) %>% summarise_at(vars(G, AB, R, H, X2B, X3B, HR, RBI, SB, CS, BB,
SO, IBB, HBP, SH, SF, GIDP), sum, na.rm = TRUE)
updateTest <- newMerge %>% group_by(birthState) %>% summarise(Players = n_distinct(playerID), G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
finalUpdate <- newMerge %>% group_by(birthState = case_when(!birthState %in% state.abb ~ "Other",
TRUE ~ birthState)) %>% summarise(Players = n_distinct(playerID),
G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
This gives me the data frame I want. Now my code for kmeans is:
subDat5 <- finalUpdate[, c(2:19)]
subDatSc5 <- scale(subDat5)
distDat5 <- dist(subDatSc5)
k2<-5
km3new<-kmeans(subDatSc5, k2, nstart = 40)
fitNew <-cmdscale(distDat5) # k is the number of dim to PLOT
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16, col=km3new$cluster)
birthState=as.character(finalUpdate[,1])
View(birthState)
text(fitNew+.1, labels = birthState, cex=.5)
Everything seems to work perfectly up until the last line, when I label all the points and it outputs a graph with each point being labeled 50 times.
Is there any fix to this?
dput(fitNew) =
structure(c(-1.65773726259238, -0.534080004429963, -1.25224081559503,
-0.77600324658737, 13.7591986092784, -1.48285027332317, -1.0685046710528,
-1.40697098882713, 4.45857203274176, 1.31053002832658, -1.35540549966184,
-1.29910272287957, -1.68908570162927, 0.480144496416969, -0.592812161743823,
-1.23667901504586, -0.844421560951474, -0.827147650450116, -1.22861495063773,
-1.09472770146309, -1.68944621276222, -1.04378183282088, -1.34915033496973,
-0.951660697104605, -0.45483103293441, -1.70655513856763, -0.0616193106609581,
-1.48510165062592, -1.46251714293967, -1.66524625215651, -0.302561452071198,
-1.56675666458699, -1.28344728331308, 0.864956587539308, 0.16173394975142,
-0.850595975621662, -0.756783746315003, 24.7256817273653, -0.427398940139082,
-1.39925870808987, -0.755785801532488, -1.51858748511865, -0.944152303255372,
2.99465893267538, -1.67729960185572, -0.428860890332761, -1.66997803522651,
-0.392867003697617, -1.30257694125332, -1.66036447381944, -1.6019072254532,
-0.0137738939595427, -0.296070047308066, -0.00473553953140588,
0.0641385777789144, 1.13842140049119, -0.0268651281540734, -0.128806499497676,
-0.00491611456401126, 0.364126276181306, -0.143046769591177,
-0.0283493696039194, -0.0485069239634975, -0.0287370449451863,
0.095714493198601, -0.124528071666917, -0.0332600735692987, 0.0352695212129851,
-0.119261467201306, -0.0381525968696119, 0.0551469698282207,
-0.0115458694920637, -0.0250933419027217, 0.0406395856647227,
0.12482265126378, -0.17954163594865, -0.0113245644618699, -0.0894498877336694,
0.0305207676977073, 0.0323710265810206, -0.0491296972494748,
-0.121635810491615, 0.0175346179372083, 0.0127983868546243, 0.21663582448027,
0.0803333481747664, -0.0309611163272855, 0.0201356804088859,
-0.696293053438086, 0.133550765173667, 0.108119095159391, -0.136003613852937,
0.00557290379285935, 0.0602630898597761, -0.196004062948666,
-0.0161895096280255, -0.178283625530885, -0.0170000868214074,
0.107232630021258, 0.0375464632562086, -0.00276496483054615,
0.0193363060673037), .Dim = c(51L, 2L), .Dimnames = list(NULL,
NULL))
and dput(birthState) =
"c(\"AK\", \"AL\", \"AR\", \"AZ\", \"CA\", \"CO\", \"CT\", \"DE\", \"FL\", \"GA\", \"HI\", \"IA\", \"ID\", \"IL\", \"IN\", \"KS\", \"KY\", \"LA\", \"MA\", \"MD\", \"ME\", \"MI\", \"MN\", \"MO\", \"MS\", \"MT\", \"NC\", \"ND\", \"NE\", \"NH\", \"NJ\", \"NM\", \"NV\", \"NY\", \"OH\", \"OK\", \"OR\", \"Other\", \"PA\", \"RI\", \"SC\", \"SD\", \"TN\", \"TX\", \"UT\", \"VA\", \"VT\", \"WA\", \"WI\", \"WV\", \"WY\")"
As I mentioned in my comment, your problem is probably due to the fact that birthState is a string of an R character vector and not the actual vector.
The following code
birthState <- eval(parse(text = birthState))
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16)
text(fitNew, labels = birthState, cex=.5, pos = 4)
Yielded this for me

Rename list of lists using a named list

So I'm working with a list that contains other lists inside, with this structure:
library(graph)
library(RBGL)
library(Rgraphviz)
show(tree)
$`SO:0001968`
$`SO:0001968`$`SO:0001622`
$`SO:0001968`$`SO:0001622`$`SO:0001624`
$`SO:0001968`$`SO:0001622`$`SO:0001624`$`SO:0002090`
[1] 1
$`SO:0001968`$`SO:0001622`$`SO:0001623`
$`SO:0001968`$`SO:0001622`$`SO:0001623`$`SO:0002091`
[1] 1
$`SO:0001968`$`SO:0001969`
$`SO:0001968`$`SO:0001969`$`SO:0002090`
[1] 1
$`SO:0001968`$`SO:0001969`$`SO:0002091`
[1] 1
dput(tree)
list(`SO:0001968` = list(`SO:0001622` = list(`SO:0001624` = list(
`SO:0002090` = 1), `SO:0001623` = list(`SO:0002091` = 1)),
`SO:0001969` = list(`SO:0002090` = 1, `SO:0002091` = 1)))
The data I use to build the list comes from an object called g:
show(g)
A graphNEL graph with directed edges
Number of Nodes = 7
Number of Edges = 8
dput(g)
new("graphNEL",
nodes = c("SO:0001968", "SO:0001969", "SO:0001622",
"SO:0001623", "SO:0001624", "SO:0002090", "SO:0002091"), edgeL = list(
`SO:0001968` = list(edges = 3:2), `SO:0001969` = list(edges = 6:7),
`SO:0001622` = list(edges = 5:4), `SO:0001623` = list(edges = 7L),
`SO:0001624` = list(edges = 6L), `SO:0002090` = list(edges = integer(0)),
`SO:0002091` = list(edges = integer(0))), edgeData = new("attrData",
data = list(`SO:0001968|SO:0001622` = list(weight = 1), `SO:0001968|SO:0001969` = list(
weight = 1), `SO:0001969|SO:0002090` = list(weight = 1),
`SO:0001969|SO:0002091` = list(weight = 1), `SO:0001622|SO:0001624` = list(
weight = 1), `SO:0001622|SO:0001623` = list(weight = 1),
`SO:0001623|SO:0002091` = list(weight = 1), `SO:0001624|SO:0002090` = list(
weight = 1)), defaults = list(weight = 1)), nodeData = new("attrData",
data = list(`SO:0001968` = list(label = "coding_transcript_variant"),
`SO:0001969` = list(label = "coding_transcript_intron_variant"),
`SO:0001622` = list(label = "UTR_variant"), `SO:0001623` = list(
label = "5_prime_UTR_variant"), `SO:0001624` = list(
label = "3_prime_UTR_variant"), `SO:0002090` = list(
label = "3_prime_UTR_intron_variant"), `SO:0002091` = list(
label = "5_prime_UTR_intron_variant")), defaults = list(
label = NA_character_)), renderInfo = new("renderInfo",
nodes = list(), edges = list(), graph = list(), pars = list()),
graphData = list(edgemode = "directed"))
Each SO:000XXX corresponds to a name, and I can find the names using the function nodeData, that returns a named list:
nodeData(g, nodes(g), "label")
$`SO:0001968`
[1] "coding_transcript_variant"
$`SO:0001969`
[1] "coding_transcript_intron_variant"
$`SO:0001622`
[1] "UTR_variant"
$`SO:0001623`
[1] "5_prime_UTR_variant"
$`SO:0001624`
[1] "3_prime_UTR_variant"
$`SO:0002090`
[1] "3_prime_UTR_intron_variant"
$`SO:0002091`
[1] "5_prime_UTR_intron_variant"
What I need is to replace (or rename) the data in the tree list with the corresponding string of the nodeData function.
For example, replace the 'SO:0001968' in the tree list for coding_transcript_variant from the nodeData function.
This recursive function should do the trick :
# you will do this but I couldn't install your packages
# nodeD <- nodeData(g, nodes(g), "label")
nodeD <- list(`SO:0001968` = "coding_transcript_variant",
`SO:0001969` = "coding_transcript_intron_variant",
`SO:0001622` = "UTR_variant",
`SO:0001623` = "5_prime_UTR_variant",
`SO:0001624` = "3_prime_UTR_variant",
`SO:0002090` = "3_prime_UTR_intron_variant",
`SO:0002091` = "5_prime_UTR_intron_variant")
rename_items <- function(item){
if (is.list(item)){
item <- lapply(item,rename_items)
names(item) <- unname(nodeD[names(item)])
}
item
}
tree2 <- rename_items(tree)
Result
# $coding_transcript_variant
# $coding_transcript_variant$UTR_variant
# $coding_transcript_variant$UTR_variant$`3_prime_UTR_variant`
# $coding_transcript_variant$UTR_variant$`3_prime_UTR_variant`$`3_prime_UTR_intron_variant`
# [1] 1
#
#
# $coding_transcript_variant$UTR_variant$`5_prime_UTR_variant`
# $coding_transcript_variant$UTR_variant$`5_prime_UTR_variant`$`5_prime_UTR_intron_variant`
# [1] 1
#
#
#
# $coding_transcript_variant$coding_transcript_intron_variant
# $coding_transcript_variant$coding_transcript_intron_variant$`3_prime_UTR_intron_variant`
# [1] 1
#
# $coding_transcript_variant$coding_transcript_intron_variant$`5_prime_UTR_intron_variant`
# [1] 1
If you save the output from nodeData() to a vector, you can use the names() function to assign the names to a list().
An example of assigning names to list elements:
x <- 1:5
y <- 11:20
z <- 21:25
theList <- list(x,y,z)
listNames <- c("element1","element2","element3")
names(theList) <- listNames
# access first element by name, using $ form of extract operator
theList$element1
...and the output:
> theList$element1
[1] 1 2 3 4 5
>
You may need to unlist() the output of nodeData() as follows:
theNames <- unlist(nodeData(g, nodes(g), "label"))
names(g) <- theNames

I want to plot a line over four bargraphs of data in ggplot using geom_line.

I keep getting an error becasue the bargraphs are used for quaterly data and the line is going to be data from the entire year so it will have many points.
The only issue is with the geom_line function which I am new to using. The error is -->
Error in scale_fill_manual(values = c("green", "yellow")) + geom_line(aes(x = dts2, : non-numeric argument to binary operator
t="DG"
fin=getFinancials(t, auto.assign = F)
dts = labels(fin$BS$A)[[2]]
dts2 = paste(substr(dts,1,7),"::",dts, sep="")
stockprices = getSymbols(t, auto.assign = F)
price = rep(0,NROW(dts))
for(i in 1:NROW(price))
{
price[i]=as.vector(last(stockprices[dts2[i],6]))
}
yr= as.numeric(substr(dts,1,4))
pastyr = yr -2
betayr = paste(pastyr,"::",yr,sep="")
os = fin$BS$A["Total Common Shares Outstanding", ]
gw= fin$BS$A["Goodwill, Net", ]
ta= fin$BS$A["Total Assets", ]
td= fin$BS$A["Total Debt", ]
ni= fin$IS$A["Net Income", ]
btax = fin$IS$A["Income Before Tax", ]
atax = fin$IS$A["Income After Tax",]
intpaid = fin$CF$A["Cash Interest Paid, Supplemental",]
gw[is.na(gw)]=0
intpaid[is.na(intpaid)]=0
taa = (ta - gw)/os
Rd = rep(0,NROW(dts))
for(i in 1:NROW(dts))
{
if(td[i]!=0)
{
Rd[i] = intpaid[i]/td[i]
}
}
gspc = getSymbols("^GSPC", auto.assign = F)
gs5 = getSymbols("GS5", src = "FRED", auto.assign = F)
marketRisk = rep(0,NROW(dts))
riskFree = rep(0,NROW(dts))
beta = rep(0,NROW(dts))
for(i in 1:NROW(dts))
{
marketRisk[i]= mean(yearlyReturn(gspc[betayr[i]]))
riskFree[i] = mean(gs5[betayr[i]])
gspc.weekly = weeklyReturn(gspc[betayr[i]])
stockprices.weekly = weeklyReturn(stockprices[betayr[i]])
beta[i] = CAPM.beta(stockprices.weekly,gspc.weekly)
}
Re = (riskFree/100) + beta * (marketRisk-(riskFree/100))
E = os*price
V=E+td
Tc = (btax - atax)/btax
wacc = (E/V)*Re + (td/V)*Rd*(1-Tc)
margin = (ni/wacc)/os - taa
taadf = data.frame(dts,val = taa,cat="ta")
margindf = data.frame(dts,val = margin ,cat="margin")
mdf=rbind(margindf,taadf)
#linrng = paste(dts[NROW(dts)],"::",dts[1],sep="")
#dfdt = data.frame(stockprices[linrng,6])
#dfdt2 = data.frame(dt = labels(dfdt)[[1]],dfdt$AAPL.Adjusted,cat="taa")
#names(dfdt2)=c("dt,price,cat")
pricedf = data.frame(as.vector((stockprices[dts2[i],6])))
ggplot(mdf, aes(x=dts,y=val,fill=cat)) + geom_bar(stat="identity",color="black")
scale_fill_manual(values = c("green","yellow")) +
geom_line(aes(x=dts2, y=stockprices), stat = "identity",
position = "identity", na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE)
Note, the object stockprices is An ‘xts’ object. So, you can't use inside ggplot scale. I picked the fist variable of stockprices object to show the code, but you probabli want another one.
library(dplyr)
library(quantmod)
library(PerformanceAnalytics)
library(ggplot2)
stockprices_df <- as.data.frame(stockprices) %>%
mutate(date = rownames(.)) %>%
filter(date %in% dts)
ggplot() +
geom_col(
data = mdf,
aes(x = dts,y = val,fill = cat)
) +
geom_line(
data = stockprices_df,
aes(x = date, y = DG.Open, group = 1 )
) +
scale_fill_manual(values = c("green","yellow"))
[

How to prepare input data for a sankey diagrams in R?

I am trying to produce a sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R? where a broad variaty of packages producing sankey diagrams are listed. Since I have input data and know different tools/packages I can produce such diagram BUT my euqestion is: how can I prepare input data for such?
Let's assume we would like to present how users have migrated between various states over 10 days and have start data set like the one below:
data.frame(userID = 1:100,
day1_state = sample(letters[1:8], replace = TRUE, size = 100),
day2_state = sample(letters[1:8], replace = TRUE, size = 100),
day3_state = sample(letters[1:8], replace = TRUE, size = 100),
day4_state = sample(letters[1:8], replace = TRUE, size = 100),
day5_state = sample(letters[1:8], replace = TRUE, size = 100),
day6_state = sample(letters[1:8], replace = TRUE, size = 100),
day7_state = sample(letters[1:8], replace = TRUE, size = 100),
day8_state = sample(letters[1:8], replace = TRUE, size = 100),
day9_state = sample(letters[1:8], replace = TRUE, size = 100),
day10_state = sample(letters[1:8], replace = TRUE, size = 100)
) -> dt
Now if one would like to create a sankey diagram with networkD3 package how should one tranform this dt data.frame into required input
so that we would have input like from this example
library(networkD3)
URL <- paste0(
"https://cdn.rawgit.com/christophergandrud/networkD3/",
"master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
units = "TWh", fontSize = 12, nodeWidth = 30)
EDIT
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
Then this code generates such sankey diagram for mentioned in question data.frame
fixtable <- function(...) {
tab <- table(...)
if (substr(colnames(tab)[1],1,1) == "_" &
substr(rownames(tab)[1],1,1) == "_") {
tab2 <- tab
colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
tab2[1,1] <- 0
# mandat w klubie
for (par in names(which(tab2[1,] > 0))) {
delta = min(tab2[par, 1], tab2[1, par])
tab2[par, par] = tab2[par, par] + delta
tab2[1, par] = tab2[1, par] - delta
tab2[par, 1] = tab2[par, 1] - delta
}
# przechodzi przez niezalezy
for (par in names(which(tab2[1,] > 0))) {
tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
tab2[1, par] = 0
}
for (par in names(which(tab2[,1] > 0))) {
tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
tab2[par, 1] = 0
}
tab[] <- tab2[]
}
tab
}
flow2 <- rbind(
data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))
flow2 <- flow2[flow2[,3] > 0,]
nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
nam2 <- seq_along(nodes2[,1])-1
names(nam2) <- nodes2[,1]
links2 <- data.frame(source = nam2[as.character(flow2[,1])],
target = nam2[as.character(flow2[,2])],
value = flow2[,3])
sankeyNetwork(Links = links, Nodes = nodes,
Source = "source", Target = "target",
Value = "value", NodeID = "name",
fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
colourScale = "d3.scale.category20()")
I asked a similar question while ago. And I guess I better post it here how it can be done with the tidyverse magic.
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)
# The actual data preperation happens here
dt_new <- dt %>%
gather(day, state, -userID) %>% # Long format
mutate(day = str_match(day, "[0-9]+")[,1]) %>% # Get the numbers
mutate(day = as.integer(day), # Convert to proper data types
state = as.factor(state))
Here is how the data dt_new looks like
userID day state
1 1 1 d
2 2 1 d
3 3 1 g
4 4 1 a
5 5 1 a
6 6 1 d
7 7 1 d
8 8 1 b
9 9 1 d
10 10 1 e
...
Now plotting the Sankey plot:
ggplot(dt_new,
aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
geom_stratum() +
geom_text(stat = "stratum") +
geom_flow()
Here is the output

Resources