Related
I am trying to create a barchart which shows percentage change between the bars for each category of performance test (smallpt,compress etc)
Current Graph Example:
Data:
CG2400Host <- data.frame(
smallpt = c(38.934),
compress = c(58.036),
ffte = c(5629.20),
johntheripper = c(8067),
mtdgemm = c(2.043463),
stockfish = c(16746109),
streamCopy = c(83562.0),
streamScale = c(79536.7),
streamTriad = c(82708.4),
streamAdd = c(83041.6),
dbenchOneClient = c(579.090),
dbenchSixClient = c(2814.47),
dbenchTwelveClient = c(4141.33),
dbenchFortyEight = c(4044.82),
dbenchOneHundredTwentyEight = c(851.355),
dbenchTwoHundredFiftyEight = c(870.838)
)
CG2300Host <- data.frame(
smallpt = c(19.712),
compress = c(52.873),
ffte = c(4626.09),
johntheripper = c(8729),
mtdgemm = c(5.494281),
stockfish = c(17610837),
streamCopy = c(79427.8),
streamScale = c(60582.3),
streamTriad = c(69226.6),
streamAdd = c(67805.7),
dbenchOneClient = c(47.8331),
dbenchSixClient = c(67.661),
dbenchTwelveClient = c(82.4374),
dbenchFortyEight = c(109.27),
dbenchOneHundredTwentyEight = c(111.981),
dbenchTwoHundredFiftyEight = c(95.2279)
)
GB1UHost <- data.frame(
smallpt = c(17.530),
compress = c(44.628),
ffte = c(7365.97),
johntheripper = c(11684),
mtdgemm = c(1.161368),
stockfish = c(22878029),
streamCopy = c(44096.4),
streamScale = c(29866.3),
streamTriad = c(31804.6),
streamAdd = c(31796.5),
dbenchOneClient = c(755.644),
dbenchSixClient = c(3333.72),
dbenchTwelveClient = c(4497.29),
dbenchFortyEight = c(3510.50),
dbenchOneHundredTwentyEight = c(2092.10),
dbenchTwoHundredFiftyEight = c(1720.72)
)
DellHost <- data.frame(
smallpt = c(19.081),
compress = c(38.394),
ffte = c(8569.61),
johntheripper = c(13365),
mtdgemm = c(1.791839),
stockfish = c(22134688),
streamCopy = c(133314.5),
streamScale = c(89241.6),
streamTriad = c(94915.5),
streamAdd = c(93186.8),
dbenchOneClient = c(852.674),
dbenchSixClient = c(3369.59),
dbenchTwelveClient = c(4348.31),
dbenchFortyEight = c(1497.37),
dbenchOneHundredTwentyEight = c(1528.85),
dbenchTwoHundredFiftyEight = c(1505.47)
)
Current code:
createHostComparisonBarchart <- function(CG2300DF,CG2400DF,GB1UDF,DellDF){
BarChartNames<-c("Smallpt Barchart","Compressed G-Zip","FFTE","John The Ripper","Mt-dgemm","Stockfish","Stream - Copy",
"Stream - Scale","Stream - Triad","Stream - Add","Dbench 1 Client","Dbench 6 Client","Dbench 12 Client",
"Dbench 48 Client","Dbench 128 Client","Dbench 256 Client")
UnitNames <- c("seconds","seconds","MFLOPS/s","Crypts/s","MFLOPS/s","Nodes/s","MB/s","MB/s","MB/s","MB/s","MB/s","MB/s",
"MB/s","MB/s","MB/s","MB/s")
IndexNames <- c("smallpt","compress","ffte","johntheripper","mtdgemm","stockfish","streamCopy","streamScale","streamTriad",
"streamAdd","dbenchOneClient","dbenchSixClient","dbenchTwelveClient","dbenchFortyEight","dbenchOneHundredTwentyEight",
"dbenchTwoHundredFiftyEight")
for (i in 1:length(BarChartNames)){
values <- data.frame(
serverType <- c("CG2300","CG2400","GB1U","R6525 Dell"),
result <- c(CG2300DF[i],CG2400DF[i],GB1UDF[i],DellDF[i])
)
p<-ggplot(data=values, aes(x=serverType, y=result,fill=serverType)) +
geom_bar(stat="identity")+theme_minimal()+
xlab("Server Type")+
ylab(UnitNames[i])+
ggtitle(BarChartNames[i])
print(p)
}
}
I have a function for calculating the percentage change between the values:
percentageChangeCalc <-function(serverADF,serverBDF){
percentChange <-c()
for (i in 1:length(colnames(serverADF))){
val <- (serverBDF[i] - serverADF[i])/ serverADF[i]
percentChange <- append(percentChange,val)
}
percentChange
}
Since each "bar" in the chart is compared to the one next to it...
percentageChangeCalc(CG2300Host,CG2400Host)
percentageChangeCalc(CG400Host,GB1UHost)
percentageChangeCalc(GB1UHost,DellHost)
would work.
I have tried different iterations of implementing this from using geom_text to geom_label but I seem to keep getting Error: Discrete value supplied to continuous scale. This makes me think it is not possible to carry out this with my current data.
I am aiming for something like:
Any help appreciated.
Maybe this helps:
library(tidyverse)
set.seed(1337)
data <- tibble(year = seq(2014, 2019), value = rpois(6, lambda = 10))
data
#> # A tibble: 6 × 2
#> year value
#> <int> <int>
#> 1 2014 10
#> 2 2015 5
#> 3 2016 8
#> 4 2017 8
#> 5 2018 6
#> 6 2019 12
data %>%
mutate(
diff = dplyr::lead(value) - value,
label_y = value %>% map2_dbl(diff, ~ 1.1 * max(.x, .x + .y))
) %>%
ggplot(aes(year)) +
geom_col(aes(y = value)) +
geom_errorbar(aes(ymin = value, ymax = value + diff), color = "red", width = 0.3) +
geom_label(aes(y = label_y, label = diff), color = "red")
#> Warning: Removed 1 rows containing missing values (geom_label).
Created on 2022-02-22 by the reprex package (v2.0.0)
A C
-------
KK 1
KK 3
KK 3
AA 4
AA 52
BB 33
BB 7
BB 83
DD 91
DD 10
i have a table like the one above i want to use column A and make it into groups
and use start and end index to be used in pack_rows to automate using the dataframe above.
i tried to use for loop but it didn't work it showing error in loop so how can i loop through
using below code
kbl(df,booktabs = T,longtable = T) %>%
kable_styling(latex_options = c("repeat_header"),bootstrap_options = "bordered",font_size = 7,full_width = F)%>%
row_spec(0, bold = T, color = "white", background = "#008752")%>%
pack_rows("KK", 1, 3,background = "#008000")%>%
pack_rows("AA", 4, 5,background = "#008000")
Make A column as factor with levels same as appearance. Use table in pack_rows -
library(knitr)
library(kableExtra)
df$A <- factor(df$A, unique(df$A))
kbl(df,booktabs = T,longtable = T) %>%
kable_styling(latex_options = c("repeat_header"),
bootstrap_options = "bordered",font_size = 7,full_width = F)%>%
row_spec(0, bold = T, color = "white", background = "#008752")%>%
pack_rows(index = table(df$A), background = "#008000")
I have an R list that contains 2500 lists in it. Each of 2500 lists contain 1 vector and 2 values. For the sake of reproducibility, I subset a tiny version of the data so it looks something like this:
head(models, 1)
>$model_1
>$model_1$m
> [1] 0.01335775 0.01336487 0.01336805 0.01338025 0.01340532 0.01343117 0.01346120 0.01349530 0.01353788 > 0.01357654 0.01360668
>$model_1$Cab
>[1] 59.6774
>$model_1$LAI
>[1] 4.01739
>$model_2
>$model_2$m
> [1] 0.02367338 0.02360433 0.02352800 0.02346125 0.02339469 0.02333403 0.02325861 0.02317945 0.02310961 >0.02303802 0.02295710
>$model_2$Cab
>[1] 59.6774
>$model_2$LAI
>[1] 0.5523946
Now, I want to make a line plot (using ggplot2) whose x axis is values from 400 to 410 and y axis is the vector in each lists (models$model_1$m, models$model_2$m and so on.) Therefore, there will be a lot of lines in the plot. I also want to color (continuous coloring) each line with their respective models$model_2$Cab values and have a continuous legend showing each models$model_2$Cab value and its color.
For reproducibility (Please note that this is greatly simplified version of the original data):
> dput(head(models, 10))
list(model_1 = list(m = c(0.0133577497667816, 0.0133648693063468,
0.0133680481888036, 0.01338024983382, 0.0134053218864944, 0.0134311717034271,
0.0134612003419723, 0.0134953017438241, 0.0135378825635721, 0.0135765418166368,
0.0136066826886183), Cab = 59.6773970406502, LAI = 4.01739045299768),
model_2 = list(m = c(0.023673375903171, 0.0236043348551818,
0.0235280045196734, 0.0234612496831449, 0.0233946873132861,
0.0233340349230324, 0.0232586128971129, 0.0231794538902946,
0.0231096074536893, 0.023038021285693, 0.0229570982021948
), Cab = 59.6773970406502, LAI = 0.552394618641403), model_3 = list(
m = c(0.0138277418755234, 0.0138310132688916, 0.0138301891768216,
0.0138383905159343, 0.0138587906203227, 0.0138802253169266,
0.0139048786261519, 0.0139332011615252, 0.0139700189737812,
0.0140030367215791, 0.0140275202380309), Cab = 59.6773970406502,
LAI = 3.01987725977579), model_4 = list(m = c(0.017483089696901,
0.0174591709902523, 0.017429967081058, 0.0174099884420304,
0.0173976896061841, 0.0173882607103241, 0.0173752969257632,
0.0173632160871019, 0.0173599236031355, 0.0173536114293099,
0.0173384748063733), Cab = 59.6773970406502, LAI = 1.37503600459533),
model_5 = list(m = c(0.0182499047037402, 0.0182203724940146,
0.0181853063358603, 0.0181595102703982, 0.0181404648083386,
0.0181246681180869, 0.0181039409709977, 0.01808352264341,
0.0180719579429791, 0.018057532687598, 0.0180342187796566
), Cab = 59.6773970406502, LAI = 1.22529135635182), model_6 = list(
m = c(0.0158200567917405, 0.0158083674745268, 0.0157919331298277,
0.0157846269346119, 0.0157870246965916, 0.0157914665730281,
0.0157954117645301, 0.0158014906653224, 0.0158162176575737,
0.0158275775312257, 0.0158302513933357), Cab = 59.6773970406502,
LAI = 1.81571552453658), model_7 = list(m = c(0.0133628950691214,
0.0133699680411211, 0.0133730986417069, 0.0133852517083498,
0.0134102666346747, 0.0134360623898904, 0.0134660252680654,
0.0135000559061319, 0.0135425658393117, 0.013581155812944,
0.013611227528355), Cab = 59.6773970406502, LAI = 3.99643688124574),
model_8 = list(m = c(0.0183501671255408, 0.0183199017377111,
0.0182840698901064, 0.0182575139774255, 0.0182375872739662,
0.0182209588085648, 0.0181992175650369, 0.0181777101462036,
0.0181650648958527, 0.0181495798700031, 0.0181251977995322
), Cab = 59.6773970406502, LAI = 1.20735517669905), model_9 = list(
m = c(0.0143687162679524, 0.0143678440890305, 0.0143626995592654,
0.0143666036037224, 0.0143820089259476, 0.0143987279254991,
0.0144176359711743, 0.0144397860850458, 0.0144704682720878,
0.0144974726755733, 0.0145159061770205), Cab = 59.6773970406502,
LAI = 2.51320168699674), model_10 = list(m = c(0.0138736072820698,
0.0138765215672426, 0.0138753253418108, 0.0138831561248062,
0.0139031250366076, 0.0139241525443688, 0.0139483098566198,
0.0139760994306543, 0.0140123870383231, 0.0140448852992375,
0.0140688465774421), Cab = 59.6773970406502, LAI = 2.96397596054064))
What I want to achieve is something like this (but with a better-looking ggplot2):
This could be achieved like so:
Convert your list of lists to a list of dataframes.
Add a variable with your x-axis variable to each df
Bind the list of data frames by row
Plot, where I make use of scale_colour_gradientn(colors = rainbow(20)) to mimic your rainbow color scale.
library(dplyr)
library(ggplot2)
models <- lapply(models, as.data.frame) %>%
lapply(function(x) { x$x <- 400:410; x}) %>%
bind_rows(.id = "id")
ggplot(models, aes(x = x, y = m, color = LAI, group = id)) +
geom_line() +
scale_x_continuous(breaks = scales::pretty_breaks()) +
scale_colour_gradientn(colors = rainbow(20))
I am trying to use kmeans to show what states have similar statistics with one another from the Lahman database, my code is as follows:
battingInfo <- Batting %>% filter(yearID >= 1999)
total <- merge(battingInfo,People,by="playerID")
totalN <- total[,-c(24,25,28:47)]
filterByState <- totalN %>% group_by(birthState) %>% summarise(players = length(playerID))
newMerge <- merge(totalN, filterByState, by="birthState")
newTest <- newMerge %>% group_by(birthState) %>% summarise_at(vars(G, AB, R, H, X2B, X3B, HR, RBI, SB, CS, BB,
SO, IBB, HBP, SH, SF, GIDP), sum, na.rm = TRUE)
updateTest <- newMerge %>% group_by(birthState) %>% summarise(Players = n_distinct(playerID), G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
finalUpdate <- newMerge %>% group_by(birthState = case_when(!birthState %in% state.abb ~ "Other",
TRUE ~ birthState)) %>% summarise(Players = n_distinct(playerID),
G = sum(G), AB = sum(AB),
R = sum(R), H = sum(H), X2B = sum(X2B), X3B = sum(X3B),
HR = sum(HR), RBI = sum(RBI), SB = sum(SB), CS = sum(CS),
BB = sum(BB), SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP),
SH = sum(SH), SF = sum(SF), GIDP = sum(GIDP))
This gives me the data frame I want. Now my code for kmeans is:
subDat5 <- finalUpdate[, c(2:19)]
subDatSc5 <- scale(subDat5)
distDat5 <- dist(subDatSc5)
k2<-5
km3new<-kmeans(subDatSc5, k2, nstart = 40)
fitNew <-cmdscale(distDat5) # k is the number of dim to PLOT
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16, col=km3new$cluster)
birthState=as.character(finalUpdate[,1])
View(birthState)
text(fitNew+.1, labels = birthState, cex=.5)
Everything seems to work perfectly up until the last line, when I label all the points and it outputs a graph with each point being labeled 50 times.
Is there any fix to this?
dput(fitNew) =
structure(c(-1.65773726259238, -0.534080004429963, -1.25224081559503,
-0.77600324658737, 13.7591986092784, -1.48285027332317, -1.0685046710528,
-1.40697098882713, 4.45857203274176, 1.31053002832658, -1.35540549966184,
-1.29910272287957, -1.68908570162927, 0.480144496416969, -0.592812161743823,
-1.23667901504586, -0.844421560951474, -0.827147650450116, -1.22861495063773,
-1.09472770146309, -1.68944621276222, -1.04378183282088, -1.34915033496973,
-0.951660697104605, -0.45483103293441, -1.70655513856763, -0.0616193106609581,
-1.48510165062592, -1.46251714293967, -1.66524625215651, -0.302561452071198,
-1.56675666458699, -1.28344728331308, 0.864956587539308, 0.16173394975142,
-0.850595975621662, -0.756783746315003, 24.7256817273653, -0.427398940139082,
-1.39925870808987, -0.755785801532488, -1.51858748511865, -0.944152303255372,
2.99465893267538, -1.67729960185572, -0.428860890332761, -1.66997803522651,
-0.392867003697617, -1.30257694125332, -1.66036447381944, -1.6019072254532,
-0.0137738939595427, -0.296070047308066, -0.00473553953140588,
0.0641385777789144, 1.13842140049119, -0.0268651281540734, -0.128806499497676,
-0.00491611456401126, 0.364126276181306, -0.143046769591177,
-0.0283493696039194, -0.0485069239634975, -0.0287370449451863,
0.095714493198601, -0.124528071666917, -0.0332600735692987, 0.0352695212129851,
-0.119261467201306, -0.0381525968696119, 0.0551469698282207,
-0.0115458694920637, -0.0250933419027217, 0.0406395856647227,
0.12482265126378, -0.17954163594865, -0.0113245644618699, -0.0894498877336694,
0.0305207676977073, 0.0323710265810206, -0.0491296972494748,
-0.121635810491615, 0.0175346179372083, 0.0127983868546243, 0.21663582448027,
0.0803333481747664, -0.0309611163272855, 0.0201356804088859,
-0.696293053438086, 0.133550765173667, 0.108119095159391, -0.136003613852937,
0.00557290379285935, 0.0602630898597761, -0.196004062948666,
-0.0161895096280255, -0.178283625530885, -0.0170000868214074,
0.107232630021258, 0.0375464632562086, -0.00276496483054615,
0.0193363060673037), .Dim = c(51L, 2L), .Dimnames = list(NULL,
NULL))
and dput(birthState) =
"c(\"AK\", \"AL\", \"AR\", \"AZ\", \"CA\", \"CO\", \"CT\", \"DE\", \"FL\", \"GA\", \"HI\", \"IA\", \"ID\", \"IL\", \"IN\", \"KS\", \"KY\", \"LA\", \"MA\", \"MD\", \"ME\", \"MI\", \"MN\", \"MO\", \"MS\", \"MT\", \"NC\", \"ND\", \"NE\", \"NH\", \"NJ\", \"NM\", \"NV\", \"NY\", \"OH\", \"OK\", \"OR\", \"Other\", \"PA\", \"RI\", \"SC\", \"SD\", \"TN\", \"TX\", \"UT\", \"VA\", \"VT\", \"WA\", \"WI\", \"WV\", \"WY\")"
As I mentioned in my comment, your problem is probably due to the fact that birthState is a string of an R character vector and not the actual vector.
The following code
birthState <- eval(parse(text = birthState))
plot(fitNew, xlab="Coordinate 1",ylab="Coordinate 2", pch=16)
text(fitNew, labels = birthState, cex=.5, pos = 4)
Yielded this for me
I am trying to produce a sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R? where a broad variaty of packages producing sankey diagrams are listed. Since I have input data and know different tools/packages I can produce such diagram BUT my euqestion is: how can I prepare input data for such?
Let's assume we would like to present how users have migrated between various states over 10 days and have start data set like the one below:
data.frame(userID = 1:100,
day1_state = sample(letters[1:8], replace = TRUE, size = 100),
day2_state = sample(letters[1:8], replace = TRUE, size = 100),
day3_state = sample(letters[1:8], replace = TRUE, size = 100),
day4_state = sample(letters[1:8], replace = TRUE, size = 100),
day5_state = sample(letters[1:8], replace = TRUE, size = 100),
day6_state = sample(letters[1:8], replace = TRUE, size = 100),
day7_state = sample(letters[1:8], replace = TRUE, size = 100),
day8_state = sample(letters[1:8], replace = TRUE, size = 100),
day9_state = sample(letters[1:8], replace = TRUE, size = 100),
day10_state = sample(letters[1:8], replace = TRUE, size = 100)
) -> dt
Now if one would like to create a sankey diagram with networkD3 package how should one tranform this dt data.frame into required input
so that we would have input like from this example
library(networkD3)
URL <- paste0(
"https://cdn.rawgit.com/christophergandrud/networkD3/",
"master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
Target = "target", Value = "value", NodeID = "name",
units = "TWh", fontSize = 12, nodeWidth = 30)
EDIT
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:
https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R
Then this code generates such sankey diagram for mentioned in question data.frame
fixtable <- function(...) {
tab <- table(...)
if (substr(colnames(tab)[1],1,1) == "_" &
substr(rownames(tab)[1],1,1) == "_") {
tab2 <- tab
colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
tab2[1,1] <- 0
# mandat w klubie
for (par in names(which(tab2[1,] > 0))) {
delta = min(tab2[par, 1], tab2[1, par])
tab2[par, par] = tab2[par, par] + delta
tab2[1, par] = tab2[1, par] - delta
tab2[par, 1] = tab2[par, 1] - delta
}
# przechodzi przez niezalezy
for (par in names(which(tab2[1,] > 0))) {
tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
tab2[1, par] = 0
}
for (par in names(which(tab2[,1] > 0))) {
tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
tab2[par, 1] = 0
}
tab[] <- tab2[]
}
tab
}
flow2 <- rbind(
data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))
flow2 <- flow2[flow2[,3] > 0,]
nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
nam2 <- seq_along(nodes2[,1])-1
names(nam2) <- nodes2[,1]
links2 <- data.frame(source = nam2[as.character(flow2[,1])],
target = nam2[as.character(flow2[,2])],
value = flow2[,3])
sankeyNetwork(Links = links, Nodes = nodes,
Source = "source", Target = "target",
Value = "value", NodeID = "name",
fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
colourScale = "d3.scale.category20()")
I asked a similar question while ago. And I guess I better post it here how it can be done with the tidyverse magic.
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(dplyr)
library(stringr)
# The actual data preperation happens here
dt_new <- dt %>%
gather(day, state, -userID) %>% # Long format
mutate(day = str_match(day, "[0-9]+")[,1]) %>% # Get the numbers
mutate(day = as.integer(day), # Convert to proper data types
state = as.factor(state))
Here is how the data dt_new looks like
userID day state
1 1 1 d
2 2 1 d
3 3 1 g
4 4 1 a
5 5 1 a
6 6 1 d
7 7 1 d
8 8 1 b
9 9 1 d
10 10 1 e
...
Now plotting the Sankey plot:
ggplot(dt_new,
aes(x = day, stratum = state, alluvium = userID, fill = state, label = state)) +
geom_stratum() +
geom_text(stat = "stratum") +
geom_flow()
Here is the output