SparkR: Generate Density Data From Numeric Columns - r

I am wondering how I could generate a data object like the one you get when calling stats::density(df$variable) on the numeric columns of a spark dataframe?
I am looking into SparkR::spark.lapply but think I am missing something. I have created a little example below. If someone knows how and is willing to help me out, I'd be very thankful.
Best,
NF
Example:
df<- iris
gen_density_data<- function(df){
col_types<- sapply(df, class)
good_cols<- which(col_types %in% c("numeric", "integer"))
tres<- lapply(good_cols, function(x){
expr<- paste0("stats::density(df$", colnames(df)[x], ")")
eval(parse(text=expr))
})
return(tres)
}
res<- gen_density_data(df)
# And for Spark:
sdf<- SparkR::createDataFrame(iris)
gen_spark_density_data<- function(sdf){
tmp_types<- SparkR::coltypes(sdf)
good_cols_idx<- which(tmp_types %in% setdiff(tmp_types, c("character", "POSIXct", "POSIXlt", "logical")))
if(length(good_cols_idx)>=1){
tres<- SparkR::spark.lapply(good_cols_idx, function(x){
eval(parse(text=paste0("stats::density(sdf$", colnames(sdf)[x], ")")))
})
return(tres)
}
}
tst<- gen_spark_density_data(sdf=sdf) # This is where it throws errors.

I came up with a solution that works decently well. I use highcharter for plotting. I think I could further improve how I manage the partitions of the data. Right now, this might not be the most scalable solution for large data sets with columns with large differences between the minimum and maximum values. Some conditional checking is probably in order, but for the purposes of just getting an example out there, this is what I made. Note: I adopted the example from https://rpubs.com/mcocam12/KDF_byHand. Many thanks to Marc for the example.
Data:
df<- do.call("rbind", replicate(10, iris, simplify = FALSE))
sdf<- SparkR::createDataFrame(df)
sdf<- SparkR::repartition(sdf, nrow(sdf))
Functions:
gen_sdf_kernel_density_points<- function(sdf=sdf,num_values, h=1){
x_col<- SparkR::colnames(sdf)[1]
min_max_sdf<- eval(parse(text=paste0("SparkR::agg(sdf, min=min(sdf$", x_col, "), max=max(sdf$", x_col,")) %>% SparkR::collect()")))
Range = seq(min_max_sdf$min-5, min_max_sdf$max+5, 0.01)
Range<- data.frame(Range)
RangeSDF<- SparkR::createDataFrame(Range)
# this is where I think I could be better with partitions, ideas welcomed
#RangeSDF<- SparkR::repartition(RangeSDF, nrow(RangeSDF))
# if(nrow(Range)>1000){
# RangeSDF<- SparkR::repartition(RangeSDF, 200L)
# } else if(nrow(Range) > 64){
# RangeSDF<- SparkR::repartition(RangeSDF, 64L)
# }
tst<- SparkR::crossJoin(sdf, RangeSDF)
tst$density<- eval(parse(text=paste0("exp(-(tst$Range-tst$", x_col,")^2/(2*h^2))/(h*sqrt(2*pi))")))
## Now group by range and get the sum of the density, normalize by the number of values
gb_df<- SparkR::groupBy(tst, tst$Range)
densities2<- SparkR::agg(gb_df, bell_sum=sum(tst$density))
densities2<- SparkR::withColumn(densities2, "kernel_density", densities2$bell_sum / num_values)
densities2<- SparkR::arrange(densities2, asc(densities2$Range))
return(densities2)
}
gen_den_plots_from_spark_res<- function(res){
big_out<- lapply(seq_along(res), function(x){
var_name<- names(res)[x]
rdf<- res[[x]]
tmp<- data.frame(cbind(x = rdf$Range, y = rdf$kernel_density))
x<- highcharter::list_parse(tmp)
hc<- highcharter::highchart() %>%
hc_series(
list(
name="Density Estimate",
data = x,
type = "areaspline",
marker = list(enabled = FALSE),
color = list(
linearGradient = list(x1 = 0, y1 = 1, x2 = 0, y2 = 0),
stops = list(
list(0, "transparent"),
list(0.33, "#0000FF1A"),
list(0.66, "#0000FF33"),
list(1, "#ccc")
)
),
fillColor = list(
linearGradient = list(x1 = 0, y1 = 1, x2 = 0, y2 = 0),
stops = list(
list(0, "transparent"),
list(0.1, "#0000FF1A"),
list(0.5, "#0000FF33"),
list(1, "#0000FF80")
)
)
)
)
hc<- hc %>%
highcharter::hc_title(text=paste0("Density Plot For: ", snakecase::to_title_case(var_name)))# %>% hc_add_series(data =tmp, hcaes(x= tmp$x, y = tmp$y),name="Bars", type="column" )
return(hc)
})
return(big_out)
}
make_hc_grid<- function(tres_out, ncol=2){
hc<- tres_out %>%
highcharter::hw_grid(rowheight = 450, ncol = ncol) %>%htmltools::browsable()
hc
}
Usage:
tmp_types<- SparkR::coltypes(sdf)
good_cols_idx<- which(tmp_types %in% setdiff(tmp_types, c("character", "POSIXct", "POSIXlt", "logical")))
nrows_sdf<- SparkR::count(sdf)
if(length(good_cols_idx)>=1){
out<- lapply(seq_along(good_cols_idx), function(z){
# Need to select a single column for the sdf, otherwise the cross join becomes too big
tmpz<- SparkR::select(sdf, SparkR::colnames(sdf)[good_cols_idx[z]])
out<- gen_sdf_kernel_density_points(sdf = tmpz, num_values = nrows_sdf)
out<- SparkR::collect(out)
return(out)
}) %>% stats::setNames(SparkR::colnames(sdf)[good_cols_idx])
}
Plotting:
tres<- gen_den_plots_from_spark_res(res=out)
all_plots<- make_hc_grid(tres_out = tres)
# View Result
all_plots
Expected Result:
This could all probably be improved...if you have ideas, I'd love to hear them.
Best,
NF

Related

Saving Multiple HTML files in R

I am following this R tutorial here (towards the end of the page) : https://glin.github.io/reactable/articles/examples.html. I would like to make an interactive map that allows you to filter points on the map - i.e. when you click on the table rows, points on the map appear and disappear.
I decided to generate my own data for this problem. I am following the last part where they make a map/table:
library(leaflet)
library(htmltools)
library(crosstalk)
library(reactable)
library(htmlwidgets)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
id = 1:1000
long = 2.2945 + rnorm( 1000, 0.1085246 , 0.1)
lat = 48.8584 + rnorm( 1000, 0.009036273 , 0.1)
my_data_1 = data.frame(id, lat, long, var1 = myFun(1000), var2 = myFun(1000), var3 = myFun(1000) )
Then, I manipulated the data according
brew_sp <- SharedData$new(my_data_1, group = "breweries")
brew_data <- as_tibble(my_data_1) %>%
select(var1, var2, var3) %>%
SharedData$new(group = "breweries")
I then tried to make the map (I added a small change to the map):
map <- leaflet(brew_sp) %>%
addTiles() %>%
addMarkers(clusterOption=markerClusterOptions())
And then the table:
tbl <- reactable(
brew_data ,
selection = "multiple",
filterable = TRUE,
searchable = TRUE,
onClick = "select",
rowStyle = list(cursor = "pointer"),
minRows = 10
)
Then, I put them together and saved the result - this worked fine:
final_file = htmltools::browsable(
htmltools::tagList(map, tbl)
)
htmltools::save_html(tagList(map, tbl), file = "sample.html")
However, here is the problem:
I noticed that when I click on only one of the rows, I notice that everything else still appears on the map. Shouldn't all the other points disappear when only one item is selected?
Could someone show me how to make this map/table look the same way it appears in the tutorial?
Thank you!

Error in m == q : R comparison (1) is possible only for atomic and list types

The whole function which i need to convert the for loop in to apply for optimization
plans_achievements <- function(pa_m,pa_q){
if(nrow(pa_m)==0 & nrow(pa_q==0)){
df = data.frame(a = c(""), b = c("No Data Available"))
colnames(df)=""
}else{
pa_m= pa_m%>% select(inc,month_year,Plans,Achievements,quarter_year)
colnames(pa_mon)[2] = "Period"
pa_q= pa_q%>% select(inc,quarter_year,Plans,Achievements)
colnames(pa_qtr)[2] = "Period"
df = data.frame(inc=c(""),Period=c(""),Plans=c(""),Achievements=c(""))
for (q in unique(pa_q$Period)){
df1 = pa_q[pa_q$Period==q,]
df1$Period = paste0("<span style=\"color:#288D55\">",df1$Period,"</span>")
df1$Plans = paste0("<span style=\"color:#288D55\">",df1$Plans,"</span>")
df1$Achievements = paste0("<span style=\"color:#288D55\">",df1$Achievements,"</span>")
df = rbind(df,df1)
for (m in unique(pa_m$quarter_year)){
if(m==q){
df2 = pa_m[pa_m$quarter_year==q,][-5]
df = rbind(df,df2)
}
}
}
df = df[-1,]
}
return(df)
}
The apply which i tried
my_fun <- function(q){
df1 = pa_qtr[pa_qtr$Period==q,]
df1$Period = paste0("<span style=\"color:#288D55\">",df1$Period,"</span>")
df1$Plans = paste0("<span style=\"color:#288D55\">",df1$Plans,"</span>")
df1$Achievements = paste0("<span style=\"color:#288D55\">",df1$Achievements,"</span>")
df = rbind(df,df1)
}
df = do.call(rbind,lapply(unique(pa_qtr$Period), my_fun))
my_fun2 <- function(m,my_fun){
if (m == q) {
df2 = pa_mon[pa_mon$qtr_yr == q, ][-5]
df = rbind(df,df2)
}
}
df = do.call(cbind,lapply(unique(pa_mon$qtr_yr), my_fun2))
DT::datatable(plans_achievements(pa_m[pa_m$inc=="vate",],pa_q[pa_q$inc=="vate",]), rownames = F,escape = FALSE,selection=list(mode="single",target="row"),options = list(pageLength = 50,scrollX = TRUE,dom = 'tp',ordering=F,columnDefs = list(list(visible=FALSE, targets=c(0)),list(className = 'dt-left', targets = '_all'))))
Why you get the error comparison is possible only for atomic and list types
I will answer your original question first:
You get the error because you haven't defined q as a variable inside the function my_fun2. Since you haven't defined this variable, R will look for it in the global environment. There R will find the function q() (used to quit R). So you get the error message comparison (1) is possible only for atomic and list types because R thinks you are trying to compare a number m with the function q.
Here is a small example to make it easy to see:
# Run this in a clean environment
m <- 1
m == b # Understandable error message - "b" is not found
m == q # Your error - because R thinks you are comparing m to a function
You fix this error by making sure that q is defined inside your function. Either by creating it inside the function, or by supplying it as an input argument.
A possible solution for your problem
As I understand your code, you want to format, merge and sort the values in pa_q and pa_m, to display them in a html table.
Under is a possible solution, using tidyverse and vectorized operations, rather than a loop or apply functions. Vectorized functions are typically your fastest option in R, as I know you want to optimize your code.
library(dplyr)
plans_achievements <- function(pa_m, pa_q) {
# I've modified the logic a bit: there is no need to wrap the full function in
# an else statement, since we can return early if the data has no rows
if (nrow(pa_m) == 0 && nrow(pa_q == 0)) {
df = data.frame(a = c(""), b = c("No Data Available"))
colnames(df) = ""
return(df)
}
pa_q <-
pa_q %>%
# Select and rename the columns vi need
select(inc, Period = quarter_year, Plans, Achievements, date) %>%
# Format the values
mutate(
Period = paste0("<span style=\"color:#288D55\">", Period,"</span>"),
Plans = paste0("<span style=\"color:#288D55\">", Plans,"</span>"),
Achievements = paste0("<span style=\"color:#288D55\">", Achievements,"</span>")
)
pa_m <-
pa_m %>%
# Select and rename the columns we need
select(inc, Period = month_year, Plans, Achievements, date) #%>%
# Combine the datasets
bind_rows(
pa_q,
pa_m
) %>%
# Make sure that R understand date as a date value
mutate(
date = lubridate::dmy(date)
) %>%
# Sort by date
arrange(desc(date)) %>%
# Remove columns we do not need
select(-date, -inc)
}
DT::datatable(
plans_achievements(
pa_m[pa_m$inc=="vate",],
pa_q[pa_q$inc=="vate",]
),
rownames = FALSE,
escape = FALSE,
selection = list(mode = "single", target = "row"),
options = list(
pageLength = 50,
scrollX = TRUE,
dom = 'tp',
ordering = FALSE,
columnDefs = list(
list(className = 'dt-left', targets = '_all')
)
)
)
Hopefully this solves your problem.

How to shorten code for "visRemoveNodes" using loop in rstudio

I have constructed multiple protein - protein networks for diseases in shiny app and I ploted them using visnetwork. I found the articulation points for each network and I want to remove them.
My code for a disease looks like this:
output$plot54 <- renderVisNetwork({
alsex <- as.matrix(alsex)
sel1 <- alsex[,1]
sel2 <- alsex[,2]
n10 <- unique(c(sel1,sel2))
n10 <- as.data.frame(n10)
colnames(n10) <- "id"
ed10 <- as.data.frame(alsex)
colnames(ed10) <- c("from", "to", "width")
n10
g <- graph_from_data_frame(ed10)
articulation.points(g)
nodes4 <- data.frame(n10, color = ifelse(n10$id=="CLEC4E"|n10$id=="ACE2"|n10$id=="MYO7A"|n10$id=="HSPB4"
|n10$id=="EXOSC3"|n10$id=="RBM45"|n10$id=="SPAST"|n10$id=="ALMS1"|n10$id=="PIGQ"
|n10$id=="CDC27"|n10$id=="GFM1"|n10$id=="UTRN"|n10$id=="RAB7B"|n10$id=="GSN"|n10$id=="VAPA"|n10$id=="GLE1"
|n10$id=="FA2H"|n10$id=="HSPA4"|n10$id=="SNCA"|n10$id=="RAB5A"|n10$id=="SETX","red","blue"))
visNetwork(nodes4, ed10, main = "Articulation Points") %>%
visNodes (color = list(highlight = "pink"))%>%
visIgraphLayout()%>%
visOptions(highlightNearest = list(enabled = T, hover = T),
nodesIdSelection = T)%>%
visInteraction(keyboard = TRUE)
})
observe({
input$delete54
visNetworkProxy("plot54") %>%
visRemoveNodes(id="CLEC4E")%>%visRemoveEdges(id = "CLEC4E")%>%
visRemoveNodes(id="ACE2")%>%visRemoveEdges(id = "ACE2")%>%
visRemoveNodes(id="MYO7A")%>%visRemoveEdges(id = "MYO7A")%>%
visRemoveNodes(id="HSPB4")%>%visRemoveEdges(id = "HSPB4")%>%
visRemoveNodes(id="EXOSC3")%>%visRemoveEdges(id = "EXOSC3")%>%
visRemoveNodes(id="RBM45")%>%visRemoveEdges(id = "RBM45")%>%
visRemoveNodes(id="SPAST")%>%visRemoveEdges(id = "SPAST")%>%
visRemoveNodes(id="ALMS1")%>%visRemoveEdges(id = "ALMS1")%>%
visRemoveNodes(id="PIGQ")%>%visRemoveEdges(id = "PIGQ")%>%
visRemoveNodes(id="CDC27")%>%visRemoveEdges(id = "CDC27")%>%
visRemoveNodes(id="GFM1")%>%visRemoveEdges(id = "GFM1")%>%
visRemoveNodes(id="UTRN")%>%visRemoveEdges(id = "UTRN")%>%
visRemoveNodes(id="RAB7B")%>%visRemoveEdges(id = "RAB7B")%>%
visRemoveNodes(id="GSN")%>%visRemoveEdges(id = "GSN")%>%
visRemoveNodes(id="VAPA")%>%visRemoveEdges(id = "VAPA")%>%
visRemoveNodes(id="GLE1")%>%visRemoveEdges(id = "GLE1")%>%
visRemoveNodes(id="FA2H")%>%visRemoveEdges(id = "FA2H")%>%
visRemoveNodes(id="HSPA4")%>%visRemoveEdges(id = "HSPA4")%>%
visRemoveNodes(id="SNCA")%>%visRemoveEdges(id = "SNCA")%>%
visRemoveNodes(id="RAB5A")%>%visRemoveEdges(id = "RAB5A")%>%
visRemoveNodes(id="SETX")%>%visRemoveEdges(id = "SETX")
})
Using
g <- graph_from_data_frame(ed10)
articulation.points(g)
I found the articulation points, and I marked them with red color using ifelse as you can see in nodes4 vector.
My questions:
How to shorten my code in ifelse using loop, so I don't have to write the articullation points one by one manually.
How to shorten my code in visRemoveNodes and visRemoveEdges using loop, so I don't have to write them one by one manually as well.
Crossed posted at:
https://community.rstudio.com/t/how-to-shorten-code-for-visremovenodes-using-loop/72506
The answer for the second question is:
observe({
l <- c("CLEC4E","ACE2", "MYO7A", "HSPB4", "EXOSC3", "RBM45","SPAST","ALMS1",
"PIGQ","CDC27","GFM1","UTRN",
"RAB7B", "GSN", "VAPA", "GLE1","FA2H","HSPA4",
"SNCA","RAB5A","SETX") #we put all genes that we want to delete in a vector
for (i in l){
input$delete54
visNetworkProxy("plot54")%>%
visRemoveNodes(id= i)%>%visRemoveEdges(id = i)
}
})

(R) Edit Gradient Color of Cells Behind Multiple Columns of Data In Data Table

I asked a similar question a few days ago and have made some progress by talking with some people, but I need a little more help to get this to the point I wanted. So here's where I am:
I want to have a data table of a certain size with columns of data (different scales) and be able to apply a color range (kind of like a heat map) to the values in the cells of the data table depending on how large or small the values in the columns are.
Here is some code that was provided to me:
library(DT)
testrun <- round(runif(100), 6)
data <- data.frame(testrun = testrun)
brks <- quantile(data$testrun, probs = seq(.05, .95, .01), na.rm = TRUE)
clrs_df <- colorRamp(c("white","blue"))(c(0,brks)) %>%
as_tibble(.name_repair ="minimal") %>%
setNames(nm=c("r","g","b")) %>%
mutate_all(~as.character(round(.,digits=0))) %>%
mutate(mycolor=paste0("rgb(",paste(r,g,b,sep = ","),")"))
clrs <- pull(clrs_df,mycolor)
DT::datatable(data,rownames=TRUE,options = list(lengthChange = FALSE, dom='t')) %>%
formatStyle(colnames(data), backgroundColor = styleInterval(brks, clrs))
The above example works for one column of data, but I intend to have multiple columns of data for which I want these color gradients to apply to each column individually.
I've tried substituting mtcars for testrun in the assignment portion of the code and in the brks assignment line, but it just leaves me with a table with no colors applied.
Can someone help point me in the right direction? Any help would be appreciated!
Thank you!
I think you want:
library(DT)
data <- data.frame(
C1 = rnorm(100),
C2 = rgamma(100, 10, 10)
)
dtable <- datatable(data, rownames=TRUE, options = list(lengthChange = FALSE, dom='t'))
colRamp <- colorRamp(c("white","blue"))
for(column in names(data)){
x <- na.omit(data[[column]])
brks <- quantile(x, probs = seq(.05, .95, .01))
RGB <- colRamp(c(0, (brks-min(x))/(max(x)-min(x))))
clrs <- apply(RGB, 1, function(rgb){
sprintf("rgb(%s)", toString(round(rgb,0)))
})
dtable <- dtable %>%
formatStyle(column, backgroundColor = styleInterval(brks, clrs))
}
dtable

Customize colors for boxplot with highcharter

I have boxplots on highcharter and I would like to customize both the
Fill color
Border color
Here is my code
df = data.frame(cbind(categ = rep(c('a','b','c','d')),value = rnorm(1000)))
hcboxplot(var = df$categ, x = as.numeric(df$value)) %>%
hc_chart(type = "column") %>%
hc_colors(c("#203d7d","#a0a0ed","#203d7e","#a0a0ad"))
The hc_colors works only if I put var2 instead of var but then the box plot are shrunken...
API for styling fillColor: https://api.highcharts.com/highcharts/series.boxplot.fillColor
And for "Border color": https://api.highcharts.com/highcharts/series.boxplot.color
Pure JavaScript example of how to style and define points: https://jsfiddle.net/BlackLabel/6tud3fgx
And R code:
library(highcharter)
df = data.frame(cbind(categ = rep(c('a','b','c','d', 'e')),value = rnorm(1000)))
hcboxplot(var = df$categ, x = as.numeric(df$value)) %>%
hc_chart(type = "column", events = list(
load = JS("function() {
var chart = this;
chart.series[0].points[2].update({
color: 'red'
})
chart.series[0].points[4].update({
x: 4,
low: 600,
q1: 700,
median: 800,
q3: 900,
high: 1000,
color: 'orange'
})
}")
)) %>%
hc_plotOptions(boxplot = list(
fillColor = '#F0F0E0',
lineWidth = 2,
medianColor = '#0C5DA5',
medianWidth = 3,
stemColor = '#A63400',
stemDashStyle = 'dot',
stemWidth = 1,
whiskerColor = '#3D9200',
whiskerLength = '20%',
whiskerWidth = 3,
color = 'black'
)) %>%
hc_colors(c("#203d7d","#a0a0ed","#203d7e","#a0a0ad"))
I made a couple functions to do some stuff with highcharts and boxplots. It will let you color each boxplot and fill it accordingly, and then inject new graphical parameters according to the Highcharts API, should you desire.
Check it out:
## Boxplots Data and names, note the data index (0,1,2) is the first number in the datum
series<- list(
list(
name="a",
data=list(c(0,1,2,3,4,5))
),
list(
name="b",
data=list(c(1,2,3,4,5,6))
),
list(
name="c",
data=list(c(2,3,4,5,6,7))
)
)
# Graphical attribute to be set: fillColor.
# Make the colors for the box fill and then also the box lines (make them match so it looks pretty)
cols<- viridisLite::viridis(n= length(series2), alpha = 0.5) # Keeping alpha in here! (for box fill)
cols2<- substr(cols, 0,7) # no alpha, pure hex truth, for box lines
gen_key_vector<-function(variable, num_times){
return(rep(variable, num_times))
}
kv<- gen_key_vector(variable = "fillColor", length(series))
# Make a function to put stuff in the 'series' list, requires seq_along to be used since x is the list/vector index tracker
add_variable_to_series_list<- function(x, series_list, key_vector, value_vector){
base::stopifnot(length(key_vector) == length(value_vector))
base::stopifnot(length(series_list) == length(key_vector))
series_list[[x]][length(series_list[[x]])+1]<- value_vector[x]
names(series_list[[x]])[length(series_list[[x]])]<- key_vector[x]
return(series_list[[x]])
}
## Put the extra stuff in the 'series' list
series2<- lapply(seq_along(series), function(x){ add_variable_to_series_list(x = x, series_list = series, key_vector = kv, value_vector = cols) })
hc<- highcharter::highchart() %>%
highcharter::hc_chart(type="boxplot", inverted=FALSE) %>%
highcharter::hc_title(text="This is a title") %>%
highcharter::hc_legend(enabled=FALSE) %>%
highcharter::hc_xAxis(type="category", categories=c("a", "b", "c"), title=list(text="Some x-axis title")) %>%
highcharter::hc_add_series_list(series2) %>%
hc_plotOptions(series = list(
marker = list(
symbol = "circle"
),
grouping=FALSE
)) %>%
highcharter::hc_colors(cols2) %>%
highcharter::hc_exporting(enabled=TRUE)
hc
This probably could be adjusted to work with a simple dataframe, but I think it will get you what you want for right now without having to do too much extra work. Also, maybe look into list_parse or list_parse2' fromhighcharter...it could probably help with building out theseries` object..I still need to look into that.
Edit:
I have expanded the example to make it work with a regular DF. As per some follow up questions, the colors are set using the viridis palette inside the make_highchart_boxplot_with_colored_factors function. If you want to allow your own palette and colors, you could expose those arguments and just include them as parameters inside the function call. The expanded example borrows how to add outliers from the highcharter library (albeit in a hacky way) and then builds everything else up from scratch. Hopefully this helps clarify my previous answer. Please note, I could probably also clean up the if condition to make it a little more brief, but I kept it verbose for illustrative purposes.
Double Edit: You can now specify a vector of colors for each level of the factor variable
library(highcharter)
library(magrittr)
library(viridisLite)
df = data.frame(cbind(categ = rep(c('a','b','c','d')),value = rnorm(1000)))
df$value<- base::as.numeric(df$value)
add_variable_to_series_list<- function(x, series_list, key_vector, value_vector){
base::stopifnot(length(key_vector) == length(value_vector))
base::stopifnot(length(series_list) == length(key_vector))
series_list[[x]][length(series_list[[x]])+1]<- value_vector[x]
names(series_list[[x]])[length(series_list[[x]])]<- key_vector[x]
return(series_list[[x]])
}
# From highcharter github pages:
hc_add_series_bwpout = function(hc, value, by, ...) {
z = lapply(levels(by), function(x) {
bpstats = boxplot.stats(value[by == x])$stats
outliers = c()
for (y in na.exclude(value[by == x])) {
if ((y < bpstats[1]) | (y > bpstats[5]))
outliers = c(outliers, list(which(levels(by)==x)-1, y))
}
outliers
})
hc %>%
hc_add_series(data = z, type="scatter", ...)
}
gen_key_vector<-function(variable, num_times){
return(rep(variable, num_times))
}
gen_boxplot_series_from_df<- function(value, by,...){
value<- base::as.numeric(value)
by<- base::as.factor(by)
box_names<- levels(by)
z=lapply(box_names, function(x) {
boxplot.stats(value[by==x])$stats
})
tmp<- lapply(seq_along(z), function(x){
var_name_list<- list(box_names[x])
#tmp0<- list(names(df)[x])
names(var_name_list)<- "name"
index<- x-1
tmp<- list(c(index, z[[x]]))
tmp<- list(tmp)
names(tmp)<- "data"
tmp_out<- c(var_name_list, tmp)
#tmp<- list(tmp)
return(tmp_out)
})
return(tmp)
}
# Usage:
#series<- gen_boxplot_series_from_df(value = df$total_value, by=df$asset_class)
## Boxplot function:
make_highchart_boxplot_with_colored_factors<- function(value, by, chart_title="Boxplots",
chart_x_axis_label="Values", show_outliers=FALSE,
boxcolors=NULL, box_line_colors=NULL){
by<- as.factor(by)
box_names_to_use<- levels(by)
series<- gen_boxplot_series_from_df(value = value, by=by)
if(is.null(boxcolors)){
cols<- viridisLite::viridis(n= length(series), alpha = 0.5) # Keeping alpha in here! (COLORS FOR BOXES ARE SET HERE)
} else {
cols<- boxcolors
}
if(is.null(box_line_colors)){
if(base::nchar(cols[[1]])==9){
cols2<- substr(cols, 0,7) # no alpha, pure hex truth, for box lines
} else {
cols2<- cols
}
} else {
cols2<- box_line_colors
}
# Injecting value 'fillColor' into series list
kv<- gen_key_vector(variable = "fillColor", length(series))
series2<- lapply(seq_along(series), function(x){ add_variable_to_series_list(x = x, series_list = series, key_vector = kv, value_vector = cols) })
if(show_outliers == TRUE){
hc<- highcharter::highchart() %>%
highcharter::hc_chart(type="boxplot", inverted=FALSE) %>%
highcharter::hc_title(text=chart_title) %>%
highcharter::hc_legend(enabled=FALSE) %>%
highcharter::hc_xAxis(type="category", categories=box_names_to_use, title=list(text=chart_x_axis_label)) %>%
highcharter::hc_add_series_list(series2) %>%
hc_add_series_bwpout(value = value, by=by, name="Outliers") %>%
hc_plotOptions(series = list(
marker = list(
symbol = "circle"
),
grouping=FALSE
)) %>%
highcharter::hc_colors(cols2) %>%
highcharter::hc_exporting(enabled=TRUE)
} else{
hc<- highcharter::highchart() %>%
highcharter::hc_chart(type="boxplot", inverted=FALSE) %>%
highcharter::hc_title(text=chart_title) %>%
highcharter::hc_legend(enabled=FALSE) %>%
highcharter::hc_xAxis(type="category", categories=box_names_to_use, title=list(text=chart_x_axis_label)) %>%
highcharter::hc_add_series_list(series2) %>%
hc_plotOptions(series = list(
marker = list(
symbol = "circle"
),
grouping=FALSE
)) %>%
highcharter::hc_colors(cols2) %>%
highcharter::hc_exporting(enabled=TRUE)
}
hc
}
# Usage:
tst_box<- make_highchart_boxplot_with_colored_factors(value = df$value, by=df$categ, chart_title = "Some Title", chart_x_axis_label = "Some X Axis", show_outliers = TRUE)
tst_box
# Custom Colors:
custom_colors_with_alpha_in_hex<- paste0(gplots::col2hex(sample(x=colors(), size = length(unique(df$categ)), replace = FALSE)), "80")
tst_box2<- make_highchart_boxplot_with_colored_factors(value = df$value, by=df$categ, chart_title = "Some Title",
chart_x_axis_label = "Some X Axis",
show_outliers = TRUE, boxcolors = custom_colors_with_alpha_in_hex)
tst_box2
tst_box3<- make_highchart_boxplot_with_colored_factors(value = df$value, by=df$categ, chart_title = "Some Title",
chart_x_axis_label = "Some X Axis",
show_outliers = TRUE, boxcolors = custom_colors_with_alpha_in_hex, box_line_colors = "black")
tst_box3
I hope this helps, please let me know if you have any more questions. I'm happy to try to help as best I can.
-nate
Since there's no highcharter answer yet, I give you at least a base solution.
First, your definition of the data frame is somewhat flawed, rather do:
dat <- data.frame(categ=c('a','b','c','d'), value=rnorm(1000))
Now, using boxplot is quite straightforward. border option colors your borders. With option col you also could color the fills.
boxplot(value ~ categ, dat, border=c("#203d7d","#a0a0ed","#203d7e","#a0a0ad"), pars=list(outpch=16))
Gives
Note: See this nice solution for further customizations.

Resources