I have a dataset and would like to do some exploratory data analysis before building a predictive model. All variables are categorical. I know that I can use 'dataExplorer' to do some quick EDA:
library(tidyverse)
library(dataExplorer)
dat <- data.frame(circuit = sample(c("China", "Murica", "Brazil"), 100, replace = T),
driver = sample(c("Kimi", "Seb", "Max", "Lando", "Lance"), 100, replace = T),
opinion = sample(c("Garbage", "Not.Garbage"), 100, replace = T, prob = c(0.8, 0.2)))
dat %>%
select(-opinion) %>%
plot_bar
However, I want the bars for 'circuit' and 'driver' to be filled in to represent the respective proportions of 'opinion' for each variable (see below). This is so that I can see which predictor variables are most closely associated with my outcome variable.
dat %>%
ggplot(aes(x = circuit, fill = opinion)) +
geom_histogram(stat = "count")
However, I don't want to build each plot individually and then use grid.arrange to organize them.
Thanks for any help :)
I don't think there is an easy way unless you tweak the plot_bar function, since it is currently designed to visualize univariate distribution. Run the following function and it should work with your example:
library(tidyverse)
library(data.table) ## Note: You will need to load data.table
library(DataExplorer)
## Rewrite plot_bar
plot_bar2 <- function(data, group, with = NULL, maxcat = 50, order_bar = TRUE, binary_as_factor = TRUE, title = NULL, ggtheme = theme_gray(), theme_config = list(), nrow = 3L, ncol = 3L, parallel = FALSE) {
frequency <- measure <- variable <- value <- NULL
if (!is.data.table(data)) data <- data.table(data)
split_data <- split_columns(data, binary_as_factor = binary_as_factor)
if (split_data$num_discrete == 0) stop("No discrete features found!")
discrete <- split_data$discrete
ind <- DataExplorer:::.ignoreCat(discrete, maxcat = maxcat)
if (length(ind)) {
message(length(ind), " columns ignored with more than ", maxcat, " categories.\n", paste0(names(ind), ": ", ind, " categories\n"))
drop_columns(discrete, names(ind))
if (length(discrete) == 0) stop("Note: All discrete features ignored! Nothing to plot!")
}
feature_names <- names(discrete)
if (is.null(with)) {
dt <- discrete[, list(frequency = .N), by = feature_names]
} else {
if (is.factor(data[[with]])) {
measure_var <- suppressWarnings(as.numeric(levels(data[[with]]))[data[[with]]])
} else if (is.character(data[[with]])) {
measure_var <- as.numeric(data[[with]])
} else {
measure_var <- data[[with]]
}
if (all(is.na(measure_var))) stop("Failed to convert `", with, "` to continuous!")
if (with %in% names(discrete)) drop_columns(discrete, with)
tmp_dt <- data.table(discrete, "measure" = measure_var)
dt <- tmp_dt[, list(frequency = sum(measure, na.rm = TRUE)), by = feature_names]
}
dt2 <- suppressWarnings(melt.data.table(dt, id.vars = c(group, "frequency"), measure.vars = setdiff(feature_names, group))) # This line is updated
layout <- DataExplorer:::.getPageLayout(nrow, ncol, ncol(discrete))
plot_list <- DataExplorer:::.lapply(
parallel = parallel,
X = layout,
FUN = function(x) {
if (order_bar) {
base_plot <- ggplot(dt2[variable %in% feature_names[x]], aes(x = reorder(value, frequency), y = frequency))
} else {
base_plot <- ggplot(dt2[variable %in% feature_names[x]], aes(x = value, y = frequency))
}
base_plot +
geom_bar(stat = "identity", aes_string(fill = group)) + # This line is updated
coord_flip() +
xlab("") + ylab(ifelse(is.null(with), "Frequency", toTitleCase(with)))
}
)
class(plot_list) <- c("multiple", class(plot_list))
plotDataExplorer(
plot_obj = plot_list,
page_layout = layout,
title = title,
ggtheme = ggtheme,
theme_config = theme_config,
facet_wrap_args = list(
"facet" = ~ variable,
"nrow" = nrow,
"ncol" = ncol,
"scales" = "free"
)
)
}
## Create data and plot
dat <- data.frame(
circuit = sample(c("China", "Murica", "Brazil"), 100, replace = T),
driver = sample(c("Kimi", "Seb", "Max", "Lando", "Lance"), 100, replace = T),
opinion = sample(c("Garbage", "Not.Garbage"), 100, replace = T, prob = c(0.8, 0.2))
)
plot_bar2(dat, group = "opinion")
The plot looks like this:
Related
Is there a way to generate the column defitions for color_tiles dynamically? Here is my current approach which does not work correctly, as the tiles are always colored by the last class-column that I pass.
library(reactable)
library(reactablefmtr)
library(dplyr)
library(purrr)
tile_column <- function(x, ref_column, name, show_text = FALSE) {
colDef(
show = TRUE,
name = name,
cell = color_tiles(
x,
color_by = ref_column,
colors = c("red", "orange", "yellow", "limegreen", "green"),
box_shadow = TRUE,
show_text = show_text,
),
na = ""
)
}
data <- tibble(
name = paste("name", 1:10),
val1 = paste("val1", 1:10),
val2 = paste("val2", 1:10),
class1 = sample(1:5, 10, replace = TRUE),
class2 = sample(1:5, 10, replace = TRUE),
)
col_definitions1 <- list(
name = colDef(show = TRUE, name = "Name")
)
col_definitions2 <- list()
for (i in 1:2) {
col_definitions2 <- c(
col_definitions2,
list(tile_column(data, ref_column = paste0("class", i), name = paste("Column", i), show_text = TRUE))
)
}
col_definitions3 <- col_definitions2 |> set_names(paste0("val", 1:2))
col_definitions <- c(col_definitions1, col_definitions3)
# browser()
data %>%
reactable(
defaultColDef = colDef(show = FALSE),
columns = col_definitions
)
Not sure what's the issue. But to make your code work you could switch to lapply instead of using a for loop. Just a guess, but as a similar issue is well known when creating ggplot2 inside a for loop the reason for the issue might be related to lazy evaluation.
library(reactable)
library(reactablefmtr)
library(dplyr)
set.seed(123)
col_definitions2 <- lapply(1:2, function(i) tile_column(data, ref_column = paste0("class", i), name = paste("Column", i), show_text = TRUE))
names(col_definitions2) <- paste0("val", 1:2)
col_definitions <- c(col_definitions1, col_definitions2)
data %>%
reactable(
defaultColDef = colDef(show = FALSE),
columns = col_definitions
)
I am using the R programming language. I trying to learn how to customize hover text in 3d plotly objects as seen here: https://rstudio-pubs-static.s3.amazonaws.com/441420_9a7c15988f3c4f59b2d828eb87ba1634.html
Recently, I have learned how to create a 3d plotly object for some data that I simulated :
library(Rtsne)
library(dplyr)
library(ggplot2)
library(plotly)
library(caret)
library(randomForest)
#data
a = iris
a <- unique(a)
#create two species just to make things easier
s <- c("a","b")
species<- sample(s , 149, replace=TRUE, prob=c(0.3, 0.7))
a$species = species
a$species = as.factor(a$species)
#split data into train/test, and then random forest
index = createDataPartition(a$species, p=0.7, list = FALSE)
train = a[index,]
test = a[-index,]
rf = randomForest(species ~ ., data=train, ntree=50, mtry=2)
#have the model predict the test set
pred = predict(rf, test, type = "prob")
labels = as.factor(ifelse(pred[,2]>0.5, "a", "b"))
confusionMatrix(labels, test$species)
#tsne algorithm
tsne_obj_3 <- Rtsne(test[,-5], perplexity=1, dims=3)
df_m2 <- as.data.frame(tsne_obj_3$Y)
df_m2$labels = test$species
df_m2$color = ifelse(df_m2$labels == "a", "red","blue")
df_m2$petal_length = test$Petal.Length
axis_1 = df_m2$V1
axis_2 = df_m2$V2
axis_3 = df_m2$V3
plot_ly(x=as.vector(axis_1),
y=as.vector(axis_2),
z=axis_3,
type="scatter3d",
mode="markers",
name = "Obs",
marker = list(size = 3)) %>%
add_mesh(x=as.vector(axis_1),
y=as.vector(axis_2),
z=df_m2$pred,
type = "mesh3d",
name = "Preds")
Now, I am trying to customize this plotly object so that different labels appear when you move the mouse over each point, and points corresponding to a given class all have the same color:
p <- plot_ly(type = 'scatter3d', mode = 'markers', colors = "Accent", color = df_m2$color) %>%
add_trace(
x = df_m2$V1,
y = df_m2$V2,
z = df_m2$V3,
marker = list(
size = 3),
name = df_m2$labels,
text = paste("Species: ", df_m2$labels ; "Width: ", df_m2$petal.width ; "color: ", df_m2$color" ),
showlegend = T
) %>%
add_mesh(x=as.vector(axis_1),
y=as.vector(axis_2),
z=df_m2$pred,
type = "mesh3d",
name = "Preds")
%>%
layout(
title = "none",
titlefont = list(
size = 10
),
paper_bgcolor = "#fffff8",
font = t,
xaxis = list(
zeroline = F
),
yaxis = list(
hoverformat = '.2f',
zeroline = F
)
)
p
However, there is an error here. Can someone please show me what I am doing wrong?
Thanks
Not sure what exactly you want to do with the predicted classes, but maybe something like this? (Color corresponds to real species, mouseover also shows prediction).
library(reprex)
reprex({
suppressPackageStartupMessages(invisible(
lapply(c("Rtsne", "dplyr", "ggplot2", "plotly", "caret", "randomForest"),
require, character.only = TRUE)))
#data
a <- unique(iris)
#create two species just to make things easier
set.seed(123)
a$species <- factor(sample(c("a", "b"), 149, replace=TRUE, prob=c(0.3, 0.7)))
#split data into train/test, and then random forest
index = createDataPartition(a$species, p=0.7, list = FALSE)
train = a[index,]
test = a[-index,]
rf <- randomForest(species ~ ., data=train, mtry=2)
#have the model predict the test set
pred <- predict(rf, test, type = "prob")
labels <- predict(rf, test)
confusionMatrix(labels, test$species)
#tsne algorithm
tsne_obj_3 <- Rtsne(test[,-5], perplexity=1, dims=3)
df_m2 <- as.data.frame(tsne_obj_3$Y)
df_m2$labels = toupper(test$species)
df_m2$pred <- labels # you did not define but call pred in plot_ly call
df_m2$color = ifelse(df_m2$labels == "A", "red", "blue")
df_m2$petal_length = test$Petal.Length
axis_1 <- df_m2$V1
axis_2 <- df_m2$V2
axis_3 <- df_m2$V3
plot_ly(type = 'scatter3d', mode = 'markers', colors = c("blue", "red"),
color = df_m2$color) %>%
add_trace(
x = df_m2$V1,
y = df_m2$V2,
z = df_m2$V3,
marker = list(size = 3),
name = df_m2$pred,
text = paste0("Species: ", df_m2$labels, "; Length: ",df_m2$petal_length, "; color: ", df_m2$color),
showlegend = TRUE) %>%
add_mesh(x=as.vector(axis_1),
y=as.vector(axis_2),
z=axis_3, # not sure what z you want here
type = "mesh3d",
name = "Preds") %>%
layout(
title = "none",
titlefont = list(size = 10),
paper_bgcolor = "#fffff8",
font = "Open Sans",
xaxis = list(zeroline = FALSE),
yaxis = list(hoverformat = '.2f', zeroline = FALSE)
)
I am trying to generate multiple graphs in Plotly for 30 different sales offices. Each graph would have 3 lines: sales, COGS, and inventory. I would like to keep this on one graph with 30 buttons for the different offices. This is the closest solution I could find on SO:
## Create random data. cols holds the parameter that should be switched
l <- lapply(1:100, function(i) rnorm(100))
df <- as.data.frame(l)
cols <- paste0(letters, 1:100)
colnames(df) <- cols
df[["c"]] <- 1:100
## Add trace directly here, since plotly adds a blank trace otherwise
p <- plot_ly(df,
type = "scatter",
mode = "lines",
x = ~c,
y= ~df[[cols[[1]]]],
name = cols[[1]])
## Add arbitrary number of traces
## Ignore first col as it has already been added
for (col in cols[-1]) {
p <- p %>% add_lines(x = ~c, y = df[[col]], name = col, visible = FALSE)
}
p <- p %>%
layout(
title = "Dropdown line plot",
xaxis = list(title = "x"),
yaxis = list(title = "y"),
updatemenus = list(
list(
y = 0.7,
## Add all buttons at once
buttons = lapply(cols, function(col) {
list(method="restyle",
args = list("visible", cols == col),
label = col)
})
)
)
)
print(p)
It works but only on graphs with single lines/traces. How can I modify this code to do the same thing but with graphs with 2 or more traces? or is there a better solution? Any help would be appreciated!
### EXAMPLE 2
#create fake time series data
library(plotly)
set.seed(1)
df <- data.frame(replicate(31,sample(200:500,24,rep=TRUE)))
cols <- paste0(letters, 1:31)
colnames(df) <- cols
#create time series
timeseries <- ts(df[[1]], start = c(2018,1), end = c(2019,12), frequency = 12)
fit <- auto.arima(timeseries, d=1, D=1, stepwise =FALSE, approximation = FALSE)
fore <- forecast(fit, h = 12, level = c(80, 95))
## Add trace directly here, since plotly adds a blank trace otherwise
p <- plot_ly() %>%
add_lines(x = time(timeseries), y = timeseries,
color = I("black"), name = "observed") %>%
add_ribbons(x = time(fore$mean), ymin = fore$lower[, 2], ymax = fore$upper[, 2],
color = I("gray95"), name = "95% confidence") %>%
add_ribbons(x = time(fore$mean), ymin = fore$lower[, 1], ymax = fore$upper[, 1],
color = I("gray80"), name = "80% confidence") %>%
add_lines(x = time(fore$mean), y = fore$mean, color = I("blue"), name = "prediction")
## Add arbitrary number of traces
## Ignore first col as it has already been added
for (col in cols[2:31]) {
timeseries <- ts(df[[col]], start = c(2018,1), end = c(2019,12), frequency = 12)
fit <- auto.arima(timeseries, d=1, D=1, stepwise =FALSE, approximation = FALSE)
fore <- forecast(fit, h = 12, level = c(80, 95))
p <- p %>%
add_lines(x = time(timeseries), y = timeseries,
color = I("black"), name = "observed", visible = FALSE) %>%
add_ribbons(x = time(fore$mean), ymin = fore$lower[, 2], ymax = fore$upper[, 2],
color = I("gray95"), name = "95% confidence", visible = FALSE) %>%
add_ribbons(x = time(fore$mean), ymin = fore$lower[, 1], ymax = fore$upper[, 1],
color = I("gray80"), name = "80% confidence", visible = FALSE) %>%
add_lines(x = time(fore$mean), y = fore$mean, color = I("blue"), name = "prediction", visible = FALSE)
}
p <- p %>%
layout(
title = "Dropdown line plot",
xaxis = list(title = "x"),
yaxis = list(title = "y"),
updatemenus = list(
list(
y = 0.7,
## Add all buttons at once
buttons = lapply(cols, function(col) {
list(method="restyle",
args = list("visible", cols == col),
label = col)
})
)
)
)
p
You were very close!
If for example you want graphs with 3 traces,
You only need to tweak two things:
Set visible the three first traces,
Modify buttons to show traces in groups of three.
My code:
## Create random data. cols holds the parameter that should be switched
library(plotly)
l <- lapply(1:99, function(i) rnorm(100))
df <- as.data.frame(l)
cols <- paste0(letters, 1:99)
colnames(df) <- cols
df[["c"]] <- 1:100
## Add trace directly here, since plotly adds a blank trace otherwise
p <- plot_ly(df,
type = "scatter",
mode = "lines",
x = ~c,
y= ~df[[cols[[1]]]],
name = cols[[1]])
p <- p %>% add_lines(x = ~c, y = df[[2]], name = cols[[2]], visible = T)
p <- p %>% add_lines(x = ~c, y = df[[3]], name = cols[[3]], visible = T)
## Add arbitrary number of traces
## Ignore first col as it has already been added
for (col in cols[4:99]) {
print(col)
p <- p %>% add_lines(x = ~c, y = df[[col]], name = col, visible = F)
}
p <- p %>%
layout(
title = "Dropdown line plot",
xaxis = list(title = "x"),
yaxis = list(title = "y"),
updatemenus = list(
list(
y = 0.7,
## Add all buttons at once
buttons = lapply(0:32, function(col) {
list(method="restyle",
args = list("visible", cols == c(cols[col*3+1],cols[col*3+2],cols[col*3+3])),
label = paste0(cols[col*3+1], " ",cols[col*3+2], " ",cols[col*3+3] ))
})
)
)
)
print(p)
PD: I only use 99 cols because I want 33 groups of 3 graphs
Do you see any mistake here?
the ggsave works, but png(file=file) part not - it saves empty white image.
output$savePlotAmount <- downloadHandler(
filename = "amount.png",
content = function(file) {
if(input$plotType == "dot"){
png(file = file)
plotAmount()
dev.off()
}else{
ggsave(plotAmount(), filename = file)
}
})
I spent "hours" on trying to repair it but I don't know what's going on. Sorry that the example is not reproductible, but it is to hard to reproduct all app.
EDIT:
What is plotAmount():
plotAmount <- reactive({
if(input$plotType == "violin") {
plotAmount <- ggplot(values$x, aes_string(x = input$groupedBy, y = input$yVariableContinous)) +
geom_violin() +
ggtitle(paste0(input$yVariableContinous, " grouped by ", input$groupedBy)) +
scale_y_continuous(limits = c(0, quantile(values$x[,input$yVariableContinous] , 0.95, na.rm = T)))
}
if(input$plotType == "boxplot") {
plotAmount <- ggplot(values$x, aes_string(x = input$groupedBy, y = input$yVariableContinous)) +
geom_boxplot(outlier.shape = NA) +
ggtitle(paste0(input$yVariableContinous, " grouped by ", input$groupedBy)) +
scale_y_continuous(limits = c(0, quantile(values$x[,input$yVariableContinous] , 0.95, na.rm = T)))
}
if(input$plotType == "dot") {
var <- ifelse(input$groupedBy == 1, input$xVariable, input$groupedBy)
agregat <- aggregate(x = values$x[,input$yVariableContinous], by = list(g = values$x[,var], xx = values$x[,input$xVariable]), FUN = input$valueAs)
dotchart(agregat$x, labels = agregat$xx,
groups = as.factor(agregat$g),
color = brewer.pal(9,"Set1")[as.numeric(as.factor(agregat$g))],
xlab = "salary",
cex = .75,
main = paste0(input$yVariableContinous, " for ", input$xVariable,
"\ngrouped by ", input$groupedBy),
xlim = c(min(values$x[,input$yVariableContinous], na.rm = T), quantile(values$x[,input$yVariableContinous] , 0.95, na.rm = T)),
pch = 16
)
}
ggplot2 plots need to be print()'d to render.
I have a data.frame I'd like to scatter plot using R's plotly with two factors which I'd like to color and shape by.
Here's my data:
set.seed(1)
df <- data.frame(x=rnorm(12),y=rnorm(12),
group=c(rep(1,3),rep(2,3),rep(3,3),rep(4,3)),
treatment=c(rep("A",6),rep("B",6)),
stringsAsFactors=F)
df$group <- factor(df$group,levels=1:4)
df$treatment <- factor(df$treatment,levels=c("A","B"))
Here's how I'm trying to plot:
require(plotly)
plot_ly(marker=list(size=10),type='scatter',mode="markers",x=~df$x,y=~df$y,color=~df$group,symbol=~df$treatment) %>%
add_annotations(text="group,treatment",xref="paper",yref="paper",x=1.02, xanchor="left",y=1.02,yanchor="top",legendtitle=TRUE,showarrow=FALSE) %>%
layout(xaxis=list(title="x"),yaxis=list(title="y"))
which gives me:
Is it possible to get the text of group and treatment in the legend be separated by comma instead of the new line as it is now?
This means that instead of:
1
A
2
A
3
B
4
B
I'll have:
1,A
2,A
3,B
4,B
Sounds trivial but it's one of the cases where Plotly decides whats good for you.
The legend labels are composed of the categories of color and symbol which are all passed in one command. In order to get control over the output, let's add each trace separately.
for (grou in groups) {
for (treat in treatments) {
trace_data <- subset(df, group == grou & treatment == treat)
if (nrow(trace_data) > 0) {
p <- add_trace(p,
x = trace_data$x,
y = trace_data$y,
marker = list(size = 10,
color = group,
symbol = as.integer(charToRaw(treat)) - 65),
type = 'scatter',
mode = "markers",
name = paste(grou, treat, sep = ",")
)
}
}
}
We pass the color (not strictly necessary) via marker and symbol also via marker (both can be passed in the add_trace command as well but then again Plotly decides for you what do to do with it).
The legend label is passed via name.
Note: You need to convert your treatment explicitly because symbol expects either a named symbol or a number (unless your treatments are named diamond or circle)
Complete code
library(utils)
library(plotly)
set.seed(1)
df <- data.frame(x = rnorm(12),
y = rnorm(12),
group = c(rep(1, 3),
rep(2, 3),
rep(3, 3),
rep(4, 3)
),
treatment=c(rep("A", 6),
rep("B", 6)
),
stringsAsFactors = FALSE
)
groups <- unique(df$group)
treatments <- unique(df$treatment)
p <- plot_ly()
for (grou in groups) {
for (treat in treatments) {
trace_data <- subset(df, group == grou & treatment == treat)
if (nrow(trace_data) > 0) {
p <- add_trace(p,
x = trace_data$x,
y = trace_data$y,
marker = list(size = 10,
color = group,
symbol = as.integer(charToRaw(treat)) - 65),
type = 'scatter',
mode = "markers",
name = paste(grou, treat, sep = ",")
)
}
}
}
p <- add_annotations(p,
text = "group,treatment",
xref = "paper",
yref = "paper",
x = 0.96,
xanchor = "left",
y = 1.03,
yanchor = "top",
legendtitle = TRUE,
showarrow = FALSE) %>%
layout(xaxis = list(title = "x"),
yaxis = list(title = "y"))
p