Modify graph (with plotly) - r

This is a small example of my data set. It contains weekly data for 52 weeks. You can see the data with the code below:
# CODE
#Data
library(tidyverse)
library(plotly)
ARTIFICIALDATA<-dput(structure(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52), `2019 Series_1` = c(534.771929824561,
350.385964912281, 644.736842105263, 366.561403508772, 455.649122807018,
533.614035087719, 829.964912280702, 466.035087719298, 304.421052631579,
549.473684210526, 649.719298245614, 537.964912280702, 484.982456140351,
785.929824561404, 576.736842105263, 685.508771929824, 514.842105263158,
464.491228070175, 608.245614035088, 756.701754385965, 431.859649122807,
524.315789473684, 739.40350877193, 604.736842105263, 669.684210526316,
570.491228070175, 641.649122807018, 649.298245614035, 664.210526315789,
530.385964912281, 754.315789473684, 646.80701754386, 764.070175438596,
421.333333333333, 470.842105263158, 774.245614035088, 752.842105263158,
575.368421052632, 538.315789473684, 735.578947368421, 522, 862.561403508772,
496.526315789474, 710.631578947368, 584.456140350877, 843.19298245614,
563.473684210526, 568.456140350877, 625.368421052632, 768.912280701754,
679.824561403509, 642.526315789474), `2020 Series_1` = c(294.350877192983,
239.824561403509, 709.614035087719, 569.824561403509, 489.438596491228,
561.964912280702, 808.456140350877, 545.157894736842, 589.649122807018,
500.877192982456, 584.421052631579, 524.771929824561, 367.438596491228,
275.228070175439, 166.736842105263, 58.2456140350878, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA)), row.names = c(NA, -52L), class = c("tbl_df", "tbl",
"data.frame")))
colnames(ARTIFICIALDATA) <- c('week', 'series1', 'series2')
The next step is to plot this data with the R plotly package. I want a plot like the example below. Because this is weekly data, series1 has 52 observations while series2 has only 16 (series1 is the mean data for 2019 and series2 is the data for 2020). For that reason, the comparison must use only the 16 weeks in which neither series is NA, as in the example below:
Can anybody help me plot this graph with plotly?

Try this:
colnames(ARTIFICIALDATA) <- c("week", "series1", "series2")

# Total of each series over the weeks that have a value in BOTH series
# (drop_na() removes every row in which any column is missing).
series_totals <- ARTIFICIALDATA %>%
  drop_na() %>%
  # Long format: one row per week/series pair, values in `value`
  pivot_longer(-week, names_to = "series") %>%
  # Labels shown under the bars; edit the right-hand sides to relabel
  mutate(
    label = case_when(
      series == "series1" ~ "2019 Series_1",
      series == "series2" ~ "2020 Series_1"
    )
  ) %>%
  # Sum by series
  group_by(label) %>%
  summarise(sum = sum(value, na.rm = TRUE)) %>%
  ungroup()

# One bar per series. The x-axis title is blanked out; set it to any text
# you like.
plot_ly(series_totals, x = ~label, y = ~sum) %>%
  add_bars() %>%
  layout(xaxis = list(title = ""))

Related

Subset specific rows in a dataframe, but keeping the observations

I have a dataframe which look like this
# Example input: 24 rows; `muni` is missing in rows 4 to 11.
y <- data.frame(
  subdel = c(
    1, 2, 3, 1, 57, 14, 1, 2, 57, 57, 57, 3,
    1, 1, 31, 21, 34, 56, 12, 45, 1, 63, 31, 34
  ),
  muni = c(
    "A01", "A83", "A40", NA, NA, NA, NA, NA, NA, NA, NA, "A45",
    "B26", "B42", "B61", "B70", "B90", "C53", "C89", "A45",
    "B26", "B42", "B61", "B70"
  )
)
I'm expecting the next result:
# Expected output: 20 rows; the only rows with a missing `muni` are the four
# rows where `subdel` is 57.
z <- data.frame(
  subdel = c(
    1, 2, 3, 57, 57, 57, 57, 3, 1, 1,
    31, 21, 34, 56, 12, 45, 1, 63, 31, 34
  ),
  muni = c(
    "A01", "A83", "A40", NA, NA, NA, NA, "A45", "B26", "B42",
    "B61", "B70", "B90", "C53", "C89", "A45", "B26", "B42", "B61", "B70"
  )
)
I want to keep the rows where muni is NA only when subdel == 57, while, as you can see, keeping all the other observations in the data frame.
Any help would be appreciated.
We can use subset with a logical condition i.e. check for NA in 'muni' (is.na(muni)) and (&) where the 'subdel' is 57 (subdel == 57) or all other non-NA elements from 'muni' (!is.na(muni))
# Keep every row that has a `muni`, plus the missing-`muni` rows whose
# `subdel` is 57.
subset(y, !is.na(muni) | (is.na(muni) & subdel == 57))

Use facet wrap with a data table object

I'm using the DT package and I'd like to show two tables on separate panes with one dataset.
Ideally, I'd like something like facet wrap that would let me make tables based on the plan id. I'd like to have one table that has all the values for the personal plan and another table that has all the values for the team plan. I can go the long way around and make two separate datasets, but I'm hoping there's something that I can do that might be more efficient
This is my data
structure(list(first_month = structure(c(17532, 17532, 17563,
17563, 17591, 17591, 17622, 17622, 17652, 17652), class = "Date"),
plan_id = c("personal", "team", "personal", "team", "personal",
"team", "personal", "team", "personal", "team"), new_customers = c(16,
32, 27, 33, 19, 41, 36, 46, 48, 46), `1` = c(16, 32, 27,
33, 19, 41, 36, 46, 48, 46), `2` = c(13, 29, 24, 30, 15,
37, 31, 40, 43, 38), `3` = c(13, 26, 22, 28, 14, 31, 30,
40, 36, 35), `4` = c(10, 20, 22, 22, 12, 29, 27, 39, 32,
33), `5` = c(10, 18, 20, 20, 11, 25, 22, 36, 27, 27), `6` = c(10,
16, 16, 20, 9, 24, 19, 34, 24, 25), `7` = c(10, 12, 13, 18,
7, 24, 16, 32, 21, 23), `8` = c(8, 10, 10, 14, 7, 21, 16,
30, 19, 21), `9` = c(7, 8, 7, 12, 7, 18, 16, 25, NA, NA),
`10` = c(7, 7, 5, 11, 6, 14, NA, NA, NA, NA), `11` = c(5,
6, 5, 10, NA, NA, NA, NA, NA, NA), `12` = c(5, 6, NA, NA,
NA, NA, NA, NA, NA, NA)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
This is my code
# Render the cohort table as a single DT widget: bordered, striped cells,
# no row names, column sorting disabled, and a page length (1000) far above
# the number of rows so everything appears on one page.
datatable(
  monthly_new_customer_cohorts_formatted_as_cohort_analysis_customer_counts,
  class = "cell-border stripe",
  rownames = FALSE,
  options = list(
    # Use the literal FALSE: `F` is an ordinary variable that can be reassigned
    ordering = FALSE,
    dom = "t",
    pageLength = 1000
  )
)
Might not be the slickest answer, but hopefully this helps. You can use group_by, nest, mutate and map2 to create separate data-sets based on plan_id, then create a separate DT widget from each.
library(dplyr)
library(tidyr)
library(purrr)
# Separate data for each plan_id value: one row per plan, with that plan's
# rows stored in the list-column `data`.
widgets <- monthly_new_customer_cohorts_formatted_as_cohort_analysis_customer_counts %>%
  group_by(plan_id) %>%
  nest() %>%
  # nest() keeps the plan_id grouping; drop it so later verbs see all rows
  ungroup()

# For each data subset, create a separate DT widget captioned with its plan_id:
widgets <- widgets %>%
  mutate(
    dt_widget = map2(
      .x = data,
      .y = plan_id,
      .f = function(plan_data, plan_name) {
        datatable(
          plan_data,
          class = "cell-border stripe",
          caption = plan_name,
          rownames = FALSE,
          options = list(
            # Use the literal FALSE: `F` can be reassigned
            ordering = FALSE,
            dom = "t",
            pageLength = 1000
          ),
          height = "100%",
          width = "100%"
        )
      }
    )
  )
Each element of widgets$dt_widget is now a separate datatable widget. Trick now is to have them all in the same viewer:
library(htmltools)
# Wrap every widget in its own <div> and show them together in one viewer page
widget_divs <- lapply(widgets$dt_widget, tags$div)
browsable(tagList(widget_divs))
Unfortunately I don't think there is a function to give you the nice grid ala ggplot2::facet_wrap (e.g. a 3x3 grid for 9 facets) but you can manually do this by adjusting the style values in the div elements. For example, this allows the tables to go on the same row by reducing the width and adjusting the float:
# Put the first two tables on the same row: each pane takes 49% of the width
# and the two are floated to opposite sides.
left_pane <- tags$div(
  widgets$dt_widget[[1]],
  style = "width:49%;display:block;float:left;"
)
right_pane <- tags$div(
  widgets$dt_widget[[2]],
  style = "width:49%;display:block;float:right;"
)
browsable(tagList(list(left_pane, right_pane)))

How to find outliers variables in a very large dataset?

I want to find the outlier variable in a dataset both in terms of length and value.
In other words, I want to find a variable that is unlike other variables by comparing between variables.
To illustrate, below is a dummy set:
# Dummy data: A, B and C each hold five observations followed by five missing
# values; D holds ten observations.
outliers <- data.frame(
  A = c(32, 31, 32, 38, 23, rep(NA, 5)),
  B = c(33, 39, 28, 34, 32, rep(NA, 5)),
  C = c(28, 39, 41, 31, 29, rep(NA, 5)),
  D = c(8, 9, 19, 28, 31, 23, 18, 13, 93, 2)
)
Clearly, the variable D is the outlier both in terms of length of observations and its values (i.e. mean).
I want to find a way to locate outlier variables like D in my actual dataset and put them into a list for further inspection.
The difficulty I have in doing this with my actual dataset is that it is very large (there are many lists that contain data frames with hundreds of columns and thousands of rows).
Thank you!

Modifying a ggplot with weekly data

This is a small example of my data set. It contains weekly data for 52 weeks. You can see the data with the code below:
# CODE
#Data
ARTIFICIALDATA<-dput(structure(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52), `2019 Series_1` = c(534.771929824561,
350.385964912281, 644.736842105263, 366.561403508772, 455.649122807018,
533.614035087719, 829.964912280702, 466.035087719298, 304.421052631579,
549.473684210526, 649.719298245614, 537.964912280702, 484.982456140351,
785.929824561404, 576.736842105263, 685.508771929824, 514.842105263158,
464.491228070175, 608.245614035088, 756.701754385965, 431.859649122807,
524.315789473684, 739.40350877193, 604.736842105263, 669.684210526316,
570.491228070175, 641.649122807018, 649.298245614035, 664.210526315789,
530.385964912281, 754.315789473684, 646.80701754386, 764.070175438596,
421.333333333333, 470.842105263158, 774.245614035088, 752.842105263158,
575.368421052632, 538.315789473684, 735.578947368421, 522, 862.561403508772,
496.526315789474, 710.631578947368, 584.456140350877, 843.19298245614,
563.473684210526, 568.456140350877, 625.368421052632, 768.912280701754,
679.824561403509, 642.526315789474), `2020 Series_1` = c(294.350877192983,
239.824561403509, 709.614035087719, 569.824561403509, 489.438596491228,
561.964912280702, 808.456140350877, 545.157894736842, 589.649122807018,
500.877192982456, 584.421052631579, 524.771929824561, 367.438596491228,
275.228070175439, 166.736842105263, 58.2456140350878, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA)), row.names = c(NA, -52L), class = c("tbl_df", "tbl",
"data.frame")))
The next step is to plot this data with ggplot2. You can see my plot code below:
library(ggplot2)
# Both yearly series are drawn from the same data frame as separate line
# layers. The colours are fixed constants set outside aes(), not mapped
# from the data.
p <- ggplot(data = ARTIFICIALDATA, mapping = aes(x = week)) +
  geom_line(aes(y = `2019 Series_1`), color = "black") +
  geom_line(aes(y = `2020 Series_1`), color = "red", size = 1, linetype = 2) +
  labs(x = "Weeks", y = "US dolars") +
  theme(legend.position = "top")
p
This is what my plot looks like, but two things are missing. First, there is no legend (for 2019 Series_1 and 2020 Series_1); second, the x-axis needs to show values for all 52 weeks. Can anybody help me resolve this problem?
It would be simple to plot this if you have data in long format. Also you might control what to show on x-axis using scale_x_continuous by adding custom breaks.
library(ggplot2)
# Reshape to long format (one row per week and series) so that `Series` can be
# mapped to colour, which makes ggplot2 draw the legend.
# The reshaped data is passed straight to ggplot() instead of being piped in:
# `%>%` is not available when only ggplot2 is attached.
ggplot(
  tidyr::pivot_longer(ARTIFICIALDATA, cols = -week, names_to = "Series"),
  aes(x = week, y = value, color = Series)
) +
  geom_line() +
  xlab("Weeks") +
  ylab("US dolars") +
  # Custom tick positions on the x-axis; edit this vector to show other weeks
  scale_x_continuous(breaks = c(seq(10, 40, 10), 52)) +
  theme(legend.position = "top")
Here is the answer:
library(tidyverse)
# Long format: one row per week and year column, so the year can drive the
# colour legend. pivot_longer() replaces the superseded gather(); `-1` keeps
# the first column (week) as the identifier, exactly as before.
ARTIFICIALDATA_rec <- ARTIFICIALDATA %>%
  pivot_longer(
    cols = -1,
    names_to = "Year_indicator",
    values_to = "time_series_value"
  )

# One line per year, with an x-axis tick every second week from 0 to 52.
your_plot <- ggplot(
  data = ARTIFICIALDATA_rec,
  aes(x = week, y = time_series_value, group = Year_indicator)
) +
  geom_line(aes(color = Year_indicator)) +
  scale_x_continuous(
    name = "Week of the year",
    limits = c(0, 52),
    breaks = seq(0, 52, 2)
  )

# Assigning the plot does not draw it; print it explicitly.
your_plot
and this is the plot:

Modify plot with PLOTLY package

This is a small example of my data set. It contains weekly data for 52 weeks. You can see the data with the code below:
# CODE
#Data
ARTIFICIALDATA<-dput(structure(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52), `2019 Series_1` = c(534.771929824561,
350.385964912281, 644.736842105263, 366.561403508772, 455.649122807018,
533.614035087719, 829.964912280702, 466.035087719298, 304.421052631579,
549.473684210526, 649.719298245614, 537.964912280702, 484.982456140351,
785.929824561404, 576.736842105263, 685.508771929824, 514.842105263158,
464.491228070175, 608.245614035088, 756.701754385965, 431.859649122807,
524.315789473684, 739.40350877193, 604.736842105263, 669.684210526316,
570.491228070175, 641.649122807018, 649.298245614035, 664.210526315789,
530.385964912281, 754.315789473684, 646.80701754386, 764.070175438596,
421.333333333333, 470.842105263158, 774.245614035088, 752.842105263158,
575.368421052632, 538.315789473684, 735.578947368421, 522, 862.561403508772,
496.526315789474, 710.631578947368, 584.456140350877, 843.19298245614,
563.473684210526, 568.456140350877, 625.368421052632, 768.912280701754,
679.824561403509, 642.526315789474), `2020 Series_1` = c(294.350877192983,
239.824561403509, 709.614035087719, 569.824561403509, 489.438596491228,
561.964912280702, 808.456140350877, 545.157894736842, 589.649122807018,
500.877192982456, 584.421052631579, 524.771929824561, 367.438596491228,
275.228070175439, 166.736842105263, 58.2456140350878, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA)), row.names = c(NA, -52L), class = c("tbl_df", "tbl",
"data.frame")))
# CODE WITH PLOTLY
library(tidyverse)
library(plotly)
colnames(ARTIFICIALDATA) <- c("week", "series1", "series2")
# After renaming, series1 is the `2019 Series_1` column and series2 is the
# `2020 Series_1` column, so each trace is named after the year it shows.
fig <- plot_ly(
  ARTIFICIALDATA,
  x = ~week, y = ~series2, name = "2020",
  type = "scatter", mode = "lines"
)
fig <- fig %>% add_trace(y = ~series1, name = "2019")
fig
The next step is to plot this data with plotly. You can see what my plot looks like below:
But my intention is to make a plot like the one below, with a dashed line (size = 1, linetype = 2). Can anybody help me modify it?
You can add line = list(dash = "dash") to get the dashed line. See options for different dash property choices. The other line can be set to solid.
library(tidyverse)
library(plotly)
colnames(ARTIFICIALDATA) <- c("week", "series1", "series2")
# After renaming, series1 is the `2019 Series_1` column and series2 is the
# `2020 Series_1` column, so each trace is named after the year it shows.
# The 2020 series is drawn dashed, the 2019 series solid.
fig <- plot_ly(
  ARTIFICIALDATA,
  x = ~week, y = ~series2, name = "2020",
  type = "scatter", mode = "lines",
  line = list(dash = "dash")
)
fig <- fig %>%
  add_trace(y = ~series1, name = "2019", line = list(dash = "solid"))
fig
Plot

Resources