Plot graph with PLOTLY - r

This is small example of my data set.This set contain weekly data about 52 weeks.You can see data with code below:
# CODE
#Data
ARTIFICIALDATA<-dput(structure(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52), `2019 Series_1` = c(534.771929824561,
350.385964912281, 644.736842105263, 366.561403508772, 455.649122807018,
533.614035087719, 829.964912280702, 466.035087719298, 304.421052631579,
549.473684210526, 649.719298245614, 537.964912280702, 484.982456140351,
785.929824561404, 576.736842105263, 685.508771929824, 514.842105263158,
464.491228070175, 608.245614035088, 756.701754385965, 431.859649122807,
524.315789473684, 739.40350877193, 604.736842105263, 669.684210526316,
570.491228070175, 641.649122807018, 649.298245614035, 664.210526315789,
530.385964912281, 754.315789473684, 646.80701754386, 764.070175438596,
421.333333333333, 470.842105263158, 774.245614035088, 752.842105263158,
575.368421052632, 538.315789473684, 735.578947368421, 522, 862.561403508772,
496.526315789474, 710.631578947368, 584.456140350877, 843.19298245614,
563.473684210526, 568.456140350877, 625.368421052632, 768.912280701754,
679.824561403509, 642.526315789474), `2020 Series_1` = c(294.350877192983,
239.824561403509, 709.614035087719, 569.824561403509, 489.438596491228,
561.964912280702, 808.456140350877, 545.157894736842, 589.649122807018,
500.877192982456, 584.421052631579, 524.771929824561, 367.438596491228,
275.228070175439, 166.736842105263, 58.2456140350878, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA)), row.names = c(NA, -52L), class = c("tbl_df", "tbl",
"data.frame")))
# CODE WITH PLOTLY
library(tidyverse)
library(plotly)
library(reshape2)
library(ggplot2)
library(dplyr)
ARTIFICIALDATA_rec <- ARTIFICIALDATA %>%
gather(key = Year_indicator, value = time_series_value, -1)
ARTIFICIALDATA_rec$color <- factor(ARTIFICIALDATA_rec$Year_indicator, labels = c("royalblue", "orange"))
Chart <- plot_ly(ARTIFICIALDATA_rec, x = ~week , y = ~time_series_value,
type = 'bar',
marker = list(color = ~color), name = ~Year_indicator) %>%
layout(title = "TEST",yaxis = list(title = 'Millions EUR '), barmode = 'stack')
Chart<-ggplotly(Chart)
Chart
So next steep is plot this data with plotly. So you can see how my plot look like below:
But my intention is to make plot like plot below.I plot in Excel but defently i need this plot with plotly.Most important thing is to compare only data which is same.For example data for 2020 contain data about 16 weeks and compratation must be with the same period of 2019. So can anybody help me about this problem and plot this plot with plotly ?

You need to add a trace for each time series you want to plot and specify barmode in the layout of your `plotly plot. No additional data manipulation seems necessary to get what you want:
CODE
dat <- as.data.table(ARTIFICIALDATA)
colnames(dat) <- c('week', 'series1', 'series2')
plt <- plot_ly(dat) %>%
add_trace(x = ~week, y = ~series1, type = 'bar', name = '2019 Series 1') %>%
add_trace(x = ~week, y = ~series2, type = 'bar', name = '2020 Series 1') %>%
layout(
xaxis = list(title = 'week'),
yaxis = list(title = ''),
barmode = 'group'
)
the data.table part is not necessary - I did that purely to get simpler column names and because I prefer data.table for subsetting etc.
OUTPUT
The above code returns the below plot:
You can subset your data to include only weeks for which both series have data to get the graph in your post.
plt <- plot_ly(dat[!is.na(series2)]) %>%
...
Optionally, you can move the legend to the bottom by specifying the legend in layout - makes it nicer to read in my opinion:
layout(
...
legend = list(orientation = 'h')
)
This gives you:

Related

Create mean value plot without missing values count to total

Using a dataframe with missing values:
structure(list(id = c("id1", "test", "rew", "ewt"), total_frq_1 = c(54, 87, 10, 36), total_frq_2 = c(45, 24, 202, 43), total_frq_3 = c(24, NA, 25, 8), total_frq_4 = c(36, NA, 104, NA)), row.names = c(NA, 4L), class = "data.frame")
How is is possible to create a bar plot with the mean for every column, excluding the id column, but without filling the missing values with 0 but leaving out the row with missing values example for total_frq_3 24+25+8 = 57/3 = 19
You can use colMeans function and pass it the appropriate argument to ignore NA.
library(ggplot2)
xy <- structure(list(id = c("id1", "test", "rew", "ewt"),
total_frq_1 = c(54, 87, 10, 36), total_frq_2 = c(45, 24, 202, 43), total_frq_3 = c(24, NA, 25, 8),
total_frq_4 = c(36, NA, 104, NA)),
row.names = c(NA, 4L),
class = "data.frame")
xy.means <- colMeans(x = xy[, 2:ncol(xy)], na.rm = TRUE)
xy.means <- as.data.frame(xy.means)
xy.means$total <- rownames(xy.means)
ggplot(xy.means, aes(x = total, y = xy.means)) +
theme_bw() +
geom_col()
Or just use base image graphic
barplot(height = colMeans(x = xy[, 2:ncol(xy)], na.rm = TRUE))

Error with using a function to create a new variable with subtraction between variables in R

I have a huge dataset of the Marseille's rental property market (named marseilleannonces) which contains some variables:
structure(list(ID = c("af626000-342e-11e8-a56e-8326540c0e87",
"20629290-c926-11e6-a626-abf6d3bf8a25", "8495af50-b92c-11e5-86ef-abf6d3bf8a25",
"a4299b60-11e3-11ea-9589-c1180fadeaa5", "833f81d0-d3da-11ea-b28a-1b6a75606a9a",
"75358b40-6d76-11e5-bb7a-cfb08fbdec46", "8d6f22f3-abc7-11e4-b16a-1100e6029c1e",
"10ed2580-28cb-11e9-bcd9-d3a30a46a7fe", "dd156b70-1534-11e6-afdf-abf6d3bf8a25",
"15688650-2934-11e8-ab89-41d65c7c6457"), TYPE = c("APARTMENT",
"APARTMENT", "APARTMENT", "APARTMENT", "PREMISES", "APARTMENT",
"APARTMENT", "APARTMENT", "APARTMENT", "PREMISES"), SURFACE = c(19,
29, 17, 55, 35, 50, 67, 30, 28, 45), ROOM_COUNT = c(1, 2, 1,
3, 1, 2, 2, 1, 1, NA), PRICE = c(295, 470, 290, 610, 550, 500,
500, 655, 445, 1943), RENTAL_EXPENSES = c(45, NA, NA, NA, NA,
NA, 40, NA, NA, NA), RENTAL_EXPENSES_INCLUDED = c(TRUE, TRUE,
NA, TRUE, TRUE, TRUE, TRUE, TRUE, NA, NA)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
In this dataset, if RENTAL_EXPENSES_INCLUDED=TRUE, the variable PRICE contains the values in RENTAL_EXPENSES, and if RENTAL_EXPENSES_INCLUDED=FALSE, the variable PRICE does not contain the values in RENTAL_EXPENSES. My goal is to create a new column whith prices that does not contain the values in RENTAL_EXPENSES, named HC. I tried to create a function:
for(i in 1:length(marseilleannonces$RENTAL_EXPENSES_INCLUDED)){
x = marseilleannonces$RENTAL_EXPENSES_INCLUDED[i]
if(x == TRUE){
marseilleannonces$HC[i] = PRICE[i]-RENTAL_EXPENSES[i]
}
else {
marseilleannonces$HC[i] = PRICE[i]
}
}
R tells me that there is a missing value where TRUE/FALSE is required. Maybe the fact that there is a lot of NAs in my dataset is a problem.
Any advice is the right direction is welcomed.
Thanks in advance !
Edit: Based on your comments:
marseillannonces %>%
mutate(HC = case_when(RENTAL_EXPENSES_INCLUDED == TRUE ~ PRICE - RENTAL_EXPENSES,
RENTAL_EXPENSES_INCLUDED == FALSE ~ PRICE))

How to calculate means when you have missing values?

I would like to calculate the mean of the data frame that has some missing values. The sum of the data frame is 500 and the number of cells is 28. therefore the mean should be 17.8571. However, when calculating in R I need to mark the missing cells with 0 that changes the mean value
Sample data:
df<-structure(list(`10` = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10), `20` = c(20, 20, 20, 20, 20, 20, 20, 20, NA,
NA, NA, NA, NA, NA), `30` = c(30, 30, 30, 30, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), `40` = c(40, 40, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -14L), class = c("tbl_df",
"tbl", "data.frame"))
Sample code:
Where is my mistake?
df1<-rowMeans(df, na.rm=TRUE) # I also tried colMeans
df2<-mean(df1)
sum(df,na.rm = TRUE)/sum(!is.na(df))
You can convert your data.frame to a vector using unlist and calculate then the mean with the argument na.rm=TRUE to skip NA.
mean(unlist(df), na.rm=TRUE)
#[1] 17.85714
Another option is to convert the data.frame to a matrix.
mean(as.matrix(df), na.rm=TRUE)
#[1] 17.85714
To match mean with excel you can repeat the time value df number of times.
mean(rep(df$time, df$df))
#[1] 17.85714

Remove NA and only fill cells containing numbers in tableGrob

I have a table (top.table) I would like to display in a ggplot, but am having issues reformatting the table. I need to format it such that all NA elements are blank, and only fill with specified colors if there is a number contained within the element. Basically, fill the colors like in the code below except the NA elements should be filled default (white), and the NA text should be removed. If the removing of the NA is not possible in the way I described, changing the text color/fill would also work for me (i.e. change text color/fill of numbers, but not NA).
top.table <- structure(c(7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 57.5, 45.5,
NA, NA, NA, 128.5, 78.5, 71.5, 49, NA, NA, NA, 1043, NA, NA,
710, 838, 1481, 737, NA, NA, 1096, 5923, 3697, NA, 1726, NA,
NA, 3545, NA, NA, 1733, 2333, NA, 3807, 1795, NA, 2761, NA, 2887,
NA, NA, 2211, 2544), .Dim = c(11L, 5L), .Dimnames = list(NULL,
c("Sample Number", "Static", "D10 FB", "D12 FB", "D14 FB"
)))
colors <- structure(list(newcolor = c("dodgerblue2", "#E31A1C", "#FDBF6F",
"palegreen2", "skyblue2", "green4", "#6A3D9A", "#FF7F00", "gold1",
"#CAB2D6", "#FB9A99")), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data.frame"))
tt1 <- ttheme_minimal(
core = list(bg_params = list(fill = colors, col = NA))
)
g <- tableGrob(top.table, theme = tt1)
grid.draw(g)
This may seem like a very obvious solution, but why not just replace the NA with empty strings when you plot the table?
g <- tableGrob(replace(top.table, is.na(top.table), ""), theme = tt1)
grid.newpage()
grid.draw(g)
With help from #AllanCameron, the solution I came up with was to use repeat the colors to the number of columns in top.table and use replace() to convert all NA elements to "white" before calling tableGrob()
#make repeated columns of colors
table.colors <- matrix(rep(colors, each = ncol(top.table)),
ncol = ncol(top.table), byrow = TRUE)
#index matrix to fine NAs
table.ind <- is.na(top.table)
#make replacements
table.colors <- replace(table.colors, table.ind, "white")
tt1 <- ttheme_minimal(
core = list(bg_params = list(fill = table.colors))
)
g <- tableGrob(replace(top.table, is.na(top.table), ""), theme = tt1)
grid.draw(g)

(R) Add significance stars to correlation matrix heat map

I am looking at correlations between many variables in my data stratified by gender. I was able to create a heatmap using code I found on StackOverflow, but I'm not sure how to add stars for significance to the cells. I would also like to cut the matrix in half to avoid redundancy.
Here's the code:
# Variables to correlate
anthro <- c("Visit_age", "HeightCm", "WeightKg", "BMI",
"NeckLengthCm", "NeckCircCm", "HeadCircCm", "NeckVolumeCm")
peak <- c("ExtensorPeak_Newtons", "FlexorPeak_Newtons",
"RightPeak_Newtons", "LeftPeak_Newtons")
avg <- c("ExtensorAVG_Newtons", "FlexorAVG_Newtons",
"RightAVG_Newtons", "LeftAVG_Newtons")
# Function for creation of multiple heatmaps using
# male/female and peak/avg neck strength
heatmap <- function(gender, strength){
# Create three new variables: var1, var2, corr
# where corr is correlation between the var1 and var2
corrs <- filter(data, Gender == gender) %>%
select(anthro, strength) %>%
as.matrix() %>%
cor(use = "pairwise.complete.obs") %>%
as.data.frame() %>%
rownames_to_column(var = "var1") %>%
gather("var2", "corr", -var1)
# Plot heatmap
ggplot(corrs, aes(var1, var2)) +
geom_tile(aes(fill = corr), color = "white") +
scale_fill_gradient(low = "white", high = "steelblue") +
geom_text(aes(label = round(corr, 1))) +
ggtitle(gender) +
labs(x = "", y = "") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x =
element_text(angle = 30, hjust = 1))
}
# Create heatmaps
heatmap("Male", peak)
heatmap("Female", peak)
heatmap("Male", avg)
heatmap("Female", avg)
dput(head(data, 20)):
data <- structure(list(Gender = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("Male",
"Female"), class = "factor"), Visit_age = c(37, 38, 39, 22, 23,
24, 24, 20, 21, 21, 22, 22, 36, 37, 38, 38, 22, 42, 42, 43),
HeightCm = c(170, 170, 170, 182, 182, 182, 182, 177.8, 177.8,
177.8, 177.8, 177.8, 168, 168, 168, 168, 162.56, 164, 164,
164), WeightKg = c(63.18181, 58.63636, 60.45454, 70.90909,
77.72727, 75.45454, 80.45454, 78.86363, 81.36363, 80, 83.18181,
82.72727, 68.18181, 69.0909, 68.18181, 65, 69.0909, 48.18181,
50.45454, 47.72727), BMI = c(21.86222, 20.28939, 20.91852,
21.40716, 23.46554, 22.77941, 24.28889, 24.94671, 25.73752,
25.30617, 26.31266, 26.16888, 24.15739, 24.47948, 24.15739,
23.03004, 26.14529, 17.91412, 18.75912, 17.74511), NeckLengthCm = c(16,
16, 16, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16,
15, 15, 15, 15), NeckCircCm = c(35, 30, 32, 35, 34, 34, 36,
38, 39, 38, 40, 41, 39, 24, 36, 38, 34, 30, 29, 30), HeadCircCm = c(58,
58, 58, 56, 56, 56, 56, 57, 57, 57, 57, 57, 58, 58, 58, 58,
55, 52, 52, 52), NeckVolumeCm = c(1559.718, 1145.915, 1303.797,
1364.753, 1287.881, 1287.881, 1443.853, 1838.557, 1936.597,
1838.557, 2037.183, 2140.315, 1936.597, 733.3859, 1650.118,
1838.557, 1379.873, 1074.295, 1003.869, 1074.295), ExtensorPeak_Newtons = c(NA,
183.34, 145.96, NA, NA, 187.79, 153.525, NA, NA, 252.76,
227.395, 192.685, NA, NA, 168.21, 230.51, NA, NA, NA, 101.015
), FlexorPeak_Newtons = c(NA, 70.755, 68.975, NA, NA, 99.68,
112.585, NA, NA, 151.3, 136.615, 145.96, NA, NA, 97.9, 105.02,
NA, NA, NA, 53.4), RightPeak_Newtons = c(NA, 93.005, 125.935,
NA, NA, 85.885, 92.56, NA, NA, 102.35, 108.135, 108.135,
NA, NA, 74.315, 97.01, NA, NA, NA, 49.395), LeftPeak_Newtons = c(NA,
125.49, 131.275, NA, NA, 89.89, 99.68, NA, NA, 113.92, 121.93,
143.29, NA, NA, 59.185, 92.56, NA, NA, NA, 50.73), ExtensorAVG_Newtons = c(NA,
179.186637, 142.5483185, NA, NA, 178.445, 136.911637, NA,
NA, 242.97, 204.106637, 167.765, NA, NA, 161.09, 214.49,
NA, NA, NA, 95.081637), FlexorAVG_Newtons = c(NA, 68.2333185,
66.75, NA, NA, 87.516637, 100.125, NA, NA, 135.131637, 128.7533185,
138.84, NA, NA, 88.406637, 95.971637, NA, NA, NA, 51.62),
RightAVG_Newtons = c(NA, 85.1433185, 120.2983185, NA, NA,
75.65, 86.4783185, NA, NA, 96.7133185, 100.866637, 106.9483185,
NA, NA, 67.046637, 88.851637, NA, NA, NA, 47.7633185), LeftAVG_Newtons = c(NA,
121.93, 120.2983185, NA, NA, 74.315, 92.56, NA, NA, 110.656637,
111.546637, 130.83, NA, NA, 54.29, 88.11, NA, NA, NA, 48.801637
)), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"
))
I found an alternative way to resolve your problem on http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram
Try to make a correlogram
library(corrplot)
# Correlation for Male
data_male <- data[data$Gender == "Male",]
M <- cor(data_male[,-1], use = "pairwise.complete.obs")
M <- round(M, 1)
#Significant correlation
p.mat <- cor(data_male[,-1])
# Plot the correlogram
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M,
method="color",
col=col(200),
type="upper",
order="hclust",
addCoef.col = "black",
tl.col="black",
number.cex = 0.7,
tl.cex = 0.6,
tl.srt=45,
p.mat =p.mat,
sig.level = 0.5,
insig = "label_sig")
You can do the same thing for Female
data_female <- data[data$Gender == "Female",]
F <- cor(data_female[,-1], use = "pairwise.complete.obs")
F <- round(F, 1)
corrplot(F,
method="color",
col=col(200),
type="upper",
order="hclust",
addCoef.col = "black",
tl.col="black",
number.cex = 0.7,
tl.cex = 0.6,
tl.srt=45,
p.mat =p.mat,
sig.level = 0.5,
insig = "label_sig")
Instead of your current argument to geom_text(aes(label= ...)) use:
label = paste(round(corr,1), c(" ","*")[(abs(corr) <= .05)+1])
This will add a "*" when the absolute value of corr is below 0.05.
Look at the code of ggcorrplot::ggcorrplot to see how they handle filling only half a square tile plot.

Resources