How to merge similar date in HTML using R - r

I have a dataframe in R which looks like.
Order Date Sell Sell_pc Order_fm mkt_pc Dealer_pc
2020-01-01 5 14.34 340 11.23 23.43
2020-01-01 1000 14.34 45900 13.23 34.45
2020-01-02 12 12.33 13 15.44 23.66
2020-01-02 13000 11.45 600000 15.21 14.44
2020-01-03 110000 12.33 31 15.34 12.34
2020-01-03 1600 11.45 18000 13.31 24.45
I want to convert the above-mentioned data frame in HTML image in R, Where I want to merge to similar date in on in the same sequence and add a light gray column for every two-column skipping next two-column.
Required output like:
I have tried:
html_image<-df %>% tableHTML(rownames = FALSE,
widths = rep(100, 6),
caption = "Order Book Reported") %>%
add_css_caption(css = list(c("font-weight", "border","font-size"),
c("bold", "1px solid black","16px")))%>%
add_css_row(css = list(c("background-color"), c("lightgray")), rows = 0:2)

The gt-package could be helpful here:
library(gt)
library(tidyverse)
df |>
mutate(Date = as.Date(Date)) |>
group_by(Date) |>
gt() |>
# gt(rowname_col = "Date") |>
tab_stubhead(label = "Date") |>
tab_header(
title = md("Order Book Reported")
) |>
tab_options(
row_group.as_column = F,
row_group.background.color = "gray",
heading.background.color = "orange",
column_labels.background.color = "orange"
) |>
tab_options(row_group.as_column = TRUE) |>
tab_style(
style = list(
cell_fill(color = "grey")
),
locations = cells_body(
rows = Date == "2020-01-02"
)
)

Related

ggplot histogram split on a boolean

using the public dataset (note it takes a few minutes to pull):
library(data.table)
dl <- tempfile()
download.file("http://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)
ratings <- fread(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c("userId", "movieId", "rating", "timestamp"))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
if (as.numeric(version$year) < 2020 | (version$year=="2020" & as.numeric(version$month) < 3)){
# if using R 3.6 or earlier
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(levels(movieId))[movieId],
title = as.character(title),
genres = as.character(genres))
} else {
# if using R 4.0 or later
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(movieId),
title = as.character(title),
genres = as.character(genres))}
movielens <- left_join(ratings, movies, by = "movieId")
as in:
> head(movielens)
userId movieId rating timestamp title genres
1: 1 122 5 838985046 Boomerang (1992) Comedy|Romance
2: 1 185 5 838983525 Net, The (1995) Action|Crime|Thriller
3: 1 231 5 838983392 Dumb & Dumber (1994) Comedy
4: 1 292 5 838983421 Outbreak (1995) Action|Drama|Sci-Fi|Thriller
5: 1 316 5 838983392 Stargate (1994) Action|Adventure|Sci-Fi
6: 1 329 5 838983392 Star Trek: Generations (1994) Action|Adventure|Drama|Sci-Fi
>
i'm trying to split a ggplot histogram with fill to show difference between whole and half ratings per below:
movielens %>%
mutate(whole = rating == round(rating)) %>%
ggplot(mapping=aes(x=rating), fill=whole) +
geom_histogram()
as the half ratings are a lot less common but fill does not work for some reason...
You need to include fill in your aesthetic (aka mapping), not in your ggplot() call:
library(data.table)
library(stringr)
library(dplyr)
library(ggplot2)
dl <- tempfile()
download.file("http://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)
ratings <- fread(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c("userId", "movieId", "rating", "timestamp"))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
if (as.numeric(version$year) < 2020 | (version$year=="2020" & as.numeric(version$month) < 3)){
# if using R 3.6 or earlier
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(levels(movieId))[movieId],
title = as.character(title),
genres = as.character(genres))
} else {
# if using R 4.0 or later
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(movieId),
title = as.character(title),
genres = as.character(genres))}
movielens <- left_join(ratings, movies, by = "movieId")
movielens %>%
mutate(whole = rating == round(rating)) %>%
ggplot(mapping=aes(x=rating, fill=whole)) +
geom_histogram()

Formatting an ftable in R

I have the following 3 way table I created in R.
with(dataset, ftable(xtabs(count ~ dos + sex + edu)))
The output looks like
edu high low medium unknown
dos sex
five-to-ten-years female 247776 44916 127133 23793
male 225403 37858 147821 20383
five-years-or-less female 304851 58018 182152 33649
male 253977 55720 193621 28972
more-than-ten-years female 709303 452605 539403 165675
male 629162 309193 689299 121336
native-born female 1988476 1456792 2094297 502153
male 1411509 1197395 2790522 395953
unknown female 57974 75480 73204 593141
male 40176 57786 93108 605542
I want to rename the variables and format the table so that I can include it in a report. I know that I can use dnn to rename the variables, but are there any other recommendations to rename the variables? And to format the table (similar to using kable)?
You could convert the output to a text matrix using the following function, after which you can style with kable however you choose:
ftab_to_matrix <- function(ft)
{
row_vars <- attr(ft, "row.vars")
for(i in seq_along(row_vars)){
row_vars[[i]] <- c(names(row_vars[i]), row_vars[[i]])}
rowvar_widths <- sapply(row_vars, function(x) max(nchar(x))) + 1
col_vars <- attr(ft, "col.vars")
rowvar_widths <- c(1, cumsum(c(rowvar_widths, max(nchar(names(col_vars))))))
ft_text <- capture.output(print(ft))
row_cols <- sapply(seq_along(rowvar_widths)[-1], function(x)
substr(ft_text, rowvar_widths[x - 1], rowvar_widths[x]))
ft_text <- substr(ft_text, rowvar_widths[length(rowvar_widths)] + 2, 100)
ft_breaks <- c(1, cumsum(lapply(strsplit(ft_text[length(ft_text)], "\\d "),
function(x) nchar(x) + 2)[[1]]))
col_cols <- sapply(seq_along(ft_breaks)[-1], function(x)
substr(ft_text, ft_breaks[x - 1], ft_breaks[x]))
trimws(cbind(row_cols, col_cols))
}
So, for example, using my example data from your last question, you could do something like:
my_tab <- with(`3waydata`, ftable(xtabs(count ~ duration + sex + education)))
as_image(kable_styling(kable(ftab_to_df(my_tab))), file = "kable.png")
Might have been easier had you given the full picture when you asked your first question... You could use gt to make fancy tables for reports. This is an edited version more fully demonstrating some capabilities.
library(dplyr)
library(gt)
way3data <- data %>%
group_by(duration, education, sex) %>%
summarise(count = sum(number)) %>%
ungroup
# Reorder with select and Titlecase with stringr
longer <- tidyr::pivot_wider(way3data,
values_from = count,
names_from = "education") %>%
select(duration, sex, high, medium, low, unknown) %>%
rename_with(stringr::str_to_title)
# Demonstrating some of the features of gt
# obviously could have done some of this
# to the original dataframe
myresults <- longer %>%
group_by(Duration) %>%
gt(rowname_col = "Sex") %>%
row_group_order(
groups = c("native-born",
"more-than-ten-years",
"five-to-ten-years",
"five-years-or-less",
"unknown")
) %>%
tab_spanner(label = "Education",
columns = matches("High|Low|Medium|Unknown")) %>%
tab_stubhead(label = "Duration or something") %>%
tab_style(
style = cell_text(style = "oblique", weight = "bold"),
locations = cells_row_groups()) %>%
tab_style(
style = cell_text(align = "right", style = "italic", weight = "bold"),
locations = cells_column_labels(
columns = vars(High, Low, Medium, Unknown)
)) %>%
tab_style(
style = cell_text(align = "right", weight = "bold"),
locations = cells_stub()) %>%
tab_header(
title = "Fancy table of counts with Duration, Education and Gender") %>%
tab_source_note(md("More information is available at https://stackoverflow.com/questions/62284264."))
# myresults
# Can save in other formats including .rtf
myresults %>%
gtsave(
"tab_1.png", expand = 10
)
You can read about all the formatting choices here
Data compliments of Allan
set.seed(69)
data <- data.frame(education = sample(c("high","low","medium","unknown"), 600, T),
sex = rep(c("Male", "Female"), 300),
duration = sample(c("unknown", "native-born",
"five-years-or-less", "five-to-ten-years",
"more-than-ten-years"), 600, T),
number = rpois(600, 10))

Is there a way to make my code functions work for multiple variables instead of the one i currently have it working for?

I have a dataset which looks as follows (with around 200 individuals):
NAME AGE2012 SEX SurveyDate12 WAZ12 BAZ12 HAZ12 HB12 SurveyDate14 WAZ14 BAZ14 HAZ14 HB14
1 22 Male 2012-11-26 -1.2 -0.54 -0.01 11.9 2014-11-26 -1.5 -0.52 -0.43 12.2
2 26 Female 2012-11-26 -1.5 -0.36 -0.04 11.2 2014-11-26 -1.7 -0.84 -0.32 11.4
I Am currently using them to make a slopegraph, so I have to do certain things with this dataset such as using pivot longer. I am practising to use factors, and so am trying to keep everything I do within a function. I have the code working to make the graph using just HB (see below). But what I would like to do is make this code work for all 4 of the variables I have in my dataset just using functions. Can anyone help me with this?
slopegraph_prep <- function(health_longer, HB){health %>%
select(NAME:SEX, starts_with("HB")) %>%
pivot_longer(cols = starts_with("HB"),
names_to = "Year",
names_prefix = "HB",
values_to = "HB") %>%
mutate(
HB = case_when(
HB < 0 ~ "NA",
TRUE ~ as.character(HB)
)
) %>%
na_if("NA") %>%
mutate(HB = as.numeric(HB)) %>%
mutate(
Year = case_when(
Year=="12" ~ "2012",
Year=="14" ~ "2014",
Year=="19" ~ "2019")
)
}
slopegraph_by_sex <- function(health, HB, SEX){ Subsetdata <- subset(health, SEX == SEX)
newggslopegraph(Subsetdata , Year , HB, NAME,
Title = "Haemoglobin",
SubTitle = SEX,
Caption = NULL,
RemoveMissing = FALSE)
}
df_healthmeas <- slopegraph_prep(health, "HB")
df_healthmeas_female <- slopegraph_by_sex(df_healthmeas, "Haemoglobin", "female")
df_healthmeas_male <- slopegraph_by_sex(df_healthmeas, "Haemoglobin", "male")
What I really want to do is to just be able to run this for example and for it to run, but I feel I will need to make my variables more generic?:
df_healthmeas <- slopegraph_prep(health, "WAZ")
df_healthmeas_female <- slopegraph_by_sex(df_healthmeas, "Weight to Age WAZ", "female")
df_healthmeas_male <- slopegraph_by_sex(df_healthmeas, "Weight to Age WAZ", "male")
Any help with this would be massively appreciated
Hey this is already doable with your code, you olny have to specify the function argument:
df_healthmeas <- slopegraph_prep(health_longer = health, HB =WAZ)
the variables you specified in the fiunction code is only a "dummy" so you could re-write it for better readabiliy as such:
slopegraph_prep <- function(data, var){data %>%
select(NAME:SEX, starts_with("var")) %>%
pivot_longer(cols = starts_with("var"),
names_to = "Year",
names_prefix = "var",
values_to = "var") %>%
mutate(
var = case_when(
var < 0 ~ "NA",
TRUE ~ as.character(var)
)
) %>%
na_if("NA") %>%
mutate(var = as.numeric(var)) %>%
mutate(
Year = case_when(
Year=="12" ~ "2012",
Year=="14" ~ "2014",
Year=="19" ~ "2019")
)
}

Dygraph with multiple series at different time intervals

I have 2 sets of time series with different time intervals which I am attempting to show in a single dygraph plot;
Stage (river level) and Modelled Stage - 5 or 15 minute interval
Rainfall and Forecast Rainfall - 3 hourly interval
I would like the stage set to be a line chart and rainfall to appear as a step plot similar to below.
My issue is that, as far as I can see, you must cbind your timeseries together in order to create a multi-series dygraph. Cbind fills in 'missing' points with NA causing my graph to appear with isolated points of rainfall like so
Is there any way to overplot in dygraph without combining everything into 1 time series object? Alternatively does anybody have any clever methods for filling in NAs during a cbind? I have a rather inelegant bit of code to fill in NAs after the cbind at the moment...
Example code for second plot
stage <- zoo(sample(1:100, 154, replace=TRUE), seq(as.POSIXct("2018-08-23"), as.POSIXct("2018-08-31"), by = 4500))
rain <- zoo(sample(1:100, 154, replace=TRUE), seq(as.POSIXct("2018-08-23"), as.POSIXct("2018-08-31"), by = 54000))
eventData <- cbind(stage, rain)
dygraph(eventData, main = "Sitename") %>%
dyOptions(useDataTimezone = TRUE, colors = colour, drawGrid = F) %>%
dyAxis("y", label = "Stage", valueRange = c(0, maxStage+maxStage*.2), independentTicks = TRUE) %>%
dyAxis("y2", label = "Rainfall ", valueRange = c(0, maxRain+maxRain*.5), independentTicks = TRUE) %>%
dySeries("Stage", axis=('y')) %>%
dySeries("Rainfall", axis=('y2'), stepPlot = T, fillGraph = T) %>%
You can use zoo::na.locf function to fill the missing rows.
In your example:
stage <- zoo(sample(1:100, 154, replace=TRUE), seq(as.POSIXct("2018-08-23"), as.POSIXct("2018-08-31"), by = 4500))
rain <- zoo(sample(1:100, 154, replace=TRUE), seq(as.POSIXct("2018-08-23"), as.POSIXct("2018-08-31"), by = 54000))
eventData <- cbind(stage, rain)
head(eventData)
stage rain
2018-08-23 00:00:00 85 61
2018-08-23 01:15:00 71 NA
2018-08-23 02:30:00 10 NA
2018-08-23 03:45:00 16 NA
2018-08-23 05:00:00 31 NA
2018-08-23 06:15:00 92 NA
# fill NAs with na.locf
eventData <- na.locf(eventData)
head(eventData)
stage rain
2018-08-23 00:00:00 85 61
2018-08-23 01:15:00 71 61
2018-08-23 02:30:00 10 61
2018-08-23 03:45:00 16 61
2018-08-23 05:00:00 31 61
2018-08-23 06:15:00 92 61
This can be plotted the way you want it:
library(dygraphs)
dygraph(eventData, main = "Sitename") %>%
dyOptions(drawGrid = F) %>%
dyAxis("y", label = "Stage", independentTicks = TRUE) %>%
dyAxis("y2", label = "Rainfall ", independentTicks = TRUE) %>%
dySeries("stage", axis=('y')) %>%
dySeries("rain", axis=('y2'), stepPlot = T, fillGraph = T)
See also here for a deeper discussion about filling NAs.

Multiple line chart using plotly r

I have a data frame which I am trying to plot using plotly as multiple line chart.Below is how the dataframe looks like:
Month_considered pct.x pct.y pct
<fct> <dbl> <dbl> <dbl>
1 Apr-17 79.0 18.4 2.61
2 May-17 78.9 18.1 2.99
3 Jun-17 77.9 18.7 3.42
4 Jul-17 77.6 18.5 3.84
5 Aug-17 78.0 18.3 3.70
6 Sep-17 78.0 18.9 3.16
7 Oct-17 77.6 18.9 3.49
8 Nov-17 77.6 18.4 4.01
9 Dec-17 78.5 18.0 3.46
10 Jan-18 79.3 18.4 2.31
11 2/1/18 78.9 19.6 1.48
When I iterate through to plot multiple lines below is the code used.
colNames <- colnames(delta)
p <-
plot_ly(
atc_seg_master,
x = ~ Month_considered,
type = 'scatter',
mode = 'line+markers',
line = list(color = 'rgb(205, 12, 24)', width = 4)
)
for (trace in colNames) {
p <-
p %>% plotly::add_trace(y = as.formula(paste0("~`", trace, "`")), name = trace)
}
p %>%
layout(
title = "Trend Over Time",
xaxis = list(title = ""),
yaxis = list (title = "Monthly Count of Products Sold")
)
p
This is how the output looks like
My question is how to remove trace 0 and month_considered to remove from the chart even though its not in colnames which I loop through to add the lines.
It looks like you were getting tripped up by two things:
When you initially defined p and included the data and x arguments, a trace was created -- trace 0. You can define a plot without providing any data or x values to start by just using p <- plot_ly() along with any desired layout features.
When you loop through the column names, your x axis column, Month_Considered is part of the set. You can exclude this by using setdiff() (part of base R) to create a vector with all of your column names except for Months_Considered
Putting those two things together, one way (of many possible) to accomplish what you're going for is as follows:
library(plotly)
df <- data.frame(Month_Considered = seq.Date(from = as.Date("2017-01-01"), by = "months", length.out = 12),
pct.x = seq(from = 70, to = 80, length.out = 12),
pct.y = seq(from = 30, to = 40, length.out = 12),
pct = seq(from = 10, to = 20, length.out = 12))
## Define a blank plot with the desired layout (don't add any traces yet)
p <- plot_ly()%>%
layout(title = "Trend Over Time",
xaxis = list(title = ""),
yaxis = list (title = "Monthly Count of Products Sold") )
## Make sure our list of columns to add doesnt include the Month Considered
ToAdd <- setdiff(colnames(df),"Month_Considered")
## Add the traces one at a time
for(i in ToAdd){
p <- p %>% add_trace(x = df[["Month_Considered"]], y = df[[i]], name = i,
type = 'scatter',
mode = 'line+markers',
line = list(color = 'rgb(205, 12, 24)', width = 4))
}
p

Resources