Plotting based on occurrence in group - r

I would to make a bar chart that plots the bar as a proportion of the total group rather than the usual percentage. For a var to "count" it only needs to occur once in a group. For example in this df where id is the grouping variable
df <-
tibble(id = c(rep(1, 3), rep(2, 3), rep(3, 3)),
vars = c("a", NA, "b", "c", "d", "e", "a", "a", "a"))
The a bars would be:
a = 2/3 # since a occurs in 2 out of 3 groups
b = 1/3
c = 1/3
d = 1/3
e = 1/3

If I understand you correctly, a one-liner would suffice:
ggplot(distinct(df)) + geom_bar(aes(vars, stat(count) / n_distinct(df$id)))

Working answer:
tibble(id = c(rep(1, 3), rep(2, 3), rep(3, 3)),
vars = c("a", "a", "b", "c", "d", "e", "a", "a", "a")) %>%
group_by(id) %>%
distinct(vars) %>%
ungroup() %>%
add_count(vars) %>%
mutate(prop = n / n_distinct(id)) %>%
distinct(vars, .keep_all = T) %>%
ggplot(aes(vars, prop)) +
geom_col()

Related

R ggplot legend with Waffle chart

library(tidyverse)
library(waffle)
df_2 <- structure(list(group = c(2, 2, 2, 1, 1, 1),
parts = c("A", "B", "C", "A", "B", "C"),
values = c(1, 39, 60, 14, 15, 71)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
df_2 %>% ggplot(aes(label = parts)) +
geom_pictogram(
n_rows = 10, aes(color = parts, values = values),
family = "fontawesome-webfont",
flip = TRUE
) +
scale_label_pictogram(
name = "Case",
values = c("male"),
breaks = c("A", "B", "C"),
labels = c("A", "B", "C")
) +
scale_color_manual(
name = "Case",
values = c("A" = "red", "B" = "green", "C" = "grey85"),
breaks = c("A", "B", "C"),
labels = c("A", "B", "C")
) +
facet_grid(~group)
With the above code, I got the legend what I expected:
However, when I replaced df_2 with the following df_1 dataframe, I was unable to combine two legends.
df_1 <- structure(list(group = c(2, 2, 2, 1, 1, 1),
parts = c("A", "B", "C", "A", "B", "C"),
values = c(0, 0, 100, 0, 0, 100)),
row.names = c(NA,-6L), class = c("tbl_df", "tbl", "data.frame"))
I kind of know the cause of the problem (0 values) but I would like to keep the legend the same as the graph above. Any suggestions would be appreciated.
To make it clear, the package "waffle" referred to here is not the CRAN package "waffle", but the GitHub-only package:
remotes::install_github("hrbrmstr/waffle")
library(waffle)
You will also need a way of displaying the pictograms, such as:
library(emojifont)
load.fontawesome()
Now, as with any other discrete scale, if you want to add values that are not present in the (post-stat) data, you need to use the limits argument:
df_1 %>% ggplot(aes(label = parts)) +
geom_pictogram(
n_rows = 10, aes(color = parts, values = values),
family = "fontawesome-webfont",
flip = TRUE
) +
scale_label_pictogram(
name = "Case",
values = c("male"),
breaks = c("A", "B", "C"),
labels = c("A", "B", "C"),
limits = c("A", "B", "C")
) +
scale_color_manual(
name = "Case",
values = c("A" = "red", "B" = "green", "C" = "grey85"),
breaks = c("A", "B", "C"),
labels = c("A", "B", "C")
) +
facet_grid(~group)
It is a bit tricky, but what you could do is say let's add 1 to all values so it will plot it like before. But using ggplot_build to remove from each case one row to get it in the right amount like this:
library(tidyverse)
library(waffle)
library(ggplot2)
library(dplyr)
library(emojifont)
library(waffle)
library(extrafont)
p <- df_1 %>% ggplot(aes(label = parts)) +
geom_pictogram(
n_rows = 10, aes(color = parts, values = values+1),
family = "fontawesome-webfont",
flip = TRUE
) +
scale_label_pictogram(
name = "Case",
values = c("male"),
breaks = c("A", "B", "C"),
labels = c("A", "B", "C")
) +
scale_color_manual(
name = "Case",
values = c("A" = "red", "B" = "green", "C" = "grey85"),
breaks = c("A", "B", "C"),
labels = c("A", "B", "C")
) +
facet_grid(~group)
q <- ggplot_build(p)
q$data[[1]] <- q$data[[1]] %>%
group_by(PANEL) %>%
slice(4:n())
q <- ggplot_gtable(q)
plot(q)
Created on 2022-10-20 with reprex v2.0.2

Transform a df into individual observations [duplicate]

This question already has answers here:
Repeat each row of data.frame the number of times specified in a column
(10 answers)
Closed 7 months ago.
I want to transform a df from a "counting" approach (number of cases) to a "individual observations" approach.
Example:
df <- dplyr::tibble(
city = c("a", "a", "b", "b", "c", "c"),
sex = c(1,0,1,0,1,0),
age = c(1,2,1,2,1,2),
cases = c(2, 3, 1, 1, 1, 1))
Expected result
df <- dplyr::tibble(
city = c("a","a","a","a","a", "b", "b", "c", "c"),
sex = c(1,1,0,0,0,1,0,1,0),
age = c(1,1,2,2,2,1,2,1,2))
uncount() from tidyr can do that for you.
df |> tidyr::uncount(cases)

Pasting the RMSE and MAE next to a forecast in ggplot2

I have some locations with products that I need to forecast. It's around 300 location-product combinations. One plot is always the location and all the products (like in the image). Now I want to print the RMSE and MAE for each product next to the graph in the plot. They are calculated and stored in data_accuracy in their own columns, but I have no Idea how to print them into the plot. Is there a feasible way of doing it?
Ps: I know about the out of bounds error. It just happens with this example data but the results work as normal.
library(tidyverse)
library(tsibble)
library(fable)
library(lubridate)
data_ts <- data.frame(sales_year = c("2015-01-01", "2015-01-01", "2015-01-01", "2016-01-01",
"2016-01-01", "2016-01-01", "2017-01-01", "2017-01-01", "2017-01-01", "2018-01-01", "2018-01-01",
"2018-01-01", "2019-01-01", "2019-01-01", "2019-01-01"), product = c("a", "b", "c", "a", "b", "c",
"a", "b", "c", "a", "b", "c", "a", "b", "c"), sales = c(6, 11, 13, 6, 2, 6, 11, 12, 10, 4, 12, 2,
17, 6, 8))
data_ts <- data_ts%>%
mutate(sales_year = year(sales_year))
data_ts <- tsibble(data_ts, index = sales_year, key = product)
data_train <- data_ts %>%
filter(sales_year < "2018-01-01")
data_ses <- data_train %>%
model(ETS(sales ~ error("A") + trend("N") + season("N")))
data_ses_fc <- data_ses %>%
forecast(h = 1)
data_accuracy <- data_ses_fc %>%
accuracy(data_ts)
data_ses_fc %>%
autoplot(data_ts) +
geom_line(aes(y = .fitted), col="red",
data = augment(data_ses))
Something like this?
data_accuracy <- data_ses_fc %>%
accuracy(data_ts) %>%
mutate(res = paste0("RMSE: ",round(RMSE,2),"\n",
"MAE: ",round(MAE,2)))
data_ses_fc %>%
autoplot(data_ts) +
geom_line(aes(y = .fitted), col="red",
data = augment(data_ses)) +
geom_text(data=data_accuracy,aes(x = 2016,y=12,group=product,label=res))

Tidyverse: group_by, arrange, and lag across columns

I am working on a projection model for sports where I need to understand in a certain team's most recent game:
Who is their next opponent? (solved)
When is the last time their next opponent played?
reprex that can be used below. Using row 1 as an example, I would need to understand that "a"'s next opponent "e"'s most recent game was game_id_ 3.
game_id_ <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)
game_date_ <- c(rep("2021-01-29", 6), rep("2021-01-30", 6))
team_ <- c("a", "b", "c", "d", "e", "f", "b", "c", "d", "f", "e", "a")
opp_ <- c("b", "a", "d", "c", "f", "e", "c", "b", "f", "d", "a", "e")
df <- data.frame(game_id_, game_date_, team_, opp_)
#Next opponent
df <- df %>%
arrange(game_date_, game_id_, team_) %>%
group_by(team_) %>%
mutate(next_opp = lead(opp_, n = 1L))
If I can provide more details, please let me know.
We can use match to return the corresponding game_id_
library(dplyr)
df %>%
arrange(game_date_, game_id_, team_) %>%
group_by(team_) %>%
mutate(next_opp = lead(opp_, n = 1L)) %>%
ungroup %>%
mutate(last_time = game_id_[match(next_opp, opp_)])

Get the row name used to annotation

First of all, my question is related to these other ones:
Lazy evaluation to annotations expanding function
R nested map through columns
So, I got this example data:
t <- tibble(a = c("a", "b", "c", "d", "e", "f", "g", "h"),
b = c( 1, 1, 1, 1, 2, 2, 2, 2),
c = c( 1, 1, 2, 2, 3, 3, 4, 4),
d = c( NA, NA, NA, "D", "E", NA, NA, NA),
e = c("A", NA, "C", NA, NA, NA, "G", "H")
)
And this functions
f1 <- function(data, group_col, expand_col){ #, return_group_col = TRUE, name_group_col = "group_col"){
data %>%
dplyr::group_by({{group_col}}) %>%
dplyr::mutate(
{{expand_col}} := dplyr::case_when(
!is.na({{expand_col}}) ~ {{expand_col}} ,
any( !is.na({{expand_col}}) ) & is.na({{expand_col}}) ~
paste(unique(unlist(str_split(na.omit({{expand_col}}), " ")) ),
collapse = " "),
TRUE ~ NA_character_
)
) %>%
dplyr::ungroup()
}
f2 <- function(data, group_col, expand_col, fun=f1){
v1 <- rlang::syms( colnames(data)[group_col] )
v2 <- rlang::syms( colnames(data)[expand_col] )
V <- tidyr::crossing( v1, v2 )
purrr::reduce2( V$v1, V$v2, fun, .init=data )
}
The function f1 use two columns, the first one {{group_col}} is a group identifier the second one {{expand_col}} may contain an annotation or NA. After a group_by by the {{group_col}} the {{expand_col}} is filled with the data from the other rows from the same group if it is NA. Example: f1(t, c, d).
The function f2 just propagates the function f1 using two sets of columns, the first set refers to grouping columns and the second set refers to annotation columns.
Then, I want to modify the function f1 to create (if needed) another column which will contain the information about the which {{group_col}} the {{expand_col}} was felled.
That means, imagine you run: t %>% f2(3:2, 4:5) you will get this:
structure(list(a = c("a", "b", "c", "d", "e", "f", "g", "h"),
b = c(1, 1, 1, 1, 2, 2, 2, 2), c = c(1, 1, 2, 2, 3, 3, 4,
4), d = c("D", "D", "D", "D", "E", "E", "E", "E"), e = c("A",
"A", "C", "C", "G H", "G H", "G", "H")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -8L))
Which is the same to run:
t %>%
f1(c, d)# %>%
f1(b, d) %>%
f1(c, e) %>%
f1(b, e)
You may notice that some rows were annotated previously. These rows should be filled with 'self' or something equivalent.
Here the example of the output I want:
structure(list(a = c("a", "b", "c", "d", "e", "f", "g", "h"),
b = c(1, 1, 1, 1, 2, 2, 2, 2),
c = c(1, 1, 2, 2, 3, 3, 4, 4),
d = c("D", "D", "D", "D", "E", "E", "E", "E"),
e = c("A", "A", "C", "C", "G H", "G H", "G", "H"),
d_fill = c("b", "b", "c", "self", "self", "c", "b", "b"),
e_fill = c("self", "c", "self", "c", "b", "b", "self", "self")
),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -8L))
Then I tried this unsuccessful modification:
f1 <- function(data, group_col, expand_col){ #, return_group_col = TRUE, name_group_col = "group_col"){
fill_column <- str_c(deparse(substitute(group_col)), "fill", sep = "_")
data %>%
dplyr::group_by({{group_col}}) %>%
dplyr::mutate(
{{fill_column}} := dplyr::if_else(
!is.na({{expand_col}}) & is.na({{fill_column}}) ~ "self",
is.na({{expand_col}}) & is.na({{fill_column}}) ~ deparse(substitute(group_col)),
TRUE ~ NA_character_
),
{{expand_col}} := dplyr::case_when(
!is.na({{expand_col}}) ~ {{expand_col}} ,
any( !is.na({{expand_col}}) ) & is.na({{expand_col}}) ~
paste(unique(unlist(str_split(na.omit({{expand_col}}), " ")) ),
collapse = " "),
TRUE ~ NA_character_
)
) %>%
dplyr::ungroup()
}
But when I run t %>% f1(c, d) to test it, I got this:
Error: `condition` must be a logical vector, not a `formula` object
Run `rlang::last_error()` to see where the error occurred.
25.
stop(fallback)
24.
signal_abort(cnd)
23.
.abort(text)
22.
glubort(fmt_args(args), ..., .envir = .envir)
21.
bad_args("condition", "must be a logical vector, not {friendly_type_of(condition)}")
20.
dplyr::if_else(!is.na(~d) & is.na(~"c_fill") ~ "self", is.na(~d) &
is.na(~"c_fill") ~ deparse(substitute(group_col)), TRUE ~
NA_character_)
19.
mutate_impl(.data, dots, caller_env())
18.
mutate.tbl_df(., `:=`({
{
fill_column
} ...
17.
dplyr::mutate(., `:=`({
{
fill_column
} ...
16.
function_list[[i]](value)
15.
freduce(value, `_function_list`)
14.
`_fseq`(`_lhs`)
13.
eval(quote(`_fseq`(`_lhs`)), env, env)
12.
eval(quote(`_fseq`(`_lhs`)), env, env)
11.
withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
10.
data %>% dplyr::group_by({
{
group_col
} ...
9.
f1(., c, d)
8.
function_list[[k]](value)
7.
withVisible(function_list[[k]](value))
6.
freduce(value, `_function_list`)
5.
`_fseq`(`_lhs`)
4.
eval(quote(`_fseq`(`_lhs`)), env, env)
3.
eval(quote(`_fseq`(`_lhs`)), env, env)
2.
withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
1.
t %>% f1(c, d)
I didn't figure out what is wrong.
Thanks in advance.

Resources