Get the row name used to annotation - r

First of all, my question is related to these other ones:
Lazy evaluation to annotations expanding function
R nested map through columns
So, I got this example data:
t <- tibble(a = c("a", "b", "c", "d", "e", "f", "g", "h"),
b = c( 1, 1, 1, 1, 2, 2, 2, 2),
c = c( 1, 1, 2, 2, 3, 3, 4, 4),
d = c( NA, NA, NA, "D", "E", NA, NA, NA),
e = c("A", NA, "C", NA, NA, NA, "G", "H")
)
And this functions
f1 <- function(data, group_col, expand_col){ #, return_group_col = TRUE, name_group_col = "group_col"){
data %>%
dplyr::group_by({{group_col}}) %>%
dplyr::mutate(
{{expand_col}} := dplyr::case_when(
!is.na({{expand_col}}) ~ {{expand_col}} ,
any( !is.na({{expand_col}}) ) & is.na({{expand_col}}) ~
paste(unique(unlist(str_split(na.omit({{expand_col}}), " ")) ),
collapse = " "),
TRUE ~ NA_character_
)
) %>%
dplyr::ungroup()
}
f2 <- function(data, group_col, expand_col, fun=f1){
v1 <- rlang::syms( colnames(data)[group_col] )
v2 <- rlang::syms( colnames(data)[expand_col] )
V <- tidyr::crossing( v1, v2 )
purrr::reduce2( V$v1, V$v2, fun, .init=data )
}
The function f1 use two columns, the first one {{group_col}} is a group identifier the second one {{expand_col}} may contain an annotation or NA. After a group_by by the {{group_col}} the {{expand_col}} is filled with the data from the other rows from the same group if it is NA. Example: f1(t, c, d).
The function f2 just propagates the function f1 using two sets of columns, the first set refers to grouping columns and the second set refers to annotation columns.
Then, I want to modify the function f1 to create (if needed) another column which will contain the information about the which {{group_col}} the {{expand_col}} was felled.
That means, imagine you run: t %>% f2(3:2, 4:5) you will get this:
structure(list(a = c("a", "b", "c", "d", "e", "f", "g", "h"),
b = c(1, 1, 1, 1, 2, 2, 2, 2), c = c(1, 1, 2, 2, 3, 3, 4,
4), d = c("D", "D", "D", "D", "E", "E", "E", "E"), e = c("A",
"A", "C", "C", "G H", "G H", "G", "H")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -8L))
Which is the same to run:
t %>%
f1(c, d)# %>%
f1(b, d) %>%
f1(c, e) %>%
f1(b, e)
You may notice that some rows were annotated previously. These rows should be filled with 'self' or something equivalent.
Here the example of the output I want:
structure(list(a = c("a", "b", "c", "d", "e", "f", "g", "h"),
b = c(1, 1, 1, 1, 2, 2, 2, 2),
c = c(1, 1, 2, 2, 3, 3, 4, 4),
d = c("D", "D", "D", "D", "E", "E", "E", "E"),
e = c("A", "A", "C", "C", "G H", "G H", "G", "H"),
d_fill = c("b", "b", "c", "self", "self", "c", "b", "b"),
e_fill = c("self", "c", "self", "c", "b", "b", "self", "self")
),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -8L))
Then I tried this unsuccessful modification:
f1 <- function(data, group_col, expand_col){ #, return_group_col = TRUE, name_group_col = "group_col"){
fill_column <- str_c(deparse(substitute(group_col)), "fill", sep = "_")
data %>%
dplyr::group_by({{group_col}}) %>%
dplyr::mutate(
{{fill_column}} := dplyr::if_else(
!is.na({{expand_col}}) & is.na({{fill_column}}) ~ "self",
is.na({{expand_col}}) & is.na({{fill_column}}) ~ deparse(substitute(group_col)),
TRUE ~ NA_character_
),
{{expand_col}} := dplyr::case_when(
!is.na({{expand_col}}) ~ {{expand_col}} ,
any( !is.na({{expand_col}}) ) & is.na({{expand_col}}) ~
paste(unique(unlist(str_split(na.omit({{expand_col}}), " ")) ),
collapse = " "),
TRUE ~ NA_character_
)
) %>%
dplyr::ungroup()
}
But when I run t %>% f1(c, d) to test it, I got this:
Error: `condition` must be a logical vector, not a `formula` object
Run `rlang::last_error()` to see where the error occurred.
25.
stop(fallback)
24.
signal_abort(cnd)
23.
.abort(text)
22.
glubort(fmt_args(args), ..., .envir = .envir)
21.
bad_args("condition", "must be a logical vector, not {friendly_type_of(condition)}")
20.
dplyr::if_else(!is.na(~d) & is.na(~"c_fill") ~ "self", is.na(~d) &
is.na(~"c_fill") ~ deparse(substitute(group_col)), TRUE ~
NA_character_)
19.
mutate_impl(.data, dots, caller_env())
18.
mutate.tbl_df(., `:=`({
{
fill_column
} ...
17.
dplyr::mutate(., `:=`({
{
fill_column
} ...
16.
function_list[[i]](value)
15.
freduce(value, `_function_list`)
14.
`_fseq`(`_lhs`)
13.
eval(quote(`_fseq`(`_lhs`)), env, env)
12.
eval(quote(`_fseq`(`_lhs`)), env, env)
11.
withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
10.
data %>% dplyr::group_by({
{
group_col
} ...
9.
f1(., c, d)
8.
function_list[[k]](value)
7.
withVisible(function_list[[k]](value))
6.
freduce(value, `_function_list`)
5.
`_fseq`(`_lhs`)
4.
eval(quote(`_fseq`(`_lhs`)), env, env)
3.
eval(quote(`_fseq`(`_lhs`)), env, env)
2.
withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
1.
t %>% f1(c, d)
I didn't figure out what is wrong.
Thanks in advance.

Related

Separate entries in dataframe in new rows in R [duplicate]

This question already has answers here:
Split comma-separated strings in a column into separate rows
(6 answers)
Closed 28 days ago.
I have data.frame df below.
df <- data.frame(id = c(1:12),
A = c("alpha", "alpha", "beta", "beta", "gamma", "gamma", "gamma", "delta",
"epsilon", "epsilon", "zeta", "eta"),
B = c("a", "a; b", "a", "c; d; e", "e", "e", "c; f", "g", "a", "g; h", "f", "d"),
C = c(NA, 4, 2, 7, 4, NA, 9, 1, 1, NA, 3, NA),
D = c("ii", "ii", "i", "iii", "iv", "v", "viii", "v", "viii", "i", "iii", "i"))
Column 'B' contains four entries with semicolons. How can I copy each of these rows and enter in column 'B' each of the separate values?
The expected result df2 is:
df2 <- data.frame(id = c(1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12),
A = c(rep("alpha", 3), rep("beta", 4), rep("gamma", 4), "delta", rep("epsilon", 3),
"zeta", "eta"),
B = c("a", "a", "b", "a", "c", "d", "e", "e", "e", "c", "f", "g", "a", "g", "h", "f", "d"),
C = c(NA, 4, 4, 2, 7, 7, 7, 4, NA, 9, 9, 1, 1, NA, NA, 3, NA),
D = c("ii", "ii", "ii", "i", "iii", "iii", "iii", "iv", "v", "viii", "viii", "v", "viii", "i", "i", "iii", "i"))
I tried this, but no luck:
df2 <- df
# split the values in column B
df2$B <- unlist(strsplit(as.character(df2$B), "; "))
# repeat the rows for each value in column B
df2 <- df2[rep(seq_len(nrow(df2)), sapply(strsplit(as.character(df1$B), "; "), length)),]
# match the number of rows in column B with the number of rows in df2
df2$id <- rep(df2$id, sapply(strsplit(as.character(df1$B), "; "), length))
# sort the dataframe by id
df2 <- df2[order(df2$id),]
We may use separate_rows here - specify the sep as ; followed by zero or more spaces (\\s*) to expand the rows
library(tidyr)
df_new <- separate_rows(df, B, sep = ";\\s*")
-checking with OP's expected
> all.equal(df_new, df2, check.attributes = FALSE)
[1] TRUE
In the base R, we may replicate the sequence of rows by the lengths of the list output
lst1 <- strsplit(df$B, ";\\s+")
df_new2 <- transform(df[rep(seq_len(nrow(df)), lengths(lst1)),], B = unlist(lst1))
row.names(df_new2) <- NULL

Tidyverse: group_by, arrange, and lag across columns

I am working on a projection model for sports where I need to understand in a certain team's most recent game:
Who is their next opponent? (solved)
When is the last time their next opponent played?
reprex that can be used below. Using row 1 as an example, I would need to understand that "a"'s next opponent "e"'s most recent game was game_id_ 3.
game_id_ <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)
game_date_ <- c(rep("2021-01-29", 6), rep("2021-01-30", 6))
team_ <- c("a", "b", "c", "d", "e", "f", "b", "c", "d", "f", "e", "a")
opp_ <- c("b", "a", "d", "c", "f", "e", "c", "b", "f", "d", "a", "e")
df <- data.frame(game_id_, game_date_, team_, opp_)
#Next opponent
df <- df %>%
arrange(game_date_, game_id_, team_) %>%
group_by(team_) %>%
mutate(next_opp = lead(opp_, n = 1L))
If I can provide more details, please let me know.
We can use match to return the corresponding game_id_
library(dplyr)
df %>%
arrange(game_date_, game_id_, team_) %>%
group_by(team_) %>%
mutate(next_opp = lead(opp_, n = 1L)) %>%
ungroup %>%
mutate(last_time = game_id_[match(next_opp, opp_)])

Creating a vector of random strings with conditions in R

The problem is :
Create a vector of 50 random names, each made of 5 letters,out of which only the first is capital. The first, third and fifthletters are consonants and the second and fourth are vowels.
How can i do this?
Use letters and LETTERS which are built in constants in R, then define vowels as c(1, 5, 9, 15, 21), which is the subset to select or remove. Use sample to get 50 of each with replacement, and paste them together:
set.seed(69)
vowels <- c(1, 5, 9, 15, 21)
paste0( sample(LETTERS[-vowels], 50, TRUE),
sample(letters[vowels], 50, TRUE),
sample(letters[-vowels], 50, TRUE),
sample(letters[vowels], 50, TRUE),
sample(letters[-vowels], 50, TRUE))
#> [1] "Valif" "Cirer" "Tuniw" "Kimil" "Qehoc" "Jemif" "Senoy" "Jazic" "Hihuy"
#> [10] "Cezor" "Fuzic" "Menas" "Covay" "Rupov" "Xanij" "Pujur" "Qimin" "Dunop"
#> [19] "Xokez" "Zacox" "Muhac" "Yitab" "Gojob" "Dedah" "Nepan" "Dinel" "Ceyaw"
#> [28] "Foxiv" "Fiven" "Zotob" "Bezug" "Pusod" "Jawad" "Suluq" "Zubic" "Minax"
#> [37] "Gowex" "Debec" "Xaqut" "Duvov" "Lalal" "Zavuv" "Xobuk" "Zugil" "Gibac"
#> [46] "Yocan" "Voyuh" "Nigeh" "Yuqew" "Humup"
Created on 2020-04-05 by the reprex package (v0.3.0)
Just using a for loop to sample the built in vectors LETTERS and letters with conditional subsetting to subset to vowels and consonants
namesList <- list()
for(i in 1:50){
namesList[[i]] <- paste(c(sample(LETTERS[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 1)), collapse = "")
}
namesVec <- unlist(namesList)
Or in a vectorised fashion (which is better)
paste(
sample(LETTERS[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sep = "")

Plotting based on occurrence in group

I would to make a bar chart that plots the bar as a proportion of the total group rather than the usual percentage. For a var to "count" it only needs to occur once in a group. For example in this df where id is the grouping variable
df <-
tibble(id = c(rep(1, 3), rep(2, 3), rep(3, 3)),
vars = c("a", NA, "b", "c", "d", "e", "a", "a", "a"))
The a bars would be:
a = 2/3 # since a occurs in 2 out of 3 groups
b = 1/3
c = 1/3
d = 1/3
e = 1/3
If I understand you correctly, a one-liner would suffice:
ggplot(distinct(df)) + geom_bar(aes(vars, stat(count) / n_distinct(df$id)))
Working answer:
tibble(id = c(rep(1, 3), rep(2, 3), rep(3, 3)),
vars = c("a", "a", "b", "c", "d", "e", "a", "a", "a")) %>%
group_by(id) %>%
distinct(vars) %>%
ungroup() %>%
add_count(vars) %>%
mutate(prop = n / n_distinct(id)) %>%
distinct(vars, .keep_all = T) %>%
ggplot(aes(vars, prop)) +
geom_col()

Most elegant way to convert lists into igraph object for plotting

I am new to igraph and it seems to be a very powerful (and therefore also complex) package.
I tried to convert the following lists into an igraph object.
graph <- list(s = c("a", "b"),
a = c("s", "b", "c", "d"),
b = c("s", "a", "c", "d"),
c = c("a", "b", "d", "e", "f"),
d = c("a", "b", "c", "e", "f"),
e = c("c", "d", "f", "z"),
f = c("c", "d", "e", "z"),
z = c("e", "f"))
weights <- list(s = c(3, 5),
a = c(3, 1, 10, 11),
b = c(5, 3, 2, 3),
c = c(10, 2, 3, 7, 12),
d = c(15, 7, 2, 11, 2),
e = c(7, 11, 3, 2),
f = c(12, 2, 3, 2),
z = c(2, 2))
Interpretation is as follows: s is the starting node, it links to nodes a and b. The edges are weighted 3 for s to a and 5 for s to b and so on.
I tried all kinds of functions from igraph but only got all kinds of errors. What is the most elegant and easy way to convert the above into an igraph object for plotting the graph?
Create an edgelist and then a graph from that. Assign the weights and plot it.
set.seed(123)
e <- as.matrix(stack(graph))
g <- graph_from_edgelist(e)
E(g)$weight <- stack(weights)[[1]]
plot(g, edge.label = E(g)$weight)

Resources