R convert tidy hierarchical data frame to hierarchical list - r

Converting this
g1 g2 desc val
A a 1 v1
A a 2 v2
A b 3 v3
To:
desc val
A
a
1 v1
2 v2
b
3 v3
I've converted a hierarchical data frame with two grouping levels into a structured list using a for loop. This displayed descriptions with an associated variable in a list interspersed with the group levels in order.
The purpose is to present the hierarchical data as a list so that it can be printed with formatting to distinguish the different grouping levels, using openxlsx.
Is there a more efficient base R, tidyverse or other approach to achieve this?
For loop code
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}

Using a few packages from the tidyverse, we could do:
library(tidyverse)
# or explicitly load what you need
library(purrr)
library(dplyr)
library(tidyr)
library(stringr)
transpose(df) %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^v"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values)) %>%
drop_na(values) %>%
select(values, ind) %>%
replace_na(list(ind = ""))
Returns:
values ind
1 A
2 a
3 1 v1
5 2 v2
7 b
8 3 v3
Using tib data set, my solution seems to be a little slower than Plamen's:
Unit: milliseconds
expr min lq mean median uq max neval
old 17.658398 18.492957 21.292965 19.396304 21.770249 133.215223 100
new_simple 6.742158 7.013732 7.638155 7.190095 7.759104 12.640237 100
new_fast 4.064907 4.266243 4.837131 4.507865 4.871533 9.442904 100
tidyverse 4.980664 5.326694 6.004602 5.552611 6.215129 9.923524 100

I believe you can simplify and slightly optimize your code like this :
library(dplyr)
library(tidyr)
library(microbenchmark)
microbenchmark(
old = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
}
,
new_simple = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_group <- function(sub_df){
tibble(
desc = c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
, val = c(NA, NA, sub_df$val)
)
}
res_df <- bind_rows(lapply(tib_list, convert_group))
}
,
new_fast = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_desc <- function(sub_df){
c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
}
convert_val <- function(sub_df){ c(NA, NA, sub_df$val) }
res_df <- tibble(
desc = sapply(tib_list, convert_desc)
, val = sapply(tib_list, convert_val)
)
}
)
This gives me the following output:
Unit: milliseconds
expr min lq mean median uq max neval
old 41.06535 43.52606 49.42744 47.29305 52.74399 76.98021 100
new_simple 57.08038 60.65657 68.11021 63.38157 71.62398 112.24893 100
new_fast 24.16624 26.30785 31.07178 28.38764 31.91647 148.06442 100

Related

Fastest way of automatically renaming dataframes, if they fulfill certain conditions

I would like to automatically rename dataframes if they fulfill certain conditions. I have two question about this.
In the code below, the rm part fails, but I do not understand why.
I am wondering if there is a faster/better way to do this (for example by first putting the df's in a list, renaming and unlisting).
Example data:
df_a <- data.frame(
A = c("a", "b", "c"),
B = c("a", "b", "c"),
C = c("a", "b", "c")
)
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c"),
same_as_C = c("a", "b", "c")
)
My attempt is the following (where the condition is that more than 2 columns match):
# names of the data
names_of_dataset_X <- c("A", "B", "C")
names_of_dataset_Y <- c("same_as_A", "same_as_B", "same_as_C")
dfs <- ls()
for (i in seq_along(dfs)) {
if ( sum( names( get( dfs[i] ) ) %in% names_of_dataset_X) > 2) {
dataset_X <- copy(get( dfs[i] ))
rm(get( dfs[i] ))
} else if (TRUE) {
dataset_Y <- copy(get( dfs[i] ))
rm(get( dfs[i] ))
}
}

wilcox test inside rows

I have a data frame:
structure(list(groups = c("A", "A", "A", "A", "B", "B", "B",
"B", "C", "C", "C", "C", "D", "D", "D", "D"), weight = c(50.34869444,
49.20443342, 50.62727386, 50.12316397, 49.84571613, 50.88337532,
48.23188285, 51.13725686, 51.19946209, 49.02212935, 50.00188434,
49.70067628, 50.50444172, 48.88528478, 49.2378029, 49.11125589
), height = c(149.5389985, 150.7241218, 149.6922257, 149.6660622,
150.2770344, 149.6382699, 150.1900336, 151.264749, 151.3418096,
149.9407582, 150.2397936, 149.3163071, 148.079746, 149.1675788,
147.5201934, 150.8203477), age = c(10.18377395, 8.388813147,
9.858806212, 9.859746016, 9.584814407, 9.081315423, 10.67367302,
10.26713746, 10.96606861, 11.58603799, 10.34936347, 9.93621052,
9.584046986, 8.413787028, 10.39826156, 9.977231496), month_birth = c(3.627272074,
1.989467718, 2.175805989, 1.095100584, 2.16437856, 1.215151355,
2.63897628, 0.942159155, 1.155299136, 0.404000756, 1.695590789,
2.739378326, 1.950649717, 1.312775225, 1.904828579, 1.325257624
)), class = "data.frame", row.names = c(NA, -16L))
I want to use wilcox test to compare columns within each group individually
What was I trying to do:
wilcox.fun <- function(dat, col,group.labels) {
c1 <- combn(unique(group.labels),2)
sigs <- list()
for(i in 1:ncol(c1)) {
sigs[[i]] <- wilcox.test(
dat[c1[i,],col],
dat[c1[i,],col]
)
}
names(sigs) <- paste("Group",c1[1,],"by Group",c1[2,])
tests <- data.frame(Test=names(sigs),
W=unlist(lapply(sigs,function(x) x$statistic)),
p=unlist(lapply(sigs,function(x) x$p.value)),row.names=NULL)
return(tests)
}
debug(test.fun)
tests <- lapply(colnames(data[,c(2:6)]),function(x) wilcox.fun(data,group.labels=c(2:6),x))
names(tests) <- colnames(data[,c(2:6)])
I want to use the wilcox test to compare not between groups, but within the same group between the selected columns.
You can try this code to apply wilcox.test for every combination of variables within each group.
wilcox.fun <- function(dat) {
do.call(rbind, combn(names(dat)[-1], 2, function(x) {
test <- wilcox.test(dat[[x[1]]], dat[[x[2]]])
data.frame(Test = sprintf('Group %s by Group %s', x[1], x[2]),
W = test$statistic,
p = test$p.value)
}, simplify = FALSE))
}
result <- purrr::map_df(split(data, data$groups), wilcox.fun, .id = 'Group')

Replace values in vector where not %in% vector

Short question:
I can substitute certain variable values like this:
values <- c("a", "b", "a", "b", "c", "a", "b")
df <- data.frame(values)
What's the easiest way to replace all the values of df$values by "x" (where the value is neither "a" or "b")?
Output should be:
c("a", "b", "a", "b", "x", "a", "b")
Your example is a bit unclear and not reproducible.
However, based on guessing what you actually want, I could suggest trying this option using the data.table package:
df[values %in% c("a", "b"), values := "x"]
or the dplyr package:
df %>% mutate(values = ifelse(values %in% c("a","b"), x, values))
What about:
df[!df[, 1] %in% c("a", "b"), ] <- "x"
values
1 a
2 b
3 a
4 b
5 x
6 a
7 b

Recursive function to check all possible paths (from raw material to product)

I am struggling with a recursive function, who's goal is to determine which raw materials belong to which product. I clouldn't figure out, how to handle multiple possible paths in data frame "db". The wanted function should give: A-B-C-E, A-B-C-F, A-B-D-F for db. My function works for "da". I added it to show what I am after, and it is a bit like bill of materials explosion, but not exactly.
da <- data.frame(parent = c("A", "B", "B", "C", "D"),
child = c("B", "C", "D", "E", "F"),
stringsAsFactors = FALSE)
db <- data.frame(parent = c("A", "B", "B", "C", "D", "C"),
child = c("B", "C", "D", "E", "F", "F"),
stringsAsFactors = FALSE)
my_path <- function(a, df) {
b <- df$parent[df$child == a]
if (length(b) == 0) {
return(a)
} else {
return(c(my_path(b, df), a))
}
}
end_points <- da$child[is.na(match(da$child, da$parent))]
lapply(end_points, function(x) my_path(x, da)) # -> ok
end_points <- db$child[is.na(match(db$child, db$parent))]
lapply(end_points, function(x) my_path(x, db)) # -> not ok
Thx & kind regards
This is a job for igraph:
#the data
db <- data.frame(parent = c("A", "B", "B", "C", "D", "C"),
child = c("B", "C", "D", "E", "F", "F"),
stringsAsFactors = FALSE)
#create a graph
library(igraph)
g <- graph_from_data_frame(db)
#plot the graph
plot(g)
#find all vertices that have no ingoing resp. outgoing edges
starts <- V(g)[degree(g, mode = "in") == 0]
finals <- V(g)[degree(g, mode = "out") == 0]
#find paths, you need to loop if starts is longer than 1
res <- all_simple_paths(g, from = starts[[1]], to = finals)
#[[1]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B C E
#
#[[2]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B C F
#
#[[3]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B D F
#coerce to vectors
lapply(res, as_ids)

How to avoid for loop when iterating through unique values in a column [R]

Let's assume that we have following toy data:
library(tidyverse)
data <- tibble(
subject = c(1, 1, 1, 2, 2, 2, 2, 3, 3, 3),
id1 = c("a", "a", "b", "a", "a", "a", "b", "a", "a", "b"),
id2 = c("b", "c", "c", "b", "c", "d", "c", "b", "c", "c")
)
which represent network relationships for each subject. For example, there are three unique subjects in the data and the network for the first subject could be represented as sequence of relations:
a -- b, a --c, b -- c
The task is to compute centralities for each network. Using for loop this is straightforward:
library(igraph)
# Get unique subjects
subjects_uniq <- unique(data$subject)
# Compute centrality of nodes for each graph
for (i in 1:length(subjects_uniq)) {
current_data <- data %>% filter(subject == i) %>% select(-subject)
current_graph <- current_data %>% graph_from_data_frame(directed = FALSE)
centrality <- eigen_centrality(current_graph)$vector
}
Question: My dataset is huge so I wonder how to avoid explicit for loop. Should I use apply() and its modern cousins (maybe map() in the purrr package)? Any suggestions are greatly welcome.
Here is an option using map
library(tidyverse)
library(igraph)
map(subjects_uniq, ~data %>%
filter(subject == .x) %>%
select(-subject) %>%
graph_from_data_frame(directed = FALSE) %>%
{eigen_centrality(.)$vector})
#[[1]]
#a b c
#1 1 1
#[[2]]
# a b c d
#1.0000000 0.8546377 0.8546377 0.4608111
#[[3]]
#a b c
#1 1 1

Resources