Related
I have a simple example of foreach nested loop below. How can I improve the readability of the result res?
Below is another example of nested for loop with more readable results, each element of list can be easily identified e.g. res_desired[["B"]][[3]]
library(foreach)
library(doFuture)
registerDoFuture()
plan(multicore, workers = 1)
# foreach loop
res <- foreach(a = c("A", "B", "C", "D")) %:%
foreach(b = 1:4) %do%
{
paste0(rep(a, b))
}
# for loop
res_desired <- list()
for(a in c("A", "B", "C", "D"))
{
for(b in 1:4)
{
res_desired[[a]][[b]] <- paste0(rep(a, b))
}
}
The .final may be useful. According to ?foreach
.final - function of one argument that is called to return final result.
With one more nested loop
res_1 <- foreach(a = c("A", "B", "C", "D"),
.final = function(x) setNames(x, c("A", "B", "C", "D"))) %:%
foreach(c = c("a", "b", "c"),
.final = function(x) setNames(x, c("a", "b", "c"))) %:%
foreach(b = 1:4) %do% {paste0(rep(a, b))}
-checking
> res_1[["B"]][["c"]][[2]]
[1] "B" "B"
similar to
res_desired_1 <- list()
for(a in c("A", "B", "C", "D")) {
for(c in c("a", "b", "c")) {
for(b in 1:4) {
res_desired_1[[a]][[c]][[b]] <- paste0(rep(a, b))
}
}
}
> res_desired_1[["B"]][["c"]][[2]]
[1] "B" "B"
I have a data frame:
structure(list(groups = c("A", "A", "A", "A", "B", "B", "B",
"B", "C", "C", "C", "C", "D", "D", "D", "D"), weight = c(50.34869444,
49.20443342, 50.62727386, 50.12316397, 49.84571613, 50.88337532,
48.23188285, 51.13725686, 51.19946209, 49.02212935, 50.00188434,
49.70067628, 50.50444172, 48.88528478, 49.2378029, 49.11125589
), height = c(149.5389985, 150.7241218, 149.6922257, 149.6660622,
150.2770344, 149.6382699, 150.1900336, 151.264749, 151.3418096,
149.9407582, 150.2397936, 149.3163071, 148.079746, 149.1675788,
147.5201934, 150.8203477), age = c(10.18377395, 8.388813147,
9.858806212, 9.859746016, 9.584814407, 9.081315423, 10.67367302,
10.26713746, 10.96606861, 11.58603799, 10.34936347, 9.93621052,
9.584046986, 8.413787028, 10.39826156, 9.977231496), month_birth = c(3.627272074,
1.989467718, 2.175805989, 1.095100584, 2.16437856, 1.215151355,
2.63897628, 0.942159155, 1.155299136, 0.404000756, 1.695590789,
2.739378326, 1.950649717, 1.312775225, 1.904828579, 1.325257624
)), class = "data.frame", row.names = c(NA, -16L))
I want to use wilcox test to compare columns within each group individually
What was I trying to do:
wilcox.fun <- function(dat, col,group.labels) {
c1 <- combn(unique(group.labels),2)
sigs <- list()
for(i in 1:ncol(c1)) {
sigs[[i]] <- wilcox.test(
dat[c1[i,],col],
dat[c1[i,],col]
)
}
names(sigs) <- paste("Group",c1[1,],"by Group",c1[2,])
tests <- data.frame(Test=names(sigs),
W=unlist(lapply(sigs,function(x) x$statistic)),
p=unlist(lapply(sigs,function(x) x$p.value)),row.names=NULL)
return(tests)
}
debug(test.fun)
tests <- lapply(colnames(data[,c(2:6)]),function(x) wilcox.fun(data,group.labels=c(2:6),x))
names(tests) <- colnames(data[,c(2:6)])
I want to use the wilcox test to compare not between groups, but within the same group between the selected columns.
You can try this code to apply wilcox.test for every combination of variables within each group.
wilcox.fun <- function(dat) {
do.call(rbind, combn(names(dat)[-1], 2, function(x) {
test <- wilcox.test(dat[[x[1]]], dat[[x[2]]])
data.frame(Test = sprintf('Group %s by Group %s', x[1], x[2]),
W = test$statistic,
p = test$p.value)
}, simplify = FALSE))
}
result <- purrr::map_df(split(data, data$groups), wilcox.fun, .id = 'Group')
Converting this
g1 g2 desc val
A a 1 v1
A a 2 v2
A b 3 v3
To:
desc val
A
a
1 v1
2 v2
b
3 v3
I've converted a hierarchical data frame with two grouping levels into a structured list using a for loop. This displayed descriptions with an associated variable in a list interspersed with the group levels in order.
The purpose is to present the hierarchical data as a list so that it can be printed with formatting to distinguish the different grouping levels, using openxlsx.
Is there a more efficient base R, tidyverse or other approach to achieve this?
For loop code
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
Using a few packages from the tidyverse, we could do:
library(tidyverse)
# or explicitly load what you need
library(purrr)
library(dplyr)
library(tidyr)
library(stringr)
transpose(df) %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^v"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values)) %>%
drop_na(values) %>%
select(values, ind) %>%
replace_na(list(ind = ""))
Returns:
values ind
1 A
2 a
3 1 v1
5 2 v2
7 b
8 3 v3
Using tib data set, my solution seems to be a little slower than Plamen's:
Unit: milliseconds
expr min lq mean median uq max neval
old 17.658398 18.492957 21.292965 19.396304 21.770249 133.215223 100
new_simple 6.742158 7.013732 7.638155 7.190095 7.759104 12.640237 100
new_fast 4.064907 4.266243 4.837131 4.507865 4.871533 9.442904 100
tidyverse 4.980664 5.326694 6.004602 5.552611 6.215129 9.923524 100
I believe you can simplify and slightly optimize your code like this :
library(dplyr)
library(tidyr)
library(microbenchmark)
microbenchmark(
old = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
}
,
new_simple = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_group <- function(sub_df){
tibble(
desc = c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
, val = c(NA, NA, sub_df$val)
)
}
res_df <- bind_rows(lapply(tib_list, convert_group))
}
,
new_fast = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_desc <- function(sub_df){
c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
}
convert_val <- function(sub_df){ c(NA, NA, sub_df$val) }
res_df <- tibble(
desc = sapply(tib_list, convert_desc)
, val = sapply(tib_list, convert_val)
)
}
)
This gives me the following output:
Unit: milliseconds
expr min lq mean median uq max neval
old 41.06535 43.52606 49.42744 47.29305 52.74399 76.98021 100
new_simple 57.08038 60.65657 68.11021 63.38157 71.62398 112.24893 100
new_fast 24.16624 26.30785 31.07178 28.38764 31.91647 148.06442 100
I am struggling with a recursive function, who's goal is to determine which raw materials belong to which product. I clouldn't figure out, how to handle multiple possible paths in data frame "db". The wanted function should give: A-B-C-E, A-B-C-F, A-B-D-F for db. My function works for "da". I added it to show what I am after, and it is a bit like bill of materials explosion, but not exactly.
da <- data.frame(parent = c("A", "B", "B", "C", "D"),
child = c("B", "C", "D", "E", "F"),
stringsAsFactors = FALSE)
db <- data.frame(parent = c("A", "B", "B", "C", "D", "C"),
child = c("B", "C", "D", "E", "F", "F"),
stringsAsFactors = FALSE)
my_path <- function(a, df) {
b <- df$parent[df$child == a]
if (length(b) == 0) {
return(a)
} else {
return(c(my_path(b, df), a))
}
}
end_points <- da$child[is.na(match(da$child, da$parent))]
lapply(end_points, function(x) my_path(x, da)) # -> ok
end_points <- db$child[is.na(match(db$child, db$parent))]
lapply(end_points, function(x) my_path(x, db)) # -> not ok
Thx & kind regards
This is a job for igraph:
#the data
db <- data.frame(parent = c("A", "B", "B", "C", "D", "C"),
child = c("B", "C", "D", "E", "F", "F"),
stringsAsFactors = FALSE)
#create a graph
library(igraph)
g <- graph_from_data_frame(db)
#plot the graph
plot(g)
#find all vertices that have no ingoing resp. outgoing edges
starts <- V(g)[degree(g, mode = "in") == 0]
finals <- V(g)[degree(g, mode = "out") == 0]
#find paths, you need to loop if starts is longer than 1
res <- all_simple_paths(g, from = starts[[1]], to = finals)
#[[1]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B C E
#
#[[2]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B C F
#
#[[3]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B D F
#coerce to vectors
lapply(res, as_ids)
I am wondering how can I join list of vectors to a data.frame or just to vectors to append new item to each vector with a match.
# list of vectors that should be extended with values from vp
# based on last item match to vc
lst <- list(c("a", "b", "c"),
c("b", "d"),
c("f", "e")
)
vc <- c("c", "c", "d")
vp <- c("k", "l", "m")
# expected output:
expect <- list (c("a", "b", "c", "k"),
c("a", "b", "c", "l"),
c("b", "d", "m"),
c("f", "e"))
It is worth noticing that if last item in lst matches several values in vc, vector is duplicated. Vector stays unchanged if it does not match values in vc
Try this one:
L <- lapply(lst, function(v) vp[vc %in% v[length(v)]])
pv <- function(v1, v2) {
if (length(v2) == 0) {
list(v1)
}
else {
lapply(v2, function(v) c(v1,v))
}
}
L2 <- mapply(pv, lst, L)
unlist(L2, recursive=F)
Here is my solution:
l=lapply(lst, function(v) vp[vc %in% v])
res=sapply(1:length(lst), function(i)
{
x=lst[[i]]
y=l[[i]]
if (length(y)>0)
sapply(1:length(y), function(j) list(c(x, y[j])))
else
list(x)
}
)
unlist(res, recursive = FALSE)