I have a data frame:
structure(list(groups = c("A", "A", "A", "A", "B", "B", "B",
"B", "C", "C", "C", "C", "D", "D", "D", "D"), weight = c(50.34869444,
49.20443342, 50.62727386, 50.12316397, 49.84571613, 50.88337532,
48.23188285, 51.13725686, 51.19946209, 49.02212935, 50.00188434,
49.70067628, 50.50444172, 48.88528478, 49.2378029, 49.11125589
), height = c(149.5389985, 150.7241218, 149.6922257, 149.6660622,
150.2770344, 149.6382699, 150.1900336, 151.264749, 151.3418096,
149.9407582, 150.2397936, 149.3163071, 148.079746, 149.1675788,
147.5201934, 150.8203477), age = c(10.18377395, 8.388813147,
9.858806212, 9.859746016, 9.584814407, 9.081315423, 10.67367302,
10.26713746, 10.96606861, 11.58603799, 10.34936347, 9.93621052,
9.584046986, 8.413787028, 10.39826156, 9.977231496), month_birth = c(3.627272074,
1.989467718, 2.175805989, 1.095100584, 2.16437856, 1.215151355,
2.63897628, 0.942159155, 1.155299136, 0.404000756, 1.695590789,
2.739378326, 1.950649717, 1.312775225, 1.904828579, 1.325257624
)), class = "data.frame", row.names = c(NA, -16L))
I want to use wilcox test to compare columns within each group individually
What was I trying to do:
wilcox.fun <- function(dat, col,group.labels) {
c1 <- combn(unique(group.labels),2)
sigs <- list()
for(i in 1:ncol(c1)) {
sigs[[i]] <- wilcox.test(
dat[c1[i,],col],
dat[c1[i,],col]
)
}
names(sigs) <- paste("Group",c1[1,],"by Group",c1[2,])
tests <- data.frame(Test=names(sigs),
W=unlist(lapply(sigs,function(x) x$statistic)),
p=unlist(lapply(sigs,function(x) x$p.value)),row.names=NULL)
return(tests)
}
debug(test.fun)
tests <- lapply(colnames(data[,c(2:6)]),function(x) wilcox.fun(data,group.labels=c(2:6),x))
names(tests) <- colnames(data[,c(2:6)])
I want to use the wilcox test to compare not between groups, but within the same group between the selected columns.
You can try this code to apply wilcox.test for every combination of variables within each group.
wilcox.fun <- function(dat) {
do.call(rbind, combn(names(dat)[-1], 2, function(x) {
test <- wilcox.test(dat[[x[1]]], dat[[x[2]]])
data.frame(Test = sprintf('Group %s by Group %s', x[1], x[2]),
W = test$statistic,
p = test$p.value)
}, simplify = FALSE))
}
result <- purrr::map_df(split(data, data$groups), wilcox.fun, .id = 'Group')
Related
I would like to automatically rename dataframes if they fulfill certain conditions. I have two question about this.
In the code below, the rm part fails, but I do not understand why.
I am wondering if there is a faster/better way to do this (for example by first putting the df's in a list, renaming and unlisting).
Example data:
df_a <- data.frame(
A = c("a", "b", "c"),
B = c("a", "b", "c"),
C = c("a", "b", "c")
)
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c"),
same_as_C = c("a", "b", "c")
)
My attempt is the following (where the condition is that more than 2 columns match):
# names of the data
names_of_dataset_X <- c("A", "B", "C")
names_of_dataset_Y <- c("same_as_A", "same_as_B", "same_as_C")
dfs <- ls()
for (i in seq_along(dfs)) {
if ( sum( names( get( dfs[i] ) ) %in% names_of_dataset_X) > 2) {
dataset_X <- copy(get( dfs[i] ))
rm(get( dfs[i] ))
} else if (TRUE) {
dataset_Y <- copy(get( dfs[i] ))
rm(get( dfs[i] ))
}
}
I have data
test = data.table(
a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6)
)
I wish to take the unique values of column a, store it in another data.table, and afterwards fill in the remaining columns with the most prevalent values of those remaining columns, such that my resulting data.table would be:
test2 = data.table(a = c(1,3,4,5,6), b = "a", c = 1)
Column be has equal amounts of "a" and "c", but it doesn't matter which is chosen in those cases.
Attempt so far:
test2 = unique(test, by = "a")
test2[, c("b", "c") := lapply(.SD, FUN = function(x){test2[, .N, by = x][order(-N)][1,1]}), .SDcols = c("b", "c")]
EDIT: I would preferrably like a generic solution that is compatible with a function where I specify the column to be "uniqued", and the rest of the columns are with the single most prevalent value. Hence my use of lapply and .SD =)
EDIT2: as #MichaelChirico points out, how do we keep the class. With the following data.table some of the solutions does not work, although solution of #chinsoon12 does work:
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))
Another option:
dtmode <- function(x) x[which.max(rowid(x))]
test[, .(A=unique(A), B=dtmode(B), C=dtmode(C))]
data:
test = data.table(
A = c(1,1,3,4,5,6),
B = c("a", "be", "a", "c", "d", "c"),
C = rep(1, 6)
)
Not a clean way to do this but it works.
test = data.frame(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
a = unique(test$a)
b = tail(names(sort(table(test$b))), 1)
c = tail(names(sort(table(test$c))), 1)
test2 = cbind(a,b,c)
Output is like this:
> test2
a b c
[1,] "1" "c" "1"
[2,] "3" "c" "1"
[3,] "4" "c" "1"
[4,] "5" "c" "1"
[5,] "6" "c" "1"
>
#EmreKiratli is very close to what I would do:
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) as(tail(names(sort(table(x))), 1L), class(x)))
), .SDcols = !'a']
The as(., class(x)) part is because names in R are always character, so we have to convert back to the original class of x.
You might like this better in magrittr form since it's many nested functions:
library(magrittr)
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) {
table(x) %>% sort %>% names %>% tail(1L) %>% as(class(x))
})
), .SDcols = !'a']
I was able to make an OK solution, but if somebody can do it more elegantly, for example not going through the step of storting a list in refLevel below, please let me know! I'm very interested in learning data.table properly!
#solution:
test = data.table(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
test2 = unique(test, by="a")
funPrev = function(x){unlist(as.data.table(x)[, .N, by=x][order(-N)][1,1], use.names = F)}
refLevel = lapply(test[, c("b", "c")], funPrev)
test2[, c("b", "c") := refLevel]
...and using a function (if anybody see any un-necessary step, please let me know):
genData = function(dt, var_unique, vars_prev){
data = copy(dt)
data = unique(data, by = var_unique)
funPrev = function(x){unlist(as.data.table(x)[, .N, by=x][order(-N)][1,1], use.names = F)}
refLevel = lapply(dt[, .SD, .SDcols = vars_prev], funPrev)
data[, (vars_prev) := refLevel]
return(data)
}
test2 = genData(test, "a", c("b", "c"))
Here's another variant which one might find less sophisticated, yet more readable. It's essentially chinsoon12's rowid approach generalized for any number of columns. Also the classes are kept.
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))
test2 = unique(test, by = "a")
for (col in setdiff(names(test2), "a")) test2[[col]] = test2[[col]][which.max(rowid(test2[[col]]))]
Converting this
g1 g2 desc val
A a 1 v1
A a 2 v2
A b 3 v3
To:
desc val
A
a
1 v1
2 v2
b
3 v3
I've converted a hierarchical data frame with two grouping levels into a structured list using a for loop. This displayed descriptions with an associated variable in a list interspersed with the group levels in order.
The purpose is to present the hierarchical data as a list so that it can be printed with formatting to distinguish the different grouping levels, using openxlsx.
Is there a more efficient base R, tidyverse or other approach to achieve this?
For loop code
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
Using a few packages from the tidyverse, we could do:
library(tidyverse)
# or explicitly load what you need
library(purrr)
library(dplyr)
library(tidyr)
library(stringr)
transpose(df) %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^v"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values)) %>%
drop_na(values) %>%
select(values, ind) %>%
replace_na(list(ind = ""))
Returns:
values ind
1 A
2 a
3 1 v1
5 2 v2
7 b
8 3 v3
Using tib data set, my solution seems to be a little slower than Plamen's:
Unit: milliseconds
expr min lq mean median uq max neval
old 17.658398 18.492957 21.292965 19.396304 21.770249 133.215223 100
new_simple 6.742158 7.013732 7.638155 7.190095 7.759104 12.640237 100
new_fast 4.064907 4.266243 4.837131 4.507865 4.871533 9.442904 100
tidyverse 4.980664 5.326694 6.004602 5.552611 6.215129 9.923524 100
I believe you can simplify and slightly optimize your code like this :
library(dplyr)
library(tidyr)
library(microbenchmark)
microbenchmark(
old = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
}
,
new_simple = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_group <- function(sub_df){
tibble(
desc = c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
, val = c(NA, NA, sub_df$val)
)
}
res_df <- bind_rows(lapply(tib_list, convert_group))
}
,
new_fast = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_desc <- function(sub_df){
c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
}
convert_val <- function(sub_df){ c(NA, NA, sub_df$val) }
res_df <- tibble(
desc = sapply(tib_list, convert_desc)
, val = sapply(tib_list, convert_val)
)
}
)
This gives me the following output:
Unit: milliseconds
expr min lq mean median uq max neval
old 41.06535 43.52606 49.42744 47.29305 52.74399 76.98021 100
new_simple 57.08038 60.65657 68.11021 63.38157 71.62398 112.24893 100
new_fast 24.16624 26.30785 31.07178 28.38764 31.91647 148.06442 100
I am wondering how can I join list of vectors to a data.frame or just to vectors to append new item to each vector with a match.
# list of vectors that should be extended with values from vp
# based on last item match to vc
lst <- list(c("a", "b", "c"),
c("b", "d"),
c("f", "e")
)
vc <- c("c", "c", "d")
vp <- c("k", "l", "m")
# expected output:
expect <- list (c("a", "b", "c", "k"),
c("a", "b", "c", "l"),
c("b", "d", "m"),
c("f", "e"))
It is worth noticing that if last item in lst matches several values in vc, vector is duplicated. Vector stays unchanged if it does not match values in vc
Try this one:
L <- lapply(lst, function(v) vp[vc %in% v[length(v)]])
pv <- function(v1, v2) {
if (length(v2) == 0) {
list(v1)
}
else {
lapply(v2, function(v) c(v1,v))
}
}
L2 <- mapply(pv, lst, L)
unlist(L2, recursive=F)
Here is my solution:
l=lapply(lst, function(v) vp[vc %in% v])
res=sapply(1:length(lst), function(i)
{
x=lst[[i]]
y=l[[i]]
if (length(y)>0)
sapply(1:length(y), function(j) list(c(x, y[j])))
else
list(x)
}
)
unlist(res, recursive = FALSE)
How can i get rows of a data frame that has a same value in a element of that comparing with another data frame ?
I have written this but it didn't work.
# example of two data frame
df1 <- data.frame(V1 = c("a", "g", "h", "l", "n", "e"), V2 = c("b", "n", "i", "m", "i", "f"), stringsAsFactors = F)
df2 <- data.frame(V1 = c("a", "c", "f","h"), V2 = c("b", "d", "e","z"), stringsAsFactors = F)
# finding joint values in each element of two data frames
res1<-intersect(df1$V1,df2$V1)
res2<-intersect(df1$V2,df2$V2)
res3<-intersect(df1$V1,df2$V2)
res4<-intersect(df1$V1,df2$V2)
# Getting rows that has joint value at least in one element of df1
ress1<-df1[apply(df1, MARGIN = 1, function(x) all(x== res1)), ]
ress2<-df1[apply(df1, MARGIN = 1, function(x) all(x== res2)), ]
ress3<-df1[apply(df1, MARGIN = 1, function(x) all(x== res3)), ]
ress4<-df1[apply(df1, MARGIN = 1, function(x) all(x== res4)), ]
# Getting rows that has joint value at least in one element of df2
resss1<-df2[apply(df2, MARGIN = 1, function(x) all(x== res1)), ]
resss2<-df2[apply(df2, MARGIN = 1, function(x) all(x== res2)), ]
resss3<-df2[apply(df2, MARGIN = 1, function(x) all(x== res3)), ]
resss4<-df2[apply(df2, MARGIN = 1, function(x) all(x== res4)), ]
# then combine above results
final.res<-rbind(ress1,ress2,ress3,ress4,resss1,resss2,resss3,resss4)
My favorite result is:
a b
h z
h i
f e
e f
This should work
#Import data
df1 <- data.frame(V1 = c("a", "g", "h", "l", "n", "e"), V2 = c("b", "n", "i", "m", "i", "f"), stringsAsFactors = F)
df2 <- data.frame(V1 = c("a", "c", "f","h"), V2 = c("b", "d", "e","z"), stringsAsFactors = F)
# Get the intersects
vals <- intersect(c(df1$V1, df1$V2), c(df2$V1, df2$V2))
#Get the subsets and rbind them
full <- rbind(
subset(df1, df1$V1 %in% vals),
subset(df1, df1$V2 %in% vals),
subset(df2, df2$V1 %in% vals),
subset(df2, df2$V2 %in% vals)
)
#Remove duplicates
full <- full[!duplicated(full),]