unique words by group - r

this is my example dataframe
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
I would like to get only unique words within group and through groups
So I would like to get this
group word
A car, tree
B sun
I used aggregate and get this
aggregate(word ~ group , data = example, FUN = paste0)
group word
1 A car, car, house, tree
2 B sun ,sun, house
but now i need to select only unique values, but even this does not work out
for (i in 1:nrow(cluster)) {cluster[i, ][["word"]] = lapply(unlist(cluster[i, ][["word"]]), unique)}
with
Error in `[[<-.data.frame`(`*tmp*`, "word", value = list("car", "car, house", :
replacement has 3 rows, data has 1

A base R option using aggregate + subset + ave like below
with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, "[, ]+"))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
)
gives
ind values
1 A car, tree
2 B sun

Here's a dplyr solution:
library(dplyr)
library(tidyr)
example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))
output
group word
1 A car, tree
2 B sun

In base you can use strsplit to get the words, split them by group and use unique the get unique words per group. Use table to get the number of same words and take those which appear only once.
t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))
# group word
#A A car, tree
#B B sun
Or another way starting with the already used aggregate in the question.
t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1
# group word
#1 A car, tree
#2 B sun
And just for fun a Benchmark
library(bench)
library(dplyr)
library(tidyr)
library(tidyverse)
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
bench::mark(check = FALSE,
GKi = {t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))},
GKi2 = {t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1},
ThomasIsCoding = with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, ", "))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
),
Mael = {example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))},
"Nir Graham" = {example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
}
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 GKi 445.13µs 486.26µs 1997. 16.03KB 6.15 974 3
2 GKi2 916.97µs 968.68µs 1023. 7.3KB 6.15 499 3
3 ThomasIsCoding 3.54ms 3.73ms 266. 8.19KB 8.45 126 4
4 Mael 16.07ms 16.48ms 60.1 60.04KB 6.68 27 3
5 Nir Graham 37.29ms 39.49ms 24.0 90.59KB 8.00 9 3
GKi is about 2 times faster than GKi2, 7 times faster than ThomasIsCoding, 30 than Mael and 80 than Nir Graham.

library(tidyverse)
example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))

Related

NA values in vectors passed to map_dfr

I'm trying to pass vectors, each with a different number of NA values, through to a map() function but it's returning an error.
I have a tibble of N numeric columns and 1 categorical column. I want to compare the distributions for each of the numeric columns against the other split by the values of the categorical column. I use overlapping::overlap() to calculate the overlap of the distributions, and i feed the numeric columns into a map_dfr function for the iteration. For example:
require(overlapping)
require(dplyr)
require(purrr)
set.seed( 1 )
n <- 100
G1 <- sample( 0:30, size = n, replace = TRUE )
G2 <- sample( 0:30, size = n, replace = TRUE, prob = dbinom( 0:30, 31, .55 ))
G3 <- sample( 0:30, size = n, replace = TRUE, prob = dbinom( 0:30, 41, .65 ))
Data <- data.frame(y = G1, x = G2, z = G3, group = rep(c("G1","G2", "G3"), each = n), class = rep(c("C1","C2", "C3"), each = 1)) %>% as_tibble()
Data
overlap_fcn <- function(.x) {
## construct list of vectors
dist_list <- list(
"C1" = Data %>%
filter(class == 'C1', !is.na(.x)) %>%
pull(.x),
"C2" = Data %>%
filter(class == 'C2', !is.na(.x)) %>%
pull(.x),
"C3" = Data %>%
filter(class == 'C3', !is.na(.x)) %>%
pull(.x)
)
## calculate distribution overlaps
return(
enframe(
overlapping::overlap(dist_list)$OV*100
) %>%
mutate(value = paste0(round(value, 2), "%"),
class = .x) %>%
rename(comparison = name, overlap = value) %>%
relocate(class)
)
}
overlap_table <- purrr::map_dfr(
.x = c('y', 'x', "z"),
.f = ~overlap_fcn(.x))
overlap_table
The above works as intended. However, in practice I have different amounts of missingess in each of x, y, and z. I try to account for this with the filter on !is.na(.x) but it's not working. For example:
Data$x[1:3] <- NA
Data$y[10:20] <- NA
Data$z[100:150] <- NA
overlap_table <- purrr::map_dfr(
.x = c('x', 'y', "z"),
.f = ~overlap_fcn(.x))
returns this error:
Error in density.default(x[[j]], n = nbins, ...): 'x' contains missing values
Error in density.default(x[[j]], n = nbins, ...): 'x' contains missing values
Traceback:
1. purrr::map_dfr(.x = c("x", "y", "z"), .f = ~overlap_fcn(.x))
2. map(.x, .f, ...)
3. .f(.x[[i]], ...)
4. overlap_fcn(.x)
5. enframe(overlapping::overlap(dist_list)$OV * 100) %>% mutate(value = paste0(round(value,
. 2), "%"), class = .x) %>% rename(comparison = name, overlap = value) %>%
. relocate(class) # at line 25-33 of file <text>
6. relocate(., class)
7. rename(., comparison = name, overlap = value)
8. mutate(., value = paste0(round(value, 2), "%"), class = .x)
9. enframe(overlapping::overlap(dist_list)$OV * 100)
10. overlapping::overlap(dist_list)
11. density(x[[j]], n = nbins, ...)
12. density.default(x[[j]], n = nbins, ...)
13. stop("'x' contains missing values")
Can anyone help me out here please? I'm sure it's something super obvious i'm missing; i just can't see what!
Here, the .x is character class. We may need to convert to symbol and evaluate (!!)
overlap_fcn <- function(.x) {
## construct list of vectors
dist_list <- list(
"C1" = Data %>%
filter(class == 'C1', !is.na(!! rlang::sym(.x))) %>%
pull(.x),
"C2" = Data %>%
filter(class == 'C2', !is.na(!! rlang::sym(.x))) %>%
pull(.x),
"C3" = Data %>%
filter(class == 'C3', !is.na(!! rlang::sym(.x))) %>%
pull(.x)
)
## calculate distribution overlaps
return(
enframe(
overlapping::overlap(dist_list)$OV*100
) %>%
mutate(value = paste0(round(value, 2), "%"),
class = .x) %>%
rename(comparison = name, overlap = value) %>%
relocate(class)
)
}
-testing after creating the NAs in Data
> purrr::map_dfr(
+ .x = c('x', 'y', "z"),
+ .f = ~overlap_fcn(.x))
# A tibble: 9 × 3
class comparison overlap
<chr> <chr> <chr>
1 x C1-C2 98.61%
2 x C1-C3 97.46%
3 x C2-C3 97.5%
4 y C1-C2 95.47%
5 y C1-C3 96.22%
6 y C2-C3 97.14%
7 z C1-C2 90.17%
8 z C1-C3 94.9%
9 z C2-C3 89.24%

Fixing mismatched matrices

I have a data set df that has been split into int1 and int2. In int1andint2, there is two elements for the IDA and three elements for theID` B.
My goal is to create a 2x2 matrix for ID A and 3x3 for ID B, and have it divided from my example list of matrices l1. Currently, my code is creating a 3x3 matrix for ID A and 2x2 matrix for ID B using a combination of the product from g1 and f2 using map2() resulting to lstmat.
Any suggestions on how I can get the desired output of a 2x2 matrix for ID A and 3x3 matrix for ID B?
Example data:
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2011"), by = "days"), 500)
ID <- rep(c("A","B"), 5000)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$jDate <- julian(as.Date(df$date), origin = as.Date('1970-01-01'))
df$Month <- month(df$date)
df$year <- year(df$date)
t1 <- c(100,150)
t2 <- c(200,250)
mat <- cbind(t1,t2)
t1 <- c(150,150,200)
t2 <- c(250,250,350)
t3 <- c(350,350, 400)
mat2 <- cbind(t1,t2, t3)
l1 <- list(mat, mat2)
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "3") %>%
group_split()
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
sep = '_'))
int1 <- int1[-1]
int2 <- int2[-1]
Any suggestions for changes to this code for the desired result? :
g1 <- as.integer(gl(length(int1), 3, length(int1)))
f2 <- function(.int1, .int2) {
t(outer(seq_along(.int1), seq_along(.int2),
FUN = Vectorize(function(i, j) min(.int1[[i]]$jDate) -
min(.int2[[j]]$jDate))))
}
lstMat <- map2(split(int1, g1), split(int2, g1), f2)
map2(l1, lstMat, `/`)
As the 'int1', 'int2' have duplicated names, split on the names instead of creating a grouping index with gl
lstMat <- map2(split(int1, names(int1)), split(int2, names(int2)), f2)
map2(l1, lstMat, `/`)
-output
[[1]]
t1 t2
[1,] 3.571429 5.263158
[2,] 8.333333 8.928571
[[2]]
t1 t2 t3
[1,] 5.357143 6.578947 7.291667
[2,] 8.333333 8.928571 9.210526
[3,] 25.000000 19.444444 14.285714

how to filter for a string across columns one at a time

In the table below, I am trying to filter out rows with the word blue. The final result should be two rows for 11th and 12th oct. What I have tried which is not working:
tibble::tribble(
~date, ~col_1, ~col_2, ~col_3,
"11/10/2021", "blue", "jhf", "fff",
"12/10/2021", "ert", "blue", "kkk",
"13/10/2021", "yui", "yui", "blkeee"
) %>%
mutate(date = lubridate::dmy(date)) %>%
dplyr::filter(across(.cols = where(is.character),
.fns = ~str_detect(., regex("blue", ignore_case = TRUE))
))
Any ideas?
You may use if_any -
library(dplyr)
library(stringr)
df %>%
filter(if_any(starts_with('col'),
~str_detect(., regex("blue", ignore_case = TRUE))))
# date col_1 col_2 col_3
# <chr> <chr> <chr> <chr>
#1 11/10/2021 blue jhf fff
#2 12/10/2021 ert blue kkk
In base R -
subset(df, Reduce(`|`, lapply(df[-1], grepl, pattern = "blue", ignore.case = TRUE)))
# Base R option 1:
df[rowSums(Vectorize(`==`)("blue", df)) >= 1,]
# Base R option 2:
df[
apply(
df[,
vapply(
df,
is.character,
logical(1)
)
],
1,
function(x){
any(x == "blue")
}
),
]

format not being applied to data frame

I try to apply formatting to a data frame created from a printed TableOne object but it won't "stick"
Sample
library(dplyr)
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime","fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars,strata = "rx", data = data)
print(table1, printToggle = F, quote = F) %>%as.data.frame() %>% format(big.mark = ",")
Result:
1 2 p test
n 13 13
futime (mean (SD)) 51730.77 (34688.14) 68176.92 (32467.63) 0.224
fustat = 1 (%) 7 (53.8) 5 (38.5) 0.694
This behaves similar to simply creating my own data frame
c(1213,2,3,1213,2,3) %>% table()%>% as.data.frame() %>% format(big.mark = ",")
Result:
. Freq
1 2 2
2 3 2
3 1213 2
This is unlike when simply using the format option on a numeric variable or even a one column data frame
123321789 %>% format(big.mark = ",")
Result:
[1] "123,321,789"
or
c(1213,2,3,1213,2,3) %>% as.data.frame() %>% format(big.mark = ",")
Result:
1 1,213
2 2
3 3
4 1,213
5 2
6 3
This is linked to the fact that table returns factors.
The examples you provided apply to numeric data.
Try:
library(dplyr)
result <- c(1213,2,3,1213,2,3) %>% table() %>% as.data.frame
class(result$.)
#> [1] "factor"
result %>% mutate_all( ~format(as.numeric(as.character(.x)),big.mark=','))
#> . Freq
#> 1 2 2
#> 2 3 2
#> 3 1,213 2
OK, so this one lead me down a rabbit hole of text formatting.
Ended up writing a function to address the matter, used address some weird rounding issues.
styleTableOne <- function(x){
if(!is.na(as.numeric(x))){return(format(as.numeric(x),big.mark = ","))}
if(x == ""){return(x)}
if(x == "<0.001"){return(x)}
if(x == "0.0"){return(x)}
if(x == " "){return(x)}
if (length(strsplit(x, split = "(", fixed = T)[[1]]) == 2){
set1 <- strsplit(x, split = "(",fixed = T)[[1]][1] %>% as.numeric()
set2 <- strsplit(x, split = "(",fixed = T)[[1]][2] %>% str_remove(fixed(")")) %>% as.numeric()
set1 <- case_when(
set1 > 100 ~ round(set1,0),
set1 > 25 ~ round(set1,1),
T ~ round(set1,2)
)
set2 <- case_when(
set1 > 100 ~ round(set2,0),
set1 > 25 ~ round(set2,1),
T ~ round(set2,2)
)
set1 %<>% format(big.mark = ",")
set2 %<>% format(big.mark = ",")
set <- paste(set1,set2,sep = " (")
set <- paste0(set,")")
return(set)}
x %>% strsplit(split = " ",fixed = T) %>% .[[1]] -> x
x <- subset(x, x != "")
set1 <- strsplit(x, split = " ",fixed = T)[1] %>% as.numeric()
set2 <- strsplit(x, split = " ",fixed = T)[2] %>% str_remove(fixed("[")) %>% str_remove(fixed(",")) %>% as.numeric()
set3 <- strsplit(x, split = " ",fixed = T)[3] %>% str_remove(fixed("]")) %>% as.numeric()
set1 <- case_when(
set1 > 100 ~ round(set1,0),
set1 > 25 ~ round(set1,1),
T ~ round(set1,2)
)
set2 <- case_when(
set1 > 100 ~ round(set2,0),
set1 > 25 ~ round(set2,1),
T ~ round(set2,2)
)
set3 <- case_when(
set1 > 100 ~ round(set3,0),
set1 > 25 ~ round(set3,1),
T ~ round(set3,2)
)
set1 %<>% format(big.mark = ",")
set2 %<>% format(big.mark = ",")
set3 %<>% format(big.mark = ",")
set <- paste0(set1," (",set2,"-",set3,")")
return(set)
}
Then you can do:
library(survival)
library(dplyr)
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime","fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars,strata = "rx", data = data)
print(table1, printToggle = F) %>%
as.data.frame() %>%
sapply(sapply, styleTableOne) %>%
as.data.frame(row.names = row.names(print(table1)))
result:
1 2 p test
n 13 13
futime (mean (SD)) 51,731 (34,688) 68,177 (32,468) 0.224
fustat = 1 (%) 7 (53.8) 5 (38.5) 0.694

R: t-test between rows within each factor level

This is the data frame I'm trying to work on:
m <- matrix(rnorm(108, mean = 5000, sd = 1000), nrow = 36)
colnames(m) <- paste('V', 1:3, sep = '')
df <- data.frame(type = factor(rep(c('T1', 'T2', 'T3', 'T4', 'T5',
'T6', 'T7', 'T8', 'T9'), each = 4)),
treatment = factor(rep(rep(c('C','P', 'N', 'S'), each = 1),
9)),
as.data.frame(m))
I want to know how can I perform a t-test between the rows within each "type". Here's an example of t-tests for type T1 I want:
t.test(df[1,3:5], df[2, 3:5])
t.test(df[1,3:5], df[3, 3:5])
t.test(df[1,3:5], df[4, 3:5])
t.test(df[1,3:5], df[3, 3:5])
t.test(df[1,3:5], df[4, 3:5])
I'm trying to figure out how can I loop through all rows and get all the p-values from the t-test (along with the type and treatment for identification), instead of calculating each row manually. Any help or suggestion would be greatly appreciated.
Something like this:
library(dplyr)
t_tests = df %>%
split(.$type) %>%
lapply(function(x){
t(x[3:5]) %>%
data.frame %>%
setNames(x$treatment) %>%
combn(2, simplify = FALSE) %>%
lapply(function(x){
data.frame(treatment = paste0(names(x), collapse = ", "),
p_value = t.test(x[,1], x[,2])$p.value)
}) %>%
do.call(rbind, .)
}) %>%
do.call(rbind, .) %>%
mutate(type = sub("[.].+", "", row.names(.)))
Result:
> head(t_tests, 10)
treatment p_value type
1 C, P 0.6112274 T1
2 C, N 0.6630060 T1
3 C, S 0.5945135 T1
4 P, N 0.9388568 T1
5 P, S 0.8349370 T1
6 N, S 0.9049995 T1
7 C, P 0.3274583 T2
8 C, N 0.9755364 T2
9 C, S 0.7391661 T2
10 P, N 0.3177871 T2
Edits (Added an extra level "file" to the dataset):
library(dplyr)
t_tests = df %>%
split(.$file) %>%
lapply(function(y){
split(y, y$type) %>%
lapply(function(x){
t(x[4:6]) %>%
data.frame %>%
setNames(x$treatment) %>%
combn(2, simplify = FALSE) %>%
lapply(function(x){
data.frame(treatment = paste0(names(x), collapse = ", "),
p_value = t.test(x[,1], x[,2])$p.value)
}) %>%
do.call(rbind, .)
}) %>%
do.call(rbind, .) %>%
mutate(type = sub("[.].+", "", row.names(.)))
}) %>%
do.call(rbind, .) %>%
mutate(file = sub("[.].+", "", row.names(.)))
Result:
treatment p_value type file
1 C, P 0.3903450 T1 file1
2 C, N 0.3288727 T1 file1
3 C, S 0.0638599 T1 file1
4 P, N 0.6927599 T1 file1
5 P, S 0.1159615 T1 file1
6 N, S 0.2184015 T1 file1
7 C, P 0.1147805 T2 file1
8 C, N 0.4961888 T2 file1
9 C, S 0.9048607 T2 file1
10 P, N 0.4203666 T2 file1
11 P, S 0.3425908 T2 file1
12 N, S 0.7262478 T2 file1
13 C, P 0.6300293 T3 file1
14 C, N 0.8255837 T3 file1
15 C, S 0.7140522 T3 file1
16 P, N 0.4768694 T3 file1
17 P, S 0.3992130 T3 file1
18 N, S 0.8740219 T3 file1
19 C, P 0.2434270 T4 file1
20 C, N 0.2713622 T4 file1
Note about edit:
OP wanted an extra top level file to be added to the data, one can simply add another split + lapply and do.call at the end.
New Data:
m <- matrix(rnorm(324, mean = 5000, sd = 1000), nrow = 108)
colnames(m) <- paste('V', 1:3, sep = '')
df <- data.frame(type = factor(rep(c('T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9'), each = 4)),
treatment = factor(rep(rep(c('C','P', 'N', 'S'), each = 1), 9)),
file = factor(rep(c("file1", "file2", "file3"), each = 36)),
as.data.frame(m))

Resources