format not being applied to data frame - r

I try to apply formatting to a data frame created from a printed TableOne object but it won't "stick"
Sample
library(dplyr)
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime","fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars,strata = "rx", data = data)
print(table1, printToggle = F, quote = F) %>%as.data.frame() %>% format(big.mark = ",")
Result:
1 2 p test
n 13 13
futime (mean (SD)) 51730.77 (34688.14) 68176.92 (32467.63) 0.224
fustat = 1 (%) 7 (53.8) 5 (38.5) 0.694
This behaves similar to simply creating my own data frame
c(1213,2,3,1213,2,3) %>% table()%>% as.data.frame() %>% format(big.mark = ",")
Result:
. Freq
1 2 2
2 3 2
3 1213 2
This is unlike when simply using the format option on a numeric variable or even a one column data frame
123321789 %>% format(big.mark = ",")
Result:
[1] "123,321,789"
or
c(1213,2,3,1213,2,3) %>% as.data.frame() %>% format(big.mark = ",")
Result:
1 1,213
2 2
3 3
4 1,213
5 2
6 3

This is linked to the fact that table returns factors.
The examples you provided apply to numeric data.
Try:
library(dplyr)
result <- c(1213,2,3,1213,2,3) %>% table() %>% as.data.frame
class(result$.)
#> [1] "factor"
result %>% mutate_all( ~format(as.numeric(as.character(.x)),big.mark=','))
#> . Freq
#> 1 2 2
#> 2 3 2
#> 3 1,213 2

OK, so this one lead me down a rabbit hole of text formatting.
Ended up writing a function to address the matter, used address some weird rounding issues.
styleTableOne <- function(x){
if(!is.na(as.numeric(x))){return(format(as.numeric(x),big.mark = ","))}
if(x == ""){return(x)}
if(x == "<0.001"){return(x)}
if(x == "0.0"){return(x)}
if(x == " "){return(x)}
if (length(strsplit(x, split = "(", fixed = T)[[1]]) == 2){
set1 <- strsplit(x, split = "(",fixed = T)[[1]][1] %>% as.numeric()
set2 <- strsplit(x, split = "(",fixed = T)[[1]][2] %>% str_remove(fixed(")")) %>% as.numeric()
set1 <- case_when(
set1 > 100 ~ round(set1,0),
set1 > 25 ~ round(set1,1),
T ~ round(set1,2)
)
set2 <- case_when(
set1 > 100 ~ round(set2,0),
set1 > 25 ~ round(set2,1),
T ~ round(set2,2)
)
set1 %<>% format(big.mark = ",")
set2 %<>% format(big.mark = ",")
set <- paste(set1,set2,sep = " (")
set <- paste0(set,")")
return(set)}
x %>% strsplit(split = " ",fixed = T) %>% .[[1]] -> x
x <- subset(x, x != "")
set1 <- strsplit(x, split = " ",fixed = T)[1] %>% as.numeric()
set2 <- strsplit(x, split = " ",fixed = T)[2] %>% str_remove(fixed("[")) %>% str_remove(fixed(",")) %>% as.numeric()
set3 <- strsplit(x, split = " ",fixed = T)[3] %>% str_remove(fixed("]")) %>% as.numeric()
set1 <- case_when(
set1 > 100 ~ round(set1,0),
set1 > 25 ~ round(set1,1),
T ~ round(set1,2)
)
set2 <- case_when(
set1 > 100 ~ round(set2,0),
set1 > 25 ~ round(set2,1),
T ~ round(set2,2)
)
set3 <- case_when(
set1 > 100 ~ round(set3,0),
set1 > 25 ~ round(set3,1),
T ~ round(set3,2)
)
set1 %<>% format(big.mark = ",")
set2 %<>% format(big.mark = ",")
set3 %<>% format(big.mark = ",")
set <- paste0(set1," (",set2,"-",set3,")")
return(set)
}
Then you can do:
library(survival)
library(dplyr)
library(tableone)
data(ovarian)
data <- ovarian
data$futime <- data$futime * 100
vars <- c("futime","fustat")
catvars <- c("fustat")
table1 <- CreateTableOne(vars = vars, factorVars = catvars,strata = "rx", data = data)
print(table1, printToggle = F) %>%
as.data.frame() %>%
sapply(sapply, styleTableOne) %>%
as.data.frame(row.names = row.names(print(table1)))
result:
1 2 p test
n 13 13
futime (mean (SD)) 51,731 (34,688) 68,177 (32,468) 0.224
fustat = 1 (%) 7 (53.8) 5 (38.5) 0.694

Related

unique words by group

this is my example dataframe
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
I would like to get only unique words within group and through groups
So I would like to get this
group word
A car, tree
B sun
I used aggregate and get this
aggregate(word ~ group , data = example, FUN = paste0)
group word
1 A car, car, house, tree
2 B sun ,sun, house
but now i need to select only unique values, but even this does not work out
for (i in 1:nrow(cluster)) {cluster[i, ][["word"]] = lapply(unlist(cluster[i, ][["word"]]), unique)}
with
Error in `[[<-.data.frame`(`*tmp*`, "word", value = list("car", "car, house", :
replacement has 3 rows, data has 1
A base R option using aggregate + subset + ave like below
with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, "[, ]+"))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
)
gives
ind values
1 A car, tree
2 B sun
Here's a dplyr solution:
library(dplyr)
library(tidyr)
example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))
output
group word
1 A car, tree
2 B sun
In base you can use strsplit to get the words, split them by group and use unique the get unique words per group. Use table to get the number of same words and take those which appear only once.
t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))
# group word
#A A car, tree
#B B sun
Or another way starting with the already used aggregate in the question.
t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1
# group word
#1 A car, tree
#2 B sun
And just for fun a Benchmark
library(bench)
library(dplyr)
library(tidyr)
library(tidyverse)
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
bench::mark(check = FALSE,
GKi = {t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))},
GKi2 = {t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1},
ThomasIsCoding = with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, ", "))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
),
Mael = {example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))},
"Nir Graham" = {example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
}
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 GKi 445.13µs 486.26µs 1997. 16.03KB 6.15 974 3
2 GKi2 916.97µs 968.68µs 1023. 7.3KB 6.15 499 3
3 ThomasIsCoding 3.54ms 3.73ms 266. 8.19KB 8.45 126 4
4 Mael 16.07ms 16.48ms 60.1 60.04KB 6.68 27 3
5 Nir Graham 37.29ms 39.49ms 24.0 90.59KB 8.00 9 3
GKi is about 2 times faster than GKi2, 7 times faster than ThomasIsCoding, 30 than Mael and 80 than Nir Graham.
library(tidyverse)
example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))

Mutate a column of models: "Error: Problem with `mutate()` input `model`. x Input `model` must be a vector, not a `lm` object."

I have a dataframe that contains as a column a model formula definition. I would like to mutate a new column where each row is a model based on the corresponding rows model definition.
Some data:
# Set up
library(tidyverse)
library(lubridate)
# Create data
mydf <- data.frame(
cohort = seq(ymd('2019-01-01'), ymd('2019-12-31'), by = '1 days'),
n = rnorm(365, 1000, 50) %>% round,
cohort_cost = rnorm(365, 800, 50)
) %>%
crossing(tenure_days = 0:365) %>%
mutate(activity_date = cohort + days(tenure_days)) %>%
mutate(daily_revenue = rnorm(nrow(.), 20, 1)) %>%
group_by(cohort) %>%
arrange(activity_date) %>%
mutate(cumulative_revenue = cumsum(daily_revenue)) %>%
arrange(cohort, activity_date) %>%
mutate(payback_velocity = round(cumulative_revenue / cohort_cost, 2)) %>%
select(cohort, n, cohort_cost, activity_date, tenure_days, everything())
## wider data
mydf_wide <- mydf %>%
select(cohort, n, cohort_cost, tenure_days, payback_velocity) %>%
group_by(cohort, n, cohort_cost) %>%
pivot_wider(names_from = tenure_days, values_from = payback_velocity, names_prefix = 'velocity_day_')
Now, the final problem code block. It fails on the very last line:
models <- data.frame(
from = mydf$tenure_days %>% unique,
to = mydf$tenure_days %>% unique
) %>%
expand.grid %>%
filter(to > from) %>%
filter(from > 0) %>%
arrange(from) %>%
mutate(mod_formula = paste0('velocity_day_', to, ' ~ velocity_day_', from)) %>%
mutate(model = lm(as.formula(mod_formula), data = mydf_wide))
Error: Problem with mutate() input model.
x Input model must be a vector, not a lm object.
ℹ Input model is lm(as.formula(mod_formula), data = mydf_wide).
If I run the last code block minus the last line and take a look at the resulting data frame 'models' it looks like this:
models %>% head
from to mod_formula
1 1 2 velocity_day_2 ~ velocity_day_1
2 1 3 velocity_day_3 ~ velocity_day_1
3 1 4 velocity_day_4 ~ velocity_day_1
4 1 5 velocity_day_5 ~ velocity_day_1
5 1 6 velocity_day_6 ~ velocity_day_1
6 1 7 velocity_day_7 ~ velocity_day_1
I tried making it a list column, but to do that as far as I'm aware I need to group by. But in this case I need to group by everything. I amended the last code block:
models <- data.frame(
from = mydf$tenure_days %>% unique,
to = mydf$tenure_days %>% unique
) %>%
expand.grid %>%
filter(to > from) %>%
filter(from > 0) %>%
arrange(from) %>%
mutate(mod_formula = paste0('velocity_day_', to, ' ~ velocity_day_', from)) %>%
group_by_all() %>%
nest() %>%
mutate(model = lm(as.formula(mod_formula), data = mydf_wide))
However this results in the same error.
How can I add a new column onto 'models' that contains a linear model for each row based on the formula in field 'mod_formula'?
lm is not vectorized. Add rowwise to create a model for each row.
library(dplyr)
models <- data.frame(
from = mydf$tenure_days %>% unique,
to = mydf$tenure_days %>% unique
) %>%
expand.grid %>%
filter(to > from) %>%
filter(from > 0) %>%
arrange(from) %>%
mutate(mod_formula = paste0('velocity_day_', to, ' ~ velocity_day_', from)) %>%
rowwise() %>%
mutate(model = list(lm(as.formula(mod_formula), data = mydf_wide)))
models
# from to mod_formula model
# <int> <int> <chr> <list>
#1 1 2 velocity_day_2 ~ velocity_day_1 <lm>
#2 1 3 velocity_day_3 ~ velocity_day_1 <lm>
#3 1 4 velocity_day_4 ~ velocity_day_1 <lm>
#4 1 5 velocity_day_5 ~ velocity_day_1 <lm>
#5 1 6 velocity_day_6 ~ velocity_day_1 <lm>
#6 1 7 velocity_day_7 ~ velocity_day_1 <lm>
#...
#...
You can also use map instead of rowwise.
mutate(model = purrr::map(mod_formula, ~lm(.x, data = mydf_wide)))

How should I format across rows of a gt table efficiently in R?

If I want to efficiently format rows of a gt table, is there a better method than what I have shown below.
Some rows are character, and so need no formatting,
some are numbers where one decimal place is required,
some are numbers where two decimal places is required,
and some are percentages where two decimal places is required. Whatever is done, should ideally generalise to other possible formats.
I create a data frame that creates the specification for formatting, but each format requires a separate command in the pipe.
library(dplyr)
library(gt)
#create small dataset
gtcars_8 <-
gtcars %>%
dplyr::group_by(ctry_origin) %>%
dplyr::top_n(2) %>%
dplyr::ungroup() %>%
dplyr::filter(ctry_origin != "United Kingdom")
#transpose data
row_labels <- colnames(gtcars_8)
gtcars_8_t <- as.data.frame(t(as.matrix(gtcars_8)))
gtcars_8_t$row_labels <- row_labels
my_column_names <- colnames(gtcars_8_t)[1:8]
#format data
format_specs <- as.data.frame(row_labels[1:10])
format_specs$type <- c("c","c","n","c","c","n","n","n","n","p")
format_specs$decimals <- c( 0 , 0 , 0 , 0 , 0 , 1 , 2 , 2 , 1 , 2 )
format_specs
#make basic gt table
gtcars_8_t %>%
slice(1:10) %>%
gt()
#make gt table with formats hardcoded (desired output)
gtcars_8_t %>%
slice(1:10) %>%
gt() %>%
cols_move_to_start("row_labels") %>%
#format for rows where: type = n, and decimals = 1
fmt(columns = vars(my_column_names),
rows = which(format_specs$type == "n" & format_specs$decimals == 1 ),
fns = function(x) {
formatC(as.numeric(x), digits = 1, format = "f")
} ) %>%
#format for rows where: type = n, and decimals = 2
fmt(columns = vars(my_column_names),
rows = which(format_specs$type == "n" & format_specs$decimals == 2 ),
fns = function(x) {
formatC(as.numeric(x), digits = 2, format = "f")
} ) %>%
#format for rows where: type = p, and decimals = 2
fmt(columns = vars(my_column_names),
rows = which(format_specs$type == "p" & format_specs$decimals == 2 ),
fns = function(x) {
paste0(formatC(as.numeric(x), digits = 2, format = "f"),"%")
} )
While not quite the same, applying formatting in gt appears to be a bit trickier than one might first expect ( eg).
A generalisable approach to achieve this is to set up a wrapper which loops through the format_specs dataframe and applies the format rules row by row. For the looping part I make use of purrr::reduce but a simple for-loop should also work:
library(dplyr)
library(purrr)
library(gt)
#create small dataset
gtcars_8 <-
gtcars %>%
dplyr::group_by(ctry_origin) %>%
dplyr::top_n(2) %>%
dplyr::ungroup() %>%
dplyr::filter(ctry_origin != "United Kingdom")
#> Selecting by msrp
#transpose data
row_labels <- colnames(gtcars_8)
gtcars_8_t <- as.data.frame(t(as.matrix(gtcars_8)))
gtcars_8_t$row_labels <- row_labels
my_column_names <- colnames(gtcars_8_t)[1:8]
#format data
format_specs <- data.frame(row = row_labels[1:10]) # Name column with row labels
format_specs$type <- c("c","c","n","c","c","n","n","n","n","p")
format_specs$decimals <- c( 0 , 0 , 0 , 0 , 0 , 1 , 2 , 2 , 1 , 2 )
myfmt <- function(data, cols, row_spec) {
reduce(row_spec$row, function(x, y) {
row_spec <- filter(row_spec, row == y)
fmt(x, columns = cols,
rows = which(x[["_data"]][["row_labels"]] == y),
fns = function(x) switch(row_spec$type,
n = scales::number(as.numeric(x), accuracy = 10^(-row_spec$decimals), big.mark = ""),
p = scales::percent(as.numeric(x), scale = 1, accuracy = 10^(-row_spec$decimals))))
}, .init = data)
}
gtcars_8_t %>%
slice(1:10) %>%
gt() %>%
cols_move_to_start("row_labels") %>%
myfmt(vars(my_column_names), format_specs)
Created on 2020-06-12 by the reprex package (v0.3.0)
Results in this table:
Got this in a couple less lines.
library(tidyverse)
library(scales)
library(gt)
#create small dataset
gtcars_8 <-
gtcars %>%
dplyr::group_by(ctry_origin) %>%
dplyr::top_n(2) %>%
dplyr::ungroup() %>%
dplyr::filter(ctry_origin != "United Kingdom")
#> Selecting by msrp
gtcars_8 %>%
rownames_to_column() %>%
#mutate(hp_rpm = scales::number(hp_rpm)) %>% example formatting change
mutate_all(as.character) %>%
pivot_longer(-rowname) %>%
pivot_wider(names_from = rowname) %>%
gt()

How to sovle self-defining function to count can't work

I write a function to count daily number of people in hospital but it can't work when the number is 0 in some days.
my function:
tsdata2 <- function(df){
t.f <- as.data.frame(table(df$DATE_INHOSPITAL2)) %>% rename(whole =Freq)
## sex
man.d <- df %>% filter(GENDER == 1)
man.f <- as.data.frame(table(man.d$DATE_INHOSPITAL2)) %>% rename(man =Freq)
woman.d <- df %>% filter(GENDER == 2)
woman.f <- as.data.frame(table(woman.d$DATE_INHOSPITAL2)) %>% rename(woman =Freq)
## age 65
agelo65.d <- df %>% filter(age_group65 == 1)
agelo65.f <- as.data.frame(table(agelo65.d$DATE_INHOSPITAL2)) %>% rename(agelo65 =Freq)
ageup65.d <- df %>% filter(age_group65 == 2)
ageup65.f <- as.data.frame(table(ageup65.d$DATE_INHOSPITAL2)) %>% rename(ageupwith65 =Freq)
## age 10
age10.1.d <- df %>% filter(age_group10 == 1)
age10.1.d.f <- as.data.frame(table(age10.1.d$DATE_INHOSPITAL2)) %>% rename(agelo40 =Freq)
age10.2.d <- df %>% filter(age_group10 == 2)
age10.2.d.f <- as.data.frame(table(age10.2.d$DATE_INHOSPITAL2)) %>% rename(age41_50 =Freq)
age10.3.d <- df %>% filter(age_group10 == 3)
age10.3.d.f <- as.data.frame(table(age10.3.d$DATE_INHOSPITAL2)) %>% rename(age51_60 =Freq)
age10.4.d <- df %>% filter(age_group10 == 4)
age10.4.d.f <- as.data.frame(table(age10.4.d$DATE_INHOSPITAL2)) %>% rename(age61_70 =Freq)
age10.5.d <- df %>% filter(age_group10 == 5)
age10.5.d.f <- as.data.frame(table(age10.5.d$DATE_INHOSPITAL2)) %>% rename(age71_80 =Freq)
age10.6.d <- df %>% filter(age_group10 == 6)
age10.6.d.f <- as.data.frame(table(age10.6.d$DATE_INHOSPITAL2)) %>% rename(ageup80 =Freq)
datebreaks<-seq(as.Date("2014-01-01"),as.Date("2018-12-31"),by="1 day")
full <- data.frame(Var1 = as.character(datebreaks) )
result <- full %>%
left_join(t.f) %>%
left_join(man.f) %>%
left_join(woman.f) %>%
left_join(agelo65.f) %>%
left_join(ageup65.f) %>%
left_join(age10.1.d.f) %>%
left_join(age10.2.d.f) %>%
left_join(age10.3.d.f) %>%
left_join(age10.4.d.f) %>%
left_join(age10.5.d.f) %>%
left_join(age10.6.d.f) %>% replace(., is.na(.), 0)
return(result)
}
list <- split(total,total$DISEASE_CODE1_2to3)
test <- map(list,tsdata2)
I think the error was because the number of hospital admissions on a given day was zero.
How can I improve this code that it can work even the number is zero.
test <- map(list,tsdata2)
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Joining, by = "Var1"
Error: `by` required, because the data sources have no common variables
The reason of such an error thrown is that you are applying left_join on empty data frames with no columns to join. Along your data frame filtering and contingency table creation, data frames with no columns to join were generated. Please see below the simulation :
library(dplyr)
df1 <- data.frame(a = 1:10, b = letters[1:10])
df2 <- data.frame(a = 1:10, c = letters[11:20])
df2 <- df2[,-c(1, 2)]
str(df2)
# 'data.frame': 10 obs. of 0 variables
df2 %>% left_join(df1)
The code above throws an error:
Error: by required, because the data sources have no common
variables Call rlang::last_error() to see a backtrace
To avoid such a problem you can implement simple check if the data frame is with no columns then change to dummy data frame:
library(dplyr)
df1 <- data.frame(a = 1:10, b = letters[1:10])
df2 <- df1[,-c(1, 2)]
df_dummy <- data.frame(a = 1, c = 0)
if(ncol(df2) == 0) df2 <- df_dummy
df1 %>% left_join(df2)
#
# Joining, by = "a"
# a b c
# 1 1 a 0
# 2 2 b NA
# 3 3 c NA
# 4 4 d NA
# 5 5 e NA
# 6 6 f NA
# 7 7 g NA
# 8 8 h NA
# 9 9 i NA
# 10 10 j NA

Remove all instances of duplicate cells (not entire rows / columns) in a dataframe in R

I have a dataframe:
genes_1 = c("a","b","c","d","e")
genes_2 = c("f","g","c","e","j")
genes_3 = c("a","b","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
My desired output:
genes_1 = c("","","","d","")
genes_2 = c("f","g","","","j")
genes_3 = c("","","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
How can I achieve this?
Thanks
0-dependency base R solution:
data.frame(
genes_1 = c("a","b","c","d","e"),
genes_2 = c("f","g","c","e","j"),
genes_3 = c("a","b","m","n","o"),
stringsAsFactors = FALSE
) -> xdf
dups <- names(which(table(unlist(xdf, use.names = FALSE)) > 1))
xdf[] <- lapply(xdf, function(x) { x[x %in% dups] <- "" ; x })
xdf
unlist() recursively unwinds all the columns into a single character vector.
table() counts all occurrences of each element.
which() narrows down to only the ones which are TRUE
names() grabs the character select vector elements.
We then work by column to replace all occurrences in the vector that match with ""
library(microbenchmark)
library(data.table)
microbenchmark(
base = {
ydf <- xdf
dups <- names(which(table(unlist(ydf, use.names = FALSE)) > 1))
ydf[] <- lapply(ydf, function(x) { x[x %in% dups] <- "" ; x })
},
base.2 = {
ydf <- xdf
tmp <- unlist(ydf)
ydf[arrayInd(which(duplicated(tmp) | duplicated(tmp, fromLast = TRUE)), dim(ydf))] <- ""
},
tidyverse = {
ydf <- xdf
ydf %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID) -> ydf
},
data.table = {
ydt <- data.table(xdf)
ydt[,lapply(.SD, function(x) { x[x %in% dups] <- "" ; x })]
}
) %>%
{ print(.) ; . } %>%
autoplot()
Another base solution:
tmp <- unlist(df)
df[arrayInd(which(duplicated(tmp) | duplicated(tmp,fromLast=TRUE)), dim(df))] <- NA
# genes_1 genes_2 genes_3
#1 <NA> f <NA>
#2 <NA> g <NA>
#3 <NA> <NA> m
#4 d <NA> n
#5 <NA> j o
unlist just creates a long vector for all the values in df
arrayInd then creates a two-column row/column index for subsetting df for the duplicated values.
Here is a tidyverse solution. df2 is the final output.
library(tidyverse)
df2 <- df %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID)

Resources