R: lapply and tibble not working as expected in two loop - r

I have a set of code that loops through all possible scenarios that I use for a drilldown feature in my shiny application. However, the Level_3_Drilldowns isn't working as expected as I am not getting my defined tibble names.
Below is a single output for Level_2_Drilldowns which is producing what is expected:
[[3]]
[[3]]$id
[1] "ocean"
[[3]]$type
[1] "column"
[[3]]$data
[[3]]$data[[1]]
[[3]]$data[[1]]$name
[1] "Boat"
[[3]]$data[[1]]$y
[1] 2
[[3]]$data[[1]]$PerTotal
[1] 37
[[3]]$data[[1]]$drilldown
[1] "ocean_boat"
Here, I can see that name, y, and PerTotal are defined (I need to be able to reference them in my graph tooltip)
Below is a single output for Level_3_Drilldowns which is not producing what is expected:
[[5]]
[[5]]$id
[1] "ocean_boat"
[[5]]$type
[1] "column"
[[5]]$data
[[5]]$data[[1]]
[[5]]$data[[1]][[1]]
[1] "Fig"
[[5]]$data[[1]][[2]]
[1] 1
[[5]]$data[[1]][[3]]
[1] 37
[[5]]$data[[2]]
[[5]]$data[[2]][[1]]
[1] "Tony"
[[5]]$data[[2]][[2]]
[1] 1
[[5]]$data[[2]][[3]]
[1] 37
You can see that name, y and PerTotal are not defined.
Any ideas on why they are not being shown?
Below is the full code:
library (tidyr)
library (data.table)
library (highcharter)
x <- c("Farm","Farm","Farm","City","City","City","Ocean","Ocean")
y <- c("Sheep","Sheep","Cow","Car","Bus","Bus","Boat","Boat")
z <- c("Bill","Tracy","Sandy","Bob","Carl","Newt","Fig","Tony")
a <- c(1,1,1,1,1,1,1,1)
b <- c(3,5,1,3,8,5,3,9)
dat <- data.frame(x,y,z,a)
#First Tier #Copied
datSum <- dat %>%
group_by(x) %>%
summarize(Quantity = sum(a)
)
datSum <- arrange(datSum,desc(Quantity))
Lvl1dfStatus <- tibble(name = datSum$x, y = datSum$Quantity, drilldown = tolower(name))
Level_2_Drilldowns <- lapply(unique(dat$x), function(x_level) {
# x_level is what you called 'input' earlier.
datSum2 <- dat[dat$x == x_level,]
datSum2 <- datSum2 %>%
group_by(y) %>%
summarize(Quantity = sum(a),
PerTotal = sum(b)
)
datSum2 <- arrange(datSum2,desc(Quantity))
Lvl2dfStatus <- tibble(name = datSum2$y,y = datSum2$Quantity, PerTotal = datSum2$PerTotal, drilldown = tolower(paste(x_level, name, sep = "_")))
list(id = tolower(x_level), type = "column", data = list_parse(Lvl2dfStatus))
})
Level_3_Drilldowns <- lapply(unique(dat$x), function(x_level) {
datSum2 <- dat[dat$x == x_level,]
lapply(unique(datSum2$y), function(y_level) {
datSum3 <- datSum2[datSum2$y == y_level,]
datSum3 <- datSum3 %>%
group_by(z) %>%
summarize(Quantity = sum(a),
PerTotal = sum(b)
)
datSum3 <- arrange(datSum3,desc(Quantity))
Lvl3dfStatus <- tibble(name = datSum3$z,y = datSum3$Quantity, PerTotal = datSum3$PerTotal)
list(id = tolower(paste(x_level, y_level, sep = "_")), type = "column", data = list_parse2(Lvl3dfStatus))
})
}) %>% unlist(recursive = FALSE)

Related

How to fix error "Error in `[.data.frame`(x#data, i, j, ..., drop = FALSE) : undefined columns selected" in the function variogramST() of gstat

Error in [.data.frame(x#data, i, j, ..., drop = FALSE) :
undefined columns selected
I'm gettiing this error when using gstat to calculate spatiotemporal variogram with the function variogramST(). Here is the code
library(sf)
library(tidyverse)
library(gstat)
library(sp)
library(spacetime)
library(raster)
library(rgdal)
library(rgeos)
data <- read.csv("data/Synthesis_of_Environmental_Mercury_Loads_in_New_York_State__1969-2017___Chemical_Data.csv")
dataProcess <- function(){
names(data)[1] <- "SAMPLE_ID"
data$Chem_Units[data$Chem_Units == "碌g/l"] = "mcg/l"
data_df <- data %>%
dplyr::select("SAMPLE_ID", "Latitude", "Longitude", "BDate", "Year", "TissueCollected", "Chem_Value", "Chem_Units", "Chemical_Type", "Final_Chem_Standardized") %>%
filter(Chemical_Type == "THg") %>%
filter(Chem_Units == "mcg/l" | Chem_Units == "ng/l") %>%
na.omit()
return(data_df)
}
data_df <- dataProcess()
data_water <- subset(data_df, TissueCollected == 'Water' | TissueCollected == 'Surface Water' | TissueCollected == 'Groundwater' | TissueCollected == "Treated water supply" | TissueCollected == "Untreated water supply")
sptTransform <- function(data.df) {
data.df$date <- as.POSIXlt.Date(as.Date(data.df$BDate, format = "%m/%d/%Y"), origin = "1970-01-01 EDT")
data.df <- subset(data.df, select = -c(BDate, Year))
data.df$Final_Chem_Standardized <- data.df$Final_Chem_Standardized * 1000000
coordinates(data.df) = ~Longitude + Latitude
projection(data.df) = CRS("+init=epsg:4326")
chemData <- spTransform(data.df, CRS("+init=epsg:32617"))
return(chemData)
}
spt_water <- sptTransform(data_water)
numVec <- spt_water#data$Final_Chem_Standardized
df <- data.frame(numVec)
names(df) <- "FCS6"
stidf_water <- STIDF(sp = spt_water,
time = spt_water#data$date,
data = df)
var <- variogramST(FCS6 ~ 1, data = stidf_water, tunit = "days", assumeRegular = F, na.omit = T)
The last line always shows the error shown at the beginning.

Match list elements based on attribute component

I have a data set that i split into two list int1 and int2.
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("01-01-2013"), by = "days"), 300)
ID <- rep(c("A","B", "C"), 300)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$month <- month(df$date)
df$year <- year(df$date)
# Create first list
int1 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "1") %>%
group_split()
# Create second list
int2 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
I then assign a attribute to each list (match). I created a function check to grab this attribute more easily. I removed some elements from one list for this exmaple.
int1 <- int1[-c(3,6)]
# Convenience function to grab the attributes for you
check <- function(x) {
return(attr(x, "match"))
}
# Add an attribute to hold the attributes of each list element
attr(int1, "match") <- data.frame(id = sapply(int1, function(x) paste(x$ID[1])),
interval_start_date = sapply(int1, function(x) paste(x$new[1]))
)
# Check the attributes
check(int1)
# Add an attribute "tab" to hold the attributes of each list element
attr(int2, "match") <- data.frame(id = sapply(int2, function(x) paste(x$ID[1])),
interval_start_date = sapply(int2, function(x) paste(x$new[1]))
)
# Check the attributes
check(int2)
I would like to remove elements that are not in another based on the attribute that I had added. Specifically I would like to remove any that don't have the same interval_start_date and ID associated with it. For the interval_start_date, only the year and the day have to match, as the month will most likely differ between the two list. In this case, I would like int2 to match int1. Any thoughts on how I could do this? A base r method is preferred, if possible.
# Expected results
expected_int2 <- list(int2[[1]], int2[[2]], int2[[3]], int2[[4]], int2[[5]],
int2[[6]], int2[[7]])
names(expected_int2) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = "_"))
We may create an index with %in% after pasteing the 'id' and the formatted 'interval_start_date' i.e. after removing the 'month' part
i1 <- with(check(int2), paste(id, format(as.Date(interval_start_date),
"%Y-%d"))) %in% with(check(int1), paste(id,
format(as.Date(interval_start_date), "%Y-%d")))
> which(i1)
[1] 1 2 4 5 7 8 9
out <- int2[i1]

Reactive Function

Prior to start creating my app with Shiny I've created a function (NextWordPrediction) that updates my dataframe based on an user's input as follows:
If input exists in df increase its Frequency by 1
If input does't exist in df add it
Function code:
NextWordPrediction <- function(input) {
dat <- dat %>%
filter(., N_gram == str_count(input, "\\S+") + 1) %>%
filter(grepl(paste("^", tolower(str_squish(input)), sep = ""), Word)) %>%
arrange(., desc(Prop))
if (nrow(dat) != 0) {
assign("dat",
dat %>%
mutate(Frequency = ifelse(Word == input &
N_gram == str_count(input, "\\S+"),
Frequency + 1,
Frequency)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
val <- dat$Word_to_Predict[1]
ans <- paste(str_squish(input), val)
return(list(ans, head(dat,5)))
} else if (nrow(dat) == 0 & word(input, 1) != "NA") {
assign("dat",
dat %>%
add_row(., Word = tolower(input), Frequency = + 1, N_gram = str_count(input, "\\S+"),
Word_to_Predict = word(input, -1)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
input_1 <- Reduce(paste, word(input, 2:str_count(input,"\\S+")))
return(NextWordPrediction(input_1))
} else if (word(input, 1) == "NA") {
ans <- paste("Word not in dictionary. We added this to our database!")
return(ans)
}
}
As a next step I want to extend this functionality to a Shiny app and I've tried the following without success. The function usability is functional but after an input my df is not updated accordingly.
server.R
library(shiny)
dat <- read.csv("dat_all.csv")
shinyServer(function(input, output) {
NextWordPrediction <- function(input) {
dat <- dat %>%
filter(., N_gram == str_count(input, "\\S+") + 1) %>%
filter(grepl(paste("^", tolower(str_squish(input)), sep = ""), Word)) %>%
arrange(., desc(Prop))
if (nrow(dat) != 0) {
assign("dat",
dat %>%
mutate(Frequency = ifelse(Word == input &
N_gram == str_count(input, "\\S+"),
Frequency + 1,
Frequency)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
val <- dat$Word_to_Predict[1]
ans <- paste(str_squish(input), val)
return(list(ans, head(dat,5)))
} else if (nrow(dat) == 0 & word(input, 1) != "NA") {
assign("dat",
dat %>%
add_row(., Word = tolower(input), Frequency = + 1, N_gram = str_count(input, "\\S+"),
Word_to_Predict = word(input, -1)) %>%
group_by(., N_gram) %>%
mutate(., Prop = Frequency/ sum(Frequency)) %>%
data.frame(.),
envir = .GlobalEnv)
input_1 <- Reduce(paste, word(input, 2:str_count(input,"\\S+")))
return(NextWordPrediction(input_1))
} else if (word(input, 1) == "NA") {
ans <- paste("Word not in dictionary. We added this to our database!")
return(ans)
}
}
output$predictiontext = reactive({
NextWordPrediction(input$text)[1]
})
output$predictiontable = renderTable({
NextWordPrediction(input$text)[2]
})
})
ui.R
library(shiny)
# Define UI for application that draws a histogram
shinyUI(fluidPage(
# Application title
titlePanel("NextWordPrediction"),
# Sidebar with a slider input for number of bins
sidebarLayout(
sidebarPanel(
textInput("text",
"Type something...",
"")
),
# Show a plot of the generated distribution
mainPanel(
wellPanel(
# Link to report
helpText(a('More information on the app',
href=link,
target = '_blank')
),
# Link to repo
helpText(a('Code repository',
href=link,
target = '_blank')
),
textOutput("predictiontext"),
tableOutput('predictiontable')
)
)
))
)
Update 1: Data
df<- data.frame(Word = c("hello", "she was great", "this is", "long time ago in"), Frequency = c(4, 3, 10, 1),
N_gram = c(1, 3, 2, 4), Prop = c(4/18, 3/18, 10/18, 1/18), Word_to_Predict = c(NA, "great", "is", "in"))
NextWordPrediction("she was") ## returns "she was" & "great"
NextWordPrediction("hours ago") ## returns "hours ago" & "in"
NextWordPrediction("words not in data") ## returns "Word not in dictionary. We added this to our database!" after trying "not in data", "in data" and adds "words not in data" to dataset

R Highcharter: Dynamic multi level drilldown in Shiny

I am trying to create a multi-layer drilldown graph using highcharter with dynamic data in shiny. I am able to accomplish this using just R code with a set input but when I put it in a shiny application and try to have it subset the data dynamically, it fails.
Below is the code that that works in R (only drilling down from Farm to Sheep):
library(shinyjs)
library(tidyr)
library(data.table)
library(highcharter)
library(dplyr)
x <- c("Farm","Farm","Farm","City","City","City","Ocean","Ocean")
y <- c("Sheep","Sheep","Cow","Car","Bus","Bus","Boat","Boat")
z <- c("Bill","Tracy","Sandy","Bob","Carl","Newt","Fig","Tony")
a <- c(1,1,1,1,1,1,1,1)
dat <- data.frame(x,y,z,a)
input <- "Farm"
input2 <- "Sheep"
#First Tier
datSum <- dat %>%
group_by(x) %>%
summarize(Quantity = sum(a)
)
datSum <- arrange(datSum,desc(Quantity))
Lvl1dfStatus <- tibble(name = datSum$x, y = datSum$Quantity, drilldown = tolower(name))
#Second Tier
datSum2 <- dat[dat$x == input,]
datSum2 <- datSum2 %>%
group_by(y) %>%
summarize(Quantity = sum(a)
)
datSum2 <- arrange(datSum2,desc(Quantity))
Lvl2dfStatus <- tibble(name = datSum2$y,y = datSum2$Quantity, drilldown = tolower(name))
#Third Tier
datSum2 <- dat[dat$x == input,]
datSum3 <- datSum2[datSum2$y == input2,]
datSum3 <- datSum3 %>%
group_by(z) %>%
summarize(Quantity = sum(a)
)
datSum3 <- arrange(datSum3,desc(Quantity))
Lvl3dfStatus <- tibble(name = datSum3$z,y = datSum3$Quantity)
#Graph
ClickedTest <- JS("function(event) {Shiny.onInputChange('ClickedInput', event.point.name);}")
highchart() %>%
hc_xAxis(type = "category") %>%
hc_add_series(Lvl1dfStatus, "column", hcaes(x = name, y = y), color = "#E4551F") %>%
hc_plotOptions(column = list(stacking = "normal",
events = list(click = ClickedTest))) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = list(
list(id = tolower(input), type = "column", data = list_parse(Lvl2dfStatus)),
list(id = tolower(input2), type = "column", data = list_parse2(Lvl3dfStatus))
)
)
Below is the code that fails in Shiny when changing input to dynamic:
library (shinyjs)
library (tidyr)
library (data.table)
library (highcharter)
library (dplyr)
library (shinydashboard)
library (shiny)
x <- c("Farm","Farm","Farm","City","City","City","Ocean","Ocean")
y <- c("Sheep","Sheep","Cow","Car","Bus","Bus","Boat","Boat")
z <- c("Bill","Tracy","Sandy","Bob","Carl","Newt","Fig","Tony")
a <- c(1,1,1,1,1,1,1,1)
dat <- data.frame(x,y,z,a)
# input <- "Farm"
# input2 <- "Sheep"
header <- dashboardHeader()
body <- dashboardBody(
highchartOutput("Test"),
verbatimTextOutput("trial")
)
sidebar <- dashboardSidebar()
ui <- dashboardPage(header, sidebar, body)
server <- function(input, output, session) {
Lvl1ClickHardCoded <- ""
output$Test <- renderHighchart({
#First Tier
datSum <- dat %>%
group_by(x) %>%
summarize(Quantity = sum(a)
)
datSum <- arrange(datSum,desc(Quantity))
Lvl1dfStatus <- tibble(name = datSum$x, y = datSum$Quantity, drilldown = tolower(name))
#Second Tier
rowcheck <- dat[dat$x == input$ClickedInput,]
if (nrow(rowcheck)!=0){
datSum2 <- dat[dat$x == input$ClickedInput,]
datSum2 <- datSum2 %>%
group_by(y) %>%
summarize(Quantity = sum(a)
)
datSum2 <- arrange(datSum2,desc(Quantity))
Lvl2dfStatus <- tibble(name = datSum2$y,y = datSum2$Quantity, drilldown = tolower(name))
Lvl1ClickHardCoded <<- input$ClickedInput
Lvl1id <<- tolower(input$ClickedInput)
}
else{
Lvl2dfStatus <- data.table(Group.1=numeric(), x=numeric())
Lvl2dfStatus <- tibble(name = Lvl2dfStatus$Group.1,y = Lvl2dfStatus$x)
Lvl1id <- ""
}
#Third Tier
rowcheck <- dat[dat$x == Lvl1ClickHardCoded,]
rowcheck <- rowcheck[rowcheck$y == input$ClickedInput,]
if (nrow(rowcheck)!=0){
datSum2 <- dat[dat$x == Lvl1ClickHardCoded,]
datSum3 <- datSum2[datSum2$y == input$ClickedInput,]
datSum3 <- datSum3 %>%
group_by(z) %>%
summarize(Quantity = sum(a)
)
datSum3 <- arrange(datSum3,desc(Quantity))
Lvl3dfStatus <- tibble(name = datSum3$z,y = datSum3$Quantity)
Lvl2id <<- tolower(input$ClickedInput)
}
else{
Lvl3dfStatus <- data.table(Group.1=numeric(), x=numeric())
Lvl3dfStatus <- tibble(name = Lvl3dfStatus$Group.1,y = Lvl3dfStatus$x)
Lvl2id <- ""
}
#Graph
ClickedTest <- JS("function(event) {Shiny.onInputChange('ClickedInput', event.point.name);}")
highchart() %>%
hc_xAxis(type = "category") %>%
hc_add_series(Lvl1dfStatus, "column", hcaes(x = name, y = y), color = "#E4551F") %>%
hc_plotOptions(column = list(stacking = "normal",
events = list(click = ClickedTest))) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = list(
list(id = Lvl1id, type = "column", data = list_parse(Lvl2dfStatus)),
list(id = Lvl2id, type = "column", data = list_parse2(Lvl3dfStatus))
)
)
})
output$trial <- renderText({input$ClickedInput})
}
shinyApp(ui, server)
Your approach was kind of mislead by the click function. It is totally unnecessary, since (as can be seen in the non-shiny example) Highcharts has its own mechanisms to detect series clicks and can find and render drilldowns on its own.
You trying to catch the click event made the Highcharts chart building function re-render every time (resetting any drilldown) so you could not see any drilldown events at all.
The solution is to just copy your working Highcharts example into the renderHighchart function. You will immediately see that the "Farm" and "Sheep" dropdowns work.
I suppose that you were confusing yourself by using the terms "input" for the sublevel names as they are no input at all (in the shiny sense). What you have to do to get the drilldown working properly is to predefine the drilldown sets when you create the Highcharts chart. So you tell the Plugin in advance what drilldowns will be used and Highchart drills down only based on the IDs you specify.
I edited your code such that all the possible drilldowns are created in a loop and everything is working:
library (shinyjs)
library (tidyr)
library (data.table)
library (highcharter)
library (dplyr)
library (shinydashboard)
library (shiny)
x <- c("Farm","Farm","Farm","City","City","City","Ocean","Ocean")
y <- c("Sheep","Sheep","Cow","Car","Bus","Bus","Boat","Boat")
z <- c("Bill","Tracy","Sandy","Bob","Carl","Newt","Fig","Tony")
a <- c(1,1,1,1,1,1,1,1)
dat <- data.frame(x,y,z,a)
header <- dashboardHeader()
body <- dashboardBody(
highchartOutput("Working"),
verbatimTextOutput("trial")
)
sidebar <- dashboardSidebar()
ui <- dashboardPage(header, sidebar, body)
server <- function(input, output, session) {
output$Working <- renderHighchart({
#First Tier #Copied
datSum <- dat %>%
group_by(x) %>%
summarize(Quantity = sum(a)
)
datSum <- arrange(datSum,desc(Quantity))
Lvl1dfStatus <- tibble(name = datSum$x, y = datSum$Quantity, drilldown = tolower(name))
#Second Tier # Generalized to not use one single input
# Note: I am creating a list of Drilldown Definitions here.
Level_2_Drilldowns <- lapply(unique(dat$x), function(x_level) {
# x_level is what you called 'input' earlier.
datSum2 <- dat[dat$x == x_level,]
datSum2 <- datSum2 %>%
group_by(y) %>%
summarize(Quantity = sum(a)
)
datSum2 <- arrange(datSum2,desc(Quantity))
# Note: The "drilldown" variable has to be unique, this is why we use level 1 plus level 2 names.
Lvl2dfStatus <- tibble(name = datSum2$y,y = datSum2$Quantity, drilldown = tolower(paste(x_level, name, sep = "_")))
list(id = tolower(x_level), type = "column", data = list_parse(Lvl2dfStatus))
})
#Third Tier # Generalized through all of level 2
# Note: Again creating a list of Drilldown Definitions here.
Level_3_Drilldowns <- lapply(unique(dat$x), function(x_level) {
datSum2 <- dat[dat$x == x_level,]
lapply(unique(datSum2$y), function(y_level) {
datSum3 <- datSum2[datSum2$y == y_level,]
datSum3 <- datSum3 %>%
group_by(z) %>%
summarize(Quantity = sum(a)
)
datSum3 <- arrange(datSum3,desc(Quantity))
Lvl3dfStatus <- tibble(name = datSum3$z,y = datSum3$Quantity)
# Note: The id must match the one we specified above as "drilldown"
list(id = tolower(paste(x_level, y_level, sep = "_")), type = "column", data = list_parse2(Lvl3dfStatus))
})
}) %>% unlist(recursive = FALSE)
highchart() %>%
hc_xAxis(type = "category") %>%
hc_add_series(Lvl1dfStatus, "column", hcaes(x = name, y = y), color = "#E4551F") %>%
hc_plotOptions(column = list(stacking = "normal")) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series = c(Level_2_Drilldowns, Level_3_Drilldowns)
)
})
output$trial <- renderText({input$ClickedInput})
}
shinyApp(ui, server)
If for any reason, you should not be satisfied with collecting all drilldowns beforehand, there is an api for adding drilldowns on the fly. Try searching for Highcharts and "addSeriesAsDrilldown". I am not sure, however, if this is accessible outside of JavaScript.

Remove all instances of duplicate cells (not entire rows / columns) in a dataframe in R

I have a dataframe:
genes_1 = c("a","b","c","d","e")
genes_2 = c("f","g","c","e","j")
genes_3 = c("a","b","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
My desired output:
genes_1 = c("","","","d","")
genes_2 = c("f","g","","","j")
genes_3 = c("","","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
How can I achieve this?
Thanks
0-dependency base R solution:
data.frame(
genes_1 = c("a","b","c","d","e"),
genes_2 = c("f","g","c","e","j"),
genes_3 = c("a","b","m","n","o"),
stringsAsFactors = FALSE
) -> xdf
dups <- names(which(table(unlist(xdf, use.names = FALSE)) > 1))
xdf[] <- lapply(xdf, function(x) { x[x %in% dups] <- "" ; x })
xdf
unlist() recursively unwinds all the columns into a single character vector.
table() counts all occurrences of each element.
which() narrows down to only the ones which are TRUE
names() grabs the character select vector elements.
We then work by column to replace all occurrences in the vector that match with ""
library(microbenchmark)
library(data.table)
microbenchmark(
base = {
ydf <- xdf
dups <- names(which(table(unlist(ydf, use.names = FALSE)) > 1))
ydf[] <- lapply(ydf, function(x) { x[x %in% dups] <- "" ; x })
},
base.2 = {
ydf <- xdf
tmp <- unlist(ydf)
ydf[arrayInd(which(duplicated(tmp) | duplicated(tmp, fromLast = TRUE)), dim(ydf))] <- ""
},
tidyverse = {
ydf <- xdf
ydf %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID) -> ydf
},
data.table = {
ydt <- data.table(xdf)
ydt[,lapply(.SD, function(x) { x[x %in% dups] <- "" ; x })]
}
) %>%
{ print(.) ; . } %>%
autoplot()
Another base solution:
tmp <- unlist(df)
df[arrayInd(which(duplicated(tmp) | duplicated(tmp,fromLast=TRUE)), dim(df))] <- NA
# genes_1 genes_2 genes_3
#1 <NA> f <NA>
#2 <NA> g <NA>
#3 <NA> <NA> m
#4 d <NA> n
#5 <NA> j o
unlist just creates a long vector for all the values in df
arrayInd then creates a two-column row/column index for subsetting df for the duplicated values.
Here is a tidyverse solution. df2 is the final output.
library(tidyverse)
df2 <- df %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID)

Resources