extract a single column - r

I have a list of 701 given csv files. Each one has the same number of columns (7) but different number of rows (between 25000 and 28000).
Here is an extract of the first file:
Date,Week,Week Day,Hour,Price,Volume,Sale/Purchase
18/03/2011,11,5,1,-3000.00,17416,Sell
18/03/2011,11,5,1,-1001.10,17427,Sell
18/03/2011,11,5,1,-1000.00,18055,Sell
18/03/2011,11,5,1,-500.10,18057,Sell
18/03/2011,11,5,1,-500.00,18064,Sell
18/03/2011,11,5,1,-400.10,18066,Sell
18/03/2011,11,5,1,-400.00,18066,Sell
18/03/2011,11,5,1,-300.10,18068,Sell
18/03/2011,11,5,1,-300.00,18118,Sell
I made a nonlinear regression of the supply curve of the ninth hour for the year 2012. The datas for 2012 are in 290. to 654. csv files.
allenamen <- dir(pattern="*.csv")
alledat <- lapply(allenamen, read.csv, header = TRUE, sep = ",", stringsAsFactors = FALSE)
h <- list()
for(i in 290:654) {
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i-289]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i-289]] <- coef(f)
}
This works and I get the coefficients a, b, c and d for every day in 2012.
This is the head(h):
[[1]]
a b c d
2.513378e+03 4.668218e-02 -3.181322e+00 2.637142e+04
[[2]]
a b c d
2.803172e+03 6.696201e-02 -4.576432e+00 2.574454e+04
[[3]]
a b c d
3.298991e+03 5.817949e-02 -3.425728e+00 2.393888e+04
[[4]]
a b c d
2.150487e+03 3.810406e-02 -2.658772e+00 2.675609e+04
[[5]]
a b c d
2.326199e+03 3.044967e-02 -1.780965e+00 2.604374e+04
[[6]]
a b c d
2934.0193270 0.0302937 -1.9912913 26283.0300823
And this is dput(head(h)):
list(structure(c(2513.37818972349, 0.0466821822063123, -3.18132213466142,
26371.4241646124), .Names = c("a", "b", "c", "d")), structure(c(2803.17230054557,
0.0669620116294894, -4.57643230249848, 25744.5376725213), .Names = c("a",
"b", "c", "d")), structure(c(3298.99066895304, 0.0581794881246528,
-3.42572804902504, 23938.8754575156), .Names = c("a", "b", "c",
"d")), structure(c(2150.48734655237, 0.0381040636898022, -2.65877160023262,
26756.0907073567), .Names = c("a", "b", "c", "d")), structure(c(2326.19873555633,
0.0304496684589379, -1.7809654498454, 26043.735374657), .Names = c("a",
"b", "c", "d")), structure(c(2934.01932702805, 0.0302937043170001,
-1.99129130343521, 26283.0300823458), .Names = c("a", "b", "c",
"d")))
Now I am trying to get just a column with h$a but I get NULL. How can I get just the a column?
In addition to this I want to plot the single coefficients and Date. I tried this code:
koeffreihe <- function(x) {
files <- list.files(pattern="*.csv")
df <- data.frame()
for(i in 1:length(files)){
xx <- read.csv(as.character(files[i]))
xx <- subset(xx, Sale.Purchase == "Sell" & Hour == 3)
df <- rbind(df, xx)
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i]] <- coef(f)
}
df$Date <- as.Date(as.character(df$Date), format="%d/%m/%Y")
plot(h$x ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
}
koeffreihe(a)
But I get this error:
invalid type (NULL) for variable 'h$x'
So the problem is that h$a is NULL. If someone can fix this problem I guess the code will work too.
Thank you for your help!

First transform your list into a data.frame:
h.df <- setNames(do.call(rbind.data.frame, h), names(h[[1]]))
# a b c d
#1 2513.378 0.04668218 -3.181322 26371.42
#2 2803.172 0.06696201 -4.576432 25744.54
#3 3298.991 0.05817949 -3.425728 23938.88
#4 2150.487 0.03810406 -2.658772 26756.09
#5 2326.199 0.03044967 -1.780965 26043.74
#6 2934.019 0.03029370 -1.991291 26283.03
Then you can extract variables easily:
h.df$a
#[1] 2513.378 2803.172 3298.991 2150.487 2326.199 2934.019
Alternatively you can iterate over the list to extract the variable:
sapply(h, "[", "a")
# a a a a a a
#2513.378 2803.172 3298.991 2150.487 2326.199 2934.019

In this line, although x is a variable, h$x is looking for a column named x in h:
plot(h$x ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
You probably want h[[x]] instead.
From ?'[[':
x$name is equivalent to x[["name", exact = FALSE]].
That is, you are looking for a column literally named x.

Related

Can I somehow refer to a column by more than one name?

I am working with two data.frames which use different terminology. To keep the terminology of each data.frame intact, I am currently deliberating whether it would be an idea to simply add the columns to the other data.frame.
df_a <- data.frame(
A = c("a", "b", "c"),
B = c("a", "b", "c")
)
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c")
)
df_a <- cbind(df_a, df_b)
df_b <- cbind(df_b, df_a)
This will however become problematic as soon as I will start making changes to any of these columns. I was wondering if there is instead a way or even a trick, to refer to a column by more than one name. Obviously this does not work, but something like:
df_a <- data.frame(
A & same_as_A = c("a", "b", "c"),
B & same_as_B = c("a", "b", "c")
)
Where df_a$same_as_A is equal to df_a$A
"a" "b" "c"
You can derive your own superclass of data.frame, wrap [ and $, and handle aliases explicitly.
aliases <- function(x, ...) {
dots <- list(...)
stopifnot(!is.null(names(dots)), all(nzchar(names(dots))))
nms <- attr(x, "aliases")
attr(x, "aliases") <- c(nms[!names(nms) %in% names(dots)], dots)
if (class(x)[1] != "aliased_dataframe") {
class(x) <- c("aliased_dataframe", class(x))
}
x
}
`[.aliased_dataframe` <- function(x, i, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
`$.aliased_dataframe` <- function(x, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
`$<-.aliased_dataframe` <- function(x, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
Demo:
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c")
)
df_b[, "a"]
# Error in `[.data.frame`(df_b, , "a") : undefined columns selected
df_b$a
# NULL
df_b <- aliases(df_b, a="same_as_A", b="same_as_B")
df_b[, "a"]
# [1] "a" "b" "c"
df_b$a
# [1] "a" "b" "c"
df_b$a <- c("A","B","C")
df_b
# same_as_A same_as_B
# 1 A a
# 2 B b
# 3 C c
Coincidentally, this works with tbl_df as well, but sadly not with data.table variants.
library(tibble) # or dplyr
df_b <- tibble(df_b)
df_b[, "a"]
# Error in `stop_subscript()`:
# ! Can't subset columns that don't exist.
# x Column `a` doesn't exist.
# Run `rlang::last_error()` to see where the error occurred.
df_b$a
# Warning: Unknown or uninitialised column: `a`.
# NULL
df_b <- aliases(df_b, a="same_as_A", b="same_as_B")
df_b[, "a"]
# # A tibble: 3 x 1
# same_as_A
# <chr>
# 1 a
# 2 b
# 3 c
df_b$a
# [1] "a" "b" "c"
df_b$a <- c("A","B","C")
df_b
# # A tibble: 3 x 2
# same_as_A same_as_B
# <chr> <chr>
# 1 A a
# 2 B b
# 3 C c
I should note that this accommodates explicit use of j=, as in df_b[,"a"]; the shortcut of df_b["a"] is technically overloading the i= argument, and while the base [.data.frame is correctly inferring your intent, these S3 wrappers are not. It is not difficult to add that (just another conditional, perhaps starting with if (missing(j) && !missing(i) && is.character(i))), but for simplicity I"m keeping it out. Because of this, df_b["a"] fails.
Another note, I did not overload [[, so df_b[["a"]] returns NULL. If it's really important to you, one could adapt this methodology to do that as well.

Building a Tree with Node Pairs

I have a data.table of node pairs where Parent is higher up the tree than Child.
I need to extract all the individual chains from these rules e.g. if I have in format parent>child: (a>b, b>c, b>e, c>d), the chains are (a>b>c>d, a>b>e).
I've made an example with some dummy data showing what I want to do. Any suggestions on how to do this would be great? It feels like it should be straightforward but I'm struggling to think how to start. Thank you :)
library(data.table)
library(data.tree)
# example input and expected output
input <- data.table(Parent = c("a", "b", "c",
"e", "b"),
Child = c("b", "c", "d",
"b", "f"))
output <- data.table(Tree = c(rep(1,4), rep(2,3), rep(3,3), rep(4,4)),
List = c("a", "b", "c", "d",
"e", "b", "f",
"a", "b", "f",
"e", "b", "c", "d"),
Hierarchy = c(1:4, 1:3, 1:3, 1:4))
# attempt with data.tree, only builds the node pairs.
# ignore world part, was following: https://cran.r-project.org/web/packages/data.tree/vignettes/data.tree.html#tree-creation
input[, pathString := paste("world", Parent, Child, sep = "/")]
data.tree::as.Node(input)
# attempt to re-structure
input[, Tree := .I]
dt1 <- input[, .(List = c(Parent, Child),
Hierarchy = 1:2), by=Tree]
Here is another possible solution - a little messy as well though
Output
output(input)
# tree_nums elems hierarchy
# 1: 1 a 1
# 2: 1 b 2
# 3: 1 c 3
# 4: 1 d 4
# 5: 2 e 1
# 6: 2 b 2
# 7: 2 c 3
# 8: 2 d 4
# 9: 3 a 1
# 10: 3 b 2
# 11: 3 f 3
# 12: 4 e 1
# 13: 4 b 2
# 14: 4 f 3
#
Function
output <- function (input) {
# init
helper <- do.call(paste0, input)
elements <- unique(unlist(input))
res <- integer(length(elements))
ind <- elements %in% input$Child
# first generation
parents <- elements[!ind]
res[!ind] <- 1L
# later generations
val <- 1L
parents <- parents
trees <- setNames(as.list(seq_along(parents)), parents)
while (any(res == 0L)) {
val <- val + 1L
children <- unique(input$Child[input$Parent %in% parents])
res[elements %in% children] <- val
# create the tree
nextHelper <- expand.grid(parents, children)
nextHelper$conc <- do.call(paste0, nextHelper)
nextHelper <- nextHelper[nextHelper$conc %in% helper,]
df_1 <- do.call(rbind, strsplit(names(trees),''))
df_2 <- base::merge(df_1, nextHelper[,-3L], by.x = ncol(df_1), by.y = 'Var1', all.x = TRUE)
n1 <- ncol(df_2)
if (n1 > 2L) df_2 <- df_2[,c(2:(n1-1),1L,n1)]
df_2$Var2 <- as.character(df_2$Var2)
df_2$Var2[is.na(df_2$Var2)] <- ''
treeNames <- do.call(paste0, df_2)
trees <- setNames(as.list(seq_along(treeNames)), treeNames)
parents <- children
}
elems <- strsplit(names(trees),'')
tree_nums <- rep(as.integer(trees), lengths(elems))
elems <- unlist(elems)
output <- data.table::data.table(tree_nums,elems)
out <- data.table::data.table(elements, res)
output$hierarchy <- out$res[match(output$elems, out$elements)]
output
}
I have a solution after a bit of a slog, but would prefer something more efficient if it exists.
library(stringi)
# convert to string
setkey(input, Parent)
sep <- ">>"
split_regex <- "(?<=%1$s)[^(%1$s)]*$"
trees <- sprintf("%s%s%s", input$Parent, sep, input$Child)
# get the base nodes, the children
children <- stri_extract_first_regex(trees, sprintf(split_regex, sep),
simplify = TRUE)
# find that which are parents
grid <- input[J(unique(children)), ][!is.na(Child), ]
update <- unique(grid$Parent)
N <- nrow(grid)
while(N > 0){
# add the children on for the ones at the base of the chains, might mean
# making more tree splits
all_trees <- unique(unlist(lapply(update, function(x){
pos <- children == x
y <- grid[Parent %in% x, Child]
trees <- c(trees[!pos], CJ(trees[pos], y)[, sprintf("%s%s%s", V1, sep, V2)])
trees
})))
# I have some trees embedded now, so remove these ones
trim <- sapply(seq_along(all_trees), function(i){
any(stri_detect_fixed(all_trees[-i], all_trees[i]))
})
trees <- all_trees[!trim]
# update operations on expanded trees until no children remain with a dependency
children <- stri_extract_first_regex(trees, sprintf(split_regex, sep, sep),
simplify = TRUE)
grid <- input[J(unique(children)), ][!is.na(Child), ]
update <- unique(grid$Parent)
N <- nrow(grid)
}
# re-structure to appropriate format
output <- data.table(pattern = trees)
output[, Tree := 1:.N]
output[, split := stri_split_regex(pattern, sep)]
output <- output[, .(List = split[[1]],
Hierarchy = 1:length(split[[1]])), by=Tree]
output[]

How to index and multiply two matrices efficiently?

I have two matrices "A", "B", and a data frame "C". They are
A <- matrix(1:10, nrow = 2)
colnames(A) <- letters[1:5]
B <- matrix(11:16, nrow = 2)
colnames(B) <- letters[6:8]
C <- data.frame(ix1 = c("a", "d"), ix2 = c("f", "h"))
I want to create a vector "vec" with length 2 and values
vec[1] = A[,"a"] %*% B[,"f"]
vec[2] = A[,"d"] %*% B[,"h"]
This can be done easily with a for loop, but it is time consuming when the sizes of "A", "B" and "C" grow. How to do it efficiently?
You can vectorize as follows, but I'm not sure how costly it would be to transpose A
(vec <- diag(crossprod(A[, as.character(C$ix1)], B[, as.character(C$ix2)])))
## [1] 35 233
You can use mapply:
vec = mapply(function(u,v) A[,u]%*%B[,v], c('a','d'), c('f','h'))
If you want to use your data.frame C:
vec = mapply(function(u,v) A[,u]%*%B[,v], as.character(C[,1]), as.character(C[,2]))
# a d
# 35 233
What really matters is the number of line of C, the number of lines in A and B might not be the bottleneck:
v1=rnorm(1000000)
v2=rnorm(1000000)
#> system.time(v1%*%v2)
# user system elapsed
# 0.01 0.00 0.02

Recode dataframe based on one column - in reverse

I asked this question a while ago (Recode dataframe based on one column) and the answer worked perfectly. Now however, i almost want to do the reverse. Namely, I have a (700k * 2000) of 0/1/2 or NA. In a separate dataframe I have two columns (Ref and Obs). The 0 corresponds to two instances of Ref, 1 is one instance of Ref and one instance of Obs and 2 is two Obs. To clarify, data snippet:
Genotype File ---
Ref Obs
A G
T C
G C
Ref <- c("A", "T", "G")
Obs <- c("G", "C", "C")
Current Data---
Sample.1 Sample.2 .... Sample.2000
0 1 2
0 0 0
0 NA 1
mat <- matrix(nrow=3, ncol=3)
mat[,1] <- c(0,0,0)
mat[,2] <- c(1,0,NA)
mat[,3] <- c(2,0,1)
Desired Data format---
Sample.1 Sample.1 Sample.2 Sample.2 Sample.2000 Sample.2000
A A A G G G
T T T T T T
G G 0 0 G C
I think that's right. The desired data format has two columns (space separated) for each sample. 0 in this format (plink ped file for the bioinformaticians out there) is missing data.
MAJOR ASSUMPTION: your data is in 3 element frames, i.e. you want to apply your mapping to the first 3 rows, then the next 3, and so on, which I think makes sense given DNA frames. If you want a rolling 3 element window this will not work (but code can be modified to make it work). This will work for an arbitrary number of columns, and arbitrary number of 3 row groups:
# Make up a matrix with your properties (4 cols, 6 rows)
col <- 4L
frame <- 3L
mat <- matrix(sample(c(0:2, NA_integer_), 2 * frame * col, replace=T), ncol=col)
# Mapping data
Ref <- c("A", "T", "G")
Obs <- c("G", "C", "C")
map.base <- cbind(Ref, Obs)
num.to.let <- matrix(c(1, 1, 1, 2, 2, 2), byrow=T, ncol=2) # how many from each of ref obs
# Function to map 0,1,2,NA to Ref/Obs
re_map <- function(mat.small) { # 3 row matrices, with col columns
t(
mapply( # iterate through each row in matrix
function(vals, map, num.to.let) {
vals.2 <- unlist(lapply(vals, function(x) map[num.to.let[x + 1L, ]]))
ifelse(is.na(vals.2), 0, vals.2)
},
vals=split(mat.small, row(mat.small)), # a row
map=split(map.base, row(map.base)), # the mapping for that row
MoreArgs=list(num.to.let=num.to.let) # general conversion of number to Obs/Ref
) )
}
# Split input data frame into 3 row matrices (assumes frame size 3),
# and apply mapping function to each group
mat.split <- split.data.frame(mat, sort(rep(1:(nrow(mat) / frame), frame)))
mat.res <- do.call(rbind, lapply(mat.split, re_map))
colnames(mat.res) <- paste0("Sample.", rep(1:ncol(mat), each=2))
print(mat.res, quote=FALSE)
# Sample.1 Sample.1 Sample.2 Sample.2 Sample.3 Sample.3 Sample.4 Sample.4
# 1 G G A G G G G G
# 2 C C 0 0 T C T C
# 3 0 0 G C G G G G
# 1 A A A A A G A A
# 2 C C C C T C C C
# 3 C C G G 0 0 0 0
I am not sure but this could be what you need:
first same simple data
geno <- data.frame(Ref = c("A", "T", "G"), Obs = c("G", "C", "C"))
data <- data.frame(s1 = c(0,0,0),s2 = c(1, 0, NA))
then a couple of functions:
f <- function(i , x, geno){
x <- x[i]
if(!is.na(x)){
if (x == 0) {y <- geno[i , c(1,1)]}
if (x == 1) {y <- geno[i, c(1,2)]}
if (x == 2) {y <- geno[i, c(2,2)]}
}
else y <- c(0,0)
names(y) <- c("s1", "s2")
y
}
g <- function(x, geno){
Reduce(rbind, lapply(1:length(x), FUN = f , x = x, geno = geno))
}
The way f() is defined may not be the most elegant but it does the job
Then simply run it as a doble for loop in a lapply fashion
as.data.frame(Reduce(cbind, lapply(data , g , geno = geno )))
hope it helps
Here's one way based on the sample data in your answer:
# create index
idx <- lapply(data, function(x) cbind((x > 1) + 1, (x > 0) + 1))
# list of matrices
lst <- lapply(idx, function(x) {
tmp <- apply(x, 2, function(y) geno[cbind(seq_along(y), y)])
replace(tmp, is.na(tmp), 0)
})
# one data frame
as.data.frame(lst)
# s1.1 s1.2 s2.1 s2.2
# 1 A A A G
# 2 T T T T
# 3 G G 0 0

R make.unique starting in 1

I have a data frame with columns that are in groups of 4 like so:
a b c d a b c d a b c d a b c d...
Then, I use the function rep to create tags for the columns:
rep(c("a", "b", "c", "d"), len=ncol)
Finally I use the function make.unique to create the tags:
a b c d a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3...
However, I would like to get:
a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4...
Is there an easy way to accomplish this? In the make.unique documentation does not mention any parameters to obtain this behaviour.
Here is a further variant. Applying the function make.unique.2 by #adn.bps can still produces some duplicates:
> u = c("a", "a", "b", "c", "c", "d", "c", "a.1")
> make.unique.2(u)
[1] "a.1" "a.2" "b" "c.1" "c.2" "d" "c.3" "a.1"
To avoid that, I've done:
dotify <- function(x, avoid){
l <- length(x)
if(l == 1L){
return(x)
}
numbers <- 1L:l
out <- paste0(x, ".", numbers)
ndots <- 1L
while(any(out %in% avoid)){
ndots <- ndots + 1L
out <- paste0(x, paste0(rep(".", ndots), collapse = ""), numbers)
}
out
}
make.unique2 <- function(x){
if(anyDuplicated(x)){
splt <- split(x, x)
u <- names(splt)
for(i in 1L:length(splt)){
splt_i <- splt[[i]]
j <- match(splt_i[1L], u)
avoid <- u[-j]
splt_i_new <- dotify(splt_i, avoid)
u <- c(avoid, splt_i_new)
splt[[i]] <- splt_i_new
}
x <- unsplit(splt, x)
}
x
}
make.unique2(u)
# [1] "a..1" "a..2" "b" "c.1" "c.2" "d" "c.3" "a.1"
make.unique.2 = function(x, sep='.'){
ave(x, x, FUN=function(a){if(length(a) > 1){paste(a, 1:length(a), sep=sep)} else {a}})
}
Testing against your example:
> u = rep(c("a", "b", "c", "d"), 4)
> make.unique.2(u)
[1] "a.1" "b.1" "c.1" "d.1" "a.2" "b.2" "c.2" "d.2" "a.3" "b.3" "c.3" "d.3"
[13] "a.4" "b.4" "c.4" "d.4"
If an element is not duplicated, it is left alone:
> u = c('a', 'a', 'b', 'c', 'c', 'c', 'd')
> make.unique.2(u)
[1] "a.1" "a.2" "b" "c.1" "c.2" "c.3" "d"
Wouldn't call this pretty, but it does the job:
> ncol <- 10
> apply(expand.grid(c("a","b","c","d"),1:((ncol+3)/4)), 1,
+ function(x)paste(x,collapse=""))[1:ncol]
[1] "a1" "b1" "c1" "d1" "a2" "b2" "c2" "d2" "a3" "b3"
where ncol is the number of tags to generate.
n <- 4
ncol <- 16
paste(letters[seq(n)], rep(seq(ncol/n), each = n, len = ncol), sep = "")
TLDR
# install.packages('makeunique')
library(makeunique)
# Simple use case
simple_input <- c("a", "b", "c", "d", "a", "b", "c", "d")
make_unique(simple_input, sep = "", wrap_in_brackets = FALSE)
Full answer
Disclaimer: I'm the author of the makeunique package
#Stéphane Laurent's answer is great! But I thought it might help future users to provide a similar but slightly more easily customisable version that I packaged up for my particular use case. Functionally the code below differs from #Stephane's answer in that It throws an informative error if the appended de-duplicating numbers lead to creation of an element that was already in your starting vector. You can then customise the separator or change whether numbers are wrapped in brackets to eliminate the problem. This lets you create unique datasets with more consistent, prettier suffixes
Option 1 (makeunique package)
library(makeunique)
# Simple use case
simple_input <- c("a", "b", "c", "d", "a", "b", "c", "d")
make_unique(simple_input, sep = "", wrap_in_brackets = FALSE)
#> [1] "a1" "b1" "c1" "d1" "a2" "b2" "c2" "d2"
# A harder case which #Stephane highlighted
# make_unique will throw an error instead of automatically fixing, so the user can choose how to resolve
difficult_input <- c("a", "b", "c", "d", "a", "b", "c", "d", "d2")
make_unique(difficult_input, sep = "", wrap_in_brackets = FALSE)
#> Error in make_unique(difficult_input, sep = "", wrap_in_brackets = FALSE): make_unique failed to make vector unique.
#> This is because appending ' <dup_number>' to duplicate values led tocreation of term(s) that were in the original dataset:
#> [d2]
#>
#> Please try again with a different argument for either `wrap_in_brackets` or `sep`
# Fix by using '-' as a separator
make_unique(difficult_input, sep = "-", wrap_in_brackets = FALSE)
#> [1] "a-1" "b-1" "c-1" "d-1" "a-2" "b-2" "c-2" "d-2" "d2"
Created on 2022-10-14 by the reprex package (v2.0.1)
Option 2: Use a custom function
Instead of using the package, feel free to just use the source function in your own code
make_unique <- function(x, sep = " ", wrap_in_brackets = TRUE, warn_about_type_conversion = TRUE){
if(!(is.character(sep) & length(sep) == 1)) stop('`sep` must be a string, not a ', paste0(class(sep), collapse = " "))
if(!(is.logical(wrap_in_brackets) & length(wrap_in_brackets) == 1)) stop('`wrap_in_brackets` must be a flag, not a ', paste0(class(wrap_in_brackets), collapse = " "))
if(!(is.logical(warn_about_type_conversion) & length(warn_about_type_conversion) == 1)) stop('`warn_about_type_conversion` must be a flag, not a ', paste0(class(warn_about_type_conversion), collapse = " "))
if(!any(is.numeric(x),is.character(x),is.factor(x))) stop('input to `make_unique` must be a character, numeric, or factor variable')
if(is.factor(x)) {
if(warn_about_type_conversion) warning('make_unique: Converting factor to character variable')
x <- as.character(x)
}
else if(is.numeric(x)) {
if(warn_about_type_conversion) warning('make_unique: Converting numeric variable to a character vector')
x <- as.character(x)
}
deduplicated = stats::ave(x, x, FUN = function(a){
if(length(a) > 1){
suffixes <- seq_along(a)
if(wrap_in_brackets) suffixes <- paste0('(', suffixes, ')')
paste0(a, sep, suffixes)
}
else {a}
})
values_still_duplicated <- deduplicated[duplicated(deduplicated)]
if(length(stats::na.omit(values_still_duplicated)) > 0){
stop(
"make_unique failed to make vector unique.\n",
"This is because appending ' <dup_number>' to duplicate values led to",
"creation of term(s) that were in the original dataset: \n[",
paste0(values_still_duplicated, collapse = ', '),
"]\n\nPlease try again with a different argument for either `wrap_in_brackets` or `sep`"
)
}
return(deduplicated)
}

Resources