R make.unique starting in 1

R make.unique starting in 1 - r

I have a data frame with columns that are in groups of 4 like so:
a b c d a b c d a b c d a b c d...
Then, I use the function rep to create tags for the columns:
rep(c("a", "b", "c", "d"), len=ncol)
Finally I use the function make.unique to create the tags:
a b c d a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3...
However, I would like to get:
a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4...
Is there an easy way to accomplish this? In the make.unique documentation does not mention any parameters to obtain this behaviour.

Here is a further variant. Applying the function make.unique.2 by #adn.bps can still produces some duplicates:
> u = c("a", "a", "b", "c", "c", "d", "c", "a.1")
> make.unique.2(u)
[1] "a.1" "a.2" "b" "c.1" "c.2" "d" "c.3" "a.1"
To avoid that, I've done:
dotify <- function(x, avoid){
l <- length(x)
if(l == 1L){
return(x)
}
numbers <- 1L:l
out <- paste0(x, ".", numbers)
ndots <- 1L
while(any(out %in% avoid)){
ndots <- ndots + 1L
out <- paste0(x, paste0(rep(".", ndots), collapse = ""), numbers)
}
out
}
make.unique2 <- function(x){
if(anyDuplicated(x)){
splt <- split(x, x)
u <- names(splt)
for(i in 1L:length(splt)){
splt_i <- splt[[i]]
j <- match(splt_i[1L], u)
avoid <- u[-j]
splt_i_new <- dotify(splt_i, avoid)
u <- c(avoid, splt_i_new)
splt[[i]] <- splt_i_new
}
x <- unsplit(splt, x)
}
x
}
make.unique2(u)
# [1] "a..1" "a..2" "b" "c.1" "c.2" "d" "c.3" "a.1"

make.unique.2 = function(x, sep='.'){
ave(x, x, FUN=function(a){if(length(a) > 1){paste(a, 1:length(a), sep=sep)} else {a}})
}
Testing against your example:
> u = rep(c("a", "b", "c", "d"), 4)
> make.unique.2(u)
[1] "a.1" "b.1" "c.1" "d.1" "a.2" "b.2" "c.2" "d.2" "a.3" "b.3" "c.3" "d.3"
[13] "a.4" "b.4" "c.4" "d.4"
If an element is not duplicated, it is left alone:
> u = c('a', 'a', 'b', 'c', 'c', 'c', 'd')
> make.unique.2(u)
[1] "a.1" "a.2" "b" "c.1" "c.2" "c.3" "d"

Wouldn't call this pretty, but it does the job:
> ncol <- 10
> apply(expand.grid(c("a","b","c","d"),1:((ncol+3)/4)), 1,
+ function(x)paste(x,collapse=""))[1:ncol]
[1] "a1" "b1" "c1" "d1" "a2" "b2" "c2" "d2" "a3" "b3"
where ncol is the number of tags to generate.

n <- 4
ncol <- 16
paste(letters[seq(n)], rep(seq(ncol/n), each = n, len = ncol), sep = "")

TLDR
# install.packages('makeunique')
library(makeunique)
# Simple use case
simple_input <- c("a", "b", "c", "d", "a", "b", "c", "d")
make_unique(simple_input, sep = "", wrap_in_brackets = FALSE)
Full answer
Disclaimer: I'm the author of the makeunique package
#Stéphane Laurent's answer is great! But I thought it might help future users to provide a similar but slightly more easily customisable version that I packaged up for my particular use case. Functionally the code below differs from #Stephane's answer in that It throws an informative error if the appended de-duplicating numbers lead to creation of an element that was already in your starting vector. You can then customise the separator or change whether numbers are wrapped in brackets to eliminate the problem. This lets you create unique datasets with more consistent, prettier suffixes
Option 1 (makeunique package)
library(makeunique)
# Simple use case
simple_input <- c("a", "b", "c", "d", "a", "b", "c", "d")
make_unique(simple_input, sep = "", wrap_in_brackets = FALSE)
#> [1] "a1" "b1" "c1" "d1" "a2" "b2" "c2" "d2"
# A harder case which #Stephane highlighted
# make_unique will throw an error instead of automatically fixing, so the user can choose how to resolve
difficult_input <- c("a", "b", "c", "d", "a", "b", "c", "d", "d2")
make_unique(difficult_input, sep = "", wrap_in_brackets = FALSE)
#> Error in make_unique(difficult_input, sep = "", wrap_in_brackets = FALSE): make_unique failed to make vector unique.
#> This is because appending ' <dup_number>' to duplicate values led tocreation of term(s) that were in the original dataset:
#> [d2]
#>
#> Please try again with a different argument for either `wrap_in_brackets` or `sep`
# Fix by using '-' as a separator
make_unique(difficult_input, sep = "-", wrap_in_brackets = FALSE)
#> [1] "a-1" "b-1" "c-1" "d-1" "a-2" "b-2" "c-2" "d-2" "d2"
Created on 2022-10-14 by the reprex package (v2.0.1)
Option 2: Use a custom function
Instead of using the package, feel free to just use the source function in your own code
make_unique <- function(x, sep = " ", wrap_in_brackets = TRUE, warn_about_type_conversion = TRUE){
if(!(is.character(sep) & length(sep) == 1)) stop('`sep` must be a string, not a ', paste0(class(sep), collapse = " "))
if(!(is.logical(wrap_in_brackets) & length(wrap_in_brackets) == 1)) stop('`wrap_in_brackets` must be a flag, not a ', paste0(class(wrap_in_brackets), collapse = " "))
if(!(is.logical(warn_about_type_conversion) & length(warn_about_type_conversion) == 1)) stop('`warn_about_type_conversion` must be a flag, not a ', paste0(class(warn_about_type_conversion), collapse = " "))
if(!any(is.numeric(x),is.character(x),is.factor(x))) stop('input to `make_unique` must be a character, numeric, or factor variable')
if(is.factor(x)) {
if(warn_about_type_conversion) warning('make_unique: Converting factor to character variable')
x <- as.character(x)
}
else if(is.numeric(x)) {
if(warn_about_type_conversion) warning('make_unique: Converting numeric variable to a character vector')
x <- as.character(x)
}
deduplicated = stats::ave(x, x, FUN = function(a){
if(length(a) > 1){
suffixes <- seq_along(a)
if(wrap_in_brackets) suffixes <- paste0('(', suffixes, ')')
paste0(a, sep, suffixes)
}
else {a}
})
values_still_duplicated <- deduplicated[duplicated(deduplicated)]
if(length(stats::na.omit(values_still_duplicated)) > 0){
stop(
"make_unique failed to make vector unique.\n",
"This is because appending ' <dup_number>' to duplicate values led to",
"creation of term(s) that were in the original dataset: \n[",
paste0(values_still_duplicated, collapse = ', '),
"]\n\nPlease try again with a different argument for either `wrap_in_brackets` or `sep`"
)
}
return(deduplicated)
}

Related

Subset a vector and retrieve first elements if exceed the length in R

Imagine I have a vector like this one:
c("A", "B", "C", "D")
there are 4 positions. If I make a sample with size 1 I can get 1, 2, 3 or 4. What I want is to be able to subset of length 3 of that vector according its order, for example, if I get 2:
c("B", "C", "D")
If I get 3:
c("C", "D", "A")
If I get 4:
c("D","A", "B")
So that's the logic, the vector is sorted and the last elements connects with the first element when I subset.

Using seq, f gives you the desired subset for a specified vector v, of which you would like to subset l elements with a starting point at the nth position.
f <- function(v, n, l) v[seq(n - 1, length.out = l) %% length(v) + 1]
output
f(v, n = 4, l = 3)
#[1] "D" "A" "B"
f(v, n = 3, l = 4)
#[1] "C" "D" "A" "B"
f(v, n = 2, l = 5)
#[1] "B" "C" "D" "A" "B"

I think I got it!
v <- c("A", "B", "C", "D")
p <- sample(1:length(v), 1)
r <- c(v[p:length(v)])
c(r, v[!(v %in% r)])[1:3]
And the outputs:
v <- c("A", "B", "C", "D") # your vector
r <- c(v[2:length(v)])
c(r, v[!(v %in% r)])[1:3]
#> [1] "B" "C" "D"
r <- c(v[3:length(v)])
c(r, v[!(v %in% r)])[1:3]
#> [1] "C" "D" "A"
r <- c(v[4:length(v)])
c(r, v[!(v %in% r)])[1:3]
#> [1] "D" "A" "B"
Created on 2022-05-16 by the reprex package (v2.0.1)
Wrapped in a function:
f <- function(v, nth) {
r <- c(v[nth:length(v)])
return(c(r, v[!(v %in% r)])[1:3])
}

Can I somehow refer to a column by more than one name?

I am working with two data.frames which use different terminology. To keep the terminology of each data.frame intact, I am currently deliberating whether it would be an idea to simply add the columns to the other data.frame.
df_a <- data.frame(
A = c("a", "b", "c"),
B = c("a", "b", "c")
)
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c")
)
df_a <- cbind(df_a, df_b)
df_b <- cbind(df_b, df_a)
This will however become problematic as soon as I will start making changes to any of these columns. I was wondering if there is instead a way or even a trick, to refer to a column by more than one name. Obviously this does not work, but something like:
df_a <- data.frame(
A & same_as_A = c("a", "b", "c"),
B & same_as_B = c("a", "b", "c")
)
Where df_a$same_as_A is equal to df_a$A
"a" "b" "c"

You can derive your own superclass of data.frame, wrap [ and $, and handle aliases explicitly.
aliases <- function(x, ...) {
dots <- list(...)
stopifnot(!is.null(names(dots)), all(nzchar(names(dots))))
nms <- attr(x, "aliases")
attr(x, "aliases") <- c(nms[!names(nms) %in% names(dots)], dots)
if (class(x)[1] != "aliased_dataframe") {
class(x) <- c("aliased_dataframe", class(x))
}
x
}
`[.aliased_dataframe` <- function(x, i, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
`$.aliased_dataframe` <- function(x, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
`$<-.aliased_dataframe` <- function(x, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
Demo:
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c")
)
df_b[, "a"]
# Error in `[.data.frame`(df_b, , "a") : undefined columns selected
df_b$a
# NULL
df_b <- aliases(df_b, a="same_as_A", b="same_as_B")
df_b[, "a"]
# [1] "a" "b" "c"
df_b$a
# [1] "a" "b" "c"
df_b$a <- c("A","B","C")
df_b
# same_as_A same_as_B
# 1 A a
# 2 B b
# 3 C c
Coincidentally, this works with tbl_df as well, but sadly not with data.table variants.
library(tibble) # or dplyr
df_b <- tibble(df_b)
df_b[, "a"]
# Error in `stop_subscript()`:
# ! Can't subset columns that don't exist.
# x Column `a` doesn't exist.
# Run `rlang::last_error()` to see where the error occurred.
df_b$a
# Warning: Unknown or uninitialised column: `a`.
# NULL
df_b <- aliases(df_b, a="same_as_A", b="same_as_B")
df_b[, "a"]
# # A tibble: 3 x 1
# same_as_A
# <chr>
# 1 a
# 2 b
# 3 c
df_b$a
# [1] "a" "b" "c"
df_b$a <- c("A","B","C")
df_b
# # A tibble: 3 x 2
# same_as_A same_as_B
# <chr> <chr>
# 1 A a
# 2 B b
# 3 C c
I should note that this accommodates explicit use of j=, as in df_b[,"a"]; the shortcut of df_b["a"] is technically overloading the i= argument, and while the base [.data.frame is correctly inferring your intent, these S3 wrappers are not. It is not difficult to add that (just another conditional, perhaps starting with if (missing(j) && !missing(i) && is.character(i))), but for simplicity I"m keeping it out. Because of this, df_b["a"] fails.
Another note, I did not overload [[, so df_b[["a"]] returns NULL. If it's really important to you, one could adapt this methodology to do that as well.

RStudio keyboard shortcut to turn strings into objects and vice versa? (i.e. remove or add quotes around names)

Is there a keyboard shortcut for converting something like rm("a", "b", "c", "d") into rm(a, b, c, d)?

Not an RStudio shortcut but you can do, ctrl+F, check the Regex box and replace \"(.*?)\" by \1
Coming back at it later, here are two functions that operate in both directions,
You could build an addin from those and trigger the execution with hotkeys
quote_vars <- function(expr) {
expr <- substitute(expr)
vars <- all.vars(expr)
vars <- setNames(as.list(vars), vars)
do.call(substitute, list(expr, vars))
}
unquote_strings <- function(expr) {
expr <- deparse(substitute(expr))
expr <- gsub("\"(.*?)\"", "\\1", expr)
parse(text= expr)[[1]]
}
quote_vars(rm(a, b, c, d))
#> rm("a", "b", "c", "d")
unquote_strings(rm("a", "b", "c", "d"))
#> rm(a, b, c, d)
Created on 2019-07-05 by the reprex package (v0.3.0)

For viceversa conversions, an option is
f1 <- function(...) {
v1 <- rlang::enexprs(...)
if(is.character(v1[[1]])) {
rlang::syms(v1)
} else purrr::map(v1, ~ rlang::as_name(.x))
}
-testing
f1("a", "b", "c", "d") # changes to symbol
#[[1]]
#a
#[[2]]
#b
#[[3]]
#c
#[[4]]
#d
f1(a, b, c, d) # changes to character
#[[1]]
#[1] "a"
#[[2]]
#[1] "b"
#[[3]]
#[1] "c"
#[[4]]
#[1] "d"
NOTE: Returning a list to have consistent behavior
With rm, we can use do.call
out <- f1("a", "b", "c", "d")
do.call("rm", out)
a
#Error: object 'a' not found
b
#Error: object 'b' not found
data
a <- b <- c <-d <- 1:5

Test if string in n positions of alphabet

As the title suggests, I am looking for an elegant* way to test whether a character is in the first n positions in the alphabet.
So, for a character vector as follows:
names <- c("Brian", "Cormac", "Zachariah")
And with n <- 6
It would return:
TRUE','TRUE', 'FALSE'
*I am aware that I can use substr(names,1,1) %in% c("A", "B", "C", "D", "E", "F"), but I was hoping for a better solution.
EDIT: What I mean by position in the alphabet is whether the first letter is in the first n letters in alphabetical order. So, "A" is in the first n = 1+, "B" is in the first n =2+, "Y" in the first n=25 letters, etc.

PoGibas comment seems to have as elegant as it gets. Next step would be wrapping it in a function:
cht6_pog <- function(string) {
x <- toupper(substring(string, 1, 1)) %in% LETTERS[1:6]
names(x) <- string
x
}
cht6_pog(names)
Brian Cormac Zachariah
TRUE TRUE FALSE

Here is my answer for your question.
# fun:
check_char <- function(string, start_n, end_n, char_pattern)
{
str_list <- strsplit(substr(string, start_n, end_n), "")
out <- sapply(str_list, function(x) any(tolower(x) %in% tolower(char_pattern)))
return(out)
}
# args:
str_vec <- c("Google", "Facebook", "Amazon")
str_n <- 1
end_n <- 4
char <- LETTERS[1:6]
# run:
out <- check_char(str_vec, str_n, end_n, char)
print(out)
# [1] FALSE TRUE TRUE

extract a single column

I have a list of 701 given csv files. Each one has the same number of columns (7) but different number of rows (between 25000 and 28000).
Here is an extract of the first file:
Date,Week,Week Day,Hour,Price,Volume,Sale/Purchase
18/03/2011,11,5,1,-3000.00,17416,Sell
18/03/2011,11,5,1,-1001.10,17427,Sell
18/03/2011,11,5,1,-1000.00,18055,Sell
18/03/2011,11,5,1,-500.10,18057,Sell
18/03/2011,11,5,1,-500.00,18064,Sell
18/03/2011,11,5,1,-400.10,18066,Sell
18/03/2011,11,5,1,-400.00,18066,Sell
18/03/2011,11,5,1,-300.10,18068,Sell
18/03/2011,11,5,1,-300.00,18118,Sell
I made a nonlinear regression of the supply curve of the ninth hour for the year 2012. The datas for 2012 are in 290. to 654. csv files.
allenamen <- dir(pattern="*.csv")
alledat <- lapply(allenamen, read.csv, header = TRUE, sep = ",", stringsAsFactors = FALSE)
h <- list()
for(i in 290:654) {
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i-289]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i-289]] <- coef(f)
}
This works and I get the coefficients a, b, c and d for every day in 2012.
This is the head(h):
[[1]]
a b c d
2.513378e+03 4.668218e-02 -3.181322e+00 2.637142e+04
[[2]]
a b c d
2.803172e+03 6.696201e-02 -4.576432e+00 2.574454e+04
[[3]]
a b c d
3.298991e+03 5.817949e-02 -3.425728e+00 2.393888e+04
[[4]]
a b c d
2.150487e+03 3.810406e-02 -2.658772e+00 2.675609e+04
[[5]]
a b c d
2.326199e+03 3.044967e-02 -1.780965e+00 2.604374e+04
[[6]]
a b c d
2934.0193270 0.0302937 -1.9912913 26283.0300823
And this is dput(head(h)):
list(structure(c(2513.37818972349, 0.0466821822063123, -3.18132213466142,
26371.4241646124), .Names = c("a", "b", "c", "d")), structure(c(2803.17230054557,
0.0669620116294894, -4.57643230249848, 25744.5376725213), .Names = c("a",
"b", "c", "d")), structure(c(3298.99066895304, 0.0581794881246528,
-3.42572804902504, 23938.8754575156), .Names = c("a", "b", "c",
"d")), structure(c(2150.48734655237, 0.0381040636898022, -2.65877160023262,
26756.0907073567), .Names = c("a", "b", "c", "d")), structure(c(2326.19873555633,
0.0304496684589379, -1.7809654498454, 26043.735374657), .Names = c("a",
"b", "c", "d")), structure(c(2934.01932702805, 0.0302937043170001,
-1.99129130343521, 26283.0300823458), .Names = c("a", "b", "c",
"d")))
Now I am trying to get just a column with h$a but I get NULL. How can I get just the a column?
In addition to this I want to plot the single coefficients and Date. I tried this code:
koeffreihe <- function(x) {
files <- list.files(pattern="*.csv")
df <- data.frame()
for(i in 1:length(files)){
xx <- read.csv(as.character(files[i]))
xx <- subset(xx, Sale.Purchase == "Sell" & Hour == 3)
df <- rbind(df, xx)
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i]] <- coef(f)
}
df$Date <- as.Date(as.character(df$Date), format="%d/%m/%Y")
plot(h$x ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
}
koeffreihe(a)
But I get this error:
invalid type (NULL) for variable 'h$x'
So the problem is that h$a is NULL. If someone can fix this problem I guess the code will work too.
Thank you for your help!

First transform your list into a data.frame:
h.df <- setNames(do.call(rbind.data.frame, h), names(h[[1]]))
# a b c d
#1 2513.378 0.04668218 -3.181322 26371.42
#2 2803.172 0.06696201 -4.576432 25744.54
#3 3298.991 0.05817949 -3.425728 23938.88
#4 2150.487 0.03810406 -2.658772 26756.09
#5 2326.199 0.03044967 -1.780965 26043.74
#6 2934.019 0.03029370 -1.991291 26283.03
Then you can extract variables easily:
h.df$a
#[1] 2513.378 2803.172 3298.991 2150.487 2326.199 2934.019
Alternatively you can iterate over the list to extract the variable:
sapply(h, "[", "a")
# a a a a a a
#2513.378 2803.172 3298.991 2150.487 2326.199 2934.019

In this line, although x is a variable, h$x is looking for a column named x in h:
plot(h$x ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
You probably want h[[x]] instead.
From ?'[[':
x$name is equivalent to x[["name", exact = FALSE]].
That is, you are looking for a column literally named x.