repeating vector of letters - r

Is there a function to create a repeating list of letters in R?
something like
letters[1:30]
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
[20] "t" "u" "v" "w" "x" "y" "z" NA NA NA NA
but instead of NA, I would like the output to continue aa, bb, cc, dd ...

It's not too difficult to piece together a quick function to do something like this:
# Build a character vector of `length.out` labels that cycle through the
# alphabet, repeating each letter once more on every pass:
# "a".."z", "aa".."zz", "aaa", ...
#
# length.out: number of labels to produce (0 gives character(0)).
# Returns a character vector of length `length.out`.
myLetters <- function(length.out) {
  a <- rep(letters, length.out = length.out)
  # Every "a" starts a new pass through the alphabet, so the running count of
  # "a"s is the number of times each letter has to be repeated.
  grp <- cumsum(a == "a")
  # strrep() is vectorised over both arguments, so no per-element loop is needed.
  strrep(a, grp)
}
myLetters(60)
# [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l"
# [13] "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x"
# [25] "y" "z" "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"
# [37] "kk" "ll" "mm" "nn" "oo" "pp" "qq" "rr" "ss" "tt" "uu" "vv"
# [49] "ww" "xx" "yy" "zz" "aaa" "bbb" "ccc" "ddd" "eee" "fff" "ggg" "hhh"

If you just want unique names, you could use
make.unique(rep(letters, length.out = 30), sep='')
Edit:
Here's another way to get repeating letters using Reduce.
# Return the first `n` labels of the sequence
# "a".."z", "aa".."zz", "aaa".."zzz", ...
#
# n: number of labels wanted (0 gives character(0)).
# Returns a character vector of length `n`.
myletters <- function(n) {
  # One extra copy of `letters` per completed pass through the alphabet;
  # Reduce(..., accumulate = TRUE) pastes them on cumulatively, producing
  # list(letters, "aa".."zz", "aaa".."zzz", ...).
  passes <- replicate(n %/% length(letters), letters, simplify = FALSE)
  labels <- unlist(Reduce(paste0, passes, init = letters, accumulate = TRUE))
  # seq_len() rather than 1:n, so that n = 0 yields an empty vector
  # (1:0 is c(1, 0) and would return "a").
  labels[seq_len(n)]
}
myletters(60)
# [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l"
# [13] "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x"
# [25] "y" "z" "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"
# [37] "kk" "ll" "mm" "nn" "oo" "pp" "qq" "rr" "ss" "tt" "uu" "vv"
# [49] "ww" "xx" "yy" "zz" "aaa" "bbb" "ccc" "ddd" "eee" "fff" "ggg" "hhh"

Working solution
A function to produce Excel-style column names, i.e.
# A, B, ..., Z, AA, AB, ..., AZ, BA, BB, ..., ..., ZZ, AAA, ...
# Return the first `n` Excel-style column names:
# A, B, ..., Z, AA, AB, ..., AZ, BA, ..., ZZ, AAA, ...
#
# n:     number of names wanted (0 gives character(0)).
# depth: number of letters in the names generated by this call; callers
#        normally leave it at 1, the recursion increases it.
# Returns a character vector of length `n`.
letterwrap <- function(n, depth = 1) {
  # All combinations of `depth` letters. expand.grid() varies its first
  # column fastest, so the columns are reversed to get dictionary order.
  args <- rep(list(LETTERS), depth)
  x <- do.call(expand.grid, c(args, list(stringsAsFactors = FALSE)))
  x <- x[, rev(names(x)), drop = FALSE]
  x <- do.call(paste0, x)
  if (n <= length(x)) {
    # seq_len() rather than 1:n, so that n = 0 returns an empty vector
    # instead of "A".
    return(x[seq_len(n)])
  }
  # Not enough names of this length: keep them all and continue with names
  # that are one letter longer.
  c(x, letterwrap(n - length(x), depth = depth + 1))
}
letterwrap(26^2 + 52) # through AAZ
Botched attempt
Initially I thought this would best be done cleverly by converting to base 26, but that doesn't work. The issue is that Excel column names aren't base 26, which took me a long time to realize. The catch is 0: if you try to map a letter (like A) to 0, you've got a problem when you want to distinguish between A and AA and AAA...
Another way to illustrate the problem is in "digits". In base 10, there are 10 single-digit numbers (0-9), then 90 double-digit numbers (10:99), 900 three-digit numbers... generalizing to 10^d - 10^(d - 1) numbers with d digits for d > 1. However, in Excel column names there are 26 single-letter names, 26^2 double-letter names, 26^3 triple-letter names, with no subtraction.
I'll leave this code as a warning to others:
## Splits a non-negative number into base-26 "digits" (each 0-25), most
## significant digit first. Values of 0 or 1 are returned unchanged.
## The digit count is taken from ceiling(log(n, base = 26)), so e.g.
## b26(26) yields the single digit 0.
b26 <- function(n) {
  stopifnot(n >= 0)
  if (n <= 1) {
    return(n)
  }
  n_digits <- ceiling(log(n, base = 26))
  digits <- numeric(n_digits)
  remaining <- n
  # Peel off the least significant digit first ...
  for (pos in seq_len(n_digits)) {
    digits[pos] <- remaining %% 26
    remaining <- remaining %/% 26
  }
  # ... then flip so the most significant digit leads.
  rev(digits)
}
## Attempts to return the name of the nth value in the sequence
## A, B, C, ..., Z, AA, AB, AC, ..., AZ, BA, ...
## by mapping the base-26 digits of n onto letters (a 0 digit becomes Z).
## n:     position in the sequence.
## lower: use lowercase letters when TRUE.
letterwrap1 <- function(n, lower = FALSE) {
  if (lower) {
    alphabet <- letters
  } else {
    alphabet <- LETTERS
  }
  digits <- b26(n)
  # There is no letter for digit 0, so it is shown as the 26th letter.
  digits <- replace(digits, digits == 0, 26)
  paste(alphabet[digits], collapse = "")
}
## Vectorized version of letterwrap1: accepts a vector of positions `n` and
## returns one name per position. (It must wrap letterwrap1, the
## position-based function above; letterwrap takes a count instead.)
letter_col_names <- Vectorize(letterwrap1, vectorize.args = "n")
> letter_col_names(1:4)
[1] "A" "B" "C" "D"
> letter_col_names(25:30)
[1] "Y" "Z" "AA" "AB" "AC" "AD"
# Looks pretty good
# Until we get here:
> letter_col_names(50:54)
[1] "AX" "AY" "BZ" "BA" "BB"

There is almost certainly a better way, but this is what I ended up with:
# Map each index in `idx` to a label made of one letter repeated:
# 1 -> "a", 26 -> "z", 27 -> "aa", 53 -> "aaa", ...
# idx: vector of positions. Returns a character vector, one label per index.
letter_wrap <- function(idx) {
  label_for <- function(x) {
    # Position within the alphabet; a remainder of 0 means the 26th letter.
    pos <- x %% 26
    pos <- replace(pos, !pos, 26)
    # One repetition per pass through the alphabet.
    times <- 1 + (x - 1) %/% 26
    paste0(rep(letters[pos], times), collapse = "")
  }
  vapply(idx, label_for, "")
}
letter_wrap(1:60)
# [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n"
# [15] "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z" "aa" "bb"
# [29] "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj" "kk" "ll" "mm" "nn" "oo" "pp"
# [43] "qq" "rr" "ss" "tt" "uu" "vv" "ww" "xx" "yy" "zz" "aaa" "bbb" "ccc" "ddd"
# [57] "eee" "fff" "ggg" "hhh"
EDIT: failed to notice Ananda's answer before I posted this one. This one is different enough that I'm leaving it. Note it takes the index vector as an input, as opposed to the number of items.

Probably not the cleanest, but easy to see what's happening:
outlen <- 73 # or whatever length you want
# Position i uses letter number ((i - 1) mod 26) + 1, repeated once more for
# every completed pass through the alphabet. Building the result from
# seq_len(outlen) keeps it correct for any length, including lengths below 52
# and exact multiples of 26, where 2:(outlen %/% 26) or 1:(outlen %% 26)
# would count downwards.
idx <- seq_len(outlen)
foo <- strrep(letters[(idx - 1L) %% 26L + 1L], (idx - 1L) %/% 26L + 1L)
foo
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n"
[15] "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z" "aa" "bb"
[29] "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj" "kk" "ll" "mm" "nn" "oo" "pp"
[43] "qq" "rr" "ss" "tt" "uu" "vv" "ww" "xx" "yy" "zz" "aaa" "bbb" "ccc" "ddd"
[57] "eee" "fff" "ggg" "hhh" "iii" "jjj" "kkk" "lll" "mmm" "nnn" "ooo" "ppp" "qqq" "rrr"
[71] "sss" "ttt" "uuu"
EDIT: Matthew wins, hands-down:
microbenchmark(anandaLetters(5000),matthewletters(5000),carlletters(5000),times=10)
Unit: milliseconds
expr min lq median uq max neval
anandaLetters(5000) 85.339200 85.567978 85.9827715 86.260298 86.612231 10
matthewletters(5000) 3.413706 3.503506 3.9067535 3.946950 4.106453 10
carlletters(5000) 94.893983 95.405418 96.4492430 97.234784 110.681780 10

Let me make a small correction for the "AY" "BZ" sequence: whenever a digit is 0 (and is therefore shown as Z), you have to subtract one from the preceding digit/letter.
# Convert an Excel column name ("A", "Z", "AA", ...) to its 1-based column
# number. Upper- and lowercase letters are both accepted.
#
# x: a single, non-missing character string made only of the letters A-Z.
# Returns the column number as a numeric scalar (0 for the empty string).
# Stops with an error when `x` contains anything other than letters.
colExcel2num <- function(x) {
  stopifnot(is.character(x), length(x) == 1L, !is.na(x))
  # Letter values 1..26; toupper() makes lowercase names map to the same
  # values instead of silently producing a wrong number.
  y <- utf8ToInt(toupper(x)) - utf8ToInt("A") + 1L
  if (any(y < 1L | y > 26L)) {
    stop("'x' must contain only the letters A-Z.")
  }
  # Place-value exponents: the leftmost letter has the highest power of 26.
  p <- rev(seq_along(y)) - 1L
  sum(y * 26^p)
}
## Converts a non-negative number to base 26 and returns its "digits"
## (each 0-25) as a vector, most significant first. 0 and 1 are returned
## as they are. The number of digits is ceiling(log(n, base = 26)).
b26 <- function(n) {
  stopifnot(n >= 0)
  if (n <= 1) {
    return(n)
  }
  width <- ceiling(log(n, base = 26))
  out <- rep(NA, width)
  # Fill from the last slot backwards so the result is already in
  # most-significant-first order.
  slot <- width
  while (slot >= 1) {
    out[slot] <- n %% 26
    n <- n %/% 26
    slot <- slot - 1
  }
  out
}
## Returns the Excel column name for a 1-based column position
## A, B, C, ..., Z, AA, AB, AC, ..., AZ, BA, ...
##
## n:     a single column position, >= 1.
## lower: return lowercase letters when TRUE.
## Returns a single character string.
##
## Excel names are a bijective base-26 numeral: the digits run 1..26 with no
## zero. Subtracting 1 before each %% / %/% step handles the "borrow" for
## every digit, so positions such as 702 ("ZZ") come out right, not just
## those needing a single borrow.
colnum2Excel <- function(n, lower = FALSE) {
  stopifnot(length(n) == 1L, !is.na(n), n >= 1)
  let <- if (lower) letters else LETTERS
  out <- character(0)
  while (n > 0) {
    # Least significant letter first; prepend so the result reads left to right.
    out <- c(let[(n - 1) %% 26 + 1], out)
    n <- (n - 1) %/% 26
  }
  paste(out, collapse = "")
}
## Returns the Excel column name (A, B, ..., Z, AA, AB, ...) of a data
## frame column, found by looking up the column's position in the df.
##
## df:      a data frame.
## colname: name of the column to locate.
## lower:   return lowercase letters when TRUE (forwarded to colnum2Excel).
## Stops with an error when `colname` is not a column of `df`.
varnum2Excel <- function(df, colname, lower = FALSE) {
  index <- match(colname, names(df))
  if (anyNA(index)) {
    stop("column not found in 'df': ", colname)
  }
  colnum2Excel(index, lower = lower)
}
Here is an example:
# library() fails loudly when a package is missing; require() only returns
# FALSE and lets the script break later.
library(dplyr) # provides %>%, mutate() and n() used below
library(openxlsx)

table <- data.frame(milk = c(1, 2, 3), oranges = c(2, 4, 6))

# Add a column of Excel formulas such as "A2 + B2": the column letters are
# looked up from the data frame, the row numbers start at 2 because row 1 of
# the sheet holds the header.
table <- table %>%
  mutate(
    ajjhh = sprintf(
      paste0(
        varnum2Excel(., "milk"), "%1$s", " + ",
        varnum2Excel(., "oranges"), "%1$s"
      ),
      seq_len(n()) + 1L
    )
  )
# Tagging the column with class "formula" makes openxlsx write it as a
# formula rather than as text.
class(table$ajjhh) <- c(class(table$ajjhh), "formula")

wb <- createWorkbook()
addWorksheet(wb = wb, sheetName = "Sheet1", tabColour = "chocolate4")
writeData(wb, "Sheet1", x = table)
saveWorkbook(wb, "formulashasnotgone.xlsx", overwrite = TRUE)

Related

How to paste characters in groups of 3 in R

Here is a DNA string that I want to split and then combine in groups of 3
dna=c("TACACGATGACAGTCTTGACGGGTTCTCCTACT")
dna.sg = unlist(strsplit(dna, ""))
Gives
[1] "T" "A" "C" "A" "C" "G" "A" "T" "G" "A" "C" "A" "G" "T" "C" "T" "T" "G" "A" "C" "G" "G" "G" "T" "T" "C" "T" "C" "C" "T" "A" "C" "T"
But I'd like to have
"TAC" "ACG" [...]
You may split every 3 characters in strsplit.
unlist(strsplit(dna, "(?<=.{3})", perl = TRUE))
#[1] "TAC" "ACG" "ATG" "ACA" "GTC" "TTG" "ACG" "GGT" "TCT" "CCT" "ACT"
Other possibilities:
dna <- c("TACACGATGACAGTCTTGACGGGTTCTCCTACT")
regmatches(dna, gregexpr(".{3}", dna))[[1]]
# [1] "TAC" "ACG" "ATG" "ACA" "GTC" "TTG" "ACG" "GGT" "TCT" "CCT" "ACT"
sapply(seq(1, nchar(dna), 3), \(x) substr(dna, x, x+3-1))
# [1] "TAC" "ACG" "ATG" "ACA" "GTC" "TTG" "ACG" "GGT" "TCT" "CCT" "ACT"
substring(dna, seq(1, nchar(dna), by = 3), seq(3, nchar(dna), by = 3))
# [1] "TAC" "ACG" "ATG" "ACA" "GTC" "TTG" "ACG" "GGT" "TCT" "CCT" "ACT"
unlist(strsplit(gsub("(.{3})", "\\1 ", dna), split = " "))
# [1] "TAC" "ACG" "ATG" "ACA" "GTC" "TTG" "ACG" "GGT" "TCT" "CCT" "ACT"
Felt like doing a benchmarking with all those solutions:
dna <- c("TACACGATGACAGTCTTGACGGGTTCTCCTACT")
library(microbenchmark)
# Time each approach for splitting `dna` into 3-character chunks.
# gc() runs before every evaluation (setup) so that garbage collection
# triggered by one candidate is less likely to be charged to the next.
bm <- microbenchmark(
  reg = regmatches(dna, gregexpr(".{3}", dna))[[1]],
  substr = sapply(seq(1, nchar(dna), 3), \(x) substr(dna, x, x+3-1)),
  substring = substring(dna, seq(1, nchar(dna), by = 3), seq(3, nchar(dna), by = 3)),
  gsub = unlist(strsplit(gsub("(.{3})", "\\1 ", dna), split = " ")),
  strsplit = unlist(strsplit(dna, "(?<=.{3})", perl = TRUE)),
  times = 10L,
  setup = gc(FALSE)
)
# autoplot() is a ggplot2 generic; ggplot2 is not attached in this script, so
# call it namespace-qualified.
ggplot2::autoplot(bm)
gsub seems to be a clear winner!

Creating a list of 6-character strings that have a distance/ difference of at least 3 per string? (for a DNA/ oligo-related problem)

I want to create a list of strings comprised of only A's G's C's and T's that have a difference/ distance of at least 3, eg (two strings/ oligos could be eg. ATCTGA and TAGTGC). I can create all the possible combinations of a 6nt string, but I can't work out how to select only a subset 3-distant oligos. I know that there will be more than one list, but any list would do.
Not really done much DNA data manipulation so I am unsure how to approach this, would appreciate any suggestions of any tools out there.
Thanks
Given a reference oligo x of nonzero length, this function returns a character vector listing all oligos of equal length whose Hamming distance from x is at least mindist.
# List every oligo of the same length as `x` whose Hamming distance from `x`
# is at least `mindist`, in dictionary order (A < C < G < T).
#
# x:       reference oligo, a non-empty string over the letters A, C, G, T
#          (only the first element of `x` is used).
# mindist: minimum number of positions that must differ.
# Returns a character vector; character(0) when `mindist` exceeds the length
# of `x`. Stops with an error when `x` is empty or has other characters.
oligo1 <- function(x, mindist = 0L) {
  acgt <- c("A", "C", "G", "T")
  # Work with letter codes 1..4 instead of characters.
  x <- match(strsplit(x, "")[[1L]], acgt)
  if ((n <- length(x)) == 0L || anyNA(x)) {
    stop("'x' is not a valid oligo.")
  }
  if (mindist > n) {
    return(character(0L))
  }
  # All 4^n code combinations, one per row. expand.grid() varies its first
  # column fastest, so the columns are reversed to get dictionary order.
  grid <- expand.grid(rep(list(seq_along(acgt)), n), KEEP.OUT.ATTRS = FALSE)
  P <- as.matrix(grid)[, rev(seq_len(n)), drop = FALSE]
  if (mindist > 0L) {
    # P is stored column by column, so rep(x, each = nrow(P)) lines up x[j]
    # with every entry of column j; the row sums count mismatching positions.
    nmismatch <- rowSums(P != rep(x, each = nrow(P)))
    P <- P[nmismatch >= mindist, , drop = FALSE]
  }
  # Translate each column back to letters and paste the columns together.
  do.call(paste0, lapply(seq_len(n), function(j) acgt[P[, j]]))
}
oligo1("AA", 0L)
## [1] "AA" "AC" "AG" "AT" "CA" "CC" "CG" "CT" "GA" "GC"
## [11] "GG" "GT" "TA" "TC" "TG" "TT"
oligo1("AA", 1L)
## [1] "AC" "AG" "AT" "CA" "CC" "CG" "CT" "GA" "GC" "GG"
## [11] "GT" "TA" "TC" "TG" "TT"
oligo1("AA", 2L)
## [1] "CC" "CG" "CT" "GC" "GG" "GT" "TC" "TG" "TT"
Employing the above recursively, you can find the largest set containing x whose elements mutually satisfy the condition on Hamming distance. More precisely, you can construct the longest y such that x %in% y and the Hamming distance from y[i] to y[j] is at least mindist for all i != j.
# Greedily build a set of oligos, starting from `x`, in which every element
# is at least `mindist` away (Hamming distance) from the elements kept
# before it.
#
# x:       reference oligo (passed to oligo1()).
# mindist: minimum pairwise Hamming distance.
# Returns a character vector whose first element is `x`.
oligo2 <- function(x, mindist = 0L) {
  # Start with x followed by everything far enough from x.
  y <- c(x, oligo1(x, mindist))
  pos <- 2L
  # Elements up to `pos` are settled; everything after it is still a
  # candidate and must also be far enough from y[pos] to stay.
  while (pos < length(y)) {
    settled <- seq_len(pos)
    survivors <- intersect(y[-settled], oligo1(y[pos], mindist))
    y <- c(y[settled], survivors)
    pos <- pos + 1L
  }
  y
}
oligo2("AA", 0L)
## [1] "AA" "AA" "AC" "AG" "AT" "CA" "CC" "CG" "CT" "GA"
## [11] "GC" "GG" "GT" "TA" "TC" "TG" "TT"
oligo2("AA", 1L)
## [1] "AA" "AC" "AG" "AT" "CA" "CC" "CG" "CT" "GA" "GC"
## [11] "GG" "GT" "TA" "TC" "TG" "TT"
oligo2("AA", 2L)
## [1] "AA" "CC" "GG" "TT"
Hence one possible answer to your question would be:
oligo2("AAAAAA", 3L)
## [1] "AAAAAA" "AAACCC" "AAAGGG" "AAATTT" "AACACG"
## [6] "AACCAT" "AACGTA" "AACTGC" "AAGAGT" "AAGCTG"
## [11] "AAGGAC" "AAGTCA" "AATATC" "AATCGA" "AATGCT"
## [16] "AATTAG" "ACAACT" "ACACAG" "ACAGTC" "ACATGA"
## [21] "ACCAAC" "ACCCCA" "ACCGGT" "ACCTTG" "ACGATA"
## [26] "ACGCGC" "ACGGCG" "ACGTAT" "ACTAGG" "ACTCTT"
## [31] "ACTGAA" "ACTTCC" "AGAAGC" "AGACTA" "AGAGAT"
## [36] "AGATCG" "AGCATT" "AGCCGG" "AGCGCC" "AGCTAA"
## [41] "AGGAAG" "AGGCCT" "AGGGGA" "AGGTTC" "AGTACA"
## [46] "AGTCAC" "AGTGTG" "AGTTGT" "ATAATG" "ATACGT"
## [51] "ATAGCA" "ATATAC" "ATCAGA" "ATCCTC" "ATCGAG"
## [56] "ATCTCT" "ATGACC" "ATGCAA" "ATGGTT" "ATGTGG"
## [61] "ATTAAT" "ATTCCG" "ATTGGC" "ATTTTA"
The length-6 oligos in this list are mutually at least 3-distant.

Randomly shuffle letters in words in sentences

I want to randomly disrupt the order of the letters that make up words in sentences. I can do the shuffling for single words, e.g.:
a <- "bach"
sample(unlist(str_split(a, "")), nchar(a))
[1] "h" "a" "b" "c"
but I fail to do it for sentences, e.g.:
b <- "bach composed fugues and cantatas"
What I've tried so far:
split into words:
b1 <- str_split(b, " ")
[[1]]
[1] "bach" "composed" "fugues" "and" "cantatas"
calculate the number of characters per word:
n <- lapply(b1, function(x) nchar(x))
n
[[1]]
[1] 4 8 6 3 8
split words in b1 into single letters:
b2 <- str_split(unlist(str_split(b, " ")), "")
b2
[[1]]
[1] "b" "a" "c" "h"
[[2]]
[1] "c" "o" "m" "p" "o" "s" "e" "d"
[[3]]
[1] "f" "u" "g" "u" "e" "s"
[[4]]
[1] "a" "n" "d"
[[5]]
[1] "c" "a" "n" "t" "a" "t" "a" "s"
Jumble the letters in each word based on the above:
lapply(b2, function(x) sample(unlist(x), unlist(n), replace = T))
[[1]]
[1] "h" "a" "c" "b"
[[2]]
[1] "o" "p" "o" "s"
[[3]]
[1] "g" "s" "s" "u"
[[4]]
[1] "d" "d" "a" "d"
[[5]]
[1] "c" "n" "s" "a"
That's obviously not the right result. How can I randomly jumble the sequence of letters in each word in the sentence?
After b2 you can randomly shuffle character using sample and paste the words back.
paste0(sapply(b2, function(x) paste0(sample(x), collapse = "")), collapse = " ")
#[1] "bhac moodscpe uefusg and tsatnaac"
Note that you don't need to mention the size in sample if you want the output to be of same length as input with replace = FALSE.

R function for repeat loop to create multiple variables from table

Apologies for any poorly formed question - I'm very new to R.
I am looking to create multiple character strings from this data table..
I have created the character string:
coor1 <- R_data[1,8]
I am looking to iterate this for other indices as follows:
coor1 <- data[1,8]
coor2 <- data[2,8]
coor3 <- data[3,8]
coor4 <- data[4,8]
coor5 <- data[5,8] etc....
I have tried using a for loop but with no success. Any advice would be great.
Thanks very much.
I think you just need to subset the 8th column with $:
data <- data.frame(V1 = rep(NA, 26))
data[, 2:7] <- NA
data[, 8] <- letters
data$V8
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u"
[22] "v" "w" "x" "y" "z"
data[1, 8]
[1] "a"
data[2, 8]
[1] "b"
data[3, 8]
[1] "c"
data[4, 8]
[1] "d"
You can assign it to a variable as well:
coor <- data$V8
And extract a single result with []:
coor[1]
[1] "a"
coor[2]
[1] "b"
This can also be accomplished from the original dataframe:
data$V8[1]
[1] "a"
which is same as:
data[1, 8]
[1] "a"
What a loop would look like:
# Preallocate the result with the column's storage type and one slot per
# row, rather than growing an empty vector inside the loop.
coors <- vector(mode = typeof(data[[8]]), length = nrow(data))
for (i in seq_len(nrow(data))) {
  coors[i] <- data[i, 8]
}

How to create a hashed dataframe in R

Given the following data (myinput.txt):
A q,y,h
B y,f,g
C n,r,q
### more rows
How can I convert it into such data structure in R?
$A
[1] "q" "y" "h"
$B
[1] "y" "f" "g"
$C
[1] "n" "r" "q"
I've assumed this as your data:
dat <- read.table(text="q,y,h
y,f,g
n,r,q", header=FALSE, sep=",", row.names=c("A", "B", "C"))
If you want an automatic method:
as.list(as.data.frame((t(dat)), stringsAsFactors=FALSE))
## $A
## [1] "q" "y" "h"
##
## $B
## [1] "y" "f" "g"
##
## $C
## [1] "n" "r" "q"
Another couple of methods which work are:
lapply(apply(dat, 1, list), "[[", 1)
unlist(apply(dat, 1, list), recursive=FALSE)
Using a bit of readLines strsplit and regex to account for breaking the names off the start:
# Read the example lines; the connection is closed explicitly so it does not
# linger until garbage collection.
con <- textConnection("A q,y,h
B y,f,g
C n,r,q")
dat <- readLines(con)
close(con)
# The key is everything before the first run of whitespace; "\\s+" works
# whether the separator is one space, two spaces or a tab.
keys <- sub("\\s+.*$", "", dat)
# Drop the key and its separator, then split the rest on commas. A line
# with a key but no values yields character(0).
values <- strsplit(sub("^\\S+\\s*", "", dat), ",")
result <- setNames(values, keys)
> result
$A
[1] "q" "y" "h"
$B
[1] "y" "f" "g"
$C
[1] "n" "r" "q"
or with less regex and more steps:
# Split every line on the whitespace after the key and on the commas;
# "\\s+" copes with one or several separating spaces.
result <- strsplit(dat, "\\s+|,")
# The first field of each line is its key; vapply() returns a plain
# character vector, which is what names<- expects.
names(result) <- vapply(result, `[`, character(1), 1)
# Drop the key from each element; x[-1] is also right for key-only lines,
# where 2:length(x) would count downwards.
result <- lapply(result, function(x) x[-1])
> result
$A
[1] "q" "y" "h"
$B
[1] "y" "f" "g"
$C
[1] "n" "r" "q"

Resources