This question already has answers here:
How to sort a character vector where elements contain letters and numbers?
(6 answers)
Closed 7 years ago.
I have dataframe with 223 columns.
dput(colnames(a6))
c("d.54", "PRODUCT", "POS", "d.53", "d.52", "d.51", "d.50", "d.49",
"d.48", "d.47", "d.46", "d.45", "d.44", "d.43", "d.42", "d.41",
"d.40", "d.39", "d.38", "d.37", "d.36", "d.35", "d.34", "d.33",
"d.32", "d.31", "d.30", "d.29", "d.28", "d.27", "d.26", "d.25",
"d.24", "d.23", "d.22", "d.21", "d.20", "d.19", "d.18", "d.17",
"d.16", "d.15", "d.14", "d.13", "d.12", "d.11", "d.10", "d.9",
"d.8", "d.7", "d.6", "d.5", "d.4", "d.3", "d.2", "d.1", "d",
"agr", "n", "s", "n.1", "s.1", "n.2", "s.2", "n.3", "s.3", "n.4",
"s.4", "n.5", "s.5", "n.6", "s.6", "n.7", "s.7", "n.8", "s.8",
"n.9", "s.9", "n.10", "s.10", "n.11", "s.11", "n.12", "s.12",
"n.13", "s.13", "n.14", "s.14", "n.15", "s.15", "n.16", "s.16",
"n.17", "s.17", "n.18", "s.18", "n.19", "s.19", "n.20", "s.20",
"n.21", "s.21", "n.22", "s.22", "n.23", "s.23", "n.24", "s.24",
"n.25", "s.25", "n.26", "s.26", "n.27", "s.27", "n.28", "s.28",
"n.29", "s.29", "n.30", "s.30", "n.31", "s.31", "n.32", "s.32",
"n.33", "s.33", "n.34", "s.34", "n.35", "s.35", "n.36", "s.36",
"n.37", "s.37", "n.38", "s.38", "n.39", "s.39", "n.40", "s.40",
"n.41", "s.41", "n.42", "s.42", "n.43", "s.43", "n.44", "s.44",
"n.45", "s.45", "n.46", "s.46", "n.47", "s.47", "n.48", "s.48",
"n.49", "s.49", "n.50", "s.50", "n.51", "s.51", "n.52", "s.52",
"n.53", "s.53", "n.54", "s.54", "r.0", "r.1", "r.2", "r.3", "r.4",
"r.5", "r.6", "r.7", "r.8", "r.9", "r.10", "r.11", "r.12", "r.13",
"r.14", "r.15", "r.16", "r.17", "r.18", "r.19", "r.20", "r.21",
"r.22", "r.23", "r.24", "r.25", "r.26", "r.27", "r.28", "r.29",
"r.30", "r.31", "r.32", "r.33", "r.34", "r.35", "r.36", "r.37",
"r.38", "r.39", "r.40", "r.41", "r.42", "r.43", "r.44", "r.45",
"r.46", "r.47", "r.48", "r.49", "r.50", "r.51", "r.52", "r.53",
"r.54")
I try to reorder columns in such way
agr d d.1 d.2 --d.54 ....
sort by 1 char then sort by number in each group.
I try a7=a6[,order(colnames(a6))]
but it sort it like char only and a get such result
colnames(a7)
[1] "agr" "d" "d.1" "d.10" "d.11" "d.12" "d.13" "d.14" "d.15" "d.16" "d.17"
[12] "d.18" "d.19" "d.2" "d.20" "d.21"
Think there is simply answer on such question, but i cant find it...
You can use mixedorder from library(gtools)
library(gtools)
a6[mixedorder(nm1)]
head(nm1[mixedorder(nm1)])
#[1] "agr" "d" "d.1" "d.2" "d.3" "d.4"
Using another example
set.seed(24)
v1 <- sample(paste0(letters[1:4], '.', 1:20))
mixedsort(v1)
#[1] "a.1" "a.5" "a.9" "a.13" "a.17" "b.2" "b.6" "b.10" "b.14" "b.18"
#[11] "c.3" "c.7" "c.11" "c.15" "c.19" "d.4" "d.8" "d.12" "d.16" "d.20"
data
nm1 <- colnames(a6)
a6 <- setNames(do.call(data.frame,as.list(1:223)),c("d.54","PRODUCT","POS","d.53","d.52","d.51","d.50","d.49","d.48","d.47","d.46","d.45","d.44","d.43","d.42","d.41","d.40","d.39","d.38","d.37","d.36","d.35","d.34","d.33","d.32","d.31","d.30","d.29","d.28","d.27","d.26","d.25","d.24","d.23","d.22","d.21","d.20","d.19","d.18","d.17","d.16","d.15","d.14","d.13","d.12","d.11","d.10","d.9","d.8","d.7","d.6","d.5","d.4","d.3","d.2","d.1","d","agr","n","s","n.1","s.1","n.2","s.2","n.3","s.3","n.4","s.4","n.5","s.5","n.6","s.6","n.7","s.7","n.8","s.8","n.9","s.9","n.10","s.10","n.11","s.11","n.12","s.12","n.13","s.13","n.14","s.14","n.15","s.15","n.16","s.16","n.17","s.17","n.18","s.18","n.19","s.19","n.20","s.20","n.21","s.21","n.22","s.22","n.23","s.23","n.24","s.24","n.25","s.25","n.26","s.26","n.27","s.27","n.28","s.28","n.29","s.29","n.30","s.30","n.31","s.31","n.32","s.32","n.33","s.33","n.34","s.34","n.35","s.35","n.36","s.36","n.37","s.37","n.38","s.38","n.39","s.39","n.40","s.40","n.41","s.41","n.42","s.42","n.43","s.43","n.44","s.44","n.45","s.45","n.46","s.46","n.47","s.47","n.48","s.48","n.49","s.49","n.50","s.50","n.51","s.51","n.52","s.52","n.53","s.53","n.54","s.54","r.0","r.1","r.2","r.3","r.4","r.5","r.6","r.7","r.8","r.9","r.10","r.11","r.12","r.13","r.14","r.15","r.16","r.17","r.18","r.19","r.20","r.21","r.22","r.23","r.24","r.25","r.26","r.27","r.28","r.29","r.30","r.31","r.32","r.33","r.34","r.35","r.36","r.37","r.38","r.39","r.40","r.41","r.42","r.43","r.44","r.45","r.46","r.47","r.48","r.49","r.50","r.51","r.52","r.53","r.54"));
names(a6)[do.call(order,c(read.table(text=names(a6),sep='.',fill=T),na.last=F))];
## [1] "agr" "d" "d.1" "d.2" "d.3" "d.4" "d.5"
## [8] "d.6" "d.7" "d.8" "d.9" "d.10" "d.11" "d.12"
## [15] "d.13" "d.14" "d.15" "d.16" "d.17" "d.18" "d.19"
## [22] "d.20" "d.21" "d.22" "d.23" "d.24" "d.25" "d.26"
## [29] "d.27" "d.28" "d.29" "d.30" "d.31" "d.32" "d.33"
## [36] "d.34" "d.35" "d.36" "d.37" "d.38" "d.39" "d.40"
## [43] "d.41" "d.42" "d.43" "d.44" "d.45" "d.46" "d.47"
## [50] "d.48" "d.49" "d.50" "d.51" "d.52" "d.53" "d.54"
## [57] "n" "n.1" "n.2" "n.3" "n.4" "n.5" "n.6"
## [64] "n.7" "n.8" "n.9" "n.10" "n.11" "n.12" "n.13"
## [71] "n.14" "n.15" "n.16" "n.17" "n.18" "n.19" "n.20"
## [78] "n.21" "n.22" "n.23" "n.24" "n.25" "n.26" "n.27"
## [85] "n.28" "n.29" "n.30" "n.31" "n.32" "n.33" "n.34"
## [92] "n.35" "n.36" "n.37" "n.38" "n.39" "n.40" "n.41"
## [99] "n.42" "n.43" "n.44" "n.45" "n.46" "n.47" "n.48"
## [106] "n.49" "n.50" "n.51" "n.52" "n.53" "n.54" "POS"
## [113] "PRODUCT" "r.0" "r.1" "r.2" "r.3" "r.4" "r.5"
## [120] "r.6" "r.7" "r.8" "r.9" "r.10" "r.11" "r.12"
## [127] "r.13" "r.14" "r.15" "r.16" "r.17" "r.18" "r.19"
## [134] "r.20" "r.21" "r.22" "r.23" "r.24" "r.25" "r.26"
## [141] "r.27" "r.28" "r.29" "r.30" "r.31" "r.32" "r.33"
## [148] "r.34" "r.35" "r.36" "r.37" "r.38" "r.39" "r.40"
## [155] "r.41" "r.42" "r.43" "r.44" "r.45" "r.46" "r.47"
## [162] "r.48" "r.49" "r.50" "r.51" "r.52" "r.53" "r.54"
## [169] "s" "s.1" "s.2" "s.3" "s.4" "s.5" "s.6"
## [176] "s.7" "s.8" "s.9" "s.10" "s.11" "s.12" "s.13"
## [183] "s.14" "s.15" "s.16" "s.17" "s.18" "s.19" "s.20"
## [190] "s.21" "s.22" "s.23" "s.24" "s.25" "s.26" "s.27"
## [197] "s.28" "s.29" "s.30" "s.31" "s.32" "s.33" "s.34"
## [204] "s.35" "s.36" "s.37" "s.38" "s.39" "s.40" "s.41"
## [211] "s.42" "s.43" "s.44" "s.45" "s.46" "s.47" "s.48"
## [218] "s.49" "s.50" "s.51" "s.52" "s.53" "s.54"
Related
I was stuck in removing the prefix of each sample. I have tried to remove all the number within the sample, but this could not be a good way for grouping. I would like to only keep the sample name as the last two suffix. ( For example: AAP-L ) The details are list as below. Thank you in advance!
geo$pd$title
[1] "AAB-HT002-AAP-L" "AAB-HT003-AAP-L" "AAB-HT006-AAP-L" "AAB-HT002-AAP-NL"
[5] "AAB-HT003-AAP-NL" "AAB-HT006-AAP-NL" "AAB-C007-AU-L" "AAB-HT001-AT-L"
[9] "AAB-N-C021-Normal-NC" "AAB-N-C022-Normal-NC" "AAB-C024-Normal-NC" "AAB-N-C025-Normal-NC"
[13] "AAB-HT010-AAP.T-L" "AAB-HT011-AAP-L" "AAB-HT012-AAP-L" "AAB-HT010-AAP.T-NL"
[17] "AAB-HT011-AAP-NL" "AAB-HT012-AAP-NL" "AAB-C013-AU-L" "AAB-C033-AU-L"
[21] "AAB-C037-AT-L" "AAB-C043-AU-L" "AAB-HT041-AU-L" "AAB-N-C026-Normal-NC"
[25] "AAB-N-C027-Normal-NC" "AAB-N-C028-Normal-NC" "AAB-N-C029-Normal-NC" "AAB-C014-AAP-L"
[29] "AAB-HT017-AAP.T-L" "AAB-HT018-AAP-L" "AAB-C014-AAP-NL" "AAB-HT017-AAP.T-NL"
[33] "AAB-HT018-AAP-NL" "AAB-C047-AT-L" "AAB-M044-AU-L" "AAB-N-C030-Normal-NC"
[37] "AAB-N-C032-Normal-NC" "AAB-N-C034-Normal-NC" "AAB-N-C035-Normal-NC" "AAB-C020-AAP.T-L"
[41] "AAB-C038-AAP-L" "AABM046-AAP-L" "AAB-C020-AAP.T-NL" "AABM046-AAP-NL"
[45] "AAB-C048-AT-L" "AAB-HT050-AT-L" "AAB-M-060-AU-L" "AAB-M-061-AU-L"
[49] "AAB-N-C036-Normal-NC" "AAB-N-C039-Normal-NC" "AAB-N-C042-Normal-NC" "AAB-N-C045-Normal-NC"
[53] "AAB-C052-AAP-L" "AAB-C076-AAP-L" "AAB-M056-AAP-L" "AAB-M058-AAP-L"
[57] "AAB-C052-AAP-NL" "AAB-C076-AAP-NL" "AAB-M056-AAP-NL" "AAB-M058-AAP-NL"
[61] "AAB-HT077-AU-L" "AAB-HT082-AU-L" "AAB-M080-AU-L" "AAB-N-C054-Normal-NC"
[65] "AAB-N-C055-Normal-NC" "AAB-N-C059-Normal-NC" "AAB-N-C062-Normal-NC" "AAB-C083-AAP-L"
[69] "AAB-HT009-AAP-L" "AAB-HT079-AAP-L" "AAB-SF086-AAP-L" "AAB-C083-AAP-NL"
[73] "AAB-HT079-AAP-NL" "AAB-SF086-AAP-NL" "AAB-C016-AU-L" "AAB-HT008-AU-L"
[77] "AAB-HT091-AT-L" "AAB-SF087-AU-L" "AAB-N-C063-Normal-NC" "AAB-N-C064-Normal-NC"
[81] "AAB-N-C065-Normal-NC" "AAB-HT103-AAP-L" "AAB-SF078-AAP.T-L" "AAB-SF099-AAP-L"
[85] "AAB-HT103-AAP-NL" "AAB-SF078-AAP.T-NL" "AAB-SF099-AAP-NL" "AAB-HT096-AT-L"
[89] "AAB-M094-AU-L" "AAB-SF089-AU-L" "AAB-SF090-AU-L" "AAB-SF100-AU-L"
[93] "AAB-N-C069-Normal-NC" "AAB-N-C070-Normal-NC" "AAB-N-C071-Normal-NC" "AAB-N-C072-Normal-NC"
[97] "AAB-N-C074-Normal-NC" "AAB-N-C075-Normal-NC" "AAB-N-C085-Normal-NC" "AAB-C092-Normal-NC"
[101] "AAB-M112-AAP-L" "AAB-SF104-AAP-L" "AAB-SF114-AAP-L" "AAB-SF115-AAP.T-L"
[105] "AAB-M112-AAP-NL" "AAB-SF104-AAP-NL" "AAB-SF114-AAP-NL" "AAB-SF115-AAP.T-NL"
[109] "AAB-C109-AU-L" "AAB-C111-AU-L" "AAB-HT101-AU-L" "AAB-M110-AT-L"
[113] "AAB-SF106-AU-L" "AAB-SF113-AU-L" "AAB-N-C098-Normal-NC" "AAB-N-C105-Normal-NC"
[117] "AAB-N-C107-Normal-NC" "AAB-N-C108-Normal-NC" "AAB-HT095-AAP.T-L" "AAB-HT095-AAP.T-NL"
[121] "AAB-HT097-AT-L" "AAB-C093-Normal-NC"
Try this:
library(stringr)
# test data:
string <- c("AAB-HT002-AAP-L", "AAB-HT017-AAP.T-L", "AAB-HT003-AAP-L", "AAB-HT006-AAP-L", "AAB-HT002-AAP-NL")
str_split_fixed(string, '-', n=3)[, 3]
# output:
[1] "AAP-L" "AAP.T-L" "AAP-L" "AAP-L" "AAP-NL"
This will deliver the terminal (alpha+period)-dash-(alpha+period)-end components.
titles <-c("AAB-HT002-AAP-L", "AAB-HT003-AA.P-L", "AAB-HT006-AAP-L", "AAB-HT002-AA.P-NL")
sub( "(.+)([-])([[:alpha:].]+[-][[:alpha:].]+$)", "\\3", titles)
[1] "AAP-L" "AA.P-L" "AAP-L" "AA.P-NL"
We could use
library(stringr)
str_remove(string, ".*\\d+-")
[1] "AAP-L" "AAP.T-L" "AAP-L" "AAP-L" "AAP-NL"
I am writing a function that takes in a start and end day in the format of dhhmmss (day-hour-minutes-second) and calculates the length of the Palindrome numbers between the start and end dhhmmss.
By defintion the start hhmmss is 000000 and end hhmmss is 235959.
My function has to take only the start d and end d and calculate the length of the Palindrome numbers between these two
Here's how I did it
Reverse.numberAsString <- function(x){ # Reverse using string manipulation
x.out <- as.character(x) # convert number to a character string
x.out <- unlist(strsplit(x.out, '')) # break the string up into a vector
x.out <- rev(x.out) # reverse it
x.out <- paste(x.out, collapse='') # join it back together
x.out <- as.numeric(x.out) # turn it back to a number
return(x.out)
}
is.Palindrome <- function(x){
x == sapply(x,Reverse.numberAsString)
}
palindrom_fun <- function(n1, n2){
if (n1 > n2) { print('n1 cannot be > n2')
} else {
n1.mod <- as.numeric(paste(c(n1, "000000"), collapse = ""))
n2.mod <- as.numeric(paste(c(n2, "235959"), collapse = ""))
x <- seq(from = n1.mod, to = n2.mod, by = 1)
palindrome_number <- x[is.Palindrome(x)]
length.palindrom <- length(palindrome_number)
return(length.palindrom)
}
}
palindrom_fun(1, 2)
# 1236
However, the above function will not work if n1 = 0 and n1 = 1 because of the line
n1.mod <- as.numeric(paste(c(n1, "000000"), collapse = ""))
n2.mod <- as.numeric(paste(c(n2, "235959"), collapse = ""))
since R is not able to create a sequence of number from 0000000 to 1235959. How can I get my function to work for this case?
You may compare head and reversed tail of character vectors using : (since head and tail are slow). For the desired sequence you may use sprintf to generate leading zeros.
isPalindrome <- Vectorize(function(x) {
s <- el(strsplit(as.character(x), ""))
ll <- length(s)
l2 <- pmax(floor(ll / 2), 1)
# out <- all(head(s, l) == rev(tail(s, l))) ## slower
out <- all(s[1:l2] == s[ll:(ll - l2 + 1)])
return(out)
})
## Test
x <- c("0000000", "1123456", "1231321", "0000", "1234", "11", "12", "1")
isPalindrome(x)
# 0000000 1123456 1231321 0000 1234 11 12 1
# TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
In the following palindromFun function I'll add the actual palindroms as attributes so that they are being returned by the function. (To switch off this behavior just comment out the line with the ## mark).
palindromFun <- function(n1, n2) {
if (n1 > n2) {
print('n1 cannot be > n2')
} else {
tm <- sprintf("%06d", 0:235959)
dy <- n1:n2
r <- paste0(rep(dy, each=length(tm)), tm)
pd <- isPalindrome(r)
out <- sum(pd)
out <- `attr<-`(out, "palindroms", r[pd]) ## mark
return(out)
}
}
Result 1
r1 <- palindromFun(n1=0, n2=1)
r1
# [1] 472
# attr(,"palindroms")
# [1] "0000000" "0001000" "0002000" "0003000" "0004000" "0005000" "0006000"
# [8] "0007000" "0008000" "0009000" "0010100" "0011100" "0012100" "0013100"
# [15] "0014100" "0015100" "0016100" "0017100" "0018100" "0019100" "0020200"
# [22] "0021200" "0022200" "0023200" "0024200" "0025200" "0026200" "0027200"
# [29] "0028200" "0029200" "0030300" "0031300" "0032300" "0033300" "0034300"
# [36] "0035300" "0036300" "0037300" "0038300" "0039300" "0040400" "0041400"
# [43] "0042400" "0043400" "0044400" "0045400" "0046400" "0047400" "0048400"
# [50] "0049400" "0050500" "0051500" "0052500" "0053500" "0054500" "0055500"
# [57] "0056500" "0057500" "0058500" "0059500" "0060600" "0061600" "0062600"
# [64] "0063600" "0064600" "0065600" "0066600" "0067600" "0068600" "0069600"
# [71] "0070700" "0071700" "0072700" "0073700" "0074700" "0075700" "0076700"
# [78] "0077700" "0078700" "0079700" "0080800" "0081800" "0082800" "0083800"
# [85] "0084800" "0085800" "0086800" "0087800" "0088800" "0089800" "0090900"
# [92] "0091900" "0092900" "0093900" "0094900" "0095900" "0096900" "0097900"
# [99] "0098900" "0099900" "0100010" "0101010" "0102010" "0103010" "0104010"
# [106] "0105010" "0106010" "0107010" "0108010" "0109010" "0110110" "0111110"
# [113] "0112110" "0113110" "0114110" "0115110" "0116110" "0117110" "0118110"
# [120] "0119110" "0120210" "0121210" "0122210" "0123210" "0124210" "0125210"
# [127] "0126210" "0127210" "0128210" "0129210" "0130310" "0131310" "0132310"
# [134] "0133310" "0134310" "0135310" "0136310" "0137310" "0138310" "0139310"
# [141] "0140410" "0141410" "0142410" "0143410" "0144410" "0145410" "0146410"
# [148] "0147410" "0148410" "0149410" "0150510" "0151510" "0152510" "0153510"
# [155] "0154510" "0155510" "0156510" "0157510" "0158510" "0159510" "0160610"
# [162] "0161610" "0162610" "0163610" "0164610" "0165610" "0166610" "0167610"
# [169] "0168610" "0169610" "0170710" "0171710" "0172710" "0173710" "0174710"
# [176] "0175710" "0176710" "0177710" "0178710" "0179710" "0180810" "0181810"
# [183] "0182810" "0183810" "0184810" "0185810" "0186810" "0187810" "0188810"
# [190] "0189810" "0190910" "0191910" "0192910" "0193910" "0194910" "0195910"
# [197] "0196910" "0197910" "0198910" "0199910" "0200020" "0201020" "0202020"
# [204] "0203020" "0204020" "0205020" "0206020" "0207020" "0208020" "0209020"
# [211] "0210120" "0211120" "0212120" "0213120" "0214120" "0215120" "0216120"
# [218] "0217120" "0218120" "0219120" "0220220" "0221220" "0222220" "0223220"
# [225] "0224220" "0225220" "0226220" "0227220" "0228220" "0229220" "0230320"
# [232] "0231320" "0232320" "0233320" "0234320" "0235320" "1000001" "1001001"
# [239] "1002001" "1003001" "1004001" "1005001" "1006001" "1007001" "1008001"
# [246] "1009001" "1010101" "1011101" "1012101" "1013101" "1014101" "1015101"
# [253] "1016101" "1017101" "1018101" "1019101" "1020201" "1021201" "1022201"
# [260] "1023201" "1024201" "1025201" "1026201" "1027201" "1028201" "1029201"
# [267] "1030301" "1031301" "1032301" "1033301" "1034301" "1035301" "1036301"
# [274] "1037301" "1038301" "1039301" "1040401" "1041401" "1042401" "1043401"
# [281] "1044401" "1045401" "1046401" "1047401" "1048401" "1049401" "1050501"
# [288] "1051501" "1052501" "1053501" "1054501" "1055501" "1056501" "1057501"
# [295] "1058501" "1059501" "1060601" "1061601" "1062601" "1063601" "1064601"
# [302] "1065601" "1066601" "1067601" "1068601" "1069601" "1070701" "1071701"
# [309] "1072701" "1073701" "1074701" "1075701" "1076701" "1077701" "1078701"
# [316] "1079701" "1080801" "1081801" "1082801" "1083801" "1084801" "1085801"
# [323] "1086801" "1087801" "1088801" "1089801" "1090901" "1091901" "1092901"
# [330] "1093901" "1094901" "1095901" "1096901" "1097901" "1098901" "1099901"
# [337] "1100011" "1101011" "1102011" "1103011" "1104011" "1105011" "1106011"
# [344] "1107011" "1108011" "1109011" "1110111" "1111111" "1112111" "1113111"
# [351] "1114111" "1115111" "1116111" "1117111" "1118111" "1119111" "1120211"
# [358] "1121211" "1122211" "1123211" "1124211" "1125211" "1126211" "1127211"
# [365] "1128211" "1129211" "1130311" "1131311" "1132311" "1133311" "1134311"
# [372] "1135311" "1136311" "1137311" "1138311" "1139311" "1140411" "1141411"
# [379] "1142411" "1143411" "1144411" "1145411" "1146411" "1147411" "1148411"
# [386] "1149411" "1150511" "1151511" "1152511" "1153511" "1154511" "1155511"
# [393] "1156511" "1157511" "1158511" "1159511" "1160611" "1161611" "1162611"
# [400] "1163611" "1164611" "1165611" "1166611" "1167611" "1168611" "1169611"
# [407] "1170711" "1171711" "1172711" "1173711" "1174711" "1175711" "1176711"
# [414] "1177711" "1178711" "1179711" "1180811" "1181811" "1182811" "1183811"
# [421] "1184811" "1185811" "1186811" "1187811" "1188811" "1189811" "1190911"
# [428] "1191911" "1192911" "1193911" "1194911" "1195911" "1196911" "1197911"
# [435] "1198911" "1199911" "1200021" "1201021" "1202021" "1203021" "1204021"
# [442] "1205021" "1206021" "1207021" "1208021" "1209021" "1210121" "1211121"
# [449] "1212121" "1213121" "1214121" "1215121" "1216121" "1217121" "1218121"
# [456] "1219121" "1220221" "1221221" "1222221" "1223221" "1224221" "1225221"
# [463] "1226221" "1227221" "1228221" "1229221" "1230321" "1231321" "1232321"
# [470] "1233321" "1234321" "1235321"
Result 2
r2 <- palindromFun(n1=0, n2=2)
r2
# [1] 708
# attr(,"palindroms")
# [1] "0000000" "0001000" "0002000" "0003000" "0004000" "0005000" "0006000"
# [8] "0007000" "0008000" "0009000" "0010100" "0011100" "0012100" "0013100"
# [15] "0014100" "0015100" "0016100" "0017100" "0018100" "0019100" "0020200"
# [22] "0021200" "0022200" "0023200" "0024200" "0025200" "0026200" "0027200"
# [29] "0028200" "0029200" "0030300" "0031300" "0032300" "0033300" "0034300"
# [36] "0035300" "0036300" "0037300" "0038300" "0039300" "0040400" "0041400"
# [43] "0042400" "0043400" "0044400" "0045400" "0046400" "0047400" "0048400"
# [50] "0049400" "0050500" "0051500" "0052500" "0053500" "0054500" "0055500"
# [57] "0056500" "0057500" "0058500" "0059500" "0060600" "0061600" "0062600"
# [64] "0063600" "0064600" "0065600" "0066600" "0067600" "0068600" "0069600"
# [71] "0070700" "0071700" "0072700" "0073700" "0074700" "0075700" "0076700"
# [78] "0077700" "0078700" "0079700" "0080800" "0081800" "0082800" "0083800"
# [85] "0084800" "0085800" "0086800" "0087800" "0088800" "0089800" "0090900"
# [92] "0091900" "0092900" "0093900" "0094900" "0095900" "0096900" "0097900"
# [99] "0098900" "0099900" "0100010" "0101010" "0102010" "0103010" "0104010"
# [106] "0105010" "0106010" "0107010" "0108010" "0109010" "0110110" "0111110"
# [113] "0112110" "0113110" "0114110" "0115110" "0116110" "0117110" "0118110"
# [120] "0119110" "0120210" "0121210" "0122210" "0123210" "0124210" "0125210"
# [127] "0126210" "0127210" "0128210" "0129210" "0130310" "0131310" "0132310"
# [134] "0133310" "0134310" "0135310" "0136310" "0137310" "0138310" "0139310"
# [141] "0140410" "0141410" "0142410" "0143410" "0144410" "0145410" "0146410"
# [148] "0147410" "0148410" "0149410" "0150510" "0151510" "0152510" "0153510"
# [155] "0154510" "0155510" "0156510" "0157510" "0158510" "0159510" "0160610"
# [162] "0161610" "0162610" "0163610" "0164610" "0165610" "0166610" "0167610"
# [169] "0168610" "0169610" "0170710" "0171710" "0172710" "0173710" "0174710"
# [176] "0175710" "0176710" "0177710" "0178710" "0179710" "0180810" "0181810"
# [183] "0182810" "0183810" "0184810" "0185810" "0186810" "0187810" "0188810"
# [190] "0189810" "0190910" "0191910" "0192910" "0193910" "0194910" "0195910"
# [197] "0196910" "0197910" "0198910" "0199910" "0200020" "0201020" "0202020"
# [204] "0203020" "0204020" "0205020" "0206020" "0207020" "0208020" "0209020"
# [211] "0210120" "0211120" "0212120" "0213120" "0214120" "0215120" "0216120"
# [218] "0217120" "0218120" "0219120" "0220220" "0221220" "0222220" "0223220"
# [225] "0224220" "0225220" "0226220" "0227220" "0228220" "0229220" "0230320"
# [232] "0231320" "0232320" "0233320" "0234320" "0235320" "1000001" "1001001"
# [239] "1002001" "1003001" "1004001" "1005001" "1006001" "1007001" "1008001"
# [246] "1009001" "1010101" "1011101" "1012101" "1013101" "1014101" "1015101"
# [253] "1016101" "1017101" "1018101" "1019101" "1020201" "1021201" "1022201"
# [260] "1023201" "1024201" "1025201" "1026201" "1027201" "1028201" "1029201"
# [267] "1030301" "1031301" "1032301" "1033301" "1034301" "1035301" "1036301"
# [274] "1037301" "1038301" "1039301" "1040401" "1041401" "1042401" "1043401"
# [281] "1044401" "1045401" "1046401" "1047401" "1048401" "1049401" "1050501"
# [288] "1051501" "1052501" "1053501" "1054501" "1055501" "1056501" "1057501"
# [295] "1058501" "1059501" "1060601" "1061601" "1062601" "1063601" "1064601"
# [302] "1065601" "1066601" "1067601" "1068601" "1069601" "1070701" "1071701"
# [309] "1072701" "1073701" "1074701" "1075701" "1076701" "1077701" "1078701"
# [316] "1079701" "1080801" "1081801" "1082801" "1083801" "1084801" "1085801"
# [323] "1086801" "1087801" "1088801" "1089801" "1090901" "1091901" "1092901"
# [330] "1093901" "1094901" "1095901" "1096901" "1097901" "1098901" "1099901"
# [337] "1100011" "1101011" "1102011" "1103011" "1104011" "1105011" "1106011"
# [344] "1107011" "1108011" "1109011" "1110111" "1111111" "1112111" "1113111"
# [351] "1114111" "1115111" "1116111" "1117111" "1118111" "1119111" "1120211"
# [358] "1121211" "1122211" "1123211" "1124211" "1125211" "1126211" "1127211"
# [365] "1128211" "1129211" "1130311" "1131311" "1132311" "1133311" "1134311"
# [372] "1135311" "1136311" "1137311" "1138311" "1139311" "1140411" "1141411"
# [379] "1142411" "1143411" "1144411" "1145411" "1146411" "1147411" "1148411"
# [386] "1149411" "1150511" "1151511" "1152511" "1153511" "1154511" "1155511"
# [393] "1156511" "1157511" "1158511" "1159511" "1160611" "1161611" "1162611"
# [400] "1163611" "1164611" "1165611" "1166611" "1167611" "1168611" "1169611"
# [407] "1170711" "1171711" "1172711" "1173711" "1174711" "1175711" "1176711"
# [414] "1177711" "1178711" "1179711" "1180811" "1181811" "1182811" "1183811"
# [421] "1184811" "1185811" "1186811" "1187811" "1188811" "1189811" "1190911"
# [428] "1191911" "1192911" "1193911" "1194911" "1195911" "1196911" "1197911"
# [435] "1198911" "1199911" "1200021" "1201021" "1202021" "1203021" "1204021"
# [442] "1205021" "1206021" "1207021" "1208021" "1209021" "1210121" "1211121"
# [449] "1212121" "1213121" "1214121" "1215121" "1216121" "1217121" "1218121"
# [456] "1219121" "1220221" "1221221" "1222221" "1223221" "1224221" "1225221"
# [463] "1226221" "1227221" "1228221" "1229221" "1230321" "1231321" "1232321"
# [470] "1233321" "1234321" "1235321" "2000002" "2001002" "2002002" "2003002"
# [477] "2004002" "2005002" "2006002" "2007002" "2008002" "2009002" "2010102"
# [484] "2011102" "2012102" "2013102" "2014102" "2015102" "2016102" "2017102"
# [491] "2018102" "2019102" "2020202" "2021202" "2022202" "2023202" "2024202"
# [498] "2025202" "2026202" "2027202" "2028202" "2029202" "2030302" "2031302"
# [505] "2032302" "2033302" "2034302" "2035302" "2036302" "2037302" "2038302"
# [512] "2039302" "2040402" "2041402" "2042402" "2043402" "2044402" "2045402"
# [519] "2046402" "2047402" "2048402" "2049402" "2050502" "2051502" "2052502"
# [526] "2053502" "2054502" "2055502" "2056502" "2057502" "2058502" "2059502"
# [533] "2060602" "2061602" "2062602" "2063602" "2064602" "2065602" "2066602"
# [540] "2067602" "2068602" "2069602" "2070702" "2071702" "2072702" "2073702"
# [547] "2074702" "2075702" "2076702" "2077702" "2078702" "2079702" "2080802"
# [554] "2081802" "2082802" "2083802" "2084802" "2085802" "2086802" "2087802"
# [561] "2088802" "2089802" "2090902" "2091902" "2092902" "2093902" "2094902"
# [568] "2095902" "2096902" "2097902" "2098902" "2099902" "2100012" "2101012"
# [575] "2102012" "2103012" "2104012" "2105012" "2106012" "2107012" "2108012"
# [582] "2109012" "2110112" "2111112" "2112112" "2113112" "2114112" "2115112"
# [589] "2116112" "2117112" "2118112" "2119112" "2120212" "2121212" "2122212"
# [596] "2123212" "2124212" "2125212" "2126212" "2127212" "2128212" "2129212"
# [603] "2130312" "2131312" "2132312" "2133312" "2134312" "2135312" "2136312"
# [610] "2137312" "2138312" "2139312" "2140412" "2141412" "2142412" "2143412"
# [617] "2144412" "2145412" "2146412" "2147412" "2148412" "2149412" "2150512"
# [624] "2151512" "2152512" "2153512" "2154512" "2155512" "2156512" "2157512"
# [631] "2158512" "2159512" "2160612" "2161612" "2162612" "2163612" "2164612"
# [638] "2165612" "2166612" "2167612" "2168612" "2169612" "2170712" "2171712"
# [645] "2172712" "2173712" "2174712" "2175712" "2176712" "2177712" "2178712"
# [652] "2179712" "2180812" "2181812" "2182812" "2183812" "2184812" "2185812"
# [659] "2186812" "2187812" "2188812" "2189812" "2190912" "2191912" "2192912"
# [666] "2193912" "2194912" "2195912" "2196912" "2197912" "2198912" "2199912"
# [673] "2200022" "2201022" "2202022" "2203022" "2204022" "2205022" "2206022"
# [680] "2207022" "2208022" "2209022" "2210122" "2211122" "2212122" "2213122"
# [687] "2214122" "2215122" "2216122" "2217122" "2218122" "2219122" "2220222"
# [694] "2221222" "2222222" "2223222" "2224222" "2225222" "2226222" "2227222"
# [701] "2228222" "2229222" "2230322" "2231322" "2232322" "2233322" "2234322"
# [708] "2235322"
My number of palindroms seems to be different from yours, though.
Here is a quick method to create the desired sequence using R's builtin time and date functions.
#create the time sequence for every second for 1 day
dateseq <- seq(as.POSIXct("2020-08-15"), as.POSIXct("2020-08-16"), by="1 sec")
#remove the last element (midnight the next day)
dateseq <- dateseq[-86401]
#format the desire
answer <- format(dateseq, "%H%M%S")
tail(answer)
#[1] "235954" "235955" "235956" "235957" "235958" "235959"
Here's one way to approach the entire problem using a functional approach, using only base R. That is, breaking each problem down to a single task and building up the functionality you need:
# Converts strings in the format "1234556" to date times
as_time <- function(chr) {
chr[nchar(chr) == 7] <- paste0("0", chr[nchar(chr) == 7])
strptime(chr, "%d%H%M%S")
}
# Converts date-times to strings in format "1234556"
as_chr <- function(t) {
paste0(as.numeric(substr(t, 9, 10)), strftime(t, "%H%M%S"))
}
# Gets a sequence of valid strings between to strings in format "1234556"
seq_times <- function(t1, t2)
{
as_chr(seq(as_time(t1), as_time(t2), by = "1 sec"))
}
# Reverse strings in a character vector
rev_string <- function(s) {
sapply(s, function(x) intToUtf8(rev(utf8ToInt(x))), USE.NAMES = FALSE)
}
# Returns only the subset of a given character vector that are palindromes
get_palindromes <- function(t1, t2) {
str <- seq_times(t1, t2)
str[str == rev_string(str)]
}
So now we can do:
get_palindromes("1000000", "2000000")
#> [1] "1000001" "1001001" "1002001" "1003001" "1004001" "1005001" "1010101"
#> [8] "1011101" "1012101" "1013101" "1014101" "1015101" "1020201" "1021201"
#> [15] "1022201" "1023201" "1024201" "1025201" "1030301" "1031301" "1032301"
#> [22] "1033301" "1034301" "1035301" "1040401" "1041401" "1042401" "1043401"
#> [29] "1044401" "1045401" "1050501" "1051501" "1052501" "1053501" "1054501"
#> [36] "1055501" "1060601" "1061601" "1062601" "1063601" "1064601" "1065601"
#> [43] "1070701" "1071701" "1072701" "1073701" "1074701" "1075701" "1080801"
#> [50] "1081801" "1082801" "1083801" "1084801" "1085801" "1090901" "1091901"
#> [57] "1092901" "1093901" "1094901" "1095901" "1100011" "1101011" "1102011"
#> [64] "1103011" "1104011" "1105011" "1110111" "1111111" "1112111" "1113111"
#> [71] "1114111" "1115111" "1120211" "1121211" "1122211" "1123211" "1124211"
#> [78] "1125211" "1130311" "1131311" "1132311" "1133311" "1134311" "1135311"
#> [85] "1140411" "1141411" "1142411" "1143411" "1144411" "1145411" "1150511"
#> [92] "1151511" "1152511" "1153511" "1154511" "1155511" "1160611" "1161611"
#> [99] "1162611" "1163611" "1164611" "1165611" "1170711" "1171711" "1172711"
#> [106] "1173711" "1174711" "1175711" "1180811" "1181811" "1182811" "1183811"
#> [113] "1184811" "1185811" "1190911" "1191911" "1192911" "1193911" "1194911"
#> [120] "1195911" "1200021" "1201021" "1202021" "1203021" "1204021" "1205021"
#> [127] "1210121" "1211121" "1212121" "1213121" "1214121" "1215121" "1220221"
#> [134] "1221221" "1222221" "1223221" "1224221" "1225221" "1230321" "1231321"
#> [141] "1232321" "1233321" "1234321" "1235321"
and
get_palindromes("2235000", "3060000")
#> [1] "2235322" "3000003" "3001003" "3002003" "3003003" "3004003" "3005003"
#> [8] "3010103" "3011103" "3012103" "3013103" "3014103" "3015103" "3020203"
#> [15] "3021203" "3022203" "3023203" "3024203" "3025203" "3030303" "3031303"
#> [22] "3032303" "3033303" "3034303" "3035303" "3040403" "3041403" "3042403"
#> [29] "3043403" "3044403" "3045403" "3050503" "3051503" "3052503" "3053503"
#> [36] "3054503" "3055503"
What do you mean by the length? If you mean the count then I think we can use of simple math to see how many possibilities are there.
Let us say for n1 = 1 and n2 =2, out of 7 places available(dhhmmss), you can have only 2 choices for the 1st and the 7th place. Now for the remaining 6 places, we need to think only about first 3 places as the rest of them will be same as the first three( by the palindrome logic).
Now for the 2nd place, we can have only 3 choices(0, 1, 2 as we can only have the hour from 00 to 23, just consider the ten's place). Let us store the value at the 2nd place to a variable h. Next, we have 3rd place which can have 10, 10 and 4 choices for h={0,1,2} respectively. Following that, we have the 4th place which can only have 6 choices( ranging from 00 to 59,here just the ten's place).
Hence, the total choices are 2*[10+10+4]*6 = 288 choices.
You can use rep() to create the various time elements (days, hours,etc) and then expand.grid() to get every combination of the elements. stri_reverse() from stringi can be used to compare the reverse of the string and thus establish if it is a palindrome.
find_palindrome<-function(day_start,day_end){
day<-rep(day_start:day_end)
hour<-rep(0:23)
min_sec<-rep(0:59)
#expand.grid() finds every combination of inputs
#min_sec is used twice within expand.grid(), once for minutes and once for seconds.
# The "%02d" within sprint() preserves a 2-digit length (e.g. '01' instead of '1'.)
df<-expand.grid(day, sprintf("%02d",hour), sprintf("%02d",min_sec), sprintf("%02d",min_sec))
df<-as.data.frame(df)
#create a column concatinating the values
df$compare1<-paste(df[,1],df[,2], df[,3], df[,4], sep="")
#reverse the order in another column
df$compare2<-stringi::stri_reverse(df$compare1)
#compare the numbers to find your palendromes
palindrone<-df$compare1[df$compare1 == df$compare2]
return(palindrone)
}
Then run the function:
#example using day 0 to day 2
find_palindrome(0,2)
I am sure this has been asked and solved before, but probably I am searching for the wrong terms. I cannot find the relevant thread.
In R, I would like to generate all possible words / strings, where each position can take only a set of values, like
pos1 can be ABC
pos2 can be ABCD
pos3 can be ABC
pos4 can be BCD
etc.
Eg.: BABC is a solution but DABC is not.
If you can point me towards a solution, I would really appreciate!
Thanks for your time!
... timb!, timc!, timd! ...
thx,
Bud
In Base R we can do the following
pos1 <- c('A','B','C')
pos2 <- c('A','B','C','D')
pos4 <- c('B','C','D')
AllPos <- list(pos1,pos2,pos3,pos4)
result <- AllPos[1]
for(i in AllPos[-1] ){
result <- apply(merge(result ,i),1,paste0,collapse="")
}
> result
[1] "AAAB" "BAAB" "CAAB" "ABAB" "BBAB" "CBAB" "ACAB" "BCAB" "CCAB" "ADAB"
[11] "BDAB" "CDAB" "AABB" "BABB" "CABB" "ABBB" "BBBB" "CBBB" "ACBB" "BCBB"
[21] "CCBB" "ADBB" "BDBB" "CDBB" "AACB" "BACB" "CACB" "ABCB" "BBCB" "CBCB"
[31] "ACCB" "BCCB" "CCCB" "ADCB" "BDCB" "CDCB" "AAAC" "BAAC" "CAAC" "ABAC"
[41] "BBAC" "CBAC" "ACAC" "BCAC" "CCAC" "ADAC" "BDAC" "CDAC" "AABC" "BABC"
[51] "CABC" "ABBC" "BBBC" "CBBC" "ACBC" "BCBC" "CCBC" "ADBC" "BDBC" "CDBC"
[61] "AACC" "BACC" "CACC" "ABCC" "BBCC" "CBCC" "ACCC" "BCCC" "CCCC" "ADCC"
[71] "BDCC" "CDCC" "AAAD" "BAAD" "CAAD" "ABAD" "BBAD" "CBAD" "ACAD" "BCAD"
[81] "CCAD" "ADAD" "BDAD" "CDAD" "AABD" "BABD" "CABD" "ABBD" "BBBD" "CBBD"
[91] "ACBD" "BCBD" "CCBD" "ADBD" "BDBD" "CDBD" "AACD" "BACD" "CACD" "ABCD"
[101] "BBCD" "CBCD" "ACCD" "BCCD" "CCCD" "ADCD" "BDCD" "CDCD"
A quick and dirty base R solution...
p1 <- "ABC"
p2 <- "ABCD"
p3 <- "ABC"
p4 <- "BCD"
apply(expand.grid(strsplit(p1, "")[[1]], strsplit(p2, "")[[1]],
strsplit(p3, "")[[1]], strsplit(p4, "")[[1]]), 1, paste0,
collapse = "")
#> [1] "AAAB" "BAAB" "CAAB" "ABAB" "BBAB" "CBAB" "ACAB" "BCAB" "CCAB" "ADAB"
#> [11] "BDAB" "CDAB" "AABB" "BABB" "CABB" "ABBB" "BBBB" "CBBB" "ACBB" "BCBB"
#> [21] "CCBB" "ADBB" "BDBB" "CDBB" "AACB" "BACB" "CACB" "ABCB" "BBCB" "CBCB"
#> [31] "ACCB" "BCCB" "CCCB" "ADCB" "BDCB" "CDCB" "AAAC" "BAAC" "CAAC" "ABAC"
#> [41] "BBAC" "CBAC" "ACAC" "BCAC" "CCAC" "ADAC" "BDAC" "CDAC" "AABC" "BABC"
#> [51] "CABC" "ABBC" "BBBC" "CBBC" "ACBC" "BCBC" "CCBC" "ADBC" "BDBC" "CDBC"
#> [61] "AACC" "BACC" "CACC" "ABCC" "BBCC" "CBCC" "ACCC" "BCCC" "CCCC" "ADCC"
#> [71] "BDCC" "CDCC" "AAAD" "BAAD" "CAAD" "ABAD" "BBAD" "CBAD" "ACAD" "BCAD"
#> [81] "CCAD" "ADAD" "BDAD" "CDAD" "AABD" "BABD" "CABD" "ABBD" "BBBD" "CBBD"
#> [91] "ACBD" "BCBD" "CCBD" "ADBD" "BDBD" "CDBD" "AACD" "BACD" "CACD" "ABCD"
#> [101] "BBCD" "CBCD" "ACCD" "BCCD" "CCCD" "ADCD" "BDCD" "CDCD"
Created on 2020-06-18 by the reprex package (v0.3.0)
expand.grid is your friend here.
A simple solution:
apply(expand.grid(list(
LETTERS[1:3],
LETTERS[1:4],
LETTERS[1:3],
LETTERS[2:4])), 1, paste, collapse = "")
#> [1] "AAAB" "BAAB" "CAAB" "ABAB" "BBAB" "CBAB" "ACAB" "BCAB" "CCAB" "ADAB"
#> [11] "BDAB" "CDAB" "AABB" "BABB" "CABB" "ABBB" "BBBB" "CBBB" "ACBB" "BCBB"
#> [21] "CCBB" "ADBB" "BDBB" "CDBB" "AACB" "BACB" "CACB" "ABCB" "BBCB" "CBCB"
#> [31] "ACCB" "BCCB" "CCCB" "ADCB" "BDCB" "CDCB" "AAAC" "BAAC" "CAAC" "ABAC"
#> [41] "BBAC" "CBAC" "ACAC" "BCAC" "CCAC" "ADAC" "BDAC" "CDAC" "AABC" "BABC"
#> [51] "CABC" "ABBC" "BBBC" "CBBC" "ACBC" "BCBC" "CCBC" "ADBC" "BDBC" "CDBC"
#> [61] "AACC" "BACC" "CACC" "ABCC" "BBCC" "CBCC" "ACCC" "BCCC" "CCCC" "ADCC"
#> [71] "BDCC" "CDCC" "AAAD" "BAAD" "CAAD" "ABAD" "BBAD" "CBAD" "ACAD" "BCAD"
#> [81] "CCAD" "ADAD" "BDAD" "CDAD" "AABD" "BABD" "CABD" "ABBD" "BBBD" "CBBD"
#> [91] "ACBD" "BCBD" "CCBD" "ADBD" "BDBD" "CDBD" "AACD" "BACD" "CACD" "ABCD"
#> [101] "BBCD" "CBCD" "ACCD" "BCCD" "CCCD" "ADCD" "BDCD" "CDCD"
Created on 2020-06-18 by the reprex package (v0.3.0)
I have a word and want to output in R all possible deviatons (replacement, substitution, insertion) for a fixed distance value into a vector.
For instance, the word "Cat" and a fixed distance value of 1 results in a vector with the elements "cot", "at", ...
I'm going to assume that you want all actual words, not just permutations of the characters with an edit distance of 1 that would include non-words such as "zat".
We can do this using adist() to compute the edit distance between your target word and all eligible English words, taken from some word list. Here, I used the English syllable dictionary from the quanteda package (you did tag this question as quanteda after all) but this could have been any vector of English dictionary words from any other source as well.
To narrow things down, we first exclude all words that are different in length from the target word by your distance value.
distfn <- function(word, distance = 1) {
# select eligible words for efficiency
eligible_y_words <- names(quanteda::data_int_syllables)
wordlengths <- nchar(eligible_y_words)
eligible_y_words <- eligible_y_words[wordlengths >= (nchar(word) - distance) &
wordlengths <= (nchar(word) + distance)]
# compute Levenshtein distance
distances <- utils::adist(word, eligible_y_words)[1, ]
# return only those for the requested distance value
eligible_y_words[distances == distance]
}
distfn("cat", 1)
## [1] "at" "bat" "ca" "cab" "cac" "cad" "cai" "cal" "cam" "can"
## [11] "cant" "cao" "cap" "caq" "car" "cart" "cas" "cast" "cate" "cato"
## [21] "cats" "catt" "cau" "caw" "cay" "chat" "coat" "cot" "ct" "cut"
## [31] "dat" "eat" "fat" "gat" "hat" "kat" "lat" "mat" "nat" "oat"
## [41] "pat" "rat" "sat" "scat" "tat" "vat" "wat"
To demonstrate how this works on longer words, with alternative distance values.
distfn("coffee", 1)
## [1] "caffee" "coffeen" "coffees" "coffel" "coffer" "coffey" "cuffee"
## [8] "toffee"
distfn("coffee", 2)
## [1] "caffey" "calfee" "chafee" "chaffee" "cofer" "coffee's"
## [7] "coffelt" "coffers" "coffin" "cofide" "cohee" "coiffe"
## [13] "coiffed" "colee" "colfer" "combee" "comfed" "confer"
## [19] "conlee" "coppee" "cottee" "coulee" "coutee" "cuffe"
## [25] "cuffed" "diffee" "duffee" "hoffer" "jaffee" "joffe"
## [31] "mcaffee" "moffet" "noffke" "offen" "offer" "roffe"
## [37] "scoffed" "soffel" "soffer" "yoffie"
(Yes, according to the CMU pronunciation dictionary, those are all actual words...)
EDIT: Make for all permutations of letters, not just actual words
This involves permutations from the alphabet that have the fixed edit distances from the input word. Here I've done it not particular efficiently by forming all permutations of letters within the eligible ranges, and then computing their edit distance from the target word, and then selecting them. So it's a variation of above, except instead of a dictionary, it uses permuted words.
distfn2 <- function(word, distance = 1) {
result <- character()
# start with deletions
for (i in max((nchar(word) - distance), 0):(nchar(word) - 1)) {
result <- c(
result,
combn(unlist(strsplit(word, "", fixed = TRUE)), i,
paste,
collapse = "", simplify = TRUE
)
)
}
# now for changes and insertions
for (i in (nchar(word)):(nchar(word) + distance)) {
# all possible edits
edits <- apply(expand.grid(rep(list(letters), i)),
1, paste0,
collapse = ""
)
# remove original word
edits <- edits[edits != word]
# get all distances, add to result
distances <- utils::adist(word, edits)[1, ]
result <- c(result, edits[distances == distance])
}
result
}
For the OP example:
distfn2("cat", 1)
## [1] "ca" "ct" "at" "caa" "cab" "cac" "cad" "cae" "caf" "cag"
## [11] "cah" "cai" "caj" "cak" "cal" "cam" "can" "cao" "cap" "caq"
## [21] "car" "cas" "aat" "bat" "dat" "eat" "fat" "gat" "hat" "iat"
## [31] "jat" "kat" "lat" "mat" "nat" "oat" "pat" "qat" "rat" "sat"
## [41] "tat" "uat" "vat" "wat" "xat" "yat" "zat" "cbt" "cct" "cdt"
## [51] "cet" "cft" "cgt" "cht" "cit" "cjt" "ckt" "clt" "cmt" "cnt"
## [61] "cot" "cpt" "cqt" "crt" "cst" "ctt" "cut" "cvt" "cwt" "cxt"
## [71] "cyt" "czt" "cau" "cav" "caw" "cax" "cay" "caz" "cata" "catb"
## [81] "catc" "catd" "cate" "catf" "catg" "cath" "cati" "catj" "catk" "catl"
## [91] "catm" "catn" "cato" "catp" "catq" "catr" "cats" "caat" "cbat" "acat"
## [101] "bcat" "ccat" "dcat" "ecat" "fcat" "gcat" "hcat" "icat" "jcat" "kcat"
## [111] "lcat" "mcat" "ncat" "ocat" "pcat" "qcat" "rcat" "scat" "tcat" "ucat"
## [121] "vcat" "wcat" "xcat" "ycat" "zcat" "cdat" "ceat" "cfat" "cgat" "chat"
## [131] "ciat" "cjat" "ckat" "clat" "cmat" "cnat" "coat" "cpat" "cqat" "crat"
## [141] "csat" "ctat" "cuat" "cvat" "cwat" "cxat" "cyat" "czat" "cabt" "cact"
## [151] "cadt" "caet" "caft" "cagt" "caht" "cait" "cajt" "cakt" "calt" "camt"
## [161] "cant" "caot" "capt" "caqt" "cart" "cast" "catt" "caut" "cavt" "cawt"
## [171] "caxt" "cayt" "cazt" "catu" "catv" "catw" "catx" "caty" "catz"
Also works with other edit distances, although it becomes very slow for longer words.
d2 <- distfn2("cat", 2)
set.seed(100)
c(head(d2, 50), sample(d2, 50), tail(d2, 50))
## [1] "c" "a" "t" "ca" "ct" "at" "aaa" "baa"
## [9] "daa" "eaa" "faa" "gaa" "haa" "iaa" "jaa" "kaa"
## [17] "laa" "maa" "naa" "oaa" "paa" "qaa" "raa" "saa"
## [25] "taa" "uaa" "vaa" "waa" "xaa" "yaa" "zaa" "cba"
## [33] "aca" "bca" "cca" "dca" "eca" "fca" "gca" "hca"
## [41] "ica" "jca" "kca" "lca" "mca" "nca" "oca" "pca"
## [49] "qca" "rca" "cnts" "cian" "pcatb" "cqo" "uawt" "hazt"
## [57] "cpxat" "aaet" "ckata" "caod" "ncatl" "qcamt" "cdtp" "qajt"
## [65] "bckat" "qcatr" "cqah" "rcbt" "cvbt" "bbcat" "vcaz" "ylcat"
## [73] "cahz" "jcgat" "mant" "jatd" "czlat" "cbamt" "cajta" "cafp"
## [81] "cizt" "cmaut" "qwat" "jcazt" "hdcat" "ucant" "hate" "cajtl"
## [89] "caaty" "cix" "nmat" "cajit" "cmnat" "caobt" "catoi" "ncau"
## [97] "ucoat" "ncamt" "jath" "oats" "chatz" "ciatz" "cjatz" "ckatz"
## [105] "clatz" "cmatz" "cnatz" "coatz" "cpatz" "cqatz" "cratz" "csatz"
## [113] "ctatz" "cuatz" "cvatz" "cwatz" "cxatz" "cyatz" "czatz" "cabtz"
## [121] "cactz" "cadtz" "caetz" "caftz" "cagtz" "cahtz" "caitz" "cajtz"
## [129] "caktz" "caltz" "camtz" "cantz" "caotz" "captz" "caqtz" "cartz"
## [137] "castz" "cattz" "cautz" "cavtz" "cawtz" "caxtz" "caytz" "caztz"
## [145] "catuz" "catvz" "catwz" "catxz" "catyz" "catzz"
This could be speeded up by less brute force formation of all permutations and then applying adist() to them - it could consist of changes or insertions of known edit distances generated algorithmically from letters.
using the variable list below I want to for all combinations, join the variables into a string seperated by "+"
l_ALLVar_list <- c("a","b","c","d","z1","z2","z3")
I have the code to generate the 127 combinations
all_combos=do.call("c", lapply(seq_along(l_ALLVar_list), function(i) combn(l_ALLVar_list, i, FUN = list)))
and using position 66 as an example
> all_combos[66]
[[1]]
[1] "a" "b" "c" "z2"
I want to be able to join the elements of these at index 66 into the string a+b+c+z2
I have tried
str_c(c(lol[66]),collapse=',')
but it comes back as
c(\"weight\", \"length\", \"wheel_base\", \"city_mpg\")
paste(all_combos[66], collapse = '')
produces the same again
any help would be appreciated
You can use the FUN argument in combn to paste all the combinations of l_ALLVar_list in one call, eliminating the need for your all_combos list.
unlist(lapply(seq_along(l_ALLVar_list), combn, x=l_ALLVar_list, paste, collapse="+"))
# [1] "a" "b" "c" "d" "z1"
# [6] "z2" "z3" "a+b" "a+c" "a+d"
# [11] "a+z1" "a+z2" "a+z3" "b+c" "b+d"
# [16] "b+z1" "b+z2" "b+z3" "c+d" "c+z1"
# [21] "c+z2" "c+z3" "d+z1" "d+z2" "d+z3"
# [26] "z1+z2" "z1+z3" "z2+z3" "a+b+c" "a+b+d"
# [31] "a+b+z1" "a+b+z2" "a+b+z3" "a+c+d" "a+c+z1"
# [36] "a+c+z2" "a+c+z3" "a+d+z1" "a+d+z2" "a+d+z3"
# [41] "a+z1+z2" "a+z1+z3" "a+z2+z3" "b+c+d" "b+c+z1"
# [46] "b+c+z2" "b+c+z3" "b+d+z1" "b+d+z2" "b+d+z3"
# [51] "b+z1+z2" "b+z1+z3" "b+z2+z3" "c+d+z1" "c+d+z2"
# [56] "c+d+z3" "c+z1+z2" "c+z1+z3" "c+z2+z3" "d+z1+z2"
# [61] "d+z1+z3" "d+z2+z3" "z1+z2+z3" "a+b+c+d" "a+b+c+z1"
# [66] "a+b+c+z2" "a+b+c+z3" "a+b+d+z1" "a+b+d+z2" "a+b+d+z3"
# [71] "a+b+z1+z2" "a+b+z1+z3" "a+b+z2+z3" "a+c+d+z1" "a+c+d+z2"
# [76] "a+c+d+z3" "a+c+z1+z2" "a+c+z1+z3" "a+c+z2+z3" "a+d+z1+z2"
# [81] "a+d+z1+z3" "a+d+z2+z3" "a+z1+z2+z3" "b+c+d+z1" "b+c+d+z2"
# [86] "b+c+d+z3" "b+c+z1+z2" "b+c+z1+z3" "b+c+z2+z3" "b+d+z1+z2"
# [91] "b+d+z1+z3" "b+d+z2+z3" "b+z1+z2+z3" "c+d+z1+z2" "c+d+z1+z3"
# [96] "c+d+z2+z3" "c+z1+z2+z3" "d+z1+z2+z3" "a+b+c+d+z1" "a+b+c+d+z2"
#[101] "a+b+c+d+z3" "a+b+c+z1+z2" "a+b+c+z1+z3" "a+b+c+z2+z3" "a+b+d+z1+z2"
#[106] "a+b+d+z1+z3" "a+b+d+z2+z3" "a+b+z1+z2+z3" "a+c+d+z1+z2" "a+c+d+z1+z3"
#[111] "a+c+d+z2+z3" "a+c+z1+z2+z3" "a+d+z1+z2+z3" "b+c+d+z1+z2" "b+c+d+z1+z3"
#[116] "b+c+d+z2+z3" "b+c+z1+z2+z3" "b+d+z1+z2+z3" "c+d+z1+z2+z3" "a+b+c+d+z1+z2"
#[121] "a+b+c+d+z1+z3" "a+b+c+d+z2+z3" "a+b+c+z1+z2+z3" "a+b+d+z1+z2+z3" "a+c+d+z1+z2+z3"
#[126] "b+c+d+z1+z2+z3" "a+b+c+d+z1+z2+z3"
Use lapply to do paste for each item in your list:
result <- unlist(lapply(all_combos,
function(c) do.call(paste, c(as.list(c), sep="+"))))
> result[66:70]
[1] "a+b+c+z2" "a+b+c+z3" "a+b+d+z1" "a+b+d+z2" "a+b+d+z3"