Using "LETTERS" to add a row to dataset - r

I have a dataset that contains 301 columns. I want to add a row that holds a unique letter label for each column, starting with A and continuing (A, B, ..., Z, AA, AB, ...) until the final column. What I have is:
DF:
Var1 Var2.....Var78 Var79...Var130V Var131
What I want in my extra row is:
DF:
Var1 Var2.....Var78 Var79...Var130V Var131
NR A B CA CB EA EB
I have tried
DF[[nrow(DF)+1,]<- rep(letters)
DF[nrow(DF)+1,]<- paste(letters:ncol)
Any help would be appreciated!

You could have the same unique letter headers that you would get in "Excel" columns using this one-liner:
# Excel-style labels: the 26 single letters, then every two-letter pair
# ("AA", "AB", ..., "ZZ"); keep the first 301 of them.
head(c(LETTERS, as.vector(sapply(LETTERS, paste0, LETTERS))), 301)
#> [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M"
#> [14] "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y" "Z"
#> [27] "AA" "AB" "AC" "AD" "AE" "AF" "AG" "AH" "AI" "AJ" "AK" "AL" "AM"
#> [40] "AN" "AO" "AP" "AQ" "AR" "AS" "AT" "AU" "AV" "AW" "AX" "AY" "AZ"
#> [53] "BA" "BB" "BC" "BD" "BE" "BF" "BG" "BH" "BI" "BJ" "BK" "BL" "BM"
#> [66] "BN" "BO" "BP" "BQ" "BR" "BS" "BT" "BU" "BV" "BW" "BX" "BY" "BZ"
#> [79] "CA" "CB" "CC" "CD" "CE" "CF" "CG" "CH" "CI" "CJ" "CK" "CL" "CM"
#> [92] "CN" "CO" "CP" "CQ" "CR" "CS" "CT" "CU" "CV" "CW" "CX" "CY" "CZ"
#> [105] "DA" "DB" "DC" "DD" "DE" "DF" "DG" "DH" "DI" "DJ" "DK" "DL" "DM"
#> [118] "DN" "DO" "DP" "DQ" "DR" "DS" "DT" "DU" "DV" "DW" "DX" "DY" "DZ"
#> [131] "EA" "EB" "EC" "ED" "EE" "EF" "EG" "EH" "EI" "EJ" "EK" "EL" "EM"
#> [144] "EN" "EO" "EP" "EQ" "ER" "ES" "ET" "EU" "EV" "EW" "EX" "EY" "EZ"
#> [157] "FA" "FB" "FC" "FD" "FE" "FF" "FG" "FH" "FI" "FJ" "FK" "FL" "FM"
#> [170] "FN" "FO" "FP" "FQ" "FR" "FS" "FT" "FU" "FV" "FW" "FX" "FY" "FZ"
#> [183] "GA" "GB" "GC" "GD" "GE" "GF" "GG" "GH" "GI" "GJ" "GK" "GL" "GM"
#> [196] "GN" "GO" "GP" "GQ" "GR" "GS" "GT" "GU" "GV" "GW" "GX" "GY" "GZ"
#> [209] "HA" "HB" "HC" "HD" "HE" "HF" "HG" "HH" "HI" "HJ" "HK" "HL" "HM"
#> [222] "HN" "HO" "HP" "HQ" "HR" "HS" "HT" "HU" "HV" "HW" "HX" "HY" "HZ"
#> [235] "IA" "IB" "IC" "ID" "IE" "IF" "IG" "IH" "II" "IJ" "IK" "IL" "IM"
#> [248] "IN" "IO" "IP" "IQ" "IR" "IS" "IT" "IU" "IV" "IW" "IX" "IY" "IZ"
#> [261] "JA" "JB" "JC" "JD" "JE" "JF" "JG" "JH" "JI" "JJ" "JK" "JL" "JM"
#> [274] "JN" "JO" "JP" "JQ" "JR" "JS" "JT" "JU" "JV" "JW" "JX" "JY" "JZ"
#> [287] "KA" "KB" "KC" "KD" "KE" "KF" "KG" "KH" "KI" "KJ" "KK" "KL" "KM"
#> [300] "KN" "KO"
To write this into your first row in DF you could do:
# Prepend a row of Excel-style column labels ("A".."Z", "AA".."ZZ") to DF.
# The number of labels follows ncol(DF) instead of a hard-coded 301, so the
# same statement works for any data frame with up to 702 columns.
DF <- rbind(
  c(LETTERS, c(sapply(LETTERS, paste0, LETTERS)))[seq_len(ncol(DF))],
  DF
)

Related

Unique string combinations of consecutive integers

I have a vector of integers from 0-9 and need all unique possible combinations of these consecutive vector elements, including the original elements.
> vec <- 0:9
> vec
[1] 0 1 2 3 4 5 6 7 8 9
The task is similar to this question. The major (and tricky) difference is that I only need consecutive combinations (e.g "0", "01", "012", ... "0123456789", ... "1", ... "123456789") and not non-consecutive combinations (such as "013").
How would I go about creating this subset of combinations?
Here is a nested sapply approach
# Enumerate every run of consecutive elements: for each start position, paste
# vec[lo:hi] for every end position from lo through the last element.
unlist(
  lapply(
    seq_along(vec),
    function(lo) {
      vapply(
        lo:length(vec),
        function(hi) paste0(vec[lo:hi], collapse = ""),
        character(1)
      )
    }
  )
)
which produces
[1] "0" "01" "012" "0123" "01234"
[6] "012345" "0123456" "01234567" "012345678" "0123456789"
[11] "1" "12" "123" "1234" "12345"
[16] "123456" "1234567" "12345678" "123456789" "2"
[21] "23" "234" "2345" "23456" "234567"
[26] "2345678" "23456789" "3" "34" "345"
[31] "3456" "34567" "345678" "3456789" "4"
[36] "45" "456" "4567" "45678" "456789"
[41] "5" "56" "567" "5678" "56789"
[46] "6" "67" "678" "6789" "7"
[51] "78" "789" "8" "89" "9"
Another option is sapply + embed
# Enumerate runs grouped by length: embed() yields one row per window of the
# given width, and the columns are reversed before pasting so that each string
# reads in the original order of vec.
unlist(
  lapply(
    seq_along(vec),
    function(width) {
      windows <- data.frame(embed(vec, width))
      do.call(paste0, rev(windows))
    }
  )
)
which gives
[1] "0" "1" "2" "3" "4"
[6] "5" "6" "7" "8" "9"
[11] "01" "12" "23" "34" "45"
[16] "56" "67" "78" "89" "012"
[21] "123" "234" "345" "456" "567"
[26] "678" "789" "0123" "1234" "2345"
[31] "3456" "4567" "5678" "6789" "01234"
[36] "12345" "23456" "34567" "45678" "56789"
[41] "012345" "123456" "234567" "345678" "456789"
[46] "0123456" "1234567" "2345678" "3456789" "01234567"
[51] "12345678" "23456789" "012345678" "123456789" "0123456789"
Here is a one-liner with RcppAlgos (I am the author):
# Each r holds two indices into 0:9; r[1]:r[2] selects the run between them.
# With repetition = TRUE the two indices may be equal, which gives the
# single-digit strings. The printed result is consistent with r[1] <= r[2]
# for every pair -- confirm against the RcppAlgos documentation.
RcppAlgos::comboGeneral(10, 2, repetition = TRUE, FUN = function(r) {
paste((0:9)[r[1]:r[2]], collapse = "")
# FUN.VALUE = "a" presumably declares a character(1) result per combination,
# in the manner of vapply() -- TODO confirm.
}, FUN.VALUE = "a")
[1] "0" "01" "012" "0123" "01234" "012345"
[7] "0123456" "01234567" "012345678" "0123456789" "1" "12"
[13] "123" "1234" "12345" "123456" "1234567" "12345678"
[19] "123456789" "2" "23" "234" "2345" "23456"
[25] "234567" "2345678" "23456789" "3" "34" "345"
[31] "3456" "34567" "345678" "3456789" "4" "45"
[37] "456" "4567" "45678" "456789" "5" "56"
[43] "567" "5678" "56789" "6" "67" "678"
[49] "6789" "7" "78" "789" "8" "89"
[55] "9"
Another possible solution, based on purrr::map:
library(tidyverse)
# For each starting digit, build every run that ends at the same or a later
# digit, then flatten the nested list into one character vector.
map(0:9, \(lo) {
  map(lo:9, \(hi) str_c(lo:hi, collapse = ""))
}) %>%
  unlist()
#> [1] "0" "01" "012" "0123" "01234"
#> [6] "012345" "0123456" "01234567" "012345678" "0123456789"
#> [11] "1" "12" "123" "1234" "12345"
#> [16] "123456" "1234567" "12345678" "123456789" "2"
#> [21] "23" "234" "2345" "23456" "234567"
#> [26] "2345678" "23456789" "3" "34" "345"
#> [31] "3456" "34567" "345678" "3456789" "4"
#> [36] "45" "456" "4567" "45678" "456789"
#> [41] "5" "56" "567" "5678" "56789"
#> [46] "6" "67" "678" "6789" "7"
#> [51] "78" "789" "8" "89" "9"

How to make conditional merging on a list of lists in R?

I have a list.
I want to compare the elements pairwise and then merge them if they meet a criterion.
sample list:
[[1]]
[1] "466" "1758" "975"
[[2]]
[1] "1528" "185" "975"
[[3]]
[1] "1528" "185" "975"
[[4]]
[1] "2892" "758" "971"
[[5]]
[1] "1321" "274" "969"
[[6]]
[1] "1321" "274" "969"
[[7]]
[1] "115" "1321" "969"
[[8]]
[1] "1321" "441" "969"
[[9]]
[1] "504" "61" "948"
[[10]]
[1] "504" "61" "948"
Suppose the criterion is simply: if one element is equal to the next element, merge them and remove the second one from the list.
Expected output:
[[1]]
[1] "466" "1758" "975"
[[2]]
[1] "1528" "185" "975" "1528" "185" "975"
[[3]]
[1] "2892" "758" "971"
[[4]]
[1] "1321" "274" "969" "1321" "274" "969"
[[5]]
[1] "115" "1321" "969"
[[6]]
[1] "1321" "441" "969"
[[7]]
[1] "504" "61" "948" "504" "61" "948"
Thanks
We could create a logical index with duplicated, extract the elements from the list and concatenate with Map, update the list and extract only those elements based on the index
# Flag the later copy of each repeated list element ...
i1 <- duplicated(lst1)
# ... and the earlier copy (duplicates detected scanning from the end).
i2 <- duplicated(lst1, fromLast = TRUE)
# Work on a copy so lst1 itself is left unchanged.
lst2 <- lst1
# Replace each later copy by the later and earlier copies concatenated.
# NOTE(review): lst1[i1] and lst1[i2] are paired by position, which assumes
# every repeated element occurs exactly twice and the pairs keep the same
# relative order -- confirm for your data.
lst2[i1] <- Map(c, lst1[i1], lst1[i2])
# Drop the earlier copies; the unique and the merged elements remain.
lst2[!i2]
#[[1]]
#[1] "466" "1758" "975"
#[[2]]
#[1] "1528" "185" "975" "1528" "185" "975"
#[[3]]
#[1] "2892" "758" "971"
#[[4]]
#[1] "1321" "274" "969" "1321" "274" "969"
#[[5]]
#[1] "115" "1321" "969"
#[[6]]
#[1] "1321" "441" "969"
#[[7]]
#[1] "504" "61" "948" "504" "61" "948"
Or using split
# Alternative to the Map() step: group the repeated elements by content and
# flatten each group into one vector.
# split() needs one grouping value per list element. Passing the list itself
# as `f` makes split() use the interaction of its components, which mixes
# unrelated elements, so build a single string key per element instead.
dup <- lst1[i1 | i2]
key <- vapply(dup, paste, character(1), collapse = "\x1f")
# Levels in order of first appearance keep the groups aligned with the
# positions selected by i1 (split() would otherwise sort the keys).
lst2[i1] <- unname(
  lapply(split(dup, factor(key, levels = unique(key))), unlist)
)
data
# Sample data: ten character triples; elements 2-3, 5-6 and 9-10 are exact
# duplicates of each other.
lst1 <- list(
  c("466", "1758", "975"),
  c("1528", "185", "975"),
  c("1528", "185", "975"),
  c("2892", "758", "971"),
  c("1321", "274", "969"),
  c("1321", "274", "969"),
  c("115", "1321", "969"),
  c("1321", "441", "969"),
  c("504", "61", "948"),
  c("504", "61", "948")
)

When I process a DocumentTermMatrix dataset with the R function apply, I get an error

Below is my DTM type data set:
View(sms_dtm_freq_train)
sms_dtm_freq_train[["dimnames"]]
$Docs
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
[15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
[29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
[43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
[57] "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70"
[71] "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
[85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96" "97" "98"
[99] "99" "100" "101" "102" "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"
..........
[ reached getOption("max.print") -- omitted 4057 entries ]
$Terms
[1] "â£wk" "…" "–" "abiola" "abl" "abt"
[7] "accept" "access" "account" "across" "activ" "actual"
[13] "add" "address" "admir" "adult" "advanc" "aft"
[19] "afternoon" "aftr" "age" "ago" "ahead" "aight"
[25] "aint" "air" "aiyah" "alex" "almost" "alon"
[31] "alreadi" "alright" "alrit" "also" "alway" "amp"
[37] "angri" "announc" "anoth" "answer" "anybodi" "anymor"
[43] "anyon" "anyth" "anytim" "anyway" "apart" "app"
[49] "appli" "appoint" "appreci" "april" "ard" "area"
[55] "argument" "arm" "around" "arrang" "arrest" "arriv"
[61] "asap" "ask" "askd" "asleep" "ass" "attempt"
[67] "auction" "avail" "ave" "avoid" "await" "award"
[73] "away" "awesom" "babe" "babi" "back" "bad"
[79] "bag" "bak" "balanc" "bank" "bare" "bath"
[85] "batteri" "bcoz" "bcum" "bday" "beauti" "becom"
[91] "bed" "bedroom" "begin" "believ" "belli" "best"
[97] "better" "bid" "big" "bill" "bird" "birthday"
..............
[ reached getOption("max.print") -- omitted 1057 entries ]
When i run:
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
I got the error messages below:
Error in apply(sms_dtm_freq_train, MARGIN = 2, convert_counts) :
dim(X) must have a positive length
I typed the same code as in the textbook "Machine Learning with R", but I got this error.
I am very confused.
Can anyone help me solve this problem?
Thanks!

Split dataset to training, cross-validation and test dataset in R. ifelse returns unexpected result

I want to write a function that will split a data frame into training, cross-validation and test sets.
My code is the following, exemplified by a small dataset:
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
train <- 0.7
test <- 0.6
# Function_split_test_train_regression <- function(data, train, test, seed){
set.seed(seed)
setDT(data)
data[, index := row.names(data)]
train_index <- sample(data$index, train * nrow(data))
test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index), test * length(setdiff(data$index, train_index))))
# etc
#}
At this point I make some checks and get a result that surprises me:
> test == 1
[1] FALSE
> sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index)))
[1] "225" "186" "41" "381" "356" "178" "147" "158" "21" "259" "207" "159" "250" "167" "128" "218" "271" "197" "376" "19" "77"
[22] "205" "46" "3" "212" "238" "61" "11" "68" "130" "200" "274" "127" "305" "201" "32" "48" "184" "290" "349" "155" "370"
[43] "366" "333" "243" "161" "108" "65" "125" "306" "357" "189" "337" "118" "364" "6" "149" "87" "252" "194" "362" "383" "93"
[64] "38" "18" "322" "220" "307" "60" "353"
> test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index))))
> test_index
[1] "219"
Why does ifelse return "219" rather than the value of the second alternative (the `no` argument), given that the condition test == 1 evaluates to FALSE?
Your advice will be appreciated.
================================================================================
EDIT
Following the suggestion made in the comments I changed the code replacing the name test with the name test_fraction but the problem remained. The new code:
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
train_fraction <- 0.7
test_fraction <- 0.6
# Function_split_test_crossval_train_regr <- function(data, train, test, seed){
set.seed(seed)
setDT(data)
data[, index := row.names(data)]
train_index <- sample(data$index, train_fraction * nrow(data))
test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
test_fraction * length(setdiff(data$index, train_index))))
#}
The results:
> train_index
[1] "119" "118" "143" "344" "293" "341" "305" "95" "82" "58" "226" "35" "363" "111" "84" "137" "24" "151" "381" "110" "93"
[22] "198" "133" "6" "112" "228" "62" "36" "165" "353" "271" "385" "322" "291" "316" "268" "333" "37" "377" "176" "343" "281"
[43] "245" "75" "238" "183" "215" "68" "274" "64" "224" "391" "26" "83" "66" "308" "1" "372" "161" "170" "300" "52" "30"
[64] "15" "57" "148" "312" "311" "194" "367" "27" "342" "260" "181" "163" "171" "193" "210" "327" "248" "172" "263" "47" "351"
[85] "166" "292" "278" "61" "116" "204" "309" "200" "96" "330" "383" "346" "249" "368" "41" "38" "235" "4" "77" "273" "191"
[106] "212" "99" "31" "286" "79" "184" "284" "267" "374" "355" "358" "124" "114" "335" "70" "203" "379" "14" "287" "67" "34"
[127] "340" "127" "91" "222" "240" "387" "357" "242" "310" "347" "142" "103" "105" "117" "189" "361" "177" "126" "392" "5" "317"
[148] "174" "352" "87" "234" "147" "202" "261" "277" "214" "290" "339" "109" "43" "120" "169" "318" "56" "94" "115" "314" "320"
[169] "276" "237" "296" "307" "23" "186" "360" "146" "313" "152" "206" "328" "60" "195" "69" "107" "97" "92" "325" "20" "362"
[190] "157" "101" "10" "192" "134" "251" "259" "2" "29" "265" "331" "144" "63" "384" "81" "338" "364" "213" "380" "150" "48"
[211] "54" "354" "187" "283" "356" "389" "72" "32" "121" "376" "33" "359" "349" "239" "241" "232" "196" "74" "156" "201" "390"
[232] "326" "285" "51" "131" "304" "85" "45" "336" "280" "178" "128" "98" "275" "246" "65" "39" "188" "55" "90" "197" "9"
[253] "173" "40" "295" "149" "230" "140" "135" "236" "21" "369" "301" "220" "122" "253" "208" "388" "159" "282" "88" "158" "167"
[274] "257"
> sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index)))
[1] "337" "378" "164" "225" "16" "44" "221" "179" "25" "28" "324" "175" "139" "154" "17" "252" "211" "155" "233" "162" "130"
[22] "216" "255" "190" "365" "373" "73" "207" "42" "3" "348" "227" "49" "12" "53" "315" "199" "256" "129" "375" "205" "18"
[43] "289" "168" "264" "160" "145" "382" "136" "302" "185" "323" "100" "270" "113" "294" "247" "345" "209" "104" "321" "7" "138"
[64] "78" "386" "366" "298" "231" "86" "19"
> test_fraction == 1
[1] FALSE
> test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index))))
> test_index
[1] "28"
I have no idea why this is happening; I hope someone comes up with an explanation.
But I found a solution to your problem. You need to assign the result to the object inside the ifelse() call:
# The assignment sits inside each branch, so whichever branch ifelse()
# evaluates stores the full vector in test_index as a side effect. The value
# returned by ifelse() itself has the length of the test (here 1) and is not
# used.
# For a scalar condition the idiomatic form is a plain if/else:
#   test_index <- if (test_fraction == 1) <all remaining> else <sample of them>
ifelse(
test_fraction == 1,
test_index <- setdiff(data$index, train_index),
test_index <- sample(setdiff(data$index, train_index),test_fraction * length(setdiff(data$index, train_index)))
)
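I don't know whether this is bad practice or not, but it works. (Note: ifelse() returns a result with the same length as its test, so a length-one test yields a single element; the assignment inside the selected branch still receives the whole vector.) It can also be used to handle multiple conditions, like in my answer here.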

R: find first non-NA observation in data.table column by group

I have a data.table with many missing values and I want a variable which gives me a 1 for the first non-missing value in each group.
Say I have such a data.table:
library(data.table)
DT <- data.table(iris)[,.(Petal.Width,Species)]
DT[c(1:10,15,45:50,51:70,101:134),Petal.Width:=NA]
which now has missings in the beginning, at the end and in between. I have tried two versions, one is:
DT[min(which(!is.na(Petal.Width))),first_available:=1,by=Species]
but it only finds the global minimum (in this case, setosa gets the correct 1), not the minimum by group. I think this is the case because data.table first subsets by i, then sorts by group, correct? So it will only work with the row that is the global minimum of which(!is.na(Petal.Width)) which is the first non-NA value.
A second attempt with the test in j:
DT[,first_available:= ifelse(min(which(!is.na(Petal.Width))),1,0),by=Species]
which just returns a column of 1s. Here, I don't have a good explanation as to why it doesn't work.
my goal is this:
DT[,first_available:=0]
DT[c(11,71,135),first_available:=1]
but in reality I have hundreds of groups. Any help would be appreciated!
Edit: this question does come close but is not targeted at NA's and does not solve the issue here if I understand it correctly. I tried:
DT <- data.table(DT, key = c('Species'))
DT[unique(DT[,key(DT), with = FALSE]), mult = 'first']
Here's one way:
# Among the rows where Petal.Width is not NA, mark the first row of each
# Species with 1L and the others with 0L.
# NOTE(review): rows excluded by the i-expression are not assigned, so `first`
# presumably stays NA there (not 0) -- confirm if a 0/1 column is required.
DT[!is.na(Petal.Width), first := as.integer(seq_len(.N) == 1L), by = Species]
We can try
# Inner query: per Species, which.max() on the logical vector gives the
# position of the first TRUE (the first non-NA value) and .I turns it into a
# row number of DT. Those rows get 1; the chained call then sets the rows that
# are still NA to 0.
# NOTE(review): for a group that is entirely NA, which.max() on an all-FALSE
# vector presumably returns 1 and would flag that group's first row -- confirm.
DT[DT[, .I[which.max(!is.na(Petal.Width))] , Species]$V1,
first_available := 1][is.na(first_available), first_available := 0]
Or a slightly more compact option is
# Inner query: per Species, the row number (.I) of the first non-NA
# Petal.Width. A row is flagged 1 when its own row number is in that set and
# 0 otherwise. seq_len() replaces 1:nrow(DT) so that an empty table yields an
# empty index rather than c(1, 0). The trailing [] prints the updated table.
DT[, first_available := as.integer(seq_len(nrow(DT)) %in%
  DT[, .I[!is.na(Petal.Width)][1L], by = Species]$V1)][]
> DT[!is.na(DT$Petal.Width) & DT$first_available == 1]
# Petal.Width Species first_available
# 1: 0.2 setosa 1
# 2: 1.8 versicolor 1
# 3: 1.4 virginica 1
> rownames(DT)[!is.na(DT$Petal.Width) & DT$first_available == 1]
# [1] "11" "71" "135"
> rownames(DT)[!is.na(DT$Petal.Width) & DT$first_available == 0]
# [1] "12" "13" "14" "16" "17" "18" "19" "20" "21" "22" "23" "24"
# [13] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36"
# [25] "37" "38" "39" "40" "41" "42" "43" "44" "72" "73" "74" "75"
# [37] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87"
# [49] "88" "89" "90" "91" "92" "93" "94" "95" "96" "97" "98" "99"
# [61] "100" "136" "137" "138" "139" "140" "141" "142" "143" "144" "145" "146"
# [73] "147" "148" "149" "150"

Resources