R: find first non-NA observation in data.table column by group - r

I have a data.table with many missing values and I want a variable which gives me a 1 for the first non-missing value in each group.
Say I have such a data.table:
library(data.table)
DT <- data.table(iris)[,.(Petal.Width,Species)]
DT[c(1:10,15,45:50,51:70,101:134),Petal.Width:=NA]
which now has missings in the beginning, at the end and in between. I have tried two versions, one is:
DT[min(which(!is.na(Petal.Width))),first_available:=1,by=Species]
but it only finds the global minimum (in this case, setosa gets the correct 1), not the minimum by group. I think this is the case because data.table first subsets by i, then sorts by group, correct? So it will only work with the row that is the global minimum of which(!is.na(Petal.Width)) which is the first non-NA value.
A second attempt with the test in j:
DT[,first_available:= ifelse(min(which(!is.na(Petal.Width))),1,0),by=Species]
which just returns a column of 1s. Here, I don't have a good explanation as to why it doesn't work.
my goal is this:
DT[,first_available:=0]
DT[c(11,71,135),first_available:=1]
but in reality I have hundreds of groups. Any help would be appreciated!
Edit: this question does come close but is not targeted at NA's and does not solve the issue here if I understand it correctly. I tried:
DT <- data.table(DT, key = c('Species'))
DT[unique(DT[,key(DT), with = FALSE]), mult = 'first']

Here's one way:
DT[!is.na(Petal.Width), first := as.integer(seq_len(.N) == 1L), by = Species]

We can try
DT[DT[, .I[which.max(!is.na(Petal.Width))] , Species]$V1,
first_available := 1][is.na(first_available), first_available := 0]
Or a slightly more compact option is
DT[, first_available := as.integer(1:nrow(DT) %in%
DT[, .I[!is.na(Petal.Width)][1L], by = Species]$V1)][]

> DT[!is.na(DT$Petal.Width) & DT$first_available == 1]
# Petal.Width Species first_available
# 1: 0.2 setosa 1
# 2: 1.8 versicolor 1
# 3: 1.4 virginica 1
> rownames(DT)[!is.na(DT$Petal.Width) & DT$first_available == 1]
# [1] "11" "71" "135"
> rownames(DT)[!is.na(DT$Petal.Width) & DT$first_available == 0]
# [1] "12" "13" "14" "16" "17" "18" "19" "20" "21" "22" "23" "24"
# [13] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36"
# [25] "37" "38" "39" "40" "41" "42" "43" "44" "72" "73" "74" "75"
# [37] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87"
# [49] "88" "89" "90" "91" "92" "93" "94" "95" "96" "97" "98" "99"
# [61] "100" "136" "137" "138" "139" "140" "141" "142" "143" "144" "145" "146"
# [73] "147" "148" "149" "150"

Related

Unique string combinations of consecutive integers

I have a vector of integers from 0-9 and need all unique possible combinations of these consecutive vector elements, including the original elements.
> vec <- 0:9
> vec
[1] 0 1 2 3 4 5 6 7 8 9
The task is similar to this question. The major (and tricky) difference is that I only need consecutive combinations (e.g "0", "01", "012", ... "0123456789", ... "1", ... "123456789") and not non-consecutive combinations (such as "013").
How would I go about creating this subset of combinations?
Here is a nested sapply approach
unlist(
sapply(
seq_along(vec),
function(k) {
sapply(
k:length(vec),
function(l) paste0(vec[k:l], collapse = "")
)
}
)
)
which produces
[1] "0" "01" "012" "0123" "01234"
[6] "012345" "0123456" "01234567" "012345678" "0123456789"
[11] "1" "12" "123" "1234" "12345"
[16] "123456" "1234567" "12345678" "123456789" "2"
[21] "23" "234" "2345" "23456" "234567"
[26] "2345678" "23456789" "3" "34" "345"
[31] "3456" "34567" "345678" "3456789" "4"
[36] "45" "456" "4567" "45678" "456789"
[41] "5" "56" "567" "5678" "56789"
[46] "6" "67" "678" "6789" "7"
[51] "78" "789" "8" "89" "9"
Another option is sapply + embed
unlist(
sapply(
seq_along(vec),
function(k) {
do.call(paste0, rev(data.frame(embed(vec, k))))
}
)
)
which gives
[1] "0" "1" "2" "3" "4"
[6] "5" "6" "7" "8" "9"
[11] "01" "12" "23" "34" "45"
[16] "56" "67" "78" "89" "012"
[21] "123" "234" "345" "456" "567"
[26] "678" "789" "0123" "1234" "2345"
[31] "3456" "4567" "5678" "6789" "01234"
[36] "12345" "23456" "34567" "45678" "56789"
[41] "012345" "123456" "234567" "345678" "456789"
[46] "0123456" "1234567" "2345678" "3456789" "01234567"
[51] "12345678" "23456789" "012345678" "123456789" "0123456789"
Here is a one-liner with RcppAlgos (I am the author):
RcppAlgos::comboGeneral(10, 2, repetition = TRUE, FUN = function(r) {
paste((0:9)[r[1]:r[2]], collapse = "")
}, FUN.VALUE = "a")
[1] "0" "01" "012" "0123" "01234" "012345"
[7] "0123456" "01234567" "012345678" "0123456789" "1" "12"
[13] "123" "1234" "12345" "123456" "1234567" "12345678"
[19] "123456789" "2" "23" "234" "2345" "23456"
[25] "234567" "2345678" "23456789" "3" "34" "345"
[31] "3456" "34567" "345678" "3456789" "4" "45"
[37] "456" "4567" "45678" "456789" "5" "56"
[43] "567" "5678" "56789" "6" "67" "678"
[49] "6789" "7" "78" "789" "8" "89"
[55] "9"
Another possible solution, based on purrr::map:
library(tidyverse)
map(0:9, \(x) map(x:9, \(y) str_c(x:y, collapse = ""))) %>% unlist
#> [1] "0" "01" "012" "0123" "01234"
#> [6] "012345" "0123456" "01234567" "012345678" "0123456789"
#> [11] "1" "12" "123" "1234" "12345"
#> [16] "123456" "1234567" "12345678" "123456789" "2"
#> [21] "23" "234" "2345" "23456" "234567"
#> [26] "2345678" "23456789" "3" "34" "345"
#> [31] "3456" "34567" "345678" "3456789" "4"
#> [36] "45" "456" "4567" "45678" "456789"
#> [41] "5" "56" "567" "5678" "56789"
#> [46] "6" "67" "678" "6789" "7"
#> [51] "78" "789" "8" "89" "9"

How to find unique couples of numbers in a vector in R?

Suppose we have C<-c(1,2,3,4,5)
I want to find all the unique couples of numbers that can be extracted from this vector, e.g., 12, 13, 23, etc. How can I do it?
One option could be:
na.omit(c(`diag<-`(sapply(x, paste0, x), NA)))
[1] "12" "13" "14" "15" "21" "23" "24" "25" "31" "32" "34" "35" "41" "42" "43" "45"
[17] "51" "52" "53" "54"
Using RcppAlgos package.
## Combinations
unlist(RcppAlgos::comboGeneral(x, 2, FUN=function(x) Reduce(paste0, x)))
# [1] "12" "13" "14" "15" "23" "24" "25" "34" "35" "45"
## Permutations
unlist(RcppAlgos::permuteGeneral(x, 2, FUN=function(x) Reduce(paste0, x)))
# [1] "12" "13" "14" "15" "21" "23" "24" "25" "31" "32" "34" "35" "41" "42" "43"
# [16] "45" "51" "52" "53" "54"

How to mutate columns but keep rownames in R pipe?

r$> iris %>% .[which(as.numeric(rownames(.))%%3!=0),] %>% rownames
[1] "1" "2" "4" "5" "7" "8" "10" "11" "13" "14" "16" "17" "19" "20" "22" "23" "25" "26" "28" "29" "31" "32" "34" "35" "37" "38" "40" "41" "43" "44" "46"
[32] "47" "49" "50" "52" "53" "55" "56" "58" "59" "61" "62" "64" "65" "67" "68" "70" "71" "73" "74" "76" "77" "79" "80" "82" "83" "85" "86" "88" "89" "91" "92"
[63] "94" "95" "97" "98" "100" "101" "103" "104" "106" "107" "109" "110" "112" "113" "115" "116" "118" "119" "121" "122" "124" "125" "127" "128" "130" "131" "133" "134" "136" "137" "139"
[94] "140" "142" "143" "145" "146" "148" "149"
r$> iris %>% .[which(as.numeric(rownames(.))%%3!=0),] %>% mutate(Sepal.Length=Sepal.Length+1) %>% rownames
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31"
[32] "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60" "61" "62"
[63] "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90" "91" "92" "93"
[94] "94" "95" "96" "97" "98" "99" "100"
I like mutate() because it's easy to use in a pipe. As the example above shows, the rownames change after mutate.
I need to mutate columns but keep the rownames unchanged. How can I do this in an R pipe?
That is because mutate (or dplyr in general) resets the rownames to start from 1 after any operation; hence, it does not maintain the original rownames.
If you need them for further manipulation, store them as a column.
library(dplyr)
iris %>%
.[which(as.numeric(rownames(.))%%3!=0),] %>%
mutate(row = rownames(.),
Sepal.Length=Sepal.Length+1) %>%
pull(row)
# [1] "1" "2" "4" "5" "7" "8" "10" "11" "13" "14" "16" "17" "19" "20" "22" "23" "25" "26"
# [19] "28" "29" "31" "32" "34" "35" "37" "38" "40" "41" "43" "44" "46" "47" "49" "50" "52" "53"
# [37] "55" "56" "58" "59" "61" "62" "64" "65" "67" "68" "70" "71" "73" "74" "76" "77" "79" "80"
# [55] "82" "83" "85" "86" "88" "89" "91" "92" "94" "95" "97" "98" "100" "101" "103" "104" "106" "107"
# [73] "109" "110" "112" "113" "115" "116" "118" "119" "121" "122" "124" "125" "127" "128" "130" "131" "133" "134"
# [91] "136" "137" "139" "140" "142" "143" "145" "146" "148" "149"
iris %>% tibble::rownames_to_column(., 'rowname') %>% .[which(as.numeric(rownames(.))%%3!=0),] %>% mutate(Sepal.Length=Sepal.Length+1) %>% tibble::column_to_rownames(.,'rowname')
Having rownames goes against dplyr's dataframe structure principles. See Hadley's reasoning here.
Best workaround if you want to stick to dplyr is to first store the rownames as a column using rownames_to_column, do your manipulation, then reinstate the rownames using column_to_rownames (both functions are from the tibble package).
Using your example:
library(dplyr)
library(tibble)
iris %>%
.[which(as.numeric(rownames(.))%%3!=0),] %>%
# store rownames in a column
rownames_to_column(var = "rowid") %>%
# dplyr manipulation
mutate(Sepal.Length=Sepal.Length+1) %>%
# reinstate rownames
column_to_rownames(var = "rowid") %>%
# check rownames
rownames()
# [1] "1" "2" "4" "5" "7" "8" "10" "11" "13" "14" "16" "17" "19" "20" "22" "23" "25" "26" "28" "29" "31" "32" "34" "35" "37" "38"
# [27] "40" "41" "43" "44" "46" "47" "49" "50" "52" "53" "55" "56" "58" "59" "61" "62" "64" "65" "67" "68" "70" "71" "73" "74" "76" "77"
# [53] "79" "80" "82" "83" "85" "86" "88" "89" "91" "92" "94" "95" "97" "98" "100" "101" "103" "104" "106" "107" "109" "110" "112" "113" "115" "116"
# [79] "118" "119" "121" "122" "124" "125" "127" "128" "130" "131" "133" "134" "136" "137" "139" "140" "142" "143" "145" "146" "148" "149"

When I try to process a DocumentTermMatrix dataset using the R function apply, I get an error

Below is my DTM type data set:
View(sms_dtm_freq_train)
sms_dtm_freq_train[["dimnames"]]
$Docs
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
[15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
[29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
[43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
[57] "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70"
[71] "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
[85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96" "97" "98"
[99] "99" "100" "101" "102" "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"
..........
[ reached getOption("max.print") -- omitted 4057 entries ]
$Terms
[1] "â£wk" "…" "–" "abiola" "abl" "abt"
[7] "accept" "access" "account" "across" "activ" "actual"
[13] "add" "address" "admir" "adult" "advanc" "aft"
[19] "afternoon" "aftr" "age" "ago" "ahead" "aight"
[25] "aint" "air" "aiyah" "alex" "almost" "alon"
[31] "alreadi" "alright" "alrit" "also" "alway" "amp"
[37] "angri" "announc" "anoth" "answer" "anybodi" "anymor"
[43] "anyon" "anyth" "anytim" "anyway" "apart" "app"
[49] "appli" "appoint" "appreci" "april" "ard" "area"
[55] "argument" "arm" "around" "arrang" "arrest" "arriv"
[61] "asap" "ask" "askd" "asleep" "ass" "attempt"
[67] "auction" "avail" "ave" "avoid" "await" "award"
[73] "away" "awesom" "babe" "babi" "back" "bad"
[79] "bag" "bak" "balanc" "bank" "bare" "bath"
[85] "batteri" "bcoz" "bcum" "bday" "beauti" "becom"
[91] "bed" "bedroom" "begin" "believ" "belli" "best"
[97] "better" "bid" "big" "bill" "bird" "birthday"
..............
[ reached getOption("max.print") -- omitted 1057 entries ]
When I run:
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
I got the error messages below:
Error in apply(sms_dtm_freq_train, MARGIN = 2, convert_counts) :
dim(X) must have a positive length
I typed the same code as in the textbook "Machine Learning with R", but I got this error.
I am very confused.
Can anyone help me solve this problem?
Thanks!

Split dataset to training, cross-validation and test dataset in R. ifelse returns unexpected result

I want to write a function that will split a dataframe to train, cross-validation and test sets.
My code is the following, exemplified by a small dataset:
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
train <- 0.7
test <- 0.6
# Function_split_test_train_regression <- function(data, train, test, seed){
set.seed(seed)
setDT(data)
data[, index := row.names(data)]
train_index <- sample(data$index, train * nrow(data))
test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index), test * length(setdiff(data$index, train_index))))
# etc
#}
At this point I ran some checks and got a result that surprised me:
> test == 1
[1] FALSE
> sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index)))
[1] "225" "186" "41" "381" "356" "178" "147" "158" "21" "259" "207" "159" "250" "167" "128" "218" "271" "197" "376" "19" "77"
[22] "205" "46" "3" "212" "238" "61" "11" "68" "130" "200" "274" "127" "305" "201" "32" "48" "184" "290" "349" "155" "370"
[43] "366" "333" "243" "161" "108" "65" "125" "306" "357" "189" "337" "118" "364" "6" "149" "87" "252" "194" "362" "383" "93"
[64] "38" "18" "322" "220" "307" "60" "353"
> test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index))))
> test_index
[1] "219"
Why does ifelse return "219" rather than the value of its third (no) argument, given that the condition test == 1 evaluates to FALSE?
Your advice will be appreciated.
================================================================================
EDIT
Following the suggestion made in the comments I changed the code replacing the name test with the name test_fraction but the problem remained. The new code:
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
train_fraction <- 0.7
test_fraction <- 0.6
# Function_split_test_crossval_train_regr <- function(data, train, test, seed){
set.seed(seed)
setDT(data)
data[, index := row.names(data)]
train_index <- sample(data$index, train_fraction * nrow(data))
test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
test_fraction * length(setdiff(data$index, train_index))))
#}
The results:
> train_index
[1] "119" "118" "143" "344" "293" "341" "305" "95" "82" "58" "226" "35" "363" "111" "84" "137" "24" "151" "381" "110" "93"
[22] "198" "133" "6" "112" "228" "62" "36" "165" "353" "271" "385" "322" "291" "316" "268" "333" "37" "377" "176" "343" "281"
[43] "245" "75" "238" "183" "215" "68" "274" "64" "224" "391" "26" "83" "66" "308" "1" "372" "161" "170" "300" "52" "30"
[64] "15" "57" "148" "312" "311" "194" "367" "27" "342" "260" "181" "163" "171" "193" "210" "327" "248" "172" "263" "47" "351"
[85] "166" "292" "278" "61" "116" "204" "309" "200" "96" "330" "383" "346" "249" "368" "41" "38" "235" "4" "77" "273" "191"
[106] "212" "99" "31" "286" "79" "184" "284" "267" "374" "355" "358" "124" "114" "335" "70" "203" "379" "14" "287" "67" "34"
[127] "340" "127" "91" "222" "240" "387" "357" "242" "310" "347" "142" "103" "105" "117" "189" "361" "177" "126" "392" "5" "317"
[148] "174" "352" "87" "234" "147" "202" "261" "277" "214" "290" "339" "109" "43" "120" "169" "318" "56" "94" "115" "314" "320"
[169] "276" "237" "296" "307" "23" "186" "360" "146" "313" "152" "206" "328" "60" "195" "69" "107" "97" "92" "325" "20" "362"
[190] "157" "101" "10" "192" "134" "251" "259" "2" "29" "265" "331" "144" "63" "384" "81" "338" "364" "213" "380" "150" "48"
[211] "54" "354" "187" "283" "356" "389" "72" "32" "121" "376" "33" "359" "349" "239" "241" "232" "196" "74" "156" "201" "390"
[232] "326" "285" "51" "131" "304" "85" "45" "336" "280" "178" "128" "98" "275" "246" "65" "39" "188" "55" "90" "197" "9"
[253] "173" "40" "295" "149" "230" "140" "135" "236" "21" "369" "301" "220" "122" "253" "208" "388" "159" "282" "88" "158" "167"
[274] "257"
> sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index)))
[1] "337" "378" "164" "225" "16" "44" "221" "179" "25" "28" "324" "175" "139" "154" "17" "252" "211" "155" "233" "162" "130"
[22] "216" "255" "190" "365" "373" "73" "207" "42" "3" "348" "227" "49" "12" "53" "315" "199" "256" "129" "375" "205" "18"
[43] "289" "168" "264" "160" "145" "382" "136" "302" "185" "323" "100" "270" "113" "294" "247" "345" "209" "104" "321" "7" "138"
[64] "78" "386" "366" "298" "231" "86" "19"
> test_fraction == 1
[1] FALSE
> test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index))))
> test_index
[1] "28"
I have no idea why this is happening, I hope someone come with an explanation.
But I found a solution to your problem. You need to pass the arguments to the object inside the ifelse():
ifelse(
test_fraction == 1,
test_index <- setdiff(data$index, train_index),
test_index <- sample(setdiff(data$index, train_index),test_fraction * length(setdiff(data$index, train_index)))
)
I don't know if this is bad practice or not, but it works. It can also be used to make assignments under multiple conditions, as in my answer here.

Resources