How to combine levels with some automation in R - r

I have a variable in my dataset for which there are about 2000 factor levels, but many of the levels are quite similar:
"7" "BE Classifieds - Village Voice - Display"
"8" "Bridgevine Online"
"9" "British Columbia BNE - Group 1"
"10" "British Columbia BNE - Group 10"
"11" "British Columbia BNE - Group 11"
"12" "British Columbia BNE - Group 12"
"13" "British Columbia BNE - Group 13"
"14" "British Columbia BNE - Group 14"
"15" "British Columbia BNE - Group 2"
"16" "British Columbia BNE - Group 3"
"17" "British Columbia BNE - Group 4"
"18" "British Columbia BNE - Group 5"
"19" "British Columbia BNE - Group 6"
"20" "British Columbia BNE - Group 7"
"21" "British Columbia BNE - Group 8"
"22" "British Columbia BNE - Group 9"
"23" "buyjustenergydefault"
"24" "C2CBetaBrands"
"25" "C2CBetaElectrcity"
"26" "C2CBetaJE-ES"
"27" "CallerID Callback - Energy Savings"
"28" "Choose Energy Transfers"
"29" "Choose Energy Warm Transfers"
"30" "Clear Corporate Teletech Telesales Transfer to JE"
"31" "Clear Internet Live Transfer - TX, GA"
"32" "Clear Internet Transfer - Non-ISTA"
"33" "commenergy"
"34" "Commercial to Residential Transfer"
"35" "Coreg"
"36" "Customer Service-Energy Savings"
"37" "CW Leads - Non transfer (ISTA)"
"38" "CW Leads - Non transfer (Non-ISTA)"
"39" "D* Just Clean In-Line"
"40" "D* Transfer - Scripting Test"
"41" "Default/Unknown Program"
"42" "DirectStar Live Transfer - Retentions"
"43" "DStar (FC) Transfer"
"44" "DStar Leads - Non transfer (ISTA)"
"45" "DStar Leads - Non transfer (Non-ISTA)"
"46" "DStar Outbound Friends and Family"
"47" "Dstar to Energy Georgia Transfer"
"48" "DStar Transfer - Non-ISTA"
"49" "DStar Transfer - TX, GA"
"50" "electricdotcomwebsitedefault"
"51" "ES-AEP-HAILO-GOOG-DTOP-Competitors-AB"
"52" "ES-AEP-HAILO-GOOG-DTOP-Nonbrand-AB"
"53" "ES-AEP-HAILO-GOOG-MOBI-NT-Competitor-AB"
"54" "ES-AEP-HAILO-GOOG-MOBI-NT-Competitors-AB-Unspecified"
"55" "ES-AEP-HAILO-GOOG-MOBI-NT-Nonbrand-AB"
"56" "es-albertaenergyprovidersca-webdefault"
"57" "ES-CEDS-HAILO-GOOG-DTOP-Brand-GA"
"58" "ES-CEDS-HAILO-GOOG-DTOP-Brand-NJ"
"59" "ES-CEDS-HAILO-GOOG-DTOP-Brand-PA"
"60" "ES-CEDS-HAILO-GOOG-DTOP-Competitors-GA"
"61" "ES-CEDS-HAILO-GOOG-DTOP-Competitors-NJ"
"62" "ES-CEDS-HAILO-GOOG-DTOP-Content-GA"
"63" "ES-CEDS-HAILO-GOOG-DTOP-NonBrand-GA"
"64" "ES-CEDS-HAILO-GOOG-DTOP-Nonbrand-NJ"
"65" "ES-CEDS-HAILO-GOOG-DTOP-Nonbrand-PA"
"66" "ES-CEDS-HAILO-GOOG-MOBI-NT-Brand-GA"
"67" "ES-CEDS-HAILO-GOOG-MOBI-NT-Brand-NJ"
"68" "ES-CEDS-HAILO-GOOG-MOBI-NT-Brand-PA"
"69" "ES-CEDS-HAILO-GOOG-MOBI-NT-Competitors-GA"
"70" "ES-CEDS-HAILO-GOOG-MOBI-NT-Competitors-GA-Unspecified"
"71" "ES-CEDS-HAILO-GOOG-MOBI-NT-Competitors-NJ"
"72" "ES-CEDS-HAILO-GOOG-MOBI-NT-Competitors-NJ-Unspecified"
"73" "ES-CEDS-HAILO-GOOG-MOBI-NT-Competitors-PA"
"74" "ES-CEDS-HAILO-GOOG-MOBI-NT-Nonbrand-GA"
"75" "ES-CEDS-HAILO-GOOG-MOBI-NT-Nonbrand-NJ"
"76" "ES-CEDS-HAILO-GOOG-MOBI-NT-Nonbrand-PA"
"77" "ES-CEDS-HAILO-MSN-DTOP-Brand-GA"
"78" "ES-CEDS-HAILO-MSN-DTOP-Brand-NJ"
"79" "ES-CEDS-HAILO-MSN-DTOP-Brand-PA"
"80" "ES-CEDS-HAILO-MSN-DTOP-Competitors-GA"
"81" "ES-CEDS-HAILO-MSN-DTOP-Competitors-NJ"
"82" "ES-CEDS-HAILO-MSN-DTOP-Nonbrand-GA"
"83" "ES-CEDS-HAILO-MSN-DTOP-Nonbrand-NJ"
"84" "ES-CEDS-HAILO-MSN-DTOP-Nonbrand-PA"
"85" "ES-EYSG-HAILO-GOOG-DTOP-Competitors-NY"
"86" "ES-EYSG-HAILO-GOOG-DTOP-Competitors-TX"
"87" "ES-EYSG-HAILO-GOOG-DTOP-Nonbrand-TX"
"88" "ES-Google-JE-AB"
"89" "es-launchpad-electriccompanycom-webdefault"
"90" "ES-NAT-DTOP-DENT-eEXST-NONE-justenergy.com-011B"
"91" "ES-NAT-DTOP-DENT-eEXST-NONE-justenergy.com-059nsAB"
"92" "ES-NAT-DTOP-DENT-eGEOS[direct-energy-alberta]-NONE-albertaenergyproviders.com-DEF"
"93" "ES-NAT-DTOP-DENT-eHOME-NONE-EnergySavings.com-DEF"
"94" "ES-NAT-DTOP-DENT-eHOME-NONE-JustEnergy.com-011B"
"95" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-021nsNY"
"96" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-024nsOH"
"97" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-025nsMA"
"98" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-034nsON"
"99" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-036nsAB"
"100" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-037nsNJ"
"101" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-046nsAB"
"102" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-053AnsTX"
"103" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-059nsAB"
"104" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-086BnsTX"
"105" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-DEFnsCA"
"106" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-DEFnsIL"
"107" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-DEFnsIN"
"108" "ES-NAT-DTOP-DENT-eHOME-NONE-justenergy.com-DEFnsPA"
"109" "ES-NAT-DTOP-DENT-eHOME-NONE-newyork.justenergy.com-DEF"
"110" "ES-NAT-DTOP-DENT-eHOME-NONE-saveonenergy.com-DEF{IL}Just"
"111" "ES-NAT-DTOP-DENT-eHOME-NONE-saveonenergy.com-DEF{MA}Just"
"112" "ES-NAT-DTOP-DENT-eHOME-NONE-saveonenergy.com-DEF{NJ}Commerce"
"113" "ES-NAT-DTOP-DENT-eHOME-NONE-saveonenergy.com-DEF{PA}Commerce"
"114" "ES-NAT-DTOP-DENT-eHOME-NONE-saveonenergy.com-DEF{PA}Just"
"115" "ES-NAT-DTOP-DENT-eHOME-NONE-texas.justenergy.com-DEF"
"116" "ES-NAT-DTOP-DENT-eOTHR-NONE-EnergySavings.com-DEF"
"117" "ES-NAT-DTOP-DENT-eOTHR-NONE-JustEnergy.com-011B"
"118" "ES-NAT-DTOP-DENT-eOTHR-NONE-justenergy.com-021nsNY"
"119" "ES-NAT-DTOP-DENT-eOTHR-NONE-justenergy.com-046nsAB"
"120" "ES-NAT-DTOP-DENT-eOTHR-NONE-justenergy.com-059nsAB"
"121" "ES-NAT-DTOP-DENT-eOTHR-NONE-saveonenergy.com-DEF{MA}Just"
"122" "ES-NAT-DTOP-DENT-eOTHR-NONE-saveonenergy.com-DEF{NJ}Commerce"
"123" "ES-NAT-DTOP-REFR-eEXST-OTHR-justenergy.com-024nsOH"
"124" "ES-NAT-DTOP-REFR-eEXST-OTHR-justenergy.com-059nsAB"
"125" "ES-NAT-DTOP-REFR-eGEOS-OTHR-saveonenergy.com-DEF{NJ}Commerce"
"126" "ES-NAT-DTOP-REFR-eGEOS-OTHR-saveonenergy.com-DEF{PA}Commerce"
"127" "ES-NAT-DTOP-REFR-eGEOS-OTHR-saveonenergy.com-DEF{PA}Just"
"128" "ES-NAT-DTOP-REFR-eHOME-ENRG-JustEnergy.com-011B"
"129" "ES-NAT-DTOP-REFR-eHOME-ENRG-justenergy.com-046nsAB"
"130" "ES-NAT-DTOP-REFR-eHOME-ENRG-justenergy.com-DEFnsCA"
"131" "ES-NAT-DTOP-REFR-eHOME-OTHR-EnergySavings.com-DEF"
"132" "ES-NAT-DTOP-REFR-eHOME-OTHR-JustEnergy.com-011B"
"133" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-021nsNY"
"134" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-024nsOH"
"135" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-025nsMA"
"136" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-037nsNJ"
"137" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-046nsAB"
"138" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-054nsON"
"139" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-059nsAB"
"140" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-074nsTX"
"141" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-086BnsTX"
"142" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-DEFnsIL"
"143" "ES-NAT-DTOP-REFR-eHOME-OTHR-justenergy.com-DEFnsPA"
"144" "ES-NAT-DTOP-REFR-eHOME-OTHR-newyork.justenergy.com-DEF"
"145" "ES-NAT-DTOP-REFR-eOTHR-ENRG-JustEnergy.com-011B"
"146" "ES-NAT-DTOP-REFR-eOTHR-OTHR-EnergySavings.com-DEF"
"147" "ES-NAT-DTOP-REFR-eOTHR-OTHR-JustEnergy.com-011B"
"148" "ES-NAT-DTOP-REFR-eOTHR-OTHR-justenergy.com-021nsNY"
"149" "ES-NAT-DTOP-REFR-eOTHR-OTHR-justenergy.com-025nsMA"
"150" "ES-NAT-DTOP-REFR-eOTHR-OTHR-justenergy.com-036nsAB"
"151" "ES-NAT-DTOP-REFR-eOTHR-OTHR-justenergy.com-046nsAB"
"152" "ES-NAT-DTOP-REFR-eOTHR-OTHR-justenergy.com-059nsAB"
"153" "ES-NAT-DTOP-REFR-eOTHR-OTHR-justenergy.com-DEFnsIN"
"154" "ES-NAT-DTOP-REFR-eOTHR-OTHR-justenergy.com-DEFnsPA"
"155" "ES-NAT-DTOP-REFR-eOTHR-OTHR-newyorkenergyrates.com-DEF"
"156" "ES-NAT-DTOP-REFR-eOTHR-OTHR-saveonenergy.com-DEF{IL}Just"
"157" "ES-NAT-DTOP-REFR-eOTHR-OTHR-saveonenergy.com-DEF{NJ}Commerce"
"158" "ES-NAT-DTOP-REFR-eOTHR-OTHR-saveonenergy.com-DEF{PA}Commerce"
"159" "ES-NAT-DTOP-REFR-eOTHR-OTHR-saveonenergy.com-DEF{TX}Just"
"160" "ES-NAT-DTOP-SENG-eEXST-BING-justenergy.com-054nsON"
"161" "ES-NAT-DTOP-SENG-eEXST-BING-justenergy.com-059nsAB"
"162" "ES-NAT-DTOP-SENG-eEXST-GOOG-justenergy.com-011B"
"163" "ES-NAT-DTOP-SENG-eEXST-GOOG-justenergy.com-021nsNY"
"164" "ES-NAT-DTOP-SENG-eEXST-GOOG-justenergy.com-059nsAB"
"165" "ES-NAT-DTOP-SENG-eEXST-GOOG-justenergy.com-DEFnse
So I'm wondering how to combine all of the levels that start with a certain prefix, say "British Columbia BNE" - ....
I've looked into combine.levels and varclus but I don't think these are the right commands...is there a way to combine by prefix?

I figured it out. Use idx = grep(prefix, data.frame) to get the indices and then
data.frame[idx] = blah

Related

anova() does not work properly with lme objects after updating - do I miss something?

I had code that worked fine until now. I want to test things with gls, lme and gamm (from packages nlme and mgcv), and I compared different models with anova(). However, I needed another package that did not work with my R version (which was almost one year old). Thus, I updated R (via the updater package) and RStudio.
The issue now is that anova() either gives no output at all after running, or prints only "Denom. DF: 91" and nothing else.
I have tried different things and searched a lot, but I found no current thread dealing with such a problem, and the help files just say that it should work the way I am using it. Thus, I suspect that I am missing something essential (probably even obvious), but I don't see it. I hope you can tell me what I am doing wrong.
Here is some data to play with (copied from txt-file):
"treat" "x" "time" "nest"
"1" "1" 49.37 1 "K1"
"2" "1" 48.68 1 "K2"
"3" "2" 44.7 1 "T7"
"4" "2" 49.3 1 "T8"
"5" "1" 48.78 1 "K3"
"6" "2" 42.37 1 "T10"
"7" "1" 39.26 1 "K4"
"8" "2" 46.36 1 "T11"
"9" "1" 40.36 1 "K5"
"10" "2" 47.14 1 "T9"
"11" "1" 48.81 1 "K6"
"12" "1" 40.4 1 "K10"
"13" "2" 53.42 1 "T4"
"14" "2" 46.85 1 "T5"
"15" "2" 44.58 1 "T2"
"16" "2" 47.51 1 "T6"
"17" "1" 51.7 1 "K8"
"18" "1" 48.16 1 "K7"
"19" "2" 48.86 1 "T3"
"20" "1" 44.6 1 "K11"
"21" "1" 49.71 1 "K9"
"22" "2" 44.54 1 "T1"
"23" "2" 41.55 2 "T3"
"24" "1" 32.55 2 "K3"
"25" "1" 42.15 2 "K1"
"26" "2" 51.06 2 "T1"
"27" "1" 38.43 2 "K11"
"28" "2" 39.91 2 "T11"
"29" "1" 36.73 2 "K7"
"30" "2" 50.19 2 "T4"
"31" "1" 42.26 2 "K8"
"32" "1" 43.02 2 "K6"
"33" "2" 37.6 2 "T10"
"34" "1" 33.42 2 "K4"
"35" "2" 39.64 2 "T5"
"36" "2" 43.56 2 "T2"
"37" "2" 35.31 2 "T7"
"38" "2" 37 2 "T8"
"39" "2" 40.87 2 "T6"
"40" "1" 35.29 2 "K9"
"41" "2" 41.83 2 "T9"
"42" "1" 37.88 2 "K10"
"43" "1" 36.5 2 "K5"
"44" "1" 34.21 3 "K4"
"45" "1" 38.04 3 "K6"
"46" "1" 35.14 3 "K3"
"47" "2" 38.18 3 "T10"
"48" "1" 40.26 3 "K11"
"49" "2" 37.09 3 "T3"
"50" "2" 43.1 3 "T11"
"51" "2" 34.26 3 "T7"
"52" "1" 36.58 3 "K9"
"53" "1" 35.81 3 "K2"
"54" "1" 39.83 3 "K10"
"55" "2" 37.65 3 "T6"
"56" "1" 39.8 3 "K7"
"57" "1" 36.41 3 "K8"
"58" "1" 35.22 3 "K5"
"59" "2" 39.68 3 "T8"
"60" "2" 41.12 3 "T1"
"61" "2" 36.93 3 "T9"
"62" "1" 35.66 3 "K1"
"63" "2" 36.91 3 "T4"
"64" "2" 38.84 3 "T5"
"65" "2" 34.31 3 "T2"
"66" "1" 32.71 4 "K9"
"67" "2" 37.84 4 "T11"
"68" "1" 28.01 4 "K10"
"69" "2" 39.69 5 "T11"
"70" "2" 35.08 4 "T10"
"71" "2" 34.43 4 "T9"
"72" "1" 32.12 4 "T8"
"73" "2" 30.41 4 "T7"
"74" "1" 31.81 4 "K7"
"75" "2" 36.41 4 "T6"
"76" "1" 29.17 5 "K6"
"77" "1" 28.59 4 "K6"
"78" "2" 33.99 4 "T5"
"79" "1" 30.41 4 "K5"
"80" "1" 29.8 4 "K4"
"81" "2" 34.72 4 "T4"
"82" "2" 34.38 4 "T3"
"83" "1" 28.12 4 "K3"
"84" "2" 34.62 4 "T2"
"85" "1" 31.88 4 "K2"
"86" "1" 29.35 4 "K1"
"87" "2" 37.95 4 "T1"
"88" "2" 40.85 5 "T4"
"89" "2" 35.07 5 "T5"
"90" "2" 36.15 5 "T8"
"91" "2" 36.48 5 "T10"
"92" "1" 33.73 4 "K8"
"93" "1" 28.17 5 "K9"
"94" "1" 32.81 5 "K10"
"95" "1" 32.17 4 "K11"
And this is basically one of the models I try to run:
test <- read.table(file="C:/Users/marvi_000/Desktop/testdata.txt")
str(test)
test$treat <- as.factor(test$treat)
test$nest <- as.factor(test$nest)
library(nlme)
m.test <- gls(x ~ treat * time,
correlation = corAR1(form =~ time | nest),
test, na.action = na.omit)
anova(m.test)
the output is:
Denom. DF: 91
When comparing models with anova(m1, m2) nothing happens at all.
The same is true when I run a gamm from package mgcv and using anova(m$lme) or anova(m1$lme, m2$lme).
I would appreciate any help or hint, pointing me towards the right direction. Thanks a lot!
EDIT:
After some discussion, I found out that it is a problem with the scripts. I'm using RStudio and RMarkdown. When I run the code (with Ctrl+Enter, line by line) within the markdown script, the anova(lmemodel) command does not work as it is supposed to. However, if I just copy this single command into a plain R script (still using the current environment), the command is executed properly and shows the desired output.
I have no clue what is happening there. If anybody has an idea where the problem is, or how to solve it, I would still be happy to hear it.

How to find unique couples of numbers in a vector in R?

Suppose we have C<-c(1,2,3,4,5)
I want to find all the unique couples of numbers that can be extracted from this vector, e.g. 12, 13, 23, etc. How can I do it?
One option could be:
na.omit(c(`diag<-`(sapply(x, paste0, x), NA)))
[1] "12" "13" "14" "15" "21" "23" "24" "25" "31" "32" "34" "35" "41" "42" "43" "45"
[17] "51" "52" "53" "54"
Using RcppAlgos package.
## Combinations
unlist(RcppAlgos::comboGeneral(x, 2, FUN=function(x) Reduce(paste0, x)))
# [1] "12" "13" "14" "15" "23" "24" "25" "34" "35" "45"
## Permutations
unlist(RcppAlgos::permuteGeneral(x, 2, FUN=function(x) Reduce(paste0, x)))
# [1] "12" "13" "14" "15" "21" "23" "24" "25" "31" "32" "34" "35" "41" "42" "43"
# [16] "45" "51" "52" "53" "54"

How to mutate columns but keep rownames in R pipe?

r$> iris %>% .[which(as.numeric(rownames(.))%%3!=0),] %>% rownames
[1] "1" "2" "4" "5" "7" "8" "10" "11" "13" "14" "16" "17" "19" "20" "22" "23" "25" "26" "28" "29" "31" "32" "34" "35" "37" "38" "40" "41" "43" "44" "46"
[32] "47" "49" "50" "52" "53" "55" "56" "58" "59" "61" "62" "64" "65" "67" "68" "70" "71" "73" "74" "76" "77" "79" "80" "82" "83" "85" "86" "88" "89" "91" "92"
[63] "94" "95" "97" "98" "100" "101" "103" "104" "106" "107" "109" "110" "112" "113" "115" "116" "118" "119" "121" "122" "124" "125" "127" "128" "130" "131" "133" "134" "136" "137" "139"
[94] "140" "142" "143" "145" "146" "148" "149"
r$> iris %>% .[which(as.numeric(rownames(.))%%3!=0),] %>% mutate(Sepal.Length=Sepal.Length+1) %>% rownames
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31"
[32] "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60" "61" "62"
[63] "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90" "91" "92" "93"
[94] "94" "95" "96" "97" "98" "99" "100"
I like mutate(), because it's easy to use in a pipe. As the example above shows, the rownames are changed after mutate.
I need to mutate columns but keep the rownames unchanged. How can I do that in an R pipe?
That is because mutate, or dplyr in general, renumbers the rownames starting from 1 after any operation; hence, it does not maintain the original rownames.
If you need them for further manipulation, store them as a column.
library(dplyr)
iris %>%
.[which(as.numeric(rownames(.))%%3!=0),] %>%
mutate(row = rownames(.),
Sepal.Length=Sepal.Length+1) %>%
pull(row)
# [1] "1" "2" "4" "5" "7" "8" "10" "11" "13" "14" "16" "17" "19" "20" "22" "23" "25" "26"
# [19] "28" "29" "31" "32" "34" "35" "37" "38" "40" "41" "43" "44" "46" "47" "49" "50" "52" "53"
# [37] "55" "56" "58" "59" "61" "62" "64" "65" "67" "68" "70" "71" "73" "74" "76" "77" "79" "80"
# [55] "82" "83" "85" "86" "88" "89" "91" "92" "94" "95" "97" "98" "100" "101" "103" "104" "106" "107"
# [73] "109" "110" "112" "113" "115" "116" "118" "119" "121" "122" "124" "125" "127" "128" "130" "131" "133" "134"
# [91] "136" "137" "139" "140" "142" "143" "145" "146" "148" "149"
iris %>% tibble::rownames_to_column(., 'rowname') %>% .[which(as.numeric(rownames(.))%%3!=0),] %>% mutate(Sepal.Length=Sepal.Length+1) %>% tibble::column_to_rownames(.,'rowname')
Having rownames goes against dplyr's dataframe structure principles. See Hadley's reasoning here.
Best workaround if you want to stick to dplyr is to first store the rownames as a column using rownames_to_column, do your manipulation, then reinstate the rownames using column_to_rownames (both functions are from the tibble package).
Using your example:
library(dplyr)
library(tibble)
iris %>%
.[which(as.numeric(rownames(.))%%3!=0),] %>%
# store rownames in a column
rownames_to_column(var = "rowid") %>%
# dplyr manipulation
mutate(Sepal.Length=Sepal.Length+1) %>%
# reinstate rownames
column_to_rownames(var = "rowid") %>%
# check rownames
rownames()
# [1] "1" "2" "4" "5" "7" "8" "10" "11" "13" "14" "16" "17" "19" "20" "22" "23" "25" "26" "28" "29" "31" "32" "34" "35" "37" "38"
# [27] "40" "41" "43" "44" "46" "47" "49" "50" "52" "53" "55" "56" "58" "59" "61" "62" "64" "65" "67" "68" "70" "71" "73" "74" "76" "77"
# [53] "79" "80" "82" "83" "85" "86" "88" "89" "91" "92" "94" "95" "97" "98" "100" "101" "103" "104" "106" "107" "109" "110" "112" "113" "115" "116"
# [79] "118" "119" "121" "122" "124" "125" "127" "128" "130" "131" "133" "134" "136" "137" "139" "140" "142" "143" "145" "146" "148" "149"

When I try to process a DocumentTermMatrix-type dataset using the R function apply, I get an error

Below is my DTM type data set:
View(sms_dtm_freq_train)
sms_dtm_freq_train[["dimnames"]]
$Docs
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
[15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
[29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
[43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
[57] "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70"
[71] "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
[85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96" "97" "98"
[99] "99" "100" "101" "102" "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"
..........
[ reached getOption("max.print") -- omitted 4057 entries ]
$Terms
[1] "â£wk" "…" "–" "abiola" "abl" "abt"
[7] "accept" "access" "account" "across" "activ" "actual"
[13] "add" "address" "admir" "adult" "advanc" "aft"
[19] "afternoon" "aftr" "age" "ago" "ahead" "aight"
[25] "aint" "air" "aiyah" "alex" "almost" "alon"
[31] "alreadi" "alright" "alrit" "also" "alway" "amp"
[37] "angri" "announc" "anoth" "answer" "anybodi" "anymor"
[43] "anyon" "anyth" "anytim" "anyway" "apart" "app"
[49] "appli" "appoint" "appreci" "april" "ard" "area"
[55] "argument" "arm" "around" "arrang" "arrest" "arriv"
[61] "asap" "ask" "askd" "asleep" "ass" "attempt"
[67] "auction" "avail" "ave" "avoid" "await" "award"
[73] "away" "awesom" "babe" "babi" "back" "bad"
[79] "bag" "bak" "balanc" "bank" "bare" "bath"
[85] "batteri" "bcoz" "bcum" "bday" "beauti" "becom"
[91] "bed" "bedroom" "begin" "believ" "belli" "best"
[97] "better" "bid" "big" "bill" "bird" "birthday"
..............
[ reached getOption("max.print") -- omitted 1057 entries ]
When i run:
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
I got the error messages below:
Error in apply(sms_dtm_freq_train, MARGIN = 2, convert_counts) :
dim(X) must have a positive length
I typed the same code as in the textbook "Machine Learning with R", but I got these errors.
I am very confused.
Can anyone help me solve this problem?
Thanks!

Split dataset to training, cross-validation and test dataset in R. ifelse returns unexpected result

I want to write a function that will split a dataframe into train, cross-validation and test sets.
My code is the following, exemplified by a small dataset:
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
train <- 0.7
test <- 0.6
# Function_split_test_train_regression <- function(data, train, test, seed){
set.seed(seed)
setDT(data)
data[, index := row.names(data)]
train_index <- sample(data$index, train * nrow(data))
test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index), test * length(setdiff(data$index, train_index))))
# etc
#}
At this point I make some checks and I get a result that surprises me:
> test == 1
[1] FALSE
> sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index)))
[1] "225" "186" "41" "381" "356" "178" "147" "158" "21" "259" "207" "159" "250" "167" "128" "218" "271" "197" "376" "19" "77"
[22] "205" "46" "3" "212" "238" "61" "11" "68" "130" "200" "274" "127" "305" "201" "32" "48" "184" "290" "349" "155" "370"
[43] "366" "333" "243" "161" "108" "65" "125" "306" "357" "189" "337" "118" "364" "6" "149" "87" "252" "194" "362" "383" "93"
[64] "38" "18" "322" "220" "307" "60" "353"
> test_index <- ifelse(test == 1, setdiff(data$index, train_index),
sample(setdiff(data$index, train_index),
test * length(setdiff(data$index, train_index))))
> test_index
[1] "219"
Why does ifelse return 219 rather than the value of the second argument (since the condition test == 1 evaluates to FALSE)?
Your advice will be appreciated.
================================================================================
EDIT
Following the suggestion made in the comments I changed the code replacing the name test with the name test_fraction but the problem remained. The new code:
library(ISLR)
library(data.table)
data <- Auto
seed <- 12
train_fraction <- 0.7
test_fraction <- 0.6
# Function_split_test_crossval_train_regr <- function(data, train, test, seed){
set.seed(seed)
setDT(data)
data[, index := row.names(data)]
train_index <- sample(data$index, train_fraction * nrow(data))
test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
test_fraction * length(setdiff(data$index, train_index))))
#}
The results:
> train_index
[1] "119" "118" "143" "344" "293" "341" "305" "95" "82" "58" "226" "35" "363" "111" "84" "137" "24" "151" "381" "110" "93"
[22] "198" "133" "6" "112" "228" "62" "36" "165" "353" "271" "385" "322" "291" "316" "268" "333" "37" "377" "176" "343" "281"
[43] "245" "75" "238" "183" "215" "68" "274" "64" "224" "391" "26" "83" "66" "308" "1" "372" "161" "170" "300" "52" "30"
[64] "15" "57" "148" "312" "311" "194" "367" "27" "342" "260" "181" "163" "171" "193" "210" "327" "248" "172" "263" "47" "351"
[85] "166" "292" "278" "61" "116" "204" "309" "200" "96" "330" "383" "346" "249" "368" "41" "38" "235" "4" "77" "273" "191"
[106] "212" "99" "31" "286" "79" "184" "284" "267" "374" "355" "358" "124" "114" "335" "70" "203" "379" "14" "287" "67" "34"
[127] "340" "127" "91" "222" "240" "387" "357" "242" "310" "347" "142" "103" "105" "117" "189" "361" "177" "126" "392" "5" "317"
[148] "174" "352" "87" "234" "147" "202" "261" "277" "214" "290" "339" "109" "43" "120" "169" "318" "56" "94" "115" "314" "320"
[169] "276" "237" "296" "307" "23" "186" "360" "146" "313" "152" "206" "328" "60" "195" "69" "107" "97" "92" "325" "20" "362"
[190] "157" "101" "10" "192" "134" "251" "259" "2" "29" "265" "331" "144" "63" "384" "81" "338" "364" "213" "380" "150" "48"
[211] "54" "354" "187" "283" "356" "389" "72" "32" "121" "376" "33" "359" "349" "239" "241" "232" "196" "74" "156" "201" "390"
[232] "326" "285" "51" "131" "304" "85" "45" "336" "280" "178" "128" "98" "275" "246" "65" "39" "188" "55" "90" "197" "9"
[253] "173" "40" "295" "149" "230" "140" "135" "236" "21" "369" "301" "220" "122" "253" "208" "388" "159" "282" "88" "158" "167"
[274] "257"
> sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index)))
[1] "337" "378" "164" "225" "16" "44" "221" "179" "25" "28" "324" "175" "139" "154" "17" "252" "211" "155" "233" "162" "130"
[22] "216" "255" "190" "365" "373" "73" "207" "42" "3" "348" "227" "49" "12" "53" "315" "199" "256" "129" "375" "205" "18"
[43] "289" "168" "264" "160" "145" "382" "136" "302" "185" "323" "100" "270" "113" "294" "247" "345" "209" "104" "321" "7" "138"
[64] "78" "386" "366" "298" "231" "86" "19"
> test_fraction == 1
[1] FALSE
> test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index),
+ test_fraction * length(setdiff(data$index, train_index))))
> test_index
[1] "28"
I have no idea why this is happening; I hope someone comes up with an explanation.
But I found a solution to your problem. You need to do the assignment to the object inside the ifelse() call:
ifelse(
test_fraction == 1,
test_index <- setdiff(data$index, train_index),
test_index <- sample(setdiff(data$index, train_index),test_fraction * length(setdiff(data$index, train_index)))
)
I don't know whether this is bad practice or not, but it works. It can also be used when there are multiple conditions, as in my answer here.

Resources