error Predictor.new() function package IML in R - r

I am attempting to use package 'iml' in R to create plots of SHAP values from a GBM model created in H2O.
When I try to create the R6 Predictor object using the Predictor.new() function I get an error that states Error : all(feature.class %in% names(feature.types)) is not TRUE.
From this I am guessing that there is something about one of the feature classes that is incorrect, but this is just an educated guess based upon what the error message is literally saying.
Here is a sample of anonymized data (I can't share the real data because it is confidential):
structure(list(dlr_id_cur = c(1, 2), date_eff = structure(c(16014,
15416), class = "Date"), new_vec_ind = structure(c(1L, 1L), .Label = c("NNA",
"UNA"), class = "factor"), cntrct_term = c(9587879614862828,
19), amt_financed = c(9455359, 65561175), reg_payment = c(885288,
389371), acct_stat_cd = structure(c(3L, 3L), .Label = c("11",
"22", "33"), class = "factor"), base_rental = c(1, 626266), down_pymt = c(2,
6654661), car_count = c(5, 1), dur_lease = c(3974, 6466), returned = structure(1:2, .Label = c("00",
"11"), class = "factor"), state = structure(c(10L, 1L), .Label = c("ANA",
"BNA", "CNA", "DNA", "FNA", "GNA", "HNA", "INA", "KNA", "LNA",
"MNA", "NNA", "ONA", "PNA", "QNA", "RNA", "SNA", "TNA", "UNA",
"VNA", "WNA"), class = "factor"), zip = c(34633, 45222), zip_two_digits = structure(c(71L,
36L), .Label = c("00", "01", "02", "03", "04", "05", "06", "07",
"08", "09", "110", "111", "112", "113", "114", "115", "116",
"117", "118", "119", "220", "221", "222", "223", "224", "225",
"226", "227", "228", "229", "330", "331", "332", "333", "334",
"335", "336", "337", "338", "339", "440", "441", "442", "443",
"444", "445", "446", "447", "448", "449", "550", "551", "552",
"553", "554", "555", "556", "557", "558", "559", "660", "661",
"662", "663", "664", "665", "666", "667", "668", "669", "770",
"771", "772", "773", "774", "775", "776", "777", "778", "779",
"880", "881", "882", "883", "884", "885", "886", "887", "888",
"889", "990", "991", "992", "993", "994", "995", "996", "997",
"998", "999", "ANA", "BNA", "CNA", "ENA", "GNA", "HNA", "JNA",
"KNA", "LNA", "MNA", "NNA", "PNA", "RNA", "SNA", "TNA", "VNA"
), class = "factor")
, mod_year_date = c(8156, 6278), vehic_mod_fam_code = structure(c(2L,
2L), .Label = c("BNA", "CNA", "ENA", "MNA", "SNA", "TNA", "VNA",
"XNA"), class = "factor"), mod_class_code = structure(c(4L, 2L
), .Label = c("BNA", "CNA", "ENA", "GNA", "MNA", "RNA", "SNA"
), class = "factor"), count_dl_DL_CDE_CSPS_A_NP = c(945, 337),
DL_CDE_CSPS_A_NP_avg_dl = c(3355188283749626, 8835582388327814
), count_sv_DL_CDE_CSPS_A_NP = c(6532, 8475), DL_CDE_CSPS_A_NP_avg_sv = c(4471193398278526,
6934672627789796), count_dl_NUM_CSPS_INIT_SCR = c(774, 773
), NUM_CSPS_INIT_SCR_avg_dl = c(9468453388562312, 5847816458727333
), count_sv_NUM_CSPS_INIT_SCR = c(2467, 3882), NUM_CSPS_INIT_SCR_avg_sv = c(5857936629789154,
8963457353776469), count_FFV = c(8563, 2566), average_FFV = c(25697792913881564,
13693335921646120), csps_NUM_SV = c(8, 6), avg_SV_rating = c(9817541424596360,
6218928542331853), csps_FFV_ratio = c(23125612473476952,
2), avg_DL_rating = c(2182256921592387, 7668957586431513),
has_DL_rating = c(1, 8), has_bad_DL_rating = c(2, 4), serv_has_MNT = c(7,
3), serv_has_SCP = c(5, 4), serv_has_ELW = c(9, 4), serv_has_LCP = c(7,
1), ro_count = c(6, 1), ro_tot_cust_pay = c(2, 188759), ro_tot_pay = c(3,
764372), date_eff_weekday = structure(c(4L, 3L), .Label = c("FNA",
"MNA", "SNA", "TNA", "WNA"), class = "factor"), date_eff_month_int = c(83,
7), date_eff_day = c(2, 24)), .Names = c("dlr_id_cur", "date_eff",
"new_vec_ind", "cntrct_term", "amt_financed", "reg_payment",
"acct_stat_cd", "base_rental", "down_pymt", "car_count", "dur_lease",
"returned", "state", "zip", "zip_two_digits", "mod_year_date",
"vehic_mod_fam_code", "mod_class_code", "count_dl_DL_CDE_CSPS_A_NP",
"DL_CDE_CSPS_A_NP_avg_dl", "count_sv_DL_CDE_CSPS_A_NP", "DL_CDE_CSPS_A_NP_avg_sv",
"count_dl_NUM_CSPS_INIT_SCR", "NUM_CSPS_INIT_SCR_avg_dl", "count_sv_NUM_CSPS_INIT_SCR",
"NUM_CSPS_INIT_SCR_avg_sv", "count_FFV", "average_FFV", "csps_NUM_SV",
"avg_SV_rating", "csps_FFV_ratio", "avg_DL_rating", "has_DL_rating",
"has_bad_DL_rating", "serv_has_MNT", "serv_has_SCP", "serv_has_ELW",
"serv_has_LCP", "ro_count", "ro_tot_cust_pay", "ro_tot_pay",
"date_eff_weekday", "date_eff_month_int", "date_eff_day"), row.names = 1:2, class = "data.frame")
# 1. create a data frame with just the features
features_iml <- as.data.frame(df_testR) %>% dplyr::select(-returned)
# 2. Create a vector with the actual responses
response_iml <- as.numeric(as.vector(df_testR$returned))
# 3. Create custom predict function that returns the predicted values as a
# vector (probability of customer churn in my example)
pred <- function(model, newdata) {
results <- as.data.frame(h2o.predict(model, as.h2o(newdata)))
return(results[[3L]])
}
# 4. example of prediction output
pred(GBM5, features_iml) %>% head()
# 5. create Predictor object
predictor = Predictor$new(model = GBM5, data = features_iml, y =
response_iml, predict.fun = pred, class = "classification")
Error : all(feature.class %in% names(feature.types)) is not TRUE
Here are also so basic descriptions of the dataset and model object I'm
using in the code above:
class(GBM5)
[1] "H2OBinomialModel"
attr(,"package")
[1] "h2o"
class(df_testR)
[1] "tbl_df" "tbl" "data.frame"
dim(df_testR)
[1] 47006 44
If there is anything else I can provide or if I have been unclear please let me know.

In the iml package there are specific feature classes that are acceptable, namely numeric, integer, character, factor and ordered. If you have any Date objects, or any other data type than the 5 listed here than the Predictor object can not be created.

Related

Remove single characters without changing the numbers from an r dataframe

My dataframe has many arrows, ">" and "<"s in it alongside some of the element values. I want to remove these characters but keep the numbers. I only know how to replace the entire element with NA with the following code.
df <- apply(df, 1:2, gsub, pattern = "<|>", replacement = "")
Will someone please help me edit this so that it keeps the element numbers too, instead of throwing the entire thing out?
Dataframe:
structure(list(`Analyte Sample` = c(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14), A = c("4190", "6665", "7435", "2052",
"783", "322", "199", "90", "46", "17", "8", "3", "3", "<1↓"
), B = c("11569", "6677", "3852", "983.88", "589", "359", "203",
"68", "33", "12", "6", "<2↓", "4", "<1↓"), C = c("20453",
"7699", "2499", "707.98", "412", "328", "156", "88", "39", "27",
"17", "<1↓", "<3↓", "<1↓"), D = c("7893", ">20000↑",
"1623", "685.64", "321", "644", "112", "65", "35", "29", "9",
"5", "<3↓", "<1↓"), E = c("320", "15444", "2049", "1065",
"389", "365", "145", "77", "38", "16", "9", "6", "<2↓", "<2↓"
), F = c("7438", ">21999↑", "3472", "1057", "563", "401", "167",
"89", "46", "19", "6", "<1↓", "<1↓", "<1↓"), G = c(7345,
9001, 2473, 1138, 516, 403, 134, 81, 37, 17, 8, 6, 4, 3), H = c("9004",
"3998", "2299", "964.88", "499", "341", "112", "88", "39", "32",
"<29↓", "<30↓", "<31↓", "<29↓"), I = c("8434", "8700",
"2217", "1263", "567", "352", "153", "80", "43", "18", "9", "2",
"3", "<1↓"), J = c("7734", "6733", "2092", "1115", "637", "332",
"155", "82", "37", "17", "10", "4", "1", "<1↓"), K = c(">3718↑",
">3000↑", "2118", "862.13", "426", "355", "143", "78", "44",
"22", "11", "<4↓", "<4↓", "<3↓"), L = c(6345, 7688, 2311,
1195, 647, 366, 177, 83, 41, 20, 8, 6, 3, 2), M = c("4222", ">25587↑",
"1846", "814.61", "422", "314", "154", "86", "41", "27", "21",
"<2↓", "<2↓", "<3↓"), N = c("6773", "8934", "2381", "1221",
"677", "356", "146", "89", "40", "17", "10", "5", "2", "<2↓"
), O = c(">2200↑", ">2133↑", ">2000↑", "564.5", "226",
"476", "111", "60", "32", "36", "18", "<10↓", "<1↓", "<2↓"
)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L), spec = structure(list(cols = list(`Analyte Sample` = structure(list(), class = c("collector_double",
"collector")), A = structure(list(), class = c("collector_character",
"collector")), B = structure(list(), class = c("collector_character",
"collector")), C = structure(list(), class = c("collector_character",
"collector")), D = structure(list(), class = c("collector_character",
"collector")), E = structure(list(), class = c("collector_character",
"collector")), F = structure(list(), class = c("collector_character",
"collector")), G = structure(list(), class = c("collector_double",
"collector")), H = structure(list(), class = c("collector_character",
"collector")), I = structure(list(), class = c("collector_character",
"collector")), J = structure(list(), class = c("collector_character",
"collector")), K = structure(list(), class = c("collector_character",
"collector")), L = structure(list(), class = c("collector_double",
"collector")), M = structure(list(), class = c("collector_character",
"collector")), N = structure(list(), class = c("collector_character",
"collector")), O = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
You can use lapply() which returns a list and assign it back to df[]. [] is to keep the original attributes, i.e. a class of data.frame. You will see that df becomes what you want.
df[] <- lapply(df, gsub, pattern = "<|>", replacement = "")
I think in your case the best would be to use a regular expression. Using tidyverse:
df %>% mutate_at(vars(A:O), ~ as.numeric(gsub("[^0-9]*([0-9]*).*", "\\1", .)))
If you specifically want only to change values which start with a < or >, you do the following:
df %>% mutate_at(vars(A:O), ~ as.numeric(gsub("[<>]*([0-9]*).*", "\\1", .)))
Of course, you can also use apply... but mind the way apply changes the data frame into a matrix before applying the function (the columns which are numbers will have spaces prefixed, so we need to include space in the pattern):
apply(df, 2, function(x) gsub("[ <>]*([0-9]*).*", "\\1", x))
Explanation:
The pattern [0-9]* matches a digit any number of times. The pattern [^0-9] matches anything but a digit any number of times.
You can try one of these options:
#Code 1
df <- apply(df, 1:2, function(x) gsub(pattern = "<|>", replacement = "",x))
#Code 2
df <- sapply(df,function(x) gsub(pattern = "<|>", replacement = "",x))
Just be careful that the output can be a matrix, so you will have to transform again to dataframe using as.data.frame().

UPDATED: quantstrat error when applying a strategy: Error in chart_Series

EDIT: The first part of this question seems to have been solved, I have another problem however.
After loading in the data, I am trying to plot chart.Posn but now I am running into this error:
> chart.Posn(portfolio.st, 'GOOG')
Error in chart_Series(Prices, name = Symbol, TA = TA, ...) :
'x' must be a time-series object
Which I do not know if it is a side-affect to me using tidyquant originally and not obtaining symbols from quantmod
Code (I provide new data just below)
GOOG <- xts(GOOG, order.by = GOOG$date)
GOOG$date <- NULL
GOOG$predictions <- as.numeric(as.character(GOOG$predictions))
currency('USD')
Sys.setenv(TZ="UTC")
stock("GOOG", currency="USD", multiplier=1)
symbols <- c("GOOG")
initDate="2007-01-01"
strategy.st <- portfolio.st <- account.st <- "SVMstrat"
rm.strat(strategy.st)
initPortf(portfolio.st, symbols=symbols, initDate=initDate, currency='USD')
initAcct(account.st, portfolios=portfolio.st, initDate=initDate, currency='USD')
initOrders(portfolio.st, initDate=initDate)
strategy(strategy.st, store=TRUE)
GOOG$predictions <- as.numeric(as.character(GOOG$predictions))
mean(GOOG$direction == GOOG$predictions)
add.signal(strategy.st,
name = sigThreshold,
arguments = list(column = "predictions",
threshold = 0.5,
relationship = "gte",
cross = TRUE),
label = "longEntry")
add.signal(strategy.st,
name = sigThreshold,
arguments = list(column = "predictions",
threshold = 0.5,
relationship = "lte",
cross = TRUE),
label = "longExit")
applySignals(strategy = strategy.st, mktdata=GOOG)
add.rule(strategy.st,
name="ruleSignal",
arguments=list(sigcol="longEntry",
sigval=TRUE,
ordertype="market",
orderside="long",
replace=FALSE,
prefer="Open",
orderqty = 1),
type="enter", path.dep=TRUE)
add.rule(strategy.st,
name="ruleSignal",
arguments=list(sigcol="longExit",
sigval=TRUE,
orderqty="all",
ordertype="market", orderside="long",
replace=FALSE, prefer="Open"),
type="exit", path.dep=TRUE)
applyStrategy(strategy=strategy.st,portfolios=portfolio.st)
updatePortf(portfolio.st)
dateRange <- time(getPortfolio(portfolio.st)$summary)[-1]
updateAcct(portfolio.st,dateRange)
updateEndEq(account.st)
chart.Posn(portfolio.st, 'GOOG')
DATA:
GOOG <- structure(list(date = structure(c(17375, 17378, 17379, 17380,
17381, 17382, 17385, 17386, 17387, 17388, 17389, 17392, 17393,
17394, 17395, 17396, 17399, 17400, 17401, 17402, 17403, 17406,
17407, 17408, 17409, 17410, 17414, 17415, 17416, 17417, 17420,
17421, 17422, 17423, 17424, 17427, 17428, 17429, 17430, 17431,
17434, 17435, 17436, 17437, 17438, 17441, 17442, 17443, 17444,
17445, 17448, 17449, 17450, 17451, 17452, 17455, 17456, 17457,
17458, 17459, 17462, 17463, 17464, 17465, 17466, 17469, 17470,
17471, 17472, 17473, 17476, 17477, 17478, 17479, 17480, 17483,
17484, 17485, 17486, 17487, 17490, 17491, 17492, 17494, 17497,
17498, 17499, 17500, 17501, 17504, 17505, 17506, 17507, 17508,
17511, 17512, 17513, 17514, 17515, 17518), class = "Date"), open = c(929.400024,
941.890015, 932.380005, 928.609985, 930.340027, 926.75, 929.059998,
927.090027, 920.609985, 917.549988, 907.969971, 922.530029, 924.22998,
925.289978, 925.780029, 910.309998, 910, 912.719971, 921.929993,
928.659973, 923.48999, 916, 905.099976, 920.049988, 931.76001,
941.130005, 933.080017, 930.150024, 931.72998, 936.48999, 934.25,
932.590027, 930.659973, 931.25, 924.659973, 920.01001, 917.419983,
922.97998, 933, 927.75, 925.450012, 923.719971, 927.73999, 941.359985,
952, 959.97998, 954, 957, 955.48999, 966.700012, 980, 980, 973.719971,
987.450012, 992, 992.099976, 990.289978, 991.77002, 986, 989.440002,
989.52002, 970, 968.369995, 980, 1009.190002, 1014, 1015.219971,
1017.210022, 1021.76001, 1022.109985, 1028.98999, 1027.27002,
1030.52002, 1033.98999, 1026.459961, 1023.419983, 1022.590027,
1019.210022, 1022.52002, 1034.01001, 1020.26001, 1023.309998,
1035, 1035.869995, 1040, 1055.089966, 1042.680054, 1022.369995,
1015.799988, 1012.659973, 995.940002, 1001.5, 1020.429993, 1037.48999,
1035.5, 1039.630005, 1046.119995, 1045, 1054.609985, 1066.079956
), high = c(943.830017, 943.590027, 937.447021, 932.599976, 932.23999,
930.307007, 931.700012, 935.814026, 925.97998, 919.26001, 917.780029,
924.66803, 926.549988, 932.700012, 926.859985, 915.275024, 913,
925.859985, 929.929993, 930.840027, 925.554993, 919.244995, 923.330017,
930.81897, 941.97998, 942.47998, 937, 930.914978, 936.409973,
936.98999, 938.380005, 933.47998, 937.25, 932.77002, 926.48999,
922.080017, 922.419983, 933.880005, 936.530029, 934.72998, 926.400024,
930.820007, 949.900024, 950.690002, 959.786011, 962.539978, 958,
960.390015, 970.909973, 979.460022, 985.424988, 981.570007, 990.710022,
994.119995, 997.210022, 993.906982, 996.440002, 996.719971, 988.880005,
991, 989.52002, 972.22998, 976.090027, 987.599976, 1048.390015,
1024.969971, 1024, 1029.670044, 1028.089966, 1032.650024, 1034.869995,
1033.969971, 1043.521973, 1033.98999, 1030.76001, 1031.579956,
1026.810059, 1024.089966, 1035.920044, 1034.420044, 1022.609985,
1035.109985, 1039.706055, 1043.177979, 1055.459961, 1062.375,
1044.079956, 1028.48999, 1022.48999, 1016.099976, 1020.609985,
1024.969971, 1034.23999, 1042.050049, 1043.800049, 1050.310059,
1046.665039, 1058.5, 1067.619995, 1078.48999), low = c(927.5,
926.039978, 929.26001, 916.679993, 922.23999, 923.030029, 926.5,
925.609985, 917.25, 906.130005, 905.580017, 918.190002, 919.820007,
923.445007, 910.97998, 907.153992, 903.400024, 911.474976, 919.359985,
915.5, 915.5, 911.869995, 905, 919.650024, 931.76001, 935.150024,
921.960022, 919.27002, 923.619995, 924.880005, 926.919983, 923.861023,
929.859985, 924, 916.359985, 910.599976, 912.549988, 922, 923.830017,
926.47998, 909.700012, 921.140015, 927.73999, 940.549988, 951.51001,
947.840027, 949.140015, 950.690002, 955.179993, 963.359985, 976.109985,
966.080017, 972.25, 985, 989, 984, 988.590027, 986.974976, 978.390015,
984.580017, 966.119995, 961, 960.52002, 972.200012, 1008.200012,
1007.5, 1010.419983, 1016.950012, 1013.01001, 1020.309998, 1025,
1025.130005, 1028.449951, 1019.666016, 1025.280029, 1022.570007,
1014.150024, 1015.419983, 1022.52002, 1017.75, 1017.5, 1022.655029,
1031.430054, 1035, 1038.439941, 1040, 1015.650024, 1015, 1002.02002,
995.570007, 988.280029, 1001.140015, 1018.070984, 1032.521973,
1032.050049, 1033.689941, 1038.380005, 1043.109985, 1049.5, 1062
), close = c(941.530029, 930.5, 930.830017, 930.390015, 923.650024,
927.960022, 929.359985, 926.789978, 922.900024, 907.23999, 914.390015,
922.669983, 922.219971, 926.960022, 910.97998, 910.669983, 906.659973,
924.690002, 927, 921.280029, 915.890015, 913.809998, 921.289978,
929.570007, 939.330017, 937.340027, 928.450012, 927.809998, 935.950012,
926.5, 929.080017, 932.070007, 935.090027, 925.109985, 920.289978,
915, 921.809998, 931.580017, 932.450012, 928.530029, 920.969971,
924.859985, 944.48999, 949.5, 959.109985, 953.27002, 957.789978,
951.679993, 969.960022, 978.890015, 977, 972.599976, 989.25,
987.830017, 989.679993, 992, 992.179993, 992.809998, 984.450012,
988.200012, 968.450012, 970.539978, 973.330017, 972.559998, 1019.27002,
1017.109985, 1016.640015, 1025.5, 1025.579956, 1032.47998, 1025.900024,
1033.329956, 1039.849976, 1031.26001, 1028.069946, 1025.75, 1026,
1020.909973, 1032.5, 1019.090027, 1018.380005, 1034.48999, 1035.959961,
1040.609985, 1054.209961, 1047.410034, 1021.659973, 1021.409973,
1010.169983, 998.679993, 1005.150024, 1018.380005, 1030.930054,
1037.050049, 1041.099976, 1040.47998, 1040.609985, 1049.150024,
1064.189941, 1077.140015), volume = c(1846400, 1970100, 1277700,
1824400, 1202500, 1082300, 1032200, 1061600, 1192100, 1824000,
1206800, 1064500, 883400, 1006700, 1277200, 1342700, 943400,
1166700, 1090200, 1270300, 1053400, 1086500, 1185600, 1301200,
1582600, 947400, 1326400, 1527700, 1212700, 1011500, 1267000,
1134400, 1102600, 1397600, 2505400, 1306900, 936700, 1669800,
1290600, 1052700, 1856800, 1666900, 2239400, 1020300, 1581000,
1283400, 888300, 952400, 1213800, 1173900, 891400, 968400, 1693300,
1262400, 1169800, 910500, 1290200, 1057600, 1313600, 1183200,
1478400, 1212200, 1211300, 2042100, 5167700, 2085100, 1330700,
1373400, 1049000, 1076400, 1125200, 1112300, 1088700, 1245200,
720000, 885800, 959200, 854000, 1129700, 1397100, 953500, 1097000,
746300, 537000, 1307900, 1424400, 2459400, 1724000, 1909600,
1906400, 2067300, 1272000, 1458200, 1290800, 1192800, 1279500,
1282700, 1558700, 3275900, 1554600), adjusted = c(941.530029,
930.5, 930.830017, 930.390015, 923.650024, 927.960022, 929.359985,
926.789978, 922.900024, 907.23999, 914.390015, 922.669983, 922.219971,
926.960022, 910.97998, 910.669983, 906.659973, 924.690002, 927,
921.280029, 915.890015, 913.809998, 921.289978, 929.570007, 939.330017,
937.340027, 928.450012, 927.809998, 935.950012, 926.5, 929.080017,
932.070007, 935.090027, 925.109985, 920.289978, 915, 921.809998,
931.580017, 932.450012, 928.530029, 920.969971, 924.859985, 944.48999,
949.5, 959.109985, 953.27002, 957.789978, 951.679993, 969.960022,
978.890015, 977, 972.599976, 989.25, 987.830017, 989.679993,
992, 992.179993, 992.809998, 984.450012, 988.200012, 968.450012,
970.539978, 973.330017, 972.559998, 1019.27002, 1017.109985,
1016.640015, 1025.5, 1025.579956, 1032.47998, 1025.900024, 1033.329956,
1039.849976, 1031.26001, 1028.069946, 1025.75, 1026, 1020.909973,
1032.5, 1019.090027, 1018.380005, 1034.48999, 1035.959961, 1040.609985,
1054.209961, 1047.410034, 1021.659973, 1021.409973, 1010.169983,
998.679993, 1005.150024, 1018.380005, 1030.930054, 1037.050049,
1041.099976, 1040.47998, 1040.609985, 1049.150024, 1064.189941,
1077.140015), direction = c(1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1), daily.returns = c(0.00796497316633915, -0.0117150050027772,
0.000354666308436391, -0.000472698550717299, -0.00724426411648449,
0.00466626740433007, 0.00150864581103693, -0.00276535146926948,
-0.00419723356136681, -0.0169682886474819, 0.00788107345224054,
0.0090551819947422, -0.000487728015749256, 0.00513982688410031,
-0.0172391922205249, -0.000340289585727183, -0.00440336244178141,
0.0198862082113775, 0.00249813234165353, -0.00617041100323623,
-0.00585057076060858, -0.00227103360221692, 0.00818548715419065,
0.00898742979704914, 0.0104994889319832, -0.00211852060935469,
-0.00948430104756415, -0.000689335981181594, 0.00877336309971533,
-0.010096705891169, 0.00278469185105235, 0.00321822657391202,
0.00324012142577179, -0.0106728140733341, -0.00521019887165097,
-0.00574816430305625, 0.00744262076502733, 0.0105987340354277,
0.000933891865565917, -0.00420396047997473, -0.00814196392564925,
0.004223822841668, 0.0212248397793964, 0.00530446066453272, 0.0101211005792523,
-0.00608894192671761, 0.00474152958256258, -0.00637925342751922,
0.0192081678026828, 0.00920655779357471, -0.00193077360177174,
-0.0045036069600819, 0.0171190874057765, -0.00143541369724542,
0.00187276754923715, 0.0023441991516544, 0.000181444556451638,
0.000634970473547991, -0.00842052962484363, 0.00380923353577045,
-0.0199858325846691, 0.00215805253147128, 0.00287472856682269,
-0.000791118106450051, 0.0480279078885169, -0.00211919801192617,
-0.000462064090345216, 0.00871496780500025, 7.7967820575342e-05,
0.00672792400010591, -0.00637296231157924, 0.00724235483593283,
0.00630971739679231, -0.00826077434077854, -0.0030933653676728,
-0.00225660326812049, 0.000243724104313836, -0.0049610399610136,
0.0113526435303026, -0.0129878673123487, -0.00069672156648426,
0.0158192275191027, 0.00142096203366848, 0.00448861362895858,
0.0130692345797547, -0.00645025872602245, -0.0245845086108847,
-0.000244699808749416, -0.0110043863846236, -0.0113743134258227,
0.00647858277461255, 0.0131621953779111, 0.0123235422321553,
0.00593638237264926, 0.00390523775000573, -0.000595520136675076,
0.000124947142183363, 0.0082067624980553, 0.01433533494348, 0.0121689498284778
), rsi = c(55.4941894407854, 50.1632187721403, 44.2057435629332,
42.3277780878597, 34.916071046742, 38.3146934968324, 31.6795222837921,
26.8964608058531, 26.5762418841725, 19.4454931404427, 19.2426086807444,
33.6749929164219, 34.664251589656, 45.2102647300278, 31.5897223352515,
36.2768164306463, 34.0840071047493, 46.9525192134939, 51.8801058990984,
46.3093986201844, 42.8722859382365, 43.0957642799655, 49.175098084696,
62.3766664464771, 63.434587161957, 58.4768395755699, 53.2799994541443,
50.4676873087737, 65.036737044046, 58.5873924866939, 62.3539876687387,
54.8745033541646, 55.2938209354096, 52.3738367508463, 52.7465383120286,
50.7142001647387, 50.3146292014814, 51.194583971026, 45.4279672713402,
44.2918322359654, 45.068547928843, 48.135031282721, 54.7140455891114,
63.3503388014154, 66.1156673174906, 61.0393644854218, 61.638611076124,
64.1858085150276, 73.1864682789322, 78.8442513456682, 76.0748344982738,
70.4160617369582, 74.4321939715647, 76.0682095017227, 81.8013716675612,
81.5329893185474, 77.4049144059356, 76.2072058576538, 65.5689633500972,
72.0267383603007, 55.6390349896244, 60.4198811003328, 52.2463634887349,
45.2655097258959, 68.9263022050927, 70.3371957548591, 64.6863349631492,
68.7059200902, 68.1459609632161, 69.5555458937198, 65.3412385589247,
67.360752439441, 74.1163158004527, 67.9866489434534, 78.9024435671165,
76.7050678391648, 76.1182375524639, 72.9909746088876, 59.4459498647349,
51.2180399554398, 51.0672179692865, 55.0636478144878, 55.756441335519,
54.6240547411303, 64.9109643846478, 57.4655825824172, 41.988898103196,
45.3179785535323, 42.0964437376148, 38.9428915240427, 41.8953683621125,
49.0751009808747, 49.4300638103628, 56.8843996184545, 58.4915483608461,
52.531910623186, 51.9880401490032, 53.5336152483392, 54.0807930604593,
61.574386151279), momentum = c(-2.29998799999998, -16.659973,
-25.159973, -23.0299680000001, -41.75, -42.929993, -38.790039,
-46.130005, -57.4400029999999, -43.460022, -33.409973, -11.420044,
-19.310058, -3.53997800000002, -19.850037, -19.7200319999999,
-16.990051, -3.27001999999993, -2.35998500000005, -5.50994900000001,
-7.01000900000008, 6.57000799999992, 6.89996300000007, 6.90002400000003,
17.110046, 10.380005, 17.4700320000001, 17.1400149999999, 29.290039,
1.80999799999995, 2.080017, 10.789978, 19.200012, 11.2999870000001,
-1, -14.570007, -17.520019, -5.76000999999997, 4, 0.720031000000063,
-14.980041, -1.64001499999995, 15.409973, 17.429993, 24.0199580000001,
28.160035, 37.5, 36.679993, 48.150024, 47.309998, 44.549988,
44.069947, 68.280029, 62.9700319999999, 45.1900029999999, 42.5,
33.0700079999999, 39.5399779999999, 26.660034, 36.520019, -1.51000999999997,
-8.35003699999993, -3.669983, -0.0399780000000192, 30.02002,
29.2799680000001, 26.960022, 33.5, 33.3999630000001, 39.6699820000001,
41.450012, 45.129944, 71.399964, 60.7200319999999, 54.7399290000001,
53.190002, 6.72997999999995, 3.79998799999998, 15.8599850000001,
-6.40997300000004, -7.19995100000006, 2.01000999999997, 10.059937,
7.28002900000001, 14.3599850000001, 16.150024, -6.40997300000004,
-4.34002699999996, -15.830017, -22.2299800000001, -27.349976,
-0.710021999999981, 12.5500489999999, 2.56005899999991, 5.14001499999995,
-0.130004999999983, -13.599976, 1.73999000000003, 42.5299680000001,
55.7300419999999), ROC = c(-0.032795598957029, -0.0521774017299466,
-0.0211218884757489, -0.0185396276689724, -0.011239584246205,
-0.0145175903203612, -0.00122591501722535, -0.00434970056376649,
-0.00808295692710193, -0.0179262252987202, -0.0147314594246657,
-0.00722453871018125, -0.00494320358659461, 0.00438952614578625,
0.00411390847797843, -0.00407661902179068, -0.0175041344808058,
0.00267477300869512, 4.31271435541447e-05, 0.011243117733982,
0.00571571283393091, 0.00785518232019733, -0.0036837104211509,
0.00276855559648226, 0.0194028343644312, 0.0231498189132653,
0.0158938717770809, 0.00705213034020158, 0.00683994790619824,
-0.0137528275407712, -0.00885123791251541, 0.00389138470011119,
0.00781584192612872, -0.0116494359217736, -0.0067252318670965,
-0.0152708022295656, -0.0110688002733577, -0.00376072262811622,
0.00790291050001635, 0.00891390782234858, 0.00650336567082288,
0.00330323208950656, 0.0137630010983862, 0.0181199866055621,
0.0324030338008408, 0.0344707694066191, 0.0349861658038035, 0.00758374779102411,
0.02131932596276, 0.0204135365091132, 0.0245884516898194, 0.0153443490021248,
0.0387182437892122, 0.018255779160036, 0.0109623594909625, 0.0152364552420901,
0.019931662088692, 0.00359222408181115, -0.00342751359774773,
-0.00149653292664187, -0.0240262396056901, -0.0220519404222213,
-0.0198161049507126, -0.0121513535862787, 0.0309568648450718,
0.0490236690313068, 0.046405770875606, 0.0522123781158035, 0.0530817741025684,
0.0128769514166818, 0.00860504177727073, 0.0162834675770531,
0.0138961503712416, 0.00552310197477013, -0.00428045026022694,
-0.000146247165464075, -0.00711880768197481, -0.0183820889249988,
0.00120168056673897, -0.00877310685546551, -0.00721091801104201,
0.00824079523600485, 0.0146341354305113, 0.00782401940069555,
0.0338815355466462, 0.0281073484004102, -0.0124798129988672,
-0.0141444961739703, -0.0296884485614344, -0.0541125132421794,
-0.0411836747473417, -0.0032155946565986, 0.00927736062930951,
0.0262615746498476, 0.0415987024226974, 0.0345453179346213, 0.0215939309246291,
0.0175189755318357, 0.0258336996226562, 0.0340315709208765),
predictions = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("0", "1"), class = "factor")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100"), class = "data.frame")
OLD question (I leave here for completness)
OLD Question (with data)
I have been playing around with the quantstrat package and I am running into an error.
The error comes about when I apply the line applySignals(strategy = strategy.st, mktdata=GOOG). I get this error:
Error in `colnames<-`(`*tmp*`, value = label) :
attempt to set 'colnames' on an object with less than two dimensions
I use traceback() and get this report:
5: stop("attempt to set 'colnames' on an object with less than two dimensions")
4: `colnames<-`(`*tmp*`, value = label)
3: (function (label, data = mktdata, column, threshold = 0, relationship = c("gt",
"lt", "eq", "gte", "lte"), cross = FALSE)
{
relationship = relationship[1]
ret_sig = NULL
colNum <- match.names(column, colnames(data))
switch(relationship, `>` = , gt = {
ret_sig = data[, colNum] > threshold
}, `<` = , lt = {
ret_sig = data[, colNum] < threshold
}, eq = {
ret_sig = data[, colNum] == threshold
}, gte = , gteq = , ge = {
ret_sig = data[, colNum] >= threshold
}, lte = , lteq = , le = {
ret_sig = data[, colNum] <= threshold
})
if (isTRUE(cross))
ret_sig <- diff(ret_sig) == 1
if (!missing(label))
colnames(ret_sig) <- label
return(ret_sig)
})(label = "longEntry", data = mktdata, column = "predictions",
threshold = 0.5, relationship = "gte", cross = TRUE)
2: do.call(sigFun, .formals)
1: applySignals(strategy = strategy.st, mktdata = GOOG)
From my research and looking at a solution here, its a problem with my signals.
The link to the solution suggests that head(mktdata) - which is created after you run the applySignals() line. A different strategy (I can add code if needed) creates the mktdata with column names as follows:
> colnames(mktdata)
[1] "SPY.Open" "SPY.High" "SPY.Low"
[4] "SPY.Close" "SPY.Volume" "SPY.Adjusted"
[7] "nonDerivedIndicator" "longEntry" "longExit"
However I imported the data using tidyquant and the column names are;
> colnames(GOOG)
[1] "open" "high" "low" "close"
[5] "volume" "adjusted" "daily.returns" "predictions"
Is this the issue I face? simply a column name issue? that is, quantmod imports the data as SPY.Open and quantstrat was built with this in mind and expects this format and not the tidy format? or am I facing a different error?
The quantstrat code:
currency('USD')
Sys.setenv(TZ="UTC")
stock("GOOG", currency="USD", multiplier=1)
symbols <- c("GOOG")
strategy.st <- portfolio.st <- account.st <- "SVMstrat"
rm.strat(strategy.st)
initPortf(portfolio.st, symbols=symbols, initDate=initDate, currency='USD')
initAcct(account.st, portfolios=portfolio.st, initDate=initDate, currency='USD')
initOrders(portfolio.st, initDate=initDate)
strategy(strategy.st, store=TRUE)
GOOG$predictions <- as.numeric(as.character(GOOG$predictions))
mean(GOOG$direction == GOOG$predictions)
add.signal(strategy.st,
name = sigThreshold,
arguments = list(column = "predictions",
threshold = 0.5,
relationship = "gte",
cross = TRUE),
label = "longEntry")
add.signal(strategy.st,
name = sigThreshold,
arguments = list(column = "predictions",
threshold = 0.5,
relationship = "lte",
cross = TRUE),
label = "longExit")
applySignals(strategy = strategy.st, mktdata=GOOG)
DATA:
(Had to remove old data for new question)
EDIT:
I have re-ran the model with the following and still run into the same error:
colnames(GOOG) <- c("GOOG.Open", "GOOG.High", "GOOG.Low",
"GOOG.Close", "GOOG.Volume", "GOOG.Adjusted",
"daily.returns", "predictions")
To answer my own question.
It looks like quantstrat takes xts objects and quantmod produces tibbles.
All I had to do was to convert my data to an xts object before running the strategy.
GOOG <- xts(GOOG, order.by = GOOG$date)
GOOG$date <- NULL
EDIT:
For a working strategy (SPY) I get the following xts structure output.
An ‘xts’ object on 2003-01-02/2012-12-28 containing:
Data: num [1:2516, 1:7] 72.9 74.6 74.8 76.2 75.6 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:7] "SPY.Open" "SPY.High" "SPY.Low" "SPY.Close" ...
Indexed by objects of class: [Date] TZ: UTC
xts Attributes:
List of 2
$ src : chr "yahoo"
$ updated: POSIXct[1:1], format: "2018-12-09 12:43:49"
For my code - import using tidyquant, add some indicators, run an SVM model and then convert it back into xts I have the following structure output.
An ‘xts’ object on 2017-07-28/2017-12-18 containing:
Data: chr [1:100, 1:11] " 929.40" " 941.89" " 932.38" " 928.61" ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:11] "open" "high" "low" "close" ...
Indexed by objects of class: [Date] TZ: UTC
xts Attributes:
NULL
I am missing the
List of 2
$ src : chr "yahoo"
$ updated: POSIXct[1:1], format: "2018-12-09 12:43:49"
part.

Replace value with NULL in column [duplicate]

This question already has an answer here:
Set NA and "" Cells in R Dataframe to NULL
(1 answer)
Closed 4 years ago.
I have a dataframe where I want to replace all values in a column that contain the value '2018' with NULL.
I have a dataset where every value in a column is a list. There are NULLs included as well. One of the values is not a list and I want to replace it with a NULL. If I replace it with NA then the datatypes in that column are mixed.
If I have a column like below, how do I replace the value containing 2018 with NULL instead of NA?
spend actions
176.2 2018-02-24
166.66 list(action_type = c("landing_page_view", "link_click", "offsit...
153.89 list(action_type = c("landing_page_view", "like", "link_click",...
156.54 list(action_type = c("landing_page_view", "like", "link_click",...
254.95 list(action_type = c("landing_page_view", "like", "link_click",...
374 list(action_type = c("landing_page_view", "like", "link_click",...
353.29 list(action_type = c("landing_page_view", "like", "link_click",...
0.41 NULL
Reproducible Example:
structure(list(spend = c("176.2", "166.66", "153.89", "156.54",
"254.95", "374", "353.29", "0.41"), actions = list("2018-02-24",
structure(list(action_type = c("landing_page_view", "link_click",
"offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content",
"post", "post_reaction", "page_engagement", "post_engagement",
"offsite_conversion"), value = c("179", "275", "212", "18",
"269", "1434", "1", "17", "293", "293", "1933")), .Names = c("action_type",
"value"), class = "data.frame", row.names = c(NA, 11L)),
structure(list(action_type = c("landing_page_view", "like",
"link_click", "offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content", "post_reaction",
"page_engagement", "post_engagement", "offsite_conversion"
), value = c("136", "3", "248", "101", "6", "237", "730",
"11", "262", "259", "1074")), .Names = c("action_type", "value"
), class = "data.frame", row.names = c(NA, 11L)), structure(list(
action_type = c("landing_page_view", "like", "link_click",
"offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content",
"post", "post_reaction", "page_engagement", "post_engagement",
"offsite_conversion"), value = c("95", "1", "156", "91",
"5", "83", "532", "1", "13", "171", "170", "711")), .Names =
c("action_type",
"value"), class = "data.frame", row.names = c(NA, 12L)),
structure(list(action_type = c("landing_page_view", "like",
"link_click", "offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content", "post_reaction",
"page_engagement", "post_engagement", "offsite_conversion"
), value = c("178", "4", "243", "56", "4", "138", "437",
"19", "266", "262", "635")), .Names = c("action_type", "value"
), class = "data.frame", row.names = c(NA, 11L)), structure(list(
action_type = c("landing_page_view", "like", "link_click",
"offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content",
"post_reaction", "page_engagement", "post_engagement",
"offsite_conversion"), value = c("203", "2", "306", "105",
"7", "186", "954", "23", "331", "329", "1252")), .Names =
c("action_type",
"value"), class = "data.frame", row.names = c(NA, 11L)),
structure(list(action_type = c("landing_page_view", "like",
"link_click", "offsite_conversion.fb_pixel_add_to_cart",
"offsite_conversion.fb_pixel_purchase",
"offsite_conversion.fb_pixel_search",
"offsite_conversion.fb_pixel_view_content", "post", "post_reaction",
"page_engagement", "post_engagement", "offsite_conversion"
), value = c("241", "4", "320", "106", "3", "240", "789",
"1", "17", "342", "338", "1138")), .Names = c("action_type",
"value"), class = "data.frame", row.names = c(NA, 12L)),
NULL)), .Names = c("spend", "actions"), row.names = c(NA,
-8L), class = "data.frame")
My ultimate goal is to use this function with this dataset to make the action_types their own column. This function works when either a list or NULL is in the actions column:
fb_insights_all<-df %>%
as.tibble() %>%
filter(!map_lgl(actions, is.null)) %>%
unnest() %>%
right_join(select(df, -actions)) %>%
spread(action_type, value)
Error: Each column must either be a list of vectors or a list of data frames [actions]
Without data to test this on, I'd try:
df$COL1<-ifelse(grepl("2018", df$COL1),"NULL",df$COL1)
As stated here NA functions more like what you seem to be trying to do, while NULL serves a different function. If you just want the value to just say "NULL" rather than function like NULL, treat it like a character value.

Convert column types to their read_csv() column type in R

One of my favorite things about library(readr) and the read_csv() function in R is that it almost always sets the column types of my data to the correct class. However, I am currently working with an API in R that returns data to me as a dataframe of all character classes, even if the data is clearly numbers. Take this dataframe for example, which has some sports data:
dput(mydf)
structure(list(isUnplayed = c("false", "false", "false"), isInProgress =
c("false", "false", "false"), isCompleted = c("true", "true", "true"), awayScore = c("106",
"95", "95"), homeScore = c("94", "97", "111"), game.ID = c("31176",
"31177", "31178"), game.date = c("2015-10-27", "2015-10-27",
"2015-10-27"), game.time = c("8:00PM", "8:00PM", "10:30PM"),
game.location = c("Philips Arena", "United Center", "Oracle Arena"
), game.awayTeam.ID = c("88", "86", "110"), game.awayTeam.City = c("Detroit",
"Cleveland", "New Orleans"), game.awayTeam.Name = c("Pistons",
"Cavaliers", "Pelicans"), game.awayTeam.Abbreviation = c("DET",
"CLE", "NOP"), game.homeTeam.ID = c("91", "89", "101"), game.homeTeam.City = c("Atlanta",
"Chicago", "Golden State"), game.homeTeam.Name = c("Hawks",
"Bulls", "Warriors"), game.homeTeam.Abbreviation = c("ATL",
"CHI", "GSW"), quarterSummary.quarter = list(structure(list(
`#number` = c("1", "2", "3", "4"), awayScore = c("25",
"23", "34", "24"), homeScore = c("25", "18", "23", "28"
)), .Names = c("#number", "awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)), structure(list(`#number` = c("1", "2", "3", "4"), awayScore = c("17",
"23", "28", "27"), homeScore = c("26", "20", "25", "26")), .Names = c("#number",
"awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)), structure(list(`#number` = c("1", "2", "3", "4"), awayScore = c("35",
"14", "26", "20"), homeScore = c("39", "20", "35", "17")), .Names = c("#number",
"awayScore", "homeScore"), class = "data.frame", row.names = c(NA,
4L)))), .Names = c("isUnplayed", "isInProgress", "isCompleted",
"awayScore", "homeScore", "game.ID", "game.date", "game.time",
"game.location", "game.awayTeam.ID", "game.awayTeam.City", "game.awayTeam.Name",
"game.awayTeam.Abbreviation", "game.homeTeam.ID", "game.homeTeam.City",
"game.homeTeam.Name", "game.homeTeam.Abbreviation", "quarterSummary.quarter"
), class = "data.frame", row.names = c(NA, 3L))
It is quite a hassle to deal with this dataframe once it is returned by the API, given the class types. I've come up with a sort of a hack to update the column classes, which is as follows:
write_csv(mydf, 'mydf.csv')
mydf <- read_csv('mydf.csv')
By writing to CSV and then re-reading the CSV using read_csv(), the dataframe columns update. Unfortunately I am left with a CSV file in my directory that I don't want. Is there a way to update the columns of an R dataframe to their 'read_csv()' column classes, without actually having to write the CSV?
Any help is appreciated!
You don't need to write and read the data if you just want readr to guess you column type. You could use readr::type_convert for that:
iris %>%
dplyr::mutate(Sepal.Width = as.character(Sepal.Width)) %>%
readr::type_convert() %>%
str()
For comparison:
iris %>%
dplyr::mutate(Sepal.Width = as.character(Sepal.Width)) %>%
str()
try this code, type.convert convert a character vector to logical, integer, numeric, complex or factor as appropriate.
indx <- which(sapply(df, is.character))
df[, indx] <- lapply(df[, indx], type.convert)
indx <- which(sapply(df, is.factor))
df[, indx] <- lapply(df[, indx], as.character)

Speed up optimization of lm() over dataframe

I'm working with spectral data and I'm trying to find which combination of wavelengths best predicts a particular concentration level.
I've got the following data:
spectrum <- data.frame(structure(list(reference = c("00383_130927131406", "00383_130927131636"
), concentration = c(785, 39), `200` = c(0.1818293, 0.1818728
), `202` = c(0.2090413, 0.2052553), `204` = c(0.2317517, 0.2450542
), `206` = c(0.2427286, 0.2486499), `208` = c(0.3005925, 0.2602714
), `210` = c(0.3774263, 0.3418267), `212` = c(0.4162179, 0.3934315
), `214` = c(0.483283, 0.4533348), `216` = c(0.5029044, 0.5114449
), `218` = c(0.4974553, 0.5390385), `220` = c(0.4940954, 0.5900267
), `222` = c(0.4953246, 0.6695098), `224` = c(0.481304, 0.7726094
), `226` = c(0.4513558, 0.8644904), `228` = c(0.4198686, 0.8791566
), `230` = c(0.3907493, 0.7864748), `232` = c(0.3591166, 0.6480582
), `234` = c(0.3277905, 0.49029), `236` = c(0.3033395, 0.3715453
), `238` = c(0.2875585, 0.2875996), `240` = c(0.274983, 0.2247074
), `242` = c(0.2685759, 0.1875459), `244` = c(0.2638485, 0.1637982
), `246` = c(0.2596794, 0.1469171), `248` = c(0.2566508, 0.1360679
), `250` = c(0.2534968, 0.1289802), `252` = c(0.2487593, 0.1223562
), `254` = c(0.2440191, 0.1170995), `256` = c(0.2390878, 0.1120935
), `258` = c(0.2350084, 0.107386), `260` = c(0.2326146, 0.105811
), `262` = c(0.231193, 0.1047314), `264` = c(0.2268821, 0.1026403
), `266` = c(0.2228926, 0.1005902), `268` = c(0.2188771, 0.0981951
), `270` = c(0.2141229, 0.0956424), `272` = c(0.2097481, 0.0937292
), `274` = c(0.2053046, 0.0918917), `276` = c(0.1986362, 0.0895234
), `278` = c(0.1925066, 0.0873742), `280` = c(0.1857309, 0.0845037
), `282` = c(0.1798247, 0.0821695), `284` = c(0.173684, 0.0794527
), `286` = c(0.1681736, 0.0774221), `288` = c(0.1636854, 0.075914
), `290` = c(0.1600333, 0.0746672), `292` = c(0.1567454, 0.0735234
), `294` = c(0.1537335, 0.0723853), `296` = c(0.1514025, 0.0713109
), `298` = c(0.1496398, 0.0703921), `300` = c(0.1482034, 0.0702467
), `302` = c(0.1455175, 0.0688483), `304` = c(0.1422886, 0.067099
), `306` = c(0.1396007, 0.065432), `308` = c(0.1368657, 0.0632867
), `310` = c(0.1347514, 0.0616299), `312` = c(0.1332553, 0.0602099
), `314` = c(0.1318352, 0.0587635), `316` = c(0.1304504, 0.0576302
), `318` = c(0.1291949, 0.056583), `320` = c(0.1272951, 0.0549143
), `322` = c(0.1265529, 0.0536381), `324` = c(0.1259775, 0.0525203
), `326` = c(0.1251506, 0.0515252), `328` = c(0.1242249, 0.0509617
), `330` = c(0.1237719, 0.0502567), `332` = c(0.1234315, 0.0497051
), `334` = c(0.1228537, 0.0491512), `336` = c(0.1219852, 0.0484357
), `338` = c(0.1209263, 0.0478882), `340` = c(0.1198741, 0.0474155
), `342` = c(0.1191702, 0.0461311), `344` = c(0.1190057, 0.046185
), `346` = c(0.1191455, 0.0463802), `348` = c(0.1181966, 0.0457693
), `350` = c(0.1172899, 0.045634), `352` = c(0.116555, 0.0453343
), `354` = c(0.1159115, 0.0445265), `356` = c(0.1154331, 0.0440425
), `358` = c(0.1148589, 0.043449), `360` = c(0.1143816, 0.0427716
), `362` = c(0.114209, 0.04271), `364` = c(0.1133982, 0.0423733
), `366` = c(0.1120933, 0.0421726), `368` = c(0.1119338, 0.042341
), `370` = c(0.1102654, 0.0409827), `372` = c(0.1094487, 0.0409078
), `374` = c(0.1090142, 0.0407193), `376` = c(0.1085882, 0.0399118
), `378` = c(0.1085156, 0.0396651), `380` = c(0.1082574, 0.0396597
), `382` = c(0.1076243, 0.0393841), `384` = c(0.1070805, 0.0390632
), `386` = c(0.1067173, 0.0387599), `388` = c(0.1062678, 0.0382344
), `390` = c(0.1057367, 0.037716), `392` = c(0.1056847, 0.0376194
), `394` = c(0.1055271, 0.0375567), `396` = c(0.1053451, 0.0375475
), `398` = c(0.1051099, 0.0374534), `400` = c(0.1054062, 0.037516
), `402` = c(0.1052253, 0.0373257), `404` = c(0.1043745, 0.0368734
), `406` = c(0.1034489, 0.0366216), `408` = c(0.1026329, 0.0364288
), `410` = c(0.1018779, 0.035892), `412` = c(0.1016637, 0.0360423
), `414` = c(0.1015407, 0.0360411), `416` = c(0.1007153, 0.0356497
), `418` = c(0.1007498, 0.0355663), `420` = c(0.1004693, 0.0345159
), `422` = c(0.0998567, 0.0338999), `424` = c(0.0998918, 0.0338306
), `426` = c(0.0999174, 0.0338034), `428` = c(0.0998015, 0.0338135
), `430` = c(0.0995906, 0.0338477), `432` = c(0.0989122, 0.0335684
), `434` = c(0.097852, 0.0329977), `436` = c(0.09709, 0.0327005
), `438` = c(0.0962624, 0.0324584), `440` = c(0.0957877, 0.0320588
), `442` = c(0.0952125, 0.0318741), `444` = c(0.0950441, 0.0320549
), `446` = c(0.0946589, 0.0318478), `448` = c(0.0944174, 0.0317015
), `450` = c(0.0946495, 0.0321019), `452` = c(0.0945732, 0.0321195
), `454` = c(0.0939588, 0.0314291), `456` = c(0.0932608, 0.0308688
), `458` = c(0.0926701, 0.0306815), `460` = c(0.0923376, 0.0303084
), `462` = c(0.0926626, 0.0307978), `464` = c(0.0939299, 0.0317353
), `466` = c(0.0930716, 0.0309835), `468` = c(0.0929553, 0.0311153
), `470` = c(0.0930083, 0.0313657), `472` = c(0.0926628, 0.0310591
), `474` = c(0.0922276, 0.0306481), `476` = c(0.092029, 0.0305241
), `478` = c(0.0915025, 0.0304446), `480` = c(0.0904626, 0.0299507
), `482` = c(0.0893053, 0.0291639), `484` = c(0.0885098, 0.0286272
), `486` = c(0.0888613, 0.0284567), `488` = c(0.0903306, 0.0288371
), `490` = c(0.0918492, 0.0302382), `492` = c(0.092112, 0.0302303
), `494` = c(0.0921006, 0.0303867), `496` = c(0.0918876, 0.0306633
), `498` = c(0.0918417, 0.0303878), `500` = c(0.0918264, 0.030263
), `502` = c(0.0915124, 0.0302606), `504` = c(0.0907789, 0.0301055
), `506` = c(0.0902592, 0.0299799), `508` = c(0.0902585, 0.0297897
), `510` = c(0.0903141, 0.0295952), `512` = c(0.0903744, 0.0300794
), `514` = c(0.0906949, 0.0303025), `516` = c(0.0907089, 0.0298949
), `518` = c(0.0905071, 0.0300078), `520` = c(0.0898525, 0.0290348
), `522` = c(0.0893119, 0.0290643), `524` = c(0.088986, 0.0289325
), `526` = c(0.0895112, 0.0285626), `528` = c(0.0897726, 0.0285274
), `530` = c(0.0895805, 0.0287086), `532` = c(0.0891773, 0.0287757
), `534` = c(0.0889579, 0.0287846), `536` = c(0.0884922, 0.028611
), `538` = c(0.0877636, 0.028343), `540` = c(0.0880152, 0.0284555
), `542` = c(0.0882578, 0.0282067), `544` = c(0.0883219, 0.028523
), `546` = c(0.0882104, 0.0291039), `548` = c(0.0882543, 0.0290097
), `550` = c(0.0880007, 0.0289382), `552` = c(0.0878873, 0.0289249
), `554` = c(0.0874991, 0.0282089), `556` = c(0.0876973, 0.0285702
), `558` = c(0.0875319, 0.0283379), `560` = c(0.087135, 0.0280649
), `562` = c(0.0875509, 0.0285492), `564` = c(0.0876262, 0.0285577
), `566` = c(0.0869524, 0.0280579), `568` = c(0.0869787, 0.0282936
), `570` = c(0.0870611, 0.0281805), `572` = c(0.0868479, 0.0274971
), `574` = c(0.0867729, 0.0280225), `576` = c(0.0872913, 0.0284508
), `578` = c(0.0871356, 0.0287336), `580` = c(0.0864677, 0.0284608
), `582` = c(0.0869201, 0.0286578), `584` = c(0.0867237, 0.0283562
), `586` = c(0.0866258, 0.0280203), `588` = c(0.0866252, 0.0279319
), `590` = c(0.0863437, 0.0276132), `592` = c(0.0861822, 0.0272436
), `594` = c(0.0870913, 0.0279816), `596` = c(0.0876164, 0.0284226
), `598` = c(0.0869303, 0.0277859), `600` = c(0.0864942, 0.027882
), `602` = c(0.0862525, 0.0279344), `604` = c(0.0862981, 0.0269194
), `606` = c(0.0865318, 0.0274518), `608` = c(0.086784, 0.0281402
), `610` = c(0.0868649, 0.0275235), `612` = c(0.0870241, 0.0285917
), `614` = c(0.0868249, 0.028492), `616` = c(0.0868225, 0.0277449
), `618` = c(0.086214, 0.02755), `620` = c(0.0864288, 0.0277508
), `622` = c(0.0863217, 0.0272391), `624` = c(0.0866155, 0.0272815
), `626` = c(0.0872267, 0.0278741), `628` = c(0.0873636, 0.0279329
), `630` = c(0.0868396, 0.0276163), `632` = c(0.0864614, 0.0279268
), `634` = c(0.0862915, 0.0279218), `636` = c(0.0857102, 0.0269628
), `638` = c(0.0858776, 0.0274795), `640` = c(0.0871622, 0.028363
), `642` = c(0.0873245, 0.0278735), `644` = c(0.0873235, 0.0288401
), `646` = c(0.0880486, 0.02906), `648` = c(0.0875043, 0.0273798
), `650` = c(0.0869163, 0.026752), `652` = c(0.0871902, 0.0270702
), `654` = c(0.0866511, 0.0266837), `656` = c(0.0868393, 0.0269093
), `658` = c(0.0883237, 0.0275446), `660` = c(0.0880274, 0.0282655
), `662` = c(0.0881567, 0.0283123), `664` = c(0.0878181, 0.0282416
), `666` = c(0.0870453, 0.0282417), `668` = c(0.0869403, 0.0280012
), `670` = c(0.0875342, 0.0284779), `672` = c(0.0881081, 0.0288431
), `674` = c(0.0883047, 0.0280668), `676` = c(0.089067, 0.0283966
), `678` = c(0.0899504, 0.0291833), `680` = c(0.0882103, 0.0272003
), `682` = c(0.0871596, 0.0260054), `684` = c(0.0879612, 0.0267891
), `686` = c(0.0873405, 0.0269836), `688` = c(0.0882181, 0.0278995
), `690` = c(0.0884919, 0.028267), `692` = c(0.0887171, 0.0296971
), `694` = c(0.0877552, 0.0283004), `696` = c(0.0879554, 0.0275502
), `698` = c(0.0890788, 0.0286819), `700` = c(0.0886739, 0.0274891
), `702` = c(0.0881579, 0.0268598), `704` = c(0.088684, 0.028465
), `706` = c(0.0889838, 0.0276326), `708` = c(0.0897651, 0.0278372
)), .Names = c("reference", "concentration", "200", "202", "204",
"206", "208", "210", "212", "214", "216", "218", "220", "222",
"224", "226", "228", "230", "232", "234", "236", "238", "240",
"242", "244", "246", "248", "250", "252", "254", "256", "258",
"260", "262", "264", "266", "268", "270", "272", "274", "276",
"278", "280", "282", "284", "286", "288", "290", "292", "294",
"296", "298", "300", "302", "304", "306", "308", "310", "312",
"314", "316", "318", "320", "322", "324", "326", "328", "330",
"332", "334", "336", "338", "340", "342", "344", "346", "348",
"350", "352", "354", "356", "358", "360", "362", "364", "366",
"368", "370", "372", "374", "376", "378", "380", "382", "384",
"386", "388", "390", "392", "394", "396", "398", "400", "402",
"404", "406", "408", "410", "412", "414", "416", "418", "420",
"422", "424", "426", "428", "430", "432", "434", "436", "438",
"440", "442", "444", "446", "448", "450", "452", "454", "456",
"458", "460", "462", "464", "466", "468", "470", "472", "474",
"476", "478", "480", "482", "484", "486", "488", "490", "492",
"494", "496", "498", "500", "502", "504", "506", "508", "510",
"512", "514", "516", "518", "520", "522", "524", "526", "528",
"530", "532", "534", "536", "538", "540", "542", "544", "546",
"548", "550", "552", "554", "556", "558", "560", "562", "564",
"566", "568", "570", "572", "574", "576", "578", "580", "582",
"584", "586", "588", "590", "592", "594", "596", "598", "600",
"602", "604", "606", "608", "610", "612", "614", "616", "618",
"620", "622", "624", "626", "628", "630", "632", "634", "636",
"638", "640", "642", "644", "646", "648", "650", "652", "654",
"656", "658", "660", "662", "664", "666", "668", "670", "672",
"674", "676", "678", "680", "682", "684", "686", "688", "690",
"692", "694", "696", "698", "700", "702", "704", "706", "708"
)))
column "concentration" are the concentrations and columns 200:708 are the corresponding wavelengths with absorption. Normally my sample dataframe is longer (n>50).
What i'm trying to do is find out if it is possible with spectral analysis (UV-vis) to monitor concentrations of Phosphate and Nitrogen in wastewater. Therefore I need to find a correlation between measured concentrations and observed wavelengths. I use only wavelengths 200 -708nm, so it is not comparable with mass spectrometry.
I've got the following working code:
grid2 <- expand.grid(w1=seq(200,708, 2), w2=seq(200,708, 2))
colnames_spec <- colnames(spectrum)
wave2 <- lapply(1:nrow(grid2), function(j){
w1 <- grid2$w1[j]
w2 <- grid2$w2[j]
cond2 <- colnames_spec==paste0("X",w2)
cond1 <- colnames_spec==paste0("X",w1)
fit <- lm(spectrum$concentration~spectrum[,cond1]+spectrum[,cond2])
assign(paste("r",j,sep=""), value=c(w1,w2,summary(fit)$r.squared))
}
)
tot_rsq_2 <- do.call(rbind, wave2)
tot_rsq_2 <- as.data.frame(tot_rsq_2)
colnames(tot_rsq_2) <- c("wavelength.1", "wavelength.2", "r_squared")
max_rsq_2 <- head(tot_rsq_2[with(tot_rsq_2, order(r_squared, decreasing=T)), ],n=5)
print(max_rsq_2)
The script takes about three minutes for me to run, but is there any way to speed up the code?
Furthermore, I would like to extent the linear model to search for the optimum with three wavelengths. Then grid2 would have 255*255*255 rows, which would take ages to run.
Any help is much appreciated.
Since the order of predictors in regression doesn't matter for R², you can use combn instead of expand.grid, which means less regressions have to be calculated. Furthermore, you should use faster functions for regression.
spec <- as.matrix(spectrum[,-1])
library(RcppEigen)
fun <- function(ind) {
fit <- fastLm(spec[,1]~spec[,ind[1]+1]+spec[,ind[2]+1])
1-sum(fit$residuals^2)/sum((fit$fitted.values-mean.default(fit$fitted.values))^2)
}
res <- combn(ncol(spec)-1, 2, fun)
res <- cbind.data.frame(t(matrix(names(spectrum)[combn(ncol(spec)-1, 2)+2],2)),
res)
head(res)
# 1 2 res
# 1 X200 X202 1
# 2 X200 X204 1
# 3 X200 X206 1
# 4 X200 X208 1
# 5 X200 X210 1
# 6 X200 X212 1

Resources