I'm trying to analyze anomalies in a time series using the anomalize package. Data is logged every second. When I try to call the time_decompose function, I get an error as per the following reprex.
library(tidyverse)
library(lubridate)
library(tibbletime)
library(anomalize)
structure(list(t = c(1001, 1002, 1003, 1004, 1005, 1006, 1007,
1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018,
1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029,
1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040,
1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051,
1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062,
1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073,
1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084,
1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095,
1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117,
1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128,
1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139,
1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150,
1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161,
1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172,
1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183,
1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194,
1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205,
1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216,
1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227,
1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238,
1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249,
1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260,
1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271,
1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282,
1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293,
1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304,
1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315,
1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326,
1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337,
1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346, 1347, 1348,
1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359,
1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370,
1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381,
1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392,
1393, 1394, 1395, 1396, 1397, 1398, 1399), value = c(118.62,
121.57, 121.08, 118.5, 118.09, 115.28, 115.8, 111.83, 116.73,
120.34, 120.11, 124.51, 125.28, 127.2, 128.45, 124.24, 122.7,
121, 116.26, 114.12, 111.08, 109.83, 107.71, 109.31, 108.4, 106.59,
103.34, 104.02, 106.15, 105.9, 105.96, 104.79, 104.04, 103.45,
102.07, 99.71, 97.9, 99.12, 100.45, 99, 97.3, 96.11, 95.09, 95.98,
95.3, 92.88, 93.1, 91.2099999999999, 85.21, 85.6, 82.9500000000001,
80.85, 77.41, 78.66, 77.93, 73.88, 72.68, 71.09, 67.04, 68.25,
70.23, 67.86, 67.94, 69.44, 68.5, 67.11, 65.8899999999999, 64.7299999999999,
66.4900000000001, 67.2099999999999, 68.5400000000001, 69.56,
66.68, 67.24, 68.79, 69.74, 72.43, 73.17, 75.39, 79.2, 80.72,
83.04, 84.73, 87.82, 88.7, 92.38, 95.55, 97.0499999999999, 97.32,
97.59, 97.97, 97.96, 100.63, 100.77, 104.89, 105.38, 109.1, 107.84,
107.78, 105.08, 106.36, 103.95, 107.74, 107.58, 109.69, 112.46,
115.77, 117.11, 121.28, 123.4, 127.66, 127.01, 130.15, 131.31,
130.12, 129.88, 129.22, 128.48, 126.17, 127.26, 128.93, 127.57,
127.93, 128.92, 128.53, 128.72, 129.58, 129.12, 126.49, 127.31,
125.46, 125.09, 127.55, 129.01, 128.58, 128.7, 128.04, 127.13,
125.86, 124.27, 124.5, 125.45, 124.67, 126.13, 127.71, 126.17,
127.14, 125.11, 122.11, 120.4, 118.86, 117.7, 113.78, 109.02,
109.34, 108.79, 105.44, 108.11, 105.5, 106.75, 104.14, 100.34,
99.8099999999999, 95.9300000000001, 97.44, 97.63, 96.53, 97.25,
94.95, 93.06, 92.09, 86.22, 83.3400000000001, 77.04, 77.37, 74.88,
77.41, 78.21, 78.77, 81.79, 80.51, 84.71, 88.06, 88.25, 86.5300000000001,
86.6099999999999, 86.57, 89.06, 89.3499999999999, 90.92, 89.9100000000001,
90.26, 88.7699999999999, 88.08, 86.2699999999999, 84.93, 83.81,
83.18, 83.39, 84.43, 87.67, 87.23, 90.16, 89.88, 94.14, 97.64,
99.49, 100.61, 102.85, 103.01, 104.28, 106.94, 107.88, 107.5,
111.68, 110.92, 113.76, 113.8, 114.3, 115.95, 116.02, 113.34,
114.1, 118.21, 120.39, 122.09, 123.47, 122.46, 120.2, 117.88,
120.75, 119.2, 119.76, 120.43, 121.09, 120.36, 121.01, 120.87,
119.03, 120.17, 118.65, 118.13, 118.57, 118.84, 120.07, 119.93,
122.45, 125.04, 126.96, 124.86, 127.39, 129.6, 129.93, 127.71,
124.05, 120.7, 118.72, 116.6, 114.48, 110.81, 105.67, 97.48,
93.07, 95.25, 91.2100000000001, 89.47, 83.41, 84.85, 83.2500000000001,
80.89, 85.66, 86.3000000000001, 87.1499999999999, 87.1800000000001,
89.01, 91.27, 96.7400000000001, 100.41, 103.36, 108.56, 110.08,
109.31, 111, 109.77, 112.32, 114.05, 112.37, 114.59, 114.84,
114.31, 115.14, 115.38, 116.1, 112.19, 108.39, 106.65, 103.69,
100.23, 96.56, 92.07, 88.1000000000001, 80.79, 80.2800000000001,
76.8999999999999, 72.3900000000001, 70.37, 66.37, 66.8700000000001,
64.9199999999999, 63.23, 61.96, 53.95, 58.94, 59.9199999999999,
61, 60.3500000000001, 60.01, 57.33, 58.25, 59.37, 61.28, 62.61,
62.16, 63.38, 65.02, 68.9599999999999, 70.56, 71.54, 67.54, 69.34,
71.69, 73.87, 74.94, 78.94, 82.14, 84.88, 86.9299999999999, 90.26,
92.2500000000001, 95.15, 96.98, 99.2799999999999, 100.41, 105.08,
106.06, 106.95, 107.14, 106.84, 106.68, 104.91, 106.05, 102.63,
104.43, 103.92, 103.85, 103.71, 99.32, 103.02, 100.79, 101.5,
106.55, 108.73, 109.68, 112.88, 116.66, 116.59, 119.13, 122.81,
124.18, 127.61, 127.22, 129.81, 130.39, 131.11, 130.48, 126.35,
125.6, 118.47, 118.32, 116.57, 111.24, 109.76, 108.27, 105.05,
104.8, 103.92, 106.12, 108.32, 105.49, 104.84, 105.95, 103.16,
99.33, 98.52, 94.97, 92.6499999999999, 94.0800000000001, 92.09,
89.09)), row.names = c(NA, -399L), class = c("tbl_df", "tbl",
"data.frame")) %>%
mutate(t = make_datetime(sec = t)) %>%
as_tbl_time(index = t) %>%
time_decompose(value, merge = TRUE)
#> Error in `dplyr::filter()`:
#> ℹ In argument: `time_scale == key_value`.
#> Caused by error:
#> ! `..1` must be of size 8 or 1, not size 0.
#> Backtrace:
#> ▆
#> 1. ├─... %>% time_decompose(value, merge = TRUE)
#> 2. ├─anomalize::time_decompose(., value, merge = TRUE)
#> 3. ├─anomalize:::time_decompose.tbl_time(., value, merge = TRUE)
#> 4. │ └─data %>% ...
#> 5. ├─anomalize::decompose_stl(...)
#> 6. │ └─anomalize::time_frequency(data, period = frequency, message = message)
#> 7. │ └─template %>% ...
#> 8. ├─anomalize:::target_time_decomposition_scale(...)
#> 9. │ └─template %>% dplyr::filter(time_scale == key_value) %>% ...
#> 10. ├─dplyr::pull(., !!target_expr)
#> 11. ├─dplyr::filter(., time_scale == key_value)
#> 12. ├─dplyr:::filter.data.frame(., time_scale == key_value)
#> 13. │ └─dplyr:::filter_rows(.data, dots, by)
#> 14. │ └─dplyr:::filter_eval(dots, mask = mask, error_call = error_call)
#> 15. │ ├─base::withCallingHandlers(...)
#> 16. │ └─mask$eval_all_filter(dots, env_filter)
#> 17. │ └─dplyr (local) eval()
#> 18. ├─dplyr:::dplyr_internal_error(...)
#> 19. │ └─rlang::abort(class = c(class, "dplyr:::internal_error"), dplyr_error_data = data)
#> 20. │ └─rlang:::signal_abort(cnd, .file)
#> 21. │ └─base::signalCondition(cnd)
#> 22. └─dplyr (local) `<fn>`(`<dpl:::__>`)
#> 23. └─rlang::abort(message, class = error_class, parent = parent, call = error_call)
Created on 2023-02-14 with reprex v2.0.2
If I change the time unit in make_datetime from seconds to minutes (which does not match the real sampling rate), the time_decompose function works fine. However, I would like to carry out this analysis in the correct units (seconds). The dataset is, of course, only a small slice of the real dataset, but it is enough to reproduce the error.
library(tidyverse)
library(lubridate)
library(tibbletime)
library(anomalize)
structure(list(t = c(1001, 1002, 1003, 1004, 1005, 1006, 1007,
1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018,
1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029,
1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040,
1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051,
1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062,
1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073,
1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084,
1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095,
1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117,
1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128,
1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139,
1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150,
1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161,
1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172,
1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183,
1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194,
1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205,
1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216,
1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227,
1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238,
1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249,
1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260,
1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271,
1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282,
1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293,
1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304,
1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315,
1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326,
1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337,
1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346, 1347, 1348,
1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359,
1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370,
1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381,
1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392,
1393, 1394, 1395, 1396, 1397, 1398, 1399), value = c(118.62,
121.57, 121.08, 118.5, 118.09, 115.28, 115.8, 111.83, 116.73,
120.34, 120.11, 124.51, 125.28, 127.2, 128.45, 124.24, 122.7,
121, 116.26, 114.12, 111.08, 109.83, 107.71, 109.31, 108.4, 106.59,
103.34, 104.02, 106.15, 105.9, 105.96, 104.79, 104.04, 103.45,
102.07, 99.71, 97.9, 99.12, 100.45, 99, 97.3, 96.11, 95.09, 95.98,
95.3, 92.88, 93.1, 91.2099999999999, 85.21, 85.6, 82.9500000000001,
80.85, 77.41, 78.66, 77.93, 73.88, 72.68, 71.09, 67.04, 68.25,
70.23, 67.86, 67.94, 69.44, 68.5, 67.11, 65.8899999999999, 64.7299999999999,
66.4900000000001, 67.2099999999999, 68.5400000000001, 69.56,
66.68, 67.24, 68.79, 69.74, 72.43, 73.17, 75.39, 79.2, 80.72,
83.04, 84.73, 87.82, 88.7, 92.38, 95.55, 97.0499999999999, 97.32,
97.59, 97.97, 97.96, 100.63, 100.77, 104.89, 105.38, 109.1, 107.84,
107.78, 105.08, 106.36, 103.95, 107.74, 107.58, 109.69, 112.46,
115.77, 117.11, 121.28, 123.4, 127.66, 127.01, 130.15, 131.31,
130.12, 129.88, 129.22, 128.48, 126.17, 127.26, 128.93, 127.57,
127.93, 128.92, 128.53, 128.72, 129.58, 129.12, 126.49, 127.31,
125.46, 125.09, 127.55, 129.01, 128.58, 128.7, 128.04, 127.13,
125.86, 124.27, 124.5, 125.45, 124.67, 126.13, 127.71, 126.17,
127.14, 125.11, 122.11, 120.4, 118.86, 117.7, 113.78, 109.02,
109.34, 108.79, 105.44, 108.11, 105.5, 106.75, 104.14, 100.34,
99.8099999999999, 95.9300000000001, 97.44, 97.63, 96.53, 97.25,
94.95, 93.06, 92.09, 86.22, 83.3400000000001, 77.04, 77.37, 74.88,
77.41, 78.21, 78.77, 81.79, 80.51, 84.71, 88.06, 88.25, 86.5300000000001,
86.6099999999999, 86.57, 89.06, 89.3499999999999, 90.92, 89.9100000000001,
90.26, 88.7699999999999, 88.08, 86.2699999999999, 84.93, 83.81,
83.18, 83.39, 84.43, 87.67, 87.23, 90.16, 89.88, 94.14, 97.64,
99.49, 100.61, 102.85, 103.01, 104.28, 106.94, 107.88, 107.5,
111.68, 110.92, 113.76, 113.8, 114.3, 115.95, 116.02, 113.34,
114.1, 118.21, 120.39, 122.09, 123.47, 122.46, 120.2, 117.88,
120.75, 119.2, 119.76, 120.43, 121.09, 120.36, 121.01, 120.87,
119.03, 120.17, 118.65, 118.13, 118.57, 118.84, 120.07, 119.93,
122.45, 125.04, 126.96, 124.86, 127.39, 129.6, 129.93, 127.71,
124.05, 120.7, 118.72, 116.6, 114.48, 110.81, 105.67, 97.48,
93.07, 95.25, 91.2100000000001, 89.47, 83.41, 84.85, 83.2500000000001,
80.89, 85.66, 86.3000000000001, 87.1499999999999, 87.1800000000001,
89.01, 91.27, 96.7400000000001, 100.41, 103.36, 108.56, 110.08,
109.31, 111, 109.77, 112.32, 114.05, 112.37, 114.59, 114.84,
114.31, 115.14, 115.38, 116.1, 112.19, 108.39, 106.65, 103.69,
100.23, 96.56, 92.07, 88.1000000000001, 80.79, 80.2800000000001,
76.8999999999999, 72.3900000000001, 70.37, 66.37, 66.8700000000001,
64.9199999999999, 63.23, 61.96, 53.95, 58.94, 59.9199999999999,
61, 60.3500000000001, 60.01, 57.33, 58.25, 59.37, 61.28, 62.61,
62.16, 63.38, 65.02, 68.9599999999999, 70.56, 71.54, 67.54, 69.34,
71.69, 73.87, 74.94, 78.94, 82.14, 84.88, 86.9299999999999, 90.26,
92.2500000000001, 95.15, 96.98, 99.2799999999999, 100.41, 105.08,
106.06, 106.95, 107.14, 106.84, 106.68, 104.91, 106.05, 102.63,
104.43, 103.92, 103.85, 103.71, 99.32, 103.02, 100.79, 101.5,
106.55, 108.73, 109.68, 112.88, 116.66, 116.59, 119.13, 122.81,
124.18, 127.61, 127.22, 129.81, 130.39, 131.11, 130.48, 126.35,
125.6, 118.47, 118.32, 116.57, 111.24, 109.76, 108.27, 105.05,
104.8, 103.92, 106.12, 108.32, 105.49, 104.84, 105.95, 103.16,
99.33, 98.52, 94.97, 92.6499999999999, 94.0800000000001, 92.09,
89.09)), row.names = c(NA, -399L), class = c("tbl_df", "tbl",
"data.frame")) %>%
mutate(t = make_datetime(min = t)) %>%
as_tbl_time(index = t) %>%
time_decompose(value, merge = TRUE)
#> frequency = 60 minutes
#> trend = 399 minutes
#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
#> # A time tibble: 399 × 6
#> # Index: t
#> t value observed season trend remainder
#> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1970-01-01 16:41:00 119. 119. -0.485 104. 15.5
#> 2 1970-01-01 16:42:00 122. 122. -0.462 104. 18.4
#> 3 1970-01-01 16:43:00 121. 121. 0.274 104. 17.2
#> 4 1970-01-01 16:44:00 118. 118. 0.284 104. 14.6
#> 5 1970-01-01 16:45:00 118. 118. 0.348 104. 14.1
#> 6 1970-01-01 16:46:00 115. 115. -0.421 104. 12.0
#> 7 1970-01-01 16:47:00 116. 116. 0.582 104. 11.6
#> 8 1970-01-01 16:48:00 112. 112. 1.26 104. 6.90
#> 9 1970-01-01 16:49:00 117. 117. 2.72 104. 10.3
#> 10 1970-01-01 16:50:00 120. 120. 4.42 104. 12.2
#> # … with 389 more rows
Created on 2023-02-14 with reprex v2.0.2
I have a dictionary in which each key represents a node and each value is the list of communities that node belongs to. The entries are arranged in non-increasing order of the length of the value list. I need to create a list of keys, starting from the top-ranked key and iterating over all keys, that contains only the keys whose value list has no intersection with the value lists of the previously added keys.
This is the dictionary:
" {'2179': [15, 197, 363, 594, 766, 865, 1150, 1417, 1575, 1615, 1617, 1618, 1621, 1623, 1624, 1625, 1627], '2188': [15, 363, 766, 1150, 1417, 1616, 1617, 1618, 1619, 1620, 1622, 1624, 1625, 1626, 1629], '2180': [197, 594, 1150, 1575, 1616, 1617, 1618, 1619, 1620, 1622, 1624, 1625, 1626, 1629, 2201], '2195': [1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629], '2452': [1757, 1758, 1759, 1760, 1761, 1762, 1763, 1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772], '238': [57, 65, 76, 213, 251, 1080, 1126, 1448, 1896, 1897, 1898, 1899, 1900], '6974': [14, 122, 137, 491, 641, 660, 675, 1046, 1800, 2054, 2371], '124': [19, 66, 70, 113, 123, 159, 276, 297, 826, 2122], '3224': [18, 36, 44, 215, 230, 419, 1139, 1259, 2153], '100': [19, 66, 113, 297, 635, 826, 1356, 2122], '553': [40, 50, 133, 135, 192, 526, 1677,
1829]}"
.
I need to add keys to the list iteratively, adding the next key only when its list of values has no intersection with the list values of the previously added keys. This is the code I tried:
# Asker's attempt: walk the keys of new_dict by position, accumulating the
# union of the visited value lists in CommunitySet and the chosen keys in
# seedSet.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# statements below (in particular the two trailing Index increments) cannot
# be determined from the text alone.
k=len(new_dict)
seed=list(new_dict.keys())[0]
print(seed)
# NOTE(review): "CummunitySet" is spelled differently from "CommunitySet"
# on the next line and is not read again in this snippet.
CummunitySet=[]
CommunitySet=set(new_dict.get(seed))
print(CommunitySet)
# NOTE(review): set(seed) splits a string key into its characters; assumes
# keys are strings as in the sample dictionary — {seed} would keep the key whole.
seedSet=set(seed)
Index=1
# NOTE(review): seedCount and count are not assigned anywhere in this
# snippet; presumably they are defined elsewhere — verify.
while ((seedCount < k) & (Index < count)):
seed=list(new_dict.keys())[Index]
# NOTE(review): difference(...) != set() is true whenever the candidate has
# at least one community not yet collected; it does not test for an empty
# intersection, which is what the description asks for.
if(set(new_dict.get(seed)).difference(CommunitySet)!=set()):
CommunitySet = CommunitySet.union(new_dict.get(seed))
print(CommunitySet)
# NOTE(review): same character-splitting concern as for seedSet above.
seedSet = seedSet.union(set(seed))
Index=Index+1
seedCount=seedCount+1
else:
Index=Index+1
Index=Index+1
print(seedSet)
Thank you.
Maybe you could utilize set.intersection:
def main() -> None:
    """Print the entries whose values share nothing with the entries kept before them.

    The mapping is visited in insertion order. An entry is kept only when
    none of its values has already been claimed by a previously kept entry;
    the values of every kept entry are then marked as claimed.
    """
    data = {
        '2179': [15, 197, 363, 594, 766, 865, 1150, 1417, 1575, 1615, 1617, 1618, 1621, 1623, 1624, 1625, 1627],
        '2188': [15, 363, 766, 1150, 1417, 1616, 1617, 1618, 1619, 1620, 1622, 1624, 1625, 1626, 1629],
        '2180': [197, 594, 1150, 1575, 1616, 1617, 1618, 1619, 1620, 1622, 1624, 1625, 1626, 1629, 2201],
        '2195': [1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629],
        '2452': [1757, 1758, 1759, 1760, 1761, 1762, 1763, 1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772],
        '238': [57, 65, 76, 213, 251, 1080, 1126, 1448, 1896, 1897, 1898, 1899, 1900],
        '6974': [14, 122, 137, 491, 641, 660, 675, 1046, 1800, 2054, 2371],
        '124': [19, 66, 70, 113, 123, 159, 276, 297, 826, 2122],
        '3224': [18, 36, 44, 215, 230, 419, 1139, 1259, 2153],
        '100': [19, 66, 113, 297, 635, 826, 1356, 2122],
        '553': [40, 50, 133, 135, 192, 526, 1677, 1829]
    }

    claimed = set()   # every value belonging to an entry kept so far
    selected = {}     # kept entries, in the order they were accepted
    for node, communities in data.items():
        candidate = set(communities)
        # Accept the entry only when it overlaps with nothing claimed so far.
        if claimed.isdisjoint(candidate):
            claimed.update(candidate)
            selected[node] = communities

    print(selected)


if __name__ == '__main__':
    main()
Output:
{
'2179': [15, 197, 363, 594, 766, 865, 1150, 1417, 1575, 1615, 1617, 1618, 1621, 1623, 1624, 1625, 1627],
'2452': [1757, 1758, 1759, 1760, 1761, 1762, 1763, 1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772],
'238': [57, 65, 76, 213, 251, 1080, 1126, 1448, 1896, 1897, 1898, 1899, 1900],
'6974': [14, 122, 137, 491, 641, 660, 675, 1046, 1800, 2054, 2371],
'124': [19, 66, 70, 113, 123, 159, 276, 297, 826, 2122],
'3224': [18, 36, 44, 215, 230, 419, 1139, 1259, 2153],
'553': [40, 50, 133, 135, 192, 526, 1677, 1829]
}
Note: Indentation in output has been added manually for readability.
I am working to perform a bootstrap using the statistic median for dataset "file", containing only one column "Total". This is it:
Total <-
c(2089, 1567, 1336, 1616, 1590, 1649, 1341, 1614, 1590, 1621,
1621, 1631, 1295, 107, 18, 195, 2059, 870, 2371, 787, 98, 2422,
655, 1277, 1336, 2109, 1811, 1337, 1290, 1308, 1359, 1600, 1296,
693, 107, 1359, 89, 89, 89, 89, 2411, 1639, 89, 89, 1283, 89,
89, 89, 2341, 1012, 1295, 1853, 1277, 1571, 1288, 1300, 1619,
107, 555, 1612, 1300, 1300, 2093, 133, 1674, 988, 132, 647, 606,
544, 873, 274, 120, 1620, 1601, 1601, 906, 1603, 1613, 1592,
1603, 1610, 1321, 2380, 1575, 1575, 1277, 2354, 1561, 1579, 2367,
2341, 876, 1612, 1588, 2087, 1612, 890, 1586, 1580, 611, 1797,
2079, 1937, 189, 171, 706, 1647, 1642, 1278, 1650, 1623, 1647,
1661, 1692, 1632, 1684, 2474, 403, 842, 593, 98, 2354, 1265,
866, 1483, 2379, 1650, 1875, 1655, 1632, 1691, 1329, 867, 1632,
1693, 1623, 829, 1659, 1685, 666, 1585, 1659, 2169, 1623, 1645,
1654, 1698, 2172, 789, 1698, 579, 2443, 335, 132, 1952, 1265,
978, 1624, 979, 1729, 607, 181, 752, 424, 386, 309, 998, 1435,
2476, 392, 1657, 348, 1652, 1646, 1345, 2445, 1655, 840, 1624,
1652, 1321, 1321, 2201, 957, 917, 2458, 4096, 2458, 1346, 2459,
1634, 2459, 2459, 2459, 2508, 714, 2457, 2457, 1703, 669, 976,
1634, 2459, 2491, 2393, 625, 1763, 879, 886, 1085, 731, 924,
1649, 1216, 1647, 2470, 668, 2326, 757, 215, 276, 186, 901, 1402,
429, 554, 2457, 1643, 986, 730, 1028, 971, 1952, 1584, 1023,
1352, 839, 2434, 430, 2462, 1327, 1004, 385, 1099, 1067, 758,
679, 1423, 2495, 1664, 2495, 2495, 1345, 2530, 1754, 1804, 2525,
1652, 2536, 1646, 2529, 1380, 1845, 963, 1339, 2482, 1417, 1729,
1384, 1648, 344, 1648, 955, 609, 485, 1822, 513, 223, 222, 193,
1410, 1159, 586, 585, 2671, 2702, 2529, 2212, 1658, 741, 2529,
861, 1758, 905, 2529, 597, 1049, 2529, 619, 2620, 2596, 1688,
2590, 2545, 2590, 883, 287, 723, 2565, 1835, 1738, 2243, 1693,
2565, 250, 2529, 1880, 1777, 701, 444, 927, 1127, 825, 2726,
1977, 235, 241, 269, 660, 1523, 420, 678, 213, 544, 940, 983,
605, 2716, 1848, 1848, 182, 1225, 365, 993, 224, 267, 309, 271,
324, 178, 2657, 1772, 546, 456, 2637, 1771, 677, 1409, 653, 2359,
690, 828, 2742, 1812, 2777, 552, 1572, 2742, 2792, 2819, 1753,
265, 1901, 1753, 2716, 2800, 2742, 453, 2742, 586, 1920, 929,
1897, 2742, 1859, 1899, 1106, 1135, 759, 730, 1838, 863, 1929,
2751, 2751, 2751, 2751, 713, 430, 2788, 1784, 966, 2483, 1784,
1786, 2727, 857, 1798, 1815, 730, 390, 593, 1489, 1448, 1784,
1510, 2788, 812, 856, 808, 941, 2797, 2757, 1852, 2757, 2412,
486, 1034, 615, 845, 974, 727, 969, 2916, 1841, 1926, 1926, 533,
446, 733, 696, 1214, 1857, 1907, 2824, 2631, 3556, 2496, 1617,
1000, 707, 936, 761, 960, 1936, 857, 423, 1130, 1165, 2453, 338,
988, 1869, 1951, 1932, 2820, 2742, 628, 447, 866, 637, 932, 2742,
1795, 2881, 695, 762, 2778, 427, 714, 2781, 1865, 1861, 678,
1465, 1770, 845, 356, 817, 385, 1820, 2692, 1787, 1510, 1814,
857, 2616, 204, 465, 1773, 2754, 1793, 1773, 1900, 185, 2706,
1162, 766, 2742, 1816, 2742, 1790, 1803, 1795, 1026, 334, 832,
478, 1849, 2679, 1773, 797, 2649, 1814, 1808, 99, 2037, 2616,
2719, 1813, 2637, 2648, 1813, 865, 1717, 2588, 2711, 2818, 1828,
2553, 2720, 1791, 1780, 2706, 2565, 1717, 1881, 1037, 329, 893,
723, 1821, 2692, 2586, 2729, 1755, 1793, 2670, 2602, 2638, 2684,
1813, 1755, 1755, 2626, 832, 739, 724, 1968, 2598, 2627, 851,
749, 684, 625, 2673, 2778, 1764, 2644, 1800, 1792, 511, 2776,
1890, 1764, 2776, 1040, 1049, 2699, 2061, 897, 1764, 274, 2755,
1912, 2581, 1780, 820, 1803, 2692, 2783, 572, 2751, 2699, 1830,
1875, 633, 1083)
Then I tried to use the bootstrap function:
> boot (Total, median, 1000)
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = Total, statistic = median, R = 1000)
Bootstrap Statistics :
original bias std. error
t1* 1603 0 0
There were 50 or more warnings (use warnings() to see the first 50)
The warning message was:
the condition has length > 1 and only the first element will be used
Could you please advise me on how to perform a bootstrap that generates 95% confidence intervals for the median? I am a beginner in this and your help would be much appreciated.
Thank you so much in advance.
Admittedly the boot function from the boot package has a slightly non-intuitive aspect to it. But if you read the documentation (or look at the examples in the documentation) you'll see specific instructions about the statistic argument:
In all other cases statistic must take at least two arguments. The
first argument passed will always be the original data. The second
will be a vector of indices, frequencies or weights which define the
bootstrap sample.
So instead of:
x <- rnorm(10)
boot(data = x,statistic = median,R = 1000)
You want this:
boot(data = x,statistic = function(x,i) median(x[i]),R = 1000)
Once you're that far, the function boot.ci() can be used to compute the confidence intervals (only some of them are available in this particular example I believe).
b <- boot(data = x,statistic = function(x,i) median(x[i]),R = 1000)
boot.ci(b)
Although the answer by @joran is right, I already had tested code that includes the CI computation, so here it is.
library(boot)

# Statistic in the form boot() expects: the first argument is the data, the
# second is the vector of indices that selects the bootstrap resample.
bootMedian <- function(data, indices) {
  resample <- data[indices]
  median(resample)
}

# Draw 1000 bootstrap replicates of the median of Total.
b <- boot(data = Total, statistic = bootMedian, R = 1000)

# Confidence intervals for the bootstrapped median.
boot.ci(boot.out = b)
This is how you would "roll your own" bootstrap:
# Hand-rolled nonparametric bootstrap of the median of Total, followed by a
# 95% percentile confidence interval.

# Number of bootstrap replicates.
B <- 10000
# Preallocate a numeric container for the replicate medians
# (vector(length = B) would create a logical vector instead).
result_vec <- numeric(B)
for (b in seq_len(B)) {
  # Draw a bootstrap sample: same size as the data, with replacement.
  this_sample <- sample(Total, size = length(Total), replace = TRUE)
  # Calculate the statistic on the resample.
  m <- median(this_sample)
  # Save the calculated statistic.
  result_vec[b] <- m
}
# Then probably draw a histogram of the bootstrapped replicates.
hist(result_vec)
# 95% percentile interval: the 2.5% and 97.5% points of the sorted replicates.
result_vec <- result_vec[order(result_vec)]
lower_bound <- result_vec[round(0.025 * B)]
# 0.975, not 0.0975: the latter picks the 9.75% point and yields an interval
# that lies entirely in the lower tail.
upper_bound <- result_vec[round(0.975 * B)]
I use the standard normal random generator in this code:
# Parametric bootstrap template: simulate B normal samples, refit each one
# with optim(), and store the estimates together with optim's convergence code.
# NOTE(review): i, n and loglik are not defined in this snippet; presumably
# the replicate count, the sample size and a log-likelihood function — verify.
B <- i
# One row per replicate; columns are mu, sd and the convergence code.
bs.result <- matrix(NA, nrow = B, ncol = 3)
for (b in seq_len(B)) {
  # `...` are placeholders for the mean and sd to simulate from.
  sample.n <- rnorm(n, mean = ..., sd = ...)
  # A negative fnscale turns optim's minimisation into a maximisation.
  optim.b <- optim(
    c(mu = 0, sd = 1), loglik,
    control = list(fnscale = -1), z = sample.n
  )
  # Fill row b rather than overwriting the whole matrix, and name the
  # `convergence` component in full instead of relying on partial matching.
  bs.result[b, ] <- c(optim.b$par, optim.b$convergence)
}
With the last column of the result matrix you can check, for each replicate, whether the optimizer converged.