I am currently working with a .txt file and have used the read_table2() function to read it, resulting in the following structure.
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
<chr> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 FVP110~ 2.08e6 1101~ 1.10e 3 6 0 0 0 6 01101 6 0 0 0 6 01101 6 0 0 0
2 FVP110~ 2.06e4 8 9.3 e 1 2 93 0 0 0 0 0 093 0 0 0 0 0 093 0 0
3 FVP110~ 2.10e6 6 9.3 e 1 2 93 0 0 0 0 0 093 0 0 0 0 0 093 0 0
4 FVP110~ 2.10e6 6 3.11e18 3111 8 0 0 0 8 03111 8 0 0 0 8 03111 8 0 0
5 FVP110~ 2.08e6 94 2 e 0 94 0 0 0 0 0 094 0 0 0 0 0 094 0 0 0
6 FVP110~ 2.06e4 6 9.2 e 1 2 92 0 0 0 0 0 092 0 0 0 0 0 092 0 0
# ... with 31 more variables: X21 <chr>, X22 <chr>, X23 <chr>, X24 <chr>, X25 <chr>, X26 <chr>, X27 <chr>, X28 <chr>,
# X29 <chr>, X30 <chr>, X31 <chr>, X32 <chr>, X33 <chr>, X34 <chr>, X35 <chr>, X36 <chr>, X37 <chr>, X38 <chr>, X39 <chr>,
# X40 <chr>, X41 <chr>, X42 <chr>, X43 <chr>, X44 <chr>, X45 <chr>, X46 <chr>, X47 <chr>, X48 <chr>, X49 <chr>, X50 <chr>,
# X51 <dbl>
I know that my first column, instead of being a single string like FVP1104Q1V110121011010110110527421101011165, is really a fixed-width sequence of fields: 4 chr, 3 dbl, 2 chr, 2 chr, 1 dbl, 2 dbl, etc. In total there are 51 columns, but if parsed correctly they will become a total of 129.
These are the first 10 rows and 10 columns of my data set.
structure(list(X1 = c("FVP1104Q1V110121011010110110527421101011165",
"FVP1104Q1V110121011010110110527421101022262", "FVP1104Q1V110121011010110110527421101033231",
"FVP1104Q1V110121011010110110527421101044134", "FVP1104Q1V110121011010110110527421102011165",
"FVP1104Q1V110121011010110110527421102022260", "FVP1104Q1V110121011010110110527421102033138",
"FVP1104Q1V110121011010110110527421102044232", "FVP1104Q1V11012101101011011052742110205616",
"FVP1104Q1V110121011010110110527421102063142"), X2 = c(2080110,
20601, 2100112, 2100112, 2080110, 20601, 2120115, 2100112, 10501,
40701), X3 = c("11011116112", "8", "6", "6", "94", "6", "6",
"6", "124", "8"), X4 = c(1101, 93, 93, 3111045932226084352, 2,
92, 3185102331226052608, 93, 91, 6), X5 = c(6, 2, 2, 3111, 94,
2, 3185, 2, 2, 11011216112), X6 = c(0, 93, 93, 8, 0, 92, 8, 93,
91, 1101), X7 = c("0", "0", "0", "0", "0", "0", "0", "0", "0",
"6"), X8 = c("0", "0", "0", "0", "0", "0", "0", "0", "0", "0"
), X9 = c("6", "0", "0", "0", "0", "0", "0", "0", "0", "0"),
X10 = c("01101", "0", "0", "8", "0", "0", "8", "0", "0",
"0"), X11 = c("6", "0", "0", "03111", "094", "0", "03185",
"0", "0", "6"), X12 = c("0", "093", "093", "8", "0", "092",
"8", "093", "091", "01101"), X13 = c("0", "0", "0", "0",
"0", "0", "0", "0", "0", "6"), X14 = c("0", "0", "0", "0",
"0", "0", "0", "0", "0", "0"), X15 = c("6", "0", "0", "0",
"0", "0", "0", "0", "0", "0")), row.names = c(NA, 10L), class = "data.frame")
And I want to get
structure(list(fileid = structure(c("FVP1", "FVP1", "FVP1", "FVP1",
"FVP1", "FVP1", "FVP1", "FVP1", "FVP1", "FVP1"), label = "File Identification", format.stata = "%9s"),
schedule = structure(c(104, 104, 104, 104, 104, 104, 104,
104, 104, 104), label = "Schedule", format.stata = "%8.0g"),
quarter = structure(c("Q3", "Q3", "Q3", "Q3", "Q3", "Q3",
"Q3", "Q3", "Q3", "Q3"), label = "Quarter", format.stata = "%9s"),
visit = structure(c("V1", "V1", "V1", "V1", "V1", "V1", "V1",
"V1", "V1", "V1"), label = "Visit", format.stata = "%9s"),
sector = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Sector", format.stata = "%8.0g"),
state = structure(c(36, 36, 36, 36, 36, 36, 36, 36, 36, 36
), label = "State/Ut Code", format.stata = "%8.0g"), district = structure(c(10,
10, 10, 10, 10, 10, 10, 10, 10, 10), label = "District Code", format.stata = "%8.0g"),
region = structure(c(362, 362, 362, 362, 362, 362, 362, 362,
362, 362), label = "NSS-Region", format.stata = "%8.0g"),
stratum = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Stratum", format.stata = "%8.0g"),
substratum = structure(c(8, 8, 8, 8, 8, 8, 8, 8, 8, 8), label = "Sub-Stratum", format.stata = "%8.0g"),
subsample = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Sub-Sample", format.stata = "%8.0g"),
subregion = structure(c(3613, 3613, 3613, 3613, 3613, 3613,
3613, 3613, 3613, 3613), label = "Fod Sub-Region", format.stata = "%8.0g"),
fsu = structure(c(50030, 50030, 50030, 50030, 50030, 50030,
50030, 50030, 50030, 50030), label = "FSU", format.stata = "%10.0g"),
sbno = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Sample Sg/Sb No.", format.stata = "%8.0g"),
sss = structure(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2), label = "Second Stage Stratum No.", format.stata = "%8.0g")), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
I'm trying to replicate the reshaping of a .txt file via a dictionary (.dct), as one would do in Stata, but I can't find a clear way to do that in R.
My data also includes NAs.
As per MrFlick's suggestion, we can use tidyr::separate to break apart your first column into multiple columns by position:
library(tidyr)
data.frame(X1 = "FVP1104Q1V110121011010110110527421101011165") %>%
separate(
X1,
sep = c(4, 7, 9, 11, 12),
into = paste0("X1_", 1:6)
)
# X1_1 X1_2 X1_3 X1_4 X1_5 X1_6
# 1 FVP1 104 Q1 V1 1 0121011010110110527421101011165
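If the field names are known (as in the desired output above), the same separate() call can also produce named, typed columns. The names and cut points below are only a sketch inferred from the desired output (4-character file id, 3-digit schedule, 2-character quarter, ...); extend sep and into with the remaining field widths to cover all 129 fields:
library(dplyr)
library(tidyr)
data.frame(X1 = "FVP1104Q1V110121011010110110527421101011165") %>%
  separate(
    X1,
    sep = c(4, 7, 9, 11, 12),
    into = c("fileid", "schedule", "quarter", "visit", "sector", "rest")
  ) %>%
  mutate(across(c(schedule, sector), as.numeric))   # convert the purely numeric fields
#   fileid schedule quarter visit sector                            rest
# 1   FVP1      104      Q1    V1      1 0121011010110110527421101011165
Alternatively, readr::read_fwf() with fwf_widths() (or fwf_cols()) lets you declare the full list of field widths and names once at read time, which is probably the closest R analogue to a Stata .dct dictionary.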
My dataframe loopsubset_created contains 30 observations of 45 variables. (Below you will find str(loopsubset_created) and a dput(loopsubset_created) sample).
Now I would like to create scatter plots of the PdKeyT variable (y) versus five of the band-value variables (BLUE, GREEN, RED, SWIR1, SWIR2) (x), with
each variable in ONE panel
all panels aligned in ONE row
using the PdKeyT variable as a common y-axis.
In the end it should basically look like this: (I did this with ggscatter, but for flexibility reasons I would prefer basically using ggplot)
Here is my issue: when trying this with ggplot, I cannot find the right way to get the arrangement shown above, because I cannot figure out the right code for separating/grouping by variables. I found hundreds of tutorials for faceting by multiple categorical values within one variable, but not by multiple variables.
With the following code
ggplot(loopsubset_created, aes(y = PdKeyT)) +
geom_point(aes(x = BLUE, col = "BLUE")) +
geom_point(aes(x = GREEN, col = "GREEN")) +
geom_point(aes(x = RED, col = "RED")) +
geom_point(aes(x = SWIR1, col = "SWIR1")) +
geom_point(aes(x = SWIR2, col = "SWIR2"))
I came to this basic result
Here the basic question:
Now I would like to arrange the 5 layers separately in one row, as depicted above.
Does anybody have an idea for me?
Plus some information around the question:
Though the following aspects are not directly part of my question, I'd like to describe my final idea for the plot (in order to avoid your advice clashing with further requirements):
Each panel should include
the Spearman correlation value and corresponding p-value (as shown above), and
additionally the Pearson correlation value and corresponding p-value
a linear regression with confidence interval (as shown above) or another type of regression line (not shown)
Points should be coloured by variable (BLUE = blue, RED = red, GREEN = green, SWIR1 and SWIR2 by some other colours, e.g. magenta and violet)
Later on, points and regression lines should be subdivided by ranges of PdKeyT (e.g. below -10, -10 to 30, and above 30), using different brightness values of each variable's basic colour (blue, green, ...), analogous to this:
All panels should use ONE common y-axis at the left as explained
And I would like to adapt the x-axes to the range of the respective variable (e.g. BLUE, GREEN and RED ranging from 500 to 3000, and the SWIRs from 0 to 1500).
Edit 31.10.2021, referring to your answers:
Would it furthermore be possible, with your respective approaches, to limit the x-axes individually as described in the 'further requirements' of my question (B-G-R ranging from 500 to 3000, SWIRs from 0 to 1500), using coord_cartesian(xlim = c(min, max))? I am asking because I have read some discussions about issues with limiting axes depending on the faceting approach. But I'd like to control the x-axes, because I'll have many of these plots stacked on top of each other (my sample mirrored the data of just one sampling point out of 300), and I would be glad to get them aligned.
In the meantime, I'd prefer to distinguish points and regression lines just by grayscale colours (the same for all bands) and instead colour the panels via theme(panel.background = element_rect(fill = "#xxxxxx")). Do you see an issue with that?
Finally, some information on and a sample of my data:
> str(loopsubset_created)
'data.frame': 30 obs. of 45 variables:
$ Site_ID : chr "A" "A" "A" "A" ...
$ Spot_Nr : chr "1" "1" "1" "1" ...
$ Transkt_Nr : chr "2" "2" "2" "2" ...
$ Point_Nr : chr "4" "4" "4" "4" ...
$ n : int 30 30 30 30 30 30 30 30 30 30 ...
$ rank : int 3 3 3 3 3 3 3 3 3 3 ...
$ Tile : chr "1008" "1008" "1008" "1008" ...
$ Date : int 20190208 20190213 20190215 20190218 20190223 20190228 20190302 20190305 20190315 20190320 ...
$ id : chr "22" "22" "22" "22" ...
$ Point_ID : chr "1022" "1022" "1022" "1022" ...
$ Site_Nr : chr "1" "1" "1" "1" ...
$ Point_x : num 356251 356251 356251 356251 356251 ...
$ Point_y : num 5132881 5132881 5132881 5132881 5132881 ...
$ Classification : num 7 7 7 7 7 7 7 7 7 7 ...
$ Class_Derived : chr "WW" "WW" "WW" "WW" ...
$ BLUE : num 1112 1095 944 1144 1141 ...
$ GREEN : num 1158 1178 1009 1288 1265 ...
$ RED : num 599 708 613 788 835 ...
$ REDEDGE1 : num 359 520 433 576 665 761 618 598 881 619 ...
$ REDEDGE2 : num 83 82 65 169 247 404 116 118 532 162 ...
$ REDEDGE3 : num 73 116 81 142 233 391 56 171 538 131 ...
$ BROADNIR : num 44 93 60 123 262 349 74 113 560 125 ...
$ NIR : num 37 70 66 135 215 313 110 135 504 78 ...
$ SWIR1 : num 187 282 184 225 356 251 240 216 507 197 ...
$ SWIR2 : num 142 187 155 197 281 209 192 146 341 143 ...
$ Quality.assurance.information: num 26664 10272 10272 10272 8224 ...
$ Q00_VAL : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q01_CS1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q02_CSS : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q03_CSH : num 1 0 0 0 0 0 0 0 1 0 ...
$ Q04_SNO : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q05_WAT : num 1 1 1 1 1 1 1 1 1 1 ...
$ Q06_AR1 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q07_AR2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q08_SBZ : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q09_SAT : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q10_ZEN : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q11_IL1 : num 1 1 1 1 0 0 0 0 0 0 ...
$ Q12_IL2 : num 0 0 0 0 0 0 0 0 0 0 ...
$ Q13_SLO : num 1 1 1 1 1 1 1 1 1 1 ...
$ Q14_VAP : num 1 0 0 0 0 0 0 0 1 0 ...
$ Q15_WDC : num 0 0 0 0 0 0 0 0 0 0 ...
$ PdMax : int -7 -19 -20 -22 -24 -25 -26 -25 -21 -15 ...
$ PdMin : int -13 -23 -24 -26 -28 -29 -29 -28 -24 -20 ...
$ PdKeyT : int -10 -20 -22 -22 -27 -26 -26 -27 -22 -17 ...
loopsubset_created <- structure(list(Site_ID = c("A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A"), Spot_Nr = c("1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1"), Transkt_Nr = c("2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"), Point_Nr = c("4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4"), n = c(30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L), rank = c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), Tile = c("1008",
"1008", "1008", "1008", "1008", "1008", "1008", "1008", "1008",
"1008", "1008", "1008", "1008", "1008", "1008", "1008", "1008",
"1008", "1008", "1008", "1008", "1008", "1008", "1008", "1008",
"1008", "1008", "1008", "1008", "1008"), Date = c(20190208L,
20190213L, 20190215L, 20190218L, 20190223L, 20190228L, 20190302L,
20190305L, 20190315L, 20190320L, 20190322L, 20190325L, 20190330L,
20190401L, 20190416L, 20190419L, 20190421L, 20190501L, 20190506L,
20190524L, 20190531L, 20190603L, 20190620L, 20190625L, 20190630L,
20190705L, 20190710L, 20190809L, 20190814L, 20190903L), id = c("22",
"22", "22", "22", "22", "22", "22", "22", "22", "22", "22", "22",
"22", "22", "22", "22", "22", "22", "22", "22", "22", "22", "22",
"22", "22", "22", "22", "22", "22", "22"), Point_ID = c("1022",
"1022", "1022", "1022", "1022", "1022", "1022", "1022", "1022",
"1022", "1022", "1022", "1022", "1022", "1022", "1022", "1022",
"1022", "1022", "1022", "1022", "1022", "1022", "1022", "1022",
"1022", "1022", "1022", "1022", "1022"), Site_Nr = c("1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1"), Point_x = c(356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781, 356250.781, 356250.781, 356250.781, 356250.781,
356250.781, 356250.781), Point_y = c(5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701, 5132880.701, 5132880.701,
5132880.701, 5132880.701, 5132880.701), Classification = c(7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7), Class_Derived = c("WW", "WW", "WW",
"WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW",
"WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW", "WW",
"WW", "WW", "WW", "WW", "WW"), BLUE = c(1112, 1095, 944, 1144,
1141, 1010, 968, 1023, 1281, 1124, 1215, 1154, 1188, 1177, 1622,
1305, 1215, 2282, 2322, 2337, 2680, 2473, 1143, 1187, 1165, 1040,
1290, 1112, 1474, 1131), GREEN = c(1158, 1178, 1009, 1288, 1265,
1208, 1122, 1146, 1416, 1298, 1379, 1345, 1379, 1366, 1714, 1446,
1354, 2417, 2417, 2500, 2967, 2587, 1469, 1522, 1544, 1253, 1514,
1371, 1875, 1416), RED = c(599, 708, 613, 788, 835, 852, 726,
729, 1044, 816, 905, 908, 948, 970, 1206, 944, 935, 1648, 1741,
2004, 2109, 2032, 1241, 1290, 1419, 1206, 1424, 1339, 1969, 1321
), REDEDGE1 = c(359, 520, 433, 576, 665, 761, 618, 598, 881,
619, 722, 771, 829, 823, 937, 725, 759, 1327, 1395, 1756, 1718,
1753, 1533, 1528, 1683, 1335, 1605, 1499, 2016, 1592), REDEDGE2 = c(83,
82, 65, 169, 247, 404, 116, 118, 532, 162, 183, 218, 285, 200,
514, 182, 230, 568, 531, 1170, 780, 1101, 1192, 1174, 1250, 949,
1121, 1127, 1382, 1159), REDEDGE3 = c(73, 116, 81, 142, 233,
391, 56, 171, 538, 131, 205, 137, 321, 253, 503, 193, 214, 564,
527, 1192, 698, 1177, 1203, 1259, 1341, 1049, 1146, 1216, 1416,
1188), BROADNIR = c(44, 93, 60, 123, 262, 349, 74, 113, 560,
125, 121, 211, 325, 221, 480, 184, 178, 461, 435, 1067, 570,
1023, 961, 966, 964, 844, 764, 993, 1197, 834), NIR = c(37, 70,
66, 135, 215, 313, 110, 135, 504, 78, 115, 216, 197, 163, 462,
113, 165, 392, 349, 1006, 574, 1092, 1153, 1143, 1128, 961, 1033,
1027, 1164, 1086), SWIR1 = c(187, 282, 184, 225, 356, 251, 240,
216, 507, 197, 306, 260, 298, 290, 400, 190, 300, 275, 204, 678,
528, 1087, 1091, 1049, 1310, 935, 1199, 1169, 984, 1139), SWIR2 = c(142,
187, 155, 197, 281, 209, 192, 146, 341, 143, 271, 220, 246, 232,
387, 168, 217, 193, 173, 540, 374, 764, 766, 799, 869, 724, 827,
794, 745, 848), Quality.assurance.information = c(26664, 10272,
10272, 10272, 8224, 8224, 8224, 8224, 24616, 8224, 8224, 8224,
32, 8224, 8288, 24616, 8224, 8240, 48, 8208, 8240, 8192, 8192,
24648, 8192, 8192, 8192, 8192, 0, 8224), Q00_VAL = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q01_CS1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Q02_CSS = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q03_CSH = c(1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Q04_SNO = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q05_WAT = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1), Q06_AR1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Q07_AR2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q08_SBZ = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q09_SAT = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), Q10_ZEN = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q11_IL1 = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q12_IL2 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q13_SLO = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 1), Q14_VAP = c(1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Q15_WDC = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), PdMax = c(-7L, -19L, -20L,
-22L, -24L, -25L, -26L, -25L, -21L, -15L, -19L, -17L, -23L,
-22L, -4L, -7L, -8L, 55L, 57L, 47L, 67L, 44L, 21L, 18L, 13L,
16L, 16L, 9L, 12L, 11L), PdMin = c(-13L, -23L, -24L, -26L,
-28L, -29L, -29L, -28L, -24L, -20L, -22L, -22L, -26L, -26L,
-7L, -11L, -11L, 46L, 47L, 36L, 52L, 37L, 17L, 14L, 9L, 11L,
9L, 5L, 5L, 2L), PdKeyT = c(-10L, -20L, -22L, -22L, -27L,
-26L, -26L, -27L, -22L, -17L, -19L, -19L, -23L, -23L, -5L,
-9L, -9L, 54L, 53L, 40L, 60L, 43L, 20L, 15L, 13L, 15L, 13L,
7L, 9L, 6L)), row.names = 198:227, class = "data.frame")
Update:
To fulfill your last task, we could reuse the code from Allan Cameron's answer: adding another column to set the cuts with mutate(range = cut(PdKeyT, c(-Inf, -10, 30, Inf), c("Low", "Mid", "High"))) %>%
library(tidyverse)
library(ggpubr)
df_long_list <- loopsubset_created %>%
select(PdKeyT, BLUE, GREEN, RED, SWIR1, SWIR2) %>%
pivot_longer(
cols = -PdKeyT
) %>%
mutate(color = case_when(name=="BLUE" ~ "blue",
name=="GREEN" ~ "green",
name=="RED" ~ "red",
name=="SWIR1" ~ "magenta",
name=="SWIR2" ~ "violet"))%>%
mutate(range = cut(PdKeyT, c(-Inf, -10, 30, Inf), c("Low", "Mid", "High"))) %>%
group_split(name)
p <- ggplot()
for (i in 1:5) p <- p + geom_point(data=df_long_list[[i]], aes(value, PdKeyT, color=color, alpha=range))+
geom_smooth(data=df_long_list[[i]], aes(value, PdKeyT, group=range), method = lm, se=TRUE)+
theme(legend.position="none") +
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Spearman",..r.label.., ..p.label.., sep = "~`,`~")), method="spearman",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 70)+
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Pearson",..r.label.., ..p.label.., sep = "~`,`~")), method="pearson",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 65)+
facet_grid(.~name, scales = "free") +
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = margin(120, 10, 120, 10),
panel.border = element_rect(fill = NA, color = "black"))
p
Here is how you could do it:
select all relevant columns
bring the data into long format
add a color column to the dataframe
make a list of dataframes with group_split
use a for loop to iterate over each of the 5 dataframes in the list
within the loop, add stat_cor for Pearson and Spearman from the ggpubr package
facet and do some formatting
library(tidyverse)
library(ggpubr)
df_long_list <- loopsubset_created %>%
select(PdKeyT, BLUE, GREEN, RED, SWIR1, SWIR2) %>%
pivot_longer(
cols = -PdKeyT
) %>%
mutate(color = case_when(name=="BLUE" ~ "blue",
name=="GREEN" ~ "green",
name=="RED" ~ "red",
name=="SWIR1" ~ "magenta",
name=="SWIR2" ~ "violet"))%>%
group_split(name)
p <- ggplot()
for (i in 1:5) p <- p + geom_point(data=df_long_list[[i]], aes(value, PdKeyT, color=color))+
geom_smooth(data=df_long_list[[i]], aes(value, PdKeyT), method = lm, se=TRUE)+
theme(legend.position="none") +
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Spearman",..r.label.., ..p.label.., sep = "~`,`~")), method="spearman",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 70)+
stat_cor(data=df_long_list[[i]], aes(value, PdKeyT,
label=paste("Pearson",..r.label.., ..p.label.., sep = "~`,`~")), method="pearson",
# label.x.npc="left", label.y.npc="top", hjust=0) +
label.x = 3, label.y = 65)+
facet_grid(.~name, scales = "free_y") +
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = margin(120, 10, 120, 10),
panel.border = element_rect(fill = NA, color = "black"))
p
To panel plots, use facet_wrap or facet_grid. Also, ggplot2 generally works better when your data are in a long format. This allows you to assign a variable to an aesthetic rather than do it manually as you have.
library(ggplot2)
library(tidyr)
library(purrr)
library(dplyr)
library(tibble)
# lengthen your data so variable names are in a column
df <- loopsubset_created %>%
pivot_longer(cols = c(BLUE:RED, starts_with("SWIR")))
# get correlation coef and pvalue
r <- map(split(df, ~ name), ~ with(.x, c(cor(PdKeyT, value, method = "spearman"),
cor.test(PdKeyT, value, method = "spearman")$p.value))) %>%
bind_rows() %>%
rownames_to_column("i") %>% # first row is coef, second row is p value
pivot_longer(-i) %>%
mutate(lab = ifelse(i == 1,
# formatted so will be parsed by geom_text
sprintf("italic(R) == %0.5f", value),
sprintf("italic(p) == %0.5f", value)),
x = -Inf, # left of panel
y = Inf, # top of panel,
vjust = ifelse(i == 1, 0.75, 2)) # put p-value below
df %>%
ggplot(aes(x = value, y = PdKeyT, color = name)) +
geom_point() +
geom_text(data = r,
aes(x = x, y = y,
label = lab,
vjust = vjust),
size = 3,
parse = T,
inherit.aes = F) +
geom_smooth(method = "lm",
se = T,
formula = y ~ x,
show.legend = F) +
facet_grid(~ name,
scales = "free_x") +
labs(color = element_blank(),
x = "XLAB")
I think this fulfills most of your requirements, other than the correlation annotations. If, as you mention in your question, you wish to have 3 regressions per panel (one for each of the three ranges of PdKeyT), you would also need 3 correlation coefficients and p-values per panel, which will be messy.
The reason why you have not seen tutorials for having different facets per variable is that this is not what facets are. Facets are a way of displaying data that have the same x and y axis but differ by some other categorical variable. They are not intended as a way of plotting different x variables against the same y variable. What you are describing is 5 distinct plots side-by-side, not facets.
Having said that, it is still possible to create the plot you are looking for with creative use of facets. You first need to shape the data into long format so that the values of the different x axis columns get stacked into a single column called value, and a new column called name is created to label each value according to which column it originally came from.
We can then use the new value column as our x axis variable, and facet according to the name column.
To make this look more authentic, we make some theme adjustments to ensure the facet strips resemble axis labels:
library(dplyr)
library(tidyr)
library(ggplot2)
loopsubset_created %>%
select(PdKeyT, BLUE, GREEN, RED, SWIR1, SWIR2) %>%
pivot_longer(-1) %>%
mutate(range = cut(PdKeyT, c(-Inf, -10, 30, Inf), c("Low", "Mid", "High"))) %>%
ggplot(aes(value, PdKeyT, color = name)) +
geom_point(aes(alpha = range)) +
geom_smooth(aes(group = range), size = 0.1,
method = "lm", formula = y ~ x, color = "black") +
labs(x = "") +
facet_grid(.~name, switch = "x", scales = "free_x") +
scale_color_manual(values = c("blue", "green", "red", "magenta", "violet")) +
theme_bw() +
theme(strip.placement = "outside",
strip.background = element_blank(),
plot.margin = margin(120, 10, 120, 10),
legend.position = "none")
My question is somewhat related to an already answered question, Need to extract individual characters from a string column using R.
I tried to solve it with what I know, and now need to figure out how to remove the non-numeric characters in parentheses within a string.
This is the dataframe with column x:
team linescore ondate x
1 NYM 010000000 2020-08-01 0, 1, 0, 0, 0, 0, 0, 0, 0
2 NYM (10)1140006x) 2020-08-02 (, 1, 0, ), 1, 1, 4, 0, 0, 0, 6, x, )
3 BOS 002200010 2020-08-13 0, 0, 2, 2, 0, 0, 0, 1, 0
4 NYM 00000(11)01x 2020-08-15 0, 0, 0, 0, 0, (, 1, 1, ), 0, 1, x
5 BOS 311200 2020-08-20 3, 1, 1, 2, 0, 0
structure(list(team = c("NYM", "NYM", "BOS", "NYM", "BOS"), linescore = c("010000000",
"(10)1140006x)", "002200010", "00000(11)01x", "311200"), ondate = structure(c(18475,
18476, 18487, 18489, 18494), class = "Date"), x = list(c("0",
"1", "0", "0", "0", "0", "0", "0", "0"), c("(", "1", "0", ")",
"1", "1", "4", "0", "0", "0", "6", "x", ")"), c("0", "0", "2",
"2", "0", "0", "0", "1", "0"), c("0", "0", "0", "0", "0", "(",
"1", "1", ")", "0", "1", "x"), c("3", "1", "1", "2", "0", "0"
))), class = "data.frame", row.names = c(NA, -5L))
Desired Output:
team linescore ondate x
1 NYM 010000000 2020-08-01 0, 1, 0, 0, 0, 0, 0, 0, 0
2 NYM (10)1140006x) 2020-08-02 10, 1, 1, 4, 0, 0, 0, 6, x, )
3 BOS 002200010 2020-08-13 0, 0, 2, 2, 0, 0, 0, 1, 0
4 NYM 00000(11)01x 2020-08-15 0, 0, 0, 0, 0, 11, 0, 1, x
5 BOS 311200 2020-08-20 3, 1, 1, 2, 0, 0
How can I change (, 1, 0, ) to 10 and (, 1, 1, ) to 11, and leave the rest as is?
Some help I have already received so far:
regex for replacement of specific character outside parenthesis only thanks AnilGoyal
gsub("\\D+", "", str1) thanks to akrun
gsub("[(,) ]", "", "(, 1, 0, )") thanks to Anoushirvan
Thanks!
We could do this in base R. An option is to insert a delimiter between the characters that are outside the (...) with *SKIP/*FAIL, then remove the paired () while keeping the characters by capturing them as a group, and finally return the list by splitting at the , with strsplit.
df1$x <- strsplit(gsub("\\((\\d+)\\)", "\\1,",
gsub("\\([^)]+\\)(*SKIP)(*FAIL)|(.)", "\\1,",
df1$linescore, perl = TRUE)),",")
-output
df1$x
[[1]]
[1] "0" "1" "0" "0" "0" "0" "0" "0" "0"
[[2]]
[1] "10" "1" "1" "4" "0" "0" "0" "6" "x" ")"
[[3]]
[1] "0" "0" "2" "2" "0" "0" "0" "1" "0"
[[4]]
[1] "0" "0" "0" "0" "0" "11" "0" "1" "x"
[[5]]
[1] "3" "1" "1" "2" "0" "0"
Here is another way to get to your desired output, which I just figured out, that does not rely on regex. However, the use of regex makes the solution much more elegant and compact:
library(purrr)
map(df %>% select(linescore), ~ strsplit(.x, "\\(|\\)")) %>%
flatten() %>%
map_dfr(~ map(.x, ~ if(nchar(.x) > 2) strsplit(.x, "")[[1]] else .x) %>%
reduce(~ c(.x, .y)) %>%
keep(~ nchar(.x) != 0) %>% t() %>%
as_tibble() %>%
set_names(~ paste0("inng", 1:length(.x))))
# A tibble: 5 x 9
inng1 inng2 inng3 inng4 inng5 inng6 inng7 inng8 inng9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 0 1 0 0 0 0 0 0 0
2 10 1 1 4 0 0 0 6 x
3 0 0 2 2 0 0 0 1 0
4 0 0 0 0 0 11 0 1 x
5 3 1 1 2 0 0 NA NA NA
I'm fairly new to R and machine learning, and I'm trying to build a logistic regression model to determine which of the clinical variables best predict the outcome of death (0 being death and 1 being alive in the dataset below).
Here is the dput of the data
structure(list(ID = c(13154920, 13201107, 13207948, 13234892,
13082943, 13193903, 13283776, 13154288, 13269178, 13055690, 13207670,
13220627, 13055009, 13044947, 13060589, 13201616, 13054278, 13160156,
13160971, 13239318, 13321288, 13154966, 13165362, 12999835, 13223721,
13064865, 13104602, 13036280, 13040507, 12964437, 13029805, 13029001,
12993036, 13072516, 13060586, 13119819, 13040632, 13055908, 13059026,
13207119, 13261022, 13259391, 13262499, 13207315, 13135316, 13233898,
13181075, 13261607, 13186960, 13240091, 13260671, 13302375, 13021555,
13062360, 13035346, 13077712, 13128769, 13267480, 13040172, 12977871,
13090190, 13040530, 13100979, 13192142, 13289317, 13315577, 13044653,
13079694, 13128639, 13207352, 13049409, 13210994, 13283675),
outcome = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1), day = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), pa02 = c("121", "NA", "78.7", "69.099999999999994", "131",
"26.3", "68.900000000000006", "74.099999999999994", "118",
"404", "399", "NA", "NA", "73.7", "265", "75", "222", "112",
"133", "77.900000000000006", "64.7", "84", "92.2", "107",
"64.599999999999994", "56.6", "69.900000000000006", "130",
"91.4", "NA", "111", "79.2", "86.9", "65.7", "94.5", "129",
"NA", "157,0", "32,8", "160,0", "38,1", "61,7", "NA", "86,8",
"86,5", "87,4", "143,0", "57,8", "NA", "88,6", "94,3", "NA",
"70,0", "107,0", "57,2", "75,9", "148,0", "NA", "60,4", "NA",
"NA", "148,0", "75,3", "75,4", "58", "46,4", "82,3", "189,0",
"132,0", "96,5", "112,0", "67,0", "65"), iot = c("0", "1",
"1", "0", "1", "0", "0", "0", "1", "1", "0", "0", "0", "1",
"0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "1", "1",
"1", "0", "1", "0", "1", "0", "1", "0", "1", "1", "1", "1",
"0", "0", "1", "0", "0", "0", "0", "0", "1", "0", "0", "0",
"0", "0", "0", "1", "0", "1", "0", "1", "0", "1", "0", "1",
"0", "0", "0", "0", "1", "0", "1", "0", "1", "0", "0"), fio2 = c("0.23",
"1", "0.6", "0.23", "1", "0.23", "0.23", "0.23", "0.4", "1",
"0.23", "0.23", "0.23", "1", "0.23", "0.23", "0.23", "0.23",
"0.5", "0.5", "0.23", "0.23", "0.23", "0.23", "0.75", "NA",
"NA", "0.23", "0.1", "0.23", "NA", "0.23", "NA", "0.23",
"1", "0.5", "0.4", "0.45", "0.23", "0.23", "0.7", "0.23",
"0.23", "0.23", "0.23", "0.23", "1", "0.23", "0.23", "0.23",
"0.23", "0.23", "0.23", "1", "0.23", "1", "0.23", "0.8",
"0.23", "0.4", "0.23", "0.4", "0.23", "0.23", "0.23", "0.23",
"0.3", "0.23", "0.8", "0.23", "0.3", "0.23", "0.23"), resp_syst = c("526.08695652173913",
"NA", "131.16666666666669", "300.43478260869563", "131",
"114.34782608695652", "299.56521739130437", "322.17391304347819",
"295", "404", "1734.782608695652", "NA", "NA", "73.7", "1152.1739130434783",
"326.08695652173913", "965.21739130434776", "486.95652173913044",
"266", "155.80000000000001", "281.30434782608694", "365.21739130434781",
"400.86956521739131", "465.21739130434781", "86.133333333333326",
"NA", "NA", "565.21739130434776", "914", "NA", "NA", "344.3478260869565",
"NA", "285.6521739130435", "94.5", "258", "NA", "348.88888888888886",
"142.60869565217391", "695.6521739130435", "54.428571428571431",
"268.26086956521738", "NA", "377.39130434782606", "376.08695652173913",
"380", "143", "251.30434782608694", "NA", "385.21739130434776",
"409.99999999999994", "NA", "304.3478260869565", "107", "248.69565217391303",
"75.900000000000006", "643.47826086956513", "NA", "262.60869565217388",
"NA", "NA", "370", "327.39130434782606", "327.82608695652175",
"252.17391304347825", "201.7391304347826", "274.33333333333331",
"821.73913043478262", "165", "419.56521739130432", "373.33333333333337",
"291.30434782608694", "282.60869565217388"), resp_score = c(0,
NA, 3, 1, 3, 2, 2, 1, 2, 0, 0, NA, NA, 4, 0, 1, 0, 0, 2,
3, 2, 1, 0, 0, 4, NA, NA, 0, 0, NA, NA, 1, NA, 2, 4, 2, NA,
1, 2, 0, 4, 2, NA, 1, 1, 1, 3, 2, NA, 1, 0, NA, 1, 3, 2,
4, 0, NA, 2, NA, NA, 1, 1, 1, 2, 2, 2, 0, 2, 0, 1, 2, 2),
platelets = c("NA", "363000", "NA", "NA", "620000", "NA",
"419000", "277000", "NA", "NA", "277000", "255000", "NA",
"185000", "318000", "296000", "182000", "48000", "129000",
"260000", "254000", "213000", "132000", "293000", "NA", "99000",
"297000", "227000", "174000", "172000", "76000", "NA", "242000",
"181000", "90000", "NA", "NA", "264000", "331000", "237000",
"279000", "NA", "214000", "NA", "NA", "283000", "416000",
"349000", "NA", "243000", "635000", "100000", "165000", "343000",
"NA", "423000", "206000", "NA", "439000", "NA", "358000",
"253000", "130000", "NA", "498000", "415000", "236000", "440000",
"218000", "300000", "190000", "244000", "275000"), coag_score = c(NA,
0, NA, NA, 0, NA, 0, 0, NA, NA, 0, 0, NA, 0, 0, 0, 0, 3,
1, 0, 0, 0, 1, 0, NA, 2, 0, 0, 0, 0, 2, NA, 0, 0, 2, NA,
NA, 0, 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, NA, 0, 0, 2, 0, 0,
NA, 0, 0, NA, 0, NA, 0, 0, 1, NA, 0, 0, 0, 0, 0, 0, 0, 0,
0), bilirrubin = c("NA", "0.4", "NA", "0.5", "0.3", "NA",
"0.9", "NA", "0.4", "NA", "0.5", "NA", "NA", "1.1000000000000001",
"NA", "0.3", "NA", "0.5", "NA", "0.3", "0.3", "NA", "NA",
"0.4", "NA", "0.5", "0.4", "0.4", "1", "0.5", "NA", "NA",
"NA", "NA", "NA", "0.3", "0.3", "1", "0.7", "0.3", "1.4",
"NA", "NA", "0.3", "0.3", "0.3", "0.7", "0.4", "NA", "NA",
"NA", "0.7", "0.6", "0.5", "NA", "NA", "0.3", "0.3", "NA",
"NA", "NA", "0.4", "0.3", "NA", "NA", "0.4", "1.1000000000000001",
"NA", "0.5", "1.4", "0.4", "0.6", "0.7"), liver_score = c(NA,
0, NA, 0, 0, NA, 0, NA, 0, NA, 0, NA, NA, 0, NA, 0, NA, 0,
NA, 0, 0, NA, NA, 0, NA, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA,
0, 0, 0, 0, 0, 1, NA, NA, 0, 0, 0, 0, 0, NA, NA, NA, 0, 0,
0, NA, NA, 0, 0, NA, NA, NA, 0, 0, NA, NA, 0, 0, NA, 0, 0,
0, 0, 0), pam = c("NA", "80.666666666666671", "73.333333333333329",
"83.333333333333329", "84.666666666666671", "100.6666666666667",
"101", "100", "95", "69.333333333333329", "93.666666666666671",
"70", "91.666666666666671", "70.666666666666671", "81.666666666666671",
"71.333333333333329", "95.333333333333329", "92", "77.666666666666671",
"84.333333333333329", "90", "80", "96.333333333333329", "93.333333333333329",
"69.333333333333329", "57.333333333333343", "80", "63", "81.666666666666671",
"80", "76.666666666666671", "90", "62.666666666666657", "89.333333333333329",
"87.333333333333329", "72.333333333333329", "93.333333333333329",
"87.666666666666671", "105.3333333333333", "107.6666666666667",
"69.333333333333329", "111.3333333333333", "75.666666666666671",
"103", "85.333333333333329", "86.666666666666671", "64.333333333333329",
"93.333333333333329", "87.333333333333329", "100", "106.6666666666667",
"96.666666666666671", "102", "98.666666666666671", "83.333333333333329",
"73.666666666666671", "95.333333333333329", "81", "93.333333333333329",
"62.666666666666657", "83.333333333333329", "93", "92.333333333333329",
"73.333333333333329", "87.333333333333329", "74.333333333333329",
"95.333333333333329", "80.333333333333329", "83.333333333333329",
"96.333333333333329", "97.666666666666671", "91.666666666666671",
"85.333333333333329"), catecolamine = c("10 ml/h", "0,1 mcg/kg/min",
"0", "0", "0,1mcg/kg/min", "0", "0", "0", "0", "0,8 mcg/kg/min",
"0,1 mcg/kg/min", "0", "0", "0,3mcg/kg/min", "0,15 mcg/kg/min",
"0,13mcg/kg/min", "6ml/h", "0", "0", "0,03mcg/kg/min", "0",
"0", "0", "0", "9mL/h", "0", "0", "0.45", "5.5555555555555558E-3",
"0", "NA", "0", "0.2857142857142857", "0", "0.32258064516129031",
"0", "0.16", "0,05mcg/kg/min", "0", "0", "0", "0", "0", "0",
"0", "0", "0,1mcg/Kg/min", "0", "0", "0", "0", "0", "0",
"0", "0", "10ml/hr", "0", "0", "0", "30ml/h", "0", "0", "0",
"0", "0", "0", "0", "10 ml/h", "0", "0", "0", "0", "0"),
cardiovas_score = c(NA, 3, 0, 0, 3, 0, 0, 0, 0, 4, 3, 0,
0, 4, 4, 4, 3, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 4, 4, 0, NA,
0, 4, 0, 4, 0, 4, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0,
0, 0, 0, 0, 0, NA, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, NA,
0, 0, 0, 0, 0), creatinine = c("0.62", "2.21", "NA", "1.25",
"1.84", "0.94", "0.77", "0.41", "1.58", "NA", "2.4500000000000002",
"0.21", "NA", "1.65", "2.77", "2.1800000000000002", "1.1299999999999999",
"0.49", "0.53", "1.34", "1.23", "0.8", "0.65", "2.9", "0.45",
"1.05", "0.72", "5.14", "0.65", "0.47", "1.46", "NA", "2.77",
"2.21", "0.97", "0.35", "NA", "1.83", "0.78", "0.45", "1.23",
"NA", "0.59", "1.21", "0.61", "0.75", "1.23", "0.74", "NA",
"0.88", "0.65", "3.48", "0.7", "0.88", "NA", "1.1499999999999999",
"0.59", "0.6", "0.41", "NA", "0.66", "0.72", "0.69", "NA",
"NA", "1.29", "0.8", "0.94", "0.48", "1.1399999999999999",
"0.85", "0.74", "0.86"), renal_score = c(0, 2, NA, 1, 1,
0, 0, 0, 0, NA, 2, 0, NA, 1, 2, 2, 1, 0, 0, 1, 1, 0, 0, 2,
0, 0, 0, 4, 0, 0, 1, NA, 2, 2, 0, 0, NA, 1, 0, 0, 1, NA,
0, 1, 0, 0, 1, 0, NA, 0, 0, 3, 0, 0, NA, 0, 0, 0, 0, NA,
0, 0, 0, NA, NA, 1, 0, 0, 0, 0, 0, 0, 0), `SOFA SCORE` = c(0,
5, 3, 2, 7, 2, 2, 1, 2, 4, 5, 0, 0, 9, 6, 6, 4, 3, 3, 7,
3, 1, 0, 2, 7, 2, 0, 8, 4, 0, 3, 1, 6, 4, 10, 2, 4, 5, 2,
0, 7, 2, 0, 2, 1, 1, 7, 2, 0, 1, 0, 5, 1, 3, 2, 4, 0, 0,
2, 3, 0, 1, 1, 1, 2, 3, 2, 3, 2, 0, 1, 2, 2)), row.names = c(NA,
-73L), class = c("tbl_df", "tbl", "data.frame"))
I have tried
test <- glm(sofa_t0_deadoralive$outcome ~ resp_score + coag_score + liver_score + cardiovas_score + renal_score, family = binomial)
Which outputs
Warning message:
glm.fit: fitted probabilities numerically 0 or 1 occurred
And when I use
glm(formula = sofa_t0_deadoralive$outcome ~ resp_score + coag_score +
liver_score + cardiovas_score + renal_score, family = binomial)
I get
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9432 -0.5247 0.0000 0.5729 1.5111
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.543e+00 1.429e+00 1.779 0.0752 .
resp_score1 -5.534e-02 1.381e+00 -0.040 0.9680
resp_score2 -8.189e-01 1.415e+00 -0.579 0.5627
resp_score3 -1.803e+00 1.664e+00 -1.084 0.2784
resp_score4 -1.639e+01 4.315e+03 -0.004 0.9970
resp_scoreNA -1.282e+00 1.460e+00 -0.878 0.3799
coag_score1 -1.889e+00 1.538e+00 -1.228 0.2193
coag_score2 -1.917e+01 4.706e+03 -0.004 0.9967
coag_score3 -2.211e+01 1.075e+04 -0.002 0.9984
coag_scoreNA -1.777e+00 9.772e-01 -1.819 0.0690 .
liver_score1 3.463e+01 1.159e+04 0.003 0.9976
liver_scoreNA -1.833e+00 9.257e-01 -1.980 0.0478 *
cardiovas_score1 NA NA NA NA
cardiovas_score3 -2.795e-01 1.722e+00 -0.162 0.8710
cardiovas_score4 -1.997e+01 2.782e+03 -0.007 0.9943
cardiovas_scoreNA -1.747e+01 6.423e+03 -0.003 0.9978
.
.
.
Why is R treating each value of resp_score, liver_score and cardiovas_score in this example as a separate categorical level instead of treating the variable as continuous? What can I do to fix it?
Any direction as to where to focus is greatly appreciated.
I tried your dataset as follows:
myData <- structure(list(ID = c(13154920, 13201107, 13207948, 13234892,
13082943, 13193903, 13283776, 13154288, 13269178, 13055690, 13207670,
13220627, 13055009, 13044947, 13060589, 13201616, 13054278, 13160156,
13160971, 13239318, 13321288, 13154966, 13165362, 12999835, 13223721,
13064865, 13104602, 13036280, 13040507, 12964437, 13029805, 13029001,
12993036, 13072516, 13060586, 13119819, 13040632, 13055908, 13059026,
13207119, 13261022, 13259391, 13262499, 13207315, 13135316, 13233898,
13181075, 13261607, 13186960, 13240091, 13260671, 13302375, 13021555,
13062360, 13035346, 13077712, 13128769, 13267480, 13040172, 12977871,
13090190, 13040530, 13100979, 13192142, 13289317, 13315577, 13044653,
13079694, 13128639, 13207352, 13049409, 13210994, 13283675),
outcome = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1),
day = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0),
pa02 = c("121", "NA", "78.7", "69.099999999999994", "131",
"26.3", "68.900000000000006", "74.099999999999994", "118",
"404", "399", "NA", "NA", "73.7", "265", "75", "222", "112",
"133", "77.900000000000006", "64.7", "84", "92.2", "107",
"64.599999999999994", "56.6", "69.900000000000006", "130",
"91.4", "NA", "111", "79.2", "86.9", "65.7", "94.5", "129",
"NA", "157,0", "32,8", "160,0", "38,1", "61,7", "NA", "86,8",
"86,5", "87,4", "143,0", "57,8", "NA", "88,6", "94,3", "NA",
"70,0", "107,0", "57,2", "75,9", "148,0", "NA", "60,4", "NA",
"NA", "148,0", "75,3", "75,4", "58", "46,4", "82,3", "189,0",
"132,0", "96,5", "112,0", "67,0", "65"),
iot = c("0", "1", "1", "0", "1", "0", "0", "0", "1", "1", "0", "0", "0", "1",
"0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "1", "1",
"1", "0", "1", "0", "1", "0", "1", "0", "1", "1", "1", "1",
"0", "0", "1", "0", "0", "0", "0", "0", "1", "0", "0", "0",
"0", "0", "0", "1", "0", "1", "0", "1", "0", "1", "0", "1",
"0", "0", "0", "0", "1", "0", "1", "0", "1", "0", "0"),
fio2 = c("0.23", "1", "0.6", "0.23", "1", "0.23", "0.23", "0.23", "0.4", "1",
"0.23", "0.23", "0.23", "1", "0.23", "0.23", "0.23", "0.23",
"0.5", "0.5", "0.23", "0.23", "0.23", "0.23", "0.75", "NA",
"NA", "0.23", "0.1", "0.23", "NA", "0.23", "NA", "0.23",
"1", "0.5", "0.4", "0.45", "0.23", "0.23", "0.7", "0.23",
"0.23", "0.23", "0.23", "0.23", "1", "0.23", "0.23", "0.23",
"0.23", "0.23", "0.23", "1", "0.23", "1", "0.23", "0.8",
"0.23", "0.4", "0.23", "0.4", "0.23", "0.23", "0.23", "0.23",
"0.3", "0.23", "0.8", "0.23", "0.3", "0.23", "0.23"),
resp_syst = c("526.08695652173913", "NA", "131.16666666666669", "300.43478260869563", "131",
"114.34782608695652", "299.56521739130437", "322.17391304347819",
"295", "404", "1734.782608695652", "NA", "NA", "73.7", "1152.1739130434783",
"326.08695652173913", "965.21739130434776", "486.95652173913044",
"266", "155.80000000000001", "281.30434782608694", "365.21739130434781",
"400.86956521739131", "465.21739130434781", "86.133333333333326",
"NA", "NA", "565.21739130434776", "914", "NA", "NA", "344.3478260869565",
"NA", "285.6521739130435", "94.5", "258", "NA", "348.88888888888886",
"142.60869565217391", "695.6521739130435", "54.428571428571431",
"268.26086956521738", "NA", "377.39130434782606", "376.08695652173913",
"380", "143", "251.30434782608694", "NA", "385.21739130434776",
"409.99999999999994", "NA", "304.3478260869565", "107", "248.69565217391303",
"75.900000000000006", "643.47826086956513", "NA", "262.60869565217388",
"NA", "NA", "370", "327.39130434782606", "327.82608695652175",
"252.17391304347825", "201.7391304347826", "274.33333333333331",
"821.73913043478262", "165", "419.56521739130432", "373.33333333333337",
"291.30434782608694", "282.60869565217388"),
resp_score = c(0, NA, 3, 1, 3, 2, 2, 1, 2, 0, 0, NA, NA, 4, 0, 1, 0, 0, 2,
3, 2, 1, 0, 0, 4, NA, NA, 0, 0, NA, NA, 1, NA, 2, 4, 2, NA,
1, 2, 0, 4, 2, NA, 1, 1, 1, 3, 2, NA, 1, 0, NA, 1, 3, 2,
4, 0, NA, 2, NA, NA, 1, 1, 1, 2, 2, 2, 0, 2, 0, 1, 2, 2),
platelets = c("NA", "363000", "NA", "NA", "620000", "NA",
"419000", "277000", "NA", "NA", "277000", "255000", "NA",
"185000", "318000", "296000", "182000", "48000", "129000",
"260000", "254000", "213000", "132000", "293000", "NA", "99000",
"297000", "227000", "174000", "172000", "76000", "NA", "242000",
"181000", "90000", "NA", "NA", "264000", "331000", "237000",
"279000", "NA", "214000", "NA", "NA", "283000", "416000",
"349000", "NA", "243000", "635000", "100000", "165000", "343000",
"NA", "423000", "206000", "NA", "439000", "NA", "358000",
"253000", "130000", "NA", "498000", "415000", "236000", "440000",
"218000", "300000", "190000", "244000", "275000"),
coag_score = c(NA, 0, NA, NA, 0, NA, 0, 0, NA, NA, 0, 0, NA, 0, 0, 0, 0, 3,
1, 0, 0, 0, 1, 0, NA, 2, 0, 0, 0, 0, 2, NA, 0, 0, 2, NA,
NA, 0, 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, NA, 0, 0, 2, 0, 0,
NA, 0, 0, NA, 0, NA, 0, 0, 1, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0),
bilirrubin = c("NA", "0.4", "NA", "0.5", "0.3", "NA",
"0.9", "NA", "0.4", "NA", "0.5", "NA", "NA", "1.1000000000000001",
"NA", "0.3", "NA", "0.5", "NA", "0.3", "0.3", "NA", "NA",
"0.4", "NA", "0.5", "0.4", "0.4", "1", "0.5", "NA", "NA",
"NA", "NA", "NA", "0.3", "0.3", "1", "0.7", "0.3", "1.4",
"NA", "NA", "0.3", "0.3", "0.3", "0.7", "0.4", "NA", "NA",
"NA", "0.7", "0.6", "0.5", "NA", "NA", "0.3", "0.3", "NA",
"NA", "NA", "0.4", "0.3", "NA", "NA", "0.4", "1.1000000000000001",
"NA", "0.5", "1.4", "0.4", "0.6", "0.7"),
liver_score = c(NA, 0, NA, 0, 0, NA, 0, NA, 0, NA, 0, NA, NA, 0, NA, 0, NA, 0,
NA, 0, 0, NA, NA, 0, NA, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA,
0, 0, 0, 0, 0, 1, NA, NA, 0, 0, 0, 0, 0, NA, NA, NA, 0, 0,
0, NA, NA, 0, 0, NA, NA, NA, 0, 0, NA, NA, 0, 0, NA, 0, 0,
0, 0, 0),
pam = c("NA", "80.666666666666671", "73.333333333333329", "83.333333333333329",
"84.666666666666671", "100.6666666666667",
"101", "100", "95", "69.333333333333329", "93.666666666666671",
"70", "91.666666666666671", "70.666666666666671", "81.666666666666671",
"71.333333333333329", "95.333333333333329", "92", "77.666666666666671",
"84.333333333333329", "90", "80", "96.333333333333329", "93.333333333333329",
"69.333333333333329", "57.333333333333343", "80", "63", "81.666666666666671",
"80", "76.666666666666671", "90", "62.666666666666657", "89.333333333333329",
"87.333333333333329", "72.333333333333329", "93.333333333333329",
"87.666666666666671", "105.3333333333333", "107.6666666666667",
"69.333333333333329", "111.3333333333333", "75.666666666666671",
"103", "85.333333333333329", "86.666666666666671", "64.333333333333329",
"93.333333333333329", "87.333333333333329", "100", "106.6666666666667",
"96.666666666666671", "102", "98.666666666666671", "83.333333333333329",
"73.666666666666671", "95.333333333333329", "81", "93.333333333333329",
"62.666666666666657", "83.333333333333329", "93", "92.333333333333329",
"73.333333333333329", "87.333333333333329", "74.333333333333329",
"95.333333333333329", "80.333333333333329", "83.333333333333329",
"96.333333333333329", "97.666666666666671", "91.666666666666671",
"85.333333333333329"),
catecolamine = c("10 ml/h", "0,1 mcg/kg/min", "0", "0",
"0,1mcg/kg/min", "0", "0", "0", "0", "0,8 mcg/kg/min",
"0,1 mcg/kg/min", "0", "0", "0,3mcg/kg/min", "0,15 mcg/kg/min",
"0,13mcg/kg/min", "6ml/h", "0", "0", "0,03mcg/kg/min", "0",
"0", "0", "0", "9mL/h", "0", "0", "0.45", "5.5555555555555558E-3",
"0", "NA", "0", "0.2857142857142857", "0", "0.32258064516129031",
"0", "0.16", "0,05mcg/kg/min", "0", "0", "0", "0", "0", "0",
"0", "0", "0,1mcg/Kg/min", "0", "0", "0", "0", "0", "0",
"0", "0", "10ml/hr", "0", "0", "0", "30ml/h", "0", "0", "0",
"0", "0", "0", "0", "10 ml/h", "0", "0", "0", "0", "0"),
cardiovas_score = c(NA, 3, 0, 0, 3, 0, 0, 0, 0, 4, 3, 0,
0, 4, 4, 4, 3, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 4, 4, 0, NA,
0, 4, 0, 4, 0, 4, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0,
0, 0, 0, 0, 0, NA, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, NA,
0, 0, 0, 0, 0),
creatinine = c("0.62", "2.21", "NA", "1.25", "1.84", "0.94",
"0.77", "0.41", "1.58", "NA", "2.4500000000000002",
"0.21", "NA", "1.65", "2.77", "2.1800000000000002",
"1.1299999999999999", "0.49", "0.53", "1.34",
"1.23", "0.8", "0.65", "2.9", "0.45", "1.05", "0.72",
"5.14", "0.65", "0.47", "1.46", "NA", "2.77", "2.21",
"0.97", "0.35", "NA", "1.83", "0.78", "0.45", "1.23",
"NA", "0.59", "1.21", "0.61", "0.75", "1.23", "0.74",
"NA", "0.88", "0.65", "3.48", "0.7", "0.88", "NA",
"1.1499999999999999", "0.59", "0.6", "0.41", "NA",
"0.66", "0.72", "0.69", "NA", "NA", "1.29", "0.8",
"0.94", "0.48", "1.1399999999999999", "0.85", "0.74",
"0.86"),
renal_score = c(0, 2, NA, 1, 1, 0, 0, 0, 0, NA, 2, 0, NA, 1, 2, 2,
1, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 4, 0, 0, 1, NA, 2,
2, 0, 0, NA, 1, 0, 0, 1, NA, 0, 1, 0, 0, 1, 0, NA,
0, 0, 3, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA,
1, 0, 0, 0, 0, 0, 0, 0),
`SOFA SCORE` = c(0, 5, 3, 2, 7, 2, 2, 1, 2, 4, 5, 0, 0, 9, 6, 6, 4,
3, 3, 7, 3, 1, 0, 2, 7, 2, 0, 8, 4, 0, 3, 1, 6, 4,
10, 2, 4, 5, 2, 0, 7, 2, 0, 2, 1, 1, 7, 2, 0, 1, 0,
5, 1, 3, 2, 4, 0, 0, 2, 3, 0, 1, 1, 1, 2, 3, 2, 3,
2, 0, 1, 2, 2)
),
row.names = c(NA, -73L),
class = c("tbl_df", "tbl", "data.frame")
)
test <- glm(outcome ~ resp_score + coag_score + liver_score +
cardiovas_score + renal_score, data = myData)
summary(test)
The only thing that I removed from your command is family = binomial, and I got the following results:
Call:
glm(formula = outcome ~ resp_score + coag_score + liver_score +
cardiovas_score + renal_score, data = myData)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.88541 -0.16677 0.09191 0.11459 0.68038
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.93077 0.13547 6.871 4.17e-07 ***
resp_score -0.02268 0.06703 -0.338 0.7381
coag_score -0.27004 0.12882 -2.096 0.0468 *
liver_score 0.47799 0.42628 1.121 0.2733
cardiovas_score -0.11254 0.05819 -1.934 0.0650 .
renal_score -0.20551 0.10250 -2.005 0.0564 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.1415673)
Null deviance: 6.9667 on 29 degrees of freedom
Residual deviance: 3.3976 on 24 degrees of freedom
(43 observations deleted due to missingness)
AIC: 33.793
Number of Fisher Scoring iterations: 2
So I didn't get any factor variables. I think your dataset is somehow storing these variables as factors. I don't know if it's going to solve your problem, but I just wanted to share my experience with this very dataset. You can try it my way if you like.
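If str(sofa_t0_deadoralive) indeed shows the score columns stored as factor or character, a sketch of one possible fix (assuming those column names, and untested on your full object) is to coerce them back to numeric before fitting the binomial model:
library(dplyr)
sofa_num <- sofa_t0_deadoralive %>%
  mutate(across(c(resp_score, coag_score, liver_score, cardiovas_score, renal_score),
                ~ as.numeric(as.character(.x))))   # as.character() first, so factor level codes are not used
test <- glm(outcome ~ resp_score + coag_score + liver_score +
              cardiovas_score + renal_score,
            data = sofa_num, family = binomial)
summary(test)
With numeric predictors you get a single slope per score. Note that the original glm.fit warning about fitted probabilities of 0 or 1 can still appear if the remaining complete cases are (quasi-)separable.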
I'm having a hard time figuring out an efficient solution to the following problem. The question is very verbose because I'm not sure whether I'm making this problem harder than it needs to be.
Given a named vector
t <- c(2, 0, 0, 30, 0, 0, 10, 2000, 0, 20, 0, 40, 60, 10)
names(t) <- c(1, 0, 0, 2, 0, 0, 3, 4, 0, 5, 0, 6, 7, 8)
I want to split t into a list of 4 elements that is balanced based on the sum of the resulting list elements, while keeping the order of elements and only splitting on non-zero elements. Expected outcome:
L[1] <- c(2, 0, 0, 30, 0, 0, 10) # sum = 42
L[2] <- c(2000, 0) # sum = 2000
L[3] <- c(20, 0, 40) # sum = 60
L[4] <- c(60, 10) # sum = 70
The error function I use is minimizing sd(rowSums(L)) or sd(sapply(L, sum)).
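For the expected outcome above, that error evaluates to
sd(c(42, 2000, 60, 70))
# [1] 971.4024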
Trying to split the vector using something like the following doesn't quite work
split(t, cut(cumsum(t), 4))
# $`(-0.17,544]`
# 1 0 0 2 0 0 3
# 2 0 0 30 0 0 10
# $`(544,1.09e+03]`
# named numeric(0)
# $`(1.09e+03,1.63e+03]`
# named numeric(0)
# $`(1.63e+03,2.17e+03]`
# 4 0 5 0 6 7 8
# 2000 0 20 0 40 60 10
I wrote a function to split the list the way that I wanted (see error function above)
break_at <- function(val, nchunks) {
nchunks <- nchunks - 1
nonzero <- val[val != 0]
all_groupings <- as.matrix(gtools::permutations(n = 2, r = length(nonzero), v = c(1, 0), repeats.allowed = TRUE))
all_groupings <- all_groupings[rowSums(all_groupings) == nchunks, ]
which_grouping <- which.min(
sapply(
1:nrow(all_groupings),
function(i) {
sd(
sapply(
split(
nonzero,
cumsum(all_groupings[i,])
),
sum
)
)
}
)
)
mark_breaks <- rep(0, length(val))
mark_breaks[names(val) %in% which(all_groupings[which_grouping,]==1)] <- 1
return(mark_breaks)
}
You can see the result is much better
break_at(t, 4)
# 0 0 0 0 0 0 0 1 0 1 0 0 1 0
split(t, cumsum(break_at(t, 4)))
# $`0`
# 1 0 0 2 0 0 3
# 2 0 0 30 0 0 10
# $`1`
# 4 0
# 2000 0
# $`2`
# 5 0 6
# 20 0 40
# $`3`
# 7 8
# 60 10
It works by using gtools::permutations(n = 2, r = length(nonzero), v = c(1, 0), repeats.allowed = TRUE) to look at all potential splits. See how the above works for r = 3
# [,1] [,2] [,3]
# [1,] 0 0 0
# [2,] 0 0 1
# [3,] 0 1 0
# [4,] 0 1 1
# [5,] 1 0 0
# [6,] 1 0 1
# [7,] 1 1 0
# [8,] 1 1 1
which I then filter with all_groupings[rowSums(all_groupings) == nchunks, ]. This only looks at potential splits that produce exactly nchunks chunks.
My issue is that this works horribly with my real data because of the number of permutations involved.
hard <- structure(c(2, 0, 1, 2, 0, 1, 1, 1, 5, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 0, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1,
1, 1, 2, 0, 2, 0, 1, 4, 0, 0, 0, 1, 3, 0, 0, 4, 0, 0, 0, 2, 0,
1, 1, 1, 3, 0, 0, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 1, 2, 0, 1, 6,
0, 0, 0, 0, 0, 1, 1, 1, 3, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0,
1, 1, 2, 0, 1, 2, 0, 1, 1, 4, 0, 0, 0, 1, 1, 3, 0, 0, 1, 2, 0,
1, 1, 2, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 1, 1, 2, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 2, 0, 3,
0, 0, 1, 1, 2, 0, 1, 2, 0, 1, 1, 1, 2, 0, 2, 0, 1, 3, 0, 0, 1,
1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 1, 2, 0, 1, 1, 1, 1, 1, 1, 2,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
0, 1, 1, 1, 1, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 8, 0, 0, 0, 0, 0, 0, 0,
1, 2, 0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1,
3, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1,
1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 3, 0,
0, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1,
1, 1, 1, 2, 0, 1, 1, 1, 1, 5, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 0, 1, 1, 1, 1, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 0, 1, 1, 2, 0, 1, 2, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 2,
0, 1, 9, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 4, 0, 0, 0, 1, 1, 1,
1, 6, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 1, 3, 0, 0, 1, 1, 1, 3,
0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1), .Names = c("1", "0",
"2", "3", "0", "4", "5", "6", "7", "0", "0", "0", "0", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "23", "24", "0", "0", "25", "26", "27", "28", "29",
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "0",
"40", "41", "42", "43", "0", "44", "45", "46", "47", "48", "49",
"50", "51", "52", "0", "53", "0", "54", "55", "0", "0", "0",
"56", "57", "0", "0", "58", "0", "0", "0", "59", "0", "60", "61",
"62", "63", "0", "0", "64", "65", "66", "67", "68", "0", "69",
"70", "0", "71", "72", "73", "0", "74", "75", "0", "0", "0",
"0", "0", "76", "77", "78", "79", "0", "0", "80", "81", "82",
"83", "84", "85", "86", "87", "88", "0", "89", "90", "91", "0",
"92", "93", "0", "94", "95", "96", "0", "0", "0", "97", "98",
"99", "0", "0", "100", "101", "0", "102", "103", "104", "0",
"105", "106", "0", "0", "107", "108", "0", "0", "109", "110",
"111", "112", "0", "113", "114", "115", "116", "117", "118",
"119", "120", "121", "122", "123", "124", "125", "126", "127",
"128", "129", "130", "131", "0", "132", "133", "134", "0", "135",
"0", "0", "136", "137", "138", "0", "139", "140", "0", "141",
"142", "143", "144", "0", "145", "0", "146", "147", "0", "0",
"148", "149", "150", "151", "152", "153", "0", "154", "155",
"156", "157", "0", "158", "159", "0", "160", "161", "162", "163",
"164", "165", "166", "0", "167", "168", "169", "170", "171",
"172", "173", "174", "175", "176", "177", "178", "179", "180",
"181", "182", "183", "184", "185", "186", "0", "187", "188",
"189", "190", "191", "192", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "193", "194", "195", "196", "197", "0", "198",
"199", "200", "201", "0", "202", "203", "204", "205", "0", "206",
"0", "0", "0", "0", "0", "0", "0", "207", "208", "0", "209",
"210", "211", "212", "213", "214", "215", "0", "216", "217",
"218", "219", "220", "221", "0", "222", "223", "224", "225",
"0", "0", "226", "227", "228", "229", "230", "231", "232", "233",
"234", "235", "236", "237", "238", "239", "240", "0", "241",
"242", "243", "244", "245", "246", "247", "248", "0", "249",
"250", "251", "252", "253", "254", "0", "255", "256", "257",
"258", "259", "260", "0", "0", "261", "262", "263", "264", "0",
"265", "266", "267", "268", "269", "270", "271", "272", "273",
"274", "0", "275", "276", "277", "278", "279", "280", "281",
"282", "0", "283", "284", "285", "286", "287", "0", "0", "0",
"0", "288", "0", "0", "0", "0", "0", "289", "290", "291", "292",
"293", "294", "295", "296", "297", "298", "299", "300", "301",
"302", "303", "304", "305", "306", "307", "308", "309", "310",
"311", "312", "313", "314", "315", "316", "317", "318", "319",
"320", "321", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "322", "323", "324", "325", "326", "327", "328", "329",
"330", "331", "332", "333", "334", "335", "336", "337", "338",
"339", "340", "341", "0", "342", "343", "344", "345", "346",
"0", "347", "0", "348", "349", "350", "351", "352", "353", "354",
"355", "356", "357", "358", "359", "360", "0", "361", "362",
"363", "0", "364", "365", "0", "366", "367", "0", "0", "0", "0",
"0", "0", "0", "368", "0", "369", "370", "0", "0", "0", "0",
"0", "0", "0", "0", "371", "0", "0", "372", "0", "0", "0", "373",
"374", "375", "376", "377", "0", "0", "0", "0", "0", "378", "0",
"0", "0", "0", "0", "379", "380", "0", "0", "381", "382", "383",
"384", "0", "0", "385", "0", "0", "0", "0", "0", "0", "386",
"387", "388", "0", "389", "390", "391", "392", "393", "394",
"395", "396", "397", "398", "399", "400", "401", "402", "0",
"403", "404", "405", "406", "407", "408", "409"))
I don't know whether there is an analytical solution, but if you treat it as an integer programming problem you can use the "SANN" heuristic implemented in optim. For example, consider some (sub-optimal) random split points to cut the vector t:
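(For reference, a toy vector t consistent with the output printed below; this is a reconstruction from the question's example and not part of the original answer.)
t <- c(2, 0, 0, 30, 0, 0, 10, 2000, 0, 20, 0, 40, 60, 10)
names(t) <- c("1", "0", "0", "2", "0", "0", "3", "4", "0", "5", "0", "6", "7", "8")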
> startpar <- sort(sample(length(t)-1, 3))
> startpar
[1] 5 6 9
> # result in a sub-optimal split
> split(t, cut(1:length(t), c(0, startpar, length(t)), labels = 1:4))
$`1`
1 0 0 2 0
2 0 0 30 0
$`2`
0
0
$`3`
3 4 0
10 2000 0
$`4`
5 0 6 7 8
20 0 40 60 10
The error function could be written as
> # from manual: A function to be minimized (or maximized)
> fn <- function(par, vec){
+ ind_vec <- cut(1:length(vec), c(0, par, length(vec)), labels = 1:4)
+ sd(unlist(lapply(split(vec, ind_vec), sum)))
+ }
> # evaluated at the starting parameters
> fn(startpar, t)
[1] 979.5625
The "SANN" method (simulated annealing) needs a function to generate a new candidate solution. There are more sophisticated ways to choose both this function and the starting values, but the choices below still lead to a near-optimal solution (and perhaps in acceptable time).
> # from manual: For the "SANN" method it specifies a function to generate a new candidate point
> gr <- function(par, vec){
+ ind <- sample(length(par), 1)
+ par[ind] <- par[ind] + sample(-1:1, 1)
+ par[ind] <- max(c(par[ind], ifelse(ind == 1, 1, par[ind - 1] + 1)))
+ par[ind] <- min(c(par[ind], ifelse(ind == 3, length(vec) - 1, par[ind + 1] - 1)))
+ par
+ }
Applied to the toy data
> optimpar <- optim(startpar, fn, gr, method = "SANN", vec = t)$par
> split(t, cut(1:length(t), c(0, optimpar, length(t)), labels = 1:4))
$`1`
1 0 0 2
2 0 0 30
$`2`
0 0 3
0 0 10
$`3`
4
2000
$`4`
0 5 0 6 7 8
0 20 0 40 60 10
> fn(optimpar, t)
[1] 972.7329
>
Applied to the real data
> # use for "hard"
> startpar <- sort(sample(length(hard)-1, 3))
> optimpar <- optim(startpar, fn, gr, method = "SANN", vec = hard)
> optimpar
$par
[1] 146 293 426
$value
[1] 4.573474
...[output shortened]
[Edit] My initial results were sub-optimal, so here is an improvement. I'm sure you have already found a sufficient alternative yourself, but for the sake of completeness: for the present toy and real data, a better choice for gr (called gr2 below for later reference) uses a wider sampling range when generating the new candidate (e.g. one that depends on the length of the data), which makes the candidate less dependent on the incumbent (the current solution). For example:
> gr2 <- function(par, vec){
+ ind <- sample(length(par), 1)
+ l <- round(log(length(vec), 2))
+ par[ind] <- par[ind] + sample(-l:l, 1)
+ par[ind] <- max(c(par[ind], ifelse(ind == 1, 1, par[ind - 1] + 1)))
+ par[ind] <- min(c(par[ind], ifelse(ind == 3, length(vec) - 1, par[ind + 1] - 1)))
+ par
+ }
For the real data resulting in
> set.seed(1337)
>
> startpar <- sort(sample(length(hard)-1, 3))
> opt <- optim(startpar, fn, gr2, method = "SANN", vec = hard)
> opt$value
[1] 4.5
> lapply(split(hard, cut(1:length(hard), c(0, opt$par, length(hard)), labels = 1:4)), sum)
$`1`
[1] 140
$`2`
[1] 141
$`3`
[1] 144
$`4`
[1] 150
And for the toy data resulting in
> startpar <- sort(sample(length(t)-1, 3))
> opt <- optim(startpar, fn, gr2, method = "SANN", vec = t)
> opt$value
[1] 971.4024
> split(t, cut(1:length(t), c(0, opt$par, length(t)), labels = 1:4))
$`1`
1 0 0 2 0 0 3
2 0 0 30 0 0 10
$`2`
4
2000
$`3`
0 5 0 6
0 20 0 40
$`4`
7 8
60 10
Regarding optimality on the real data (using gr2), I ran a short simulation of 100 optimization runs from different starting parameters: each of those runs terminated at a value of 4.5.
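A minimal sketch of that check, assuming fn, gr2 and the hard vector defined above (the seed and the number of runs are only illustrative):
# run optim() from 100 random starting points and collect the final objective values
set.seed(1)
vals <- replicate(100, {
  startpar <- sort(sample(length(hard) - 1, 3))
  optim(startpar, fn, gr2, method = "SANN", vec = hard)$value
})
table(vals)  # as reported above, every run ended at a value of 4.5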
By using dynamic programming you can get the true optimum in O(N^2) time. The trick is to see that minimizing the standard deviation of the group sums is the same as minimizing the sum of squares of the group sums: the total and the number of groups are fixed, so the mean of the group sums is a constant. Since the error contributions of the subvectors are independent, we can reduce the search space of possible splits by ignoring extensions of suboptimal splits of subvectors.
If, for instance, (3, 5) is a better split of V[1:7] than (2, 4), then for any fixed continuation (8, ...) the split of V starting with (3, 5, 8, ...) is better than the one starting with (2, 4, 8, ...).
So if, for each 1 < k < length(V), we find the best 2-group split of V[1:k], we can find the best 3-group split of each V[1:k] by only considering extensions of these optimal 2-group splits. In general, we find the best (n+1)-group split by extending the optimal n-group splits.
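A quick numerical check of that equivalence, using illustrative group sums s: the sum of squared deviations from the mean and sum(s^2) differ only by a constant, so minimizing one minimizes the other.
s <- c(140, 141, 144, 150)            # illustrative group sums
(length(s) - 1) * var(s)              # sum of squared deviations from the mean: 60.75
sum(s^2) - length(s) * mean(s)^2      # identical value, since mean(s) is fixed by the problem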
The balanced.split function below takes a vector of values and the number of splits, and returns a list of subvectors. On the hard data this yields a solution with group sums 140, 141, 144, 150 (see the usage sketch after the function).
balanced.split <- function(all.values, n.splits) {
  nonzero.idxs <- which(all.values != 0)
  values <- all.values[nonzero.idxs]
  cumsums <- c(0, cumsum(values))
  error.table <- outer(cumsums, cumsums, FUN = '-')**2
  # error.table[i, j] = error contribution of segment
  # values[i:(j-1)]
  # Iteratively find best i splits
  index.matrix <- array(dim = c(n.splits - 1, ncol(error.table)))
  cur.best.splits <- error.table[1, ]
  for (i in 1:(n.splits - 1)) {
    error.sums <- cur.best.splits + error.table
    index.matrix[i, ] <- apply(error.sums, 2, which.min)
    # index.matrix[i, k] = last split of optimal (i+1)-group
    # split of values[1:k]
    cur.best.splits <- apply(error.sums, 2, min)
    # cur.best.splits[k] = minimal error function
    # of (i+1)-group split of values[1:k]
  }
  # Trace best splits
  cur.idx <- ncol(index.matrix)
  splits <- vector("numeric", n.splits - 1)
  for (i in (n.splits - 1):1) {
    cur.idx <- index.matrix[i, cur.idx]
    splits[i] <- cur.idx
  }
  # Split values vector
  splits <- c(1, nonzero.idxs[splits], length(all.values) + 1)
  chunks <- list()
  for (i in 1:n.splits)
    chunks[[i]] <- all.values[splits[i]:(splits[i + 1] - 1)]
  return(chunks)
}
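A quick usage sketch, assuming the hard vector from the question is loaded:
sapply(balanced.split(hard, 4), sum)
# group sums as quoted above: 140 141 144 150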
Below is more detailed code for the same algorithm
# Matrix containing the error contribution of
# subsegments [i:j]
.makeErrorTable <- function(values) {
  cumsums <- c(0, cumsum(values))
  return(outer(cumsums, cumsums, FUN = '-')**2)
}

# Backtrace the optimal split points from an index matrix
.findPath <- function(index.matrix) {
  nrows <- nrow(index.matrix)
  cur.idx <- ncol(index.matrix)
  path <- vector("numeric", nrows)
  for (i in nrows:1) {
    cur.idx <- index.matrix[i, cur.idx]
    path[i] <- cur.idx
  }
  return(path)
}

.findSplits <- function(error.table, n.splits) {
  n.diffs <- nrow(error.table)
  max.val <- error.table[1, n.diffs]
  # Table used to backtrace the optimal path
  idx.table <- array(dim = c(n.splits - 1, n.diffs))
  cur.best.splits <- error.table[1, ]
  for (i in 1:(n.splits - 1)) {
    error.sums <- cur.best.splits + error.table
    idx.table[i, ] <- apply(error.sums, 2, which.min)
    cur.best.splits <- apply(error.sums, 2, min)
  }
  return(.findPath(idx.table))
}

# Split values at given split points
.splitChunks <- function(values, splits) {
  splits <- c(1, splits, length(values) + 1)
  chunks <- list()
  for (i in 1:(length(splits) - 1))
    chunks[[i]] <- values[splits[i]:(splits[i + 1] - 1)]
  return(chunks)
}

#' Main function that splits all.values into n.splits
#' chunks, minimizing sd(sum(chunk))
balanced.split <- function(all.values, n.splits) {
  nonzero.idxs <- which(all.values != 0)
  values <- all.values[nonzero.idxs]
  error.table <- .makeErrorTable(values)
  splits <- .findSplits(error.table, n.splits)
  full.splits <- nonzero.idxs[splits]
  return(.splitChunks(all.values, full.splits))
}
The following solution "split[s] t into a list of 4 elements that's balanced based on the sum of the resulting list elements while keeping the order of elements, and only splitting on non-zero elements".
It does not produce your exact expected output, but to my understanding your optimization rules were not hard requirements, just things you tried in order to get those balanced lists. And it should be efficient :).
t <- c(2, 0, 0, 30, 0, 0, 10, 2000, 0, 20, 0, 40, 60, 10)
groups <- cut(cumsum(t),
              breaks = quantile(cumsum(t), probs = seq(0, 1, 0.25)),
              include.lowest = TRUE)
lapply(unique(groups), function(x) t[groups == x])
# [[1]]
# [1] 2 0 0 30 0 0
#
# [[2]]
# [1] 10
#
# [[3]]
# [1] 2000 0 20 0
#
# [[4]]
# [1] 40 60 10
On your hard data, the results are quite well "balanced":
t2 <- as.numeric(hard)
groups <- cut(cumsum(t2),
              breaks = quantile(cumsum(t2), probs = seq(0, 1, 0.25)),
              include.lowest = TRUE)
L2 <- lapply(unique(groups), function(x) t2[groups == x])
sapply(L2, sum)
# [1] 144 145 149 137
Compare this with the 138, 143, 144, 150 obtained by the currently chosen solution.
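As a rough numerical comparison of the two sets of group sums (values taken from above):
sd(c(144, 145, 149, 137))  # quantile-based split, approx. 4.99
sd(c(138, 143, 144, 150))  # currently chosen solution, approx. 4.92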