Error message with missForest package (imputation using Random Forest) - r
My dataframe is below. All variables are numeric, one of them (Total) has about 20 NAs. I would like the missForest package to create imputed values for the NAs in Total. I am running
R version 4.2.1 (2022-06-23 ucrt) on Windows.
# Reproducible data (dput output): a 222-row tibble with four numeric columns.
# Note the class attribute on the last line — this is a tibble
# ("tbl_df"/"tbl"), not a plain data.frame, which is what trips up missForest.
# Total contains the ~20 NAs to be imputed.
imp <- structure(list(Years = c(21, 5, 5, 25, 4, 4, 4, 1, 12, 17, 5.5,
4, 13, 1, 1, 5, 1, 12, 8, 1, 14, 0.8, 6, 5, 4, 7, 4, 21, 3, 2,
20, 1, 2, 2, 20, 2, 1, 9, 12, 22, 1, 27, 5, 3, 1, 8, 5, 25, 1,
0.4, 4, 1, 1.5, 1, 1, 21, 5, 0.5, 3, 12, 3, 28, 7, 5, 22, 3.25,
4, 4, 12, 1, 3, 25, 17, 12, 40, 12, 6, 3, 8, 7, 17, 1, 3, 3,
6, 4, 7, 1, 7, 6, 4, 11, 1, 5, 2, 15, 1, 3, 7.5, 21, 4, 1.5,
7, 13, 5, 6, 9, 12.5, 2.5, 1, 17, 8, 5, 22, 25, 13, 5.5, 19,
9, 3.3, 14, 3, 22, 5, 6, 2.8, 9, 1, 8, 11, 8, 4, 2, 10, 1, 19,
13, 5, 1, 1.5, 7, 12, 2, 2.5, 1.5, 1, 2, 8, 5, 4, 3, 2, 2.5,
7, 11, 3, 8, 22, 5, 5, 8, 3.5, 1, 8, 11, 1, 5, 7, 9, 7, 4, 1,
14, 4, 20, 4, 5, 15.5, 9, 2, 7.5, 1, 13.5, 14, 1, 7, 4, 20, 9.5,
0, 10, 3, 8, 1, 3, 1, 19, 1, 20, 8, 25, 16, 14, 10, 24, 1, 2,
4, 0, 11, 2, 1.5, 2, 1, 21, 1, 20, 1.75, 5, 22, 5, 3), Staff = c(7,
8, 6, 10, 15, 6, 7, 17, 9, 5, 7, 12, 15, 8, 7, 5, 8, 8, 2, 8,
7, 8, 7, 7, 12, 8, 8, 7, 12, 10, 5, 7, 3, 6, 11, 4, 8, 8, 9,
6, 9, 9, 18, 10, 9, 5, 7, 20, 9, 4, 9, 6, 5, 4, 3, 5, 11, 8,
4, 7, 6, 16, 5, 5, 8, 8, 7, 4, 9, 9, 9, 14, 8, 5, 6, 6, 4, 3,
6, 7, 10, 7, 7, 3, 7, 13, 12, 4, 10, 8, 9, 5, 15, 7, 9, 9, 6,
5, 15, 7, 6, 5, 7, 8, 7, 7, 5, 9, 15, 12, 15, 5, 8, 7, 7, 5,
8, 12, 6, 6, 12, 9, 5, 4, 6, 7, 15, 5, 20, 6, 6, 11, 6, 8, 6,
2, 7, 4, 4, 2, 6, 15, 5, 15, 6, 3, 8, 15, 12, 7, 6, 9, 7, 1,
10, 5, 7, 4, 5, 1, 6, 5, 20, 8, 10, 1, 11, 9, 9, 5, 3, 8, 6,
5, 5, 5, 6, 8, 4, 7, 5, 4, 10, 8, 13, 5, 13, 3, 0, 15, 20, 5,
15, 14, 19, 20, 5, 7, 5, 9, 6, 6, 7, 20, 10, 25, 7, 5, 6, 10,
45, 10, 6, 5, 6, 8, 13, 12, 15, 7, 4, 1), JDs = c(64, 64, 120,
200, 30, 70, 370, 75, 300, 20, 68, 170, 77, 275, 132, 81, 875,
135, 75, 84, 74, 110, 120, 60, 1800, 94, 54, 125, 140, 150, 52,
190, 53, 170, 325, 18, 300, 86, 130, 375, 140, 200, 104, 50,
100, 95, 360, 40, 45, 52, 165, 20, 150, 58, 230, 95, 150, 95,
85, 120, 100, 265, 18, 90, 130, 77, 80, 75, 133, 73, 302, 500,
70, 50, 55, 72, 35, 60, 100, 90, 130, 41, 200, 29, 90, 35, 68,
30, 115, 51, 40, 125, 460, 400, 125, 400, 250, 51, 190, 200,
235, 150, 250, 137, 760, 90, 70, 100, 325, 200, 350, 150, 325,
23, 17, 50, 415, 650, 120, 96, 200, 4, 71, 700, 60, 224, 203,
16, 40, 62, 105, 41, 340, 22, 60, 11, 60, 30, 95, 27, 300, 120,
70, 96, 100, 6, 750, 14, 80, 60, 51, 90, 350, 250, 31, 78, 95,
32, 185, 65, 65, 30, 24, 65, 550, 100, 200, 80, 47, 45, 37, 250,
55, 25, 27, 90, 190, 65, 27, 80, 68, 110, 220, 325, 25, 43, 14,
5, 7, 17, 15, 135, 20, 26, 26, 29, 75, 93, 50, 127, 14, 75, 90,
50, 105, 190, 8, 45, 150, 300, 15, 25, 150, 60, 32, 85, 15, 144,
190, 155, 10, 20), Total = c(325000, 250000, 275000, 340000,
165000, 3e+05, 420000, 8e+05, 5e+05, 100776, 440000, 440000,
191500, NA, 4e+05, 145000, 6e+05, 4e+05, 125000, 155000, 230000,
250000, 240000, 2e+05, NA, 250000, 188000, 375000, 190000, 450000,
290558, 725000, 355000, 350000, 8e+05, 125000, 450000, 255000,
212500, 6e+05, 342000, 450000, 250000, 228000, 325000, 325000,
425000, 175000, NA, 240000, NA, 250000, 237000, 330000, 345000,
195000, 295000, 208000, 225000, NA, 445000, 253000, 75000, 285000,
4e+05, 2e+05, 308000, 236000, 470000, 190000, 1250000, 480000,
2e+05, 285000, 232000, 240000, 2e+05, 209000, 250000, 309000,
NA, 170000, 1e+06, 115200, 565000, 182500, 175000, 250000, 250000,
265000, 120000, 345000, 425000, 630000, 165000, 650000, 3e+05,
265000, 345000, 425000, 4e+05, 230000, 425000, 161500, 6e+05,
251000, 265000, 190000, 420000, 6e+05, 510000, 340000, 650000,
275000, 120000, 185000, 480000, 550000, 185000, 240000, 560000,
114000, 150000, 1050000, 230000, NA, 335000, 225000, 260000,
410000, 315000, 206000, 650000, 160000, 210000, 180000, 275000,
2e+05, 2e+05, 201094, 395000, 297000, 265000, 3e+05, 275000,
80000, 134000, 180000, 195000, 850000, 4e+05, 385000, 420000,
NA, 187000, 180000, 182700, 96597.28, 380000, 2e+05, 260000,
257500, 185000, 220000, 550000, 315000, 360000, 380000, 185000,
280000, 225000, 375000, 310000, 170000, 165000, 260000, 350000,
208000, 110000, 192500, 187500, 216000, 495000, 550000, 114500,
215000, 185000, NA, 114500, 110000, 250000, 350000, 180000, 118000,
191500, 1e+05, 230000, 350000, 240000, NA, 180000, 215000, 203000,
99800, 389900, NA, NA, NA, 4e+05, 6e+05, NA, NA, NA, 220000,
217500, NA, NA, 210000, 337000, 275000, NA, NA)), row.names = c(NA,
-222L), class = c("tbl_df", "tbl", "data.frame"))
library(missForest) # installed with dependencies = TRUE
# Fails here: missForest() is handed a tibble rather than a plain data.frame,
# which produces the warnings and the "length of response" error quoted below.
impFor <- missForest(imp)
The statement above returns the following warnings and error.
Warning: argument is not numeric or logical: returning NA
Warning: argument is not numeric or logical: returning NA
Warning: argument is not numeric or logical: returning NA
Warning: argument is not numeric or logical: returning NA
Warning: The response has five or fewer unique values. Are you sure you want to do regression?
Error in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry, :
length of response must be the same as predictors
The first four warnings appear to say that my four variables are neither numeric nor logical, but they are all numeric. The warning regarding regression and "five or fewer unique values" puzzles me because the package's manual makes no reference to a minimum number of unique values. Finally, the error confounds me completely.
I have searched StackOverflow, but the two questions that came up are not relevant.
Thank you for setting me right.
Your data should be in data.frame format instead of a tibble. You can convert it with as.data.frame like this:
library(missForest)
# Confirm the problem: the object is a tibble, not a plain data.frame.
class(imp)
#> [1] "tbl_df" "tbl" "data.frame"
# Coerce to a base data.frame; missForest's internal column handling
# (e.g. single-bracket column extraction) relies on data.frame semantics.
imp <- as.data.frame(imp)
class(imp)
#> [1] "data.frame"
# Now the imputation runs. Note this overwrites `imp` with a missForest
# object; the completed (imputed) data frame is in imp$ximp and the
# out-of-bag error estimate is in imp$OOBerror, as the printout below shows.
imp <- missForest(imp)
imp
#> $ximp
#> Years Staff JDs Total
#> 1 21.00 7 64 325000.00
#> 2 5.00 8 64 250000.00
#> 3 5.00 6 120 275000.00
#> 4 25.00 10 200 340000.00
#> 5 4.00 15 30 165000.00
#> 6 4.00 6 70 300000.00
#> 7 4.00 7 370 420000.00
#> 8 1.00 17 75 800000.00
#> 9 12.00 9 300 500000.00
#> 10 17.00 5 20 100776.00
#> 11 5.50 7 68 440000.00
#> 12 4.00 12 170 440000.00
#> 13 13.00 15 77 191500.00
#> 14 1.00 8 275 422030.00
#> 15 1.00 7 132 400000.00
#> 16 5.00 5 81 145000.00
#> 17 1.00 8 875 600000.00
#> 18 12.00 8 135 400000.00
#> 19 8.00 2 75 125000.00
#> 20 1.00 8 84 155000.00
#> 21 14.00 7 74 230000.00
#> 22 0.80 8 110 250000.00
#> 23 6.00 7 120 240000.00
#> 24 5.00 7 60 200000.00
#> 25 4.00 12 1800 564720.00
#> 26 7.00 8 94 250000.00
#> 27 4.00 8 54 188000.00
#> 28 21.00 7 125 375000.00
#> 29 3.00 12 140 190000.00
#> 30 2.00 10 150 450000.00
#> 31 20.00 5 52 290558.00
#> 32 1.00 7 190 725000.00
#> 33 2.00 3 53 355000.00
#> 34 2.00 6 170 350000.00
#> 35 20.00 11 325 800000.00
#> 36 2.00 4 18 125000.00
#> 37 1.00 8 300 450000.00
#> 38 9.00 8 86 255000.00
#> 39 12.00 9 130 212500.00
#> 40 22.00 6 375 600000.00
#> 41 1.00 9 140 342000.00
#> 42 27.00 9 200 450000.00
#> 43 5.00 18 104 250000.00
#> 44 3.00 10 50 228000.00
#> 45 1.00 9 100 325000.00
#> 46 8.00 5 95 325000.00
#> 47 5.00 7 360 425000.00
#> 48 25.00 20 40 175000.00
#> 49 1.00 9 45 185352.00
#> 50 0.40 4 52 240000.00
#> 51 4.00 9 165 403167.00
#> 52 1.00 6 20 250000.00
#> 53 1.50 5 150 237000.00
#> 54 1.00 4 58 330000.00
#> 55 1.00 3 230 345000.00
#> 56 21.00 5 95 195000.00
#> 57 5.00 11 150 295000.00
#> 58 0.50 8 95 208000.00
#> 59 3.00 4 85 225000.00
#> 60 12.00 7 120 261252.00
#> 61 3.00 6 100 445000.00
#> 62 28.00 16 265 253000.00
#> 63 7.00 5 18 75000.00
#> 64 5.00 5 90 285000.00
#> 65 22.00 8 130 400000.00
#> 66 3.25 8 77 200000.00
#> 67 4.00 7 80 308000.00
#> 68 4.00 4 75 236000.00
#> 69 12.00 9 133 470000.00
#> 70 1.00 9 73 190000.00
#> 71 3.00 9 302 1250000.00
#> 72 25.00 14 500 480000.00
#> 73 17.00 8 70 200000.00
#> 74 12.00 5 50 285000.00
#> 75 40.00 6 55 232000.00
#> 76 12.00 6 72 240000.00
#> 77 6.00 4 35 200000.00
#> 78 3.00 3 60 209000.00
#> 79 8.00 6 100 250000.00
#> 80 7.00 7 90 309000.00
#> 81 17.00 10 130 279905.83
#> 82 1.00 7 41 170000.00
#> 83 3.00 7 200 1000000.00
#> 84 3.00 3 29 115200.00
#> 85 6.00 7 90 565000.00
#> 86 4.00 13 35 182500.00
#> 87 7.00 12 68 175000.00
#> 88 1.00 4 30 250000.00
#> 89 7.00 10 115 250000.00
#> 90 6.00 8 51 265000.00
#> 91 4.00 9 40 120000.00
#> 92 11.00 5 125 345000.00
#> 93 1.00 15 460 425000.00
#> 94 5.00 7 400 630000.00
#> 95 2.00 9 125 165000.00
#> 96 15.00 9 400 650000.00
#> 97 1.00 6 250 300000.00
#> 98 3.00 5 51 265000.00
#> 99 7.50 15 190 345000.00
#> 100 21.00 7 200 425000.00
#> 101 4.00 6 235 400000.00
#> 102 1.50 5 150 230000.00
#> 103 7.00 7 250 425000.00
#> 104 13.00 8 137 161500.00
#> 105 5.00 7 760 600000.00
#> 106 6.00 7 90 251000.00
#> 107 9.00 5 70 265000.00
#> 108 12.50 9 100 190000.00
#> 109 2.50 15 325 420000.00
#> 110 1.00 12 200 600000.00
#> 111 17.00 15 350 510000.00
#> 112 8.00 5 150 340000.00
#> 113 5.00 8 325 650000.00
#> 114 22.00 7 23 275000.00
#> 115 25.00 7 17 120000.00
#> 116 13.00 5 50 185000.00
#> 117 5.50 8 415 480000.00
#> 118 19.00 12 650 550000.00
#> 119 9.00 6 120 185000.00
#> 120 3.30 6 96 240000.00
#> 121 14.00 12 200 560000.00
#> 122 3.00 9 4 114000.00
#> 123 22.00 5 71 150000.00
#> 124 5.00 4 700 1050000.00
#> 125 6.00 6 60 230000.00
#> 126 2.80 7 224 756680.00
#> 127 9.00 15 203 335000.00
#> 128 1.00 5 16 225000.00
#> 129 8.00 20 40 260000.00
#> 130 11.00 6 62 410000.00
#> 131 8.00 6 105 315000.00
#> 132 4.00 11 41 206000.00
#> 133 2.00 6 340 650000.00
#> 134 10.00 8 22 160000.00
#> 135 1.00 6 60 210000.00
#> 136 19.00 2 11 180000.00
#> 137 13.00 7 60 275000.00
#> 138 5.00 4 30 200000.00
#> 139 1.00 4 95 200000.00
#> 140 1.50 2 27 201094.00
#> 141 7.00 6 300 395000.00
#> 142 12.00 15 120 297000.00
#> 143 2.00 5 70 265000.00
#> 144 2.50 15 96 300000.00
#> 145 1.50 6 100 275000.00
#> 146 1.00 3 6 80000.00
#> 147 2.00 8 750 134000.00
#> 148 8.00 15 14 180000.00
#> 149 5.00 12 80 195000.00
#> 150 4.00 7 60 850000.00
#> 151 3.00 6 51 400000.00
#> 152 2.00 9 90 385000.00
#> 153 2.50 7 350 420000.00
#> 154 7.00 1 250 434900.00
#> 155 11.00 10 31 187000.00
#> 156 3.00 5 78 180000.00
#> 157 8.00 7 95 182700.00
#> 158 22.00 4 32 96597.28
#> 159 5.00 5 185 380000.00
#> 160 5.00 1 65 200000.00
#> 161 8.00 6 65 260000.00
#> 162 3.50 5 30 257500.00
#> 163 1.00 20 24 185000.00
#> 164 8.00 8 65 220000.00
#> 165 11.00 10 550 550000.00
#> 166 1.00 1 100 315000.00
#> 167 5.00 11 200 360000.00
#> 168 7.00 9 80 380000.00
#> 169 9.00 9 47 185000.00
#> 170 7.00 5 45 280000.00
#> 171 4.00 3 37 225000.00
#> 172 1.00 8 250 375000.00
#> 173 14.00 6 55 310000.00
#> 174 4.00 5 25 170000.00
#> 175 20.00 5 27 165000.00
#> 176 4.00 5 90 260000.00
#> 177 5.00 6 190 350000.00
#> 178 15.50 8 65 208000.00
#> 179 9.00 4 27 110000.00
#> 180 2.00 7 80 192500.00
#> 181 7.50 5 68 187500.00
#> 182 1.00 4 110 216000.00
#> 183 13.50 10 220 495000.00
#> 184 14.00 8 325 550000.00
#> 185 1.00 13 25 114500.00
#> 186 7.00 5 43 215000.00
#> 187 4.00 13 14 185000.00
#> 188 20.00 3 5 132532.25
#> 189 9.50 0 7 114500.00
#> 190 0.00 15 17 110000.00
#> 191 10.00 20 15 250000.00
#> 192 3.00 5 135 350000.00
#> 193 8.00 15 20 180000.00
#> 194 1.00 14 26 118000.00
#> 195 3.00 19 26 191500.00
#> 196 1.00 20 29 100000.00
#> 197 19.00 5 75 230000.00
#> 198 1.00 7 93 350000.00
#> 199 20.00 5 50 240000.00
#> 200 8.00 9 127 259289.83
#> 201 25.00 6 14 180000.00
#> 202 16.00 6 75 215000.00
#> 203 14.00 7 90 203000.00
#> 204 10.00 20 50 99800.00
#> 205 24.00 10 105 389900.00
#> 206 1.00 25 190 466223.67
#> 207 2.00 7 8 153912.76
#> 208 4.00 5 45 249760.00
#> 209 0.00 6 150 400000.00
#> 210 11.00 10 300 600000.00
#> 211 2.00 45 15 190321.17
#> 212 1.50 10 25 143960.88
#> 213 2.00 6 150 350892.50
#> 214 1.00 5 60 220000.00
#> 215 21.00 6 32 217500.00
#> 216 1.00 8 85 193365.00
#> 217 20.00 13 15 193093.52
#> 218 1.75 12 144 210000.00
#> 219 5.00 15 190 337000.00
#> 220 22.00 7 155 275000.00
#> 221 5.00 4 10 143128.61
#> 222 3.00 1 20 149726.72
#>
#> $OOBerror
#> NRMSE
#> 0.4584988
#>
#> attr(,"class")
#> [1] "missForest"
Created on 2023-02-11 with reprex v2.0.2
Related
Descriptive statistics for R-data in a long format
I have data on repeated measurements (5 or less) which include blood pressure measurements. I formatted the data in a long format, but seeing as I've done this for the first time, I now no longer see how to get descriptive statistics of my variables. My example data: questiondata <- structure(list(id = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 8, 8), time = c("time1", "time2", "time3", "time5", "time1", "time2", "time3", "time5", "time1", "time2", "time3", "time5", "time4", "time5", "time4", "time5", "time4", "time5", "time4", "time5", "time4", "time5", "time4", "time5"), cohort = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), systolicBP = c(102, 137, 132, 150, 152, 146, 160.5, 159.5, 144, 138, 137.5, 163, 137, 147, 125, 141, 147, 150, 148, 167.5, 153.5, 164.5, 159, 123), diastolicBP = c(56, 99, 78, 90, 77, 78, 80.5, 82, 72, 70, 67.5, 61, 86, 90, 80.5, 84, 75, 81, 91, 96, 80, 87.5, 87, 79), egfr = c(78.2, NA, 55.8, NA, NA, NA, 87.6, NA, NA, NA, 75.6, 70.9, 71.9, 71.8, 47.9, 36.6, 93.7, 81.5, 93.2, 82.1, 92.9, 79.1, 66.6, 55.2)), row.names = c(NA, -24L), class = c("tbl_df", "tbl", "data.frame")) Which corresponds to the following tibble: # A tibble: 24 x 6 id time cohort systolicBP diastolicBP egfr <dbl> <chr> <dbl> <dbl> <dbl> <dbl> 1 1 time1 1 102 56 78.2 2 1 time2 1 137 99 NA 3 1 time3 1 132 78 55.8 4 1 time5 1 150 90 NA 5 2 time1 1 152 77 NA 6 2 time2 1 146 78 NA 7 2 time3 1 160. 80.5 87.6 8 2 time5 1 160. 82 NA 9 3 time1 1 144 72 NA 10 3 time2 1 138 70 NA 11 3 time3 1 138. 67.5 75.6 12 3 time5 1 163 61 70.9 13 4 time4 2 137 86 71.9 14 4 time5 2 147 90 71.8 15 5 time4 2 125 80.5 47.9 16 5 time5 2 141 84 36.6 17 6 time4 2 147 75 93.7 18 6 time5 2 150 81 81.5 19 7 time4 2 148 91 93.2 20 7 time5 2 168. 96 82.1 21 8 time4 2 154. 80 92.9 22 8 time5 2 164. 
87.5 79.1 23 9 time4 2 159 87 66.6 24 9 time5 2 123 79 55.2 So for example now I want to Get how many persons are in each cohort What is the mean of the systolicBP, in total, for each cohort, and at each timepoint. I'm sure this is very easy, but I can't seem to get any viable results.
library(tidyverse) questiondata %>% group_by(cohort) %>% summarise(n = n_distinct(id), mean_systolic = mean(systolicBP, na.rm = TRUE), .groups = "drop") #> # A tibble: 2 x 3 #> cohort n mean_systolic #> <dbl> <int> <dbl> #> 1 1 3 143. #> 2 2 6 147. questiondata %>% group_by(cohort, time) %>% summarise(mean_systolic_time = mean(systolicBP, na.rm = TRUE), .groups = "drop") #> # A tibble: 6 x 3 #> cohort time mean_systolic_time #> <dbl> <chr> <dbl> #> 1 1 time1 133. #> 2 1 time2 140. #> 3 1 time3 143. #> 4 1 time5 158. #> 5 2 time4 145. #> 6 2 time5 149. Created on 2021-06-25 by the reprex package (v2.0.0) questiondata %>% group_by(cohort, time) %>% summarise(mean_systolic_time = str_c(round(mean(systolicBP, na.rm = TRUE), digits = 0), " mmHg"), .groups = "drop") # A tibble: 6 x 3 cohort time mean_systolic_time <dbl> <chr> <chr> 1 1 time1 133 mmHg 2 1 time2 140 mmHg 3 1 time3 143 mmHg 4 1 time5 158 mmHg 5 2 time4 145 mmHg 6 2 time5 149 mmHg
Calculate a complex difference score with tidyverse?
I have a large dataset of 70 000 rows that I want to perform some operations on, but I can't find an appropriate solution. bib sta run course finish comment day 1 42 9 1 SG 19.88 99 1 2 42 17 2 A 19.96 11 1 3 42 27 3 B 20.92 22 1 4 42 39 4 A 19.60 11 1 5 42 48 5 SG 20.24 99 1 6 42 61 6 C 22.90 33 1 7 42 76 7 B 20.70 22 1 8 42 86 8 C 22.74 33 1 9 42 93 9 C 22.75 33 1 10 42 103 10 A 19.79 11 1 11 42 114 11 B 20.67 22 1 12 42 120 12 SG 20.10 99 1 I want to end up with a tibble that: calculates the mean finish time in SG course for each bib number on one particular day. For example, 19.88 + 20.24 + 20.10 / 3 calculate a difference score for each observation in the dataset by subtracting finish from this mean SG score. For example, 19.88 - mean(SG), 19.96 - mean(SG). I have tried the following approach: First group by day, bib and course. Then filter by SG and calculate the mean: avg.sgtime <- df %>% group_by(day, bib, course) %>% filter(course == 'SG') %>% mutate(avg.sg = mean(finish)) Resulting in the following tibble bib sta run course finish comment day avg.sg <int> <int> <int> <chr> <dbl> <int> <chr> <dbl> 1 42 9 1 SG 19.9 99 1 20.1 2 42 48 5 SG 20.2 99 1 20.1 3 42 120 12 SG 20.1 99 1 20.1 4 42 6 1 SG 20.0 99 2 19.9 5 42 42 5 SG 19.8 77 2 19.9 6 42 130 15 SG 19.9 99 2 19.9 7 42 6 1 SG 20.6 99 3 20.5 8 42 68 12 SG 20.6 77 3 20.5 9 42 90 15 SG 20.4 77 3 20.5 Finally I join the two tibbles together using the following syntax: df %>% full_join(avg.sgtime) %>% mutate(diff = finish - avg.sg) However, this doesn't work. It only works for the SG course but not for course A, B and C. Is there a way to fix this or is there a better solution to the problem? bib sta run course finish comment day avg.sg diff 1 42 9 1 SG 19.88 99 1 20.07333 -0.193333333 2 42 17 2 A 19.96 11 1 NA NA 3 42 27 3 B 20.92 22 1 NA NA 4 42 39 4 A 19.60 11 1 NA NA 5 42 48 5 SG 20.24 99 1 20.07333 0.166666667
You can filter your values for finish within the mutate() and calculate the mean based on those: df %>% group_by(day,bib) %>% mutate( avg.sg = mean(finish[course=="SG"]), diff = finish - avg.sg)
Is the following what you are aiming for? (note that I added a few random values for a second bib just to make sure the join is done properly) The difference to your attempt is using summarise() instead of mutate() to consolidate the avg.sgtime data frame, and also dropping a few columns so that the join is not populated with NAs. Instead of dropping you can also set the relevant columns to join by passing the by argument to the left_join() function. library(dplyr) library(tidyr) # for join avg.sgtime <- df %>% group_by(day, bib, course) %>% filter(course == 'SG') %>% summarise(avg.sg = mean(finish), .groups = "drop") %>% select(c(bib, day, avg.sg)) avg.sgtime #> # A tibble: 3 x 3 #> bib day avg.sg #> <dbl> <dbl> <dbl> #> 1 42 1 20.1 #> 2 43 1 19.1 #> 3 44 2 19.3 df %>% left_join(avg.sgtime) %>% mutate(diff = finish - avg.sg) #> Joining, by = c("bib", "day") #> # A tibble: 36 x 9 #> bib sta run course finish comment day avg.sg diff #> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 42 9 1 SG 19.9 99 1 20.1 -0.193 #> 2 42 17 2 A 20.0 11 1 20.1 -0.113 #> 3 42 27 3 B 20.9 22 1 20.1 0.847 #> 4 42 39 4 A 19.6 11 1 20.1 -0.473 #> 5 42 48 5 SG 20.2 99 1 20.1 0.167 #> 6 42 61 6 C 22.9 33 1 20.1 2.83 #> 7 42 76 7 B 20.7 22 1 20.1 0.627 #> 8 42 86 8 C 22.7 33 1 20.1 2.67 #> 9 42 93 9 C 22.8 33 1 20.1 2.68 #> 10 42 103 10 A 19.8 11 1 20.1 -0.283 #> # … with 26 more rows Created on 2021-07-04 by the reprex package (v2.0.0) data df <- tribble(~bib, ~sta, ~run, ~course, ~finish, ~comment, ~day, 42, 9, 1, "SG", 19.88, 99, 1, 42, 17, 2, "A", 19.96, 11, 1, 42, 27, 3, "B", 20.92, 22, 1, 42, 39, 4, "A", 19.60, 11, 1, 42, 48, 5, "SG", 20.24, 99, 1, 42, 61, 6, "C", 22.90, 33, 1, 42, 76, 7, "B", 20.70, 22, 1, 42, 86, 8, "C", 22.74, 33, 1, 42, 93, 9, "C", 22.75, 33, 1, 42, 103, 10, "A", 19.79, 11, 1, 42, 114, 11, "B", 20.67, 22, 1, 42, 120, 12, "SG", 20.10, 99, 1, 43, 9, 1, "SG", 19.12, 99, 1, 43, 17, 2, "A", 19.64, 11, 1, 43, 27, 3, "B", 20.62, 22, 1, 43, 39, 4, "A", 19.23, 
11, 1, 43, 48, 5, "SG", 20.11, 99, 1, 43, 61, 6, "C", 22.22, 33, 1, 43, 76, 7, "B", 20.33, 22, 1, 43, 86, 8, "C", 22.51, 33, 1, 43, 93, 9, "C", 22.78, 33, 1, 43, 103, 10, "A", 19.98, 11, 1, 43, 114, 11, "B", 20.11, 22, 1, 43, 120, 12, "SG", 18.21, 99, 1, 44, 9, 1, "SG", 19.18, 99, 2, 44, 17, 2, "A", 19.56, 11, 2, 44, 27, 3, "B", 20.62, 22, 2, 44, 39, 4, "A", 19.20, 11, 2, 44, 48, 5, "SG", 20.74, 99, 2, 44, 61, 6, "C", 22.50, 33, 2, 44, 76, 7, "B", 20.60, 22, 2, 44, 86, 8, "C", 22.74, 33, 2, 44, 93, 9, "C", 22.85, 33, 2, 44, 103, 10, "A", 19.59, 11, 2, 44, 114, 11, "B", 20.27, 22, 2, 44, 120, 12, "SG", 18.10, 99, 2, )
Thanks #Marcelo Avila for providing me with a very good hint: I hope this is what you are looking for: library(dplyr) df %>% group_by(bib, day) %>% mutate(across(finish, ~ mean(.x[course == "SG"]), .names = "avg_{.col}"), diff = finish - avg_finish, avg_finish = ifelse(course == "SG", avg_finish, NA)) # A tibble: 12 x 9 # Groups: bib, day [1] bib sta run course finish comment day avg_finish diff <int> <int> <int> <chr> <dbl> <int> <int> <dbl> <dbl> 1 42 9 1 SG 19.9 99 1 20.1 -0.193 2 42 17 2 A 20.0 11 1 NA -0.113 3 42 27 3 B 20.9 22 1 NA 0.847 4 42 39 4 A 19.6 11 1 NA -0.473 5 42 48 5 SG 20.2 99 1 20.1 0.167 6 42 61 6 C 22.9 33 1 NA 2.83 7 42 76 7 B 20.7 22 1 NA 0.627 8 42 86 8 C 22.7 33 1 NA 2.67 9 42 93 9 C 22.8 33 1 NA 2.68 10 42 103 10 A 19.8 11 1 NA -0.283 11 42 114 11 B 20.7 22 1 NA 0.597 12 42 120 12 SG 20.1 99 1 20.1 0.0267 I also added another alternative solution with a minor change, using dear #Marcelo Avila's data set: df %>% group_by(bib, day) %>% mutate(across(finish, ~ mean(.x[select(cur_data(), course) == "SG"]), .names = "avg_{.col}"), diff = finish - avg_finish, avg_finish = ifelse(course == "SG", avg_finish, NA)) # A tibble: 36 x 9 # Groups: bib, day [3] bib sta run course finish comment day avg_finish diff <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> 1 42 9 1 SG 19.9 99 1 20.1 -0.193 2 42 17 2 A 20.0 11 1 NA -0.113 3 42 27 3 B 20.9 22 1 NA 0.847 4 42 39 4 A 19.6 11 1 NA -0.473 5 42 48 5 SG 20.2 99 1 20.1 0.167 6 42 61 6 C 22.9 33 1 NA 2.83 7 42 76 7 B 20.7 22 1 NA 0.627 8 42 86 8 C 22.7 33 1 NA 2.67 9 42 93 9 C 22.8 33 1 NA 2.68 10 42 103 10 A 19.8 11 1 NA -0.283 # ... with 26 more rows
output of pmatch changes with vector length in R
I am trying to use pmatch in base R. The following example appears to work as expected: treat1 <- c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,10, 10,10,10,10,10,10,11,11,11,11, 11,11,12,12,12,12,12,12,12,13, 13,13,13,13,13,14,14,14,14,14, 14,14,15,15,15,15,15,15,16,16, 16,16,16,16,16,17,17,17,17,17, 17,18,18,18,18,18,18,18) control1 <- c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9,10, 10,10,10,10,10,11,11,11,11,11, 12,12,12,12,12,12,13,13,13,13, 13,14,14,14,14,14,14,15,15,15, 15,15,16,16,16,16,16,16,17,17, 17,17,17,18,18,18,18,18,18) pmatch(control1, treat1) #[1] 1 2 3 4 5 8 9 10 11 12 # 13 14 15 16 17 18 21 22 23 24 # 25 26 27 28 29 30 31 34 35 36 # 37 38 39 40 41 42 43 44 47 48 # 49 50 51 52 53 54 55 56 57 60 # 61 62 63 64 65 67 68 69 70 71 # 73 74 75 76 77 78 80 81 82 83 # 84 86 87 88 89 90 91 93 94 95 # 96 97 99 100 101 102 103 104 106 107 # 108 109 110 112 113 114 115 116 117 However, the following example does not work as I expected. The only difference between the example above and the one below is the presence of a few additional elements of value 19 at the end of the vectors below. The output below contains numerous NA's and only seems to include the position in treat2 of the first element of a given value in control2. I have tried including some of the options for pmatch in the documentation but cannot get output similar to that shown above. 
There are several similar questions on Stack Overflow, such as the following, but I have not found a solution to my issue: Properties of pmatch function treat2 <- c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,10, 10,10,10,10,10,10,11,11,11,11, 11,11,12,12,12,12,12,12,12,13, 13,13,13,13,13,14,14,14,14,14, 14,14,15,15,15,15,15,15,16,16, 16,16,16,16,16,17,17,17,17,17, 17,18,18,18,18,18,18,18,19,19, 19,19,19,19,19) control2 <- c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9,10, 10,10,10,10,10,11,11,11,11,11, 12,12,12,12,12,12,13,13,13,13, 13,14,14,14,14,14,14,15,15,15, 15,15,16,16,16,16,16,16,17,17, 17,17,17,18,18,18,18,18,18,19, 19,19,19,19) pmatch(control2, treat2) #[1] 1 NA NA NA NA 8 NA NA NA NA # NA 14 NA NA NA NA 21 NA NA NA # NA NA 27 NA NA NA NA 34 NA NA # NA NA NA 40 NA NA NA NA 47 NA # NA NA NA NA 53 NA NA NA NA 60 # NA NA NA NA NA 67 NA NA NA NA # 73 NA NA NA NA NA 80 NA NA NA # NA 86 NA NA NA NA NA 93 NA NA # NA NA 99 NA NA NA NA NA 106 NA # NA NA NA 112 NA NA NA NA NA 119 # NA NA NA NA
Given that your treat and control are always numbers, I think it might be easier (and faster) to just rewrite that function using Rcpp. Consider something like this Rcpp::cppFunction('NumericVector cpmatch(NumericVector x, NumericVector table) { int n = x.size(), m = table.size(); NumericVector out(n, NA_REAL), y = clone(table); for (int i = 0; i < n; i++) { if (ISNAN(x[i])) { continue; } for (int j = 0; j < m; j++) { if (!ISNAN(y[j]) & x[i] == y[j]) { y[j] = NA_REAL; out[i] = j + 1; break; } } } return out; }') Test > cpmatch(control2, treat2) [1] 1 2 3 4 5 8 9 10 11 12 13 14 15 16 17 18 21 22 23 24 25 26 27 28 29 30 31 34 35 36 37 38 39 40 41 42 43 [38] 44 47 48 49 50 51 52 53 54 55 56 57 60 61 62 63 64 65 67 68 69 70 71 73 74 75 76 77 78 80 81 82 83 84 86 87 88 [75] 89 90 91 93 94 95 96 97 99 100 101 102 103 104 106 107 108 109 110 112 113 114 115 116 117 119 120 121 122 123 > cpmatch(control1, treat1) [1] 1 2 3 4 5 8 9 10 11 12 13 14 15 16 17 18 21 22 23 24 25 26 27 28 29 30 31 34 35 36 37 38 39 40 41 42 43 [38] 44 47 48 49 50 51 52 53 54 55 56 57 60 61 62 63 64 65 67 68 69 70 71 73 74 75 76 77 78 80 81 82 83 84 86 87 88 [75] 89 90 91 93 94 95 96 97 99 100 101 102 103 104 106 107 108 109 110 112 113 114 115 116 117 Benchmark > microbenchmark::microbenchmark(cpmatch(control1, treat1), pmatch(control1, treat1)) Unit: microseconds expr min lq mean median uq max neval cld cpmatch(control1, treat1) 16.9 17.3 19.795 17.55 18.1 55.7 100 a pmatch(control1, treat1) 174.5 174.8 187.174 175.20 188.5 421.9 100 b
Perhaps there is a way to get the desired output from pmatch, but I have not been able to figure out how. I tried looking at the source code for the pmatch function here: R-4.0.3\src\library\base\R\match.R But was not able to make progress that way. So, I wrote the following for-loop to apply to the output of pmatch and replace the NA's with the elements I wanted. It seems to work, at least for the example below. my.vector <- c(1, NA, NA, NA, NA, 8, NA, NA, NA, NA, NA, 14, NA, NA, NA, NA, 21, NA, NA, NA, NA, NA, 27, NA, NA, NA, NA, 34, NA, NA, NA, NA, NA) desired.result <- c(1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 35, 36, 37, 38, 39) pos.not.na <- which(!is.na(my.vector)) if(any(is.na(my.vector)) == TRUE) { my.output <- my.vector for(i in 2:length(pos.not.na)) { my.output[pos.not.na[(i-1)]:(pos.not.na[i]-1)] <- seq(my.vector[pos.not.na[(i-1)]], (my.vector[pos.not.na[(i-1)]] + (length(pos.not.na[(i-1)]:(pos.not.na[i]-1)) - 1))) } my.output[pos.not.na[length(pos.not.na)]:length(my.vector)] <- seq(my.vector[pos.not.na[length(pos.not.na)]], (my.vector[pos.not.na[length(pos.not.na)]] + length(pos.not.na[length(pos.not.na)]:length(my.vector)) - 1)) } if(any(is.na(my.vector)) == FALSE) {my.output = my.vector} my.output all.equal(my.output, desired.result) #[1] TRUE
How to merge the names while converting nested list to data frame
I have a nested lists with names: lst <- list(var1 = list(`0.1` = c(100, 10, 1, 0.1, 0.01), `0.2` = c(100, 20, 4, 0.8, 0.16), `0.3` = c(100, 30, 9, 2.7, 0.81), `0.4` = c(100, 40, 16, 6.4, 2.56), `0.5` = c(100, 50, 25, 12.5, 6.25), `0.6` = c(100, 60, 36, 21.6, 12.96), `0.7` = c(100, 70, 49, 34.3, 24.01), `0.8` = c(100, 80, 64, 51.2, 40.96), `0.9` = c(100, 90, 81, 72.9, 65.61)), var2 = list( `0.1` = c(10, 11, 11.1, 11.11, 11.111), `0.2` = c(10, 12, 12.4, 12.48, 12.496), `0.3` = c(10, 13, 13.9, 14.17, 14.251 ), `0.4` = c(10, 14, 15.6, 16.24, 16.496), `0.5` = c(10, 15, 17.5, 18.75, 19.375), `0.6` = c(10, 16, 19.6, 21.76, 23.056), `0.7` = c(10, 17, 21.9, 25.33, 27.731), `0.8` = c(10, 18, 24.4, 29.52, 33.616), `0.9` = c(10, 19, 27.1, 34.39, 40.951))) I'd like to convert it to data frame. I could do it with dplyr::bind_cols, but then my names are partly lost: # A tibble: 5 x 18 `0.1` `0.2` `0.3` `0.4` `0.5` `0.6` `0.7` `0.8` `0.9` `0.11` `0.21` `0.31` `0.41` `0.51` `0.61` `0.71` `0.81` <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 100 100 100 100 100 100 100 100 100 10 10 10 10 10 10 10 10 2 10 20 30. 40 50 60 70 80 90 11 12 13 14 15 16 17 18 3 1 4 9. 16 25 36 49. 64 81 11.1 12.4 13.9 15.6 17.5 19.6 21.9 24.4 4 0.1 0.8 2.7 6.4 12.5 21.6 34.3 51.2 72.9 11.1 12.5 14.2 16.2 18.8 21.8 25.3 29.5 5 0.01 0.16 0.81 2.56 6.25 13.0 24.0 41.0 65.6 11.1 12.5 14.3 16.5 19.4 23.1 27.7 33.6 # ... with 1 more variable: `0.91` <dbl> while I'd like to create a informative names joining names from two levels of list together. So the resulting output would be: # A tibble: 5 x 18 var1_0.1 var1_0.2 var1_0.3 var1_0.4 var1_0.5 var1_0.6 var1_0.7 var1_0.8 var1_0.9 var2_0.1 var2_0.2 var2_0.3 var2_0.4 <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 100 100 100 100 100 100 100 100 100 10 10 10 10 2 10 20 30. 40 50 60 70 80 90 11 12 13 14 3 1 4 9. 16 25 36 49. 
64 81 11.1 12.4 13.9 15.6 4 0.1 0.8 2.7 6.4 12.5 21.6 34.3 51.2 72.9 11.1 12.5 14.2 16.2 5 0.01 0.16 0.81 2.56 6.25 13.0 24.0 41.0 65.6 11.1 12.5 14.3 16.5 # ... with 5 more variables: var2_0.5 <dbl>, var2_0.6 <dbl>, var2_0.7 <dbl>, var2_0.8 <dbl>, var2_0.9 <dbl> How can I achieve that in the most efficient way?
I did not use dplyr, but data.table and rlist. Is this what you wanted? library(data.table) library(rlist) lst <- list(var1 = list(`0.1` = c(100, 10, 1, 0.1, 0.01), `0.2` = c(100, 20, 4, 0.8, 0.16), `0.3` = c(100, 30, 9, 2.7, 0.81), `0.4` = c(100, 40, 16, 6.4, 2.56), `0.5` = c(100, 50, 25, 12.5, 6.25), `0.6` = c(100, 60, 36, 21.6, 12.96), `0.7` = c(100, 70, 49, 34.3, 24.01), `0.8` = c(100, 80, 64, 51.2, 40.96), `0.9` = c(100, 90, 81, 72.9, 65.61)), var2 = list(`0.1` = c(10, 11, 11.1, 11.11, 11.111), `0.2` = c(10, 12, 12.4, 12.48, 12.496), `0.3` = c(10, 13, 13.9, 14.17, 14.251), `0.4` = c(10, 14, 15.6, 16.24, 16.496), `0.5` = c(10, 15, 17.5, 18.75, 19.375), `0.6` = c(10, 16, 19.6, 21.76, 23.056), `0.7` = c(10, 17, 21.9, 25.33, 27.731), `0.8` = c(10, 18, 24.4, 29.52, 33.616), `0.9` = c(10, 19, 27.1, 34.39, 40.951))) temp = lapply(lst, as.data.table) final = rlist::list.cbind( temp ) final #> var1.0.1 var1.0.2 var1.0.3 var1.0.4 var1.0.5 var1.0.6 var1.0.7 var1.0.8 #> 1: 1e+02 100.00 100.00 100.00 100.00 100.00 100.00 100.00 #> 2: 1e+01 20.00 30.00 40.00 50.00 60.00 70.00 80.00 #> 3: 1e+00 4.00 9.00 16.00 25.00 36.00 49.00 64.00 #> 4: 1e-01 0.80 2.70 6.40 12.50 21.60 34.30 51.20 #> 5: 1e-02 0.16 0.81 2.56 6.25 12.96 24.01 40.96 #> var1.0.9 var2.0.1 var2.0.2 var2.0.3 var2.0.4 var2.0.5 var2.0.6 var2.0.7 #> 1: 100.00 10.000 10.000 10.000 10.000 10.000 10.000 10.000 #> 2: 90.00 11.000 12.000 13.000 14.000 15.000 16.000 17.000 #> 3: 81.00 11.100 12.400 13.900 15.600 17.500 19.600 21.900 #> 4: 72.90 11.110 12.480 14.170 16.240 18.750 21.760 25.330 #> 5: 65.61 11.111 12.496 14.251 16.496 19.375 23.056 27.731 #> var2.0.8 var2.0.9 #> 1: 10.000 10.000 #> 2: 18.000 19.000 #> 3: 24.400 27.100 #> 4: 29.520 34.390 #> 5: 33.616 40.951 Created on 2020-04-30 by the reprex package (v0.3.0)
You can use dplyr::bind_cols to convert to data frame (as you have mentioned) and then change names using base R by replicating first level names appropriate number of times: df <- dplyr::bind_cols(lst) names(df) <- paste(rep(names(lst), times = sapply(lst, length)), unlist(lapply(lst, names)), sep = '_') If you know your inner level names before hand, it gets even simpler: paste(rep(names(lst), each = 9), seq(0.1, 0.9, by = 0.1), sep = '_')
a data.table solution library(data.table) l <- lapply(seq_along(lst),function(x){ tmp <- as.data.table(lst[[x]]) names(tmp) <- paste0(names(lst)[x],"_",names(lst[[x]])) tmp }) as.data.table(unlist(l,recursive = FALSE)) #> var1_0.1 var1_0.2 var1_0.3 var1_0.4 var1_0.5 var1_0.6 var1_0.7 var1_0.8 #> 1: 1e+02 100.00 100.00 100.00 100.00 100.00 100.00 100.00 #> 2: 1e+01 20.00 30.00 40.00 50.00 60.00 70.00 80.00 #> 3: 1e+00 4.00 9.00 16.00 25.00 36.00 49.00 64.00 #> 4: 1e-01 0.80 2.70 6.40 12.50 21.60 34.30 51.20 #> 5: 1e-02 0.16 0.81 2.56 6.25 12.96 24.01 40.96 #> var1_0.9 var2_0.1 var2_0.2 var2_0.3 var2_0.4 var2_0.5 var2_0.6 var2_0.7 #> 1: 100.00 10.000 10.000 10.000 10.000 10.000 10.000 10.000 #> 2: 90.00 11.000 12.000 13.000 14.000 15.000 16.000 17.000 #> 3: 81.00 11.100 12.400 13.900 15.600 17.500 19.600 21.900 #> 4: 72.90 11.110 12.480 14.170 16.240 18.750 21.760 25.330 #> 5: 65.61 11.111 12.496 14.251 16.496 19.375 23.056 27.731 #> var2_0.8 var2_0.9 #> 1: 10.000 10.000 #> 2: 18.000 19.000 #> 3: 24.400 27.100 #> 4: 29.520 34.390 #> 5: 33.616 40.951 Created on 2020-04-30 by the reprex package (v0.3.0)
in Base-R new_lst <- do.call(cbind,lapply(lst, function(x) do.call(cbind,x))) colnames(new_lst) <- with(stack(lapply(lst, names)), paste(ind,values,sep="_")) Another method in Base-R (This one is much easier to understand) lst <- do.call(cbind,unlist(lst,recursive=F)) colnames(lst) <- lapply(colnames(lst), function(x) sub("\\.","_",x)) output > new_lst var1_0.1 var1_0.2 var1_0.3 var1_0.4 var1_0.5 var1_0.6 var1_0.7 var1_0.8 var1_0.9 var2_0.1 var2_0.2 var2_0.3 var2_0.4 var2_0.5 var2_0.6 var2_0.7 var2_0.8 var2_0.9 [1,] 1e+02 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 10.000 10.000 10.000 10.000 10.000 10.000 10.000 10.000 10.000 [2,] 1e+01 20.00 30.00 40.00 50.00 60.00 70.00 80.00 90.00 11.000 12.000 13.000 14.000 15.000 16.000 17.000 18.000 19.000 [3,] 1e+00 4.00 9.00 16.00 25.00 36.00 49.00 64.00 81.00 11.100 12.400 13.900 15.600 17.500 19.600 21.900 24.400 27.100 [4,] 1e-01 0.80 2.70 6.40 12.50 21.60 34.30 51.20 72.90 11.110 12.480 14.170 16.240 18.750 21.760 25.330 29.520 34.390 [5,] 1e-02 0.16 0.81 2.56 6.25 12.96 24.01 40.96 65.61 11.111 12.496 14.251 16.496 19.375 23.056 27.731 33.616 40.951
How to nest ifelse statements to accommodate three conditions
I have a simple dataframe, my data, with two variables, A and B. Here's a sample of the first 100 rows: structure(list(A = c(0, 6, 35, 0, 99, 20, 3, 6, 80, 12, 23, 77, 28, 80, 18, 90, 12, 60, 99, 90, 1, 3, 99, 100, 24, 99, 0, 40, 0, 0, 99, 10, 23, 7, 99, 0, 76, 57, 99, 0, 21, 6, 0, 0, 0, 0, 0, 0, 25, 50, 0, 100, 35, 40, 25, 90, 10, 20, 25, 100, 0, 15, 98, 35, 85, 90, 0, 0, 90, 90, 90, 50, 45, 90, 20, 15, 85, 100, 90, 15, 90, 85, 15, 25, 35, 90, 10, 35, 35, 100, 20, 0, 60, 100, 19, 60, 0, 50, 50, 6), B = c(10, 14, 5, 25, 87, 12, 12, 5, 80, 87, 60, 78, 23, 60, 18, 45, 12, 34, 99, 70, 2, 21, 50, 57, 50, 70, 12, 18, 34, 34, 23, 45, 34, 12, 99, 29, 76, 34, 50, 12, 20, 12, 50, 45, 2, 5, 12, 34, 25, 25, 25, 90, 45, 25, 35, 80, 15, 15, 20, 80, 4, 45, 27, 15, 85, 20, 58, 25, 20, 58, 45, 45, 48, 80, 25, 10, 80, 45, 25, 10, 45, 65, 45, 25, 35, 87, 10, 13, 25, 45, 25, 15, 25, 85, 19, 40, 12, 45, 65, 10)), row.names = 52:151, class = "data.frame") I want to add a new column for variable P, but the calculation for P differs for three conditions. Such that... If A < B, then P is equal to (B - A)/(B - 1) If A > B, then P is equal to (A - B)/(100 - B) If A = B, then P is equal to 0 How do I apply this logic? I have attempted to use a nested ifelse function as follows: mydata$P <- ifelse(mydata$A < mydata$B, ((mydata$B-mydata$A)/(mydata$B - 1)), ifelse(mydata$A == mydata$B), 0, ((mydata$A-mydata$B)/(100 - mydata$B))) But it returns this error: Error in ifelse(mydata$A < mydata$B, ((mydata$B - mydata$A)/(mydata$B - : unused arguments (0, ((mydata$A - mydata$B)/(100 - mydata$B))) Where am I going wrong?
Here's an alternative: mydata$P <- with(mydata, ifelse(A < B, (B - A)/(B - 1), ifelse(A > B, (A - B)/(100 - B), 0)))
Here's a solution that uses case_when from dplyr, as I find it quite neat and tidy for structuring these sorts of statements. First, I define the data: # Define data frame df <- structure(list(A = c(0, 6, 35, 0, 99, 20, 3, 6, 80, 12, 23, 77, 28, 80, 18, 90, 12, 60, 99, 90, 1, 3, 99, 100, 24, 99, 0, 40, 0, 0, 99, 10, 23, 7, 99, 0, 76, 57, 99, 0, 21, 6, 0, 0, 0, 0, 0, 0, 25, 50, 0, 100, 35, 40, 25, 90, 10, 20, 25, 100, 0, 15, 98, 35, 85, 90, 0, 0, 90, 90, 90, 50, 45, 90, 20, 15, 85, 100, 90, 15, 90, 85, 15, 25, 35, 90, 10, 35, 35, 100, 20, 0, 60, 100, 19, 60, 0, 50, 50, 6), B = c(10, 14, 5, 25, 87, 12, 12, 5, 80, 87, 60, 78, 23, 60, 18, 45, 12, 34, 99, 70, 2, 21, 50, 57, 50, 70, 12, 18, 34, 34, 23, 45, 34, 12, 99, 29, 76, 34, 50, 12, 20, 12, 50, 45, 2, 5, 12, 34, 25, 25, 25, 90, 45, 25, 35, 80, 15, 15, 20, 80, 4, 45, 27, 15, 85, 20, 58, 25, 20, 58, 45, 45, 48, 80, 25, 10, 80, 45, 25, 10, 45, 65, 45, 25, 35, 87, 10, 13, 25, 45, 25, 15, 25, 85, 19, 40, 12, 45, 65, 10)), row.names = 52:151, class = "data.frame") Then, I apply case_when, like so: # Perform calculation df$P <- with(df, dplyr::case_when( A < B ~ (B - A)/(B - 1), A > B ~ (A - B)/(100 - B), A == B ~ 0 )) which gives df #> A B P #> 52 0 10 1.11111111 #> 53 6 14 0.61538462 #> 54 35 5 0.31578947 #> 55 0 25 1.04166667 #> 56 99 87 0.92307692 #> 57 20 12 0.09090909 #> 58 3 12 0.81818182 #> 59 6 5 0.01052632 #> 60 80 80 0.00000000 #> 61 12 87 0.87209302 #> 62 23 60 0.62711864 #> 63 77 78 0.01298701 #> 64 28 23 0.06493506 #> 65 80 60 0.50000000 #> 66 18 18 0.00000000 #> 67 90 45 0.81818182 #> 68 12 12 0.00000000 #> 69 60 34 0.39393939 #> 70 99 99 0.00000000 #> 71 90 70 0.66666667 #> 72 1 2 1.00000000 #> 73 3 21 0.90000000 #> 74 99 50 0.98000000 #> 75 100 57 1.00000000 #> 76 24 50 0.53061224 #> 77 99 70 0.96666667 #> 78 0 12 1.09090909 #> 79 40 18 0.26829268 #> 80 0 34 1.03030303 #> 81 0 34 1.03030303 #> 82 99 23 0.98701299 #> 83 10 45 0.79545455 #> 84 23 34 0.33333333 #> 85 7 12 0.45454545 #> 86 99 99 0.00000000 #> 
87 0 29 1.03571429 #> 88 76 76 0.00000000 #> 89 57 34 0.34848485 #> 90 99 50 0.98000000 #> 91 0 12 1.09090909 #> 92 21 20 0.01250000 #> 93 6 12 0.54545455 #> 94 0 50 1.02040816 #> 95 0 45 1.02272727 #> 96 0 2 2.00000000 #> 97 0 5 1.25000000 #> 98 0 12 1.09090909 #> 99 0 34 1.03030303 #> 100 25 25 0.00000000 #> 101 50 25 0.33333333 #> 102 0 25 1.04166667 #> 103 100 90 1.00000000 #> 104 35 45 0.22727273 #> 105 40 25 0.20000000 #> 106 25 35 0.29411765 #> 107 90 80 0.50000000 #> 108 10 15 0.35714286 #> 109 20 15 0.05882353 #> 110 25 20 0.06250000 #> 111 100 80 1.00000000 #> 112 0 4 1.33333333 #> 113 15 45 0.68181818 #> 114 98 27 0.97260274 #> 115 35 15 0.23529412 #> 116 85 85 0.00000000 #> 117 90 20 0.87500000 #> 118 0 58 1.01754386 #> 119 0 25 1.04166667 #> 120 90 20 0.87500000 #> 121 90 58 0.76190476 #> 122 90 45 0.81818182 #> 123 50 45 0.09090909 #> 124 45 48 0.06382979 #> 125 90 80 0.50000000 #> 126 20 25 0.20833333 #> 127 15 10 0.05555556 #> 128 85 80 0.25000000 #> 129 100 45 1.00000000 #> 130 90 25 0.86666667 #> 131 15 10 0.05555556 #> 132 90 45 0.81818182 #> 133 85 65 0.57142857 #> 134 15 45 0.68181818 #> 135 25 25 0.00000000 #> 136 35 35 0.00000000 #> 137 90 87 0.23076923 #> 138 10 10 0.00000000 #> 139 35 13 0.25287356 #> 140 35 25 0.13333333 #> 141 100 45 1.00000000 #> 142 20 25 0.20833333 #> 143 0 15 1.07142857 #> 144 60 25 0.46666667 #> 145 100 85 1.00000000 #> 146 19 19 0.00000000 #> 147 60 40 0.33333333 #> 148 0 12 1.09090909 #> 149 50 45 0.09090909 #> 150 50 65 0.23437500 #> 151 6 10 0.44444444 Created on 2019-08-08 by the reprex package (v0.3.0)
Alternatively, you could avoid using ifelse in the first place: mydata$P <- with(mydata, abs(B - A) / ((A <= B) * (B - 1) + (A >= B) * (100 - B))) NB: if A equals B, the numerator is zero and the denominator is 99 independent of the value of B, so there will be no issues trying to divide by zero.