Related
I get the following message of error when using janitor::adorn_totals("row"):
"Error in adorn_totals(., "row") :
trying to re-add a totals dimension that is already been added"
Here is the head of my dataset :
structure(list(code_1 = c("M01", "C03", "M99", "C05", "O01",
"C07"), regroupement_elsan = c("Gastro", "Ophtalmo", "Divers médecine",
"Gynéco", "Accouchements", "bouche et dents"), actes_2019 = c(9179,
5589, 6024, 4150, 4028, 3458), actes_2020 = c(7933, 4167, 3740,
2994, 3348, 2206), actes_2021 = c(6504, 5505, 4682, 3376, 3226,
3035), sejours_2019 = c(1631, 2502, 1028, 852, 1455, 1288), sejours_2020 = c(1335,
1819, 726, 574, 1371, 801), sejours_2021 = c(1109, 2416, 825,
657, 1259, 1106), tx_0_nuit_2019 = c("3.92397302268547", "90.7673860911271",
"32.9766536964981", "57.5117370892019", "0.206185567010309",
"98.9130434782609"), tx_0_nuit_2020 = c("3.29588014981273", "92.9081913139087",
"47.1074380165289", "59.581881533101", "0.291757840991977", "99.250936329588"
), tx_0_nuit_2021 = c("3.6068530207394", "95.4470198675497",
"18.3030303030303", "60.2739726027397", "0.158856235107228",
"98.7341772151899"), pourcentage = c(5.37796226165473, 4.55191916519208,
3.87140518282095, 2.79151300666457, 2.66748251170021, 2.50955034811226
), pourcentage_cumule = c(78.4062908267046, 82.9582099918967,
86.8296151747176, 89.6211281813822, 92.2886106930824, 94.7981610411947
)), row.names = c(NA, -6L), class = c("tabyl", "tbl_df", "tbl",
"data.frame"), core = structure(list(code_1 = c("M01b", "C01",
"C02", "C04", "M01", "C03", "M99", "C05", "O01", "C07", "C08",
"C99", "C98", "C10", "C06", "M03", "O02", "M02", "M04", "C01b",
"O03", "S99", "***", "C10b", "M05", "M98", "O04"), regroupement_elsan = c("Endoscopies
digestives",
"Ortho (+ rhumato et rachis)", "Chirurgie digestive", "Uro-néphro",
"Gastro", "Ophtalmo", "Divers médecine", "Gynéco", "Accouchements",
"bouche et dents", "Tissus mou et chir plastique", "Divers chir",
"Chir esth et hors sécu", "Chir thoracique et vasculaire", "ORL Stomato sf bouche et
dent",
"Pneumologie", "Obstétrique autre (hors IVG)", "Cardio Vasc (médecine)",
"Neurologie", "Rachis", "IVG", "Séances autres", "Autres", "Chir thoracique",
"Soins palliatifs", "Vasculaire interventionnel", "Néo nat"),
actes_2019 = c(36079, 29520, 14618, 6515, 9179, 5589, 6024,
4150, 4028, 3458, 2137, 2180, 575, 449, 866, 388, 294, 311,
714, 395, 292, 1842, 10, 0, 4, 0, 1), actes_2020 = c(30192,
25451, 12845, 7376, 7933, 4167, 3740, 2994, 3348, 2206, 2107,
1477, 575, 437, 337, 897, 193, 218, 267, 308, 118, 737, 8,
4, 0, 11, 5), actes_2021 = c(42333, 24055, 13735, 8196, 6504,
5505, 4682, 3376, 3226, 3035, 2571, 1134, 689, 511, 352,
272, 181, 161, 138, 106, 82, 61, 18, 8, 7, 0, 0), sejours_2019 = c(6992,
5493, 2577, 1221, 1631, 2502, 1028, 852, 1455, 1288, 540,
397, 236, 158, 260, 63, 148, 101, 90, 44, 246, 1820, 4, 0,
1, 0, 1), sejours_2020 = c(5811, 4946, 2220, 1220, 1335,
1819, 726, 574, 1371, 801, 554, 269, 221, 140, 94, 42, 109,
79, 58, 34, 98, 720, 2, 1, 0, 1, 5), sejours_2021 = c(7922,
5144, 2523, 1451, 1109, 2416, 825, 657, 1259, 1106, 649,
264, 278, 162, 111, 51, 108, 69, 30, 21, 77, 54, 7, 1, 2,
0, 0), tx_0_nuit_2019 = c("96.0955377574371", "63.5718186783179",
"41.4435389988359", "36.2817362817363", "3.92397302268547",
"90.7673860911271", "32.9766536964981", "57.5117370892019",
"0.206185567010309", "98.9130434782609", "72.5925925925926",
"53.904282115869", "13.9830508474576", "96.2025316455696",
"50.7692307692308", "42.8571428571429", "85.1351351351351",
"72.2772277227723", "11.1111111111111", "4.54545454545455",
"100,0", "100,0", "100,0", "0,0", "0,0", "0,0", "0,0"), tx_0_nuit_2020 =
c("96.0936155567028",
"67.3069146785281", "40.5855855855856", "34.344262295082",
"3.29588014981273", "92.9081913139087", "47.1074380165289",
"59.581881533101", "0.291757840991977", "99.250936329588",
"76.3537906137184", "49.814126394052", "11.7647058823529",
"99.2857142857143", "53.1914893617021", "16.6666666666667",
"74.3119266055046", "81.0126582278481", "25.8620689655172",
"8.82352941176471", "98.9795918367347", "100,0", "100,0",
"100,0", "0,0", "0,0", "20,0"), tx_0_nuit_2021 = c("96.7053774299419",
"73.2892690513219", "51.0503369005153", "41.9021364576154",
"3.6068530207394", "95.4470198675497", "18.3030303030303",
"60.2739726027397", "0.158856235107228", "98.7341772151899",
"83.9753466872111", "60.2272727272727", "50,0", "94.4444444444444",
"72.972972972973", "1.96078431372549", "81.4814814814815",
"85.5072463768116", "43.3333333333333", "52.3809523809524",
"100,0", "100,0", "100,0", "100,0", "0,0", "0,0", "0,0")), row.names = c(NA,
-27L), class = "data.frame"), tabyl_type = "two_way", totals = "row")
And the code I tried :
library(janitor)
autres %>%
adorn_totals("row")
Could anyone help ? I had indeed used the adorn_totals function on the dataframe used to generate the dataframe "autres", but I made sure the row "total" isn't in the dataframe "autres" anymore.
With the object you have shared as x:
x %>%
untabyl() %>%
adorn_totals()
Why it works:
You can see at the end of the object you shared, tabyl_type = "two_way", totals = "row". Those attributes are stored with the data.frame you're working with. When you try to adorn_totals() a second time, janitor checks this and errors.
When you call untabyl() it strips those attributes. Then adorn_totals() succeeds.
I notice you have a cumulative percentage column. If desired, you can control exactly which columns get a totals value in adorn_totals() - see ?adorn_totals and the ... argument for how, and here's an example: https://stackoverflow.com/a/69759313.
I have this dataset listed below that will be used for my question below.
Data<-read.table(file=file.choose(),header=T)
Data;
VARIABLE TYPE NGENES BETA BETA_STD SE P
black SET 43 -0.049246 -0.0078434 0.14654 0.63156
blue SET 152 -0.080217 -0.023193 0.08137 0.83781
brown SET 163 -0.057881 -0.017266 0.079054 0.76791
cyan SET 42 0.1498 0.023586 0.14128 0.1446
darkgreen SET 2 -0.65338 -0.022727 0.67635 0.83292
green SET 172 -0.13458 -0.041115 0.073527 0.96631
greenyellow SET 40 0.026733 0.0041104 0.14624 0.42749
grey SET 4 0.16388 0.0080567 0.53064 0.37874
grey60 SET 23 -0.1455 -0.017054 0.20066 0.76576
lightcyan SET 41 0.083008 0.012918 0.15225 0.29284
magenta SET 32 -0.10777 -0.014858 0.16601 0.74184
midnightblue SET 23 0.00024188 2.84E-05 0.19544 0.49951
pink SET 64 -0.017662 -0.0034093 0.12521 0.55608
purple SET 60 0.12025 0.022504 0.12624 0.17048
red SET 73 0.40737 0.083745 0.11427 0.00018742
royalblue SET 7 -0.27895 -0.018125 0.36009 0.78067
salmon SET 170 0.040831 0.01241 0.076001 0.29559
turquoise SET 450 0.027806 0.012383 0.050585 0.29131
With this dataset I am wanting to create several rectangles on a plot what are each color coded and have a pvalue labeled on top of the rectangle. I am wanting to loop through the VARIABLE column and for each rectangle assign a color. Furthermore, I want to loop through the P column and write the P value on top of each rectangle. Thus for each row in the dataset, the color and p value should be the same. This is the script I am trying right now. I am not seeing how to loop the associated columns with this script. Any help would be nice.
coords <- matrix(
c(100, 300, 110, 310,
120, 300, 130, 310,
140, 300, 150, 310,
160, 300, 170, 310,
180, 300, 190, 310,
100, 320, 110, 330,
120, 320, 130, 330,
140, 320, 150, 330,
160, 320, 170, 330,
180, 320, 190, 330,
100, 340, 110, 350,
120, 340, 130, 350,
140, 340, 150, 350,
160, 340, 170, 350,
180, 340, 190, 350,
100, 360, 110, 370,
120, 360, 130, 370,
140, 360, 150, 370),
ncol=4,byrow=TRUE)
plot(c(100, 200), c(300, 450), type = "n",
main = "Test")
rfun <- function(x,i) {
do.call(rect,as.list(x))
}
apply(coords,1,rfun)
text((coords[,1]+coords[,3])/2,
(coords[,2]+coords[,4])/2,
seq(nrow(coords)))
I am not sure, but maybe you want something like this?
DF <- structure(list(VARIABLE = c("black", "blue", "brown", "cyan",
"darkgreen", "green", "greenyellow", "grey", "grey60", "lightcyan",
"magenta", "midnightblue", "pink", "purple", "red", "royalblue",
"salmon", "turquoise"),
TYPE = c("SET", "SET", "SET", "SET",
"SET", "SET", "SET", "SET", "SET", "SET", "SET", "SET", "SET",
"SET", "SET", "SET", "SET", "SET"),
NGENES = c(43L, 152L, 163L, 42L, 2L, 172L, 40L, 4L, 23L, 41L, 32L, 23L,
64L, 60L, 73L, 7L, 170L, 450L),
BETA = c(-0.049246, -0.080217, -0.057881, 0.1498, -0.65338, -0.13458,
0.026733, 0.16388, -0.1455, 0.083008, -0.10777, 0.00024188,
-0.017662, 0.12025, 0.40737, -0.27895, 0.040831, 0.027806),
BETA_STD = c(-0.0078434, -0.023193, -0.017266, 0.023586, -0.022727,
-0.041115, 0.0041104, 0.0080567, -0.017054, 0.012918,
-0.014858, 2.84e-05, -0.0034093, 0.022504, 0.083745,
-0.018125, 0.01241, 0.012383),
SE = c(0.14654, 0.08137, 0.079054, 0.14128, 0.67635, 0.073527, 0.14624,
0.53064, 0.20066, 0.15225, 0.16601, 0.19544, 0.12521, 0.12624,
0.11427, 0.36009, 0.076001, 0.050585),
P = c(0.63156, 0.83781, 0.76791, 0.1446, 0.83292, 0.96631, 0.42749,
0.37874, 0.76576, 0.29284, 0.74184, 0.49951, 0.55608, 0.17048,
0.00018742, 0.78067, 0.29559, 0.29131)),
class = "data.frame",
row.names = c(NA, -18L))
coords <- matrix(
c(100, 300, 110, 310,
120, 300, 130, 310,
140, 300, 150, 310,
160, 300, 170, 310,
180, 300, 190, 310,
100, 320, 110, 330,
120, 320, 130, 330,
140, 320, 150, 330,
160, 320, 170, 330,
180, 320, 190, 330,
100, 340, 110, 350,
120, 340, 130, 350,
140, 340, 150, 350,
160, 340, 170, 350,
180, 340, 190, 350,
100, 360, 110, 370,
120, 360, 130, 370,
140, 360, 150, 370),
ncol=4,byrow=TRUE)
rfun <- function(x, i) do.call(rect, c(as.list(x), border = i))
plot(c(100, 200), c(300, 450), type = "n",
main = "Test")
invisible(sapply(seq_len(nrow(DF)),
function(y) do.call(rect, c(as.list(coords[y,]), border = DF$VARIABLE[y]))))
text((coords[,1]+coords[,3])/2,
(coords[,2]+coords[,4])/2,
round(DF$P, 2))
Created on 2020-08-04 by the reprex package (v0.3.0)
I have a data frame R_m which looks like this:
data frame R_m
I have used pivot_longer to modify the data for ggplot and then print it:
R_m2 <- R_m %>%
pivot_longer(names_to = "per", values_to="ind", cols=-sim, names_ptypes=list(per=integer()))
ggplot(R_m2, aes(x=per, y=ind,color=sim, group=sim))+geom_line() +
theme(legend.position = "none")
Now, I would like to add a line for mean, median and some quantiles in the graph. Before, I got this data from elsewhere and imported it as a data frame 'stat' into R. It looks like this:
median, mean and quantile
which plotted nicely by adding
geom_line(data=subset(stat,sim=="Median"),colour="black", size = 1)
Outcome looks like this:
Now I would like to achieve the same in R. Creating an array of medians (where median is taken across each column in R_m) was straightforward to do. But then my first stumbling block was that using rbind, cbind etc. I could not create a data frame that looks like 'stat'. I could not rename the zero column to "sim", so the geom_line command above no longer works. Anyway, this requires a lot of data manipulation. Is there a more efficient way of adding a median, mean and percentile lines to a graph?
structure(list(sim = c(166, 37, 163, 65, 95, 92, 98, 168, 19,
157, 177, 200, 115, 177, 149, 130, 66, 114, 96, 12, 138, 39,
80, 33, 157, 107, 180, 159, 166, 14, 126, 67, 190, 86, 147, 182,
43, 5, 109, 141, 53, 186, 49, 68, 168, 107, 67, 28, 158, 178),
per = c(407L, 1763L, 2158L, 1608L, 836L, 1638L, 285L, 1978L,
45L, 1927L, 192L, 1517L, 163L, 789L, 1989L, 2478L, 2410L,
2445L, 1532L, 181L, 1489L, 1434L, 2515L, 676L, 1503L, 2458L,
732L, 1266L, 1705L, 1852L, 1543L, 1568L, 41L, 1992L, 600L,
1314L, 33L, 199L, 370L, 46L, 1171L, 1173L, 2048L, 994L, 836L,
372L, 2374L, 1414L, 1628L, 1188L), ind = c(97.7428137181456,
100.462039003802, 95.2793483563514, 98.3721036305918, 99.0584691732282,
103.301132288618, 102.428408453689, 100.387198613893, 99.7888039221544,
101.017059784079, 106.12288506898, 102.636429823681, 93.8144062244855,
104.280572544198, 97.3182467653953, 96.5603916025096, 96.3529141792467,
98.3149638711415, 98.3629878947972, 94.6106342501915, 99.6835722307572,
98.1716050345778, 103.055895201755, 100.054976695486, 96.1369802984859,
98.5257212288309, 98.8568719059079, 102.900859147552, 99.37215427561,
102.623437273663, 104.128600607447, 102.673062489082, 100.368131206055,
98.3487549118012, 96.4401682804699, 96.4407823981984, 97.9413312935541,
102.122624393907, 98.2979203190445, 101.018531709501, 100.444354410774,
101.118257199515, 100.867412455804, 98.9923953588876, 100.417977446024,
102.21423103019, 102.296794518966, 99.9367239162818, 102.314273028354,
100.80711113148)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
This question aims to receive feedback to make a function more efficient. Apologies for a long, case specific post.
I created a function that calculates percentages in estimates of the American Community Survey (ACS). Because estimates in the ACS have margins of error, calculating percentages (e.g. % of the total population being below 17 years old) requires the recalculation of the error that results from dividing the estimate of both variables (population below 17 / total population).
So to calculate the new margin of error for a proportion calculated as p = estimate_a/estimate_b, the formula to use is MOE(p) = (1/estimate_b)*sqrt((MOE_b^2)-(p^2*MOE_a^2)). If the value inside of the square root was negative, then the substraction should be changed to a sum, with the formula becoming MOE(p) = (1/estimate_b)*sqrt((MOE_b^2)+(p^2*MOE_a^2)). If the result of p = estimate_a/estimate_b is 1, the documentation suggest calculating MOE using another formula: MOE(p) = MOE_a/estimate_b
To make these calculations, I created a function that takes a data frame with estimates and their MOEs, calculates the proportion between two specified variables, and writes two new columns in the original dataframe - one with the proportion, and another one with its margin of error. The function loops through the rows of the data frame carrying out if-else checks to determine what formula to apply, including skipping rows that might have NA values. The original data on which I apply this function is considerably long - ~250000 rows, and the structure of this function makes it go very slowly. Hence, the question is whether there are ways to improve the quality of this code to improve its speed. The function and dummy data are provided below:
percent_calculator <- function(DF, A_e, B_e, A_se, B_se, New_fn){
# arguments legend >> DF = data frame; A_e = estimate_a (string of the fieldname); B_e = estimate_b (string of the fieldname);
# A_se = MOE_a (string of the fieldname); B_se = MOE_b (string of the fieldname); New_fn = root for new fieldname in the data frame (string)
pb<- txtProgressBar(min = 0, max = nrow(DF), initial = 0) # progress bar initialization
for (i in 1:nrow(DF)){ # for loop that iterates through the rows of the DF
setTxtProgressBar(pb,i)
if(is.na(DF[[A_e]][i])==FALSE & is.na(DF[[B_e]][i])==FALSE){ # check if any of the estimates used to calculate the proportion is NA (if so, skip)
if (DF[[B_e]][i]!= 0){ # check if estimate_b is not 0, to avoid creating inf values from A_e/B-e
DF[[paste0(New_fn, "_e")]][i] <- (DF[[A_e]][i]/DF[[B_e]][i])
if(DF[[paste0(New_fn, "_e")]][i] == 1){ # check if P==1 to then use the appropiate formula for MOE
DF[[paste0(New_fn, "_se")]][i] <- (DF[[A_se]][i]/DF[[B_e]][i])
} else {
if((DF[[A_se]][i]^2)-(DF[[paste0(New_fn, "_e")]][i]^2)*(DF[[B_se]][i]^2)>= 0){ # check for the sign of the value inside of the square root
DF[[paste0(New_fn, "_se")]][i] <- (1/DF[[B_e]][i])*sqrt((DF[[A_se]][i]^2)-(DF[[paste0(New_fn, "_e")]][i]^2)*(DF[[B_se]][i]^2))
} else {
DF[[paste0(New_fn, "_se")]][i] <- (1/DF[[B_e]][i])*sqrt((DF[[A_se]][i]^2)+(DF[[paste0(New_fn, "_e")]][i]^2)*(DF[[B_se]][i]^2))
}
}
} else { # assign 0 value if B_e was 0
DF[[paste0(New_fn, "_e")]][i] <- 0
DF[[paste0(New_fn, "_se")]][i] <- 0
}
} else { # assign NA if any of the estimates was NA
DF[[paste0(New_fn, "_e")]][i] <- NA
DF[[paste0(New_fn, "_se")]][i] <- NA
}
DF[[paste0(New_fn, "_e")]][i] <- DF[[paste0(New_fn, "_e")]][i]*100 # switch from proportion to percentage in the estimate value
DF[[paste0(New_fn, "_se")]][i] <- DF[[paste0(New_fn, "_se")]][i]*100 # switch from proportion to percentage in the MOE value
}
return(DF)
}
Dummy <- structure(list(TotPop_e = c(636L, 1287L, 810L, 1218L, 2641L,
835L, 653L, 1903L, 705L, 570L, 2150L, 6013L, 1720L, 2555L, 1150L,
2224L, 1805L, 728L, 2098L, 3099L, 4194L, 1909L, 2401L, 1446L,
1345L, 1573L, 2037L, 634L, 1916L, 1522L, 592L, 831L, 577L, 2196L,
1482L, 1436L, 1668L, 3095L, 3677L, 2641L, 1285L, 932L, 2461L,
1609L, 1143L, 1617L, 1075L, 1280L, 838L, 1447L, 3941L, 2402L,
1130L, 851L, 10316L, 9576L, 2396L, 3484L, 5688L, 2200L, 1856L,
1441L, 2539L, 3056L, 1325L, 2454L, 2010L, 2340L, 1448L, 2435L,
2782L, 3633L, 1766L, 2564L, 1473L, 1214L, 1951L, 2561L, 4262L,
2576L, 4257L, 2314L, 2071L, 3182L, 1839L, 2214L, 1101L, 1898L,
790L, 867L, 1764L, 970L, 1320L, 2850L, 1019L, 1483L, 3720L, 2215L,
3581L, 3391L), TotPop_se = c(132.522796352584, 149.544072948328,
127.051671732523, 130.091185410334, 232.826747720365, 135.562310030395,
100.303951367781, 176.29179331307, 114.285714285714, 96.6565349544073,
339.817629179331, 438.297872340425, 245.592705167173, 324.012158054711,
333.130699088146, 224.924012158055, 321.580547112462, 169.604863221885,
175.075987841945, 469.908814589666, 375.075987841945, 411.550151975684,
378.115501519757, 235.258358662614, 241.337386018237, 291.793313069909,
337.386018237082, 138.601823708207, 145.896656534954, 193.920972644377,
135.562310030395, 117.325227963526, 244.984802431611, 318.54103343465,
207.90273556231, 200, 279.635258358663, 657.750759878419, 401.215805471125,
401.823708206687, 229.787234042553, 139.817629179331, 303.951367781155,
201.215805471125, 200, 252.887537993921, 356.838905775076, 241.945288753799,
238.297872340426, 267.477203647416, 320.9726443769, 255.31914893617,
178.115501519757, 116.109422492401, 891.793313069909, 766.565349544073,
255.31914893617, 463.22188449848, 448.632218844985, 367.781155015198,
269.300911854103, 261.398176291793, 286.93009118541, 446.808510638298,
224.316109422492, 212.158054711246, 233.434650455927, 304.559270516717,
356.231003039514, 275.379939209726, 330.699088145897, 368.996960486322,
248.024316109423, 310.030395136778, 153.799392097264, 243.768996960486,
265.65349544073, 337.386018237082, 436.474164133739, 359.270516717325,
344.072948328268, 196.960486322188, 231.003039513678, 356.231003039514,
212.158054711246, 348.328267477204, 206.079027355623, 240.729483282675,
196.352583586626, 141.033434650456, 215.80547112462, 127.659574468085,
248.024316109423, 589.057750759878, 231.61094224924, 486.93009118541,
605.471124620061, 713.06990881459, 488.753799392097, 382.370820668693
), Under17_se = c(35.8095476596307, 50.9877853224243, 50.0994474845873,
44.7376765786604, 113.994325548832, 59.7386237841673, 22.7862186188344,
95.1285234870203, 42.3093316505904, 35.4621507988699, 143.021311606928,
205.334390935311, 102.292167403598, 115.712493289527, 88.9617416652971,
98.0345650964952, 149.50823698925, 40.0016629212452, 86.7428425216985,
158.047696828218, 173.225615182675, 144.710221534209, 121.094774232467,
76.9999466678128, 88.9160360898593, 97.7665610480423, 133.02517642826,
30.4983051540691, 83.3625069421341, 75.7125713164268, 50.3826325227805,
37.5622898620679, 7.29483282674772, 122.185425418875, 83.4644035953588,
63.8384709681463, 99.5458131127046, 208.446825330589, 150.282359742524,
206.017151858922, 87.7761872483956, 56.194023821941, 120.701992909334,
50.6423479626955, 55.4225960853081, 93.2888100499867, 126.879946773287,
143.069104861932, 86.7747884744339, 79.4517480028886, 140.260959630942,
125.115775875384, 52.187662082273, 38.1819057688564, 365.828168907497,
380.635956883794, 135.735302000757, 213.321896356121, 198.507936644685,
126.535797699776, 141.516048792542, 114.238818548927, 117.737122860635,
165.644292987747, 71.238834852709, 93.0825940979755, 41.8438489710712,
97.0666682368976, 86.5060758100772, 92.8659724484427, 76.6536183156139,
192.822109819002, 101.83958502542, 139.341067042001, 55.3992539361667,
92.106793773051, 78.2330906844691, 115.177918141833, 207.546042154974,
139.609995160777, 153.568552211039, 73.5738128652025, 112.249861520572,
171.38868664475, 66.0687084216098, 181.939713349267, 28.4417934718288,
90.1132509720827, 57.4202669424023, 46.8440239496863, 80.4799857926917,
42.6875862955885, 81.3500156027725, 142.669475129055, 23.4653605661019,
191.159072511375, 159.615857998832, 191.592580855392, 184.123292172321,
125.375425911215), Under17_e = c(123, 284, 189, 228, 661, 180,
49, 500, 121, 115, 686, 1456, 385, 578, 302, 476, 738, 124, 527,
803, 1219, 459, 614, 218, 229, 422, 543, 69, 536, 306, 149, 80,
0, 520, 281, 270, 454, 669, 905, 978, 282, 178, 630, 187, 145,
367, 327, 577, 225, 246, 966, 629, 211, 65, 2857, 3051, 592,
1162, 1322, 464, 490, 264, 576, 617, 326, 695, 169, 381, 309,
476, 355, 915, 431, 869, 269, 358, 335, 650, 1443, 561, 900,
411, 759, 1265, 171, 833, 45, 255, 134, 144, 339, 203, 388, 413,
66, 416, 654, 565, 700, 362)), row.names = c(NA, 100L), class = "data.frame")
# example run to calculate pct of people below 17
Dummy <- percent_calculator(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
You don't need that loop at all. All your operations are simple arithmetic that can take vectors, instead of single values. This is called vectorization. You then implement your logic tree with a nested ifelse. ifelse does compute all three possible outcomes (which is a bit unnecessary), but that is very much worth it in this case. If you want to optimize further have a look here: Is `if` faster than ifelse?
Timings at the bottom.
percent_calculator_vectorized <- function(DF, A_e, B_e, A_se, B_se, New_fn){
# arguments legend >> DF = data frame; A_e = estimate_a (string of the fieldname); B_e = estimate_b (string of the fieldname);
# A_se = MOE_a (string of the fieldname); B_se = MOE_b (string of the fieldname); New_fn = root for new fieldname in the data frame (string)
e_name <- paste0(New_fn, "_e")
se_name <- paste0(New_fn, "_se")
DF[[e_name]] <- DF[[A_e]] / DF[[B_e]]
DF[[se_name]] <- ifelse(
DF[[e_name]] == 1, # check if P==1 to then use the appropriate formula for MOE
DF[[A_se]] / DF[[B_e]],
ifelse(
(DF[[A_se]]^2)-(DF[[e_name]]^2)*(DF[[B_se]]^2)>= 0, # check for the sign of the value inside of the square root
(1/DF[[B_e]])*sqrt((DF[[A_se]]^2)-(DF[[e_name]]^2)*(DF[[B_se]]^2)),
(1/DF[[B_e]])*sqrt((DF[[A_se]]^2)+(DF[[e_name]]^2)*(DF[[B_se]]^2))
)
)
# assign 0 value if B_e was 0
DF[DF[[B_e]] == 0 & !is.na(DF[[B_e]]), c(e_name, se_name)] <- 0
# switch from proportion to percentage in the estimate value
DF[, c(e_name, se_name)] <- DF[, c(e_name, se_name)] * 100
return(DF)
}
Dummy2 <- percent_calculator(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
Dummy3 <- percent_calculator_vectorized(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
all.equal(Dummy2, Dummy3) #TRUE
Timings:
bench::mark(
orig = percent_calculator(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17"),
vect = percent_calculator_vectorized(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17"),
)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <list>
1 orig 17.2ms 18.5ms 53.1 331.2KB 14.8 18 5 339ms <df[,6] … <df[,3] … <bch… <tibble …
2 vect 157.4µs 168µs 5700. 19.4KB 11.6 2450 5 430ms <df[,6] … <df[,3] … <bch… <tibble …
Speed up for this small dataset is ~100x, also with a ~10x smaller memory footprint.
I see you have a working solution but for my own reasons I wanted to try a tidyverse solution. This one is just about as fast as the base R solution and for me would be easier to maintain. I also added some more oddities to your toy data to make sure I caught the edge cases.
library(dplyr)
ACS_recalculator <- function(DF, A_e, B_e, A_se, B_se, New_fn){
e_name <- paste0(New_fn, "_e")
se_name <- paste0(New_fn, "_se")
A_ex <- ensym(A_e)
B_ex <- ensym(B_e)
A_sex <- ensym(A_se)
B_sex <- ensym(B_se)
DF <-
DF %>%
mutate(e_value = ifelse(!!B_ex != 0, !!A_ex / !!B_ex, 0),
se_value = case_when(
!!B_ex == 0 ~ 0,
e_value == 1 ~ !!A_sex / !!B_ex,
((!!A_sex)^2) - (e_value^2 * ((!!B_sex)^2)) >= 0 ~ (1/!!B_ex) * sqrt(((!!A_sex)^2) - (e_value^2) * ((!!B_sex)^2)),
((!!A_sex)^2) - (e_value^2 * ((!!B_sex)^2)) < 0 ~ (1/!!B_ex) * sqrt(((!!A_sex)^2) + (e_value^2) * ((!!B_sex)^2)),
TRUE ~ NA_real_),
e_value = e_value * 100,
se_value = se_value * 100) %>%
rename(!!e_name := e_value,
!!se_name := se_value)
return(DF)
}
Dummy2 <- ACS_recalculator(Dummy2, "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
head(Dummy2)
#> TotPop_e TotPop_se Under17_se Under17_e P_Bel17_e P_Bel17_se
#> 1 636 132.5228 35.80955 123 19.33962 3.932255
#> 2 1287 149.5441 50.98779 284 22.06682 3.020104
#> 3 810 127.0517 50.09945 189 23.33333 4.986043
#> 4 1218 130.0912 44.73768 228 18.71921 3.081212
#> 5 2641 232.8267 113.99433 661 25.02840 3.709747
#> 6 835 135.5623 59.73862 180 21.55689 6.239876
Your original example data with more missings and zeros
Dummy2 <- structure(list(TotPop_e = c(636L, 1287L, 810L, 1218L, 2641L,
835L, 653L, 1903L, 0L, 570L, 2150L, 6013L, 1720L, 2555L, 1150L,
2224L, 1805L, 728L, 2098L, 3099L, 4194L, 1909L, 2401L, 1446L,
1345L, 1573L, 2037L, 634L, 1916L, 1522L, 592L, 831L, 577L, 2196L,
1482L, 1436L, 1668L, 3095L, 3677L, 2641L, 1285L, 932L, 2461L,
1609L, 1143L, 1617L, 1075L, 1280L, 838L, 1447L, 3941L, 2402L,
1130L, 851L, 10316L, 9576L, 2396L, 3484L, 5688L, 2200L, 1856L,
1441L, 2539L, 3056L, 1325L, 2454L, 2010L, 2340L, 1448L, 2435L,
2782L, 3633L, 1766L, 2564L, 1473L, 1214L, 1951L, 2561L, 4262L,
2576L, 4257L, 2314L, 2071L, 3182L, 1839L, 2214L, NA, 1898L,
790L, 867L, 1764L, 970L, 1320L, 2850L, 1019L, 1483L, 3720L, 2215L,
3581L, 3391L), TotPop_se = c(132.522796352584, 149.544072948328,
127.051671732523, 130.091185410334, 232.826747720365, 135.562310030395,
100.303951367781, 176.29179331307, 114.285714285714, 0,
339.817629179331, 438.297872340425, 245.592705167173, 324.012158054711,
333.130699088146, 224.924012158055, 321.580547112462, 169.604863221885,
175.075987841945, 469.908814589666, 375.075987841945, 411.550151975684,
378.115501519757, 235.258358662614, 241.337386018237, 291.793313069909,
337.386018237082, 138.601823708207, 145.896656534954, 193.920972644377,
135.562310030395, 117.325227963526, 244.984802431611, 318.54103343465,
207.90273556231, 200, 279.635258358663, 657.750759878419, 401.215805471125,
401.823708206687, 229.787234042553, 139.817629179331, 303.951367781155,
201.215805471125, 200, 252.887537993921, 356.838905775076, 241.945288753799,
238.297872340426, 267.477203647416, 320.9726443769, 255.31914893617,
178.115501519757, 116.109422492401, NA, 766.565349544073,
255.31914893617, 463.22188449848, 448.632218844985, 367.781155015198,
269.300911854103, 261.398176291793, 286.93009118541, 446.808510638298,
224.316109422492, 212.158054711246, 233.434650455927, 304.559270516717,
356.231003039514, 275.379939209726, 330.699088145897, 368.996960486322,
248.024316109423, 310.030395136778, 153.799392097264, 243.768996960486,
265.65349544073, 337.386018237082, 436.474164133739, 359.270516717325,
344.072948328268, 196.960486322188, 231.003039513678, 356.231003039514,
212.158054711246, 348.328267477204, 206.079027355623, 240.729483282675,
196.352583586626, 141.033434650456, 215.80547112462, 127.659574468085,
248.024316109423, 589.057750759878, 231.61094224924, 486.93009118541,
605.471124620061, 713.06990881459, 488.753799392097, 382.370820668693
), Under17_se = c(35.8095476596307, 50.9877853224243, 50.0994474845873,
44.7376765786604, 113.994325548832, 59.7386237841673, 22.7862186188344,
95.1285234870203, 42.3093316505904, 35.4621507988699, 143.021311606928,
205.334390935311, 102.292167403598, 115.712493289527, 88.9617416652971,
98.0345650964952, 149.50823698925, 40.0016629212452, 86.7428425216985,
158.047696828218, 173.225615182675, 144.710221534209, 121.094774232467,
76.9999466678128, 88.9160360898593, 97.7665610480423, 133.02517642826,
30.4983051540691, 83.3625069421341, 75.7125713164268, 50.3826325227805,
37.5622898620679, 7.29483282674772, 122.185425418875, 83.4644035953588,
63.8384709681463, 99.5458131127046, 208.446825330589, 150.282359742524,
206.017151858922, 87.7761872483956, 56.194023821941, 120.701992909334,
50.6423479626955, 55.4225960853081, 93.2888100499867, 126.879946773287,
143.069104861932, 86.7747884744339, 79.4517480028886, 140.260959630942,
125.115775875384, 52.187662082273, 38.1819057688564, 365.828168907497,
380.635956883794, 135.735302000757, 213.321896356121, 198.507936644685,
126.535797699776, 141.516048792542, 114.238818548927, 117.737122860635,
165.644292987747, 71.238834852709, 93.0825940979755, 41.8438489710712,
97.0666682368976, 86.5060758100772, 92.8659724484427, 76.6536183156139,
192.822109819002, 101.83958502542, 139.341067042001, 55.3992539361667,
92.106793773051, 78.2330906844691, 115.177918141833, 207.546042154974,
139.609995160777, 153.568552211039, 73.5738128652025, 112.249861520572,
171.38868664475, 66.0687084216098, 181.939713349267, 28.4417934718288,
90.1132509720827, 57.4202669424023, 46.8440239496863, 80.4799857926917,
42.6875862955885, 81.3500156027725, 142.669475129055, 23.4653605661019,
191.159072511375, 159.615857998832, 191.592580855392, 184.123292172321,
125.375425911215), Under17_e = c(123, 284, 189, 228, 661, 180,
49, 500, 121, 115, 686, 1456, 385, 578, 302, 476, 738, 124, 527,
803, 1219, 459, 614, 218, 229, 422, 543, 69, 536, 306, 149, 80,
0, 520, 281, 270, 454, 669, 905, 978, 282, 178, 630, 187, 145,
367, 327, 577, 225, 246, 966, 629, 211, 65, 2857, 3051, 592,
1162, 1322, 464, 490, 264, 576, 617, 326, 695, 169, 381, 309,
476, 355, 915, 431, 869, 269, 358, 335, 650, 1443, 561, 900,
411, 759, 1265, 171, 833, 45, 255, 134, 144, 339, 203, 388, 413,
66, 416, 654, 565, 700, 362)), row.names = c(NA, 100L), class = "data.frame")
I have a have time-series at 10 minutes duration. I want sub-series of duration between 23:10:00 - 00:00:00 hours. Here is the dput of data,
df<-structure(c(994, 1019, 1381, 843, 1105, 1120, 869, 2216, 1741,
1737, 1727, 1462, 1564, 418, 281, 280, 277, 311, 242, 221, 328,
359, 410, 436, 359, 1738, 2075, 1766, 1812, 1810, 1246, 323,
250, 272, 283, 286, 252, 1671, 1695, 1687, 1646, 1257, 1632,
277, 305, 292, 261, 309, 304, 209, 210, 225, 201, 197, 247, 264,
238, 260, 254, 263, 226, 624, 1955, 1561, 1231, 976, 1213, 167,
1037, 1269, 1619, 1749, 1674, 1123, 1695, 2164, 1780, 1732, 1715,
283, 230, 291, 281, 137, 1358, 1630, 1626, 1889, 1635, 1591,
1606, 2024, 1783, 1752, 613, 301, 933, 1823, 1831, 1810, 1895,
1876, 1222, 1952, 1288, 282, 261, 296, 839, 1831, 1799, 1950,
2085, 1921, 1862, 1885, 1869, 1909, 1896, 1843), .Dim = c(120L,
1L), .Dimnames = list(NULL, "value"), index = structure(c(1430764200,
1430847600, 1430848200, 1430848800, 1430849400, 1430850000, 1430850600,
1430934000, 1430934600, 1430935200, 1430935800, 1430936400, 1430937000,
1431020400, 1431021000, 1431021600, 1431022200, 1431022800, 1431023400,
1431106800, 1431107400, 1431108000, 1431108600, 1431109200, 1431109800,
1431193200, 1431193800, 1431194400, 1431195000, 1431195600, 1431196200,
1431279600, 1431280200, 1431280800, 1431281400, 1431282000, 1431282600,
1431366000, 1431366600, 1431367200, 1431367800, 1431368400, 1431369000,
1431452400, 1431453000, 1431453600, 1431454200, 1431454800, 1431455400,
1431538800, 1431539400, 1431540000, 1431540600, 1431541200, 1431541800,
1431625200, 1431625800, 1431626400, 1431627000, 1431627600, 1431628200,
1431711600, 1431712200, 1431712800, 1431713400, 1431714000, 1431714600,
1431798000, 1431798600, 1431799200, 1431799800, 1431800400, 1431801000,
1431884400, 1431885000, 1431885600, 1431886200, 1431886800, 1431887400,
1431970800, 1431971400, 1431972000, 1431972600, 1431973200, 1431973800,
1432057200, 1432057800, 1432058400, 1432059000, 1432059600, 1432060200,
1432143600, 1432144200, 1432144800, 1432145400, 1432146000, 1432146600,
1432230000, 1432230600, 1432231200, 1432231800, 1432232400, 1432233000,
1432316400, 1432317000, 1432317600, 1432318200, 1432318800, 1432319400,
1432402800, 1432403400, 1432404000, 1432404600, 1432405200, 1432405800,
1432489200, 1432489800, 1432490400, 1432491000, 1432491600), tclass = c("POSIXct",
"POSIXt"), tzone = "Asia/Kolkata"), .indexCLASS = c("POSIXct",
"POSIXt"), .indexTZ = "Asia/Kolkata", tclass = c("POSIXct", "POSIXt"
), tzone = "Asia/Kolkata", class = c("xts", "zoo"))
Required output is:
Is there any existing function which can do this? I tried split.xts, but was not able to get required form.
You could use xts with only base R or use chained expressions with dplyr and tidyr. Base R's unstack and tidyr's spread both take two columns of data containing key-value pairs and arrange them as separate columns of values for each unique key value. Code would look like:
# base R version
library(xts)
df2 <- unstack(data.frame(value=coredata(df), time = format(index(df), "%H:%M")),
value ~ time)[,c(2:6,1)]
# version using chained expressions with dplyr and tidyr
library(xts)
library(dplyr)
library(tidyr)
df3 <- df %>% fortify.zoo() %>%
mutate(time=format(Index, "%H:%M"), Index=format(Index, "%Y-%m-%d") ) %>%
spread(key=time, value=value) %>%
select(c(3:6,2))