I'd like to output a chart similar to the one represented on this page (on the right) using R and any package that would make it look good:
http://processtrends.com/pg_charts_monthly_cycle_chart.htm
Anyone up to the challenge? :)
Thanks!
The stats package in R base already has a function to do this. Here is my one-liner and the output that it produces
monthplot(redata, labels = month.abb, ylab = 'Listings')
Building on this an example of using monthplot for a weekly cycle plot is here (gives full R code and source data): http://figshare.com/figures/index.php/OpenURL_Router_Data:_Total_Requests_by_Weekday
monthplot(ts(sdf$values, frequency = 7, start=c(12,5)), labels = dow, ylab = "No. requests / day", xlab = "Weekday")
which gives this weekly cycle plot:
Example of monthplot to create a graph showing a weekly cycle http://figshare.com/figures/images/a/a7/Total_requests_by_weekday_01_Apr_to_31_Jul_2011.jpeg
Of course no graphical challenge will be complete without a ggplot solution. The tricky bit is to use ddply to summarise the monthly averages, and to pass this as data to a separate layer to ggplot.
library(lubridate)
library(plyr)
library(ggplot2)
df$month <- factor(month(df$dates), levels=1:12, labels=month.abb, ordered=TRUE)
df$year <- year(df$dates)
hline.data <- ddply(df, .(month), summarize, avgvalue=mean(values))
ggplot() +
geom_line(aes(x=year, y=values, group=month), data=df, colour="blue") +
geom_hline(aes(yintercept=avgvalue), data=hline.data, colour="blue", size=2) +
facet_grid(~month) +
opts(axis.text.x = theme_blank()) +
xlab("")
The data:
df <- structure(list(dates = structure(c(8415, 8446, 8474, 8505, 8535,
8566, 8596, 8627, 8658, 8688, 8719, 8749, 8780, 8811, 8839, 8870,
8900, 8931, 8961, 8992, 9023, 9053, 9084, 9114, 9145, 9176, 9204,
9235, 9265, 9296, 9326, 9357, 9388, 9418, 9449, 9479, 9510, 9541,
9570, 9601, 9631, 9662, 9692, 9723, 9754, 9784, 9815, 9845, 9876,
9907, 9935, 9966, 9996, 10027, 10057, 10088, 10119, 10149, 10180,
10210, 10241, 10272, 10300, 10331, 10361, 10392, 10422, 10453,
10484, 10514, 10545, 10575, 10606, 10637, 10665, 10696, 10726,
10757, 10787, 10818, 10849, 10879, 10910, 10940, 10971, 11002,
11031, 11062, 11092, 11123, 11153, 11184, 11215, 11245, 11276,
11306, 11337, 11368, 11396, 11427, 11457, 11488, 11518, 11549,
11580, 11610, 11641, 11671, 11702, 11733, 11761, 11792, 11822,
11853, 11883, 11914, 11945, 11975, 12006, 12036, 12067, 12098,
12126, 12157, 12187, 12218, 12248, 12279, 12310, 12340, 12371,
12401, 12432, 12463, 12492, 12523, 12553, 12584, 12614, 12645,
12676, 12706, 12737, 12767, 12798, 12829, 12857, 12888, 12918,
12949, 12979, 13010, 13041, 13071, 13102, 13132), class = "Date"),
values = c(1093, 1182, 1299, 1372, 1319, 1362, 1239, 1162,
1059, 921, 815, 720, 835, 853, 1034, 1030, 1240, 1388, 1429,
1319, 1231, 1184, 1076, 825, 991, 1093, 854, 808, 1079, 1092,
1220, 1251, 1130, 1131, 1052, 951, 950, 1006, 1112, 1119,
1250, 1322, 1347, 1310, 1215, 1128, 1035, 992, 1079, 1018,
1112, 1224, 1323, 1344, 1326, 1267, 1171, 1075, 916, 932,
888, 904, 939, 1018, 1140, 1174, 1285, 1311, 1298, 1231,
1091, 1088, 991, 1028, 1177, 1322, 1322, 1398, 1389, 1174,
1196, 1115, 756, 496, 693, 673, 748, 777, 820, 948, 966,
1027, 960, 865, 767, 675, 765, 732, 613, 632, 659, 705, 684,
734, 715, 626, 551, 487, 500, 536, 575, 595, 736, 798, 832,
797, 792, 726, 650, 584, 567, 524, 574, 571, 591, 657, 699,
756, 867, 795, 760, 685, 609, 588, 521, 581, 614, 623, 668,
702, 777, 697, 647, 562, 523, 508, 493, 504, 534, 586, 621,
620, 636, 600, 549, 557)), .Names = c("dates", "values"), row.names = c(NA,
-156L), class = "data.frame")
An awful piece of R coding here by me, but it might give you some ideas on how to do it:
This was the data I used taken from the excel file on the linked site:
> dput(redata)
structure(c(1093L, 1182L, 1299L, 1372L, 1319L, 1362L, 1239L,
1162L, 1059L, 921L, 815L, 720L, 835L, 853L, 1034L, 1030L, 1240L,
1388L, 1429L, 1319L, 1231L, 1184L, 1076L, 825L, 991L, 1093L,
854L, 808L, 1079L, 1092L, 1220L, 1251L, 1130L, 1131L, 1052L,
951L, 950L, 1006L, 1112L, 1119L, 1250L, 1322L, 1347L, 1310L,
1215L, 1128L, 1035L, 992L, 1079L, 1018L, 1112L, 1224L, 1323L,
1344L, 1326L, 1267L, 1171L, 1075L, 916L, 932L, 888L, 904L, 939L,
1018L, 1140L, 1174L, 1285L, 1311L, 1298L, 1231L, 1091L, 1088L,
991L, 1028L, 1177L, 1322L, 1322L, 1398L, 1389L, 1174L, 1196L,
1115L, 756L, 496L, 693L, 673L, 748L, 777L, 820L, 948L, 966L,
1027L, 960L, 865L, 767L, 675L, 765L, 732L, 613L, 632L, 659L,
705L, 684L, 734L, 715L, 626L, 551L, 487L, 500L, 536L, 575L, 595L,
736L, 798L, 832L, 797L, 792L, 726L, 650L, 584L, 567L, 524L, 574L,
571L, 591L, 657L, 699L, 756L, 867L, 795L, 760L, 685L, 609L, 588L,
521L, 581L, 614L, 623L, 668L, 702L, 777L, 697L, 647L, 562L, 523L,
508L, 493L, 504L, 534L, 586L, 621L, 620L, 636L, 600L, 549L, 557L
), .Dim = 12:13, .Dimnames = list(c("Jan", "Feb", "Mar", "Apr",
"May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), c("X1993",
"X1994", "X1995", "X1996", "X1997", "X1998", "X1999", "X2000",
"X2001", "X2002", "X2003", "X2004", "X2005")))
And here's my woeful coding... so much cleanup is possible here, but it is a quick test of the possibilities.
monthnames <- c(
"Jan",
"Feb",
"Mar",
"Apr",
"May",
"Jun",
"Jul",
"Aug",
"Sep",
"Oct",
"Nov",
"Dec"
)
# size of window
windows(w=6,h=3)
# margins
par(
mar=c(5.1,5.1,2.1,2.1),
cex.axis=0.7
)
# set up plot with the number of categories and the y limits
# yaxs="i" sets the yaxis as having no separation from the corner point
ylimlp <- c(0,max(redata))*1.06
plot(1:156, type="n", xaxt="n", ylim=ylimlp, ann=FALSE, yaxs="i", xaxs="i", bty="l", las="1")
abline(v=seq(13,156,13),lty=1,col="grey")
title(xlab="Month", col.lab=rgb(0,0,0), font.lab=2, cex.lab=0.75)
title(ylab="Listings", col.lab=rgb(0,0,0), font.lab=2, cex.lab=0.75)
lines(redata[1,],type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*1),redata[2,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*2),redata[3,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*3),redata[4,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*4),redata[5,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*5),redata[6,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*6),redata[7,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*7),redata[8,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*8),redata[9,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*9),redata[10,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*10),redata[11,]),type="l",pch=NA,lwd=1,col="grey")
lines(c(rep(NA,13*11),redata[12,]),type="l",pch=NA,lwd=1,col="grey")
redatamonthmean <- apply(redata,1,mean)
lines(rep(redatamonthmean[1],13),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*1),rep(redatamonthmean[2],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*2),rep(redatamonthmean[3],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*3),rep(redatamonthmean[4],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*4),rep(redatamonthmean[5],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*5),rep(redatamonthmean[6],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*6),rep(redatamonthmean[7],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*7),rep(redatamonthmean[8],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*8),rep(redatamonthmean[9],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*9),rep(redatamonthmean[10],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*10),rep(redatamonthmean[11],13)),type="l",pch=NA,lwd=1,col="black")
lines(c(rep(NA,13*11),rep(redatamonthmean[12],13)),type="l",pch=NA,lwd=1,col="black")
mtext(monthnames[1], side=1, cex=0.7, at=6.5)
mtext(monthnames[2], side=1, cex=0.7, at=6.5*3)
mtext(monthnames[3], side=1, cex=0.7, at=6.5*5)
mtext(monthnames[4], side=1, cex=0.7, at=6.5*7)
mtext(monthnames[5], side=1, cex=0.7, at=6.5*9)
mtext(monthnames[6], side=1, cex=0.7, at=6.5*11)
mtext(monthnames[7], side=1, cex=0.7, at=6.5*13)
mtext(monthnames[8], side=1, cex=0.7, at=6.5*15)
mtext(monthnames[9], side=1, cex=0.7, at=6.5*17)
mtext(monthnames[10], side=1, cex=0.7, at=6.5*19)
mtext(monthnames[11], side=1, cex=0.7, at=6.5*21)
mtext(monthnames[12], side=1, cex=0.7, at=6.5*23)
And the example image
Related
This question aims to receive feedback to make a function more efficient. Apologies for a long, case specific post.
I created a function that calculates percentages in estimates of the American Community Survey (ACS). Because estimates in the ACS have margins of error, calculating percentages (e.g. % of the total population being below 17 years old) requires the recalculation of the error that results from dividing the estimate of both variables (population below 17 / total population).
So to calculate the new margin of error for a proportion calculated as p = estimate_a/estimate_b, the formula to use is MOE(p) = (1/estimate_b)*sqrt((MOE_b^2)-(p^2*MOE_a^2)). If the value inside of the square root was negative, then the substraction should be changed to a sum, with the formula becoming MOE(p) = (1/estimate_b)*sqrt((MOE_b^2)+(p^2*MOE_a^2)). If the result of p = estimate_a/estimate_b is 1, the documentation suggest calculating MOE using another formula: MOE(p) = MOE_a/estimate_b
To make these calculations, I created a function that takes a data frame with estimates and their MOEs, calculates the proportion between two specified variables, and writes two new columns in the original dataframe - one with the proportion, and another one with its margin of error. The function loops through the rows of the data frame carrying out if-else checks to determine what formula to apply, including skipping rows that might have NA values. The original data on which I apply this function is considerably long - ~250000 rows, and the structure of this function makes it go very slowly. Hence, the question is whether there are ways to improve the quality of this code to improve its speed. The function and dummy data are provided below:
percent_calculator <- function(DF, A_e, B_e, A_se, B_se, New_fn){
# arguments legend >> DF = data frame; A_e = estimate_a (string of the fieldname); B_e = estimate_b (string of the fieldname);
# A_se = MOE_a (string of the fieldname); B_se = MOE_b (string of the fieldname); New_fn = root for new fieldname in the data frame (string)
pb<- txtProgressBar(min = 0, max = nrow(DF), initial = 0) # progress bar initialization
for (i in 1:nrow(DF)){ # for loop that iterates through the rows of the DF
setTxtProgressBar(pb,i)
if(is.na(DF[[A_e]][i])==FALSE & is.na(DF[[B_e]][i])==FALSE){ # check if any of the estimates used to calculate the proportion is NA (if so, skip)
if (DF[[B_e]][i]!= 0){ # check if estimate_b is not 0, to avoid creating inf values from A_e/B-e
DF[[paste0(New_fn, "_e")]][i] <- (DF[[A_e]][i]/DF[[B_e]][i])
if(DF[[paste0(New_fn, "_e")]][i] == 1){ # check if P==1 to then use the appropiate formula for MOE
DF[[paste0(New_fn, "_se")]][i] <- (DF[[A_se]][i]/DF[[B_e]][i])
} else {
if((DF[[A_se]][i]^2)-(DF[[paste0(New_fn, "_e")]][i]^2)*(DF[[B_se]][i]^2)>= 0){ # check for the sign of the value inside of the square root
DF[[paste0(New_fn, "_se")]][i] <- (1/DF[[B_e]][i])*sqrt((DF[[A_se]][i]^2)-(DF[[paste0(New_fn, "_e")]][i]^2)*(DF[[B_se]][i]^2))
} else {
DF[[paste0(New_fn, "_se")]][i] <- (1/DF[[B_e]][i])*sqrt((DF[[A_se]][i]^2)+(DF[[paste0(New_fn, "_e")]][i]^2)*(DF[[B_se]][i]^2))
}
}
} else { # assign 0 value if B_e was 0
DF[[paste0(New_fn, "_e")]][i] <- 0
DF[[paste0(New_fn, "_se")]][i] <- 0
}
} else { # assign NA if any of the estimates was NA
DF[[paste0(New_fn, "_e")]][i] <- NA
DF[[paste0(New_fn, "_se")]][i] <- NA
}
DF[[paste0(New_fn, "_e")]][i] <- DF[[paste0(New_fn, "_e")]][i]*100 # switch from proportion to percentage in the estimate value
DF[[paste0(New_fn, "_se")]][i] <- DF[[paste0(New_fn, "_se")]][i]*100 # switch from proportion to percentage in the MOE value
}
return(DF)
}
Dummy <- structure(list(TotPop_e = c(636L, 1287L, 810L, 1218L, 2641L,
835L, 653L, 1903L, 705L, 570L, 2150L, 6013L, 1720L, 2555L, 1150L,
2224L, 1805L, 728L, 2098L, 3099L, 4194L, 1909L, 2401L, 1446L,
1345L, 1573L, 2037L, 634L, 1916L, 1522L, 592L, 831L, 577L, 2196L,
1482L, 1436L, 1668L, 3095L, 3677L, 2641L, 1285L, 932L, 2461L,
1609L, 1143L, 1617L, 1075L, 1280L, 838L, 1447L, 3941L, 2402L,
1130L, 851L, 10316L, 9576L, 2396L, 3484L, 5688L, 2200L, 1856L,
1441L, 2539L, 3056L, 1325L, 2454L, 2010L, 2340L, 1448L, 2435L,
2782L, 3633L, 1766L, 2564L, 1473L, 1214L, 1951L, 2561L, 4262L,
2576L, 4257L, 2314L, 2071L, 3182L, 1839L, 2214L, 1101L, 1898L,
790L, 867L, 1764L, 970L, 1320L, 2850L, 1019L, 1483L, 3720L, 2215L,
3581L, 3391L), TotPop_se = c(132.522796352584, 149.544072948328,
127.051671732523, 130.091185410334, 232.826747720365, 135.562310030395,
100.303951367781, 176.29179331307, 114.285714285714, 96.6565349544073,
339.817629179331, 438.297872340425, 245.592705167173, 324.012158054711,
333.130699088146, 224.924012158055, 321.580547112462, 169.604863221885,
175.075987841945, 469.908814589666, 375.075987841945, 411.550151975684,
378.115501519757, 235.258358662614, 241.337386018237, 291.793313069909,
337.386018237082, 138.601823708207, 145.896656534954, 193.920972644377,
135.562310030395, 117.325227963526, 244.984802431611, 318.54103343465,
207.90273556231, 200, 279.635258358663, 657.750759878419, 401.215805471125,
401.823708206687, 229.787234042553, 139.817629179331, 303.951367781155,
201.215805471125, 200, 252.887537993921, 356.838905775076, 241.945288753799,
238.297872340426, 267.477203647416, 320.9726443769, 255.31914893617,
178.115501519757, 116.109422492401, 891.793313069909, 766.565349544073,
255.31914893617, 463.22188449848, 448.632218844985, 367.781155015198,
269.300911854103, 261.398176291793, 286.93009118541, 446.808510638298,
224.316109422492, 212.158054711246, 233.434650455927, 304.559270516717,
356.231003039514, 275.379939209726, 330.699088145897, 368.996960486322,
248.024316109423, 310.030395136778, 153.799392097264, 243.768996960486,
265.65349544073, 337.386018237082, 436.474164133739, 359.270516717325,
344.072948328268, 196.960486322188, 231.003039513678, 356.231003039514,
212.158054711246, 348.328267477204, 206.079027355623, 240.729483282675,
196.352583586626, 141.033434650456, 215.80547112462, 127.659574468085,
248.024316109423, 589.057750759878, 231.61094224924, 486.93009118541,
605.471124620061, 713.06990881459, 488.753799392097, 382.370820668693
), Under17_se = c(35.8095476596307, 50.9877853224243, 50.0994474845873,
44.7376765786604, 113.994325548832, 59.7386237841673, 22.7862186188344,
95.1285234870203, 42.3093316505904, 35.4621507988699, 143.021311606928,
205.334390935311, 102.292167403598, 115.712493289527, 88.9617416652971,
98.0345650964952, 149.50823698925, 40.0016629212452, 86.7428425216985,
158.047696828218, 173.225615182675, 144.710221534209, 121.094774232467,
76.9999466678128, 88.9160360898593, 97.7665610480423, 133.02517642826,
30.4983051540691, 83.3625069421341, 75.7125713164268, 50.3826325227805,
37.5622898620679, 7.29483282674772, 122.185425418875, 83.4644035953588,
63.8384709681463, 99.5458131127046, 208.446825330589, 150.282359742524,
206.017151858922, 87.7761872483956, 56.194023821941, 120.701992909334,
50.6423479626955, 55.4225960853081, 93.2888100499867, 126.879946773287,
143.069104861932, 86.7747884744339, 79.4517480028886, 140.260959630942,
125.115775875384, 52.187662082273, 38.1819057688564, 365.828168907497,
380.635956883794, 135.735302000757, 213.321896356121, 198.507936644685,
126.535797699776, 141.516048792542, 114.238818548927, 117.737122860635,
165.644292987747, 71.238834852709, 93.0825940979755, 41.8438489710712,
97.0666682368976, 86.5060758100772, 92.8659724484427, 76.6536183156139,
192.822109819002, 101.83958502542, 139.341067042001, 55.3992539361667,
92.106793773051, 78.2330906844691, 115.177918141833, 207.546042154974,
139.609995160777, 153.568552211039, 73.5738128652025, 112.249861520572,
171.38868664475, 66.0687084216098, 181.939713349267, 28.4417934718288,
90.1132509720827, 57.4202669424023, 46.8440239496863, 80.4799857926917,
42.6875862955885, 81.3500156027725, 142.669475129055, 23.4653605661019,
191.159072511375, 159.615857998832, 191.592580855392, 184.123292172321,
125.375425911215), Under17_e = c(123, 284, 189, 228, 661, 180,
49, 500, 121, 115, 686, 1456, 385, 578, 302, 476, 738, 124, 527,
803, 1219, 459, 614, 218, 229, 422, 543, 69, 536, 306, 149, 80,
0, 520, 281, 270, 454, 669, 905, 978, 282, 178, 630, 187, 145,
367, 327, 577, 225, 246, 966, 629, 211, 65, 2857, 3051, 592,
1162, 1322, 464, 490, 264, 576, 617, 326, 695, 169, 381, 309,
476, 355, 915, 431, 869, 269, 358, 335, 650, 1443, 561, 900,
411, 759, 1265, 171, 833, 45, 255, 134, 144, 339, 203, 388, 413,
66, 416, 654, 565, 700, 362)), row.names = c(NA, 100L), class = "data.frame")
# example run to calculate pct of people below 17
Dummy <- percent_calculator(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
You don't need that loop at all. All your operations are simple arithmetic that can take vectors, instead of single values. This is called vectorization. You then implement your logic tree with a nested ifelse. ifelse does compute all three possible outcomes (which is a bit unnecessary), but that is very much worth it in this case. If you want to optimize further have a look here: Is `if` faster than ifelse?
Timings at the bottom.
percent_calculator_vectorized <- function(DF, A_e, B_e, A_se, B_se, New_fn){
# arguments legend >> DF = data frame; A_e = estimate_a (string of the fieldname); B_e = estimate_b (string of the fieldname);
# A_se = MOE_a (string of the fieldname); B_se = MOE_b (string of the fieldname); New_fn = root for new fieldname in the data frame (string)
e_name <- paste0(New_fn, "_e")
se_name <- paste0(New_fn, "_se")
DF[[e_name]] <- DF[[A_e]] / DF[[B_e]]
DF[[se_name]] <- ifelse(
DF[[e_name]] == 1, # check if P==1 to then use the appropriate formula for MOE
DF[[A_se]] / DF[[B_e]],
ifelse(
(DF[[A_se]]^2)-(DF[[e_name]]^2)*(DF[[B_se]]^2)>= 0, # check for the sign of the value inside of the square root
(1/DF[[B_e]])*sqrt((DF[[A_se]]^2)-(DF[[e_name]]^2)*(DF[[B_se]]^2)),
(1/DF[[B_e]])*sqrt((DF[[A_se]]^2)+(DF[[e_name]]^2)*(DF[[B_se]]^2))
)
)
# assign 0 value if B_e was 0
DF[DF[[B_e]] == 0 & !is.na(DF[[B_e]]), c(e_name, se_name)] <- 0
# switch from proportion to percentage in the estimate value
DF[, c(e_name, se_name)] <- DF[, c(e_name, se_name)] * 100
return(DF)
}
Dummy2 <- percent_calculator(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
Dummy3 <- percent_calculator_vectorized(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
all.equal(Dummy2, Dummy3) #TRUE
Timings:
bench::mark(
orig = percent_calculator(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17"),
vect = percent_calculator_vectorized(Dummy , "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17"),
)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <list>
1 orig 17.2ms 18.5ms 53.1 331.2KB 14.8 18 5 339ms <df[,6] … <df[,3] … <bch… <tibble …
2 vect 157.4µs 168µs 5700. 19.4KB 11.6 2450 5 430ms <df[,6] … <df[,3] … <bch… <tibble …
Speed up for this small dataset is ~100x, also with a ~10x smaller memory footprint.
I see you have a working solution but for my own reasons I wanted to try a tidyverse solution. This one is just about as fast as the base R solution and for me would be easier to maintain. I also added some more oddities to your toy data to make sure I caught the edge cases.
library(dplyr)
ACS_recalculator <- function(DF, A_e, B_e, A_se, B_se, New_fn){
e_name <- paste0(New_fn, "_e")
se_name <- paste0(New_fn, "_se")
A_ex <- ensym(A_e)
B_ex <- ensym(B_e)
A_sex <- ensym(A_se)
B_sex <- ensym(B_se)
DF <-
DF %>%
mutate(e_value = ifelse(!!B_ex != 0, !!A_ex / !!B_ex, 0),
se_value = case_when(
!!B_ex == 0 ~ 0,
e_value == 1 ~ !!A_sex / !!B_ex,
((!!A_sex)^2) - (e_value^2 * ((!!B_sex)^2)) >= 0 ~ (1/!!B_ex) * sqrt(((!!A_sex)^2) - (e_value^2) * ((!!B_sex)^2)),
((!!A_sex)^2) - (e_value^2 * ((!!B_sex)^2)) < 0 ~ (1/!!B_ex) * sqrt(((!!A_sex)^2) + (e_value^2) * ((!!B_sex)^2)),
TRUE ~ NA_real_),
e_value = e_value * 100,
se_value = se_value * 100) %>%
rename(!!e_name := e_value,
!!se_name := se_value)
return(DF)
}
Dummy2 <- ACS_recalculator(Dummy2, "Under17_e", "TotPop_e", "Under17_se", "TotPop_se", "P_Bel17")
head(Dummy2)
#> TotPop_e TotPop_se Under17_se Under17_e P_Bel17_e P_Bel17_se
#> 1 636 132.5228 35.80955 123 19.33962 3.932255
#> 2 1287 149.5441 50.98779 284 22.06682 3.020104
#> 3 810 127.0517 50.09945 189 23.33333 4.986043
#> 4 1218 130.0912 44.73768 228 18.71921 3.081212
#> 5 2641 232.8267 113.99433 661 25.02840 3.709747
#> 6 835 135.5623 59.73862 180 21.55689 6.239876
Your original example data with more missings and zeros
Dummy2 <- structure(list(TotPop_e = c(636L, 1287L, 810L, 1218L, 2641L,
835L, 653L, 1903L, 0L, 570L, 2150L, 6013L, 1720L, 2555L, 1150L,
2224L, 1805L, 728L, 2098L, 3099L, 4194L, 1909L, 2401L, 1446L,
1345L, 1573L, 2037L, 634L, 1916L, 1522L, 592L, 831L, 577L, 2196L,
1482L, 1436L, 1668L, 3095L, 3677L, 2641L, 1285L, 932L, 2461L,
1609L, 1143L, 1617L, 1075L, 1280L, 838L, 1447L, 3941L, 2402L,
1130L, 851L, 10316L, 9576L, 2396L, 3484L, 5688L, 2200L, 1856L,
1441L, 2539L, 3056L, 1325L, 2454L, 2010L, 2340L, 1448L, 2435L,
2782L, 3633L, 1766L, 2564L, 1473L, 1214L, 1951L, 2561L, 4262L,
2576L, 4257L, 2314L, 2071L, 3182L, 1839L, 2214L, NA, 1898L,
790L, 867L, 1764L, 970L, 1320L, 2850L, 1019L, 1483L, 3720L, 2215L,
3581L, 3391L), TotPop_se = c(132.522796352584, 149.544072948328,
127.051671732523, 130.091185410334, 232.826747720365, 135.562310030395,
100.303951367781, 176.29179331307, 114.285714285714, 0,
339.817629179331, 438.297872340425, 245.592705167173, 324.012158054711,
333.130699088146, 224.924012158055, 321.580547112462, 169.604863221885,
175.075987841945, 469.908814589666, 375.075987841945, 411.550151975684,
378.115501519757, 235.258358662614, 241.337386018237, 291.793313069909,
337.386018237082, 138.601823708207, 145.896656534954, 193.920972644377,
135.562310030395, 117.325227963526, 244.984802431611, 318.54103343465,
207.90273556231, 200, 279.635258358663, 657.750759878419, 401.215805471125,
401.823708206687, 229.787234042553, 139.817629179331, 303.951367781155,
201.215805471125, 200, 252.887537993921, 356.838905775076, 241.945288753799,
238.297872340426, 267.477203647416, 320.9726443769, 255.31914893617,
178.115501519757, 116.109422492401, NA, 766.565349544073,
255.31914893617, 463.22188449848, 448.632218844985, 367.781155015198,
269.300911854103, 261.398176291793, 286.93009118541, 446.808510638298,
224.316109422492, 212.158054711246, 233.434650455927, 304.559270516717,
356.231003039514, 275.379939209726, 330.699088145897, 368.996960486322,
248.024316109423, 310.030395136778, 153.799392097264, 243.768996960486,
265.65349544073, 337.386018237082, 436.474164133739, 359.270516717325,
344.072948328268, 196.960486322188, 231.003039513678, 356.231003039514,
212.158054711246, 348.328267477204, 206.079027355623, 240.729483282675,
196.352583586626, 141.033434650456, 215.80547112462, 127.659574468085,
248.024316109423, 589.057750759878, 231.61094224924, 486.93009118541,
605.471124620061, 713.06990881459, 488.753799392097, 382.370820668693
), Under17_se = c(35.8095476596307, 50.9877853224243, 50.0994474845873,
44.7376765786604, 113.994325548832, 59.7386237841673, 22.7862186188344,
95.1285234870203, 42.3093316505904, 35.4621507988699, 143.021311606928,
205.334390935311, 102.292167403598, 115.712493289527, 88.9617416652971,
98.0345650964952, 149.50823698925, 40.0016629212452, 86.7428425216985,
158.047696828218, 173.225615182675, 144.710221534209, 121.094774232467,
76.9999466678128, 88.9160360898593, 97.7665610480423, 133.02517642826,
30.4983051540691, 83.3625069421341, 75.7125713164268, 50.3826325227805,
37.5622898620679, 7.29483282674772, 122.185425418875, 83.4644035953588,
63.8384709681463, 99.5458131127046, 208.446825330589, 150.282359742524,
206.017151858922, 87.7761872483956, 56.194023821941, 120.701992909334,
50.6423479626955, 55.4225960853081, 93.2888100499867, 126.879946773287,
143.069104861932, 86.7747884744339, 79.4517480028886, 140.260959630942,
125.115775875384, 52.187662082273, 38.1819057688564, 365.828168907497,
380.635956883794, 135.735302000757, 213.321896356121, 198.507936644685,
126.535797699776, 141.516048792542, 114.238818548927, 117.737122860635,
165.644292987747, 71.238834852709, 93.0825940979755, 41.8438489710712,
97.0666682368976, 86.5060758100772, 92.8659724484427, 76.6536183156139,
192.822109819002, 101.83958502542, 139.341067042001, 55.3992539361667,
92.106793773051, 78.2330906844691, 115.177918141833, 207.546042154974,
139.609995160777, 153.568552211039, 73.5738128652025, 112.249861520572,
171.38868664475, 66.0687084216098, 181.939713349267, 28.4417934718288,
90.1132509720827, 57.4202669424023, 46.8440239496863, 80.4799857926917,
42.6875862955885, 81.3500156027725, 142.669475129055, 23.4653605661019,
191.159072511375, 159.615857998832, 191.592580855392, 184.123292172321,
125.375425911215), Under17_e = c(123, 284, 189, 228, 661, 180,
49, 500, 121, 115, 686, 1456, 385, 578, 302, 476, 738, 124, 527,
803, 1219, 459, 614, 218, 229, 422, 543, 69, 536, 306, 149, 80,
0, 520, 281, 270, 454, 669, 905, 978, 282, 178, 630, 187, 145,
367, 327, 577, 225, 246, 966, 629, 211, 65, 2857, 3051, 592,
1162, 1322, 464, 490, 264, 576, 617, 326, 695, 169, 381, 309,
476, 355, 915, 431, 869, 269, 358, 335, 650, 1443, 561, 900,
411, 759, 1265, 171, 833, 45, 255, 134, 144, 339, 203, 388, 413,
66, 416, 654, 565, 700, 362)), row.names = c(NA, 100L), class = "data.frame")
I am working with a CSV dataset called combinedDataset, which I found a model for using k-fold validation procedure. My x value for the model is hour meter reading while my y value is cumulative cost. Here's a dput of my combinedDataset:
structure(list(Unit.ID = c(925L, 967L, 1054L, 967L, 1054L, 967L,
1160L, 1054L, 1160L, 967L, 967L, 1054L, 1160L, 967L, 1054L, 1160L,
967L, 1160L, 1054L, 1054L, 967L, 1160L, 1054L, 967L, 1160L, 1054L,
967L, 1160L, 1054L, 164L, 967L, 967L, 1160L, 1054L, 164L, 967L,
164L, 1160L, 164L, 1054L, 967L, 164L, 1054L, 967L, 1054L, 164L,
967L, 164L, 164L, 1054L, 967L, 164L, 967L, 164L, 1054L, 164L,
925L, 164L, 967L, 1054L, 1054L, 925L, 925L, 164L, 165L, 164L,
1054L, 967L, 164L, 165L, 967L, 164L, 164L, 165L, 1054L, 967L,
967L, 165L, 164L, 1054L, 967L, 165L, 967L, 165L, 164L, 967L,
164L, 967L, 164L, 967L, 164L, 967L, 164L, 1054L, 164L, 164L,
164L, 164L, 164L, 164L, 164L), Hour.Meter.Reading = c(34L, 381L,
532L, 600L, 732L, 783L, 796L, 947L, 1016L, 1038L, 1200L, 1282L,
1290L, 1388L, 1481L, 1528L, 1579L, 1671L, 1704L, 1728L, 1755L,
1906L, 1926L, 1936L, 2031L, 2063L, 2136L, 2205L, 2293L, 2321L,
2342L, 2382L, 2425L, 2505L, 2524L, 2576L, 2704L, 2731L, 2777L,
2811L, 2873L, 2960L, 2997L, 3080L, 3170L, 3175L, 3264L, 3371L,
3386L, 3425L, 3485L, 3570L, 3690L, 3740L, 3746L, 3854L, 3863L,
3976L, 3990L, 3991L, 4078L, 4103L, 4106L, 4138L, 4138L, 4216L,
4249L, 4253L, 4305L, 4326L, 4353L, 4483L, 4489L, 4489L, 4500L,
4580L, 4581L, 4652L, 4721L, 4742L, 4784L, 4805L, 4828L, 4943L,
4947L, 4954L, 4968L, 5298L, 5316L, 5407L, 5533L, 5628L, 5712L,
5747L, 5951L, 6165L, 6194L, 6439L, 6636L, 6702L, 6918L), Labour.Cost = c(1102.5,
4270, 542.5, 2730, 682.5, 3097.5, 336, 871.5, 525, 2695, 1837.5,
1092, 1995, 2572.5, 1092, 924, 840, 1575, 693, 693, 560, 2100,
7959, 2747.5, 1092, 1764, 2030, 5355, 7434, 315, 1890, 2688,
504, 3024, 805, 1701, 577.5, 777, 6440, 1281, 588, 4910, 1470,
1911, 3738, 4140, 9219, 525, 1995, 1239, 1491, 2292.5, 4389,
2012.5, 1134, 945, 490, 3307.5, 714, 756, 1302, 297.5, 875, 1872.5,
1435, 1767.5, 2037, 3108, 1645, 1067.5, 3087, 1452.5, 11777.5,
5670, 4872, 2916, 4158, 5350, 2817.5, 84, 1596, 3865, 714, 910,
4112.5, 1197, 3622.5, 714, 3675, 4767, 3150, 2142, 2436, 210,
1974, 3843, 14532, 2373, 2919, 7098, 2205), Parts.Cost = c(657.6733,
6451.9113, 2235.8885, 6729.7326, 8357.0427, 9224.9012, 1957.0181,
6890.5315, 3156.4815, 2009.3578, 4555.0977, 3458.6842, 1546.2183,
6249.232, 4430.8058, 3835.5721, 3415.2062, 4868.2379, 2151.4558,
2233.2055, 2554.7489, 7433.8141, 2563.289, 3348.7162, 2173.6179,
1940.2806, 4404.6421, 5626.8595, 10553.4599, 12.62, 11405.5704,
2554.2787, 1907.3543, 12625.7525, 243.5735, 6104.7416, 405.959,
3609.1684, 4647.767, 12842.3638, 489.477, 9961.5883, 1706.0572,
2381.7686, 15177.0692, 5416.7948, 16538.1428, 253.3975, 1390.5058,
8699.7549, 7759.8042, 5128.0276, 8556.2625, 5760.523, 1923.699,
628.643, 158.4313, 14481.7111, 3796.3243, 11671.4333, 7140.2504,
1326.837, 441.0999, 2866.2141, 4229.31, 2935.825, 7452.8686,
11683.7093, 2644.1532, 418.679, 11665.8066, 523.9236, 18247.2776,
8115.265, 25011.6846, 13727.0801, 31786.6422, 6064.3123, 10599.0455,
119.4423, 1228.3541, 3587.7566, 3666.517, 472.1537, 1968.7669,
1417.8506, 8023.1254, 5831.6884, 14873.8008, 10193.2736, 6442.1719,
7525.4562, 4378.1336, 1691.4286, 12144.6891, 13094.8609, 20582.1682,
2544.103, 16934.6748, 17344.5551, 8912.7088), Total.Cost = c(1760.1733,
10721.9113, 2778.3885, 9459.7326, 9039.5427, 12322.4012, 2293.0181,
7762.0315, 3681.4815, 4704.3578, 6392.5977, 4550.6842, 3541.2183,
8821.732, 5522.8058, 4759.5721, 4255.2062, 6443.2379, 2844.4558,
2926.2055, 3114.7489, 9533.8141, 10522.289, 6096.2162, 3265.6179,
3704.2806, 6434.6421, 10981.8595, 17987.4599, 327.62, 13295.5704,
5242.2787, 2411.3543, 15649.7525, 1048.5735, 7805.7416, 983.459,
4386.1684, 11087.767, 14123.3638, 1077.477, 14871.5883, 3176.0572,
4292.7686, 18915.0692, 9556.7948, 25757.1428, 778.3975, 3385.5058,
9938.7549, 9250.8042, 7420.5276, 12945.2625, 7773.023, 3057.699,
1573.643, 648.4313, 17789.2111, 4510.3243, 12427.4333, 8442.2504,
1624.337, 1316.0999, 4738.7141, 5664.31, 4703.325, 9489.8686,
14791.7093, 4289.1532, 1486.179, 14752.8066, 1976.4236, 30024.7776,
13785.265, 29883.6846, 16643.0801, 35944.6422, 11414.3123, 13416.5455,
203.4423, 2824.3541, 7452.7566, 4380.517, 1382.1537, 6081.2669,
2614.8506, 11645.6254, 6545.6884, 18548.8008, 14960.2736, 9592.1719,
9667.4562, 6814.1336, 1901.4286, 14118.6891, 16937.8609, 35114.1682,
4917.103, 19853.6748, 24442.5551, 11117.7088), Cumulative.Cost = c(1760.1733,
12482.0846, 15260.4731, 24720.2057, 33759.7484, 46082.1496, 48375.1677,
56137.1992, 59818.6807, 64523.0385, 70915.6362, 75466.3204, 79007.5387,
87829.2707, 93352.0765, 98111.6486, 102366.8548, 108810.0927,
111654.5485, 114580.754, 117695.5029, 127229.317, 137751.606,
143847.8222, 147113.4401, 150817.7207, 157252.3628, 168234.2223,
186221.6822, 186549.3022, 199844.8726, 205087.1513, 207498.5056,
223148.2581, 224196.8316, 232002.5732, 232986.0322, 237372.2006,
248459.9676, 262583.3314, 263660.8084, 278532.3967, 281708.4539,
286001.2225, 304916.2917, 314473.0865, 340230.2293, 341008.6268,
344394.1326, 354332.8875, 363583.6917, 371004.2193, 383949.4818,
391722.5048, 394780.2038, 396353.8468, 397002.2781, 414791.4892,
419301.8135, 431729.2468, 440171.4972, 441795.8342, 443111.9341,
447850.6482, 453514.9582, 458218.2832, 467708.1518, 482499.8611,
486789.0143, 488275.1933, 503027.9999, 505004.4235, 535029.2011,
548814.4661, 578698.1507, 595341.2308, 631285.873, 642700.1853,
656116.7308, 656320.1731, 659144.5272, 666597.2838, 670977.8008,
672359.9545, 678441.2214, 681056.072, 692701.6974, 699247.3858,
717796.1866, 732756.4602, 742348.6321, 752016.0883, 758830.2219,
760731.6505, 774850.3396, 791788.2005, 826902.3687, 831819.4717,
851673.1465, 876115.7016, 887233.4104)), class = "data.frame", row.names = c(NA,
-101L))
Here's the code I used to find the model:
set.seed(123)
idx <- sample(1:nrow(combinedDataset), nrow(combinedDataset))
view(idx)
test_size2 <- floor(nrow(combinedDataset)*0) #multiplied by 0 implies nothing is being tested because the whole model is getting trained
train <- combinedDataset[-idx[1:test_size2],]
view(test)
view(train)
train_X <- train$Hour.Meter.Reading
train_y <- train$Cumulative.Cost
X <- train_X
y <- train_y
poly_order <- 2
Model <- lm(y~poly(X, poly_order))
print(Model)
and here's the code and a picture of the plot for the model:
X_new = seq(min(X), max(X), 1)
y_new <- predict(Model, data.frame(X = X_new))
plot.new()
plot(combinedDataset$Hour.Meter.Reading, combinedDataset$Cumulative.Cost, col = "blue")
lines(X_new, y_new, col="red", type = "l", lwd = 2)
legend(5, 95, legend=c("samples", "fitted model"),
col = c("blue", "red"), lty = c(-1, 1), pch = c(1, -1))
Using this model, I am trying to predict the “Cumulative Cost” when “Hour Meter Reading” are 4000 and 8000, accordingly. Is there a function that lets me find those specific values out of my model?
I've tried plugging in 4000 and 8000 into the X value using the equation that came out of the printing my model, but I got insanely high numbers which don't seem right and don't fit correctly into the plot.
I am currently doing a K-fold cross validation procedure to determine the best model (linear or quadratic) for this data is. My data comes from a CSV dataset called combinedData which I've pasted a dput for below:
structure(list(Unit.ID = c(925L, 967L, 1054L, 967L, 1054L, 967L,
1160L, 1054L, 1160L, 967L, 967L, 1054L, 1160L, 967L, 1054L, 1160L,
967L, 1160L, 1054L, 1054L, 967L, 1160L, 1054L, 967L, 1160L, 1054L,
967L, 1160L, 1054L, 164L, 967L, 967L, 1160L, 1054L, 164L, 967L,
164L, 1160L, 164L, 1054L, 967L, 164L, 1054L, 967L, 1054L, 164L,
967L, 164L, 164L, 1054L, 967L, 164L, 967L, 164L, 1054L, 164L,
925L, 164L, 967L, 1054L, 1054L, 925L, 925L, 164L, 165L, 164L,
1054L, 967L, 164L, 165L, 967L, 164L, 164L, 165L, 1054L, 967L,
967L, 165L, 164L, 1054L, 967L, 165L, 967L, 165L, 164L, 967L,
164L, 967L, 164L, 967L, 164L, 967L, 164L, 1054L, 164L, 164L,
164L, 164L, 164L, 164L, 164L), Hour.Meter.Reading = c(34L, 381L,
532L, 600L, 732L, 783L, 796L, 947L, 1016L, 1038L, 1200L, 1282L,
1290L, 1388L, 1481L, 1528L, 1579L, 1671L, 1704L, 1728L, 1755L,
1906L, 1926L, 1936L, 2031L, 2063L, 2136L, 2205L, 2293L, 2321L,
2342L, 2382L, 2425L, 2505L, 2524L, 2576L, 2704L, 2731L, 2777L,
2811L, 2873L, 2960L, 2997L, 3080L, 3170L, 3175L, 3264L, 3371L,
3386L, 3425L, 3485L, 3570L, 3690L, 3740L, 3746L, 3854L, 3863L,
3976L, 3990L, 3991L, 4078L, 4103L, 4106L, 4138L, 4138L, 4216L,
4249L, 4253L, 4305L, 4326L, 4353L, 4483L, 4489L, 4489L, 4500L,
4580L, 4581L, 4652L, 4721L, 4742L, 4784L, 4805L, 4828L, 4943L,
4947L, 4954L, 4968L, 5298L, 5316L, 5407L, 5533L, 5628L, 5712L,
5747L, 5951L, 6165L, 6194L, 6439L, 6636L, 6702L, 6918L), Labour.Cost = c(1102.5,
4270, 542.5, 2730, 682.5, 3097.5, 336, 871.5, 525, 2695, 1837.5,
1092, 1995, 2572.5, 1092, 924, 840, 1575, 693, 693, 560, 2100,
7959, 2747.5, 1092, 1764, 2030, 5355, 7434, 315, 1890, 2688,
504, 3024, 805, 1701, 577.5, 777, 6440, 1281, 588, 4910, 1470,
1911, 3738, 4140, 9219, 525, 1995, 1239, 1491, 2292.5, 4389,
2012.5, 1134, 945, 490, 3307.5, 714, 756, 1302, 297.5, 875, 1872.5,
1435, 1767.5, 2037, 3108, 1645, 1067.5, 3087, 1452.5, 11777.5,
5670, 4872, 2916, 4158, 5350, 2817.5, 84, 1596, 3865, 714, 910,
4112.5, 1197, 3622.5, 714, 3675, 4767, 3150, 2142, 2436, 210,
1974, 3843, 14532, 2373, 2919, 7098, 2205), Parts.Cost = c(657.6733,
6451.9113, 2235.8885, 6729.7326, 8357.0427, 9224.9012, 1957.0181,
6890.5315, 3156.4815, 2009.3578, 4555.0977, 3458.6842, 1546.2183,
6249.232, 4430.8058, 3835.5721, 3415.2062, 4868.2379, 2151.4558,
2233.2055, 2554.7489, 7433.8141, 2563.289, 3348.7162, 2173.6179,
1940.2806, 4404.6421, 5626.8595, 10553.4599, 12.62, 11405.5704,
2554.2787, 1907.3543, 12625.7525, 243.5735, 6104.7416, 405.959,
3609.1684, 4647.767, 12842.3638, 489.477, 9961.5883, 1706.0572,
2381.7686, 15177.0692, 5416.7948, 16538.1428, 253.3975, 1390.5058,
8699.7549, 7759.8042, 5128.0276, 8556.2625, 5760.523, 1923.699,
628.643, 158.4313, 14481.7111, 3796.3243, 11671.4333, 7140.2504,
1326.837, 441.0999, 2866.2141, 4229.31, 2935.825, 7452.8686,
11683.7093, 2644.1532, 418.679, 11665.8066, 523.9236, 18247.2776,
8115.265, 25011.6846, 13727.0801, 31786.6422, 6064.3123, 10599.0455,
119.4423, 1228.3541, 3587.7566, 3666.517, 472.1537, 1968.7669,
1417.8506, 8023.1254, 5831.6884, 14873.8008, 10193.2736, 6442.1719,
7525.4562, 4378.1336, 1691.4286, 12144.6891, 13094.8609, 20582.1682,
2544.103, 16934.6748, 17344.5551, 8912.7088), Total.Cost = c(1760.1733,
10721.9113, 2778.3885, 9459.7326, 9039.5427, 12322.4012, 2293.0181,
7762.0315, 3681.4815, 4704.3578, 6392.5977, 4550.6842, 3541.2183,
8821.732, 5522.8058, 4759.5721, 4255.2062, 6443.2379, 2844.4558,
2926.2055, 3114.7489, 9533.8141, 10522.289, 6096.2162, 3265.6179,
3704.2806, 6434.6421, 10981.8595, 17987.4599, 327.62, 13295.5704,
5242.2787, 2411.3543, 15649.7525, 1048.5735, 7805.7416, 983.459,
4386.1684, 11087.767, 14123.3638, 1077.477, 14871.5883, 3176.0572,
4292.7686, 18915.0692, 9556.7948, 25757.1428, 778.3975, 3385.5058,
9938.7549, 9250.8042, 7420.5276, 12945.2625, 7773.023, 3057.699,
1573.643, 648.4313, 17789.2111, 4510.3243, 12427.4333, 8442.2504,
1624.337, 1316.0999, 4738.7141, 5664.31, 4703.325, 9489.8686,
14791.7093, 4289.1532, 1486.179, 14752.8066, 1976.4236, 30024.7776,
13785.265, 29883.6846, 16643.0801, 35944.6422, 11414.3123, 13416.5455,
203.4423, 2824.3541, 7452.7566, 4380.517, 1382.1537, 6081.2669,
2614.8506, 11645.6254, 6545.6884, 18548.8008, 14960.2736, 9592.1719,
9667.4562, 6814.1336, 1901.4286, 14118.6891, 16937.8609, 35114.1682,
4917.103, 19853.6748, 24442.5551, 11117.7088), Cumulative.Cost = c(1760.1733,
12482.0846, 15260.4731, 24720.2057, 33759.7484, 46082.1496, 48375.1677,
56137.1992, 59818.6807, 64523.0385, 70915.6362, 75466.3204, 79007.5387,
87829.2707, 93352.0765, 98111.6486, 102366.8548, 108810.0927,
111654.5485, 114580.754, 117695.5029, 127229.317, 137751.606,
143847.8222, 147113.4401, 150817.7207, 157252.3628, 168234.2223,
186221.6822, 186549.3022, 199844.8726, 205087.1513, 207498.5056,
223148.2581, 224196.8316, 232002.5732, 232986.0322, 237372.2006,
248459.9676, 262583.3314, 263660.8084, 278532.3967, 281708.4539,
286001.2225, 304916.2917, 314473.0865, 340230.2293, 341008.6268,
344394.1326, 354332.8875, 363583.6917, 371004.2193, 383949.4818,
391722.5048, 394780.2038, 396353.8468, 397002.2781, 414791.4892,
419301.8135, 431729.2468, 440171.4972, 441795.8342, 443111.9341,
447850.6482, 453514.9582, 458218.2832, 467708.1518, 482499.8611,
486789.0143, 488275.1933, 503027.9999, 505004.4235, 535029.2011,
548814.4661, 578698.1507, 595341.2308, 631285.873, 642700.1853,
656116.7308, 656320.1731, 659144.5272, 666597.2838, 670977.8008,
672359.9545, 678441.2214, 681056.072, 692701.6974, 699247.3858,
717796.1866, 732756.4602, 742348.6321, 752016.0883, 758830.2219,
760731.6505, 774850.3396, 791788.2005, 826902.3687, 831819.4717,
851673.1465, 876115.7016, 887233.4104)), class = "data.frame", row.names = c(NA,
-101L))
So far, I've created all the models I need (K=5) for both linear and quadratic models and I am at the stage where I am trying to calculate the MSE and R squarred values. Here's the code for the process below:
#linear model (Model 1) k-validation
#splitting the testing data into 5 k folds
set.seed(123)
idx <- sample(1:nrow(combinedDataset), nrow(combinedDataset))
view(idx)
test_size <- floor(nrow(combinedDataset)*0.2)
test1 <- combinedDataset[idx[1:test_size],]
train1 <- combinedDataset[-idx[1:test_size],]
view(test1)
view(train1)
train_X1 <- train1$Hour.Meter.Reading
train_y1 <- train1$Cumulative.Cost
test_X1 <- test1$Hour.Meter.Reading
test_y1 <- test1$Cumulative.Cost
X1 <- train_X1
y1 <- train_y1
#Create the 5 linear model equations
poly_order <- 1
Model1 <- lm(y1~poly(X1, poly_order))
print(Model1)
#Calculate MSE
test_yhat1 <- predict(Model1, data.frame(X1 = test_X1))
MSE1 <- mean((test_y1-test_yhat1)^2)
print(MSE1)
But for the last part of the code where I am calculating the MSE value for the first model, I keep getting this error:
Error: variable 'poly(X1, poly_order)' was fitted with type "nmatrix.1" but type "nmatrix.2" was supplied
In addition: Warning message:
In Z/rep(sqrt(norm2[-1L]), each = length(x)) :
longer object length is not a multiple of shorter object length
I have no idea what that code means or how to fix it. I've checked over my code multiple times but I haven't noticed anything wrong with my Model1.
Edit: Made the code shorter
Right now I have a dataset with temperature (independent variable) on the x-axis and the rate of sapflow measured from the xylem in trees during transpiration along the y-axis.
I created a scatterplot from the data below (scatterplot 1). I want to plot the best fit line using the function abline() or perhaps another function that does the same job. However, my r-code (below) appears to have plotted the line of best fit in the bottom left-hand corner of my plot, which does not seem right.
Rcode
plot(Sapflow$Sapflow, Sapflow$Temperature,
main="Scatterplot of Temperature (°C)",
xlab="Temperature (°C)",
ylab=expression(paste("Sapflow Litres day"^{-1})),
pch=19,
col="red")
##Line of best fit
abline(lm(Sapflow$Sapflow~Sapflow$Temperature), col="blue")
Goal
My scatterplot has an overall negative correlation and I can imagine the line of best fit, in reality, would be slightly curved. Even though I cannnot say for sure without a true visual representation of the line of best fit, I can envision that it would be similar in appearance to scatterplot 2. Can anyone lend a hand?
Scatterplot 1
Scatterplot 2
Reference: 'Quick R'
https://www.statmethods.net/graphs/images/scatterplot2.jpg
Data
structure(list(Date = structure(c(31L, 42L, 53L, 55L, 56L, 57L,
58L, 59L, 60L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 54L, 61L, 72L,
83L, 86L, 87L, 88L, 89L, 90L, 91L, 62L, 63L, 64L, 65L, 66L, 67L,
68L, 69L, 70L, 71L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L,
82L, 84L, 85L, 92L, 103L, 114L, 117L, 118L, 119L, 120L, 121L,
122L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L, 104L,
105L, 106L, 107L, 108L, 109L, 110L, 111L, 112L, 113L, 115L, 116L,
123L, 134L, 145L, 147L, 148L, 149L, 150L, 151L, 152L, 124L, 125L,
126L, 127L, 128L, 129L, 130L, 131L, 132L, 133L, 135L, 136L, 137L,
138L, 139L, 140L, 141L, 142L, 143L, 144L, 146L, 1L, 12L, 23L,
25L, 26L, 27L, 28L, 29L, 30L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 24L
), .Label = c("10/1/18", "10/10/18", "10/11/18", "10/12/18",
"10/13/18", "10/14/18", "10/15/18", "10/16/18", "10/17/18", "10/18/18",
"10/19/18", "10/2/18", "10/20/18", "10/21/18", "10/22/18", "10/23/18",
"10/24/18", "10/25/18", "10/26/18", "10/27/18", "10/28/18", "10/29/18",
"10/3/18", "10/30/18", "10/4/18", "10/5/18", "10/6/18", "10/7/18",
"10/8/18", "10/9/18", "6/1/18", "6/10/18", "6/11/18", "6/12/18",
"6/13/18", "6/14/18", "6/15/18", "6/16/18", "6/17/18", "6/18/18",
"6/19/18", "6/2/18", "6/20/18", "6/21/18", "6/22/18", "6/23/18",
"6/24/18", "6/25/18", "6/26/18", "6/27/18", "6/28/18", "6/29/18",
"6/3/18", "6/30/18", "6/4/18", "6/5/18", "6/6/18", "6/7/18",
"6/8/18", "6/9/18", "7/1/18", "7/10/18", "7/11/18", "7/12/18",
"7/13/18", "7/14/18", "7/15/18", "7/16/18", "7/17/18", "7/18/18",
"7/19/18", "7/2/18", "7/20/18", "7/21/18", "7/22/18", "7/23/18",
"7/24/18", "7/25/18", "7/26/18", "7/27/18", "7/28/18", "7/29/18",
"7/3/18", "7/30/18", "7/31/18", "7/4/18", "7/5/18", "7/6/18",
"7/7/18", "7/8/18", "7/9/18", "8/1/18", "8/10/18", "8/11/18",
"8/12/18", "8/13/18", "8/14/18", "8/15/18", "8/16/18", "8/17/18",
"8/18/18", "8/19/18", "8/2/18", "8/20/18", "8/21/18", "8/22/18",
"8/23/18", "8/24/18", "8/25/18", "8/26/18", "8/27/18", "8/28/18",
"8/29/18", "8/3/18", "8/30/18", "8/31/18", "8/4/18", "8/5/18",
"8/6/18", "8/7/18", "8/8/18", "8/9/18", "9/1/18", "9/10/18",
"9/11/18", "9/12/18", "9/13/18", "9/14/18", "9/15/18", "9/16/18",
"9/17/18", "9/18/18", "9/19/18", "9/2/18", "9/20/18", "9/21/18",
"9/22/18", "9/23/18", "9/24/18", "9/25/18", "9/26/18", "9/27/18",
"9/28/18", "9/29/18", "9/3/18", "9/30/18", "9/4/18", "9/5/18",
"9/6/18", "9/7/18", "9/8/18", "9/9/18"), class = "factor"),
Temperature = c(85.07,
79.72, 72.83, 90.1, 83.02, 73.34, 77.11, 74.79, 81.66, 77.71,
66.14, 78.15, 69.33, 68.13, 60.31, 69.47, 81.86, 78.63, 77.69,
77.56, 52.88, 53.32, 53.74, 55.85, 49.56, 55.3, 69.25, 74.96,
69.29, 60.07, 54.31, 48.6, 55.73, 56.74, 47.66, 60.51, 55.64,
58.39, 63.8, 63.16, 73.65, 71.08, 64.34, 60.1, 51.61, 54.87,
58.23, 52.49, 52.56, 59.64, 67.85, 64.42, 60.08, 59.71, 57.12,
58.7, 68.85, 72.44, 89.13, 77.67, 62.17, 61.3, 63.58, 66.26,
60.09, 56.63, 53.11, 59.84, 60.06, 80.76, 79.51, 73.96, 84.58,
78.77, 71.65, 72.59, 77.52, 69.04, 78.26, 77.22, 73.75, 81.95,
82.04, 78.14, 73.41, 72.76, 90.68, 74.24, 71.3, 74.4, 60.26,
66.08, 65.18, 57.17, 66.88, 75.53, 71.52, 74.97, 66.02, 78.06,
73.58, 68.18, 83.55, 80.4, 66.28, 72.32, 72.39, 77.74, 69.81,
74.21, 77.37, 88.28, 65.33, 87.54, 80.49, 69.58, 68.18, 69.25,
60.06, 66.38, 68.51, 71.65, 63.29, 76.63, 80.46, 85.56, 81.25,
94.48, 73.87, 76.8, 72.83, 77.55, 81.5, 77.7, 75.79, 94.38, 99.55,
94.14, 87.29, 84.81, 82.63, 85.27, 84.52, 71.13, 76.28, 78.06,
82.83, 75.18, 83.8, 85.38, 84, 85.33),
Humidity = c(19.67, 18.82,
20.38, 14.94, 12.92, 15.28, 15.12, 16.05, 15.19, 16.67, 18.69,
14.61, 16.71, 17.35, 16.98, 15.44, 15.21, 18.62, 20.11, 18.64,
15.66, 17.2, 18.21, 19.32, 23.02, 21.69, 18.03, 18.46, 18.45,
20.78, 23.04, 22.05, 19.71, 20.59, 24.89, 23.34, 24.7, 24.2,
22.43, 18.21, 17.66, 18.23, 20.36, 22.83, 23.52, 22.88, 19.59,
21.51, 22.25, 21.47, 22.03, 22.51, 25.54, 24.01, 24.28, 26.21,
23.72, 17.63, 17.27, 19.19, 19.97, 19.84, 22.78, 24.46, 23.05,
23.31, 24.75, 23.23, 18.91, 15.56, 13.51, 15.8, 17.67, 19.18,
18.93, 20.05, 17.1, 16.87, 18.77, 20.49, 21.5, 18.04, 18.82,
17.38, 13.05, 13.13, 13.48, 16.32, 16.74, 16.11, 15.77, 15.48,
18.17, 18.16, 18.44, 16.63, 16.64, 14.47, 13.07, 14.14, 17.27,
16.71, 18.22, 12.9, 13.95, 14.7, 15.78, 17.52, 19.66, 18.87,
18.07, 16.4, 12.92, 10.57, 10.04, 9.78, 10.24, 14.25, 15.92,
11.59, 9.25, 10.33, 11.22, 15.03, 13.67, 14.26, 15.42, 8.34,
8.56, 12.37, 14.38, 15.47, 16.4, 17.15, 20.05, 11.08, 10.63,
14.34, 13.27, 9.33, 8.1, 10.95, 12.79, 8.64, 11.42, 12.12, 9.91,
7.86, 3.51, 4.97, 3.63, 5.59),
Radiation = c(197.8, 195.5, 288,
72, 160.5, 337.1, 176.9, 242.3, 189.4, 295.7, 363.2, 158, 290,
251.2, 297.3, 192.6, 163.5, 274.5, 210.7, 243.4, 287.4, 375.7,
290.5, 336.4, 361.6, 369.2, 302.6, 295.2, 348.5, 343.5, 327.6,
358.9, 358.6, 288.9, 325.6, 307.8, 321.3, 321.5, 280.6, 264.9,
253, 279.5, 318.1, 285.1, 330.8, 252, 201, 229.9, 259.3, 230.4,
265.5, 214.1, 307, 311.1, 282.5, 256.9, 227.2, 263.4, 68.2, 130.8,
276.6, 299.2, 276.5, 243.9, 291, 289.3, 290.6, 259.6, 220.5,
72.7, 158.9, 233.8, 105.9, 164.2, 168.1, 188.7, 120.1, 217.7,
111.2, 114.7, 143.6, 55.2, 108.5, 162.2, 185, 197.7, 54.1, 126.3,
111.2, 135.4, 228.3, 214.3, 240.1, 247.6, 173, 172.4, 131.9,
149.4, 203.1, 92.3, 168.5, 146.6, 65.9, 103.6, 200.2, 131.3,
183.5, 128.3, 140.6, 124.1, 125.9, 75.8, 173.2, 47.9, 111.7,
205.8, 188.3, 175.6, 193.7, 170.4, 188.3, 108, 171.1, 59.5, 87.7,
142.2, 111.8, 26.3, 129.9, 103.1, 158.7, 147.9, 109.8, 67.8,
106.6, 12.3, 15.8, 53, 63.4, 86.2, 123.3, 112.9, 128.2, 141.9,
81.6, 102, 86.8, 83.9, 50, 96.8, 100.5, 47),
Sapflow = c(14.97,
16.31, 17.52, 7.45, 12.18, 15.82, 11.79, 14.45, 10.95, 13.62,
16.28, 11.42, 16.13, 15.09, 17.28, 14.43, 11.7, 16.06, 17.66,
16.33, 17.79, 18.58, 19.41, 19.8, 21.63, 21.35, 17.81, 17.56,
19.37, 21.27, 23.26, 23.67, 22.64, 21.85, 24.81, 22.36, 24.72,
23.87, 23.67, 22.01, 19.23, 19.92, 21.99, 23.6, 24.9, 24.46,
22.22, 23.95, 24.81, 23.88, 22.98, 24.47, 26.09, 25.97, 25.82,
26.24, 25.09, 22, 16.91, 21.35, 25.32, 25.76, 26.38, 25.78, 25.77,
25.15, 26.29, 26.22, 24.59, 18.26, 18.91, 21.57, 21.37, 21.29,
23.96, 24.85, 21.02, 23.05, 22.69, 23.9, 25.24, 25.4, 23.19,
22.8, 22.08, 21.86, 13.82, 22.05, 23.21, 20.12, 22.73, 21.88,
23.33, 24.76, 23.5, 22.06, 22.01, 20.65, 21.54, 19.9, 21.67,
21.84, 18.82, 17.99, 21.41, 23.53, 23.39, 25.75, 22.62, 22.25,
21.81, 16.81, 20.42, 12.08, 12.36, 15.31, 14.14, 15.48, 15.18,
14.19, 12.09, 12.39, 12.34, 12.61, 10.79, 10.53, 11.29, 9.92,
9.79, 10.86, 10.98, 10.58, 12.54, 12.52, 12.25, 6.38, 0.91, 5.24,
6.56, 5.72, 4.55, 4.99, 2.88, 0.99, 1.03, 1.57, 2.07, 2.3, 2.22,
2.11, 2.21, 2.29)),
class = "data.frame", row.names = c(NA, -152L))
Your original (linear) fit seems off because you misspecified the columns in your call to plot. By default, the first argument is for the x-axis, not for the y-axis but you can change this by expressly assigning y = Sapflow$Sapflow and x = Sapflow$Temperature. Accordingly, plotting your data (with sapflow as the dependent variable and temperature as the predictor) with a linear fit using abline looks like this:
plot(y = Sapflow$Sapflow, x = Sapflow$Temperature,
main="Scatterplot of Temperature (°C)",
xlab="Temperature (°C)",
ylab=expression(paste("Sapflow Litres day"^{-1})),
pch=19,
col="red")
abline(lm(Sapflow$Sapflow ~ Sapflow$Temperature))
If you want fit your data smooth curve, you can use scatter.smooth:
scatter.smooth(y = Sapflow$Sapflow, x = Sapflow$Temperature,
main="Scatterplot of Temperature (°C)",
xlab="Temperature (°C)",
ylab=expression(paste("Sapflow Litres day"^{-1})),
pch=19,
col="red")
This is the my data frame:
dput(head(df,10))
structure(list(Users = c(200L, 305L, 358L, 419L, 492L, 576L,
675L, 791L, 928L, 1087L), Transactions = c(460, 702.4, 823.2,
964.8, 1130.8, 1325.3, 1553.2, 1820.4, 2133.5, 2500.4), Workload = c(100,
109.9, 109.9, 110, 110, 110.1, 110.1, 110.2, 110.2, 110.3), HeapUsage = c(25,
35.83, 36.47, 37.13, 37.79, 38.47, 39.17, 39.87, 40.59, 41.32
)), .Names = c("Users", "Transactions", "Workload", "HeapUsage"
), row.names = c(NA, 10L), class = "data.frame")
I need to use the variable names. For example,
yv=c("HeapUsage") and xv=c("Users", "Transactions", "Workload")
I need to be able to create however many entries in xv vector against yv variable.
I tried this:
ggplot(df)+geom_point()+facet_wrap(yv~xv)
does not seem to be working. In this case, I need 3 different charts yaxis being yv and xaxis being 1. Users, 2. Transactions 3. Workload
Is there an easy way to do it?