Measure Accuracy errors print NA - r

I'm building ARIMA model on my data and when I try to check the Measure Accuracy errors , it print NA!
I don't know where I messed up.
Does anyone have suggestions, please?
accuracy(forecast_data, test_data)
$Models
Call $Fit.criteria
"Min.max.accuracy MAE MAPE MSE RMSE NRMSE.mean NRMSE.median
"Not supported" NA NA NA NA NA NA NA
"Not supported" NA NA NA NA NA NA NA
"Not supported" NA NA NA NA NA NA NA
"Not supported" NA NA NA NA NA NA NA
Here's my code:
auto_ARIMA <- auto.arima(training_data, trace=TRUE, ic ="aicc", approximation=FALSE, stepwise=FALSE)
forecast_data <- forecast(object=test_data, model= auto_ARIMA)
accuracy(forecast_data, test_data)
my data is in Time series format and has no NA..
Any help will be appreciated.
updates:
Here's part of what dput(training_data) & dput(test_data) print:
dput(training_data)
c(601L, 215L, 147L, 275L, 707L, 1509L, 2118L, 1506L, 1439L, 1745L,
1882L, 1773L, 1752L, 1773L, 1727L, 1823L, 1860L, 2020L, 1744L,
1670L, 1498L, 1372L, 1262L, 723L, 313L, 166L, 129L, 252L, 695L,
1510L, 2051L, 1484L, 1417L, 1838L, 1756L, 1740L, 1756L, 1675L)
dput(test_data)
c(601L, 215L, 147L, 275L, 707L, 1509L, 2118L, 1506L, 1439L, 1745L,
1882L, 1773L, 1752L, 1773L, 1727L, 1823L, 1860L, 2020L, 1744L,
1670L, 1498L, 1372L, 1262L, 723L, 313L, 166L, 129L, 252L, 695L,
1510L, 2051L, 1484L, 1417L, 1838L, 1756L, 1740L, 1756L, 1675L)

Related

Counting observations per month in a data frame

I currently have a dataframe that has two columns: arrest date and number of arrests. The date column has almost every single day from 2006-2020; instead of having the number of arrests per day, I'd like to have the number of arrests per month, per year.
The dataframe is going to be converted into an xts object for a time series analysis so I need a resulting date column that has the year and month.
Below is the first 6 months of data from the dataset:
structure(list(ARREST_DATE = structure(c(13149, 13150, 13151,
13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159, 13160,
13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13169,
13170, 13171, 13172, 13173, 13174, 13175, 13176, 13177, 13178,
13179, 13180, 13181, 13182, 13183, 13184, 13185, 13186, 13187,
13188, 13189, 13190, 13191, 13192, 13193, 13194, 13195, 13196,
13197, 13198, 13199, 13200, 13201, 13202, 13203, 13204, 13205,
13206, 13207, 13208, 13209, 13210, 13211, 13212, 13213, 13214,
13215, 13216, 13217, 13218, 13219, 13220, 13221, 13222, 13223,
13224, 13225, 13226, 13227, 13228, 13229, 13230, 13231, 13232,
13233, 13234, 13235, 13236, 13237, 13238, 13239, 13240, 13241,
13242, 13243, 13244, 13245, 13246, 13247, 13248, 13249, 13250,
13251, 13252, 13253, 13254, 13255, 13256, 13257, 13258, 13259,
13260, 13261, 13262, 13263, 13264, 13265, 13266, 13267, 13268,
13269, 13270, 13271, 13272, 13273, 13274, 13275, 13276, 13277,
13278, 13279, 13280, 13281, 13282, 13283, 13284, 13285, 13286,
13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295,
13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304,
13305, 13306, 13307, 13308, 13309, 13310, 13311, 13312, 13313,
13314, 13315, 13316, 13317, 13318, 13319, 13320, 13321, 13322,
13323, 13324, 13325, 13326, 13327, 13328, 13329), class = "Date"),
num_of_arrests = c(550L, 617L, 895L, 1224L, 1379L, 1246L,
893L, 635L, 889L, 1316L, 1223L, 1264L, 1258L, 852L, 478L,
710L, 1131L, 1190L, 1309L, 1085L, 910L, 704L, 852L, 1278L,
1322L, 1250L, 1128L, 967L, 686L, 812L, 998L, 1350L, 1356L,
1292L, 1006L, 568L, 867L, 1296L, 1428L, 1327L, 1182L, 821L,
233L, 618L, 915L, 1370L, 1391L, 1237L, 992L, 649L, 888L,
1167L, 1369L, 1126L, 1071L, 888L, 615L, 831L, 1019L, 1364L,
1109L, 1239L, 962L, 720L, 930L, 1233L, 1413L, 1350L, 1258L,
1034L, 629L, 954L, 1181L, 1421L, 1332L, 974L, 924L, 680L,
958L, 1232L, 1389L, 1289L, 1189L, 931L, 672L, 824L, 1188L,
1332L, 1194L, 1005L, 1011L, 653L, 822L, 1252L, 1421L, 1316L,
1231L, 902L, 740L, 811L, 1184L, 1362L, 1401L, 1144L, 860L,
383L, 775L, 1143L, 1296L, 1271L, 1056L, 729L, 593L, 836L,
1264L, 1341L, 1298L, 1127L, 771L, 548L, 908L, 1290L, 1398L,
1297L, 1127L, 878L, 663L, 928L, 1258L, 1389L, 1300L, 1135L,
937L, 600L, 851L, 1173L, 1366L, 1211L, 958L, 912L, 602L,
843L, 1274L, 1368L, 1332L, 1068L, 823L, 589L, 482L, 1076L,
1217L, 1194L, 1020L, 822L, 628L, 895L, 1225L, 1116L, 1264L,
1254L, 829L, 747L, 911L, 1241L, 1291L, 1267L, 1182L, 924L,
438L, 826L, 1228L, 1361L, 1255L, 1095L, 763L, 594L, 860L,
1056L, 1157L, 1073L, 898L)), row.names = c(NA, 181L), class = "data.frame")
To get the number of arrests per month, you could do as follows: extract the month and year by using the lubridate functions month() and year(), group by both of them (year could be omitted in your example, since there is only year 2006) and summarize() the sum().
As requested, to get a column with year and month, paste() them together, ungroup(), deselect the helper columns and relocate() yearmonth to the front.
Code
library(dplyr)
library(lubridate)
result <- data %>% mutate(year = year(ARREST_DATE), month = month(ARREST_DATE)) %>%
group_by(year, month) %>% summarise(arrests_per_month = sum(num_of_arrests)) %>%
mutate(yearmonth = paste(year, month, sep = "-")) %>% ungroup() %>%
select(-c(year, month)) %>% relocate(yearmonth)
Output
> result
# A tibble: 6 x 2
yearmonth arrests_per_month
<chr> <int>
1 2006-1 31051
2 2006-2 28872
3 2006-3 33910
4 2006-4 30541
5 2006-5 32253
6 2006-6 30414

Selecting such vector elements so that the sum of elements is exactly equal to the specified value

I have a vector of random positive integers. I would like to select only those elements of the vector whose sum will be exactly equal to a certain predetermined value.
Let's take an example like this.
x=1:5, I am looking for elements whose sum is equal to 14. The solution is of course the vector c(2, 3, 4, 5).
Of course, there may be several solutions.
Example 2.
x=1:5, I'm looking for elements whose sum is equal to 7.
Here, of course, should be the following three solutions:
1.c(2, 5),
2.c(3, 4),
3.c(1, 2, 4).
There may also be a situation where there will be no solutions at all.
Example 3.
x=c(1, 2, 7), I'm looking for elements whose sum equals 5.
Of course, there are no correct solutions here.
Everything seems trivially simple if we have vectors of several elements. Here, I even came up with a few alternative solutions. However, the problem becomes when the size of the vector increases.
My vector looks like this:
x= c(236L, 407L, 51L, 308L, 72L, 9787L, 458L, 5486L, 42L, 4290L,
31L, 3533L, 1102L, 24L, 100L, 669L, 9352L, 4091L, 2751L, 3324L,
3193L, 245L, 86L, 98932L, 77L, 13L, 9789L, 91L, 999L, 25L, 25379L,
9626L, 9092L, 622L, 97L, 57L, 2911L, 6L, 405L, 894L, 1760L, 9634L,
96L, 9765L, 223L, 765L, 743L, 5960L, 14L, 50L, 89L, 348L, 5875L,
5L, 58602L, 397L, 1181L, 94L, 954L, 7901L, 836L, 8810L, 52L,
15L, 48L, 26L, 4L, 66L, 5265L, 80L, 282L, 231L, 76L, 661L, 7604L,
7406L, 58L, 10L, 903L, 49446L, 80921L, 1L, 876L, 334L, 63L, 796L,
88L, 413L, 1214L, 2983L, 9518L, 595L, 708L, 53L, 321L, 12L, 634L,
4910L, 8984L, 465L)
I have to find at least one subset of elements whose sum will be exactly 23745.
Unfortunately, I had a complete failure here. Whatever I write is calculated in hours and I don't get any correct result anyway. Does anyone have any idea how this can be solved in R? I will be grateful for even a small hint.
I have to admit that your problem inspired me and made me wonder. I decided to face it by creating my own optimization function. And even though you got an answer that uses the gbp package (I didn't know it before), let me share my own function. Here it is: findSumm.
# Find subsets of `x` whose elements sum exactly to `sfind`, via an
# iterative depth-first search with backtracking over a logical mask.
#
# Args:
#   x:     numeric/integer vector of positive values to draw elements from.
#   sfind: the target sum that selected elements must reach exactly.
#   nmax:  stop after this many solutions have been found (default 1).
#   tmax:  stop after this many seconds of computation (default 1).
#
# Returns a list holding the sorted input (`x`), the selection masks
# (`chosen`), the matching subsets (`xfind`), the elapsed time at which each
# solution was found (`time`), and the reason the search ended (`reason`).
# Errors when no solution can exist (total sum too small) or when the search
# space is exhausted without a match.
findSumm <- function(x, sfind, nmax = 1, tmax = 1) {
  # Quick infeasibility check: even taking every element cannot reach sfind.
  if (sum(x) < sfind) stop("Impossible solution! sum(x)<sfind!")
  fTimeSec <- function() as.numeric(Sys.time() - l$tstart, units = "secs")
  # The current selection of vector elements (over the sorted copy l$x)
  sel <- c(TRUE, rep(FALSE, length(x) - 1))
  # Stack of intermediate states of `sel`, used for backtracking
  lsel <- list()
  # Collection of parameters and results
  l <- list(
    x = sort(x, TRUE),
    tstart = Sys.time(),
    chosen = list(),
    xfind = list(),
    time = c(),
    stop = FALSE,
    reason = "")
  while (TRUE) {
    # Maximum runtime test
    if (fTimeSec() > tmax) {
      l$reason <- "Calculation time is greater than tmax.\n"
      l$stop <- TRUE
      break
    }
    # Record the solution and test the number of solutions
    if (sum(l$x[sel]) == sfind) {
      # Save solution
      l$chosen[[length(l$chosen) + 1]] <- sel
      l$xfind[[length(l$xfind) + 1]] <- l$x[sel]
      l$time <- c(l$time, fTimeSec())
      # Test the number of solutions
      if (length(l$chosen) == nmax) {
        l$reason <- "Already found nmax solutions.\n"
        l$stop <- TRUE
        break
      }
    }
    idx <- which(sel)
    if (idx[length(idx)] == length(sel)) {
      # Rightmost position reached: backtrack to the previous saved state
      if (length(lsel) == 0) break
      sel <- lsel[[length(lsel)]]
      idx <- which(sel)
      lsel[length(lsel)] <- NULL
      sel[idx[length(idx)]] <- FALSE
      sel[idx[length(idx)] + 1] <- TRUE
      next
    }
    if (sum(l$x[sel]) >= sfind) {
      # Since l$x is sorted decreasing, extending this branch cannot help:
      # shift the last pick one position to the right instead.
      sel[idx[length(idx)]] <- FALSE
      sel[idx[length(idx)] + 1] <- TRUE
      next
    } else {
      lsel[[length(lsel) + 1]] <- sel  # Save the current state of sel vector
      sel[idx[length(idx)] + 1] <- TRUE
      next
    }
  }
  # Scalar short-circuit `&&` (the original used the vectorized `&` here).
  if (length(l$chosen) == 0 && !l$stop) stop("No solutions!")
  l$reason <- paste(l$reason, "Found", length(l$chosen),
                    "solutions in time", signif(fTimeSec(), 3), "seconds.\n")
  cat(l$reason)
  return(l)
}
Let's check how it works
findSumm(1:5, 20)$xfind
#Error in findSumm(1:5, 20) : Impossible solution! sum(x)<sfind!
findSumm(c(1,2,7), 5)$xfind
#Error in findSumm(c(1, 2, 7), 5) : No solutions!
findSumm(1:5, 14, 10, 10)$xfind
# Found 1 solutions in time 0.007 seconds.
# [[1]]
# [1] 5 4 3 2
findSumm(1:5, 5, 10, 10)$xfind
# Found 3 solutions in time 0.001 seconds.
# [[1]]
# [1] 5
#
# [[2]]
# [1] 4 1
#
# [[3]]
# [1] 3 2
findSumm(1:5, 7, 10, 10)$xfind
# Found 3 solutions in time 0.004 seconds.
# [[1]]
# [1] 5 2
#
# [[2]]
# [1] 4 3
#
# [[3]]
# [1] 4 2 1
As you can see it was doing great. Now it's time to check that on your vector x.
x= c(236L, 407L, 51L, 308L, 72L, 9787L, 458L, 5486L, 42L, 4290L,
31L, 3533L, 1102L, 24L, 100L, 669L, 9352L, 4091L, 2751L, 3324L,
3193L, 245L, 86L, 98932L, 77L, 13L, 9789L, 91L, 999L, 25L, 25379L,
9626L, 9092L, 622L, 97L, 57L, 2911L, 6L, 405L, 894L, 1760L, 9634L,
96L, 9765L, 223L, 765L, 743L, 5960L, 14L, 50L, 89L, 348L, 5875L,
5L, 58602L, 397L, 1181L, 94L, 954L, 7901L, 836L, 8810L, 52L,
15L, 48L, 26L, 4L, 66L, 5265L, 80L, 282L, 231L, 76L, 661L, 7604L,
7406L, 58L, 10L, 903L, 49446L, 80921L, 1L, 876L, 334L, 63L, 796L,
88L, 413L, 1214L, 2983L, 9518L, 595L, 708L, 53L, 321L, 12L, 634L,
4910L, 8984L, 465L)
findSumm(x, 23745, 1, 10)$xfind[[1]]
# Already found nmax solutions.
# Found 1 solutions in time 0.008 seconds.
# [1] 9789 9787 4091 77 1
A few comments about my function. It searches for all possible and valid combinations unless it reaches the number of valid results specified by nmax or the computation takes tmax seconds. In the case of your vector x and the target sum 23745, the number of correct solutions is enormous. I ran it for 1 minute and got 37827 results! And the function would keep finding valid results at a rate of 626 solutions per second, perhaps for the next 100 years!
Below is a visualization of this process.
l = findSumm(x, 23745, +Inf, 60)
library(tidyverse)
library(ggpmisc)
df = tibble(
n = 1:length(l$chosen),
t = l$time
)
df %>% ggplot(aes(t,n))+
geom_line(size=0.1)+
geom_smooth(method = lm, formula = y~x)+
stat_poly_eq(formula = y~x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE)
Finally, I decided to check the performance of my function. I did not expect a revelation because it was not written in pure C. However, I must admit that the graph below surprised me very pleasantly!
library(microbenchmark)
ggplot2::autoplot(microbenchmark(findSumm(x, 23745),
gbp1d_solver_dpp(p = x, w = x, c = 23745L),
times=100))
As it turned out, my function is almost 4 times faster than gbp1d_solver_dpp. I'm proud!
Since I am very stubborn and curiously inquisitive, I decided to perform one more test on a vector with a length of 1000.
x=c(234L, 1891L, 3187L, 38417L, 2155L, 6857L, 71692L, 463575L,
800L, 2195L, 820L, 9735L, 913L, 62685L, 920597L, 864L, 903L,
478L, 2828L, 99371L, 3109L, 379L, 8544L, 444L, 772L, 571L, 226L,
94L, 378L, 60253L, 10920L, 47626L, 671L, 45163L, 27767L, 62498L,
87706L, 4966L, 4615L, 14897L, 261L, 684L, 3780L, 97L, 705L, 7313L,
3629L, 436L, 5076L, 3198L, 731L, 56634L, 67411L, 249L, 403L,
82728L, 9986L, 643662L, 11045L, 934L, 8154L, 289L, 4452L, 624L,
4876L, 86859L, 933L, 2372L, 6493L, 773566L, 6599L, 459L, 2024L,
80425L, 591L, 6262L, 35033L, 89607L, 6435L, 14917L, 9559L, 67983L,
82365L, 88127L, 466L, 758L, 11605L, 828L, 410L, 557L, 2991L,
808L, 8512L, 273605L, 294L, 4666L, 27L, 26337L, 7340L, 682L,
46480L, 19903L, 699L, 700L, 58L, 136L, 852L, 909L, 64316L, 9109L,
876L, 6382L, 803L, 295L, 9539L, 26271L, 1906L, 23639L, 9022L,
9513L, 169L, 65427L, 861864L, 743L, 91L, 9039L, 247L, 58749L,
5674L, 65959L, 99126L, 7765L, 5934L, 13881L, 77696L, 66894L,
977L, 6279L, 46273L, 919L, 6307L, 316L, 420113L, 61336L, 70L,
6148L, 257L, 17804L, 14L, 989L, 16907L, 36L, 25L, 333L, 224L,
119L, 4000L, 9438L, 5439L, 748L, 16532L, 4847L, 939L, 9504L,
2782L, 424L, 64034L, 5306L, 30247L, 6636L, 3976L, 60588L, 180L,
78118L, 1L, 61866L, 9501L, 15834L, 66712L, 77219L, 448L, 612L,
5339L, 58413L, 4785L, 2191L, 35711L, 84383L, 6261L, 896L, 24353L,
54868L, 288L, 8059L, 867L, 687L, 94667L, 1713L, 1507L, 71048L,
882L, 4155L, 97230L, 49492L, 47839L, 793L, 263L, 63160L, 9062L,
3518L, 55956L, 6626L, 14619L, 636L, 1127L, 970L, 5512L, 118117L,
2370L, 802L, 98333L, 6089L, 1076L, 80L, 305L, 3995L, 437L, 49L,
9207L, 2021L, 7554L, 9486L, 33501L, 55745L, 967L, 24857L, 692L,
4148L, 464957L, 2381L, 3876L, 3246L, 1478L, 308L, 98068L, 532L,
4670L, 7965L, 940L, 467L, 777L, 68749L, 2739L, 23951L, 831L,
60763L, 12047L, 75620L, 650L, 69584L, 294122L, 41149L, 9657L,
780L, 153054L, 37990L, 16L, 894L, 15500L, 31873L, 3800L, 472L,
50989L, 8767L, 8209L, 2929L, 4751L, 38L, 47403L, 64941L, 28042L,
49020L, 81785L, 299L, 936L, 63136L, 3L, 42033L, 1750L, 1147L,
273L, 62668L, 41L, 5829L, 686L, 511L, 65019L, 842L, 88716L, 96217L,
9442L, 6324L, 197L, 55422L, 630L, 665L, 3921L, 726L, 766916L,
43944L, 9035L, 573L, 77942L, 29689L, 749L, 95240L, 281L, 1933L,
78265L, 812L, 854L, 17445L, 8855L, 2940L, 6057L, 46689L, 999L,
381L, 347L, 50199L, 161L, 534L, 804L, 99043L, 13183L, 679L, 432L,
38887L, 575L, 781L, 2023L, 187077L, 89498L, 85L, 16780L, 3731L,
45904L, 13861L, 3971L, 301L, 4175L, 9427L, 126913L, 845L, 175L,
1684L, 9064L, 56647L, 116L, 479672L, 6754L, 441L, 412L, 97091L,
4062L, 598L, 146L, 423L, 2715L, 198939L, 80577L, 76385L, 2088L,
139L, 647L, 246L, 85002L, 898L, 50939L, 135L, 46388L, 623L, 17928L,
63072L, 346L, 78582L, 16691L, 838L, 44L, 5181L, 7918L, 3650L,
35L, 8825L, 9758L, 22677L, 9838L, 2239L, 9001L, 96689L, 570L,
47373L, 507L, 6378L, 40839L, 11677L, 937874L, 2485L, 22188L,
20413L, 13L, 877L, 5578L, 428L, 61L, 3200L, 5444L, 85540L, 640L,
94460L, 310L, 6043L, 3771L, 6167L, 476L, 9365L, 1956L, 143L,
7841L, 4957L, 3309L, 9317L, 41434L, 97881L, 51853L, 474L, 3098L,
7109L, 93976L, 545L, 28475L, 2066L, 4959L, 7410L, 293L, 8246L,
43L, 721L, 2260L, 72854L, 100L, 61382L, 107L, 5637L, 891L, 256L,
442L, 84440L, 55792L, 195L, 24074L, 19L, 57376L, 59159L, 805253L,
193329L, 3636L, 98954L, 968L, 380L, 5203L, 90157L, 71907L, 35497L,
41769L, 1683L, 1984L, 5765L, 832L, 411L, 4888L, 9801L, 710L,
2325L, 40L, 32927L, 435L, 66L, 66301L, 94776L, 48234L, 28977L,
122312L, 48L, 359L, 572L, 753L, 945L, 32241L, 328L, 55976L, 128L,
815794L, 57894L, 576L, 60131L, 342448L, 8913L, 33506L, 20448L,
58750L, 637L, 82086L, 635710L, 96772L, 272L, 938L, 4863L, 737L,
949L, 4804L, 3446L, 92319L, 28883L, 6032L, 53970L, 9394L, 5630L,
71583L, 136862L, 23161L, 8545L, 54249L, 213666L, 668L, 893L,
881126L, 8252L, 584L, 83L, 13754L, 244156L, 530L, 64574L, 22009L,
89204L, 34992L, 85992L, 82697L, 50L, 95845L, 3096L, 42L, 554949L,
325L, 2092L, 28L, 3830L, 893583L, 625L, 3740L, 4513L, 9938L,
910L, 8868L, 9614L, 41281L, 27915L, 25839L, 4417L, 5730L, 2825L,
683L, 550L, 88838L, 9248L, 961L, 2748L, 7259L, 53220L, 2179L,
4036L, 46014L, 83725L, 8211L, 6957L, 6886L, 4653L, 6300L, 80437L,
135885L, 23745L, 9536L, 78L, 652590L, 1037L, 5293L, 492L, 7467L,
71685L, 890L, 5023L, 96524L, 17465L, 53665L, 21508L, 463L, 159L,
311L, 764L, 27534L, 71L, 2504L, 270L, 6449L, 13449L, 302L, 88L,
3893L, 22007L, 9208L, 680618L, 878L, 14721L, 20L, 322374L, 644L,
944669L, 57334L, 233L, 982L, 870L, 950L, 121L, 254L, 4226L, 45L,
61823L, 9626L, 58590L, 6552L, 3920L, 68L, 3644L, 35775L, 4591L,
636207L, 78314L, 408L, 371L, 984L, 7089L, 4679L, 2233L, 756L,
20527L, 178L, 80573L, 589923L, 120L, 7938L, 894842L, 6563L, 569L,
91110L, 620L, 786288L, 46022L, 396L, 762533L, 145964L, 7732L,
60L, 274L, 87869L, 227L, 6706L, 707L, 955L, 48246L, 771L, 29001L,
14224L, 5173L, 20215L, 7566L, 1564L, 733L, 3568L, 3570L, 39256L,
925L, 41577L, 348L, 68267L, 151L, 98572L, 1389L, 5421L, 69043L,
42434L, 27597L, 53320L, 46051L, 1686L, 59L, 361L, 747579L, 5044L,
73873L, 28894L, 8146L, 353L, 2622L, 664L, 349L, 90764L, 8920L,
716L, 14903L, 96055L, 89L, 94239L, 416L, 7896L, 232L, 5543L,
61664L, 6709L, 2L, 14275L, 2954L, 917416L, 3567L, 42086L, 99956L,
86112L, 206L, 64L, 25956L, 57112L, 425L, 6507L, 28034L, 991L,
8444L, 140L, 1461L, 68783L, 347633L, 87696L, 593L, 164L, 837L,
8793L, 965L, 8811L, 97412L, 351L, 23L, 66808L, 8308L, 14245L,
12519L, 3019L, 1920L, 813L, 485L, 979L, 929L, 2970L, 32447L,
8962L, 867973L, 40534L, 551L, 20941L, 49413L, 188L, 948L, 9018L,
187252L, 3919L, 45963L, 358L, 7211L, 959L, 47L, 4220L, 36086L,
1645L, 33056L, 300L, 29682L, 9152L, 431L, 364L, 2211L, 3779L,
4633L, 22500L, 33980L, 794L, 84558L, 488L, 732L, 6686L, 15042L,
906L, 13553L, 6115L, 153L, 866L, 3624L, 329L, 6875L, 86L, 6298L,
57424L, 17582L, 955879L, 40945L, 4858L, 694L, 755L, 499L, 406L,
564L, 874L, 1695L, 43961L, 578L, 9063L, 505L, 5856L, 4484L, 76708L,
712L, 23348L, 986L, 275L, 996L, 8966L, 220L, 7008L, 849L, 953460L,
3062L, 278L, 26L, 8547L, 16895L, 98289L, 815L, 25135L, 956L,
370L, 8221L, 72674L, 31711L, 73L, 41667L, 2915L, 797L, 41309L,
4257L, 8148L, 5723L, 2124L, 8306L, 53388L, 33520L, 680L, 893759L,
40133L, 94791L, 988L, 162L, 79366L, 37625L, 7125L, 50947L, 171L,
99558L, 166L, 90717L, 5807L, 606L, 98592L, 59207L, 966L, 61299L,
7553L, 9678L, 62322L, 156L, 267L, 8478L, 59554L, 2264L, 28338L,
899L, 9719L, 98L, 51403L, 6302L, 265L, 79929L, 101L, 5227L, 972L,
145L, 48018L, 90140L, 698L, 8L, 5751L, 26083L, 1295L, 78124L,
383L, 2776L, 80204L, 210L, 3422L, 36064L, 46L, 4953L, 20271L,
3916L, 767L, 601372L, 56575L, 5237L, 5621L, 6705L, 1191L, 63768L,
1016L, 313L, 2285L, 12489L, 2755L, 338L, 7518L, 2630L, 421L,
6554L, 306L, 113L, 57197L, 885L, 9445L, 37364L, 86630L, 2460L,
715L, 10829L, 9914L, 6635L, 229L, 525L, 839L, 3278L, 969L, 182L,
187L, 7022L, 554L, 6489L, 15791L, 4157L, 47048L, 9447L, 152L,
1419L, 22618L, 5194L, 609L, 923L, 768L, 6248L, 714L, 1159L, 825893L,
53492L, 19731L, 65167L, 96325L, 336L, 4443L, 843L, 62960L, 9788L,
35032L, 284L, 4647L, 360L, 11297L, 1515L)
findSumm(x, 9568447L)$xfind[[1]]
# Already found nmax solutions.
# Found 1 solutions in time 0.065 seconds.
# [1] 955879 953460 944669 937874 920597 917416 894842 893759 893583 881126 347633 27597 8 3 1
As you can see my findSumm function works great. It took only 0.065 seconds to extract a subset with a sum equal to 9568447L!
Unfortunately, trying to run gbp1d_solver_dpp on such a long vector resulted in the error "size is too large". So I was not able to compare the performance of these two solutions with such a large vector.
This task sounds like a 1 dimensional bin packing problem or knapsack problem, in which case there are many resources available online to help guide you.
One potential solution is to use the gbp package, e.g.
#install.packages("gbp")
library(gbp)
#> Loading required package: magrittr
#> Loading required package: data.table
x <- c(236L, 407L, 51L, 308L, 72L, 9787L, 458L, 5486L, 42L, 4290L,
31L, 3533L, 1102L, 24L, 100L, 669L, 9352L, 4091L, 2751L, 3324L,
3193L, 245L, 86L, 98932L, 77L, 13L, 9789L, 91L, 999L, 25L, 25379L,
9626L, 9092L, 622L, 97L, 57L, 2911L, 6L, 405L, 894L, 1760L, 9634L,
96L, 9765L, 223L, 765L, 743L, 5960L, 14L, 50L, 89L, 348L, 5875L,
5L, 58602L, 397L, 1181L, 94L, 954L, 7901L, 836L, 8810L, 52L,
15L, 48L, 26L, 4L, 66L, 5265L, 80L, 282L, 231L, 76L, 661L, 7604L,
7406L, 58L, 10L, 903L, 49446L, 80921L, 1L, 876L, 334L, 63L, 796L,
88L, 413L, 1214L, 2983L, 9518L, 595L, 708L, 53L, 321L, 12L, 634L,
4910L, 8984L, 465L)
test <- gbp1d_solver_dpp(p = x, w = x, c = 23745L)
list_of_selected_items <- x[as.logical(test$k)]
list_of_selected_items
#> [1] 236 51 308 458 5486 4290 31 3533 9352
sum(list_of_selected_items)
#> [1] 23745
Created on 2021-10-18 by the reprex package (v2.0.1)
I think this is a subset sum problem. Below are two options for two different use cases, hope it can help you a bit.
Find All Subsets:
If you want to find all possible subsets, you can use the following base R code, which is written in a recursive manner:
# Recursively enumerate all subsets of `v` whose elements sum exactly to `s`.
#
# Args:
#   v: vector of positive numbers to choose from.
#   s: remaining target sum.
#   r: accumulator holding the partial subset built so far (internal use).
#
# Returns a list of vectors (each in non-increasing order, each summing to
# the original target), or NULL when no such subset exists.
findAllSubsets <- function(v, s, r = c()) {
  if (s == 0) {
    # Target reached exactly: the accumulated picks form one solution.
    return(list(r))
  } else {
    res <- list()
    # Elements larger than the remaining target can never participate.
    v <- v[v <= s]
    for (k in seq_along(v)) {
      # Descend only when picks stay non-increasing (so each subset is
      # produced once, not as every permutation) and enough total remains
      # to reach `s`. `&&` (scalar, short-circuiting) replaces the
      # original vectorized `&` in this scalar `if` condition.
      if (length(r) == 0 || (tail(r, 1) >= v[k] && sum(v[k:length(v)]) >= s)) {
        res[[k]] <- Recall(v[-k], s - v[k], c(r, v[k]))
      }
    }
    # Flatten one level; NULL entries from dead branches are dropped.
    return(unlist(res, recursive = FALSE))
  }
}
and you will see for example
> x <- 1:10
> S <- 23
> findAllSubsets(x, S)
[[1]]
[1] 7 6 4 3 2 1
[[2]]
[1] 7 6 5 3 2
[[3]]
[1] 7 6 5 4 1
[[4]]
[1] 8 5 4 3 2 1
[[5]]
[1] 8 6 4 3 2
[[6]]
[1] 8 6 5 3 1
[[7]]
[1] 8 6 5 4
[[8]]
[1] 8 7 4 3 1
[[9]]
[1] 8 7 5 2 1
[[10]]
[1] 8 7 5 3
[[11]]
[1] 8 7 6 2
[[12]]
[1] 9 5 4 3 2
[[13]]
[1] 9 6 4 3 1
[[14]]
[1] 9 6 5 2 1
[[15]]
[1] 9 6 5 3
[[16]]
[1] 9 7 4 2 1
[[17]]
[1] 9 7 4 3
[[18]]
[1] 9 7 5 2
[[19]]
[1] 9 7 6 1
[[20]]
[1] 9 8 3 2 1
[[21]]
[1] 9 8 4 2
[[22]]
[1] 9 8 5 1
[[23]]
[1] 9 8 6
[[24]]
[1] 10 5 4 3 1
[[25]]
[1] 10 6 4 2 1
[[26]]
[1] 10 6 4 3
[[27]]
[1] 10 6 5 2
[[28]]
[1] 10 7 3 2 1
[[29]]
[1] 10 7 4 2
[[30]]
[1] 10 7 5 1
[[31]]
[1] 10 7 6
[[32]]
[1] 10 8 3 2
[[33]]
[1] 10 8 4 1
[[34]]
[1] 10 8 5
[[35]]
[1] 10 9 3 1
[[36]]
[1] 10 9 4
However, this method does NOT scale well. Thus, it is only recommended if one wants to find all possible subsets and has a small-sized x.
Find One Subset:
If you just want one possible subset, you can use the package adagio with its function subsetsum directly, e.g,
# Return one subset of `x` summing exactly to `target`, delegating the
# search to the dynamic-programming solver adagio::subsetsum().
subsetS <- function(x, target) {
  # Values above the target can never take part in a solution.
  candidates <- x[x <= target]
  solution <- adagio::subsetsum(candidates, target)
  candidates[solution$inds]
}
such that
> subsetS(x,23745)
[1] 236 51 308 458 5486 4290 31 3533 9352
Benchmarking
set.seed(0)
x <- sample(100000L,200)
target <- 237450
ggplot2::autoplot(microbenchmark(
findSumm(x, target),
gbp1d_solver_dpp(p = x, w = x, c = target),
subsetS(x, target),
times = 100
))
I have to find at least one subset of elements
Reflecting ThomasIsCoding's observation that this is a subset sum problem — specifically the coin changing problem, which has a staggering number of well-documented approaches in many languages and is widely used to teach recursion. One of these implementations, by Mark, has been adapted to fit this particular use case. Naturally, an approach using C++ would be much faster, but the fun is to keep it in R.
Note The \(x) and \(e) anonymous function are for R 4.1 and later. You can find your version by running version[c("major", "minor")]. For lower versions, replace \(x) with function(x) and \(e) with function(e).
Edit: big oopsie - forgot to change target parameter. New benchmarks & rewriting of function
# Find one subset of `x` summing to `target` via the recursive helper aux().
# Returns the subset as a vector, or NA (with an optional warning) when no
# subset reaches the target.
#
# aux() signals an error both to abort the search once a solution is found
# and (via NA propagation in the original) on failure; the tryCatch below
# distinguishes the cases by checking whether aux() deposited an `out`
# value into this function's environment.
cc <- \(x, target, warn = TRUE){  # R 4.1+ anonymous-function syntax
  cc_env <- environment()
  tryCatch(aux(x, target, cc_env), error = \(e){
    # Use the documented exists() interface instead of .Internal(), which is
    # fragile across R versions and disallowed in packages. The original
    # also restricted mode to "integer", which misreported solutions as NA
    # whenever `x` was a double vector; inherits = FALSE keeps a stray `out`
    # in an enclosing environment from being picked up as a false positive.
    if (exists("out", envir = cc_env, inherits = FALSE)) {
      return(cc_env[["out"]])
    }
    if (warn) warning("No subset of x can sum up to target!", call. = FALSE)
    NA
  })
}
# Recursive helper for cc(): depth-first search for a subset of `x` summing
# to `target`. `out` accumulates the chosen elements (seeded with 0L so
# sum() works on the first call); on success the picks (minus the seed) are
# written into `env` and an error is raised to unwind the whole recursion
# in one step.
aux <- \(x, target, env, out = 0L){
  s <- sum(out)
  if (s == target) {
    env[["out"]] <- out[-1L]   # drop the 0L seed
    stop(call. = FALSE)        # abort the search: solution found
  }
  if (s > target) return()     # prune: overshoot can never recover
  for (i in seq_along(x)) {
    n <- x[i]
    # Candidates strictly after position i. The original
    # `x[(i+1L):length(x)]` yielded c(NA, x[i]) when i == length(x),
    # because (n+1):n counts DOWN; the NA then corrupted the search.
    left <- if (i < length(x)) x[(i + 1L):length(x)] else x[0L]
    aux(left, target, env, c(out, n))   # recurse with element i chosen
  }
}
The output is a vector of elements of x of which the sum equals y.
> cc(x, y)
[1] 236 407 51 308 72 9787 458 5486 42 4290 31 1102 24 100 669 245 86 77 13 91 25 97 6
[24] 14 5 10 1 12
Benchmarks
library(adagio)
library(gbp)
x = c(236L, 407L, 51L, 308L, 72L, 9787L, 458L, 5486L, 42L, 4290L,
31L, 3533L, 1102L, 24L, 100L, 669L, 9352L, 4091L, 2751L, 3324L,
3193L, 245L, 86L, 98932L, 77L, 13L, 9789L, 91L, 999L, 25L, 25379L,
9626L, 9092L, 622L, 97L, 57L, 2911L, 6L, 405L, 894L, 1760L, 9634L,
96L, 9765L, 223L, 765L, 743L, 5960L, 14L, 50L, 89L, 348L, 5875L,
5L, 58602L, 397L, 1181L, 94L, 954L, 7901L, 836L, 8810L, 52L,
15L, 48L, 26L, 4L, 66L, 5265L, 80L, 282L, 231L, 76L, 661L, 7604L,
7406L, 58L, 10L, 903L, 49446L, 80921L, 1L, 876L, 334L, 63L, 796L,
88L, 413L, 1214L, 2983L, 9518L, 595L, 708L, 53L, 321L, 12L, 634L,
4910L, 8984L, 465L)
target <- 23745L
ggplot2::autoplot(microbenchmark::microbenchmark(
findSumm(x, target),
gbp::gbp1d_solver_dpp(p = x, w = x, c = target),
subsetS(x, target),
cc(x, target, warn = F), # slightly faster than warn = T
times = 100
))
The customary benchmark plot. For OP's case, on average the recursive approach wins!
The primary reason for the dramatic speed-up is the reduction in writing to memory and the avoidance of unnecessary function calls. Further, this problem is an excellent use case for recursion, with an O(2^n) worst case.
> bench::bench_memory(cc(x, target, warn = F))
# A tibble: 1 x 2
mem_alloc memory
<bch:byt> <list>
1 42.2KB <Rprofmem [140 x 3]>
> bench::bench_memory(subsetS(x, target))
# A tibble: 1 x 2
mem_alloc memory
<bch:byt> <list>
1 5.4MB <Rprofmem [346 x 3]>
However, the performance of the recursive function depends on the inputs. Case in point, here is a situation where the while approach outperforms recursion.
set.seed(1000)
x <- sample(100000L, 200L)
target <- 237450L
In these situations, the current version of cc uses up a lot of memory, leading to a call to gc() which is rather expensive.
Error handling
But what if we feed some input that is not valid? We can compare some of the implementations here.
x <- c(1L, 2L)
target <- 5L # impossible to reach with 1 and 2.
> findSumm(x, target)
Error in findSumm(x, target) : Impossible solution! sum(x)<sfind!
> gbp::gbp1d_solver_dpp(p = x, w = x, c = target)
C++ object <0000023130596180> of class 'gbp1d' <000002313b255230>
> subsetS(x, target)
[1] NA NA
Warning message:
In adagio::subsetsum(v, target) :
Total sum of 'S' is smaller than 't'; no solution possible.
> cc(x, target)
[1] NA
Warning message:
No subset of x can sum up to target!
The manner of error handling differs. Should the evaluation return a hard show-stopping error, or should the code execution process not be interrupted? In this case, I think a NA paired with a warning that users can silence is reasonable.
Digging into the adagio package's error NA NA, this seems to come from c(1,2)[NA] which returns a vector of length 2, which is odd since c(1,2,3,4,5)[c(1,2,3, NA)] gives [1] 1 2 3 NA.
How cc works
Try to find out yourself first, and then inspect the following call.
# Instrumented version of cc(): identical logic, but prints the environments
# involved so the reader can follow how aux() communicates its result back
# through `cc_env`.
cc_explained <- \(x, target, warn = TRUE){  # R 4.1+ anonymous-function syntax
  cc_env <- environment()
  print(environment())
  print("----")
  tryCatch(aux(x, target, cc_env), error = \(e){
    # exists() replaces the fragile .Internal() call; the mode restriction
    # is dropped so double inputs are not misreported as "no solution".
    if (exists("out", envir = cc_env, inherits = FALSE)) {
      print("----")
      print(environment())
      return(cc_env[["out"]])
    }
    if (warn) warning("No subset of x can sum up to target!", call. = FALSE)
    NA
  })
}
# Instrumented version of aux(): prints each call's environment and dumps
# the call stack (lobstr::cst()) at the moment a solution is found.
aux <- \(x, target, env, out = 0L){
  print(environment())
  s <- sum(out)
  if (s == target) {
    lobstr::cst()              # show the recursive call stack
    env[["out"]] <- out[-1L]   # drop the 0L seed before handing back
    stop(call. = FALSE)        # unwind the whole recursion at once
  }
  if (s > target) return()     # prune overshooting branches
  for (i in seq_along(x)) {
    n <- x[i]
    # Guard against the (n+1):n descending-sequence pitfall at the last
    # index (the original injected an NA into the candidate pool there).
    left <- if (i < length(x)) x[(i + 1L):length(x)] else x[0L]
    aux(left, target, env, c(out, n))
  }
}
cc_explained(x,y)
Environments
Environments power much of R's functionality. For this task, environments are the prefered data structure, as they
Are not copied on modification
Allow lexical scoping from within a function environment
In a nutshell, it does not carry the memory overhead like maintaining a list of possible outcomes.
Every time a function is called, a new environment is created that looks like <environment: 0x0000020fe68f06e8>. Because of this, we need to pass on our output value to the environment of the first function in the call stack, gotten by running environment() where the empty space is the NULL argument. This is passed down on each iteration.
Recursion
The call stack lobstr::cst() reveals the recursive nature of cc.
x
1. \-global::cc_explained(x, y)
2. +-base::tryCatch(...)
3. | \-base:::tryCatchList(expr, classes, parentenv, handlers)
4. | \-base:::tryCatchOne(expr, names, parentenv, handlers[[1L]])
5. | \-base:::doTryCatch(return(expr), name, parentenv, handler)
6. \-global::aux(x, target, cc_env)
7. \-global::aux(left, target, env, c(out, n))
...
33. \-global::aux(left, target, env, c(out, n))
34. \-global::aux(left, target, env, c(out, n))
35.
Every time the sum is not equal to the target, aux recalls itself with an updated value for out.
Stopping mechanism
2 different stopping mechanisms are used:
return() returning a NULL if the target is not met.
stop() when a target is met.
Just before stop() is called, the cc_env environment is updated with the value for the iterator, out at that point.
the tryCatch() statement in the cc function checks if the recursive call was able to find a solution by checking whether a new variable out exists in cc_env. Finally, the value is retrieved. This action looks like list subsetting, but isn't.
You can use comboGeneral from the package RcppAlgos (I am the author). E.g.:
library(RcppAlgos)
comboGeneral(x, 5,
constraintFun = "sum",
comparisonFun = "==",
limitConstraints = 23745L,
upper = 1L)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 1 31 4290 9634 9789
Here is a more general approach:
# Find up to `n_res` subsets of `x` that sum exactly to `target`, using
# RcppAlgos::comboGeneral with a sum constraint. Subset sizes m = 1, 2, ...
# are tried in turn, so results come back grouped by size, and each group
# is in lexicographical order.
#
# Args:
#   x:      vector of candidate values.
#   target: exact sum each returned combination must reach.
#   n_res:  maximum number of solutions to collect (default 1).
#
# Returns a list of matrices (one per subset size that produced solutions);
# an empty list when no subset of any size reaches the target.
GetPartition <- function(x, target, n_res = 1L) {
  len_x <- length(x)
  n_soln <- 0L
  res <- list()
  m <- 1L
  i <- 1L
  # Keep searching while more solutions are needed AND a feasible subset
  # size remains. The original condition, `n_soln < n_res || m > len_x`,
  # never terminates when the target is unreachable: m grows past len_x,
  # the condition stays TRUE, and comboGeneral is called with an invalid m.
  while (n_soln < n_res && m <= len_x) {
    combs <- comboGeneral(
      x, m, constraintFun = "sum",
      comparisonFun = "==", limitConstraints = target,
      # Never ask for more rows than exist or than are still needed.
      upper = min(choose(len_x, m), n_res - n_soln)
    )
    if (nrow(combs)) {
      n_soln <- n_soln + nrow(combs)
      res[[i]] <- combs
      i <- i + 1L
    }
    m <- m + 1L
  }
  return(res)
}
The results are in lexicographical order. For example:
> GetPartition(x, target = 23745L, n_res = 13L)
#> [[1]]
#> [,1] [,2] [,3] [,4]
#> [1,] 42 4290 9626 9787
#> [2,] 76 5875 8810 8984
#> [3,] 86 4910 8984 9765
#> [4,] 97 5486 8810 9352
#> [5,] 97 5960 7901 9787
#> [6,] 100 4091 9765 9789
#> [7,] 231 4091 9634 9789
#> [8,] 236 4910 8810 9789
#> [9,] 465 5486 8810 8984
#>
#> [[2]]
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 1 31 4290 9634 9789
#> [2,] 1 63 4290 9626 9765
#> [3,] 1 63 4910 8984 9787
#> [4,] 1 77 4091 9787 9789
And it's really fast:
system.time(print(GetPartition(x, 23745L)))
#> [[1]]
#> [,1] [,2] [,3] [,4]
#> [1,] 42 4290 9626 9787
#>
#> user system elapsed
#> 0.015 0.000 0.016
Testing on the very large vector in #Marek's answer (I renamed it x_big for clarity):
system.time(print(GetPartition(x_big, 9568447L)))
#> [[1]]
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
#> [1,] 762533 773566 805253 825893 861864 881126 893583 917416 937874 953460 955879
#>
#> user system elapsed
#> 0.006 0.000 0.006

Can R successfully deliver a count when two x and y points are within a certain distance of each other?

I am struggling a bit. I am trying to move X and Y data into a plot. I then would like R to give me the count when two X,Y coords are within 12 units of each other. Additionally, I would like to not have diagonal distance calculated, but rather X1-X2, Y1-Y2 distance calculated. I am unsure of how to do that. This is what I have so far.
# Extract the (X, Y) coordinate columns (6:7) from each colour's data frame.
View(D1.B1.Green)
Green <- as.data.frame(D1.B1.Green[, 6:7])
Red <- as.data.frame(D1.B1.Red[, 6:7])

# Pool the two point sets and plot the aggregated data.
combine <- rbind(Red, Green)
plot(combine)

# Pairwise distance matrix. method = "maximum" (Chebyshev) compares the axes
# separately -- max(|x1 - x2|, |y1 - y2|) -- so "Yellow < 12" means the two
# points are within 12 units on BOTH the X axis and the Y axis, with no
# diagonal (Euclidean) distance involved, as requested. Drop the method
# argument to go back to Euclidean distance.
Yellow <- as.matrix(dist(combine, method = "maximum"))
Yellow

# Row/column index pairs of every close pair of points.
neighbors <- which(Yellow < 12, arr.ind = TRUE)

# Remove self-self relations (the zero diagonal). NOTE: the original code
# used neighbors[cond] with no comma, which flattens the two-column index
# matrix into a plain vector and breaks the subsetting below; keep the
# matrix shape with [cond, , drop = FALSE].
neighbors <- neighbors[neighbors[, 1] != neighbors[, 2], , drop = FALSE]

# Highlight every point that has at least one neighbour within range.
points(combine[unique(neighbors[, 1]), ], col = "red")

# Count of close pairs. The full symmetric matrix lists each pair twice
# ((i, j) and (j, i)), so halve the row count; the diagonal was already
# excluded above. (The original sum(Yellow < 12) also counted the n
# zero-distance diagonal entries, which is why it returned the number of
# points rather than the number of close pairs.)
n_close_pairs <- nrow(neighbors) / 2
n_close_pairs
I'm not sure what is wrong, it successfully plots the points but does not indicate when there are points less than 12 distance from each other. Additionally, the count it gives me is the total number of points, not when one point is less than 12 distance from each other.
My data looks like this
> dput(Green)
structure(list(XStart = c(265L, 967L, 1325L, 1169L, 435L, 916L,
558L, 197L), YStart = c(114L, 116L, 385L, 438L, 499L, 580L, 666L,
740L)), class = "data.frame", row.names = c(NA, -8L))
> dput(Red)
structure(list(XStart = c(105L, 250L, 586L, 406L, 136L, 867L,
962L, 721L, 522L, 14L, 462L, 748L, 532L, 734L, 1301L, 559L, 1094L,
641L, 982L, 217L, 633L, 340L, 138L, 1150L, 331L, 596L, 253L,
177L, 187L, 311L, 615L), YStart = c(0L, 0L, 0L, 33L, 51L, 162L,
193L, 195L, 200L, 209L, 213L, 221L, 283L, 289L, 293L, 306L, 350L,
364L, 375L, 423L, 472L, 548L, 588L, 592L, 597L, 684L, 705L, 744L,
814L, 870L, 1032L)), class = "data.frame", row.names = c(NA,
-31L))
> neighbors=which(Yellow < 12, arr.ind=T) #A vertex is a neighbor of another one (in other words, the two vertices are adjacent), if they are incident to the same edge.
> neighbors= neighbors[neighbors[,1]!=neighbors[,2]] #removes self-self relations
> points(combine[neighbors,], col="red" ) #distinguishes red dots that are x euclidean distance away
> Yellow
1 2 3 4 5 6 7 8 9
1 0.00000 145.0000 481.0000 302.8036 59.68249 779.03017 878.46343 646.12770 462.48135
2 145.00000 0.0000 336.0000 159.4522 124.88795 637.91300 737.69438 509.77054 337.61517
3 481.00000 336.0000 0.0000 183.0000 452.88078 324.35320 422.64051 237.17082 209.99048
4 302.80357 159.4522 183.0000 0.0000 270.59933 478.70868 578.56374 354.21604 203.33470
5 59.68249 124.8879 452.8808 270.5993 0.00000 739.37947 838.11694 602.46245 413.75959
6 779.03017 637.9130 324.3532 478.7087 739.37947 0.00000 99.92998 149.68300 347.08644
7 878.46343 737.6944 422.6405 578.5637 838.11694 99.92998 0.00000 241.00830 440.05568
8 646.12770 509.7705 237.1708 354.2160 602.46245 149.68300 241.00830 0.00000 199.06280
9 462.48135 337.6152 209.9905 203.3347 413.75959 347.08644 440.05568 199.06280 0.00000
10 227.95175 315.2412 608.9869 429.6976 199.61964 854.29386 948.13501 707.13860 508.07972
11 415.71384 300.5212 246.4650 188.5099 364.03297 408.19848 500.39984 259.62473 61.39218
12 679.91911 544.8348 274.0164 390.2666 635.17242 132.82319 215.82400 37.48333 226.97357
13 512.26751 399.5160 288.1059 279.9571 458.95534 356.18254 439.31765 208.48261 83.60024
14 692.21528 563.7171 324.6922 416.0769 643.62101 183.89671 247.38634 94.89468 229.92390
15 1231.36713 1091.0774 772.7056 932.0005 1189.86932 453.33983 353.44165 588.22105 784.53171
16 547.49612 434.8758 307.1889 312.9505 493.91700 340.00000 418.54271 196.37973 112.27199
17 1049.10486 913.6936 616.8987 757.5177 1003.57611 294.74226 205.11704 403.92326 591.34085
18 647.91357 534.2069 368.1318 405.9384 594.13298 303.11714 363.70593 186.97861 202.62527
19 953.81025 822.4652 545.3815 669.8806 905.92053 242.06198 183.09560 317.05047 492.16359
20 437.57628 424.2853 561.3288 433.3832 380.71643 700.44343 779.69545 553.17267 377.82800
21 708.21466 607.8429 474.3343 494.2166 651.34476 388.40185 431.37223 290.64239 293.77713
22 596.26253 555.3413 600.6829 519.2119 537.23831 653.24192 716.17665 519.39388 392.71873
23 588.92529 598.5716 739.2212 616.3189 537.00372 844.34412 913.78389 703.09174 545.89376
24 1201.03664 1077.2483 817.6552 930.6003 1149.29413 514.77082 441.07256 584.50834 740.30264
25 638.34552 602.4699 649.1795 568.9648 579.77668 690.30501 749.25096 560.09285 440.55647
> sum(Yellow < 12, na.rm = TRUE)
[1] 39

How to plot the first difference of a time series

I know this is pretty basic but I can't seem to get the code to work. I have a time series data set and I am trying to make it stationary by taking the first difference but I don't know what code to use to do that in R. The data is not in a library; I just imported it as a csv file.
What I've tried is plot(diff(data), type="o", main="first difference") and I get the error
Error in r[i1] - r[-length(r):-(length(r) - lag + 1L)] :
non-numeric argument to binary operator
I am new to R so I have no idea what this means.
> dput(hotel)
structure(list(Month = 1:168, Occupancy = c(501L, 488L, 504L,
578L, 545L, 632L, 728L, 725L, 585L, 542L, 480L, 530L, 518L, 489L,
528L, 599L, 572L, 659L, 739L, 758L, 602L, 587L, 497L, 558L, 555L,
523L, 532L, 623L, 598L, 683L, 774L, 780L, 609L, 604L, 531L, 592L,
578L, 543L, 565L, 648L, 615L, 697L, 785L, 830L, 645L, 643L, 551L,
606L, 585L, 553L, 576L, 665L, 656L, 720L, 826L, 838L, 652L, 661L,
584L, 644L, 623L, 553L, 599L, 657L, 680L, 759L, 878L, 881L, 705L,
684L, 577L, 656L, 645L, 593L, 617L, 686L, 679L, 773L, 906L, 934L,
713L, 710L, 600L, 676L, 645L, 602L, 601L, 709L, 706L, 817L, 930L,
983L, 745L, 735L, 620L, 698L, 665L, 626L, 649L, 740L, 729L, 824L,
937L, 994L, 781L, 759L, 643L, 728L, 691L, 649L, 656L, 735L, 748L,
837L, 995L, 1040L, 809L, 793L, 692L, 763L, 723L, 655L, 658L,
761L, 768L, 885L, 1067L, 1038L, 812L, 790L, 692L, 782L, 758L,
709L, 715L, 788L, 794L, 893L, 1046L, 1075L, 812L, 822L, 714L,
802L, 748L, 731L, 748L, 827L, 788L, 937L, 1076L, 1125L, 840L,
864L, 717L, 813L, 811L, 732L, 745L, 844L, 833L, 935L, 1110L,
1124L, 868L, 860L, 762L, 877L)), .Names = c("Month", "Occupancy"
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-168L), spec = structure(list(cols = structure(list(Month = structure(list(), class = c("collector_integer",
"collector")), Occupancy = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("Month", "Occupancy")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
ts
The problem is that you have an R data frame
# The imported object is a tibble/data frame, not a time series object.
# (The question's object is `hotel`; `employment` was never defined here.)
class(hotel)
## [1] "tbl_df"     "tbl"        "data.frame"
and while it is possible to work with that representation it would be easier if you used an R time series object such as a ts object or a zoo object (from the zoo package).
Below we convert the data frame to a ts object and then take the first difference of its log and plot it. Since the times are 1, 2, 3, ... and that is the default we don't have to specify the times when creating the ts object empts
# Convert the Occupancy column of the question's `hotel` data frame to a ts
# object (times default to 1, 2, 3, ...), then plot the first difference of
# its log. The original snippet referenced employment$employmentW, which is
# not defined anywhere in this post.
empts <- ts(hotel$Occupancy)
plot(diff(log(empts)))
(continued after image)
data.frame
If you did want to keep the data frame representation then try this. Note that diff reduces the length by one so we remove the first element of Month to keep the x and y parts the same length so that we are plotting the first difference of the log against 2, 3, 4, ... which is likely what is wanted. With the time series representation this is all automatically taken care of but not with the data frame representation so we have to do it manually.
plot(diff(log(employmentW)) ~ Month[-1], employment, type = "l")
Streamlined, base-R code (using log() since you requested it in a later, nearly identical post):
plot(diff(log(hotel$Occupancy)), type="o", main="first difference")
Here is a solution using the tidyverse approach.
The original data:
head(hotel)
# # A tibble: 6 x 2
# Month Occupancy
# <int> <int>
# 1 1 501
# 2 2 488
# 3 3 504
# 4 4 578
# 5 5 545
# 6 6 632
Calculate differences in occupancy from month to month:
library(dplyr)
# Month-over-month change in occupancy. The first row has no predecessor,
# so lag() leaves an NA there; na.omit() drops that row.
hotel_diff <- mutate(hotel, Difference = Occupancy - lag(Occupancy))
hotel_diff <- na.omit(hotel_diff)
head(hotel_diff)
# # A tibble: 6 x 3
# Month Occupancy Difference
# <int> <int> <int>
# 1 2 488 -13
# 2 3 504 16
# 3 4 578 74
# 4 5 545 -33
# 5 6 632 87
# 6 7 728 96
Plot:
library(ggplot2)
# Line plot of the month-over-month occupancy differences.
ggplot(hotel_diff, aes(x = Month, y = Difference)) +
  geom_line()
If you're new to R, I strongly suggest you skip the base-R data-wrangling and plotting functions and go straight to tidyverse packages such as dplyr and ggplot2. Your life will be much easier. There's a great free book available on this topic here: R for Data Science.

how to access first column after matrix transposed in R

I transposed my data and the original variable names are now in the first column with heading row.names
I need to access this column for my analysis but I cannot.
Sample of data below:
row.names CountryData YR1990 YR1991 YR1992
1 3 AFG 1200 1160 1097
2 4 AGO 320 417 397
3 5 ALB 2794 2071 2269
4 6 ARE 2216 1594 2341
AFTER TRANSPOSED
row.names AFG AGO ALB ARE
1 YR1990 1200 320 2794 2216
2 YR1991 1160 417 2071 1594
3 YR1992 1097 397 2269 2315
4 YR1993 1135 267 2685 858
#Read in data
# ".." in the CSV is read as missing; strings stay character.
>mydata=read.csv('CerialYield.csv',header=TRUE,dec=".",na.strings = c("NA",".."), stringsAsFactors=FALSE)
# Keep only the first 233 rows and first 26 columns.
>mydata=mydata[1:233, 1:26]
# Columns 7:26 hold the per-year values; column 1 is taken separately below.
>CerialData=mydata[,7:26]
>CountryData=mydata[,1]
#Transpose dataframe
# t() returns a matrix; the original column names become row names here.
>CerialCountryData=t(CerialData)
# NOTE(review): the new column names come from the FIRST DATA ROW of the
# transposed matrix, not from CountryData -- confirm that is intended.
>colnames(CerialCountryData) <- CerialCountryData[1,]
# Drop that first row now that it has been consumed as the header.
>CerialCountryData <- CerialCountryData[2:nrow(CerialCountryData), ]
# Back to a data frame; the year labels survive as row names, not a column
# (hence the need for row.names() to access them, per the answers below).
>CerialCountryData=as.data.frame(as.matrix(CerialCountryData))
structure(list(AFG = c(1200L, 1160L, 1097L, 1135L, 1140L, 1219L,
1203L, 1349L, 1389L, 1286L), AGO = c(320L, 417L, 397L, 267L,
298L, 402L, 653L, 567L, 701L, 620L), ALB = c(2794L, 2071L, 2269L,
2685L, 2460L, 2841L, 2450L, 2833L, 2872L, 2798L)), .Names = c("AFG",
"AGO", "ALB"), row.names = c("YR1990", "YR1991", "YR1992", "YR1993",
"YR1994", "YR1995", "YR1996", "YR1997", "YR1998", "YR1999"), class = "data.frame")
So, in your data.frame these are row names and not a separate column. In order to access those names you need the row.names function i.e.:
Data
structure(list(AFG = c(1200L, 1160L, 1097L, 1135L, 1140L, 1219L,
1203L, 1349L, 1389L, 1286L), AGO = c(320L, 417L, 397L, 267L,
298L, 402L, 653L, 567L, 701L, 620L), ALB = c(2794L, 2071L, 2269L,
2685L, 2460L, 2841L, 2450L, 2833L, 2872L, 2798L)), .Names = c("AFG",
"AGO", "ALB"), row.names = c("YR1990", "YR1991", "YR1992", "YR1993",
"YR1994", "YR1995", "YR1996", "YR1997", "YR1998", "YR1999"), class = "data.frame")
Solution
> row.names(df)
[1] "YR1990" "YR1991" "YR1992" "YR1993" "YR1994" "YR1995" "YR1996" "YR1997" "YR1998" "YR1999"
You can store that into a variable and then use it as a vector.
Hope it helps.
have you tried:
>table_name$row.names
or
>table_name[,1]

Resources