Inverse Association Rules - associations

Association rules are a very common technique when you want to figure out which events happens together (like burger and bread mostly sell together). In marketing this technique is used to find out the complimentary products.
I am looking for a technique to extract the "Substitute Products" and to be it is like Inverse Association rules to find out which events are less likely happens together. Is there any algorithm or technique available in Spark, R, Python, etc. for this?
Thanks,
Amir

I've done an a very practical implementation for Substitution Rule Mining using Teng, Hsieh and Chen (2002) for R. Maybe it can help you:
# Used packages:
library(arules)
SRM <- function(TransData, MinSup, MinConf, pMin, pChi, itemLabel, nTID){
# Packages ----------------------------------------------------------------
if (sum(search() %in% "package:arules") == 0) {
stop("Please load package arules")
}
# Checking Input data -----------------------------------------------------
if (missing(TransData)) {
stop("Transaction data is missing")
}
if (is.numeric(nTID) == FALSE) {
stop("nTID has to be one numeric number for the count of
Transactions")
}
if (length(nTID) > 1) {
stop("nTID has to be one number for the count of Transactions")
}
if (is.character(itemLabel) == FALSE) {
stop("itemLabel has to be a character")
}
# Concrete Item sets ---------------------------------------------------
# adding complements to transaction data
compl_trans <- addComplement(TransData,labels = itemLabel)
compl_tab <- crossTable(compl_trans,"support")
compl_tab_D <- as.data.frame(compl_tab)
# ordering matrix
compl_tab_D <- compl_tab_D[order(rownames((compl_tab))),order(colnames((compl_tab)))]
# Chi Value ---------------------------------------------------------------
# empty data frame for loop
complement_data <- data.frame(Chi = as.numeric(),
Sup_X.Y = as.numeric(),
X = as.character(),
Sup_X = as.numeric(),
Y = as.character(),
Sup_Y = as.numeric(),
CX = as.character(),
SupCX = as.numeric(),
CY = as.character(),
Sup_CY = as.numeric(),
Conf_X.CY = as.numeric(),
Sup_X.CY = as.numeric(),
Conf_Y.CX = as.numeric(),
SupY_CX = as.numeric())
# first loop for one item
for ( i in 1 : (length(itemLabel) - 1)) {
# second loop combines it with all other items
for (u in (i + 1) : length(itemLabel)) {
# getting chi value from Teng
a <- itemLabel[i]
b <- itemLabel[u]
ca <- paste0("!", itemLabel[i])
cb <- paste0("!", itemLabel[u])
chiValue <- nTID * (
compl_tab[ca, cb] ^ 2 / (compl_tab[ca, ca] * compl_tab[cb, cb]) +
compl_tab[ca, b] ^ 2 / (compl_tab[ca, ca] * compl_tab[b, b]) +
compl_tab[a, cb] ^ 2 / (compl_tab[a, a] * compl_tab[cb, cb]) +
compl_tab[a, b] ^ 2 / (compl_tab[a, a] * compl_tab[b, b]) - 1)
# condition to be dependent
if (compl_tab[a, b] > compl_tab[a, a] * compl_tab[b, b] && chiValue >= qchisq(pChi, 1) &&
compl_tab[a, a] >= MinSup && compl_tab[b, b] >= MinSup ) {
chi_sup <- data.frame(Chi = chiValue,
Sup_X.Y = compl_tab[a, b],
X = a,
Sup_X = compl_tab[a, a],
Y = b,
Sup_Y = compl_tab[b, b],
CX = ca,
SupCX = compl_tab[ca, ca],
CY = cb,
Sup_CY = compl_tab[cb, cb],
Conf_X.CY = compl_tab[a, cb] / compl_tab[a, a],
Sup_X.CY = compl_tab[a, cb],
Conf_Y.CX = compl_tab[ca, b] / compl_tab[b, b],
SupY_CX = compl_tab[ca, b])
try(complement_data <- rbind(complement_data, chi_sup))
}
}
}
if (nrow(complement_data) == 0) {
stop("No complement item sets could have been found")
}
# changing mode of
complement_data$X <- as.character(complement_data$X)
complement_data$Y <- as.character(complement_data$Y)
# calculating support for concrete itemsets with all others and their complements -------------------
## with complements
matrix_trans <- as.data.frame(as(compl_trans, "matrix"))
sup_three <- data.frame(Items = as.character(),
Support = as.numeric())
setCompl <- names(matrix_trans)
# 1. extracts all other values than that are not in the itemset
for (i in 1 : nrow(complement_data)) {
value <- setCompl[ !setCompl %in% c(complement_data$X[i],
complement_data$Y[i],
paste0("!", complement_data$X[i]),
paste0("!",complement_data$Y[i]))]
# 2. calculation of support
for (u in value) {
count <- sum(rowSums(matrix_trans[, c(complement_data$X[i], complement_data$Y[i], u )]) == 3)
sup <- count / nTID
sup_three_items <- data.frame(Items = paste0(complement_data$X[i], complement_data$Y[i], u),
Support=sup)
sup_three <- rbind(sup_three, sup_three_items)
}
}
# Correlation of single items-------------------------------------------------------------
# all items of concrete itemsets should be mixed for correlation
combis <- unique(c(complement_data$X, complement_data$Y))
# empty object
rules<- data.frame(
Substitute = as.character(),
Product = as.character(),
Support = as.numeric(),
Confidence = as.numeric(),
Correlation = as.numeric())
# first loop for one item
for (i in 1 : (length(combis) - 1)) {
# second loop combines it with all other items
for (u in (i + 1) : length(combis)) {
first <- combis[i]
second <- combis[u]
corXY <- (compl_tab[first, second] - (compl_tab[first, first] * compl_tab[second, second])) /
(sqrt((compl_tab[first, first] * (1 - compl_tab[first,first])) *
(compl_tab[second, second] * (1 - compl_tab[second, second]))))
# confidence
conf1 <- compl_tab[first, paste0("!", second)] / compl_tab[first, first]
conf2 <- compl_tab[second, paste0("!", first)] / compl_tab[second, second]
two_rules <- data.frame(
Substitute = c(paste("{", first, "}"),
paste("{", second, "}")),
Product = c(paste("=>", "{", second, "}"),
paste("=>", "{", first, "}")),
Support = c(compl_tab[first, paste0("!", second)], compl_tab[second, paste0("!", first)]),
Confidence = c(conf1, conf2),
Correlation = c(corXY, corXY)
)
# conditions
try({
if (two_rules$Correlation[1] < pMin) {
if (two_rules$Support[1] >= MinSup && two_rules$Confidence[1] >= MinConf) {
rules <- rbind(rules, two_rules[1, ])
}
if (two_rules$Support[2] >= MinSup && two_rules$Confidence[2] >= MinConf) {
rules <- rbind(rules, two_rules[2, ])
}
} })
}
}
# Correlation of concrete item pairs with single items --------------------
# adding variable for loop
complement_data$XY <- paste0(complement_data$X, complement_data$Y)
# combination of items
for (i in 1 : nrow(complement_data)){
# set of combinations from dependent items with single items
univector <- c(as.vector(unique(complement_data$X)), as.vector(unique(complement_data$Y)))
univector <- univector[!univector %in% c(complement_data$X[i], complement_data$Y[i])]
combis <- c(complement_data[i,"XY"], univector)
for (u in 2 : length(combis)) {
corXYZ <-(sup_three[sup_three$Items == paste0(combis[1], combis[u]),2] -
complement_data[complement_data$XY == combis[1],"Sup_X.Y"] *
compl_tab[combis[u],combis[u]]) /
(sqrt((complement_data[complement_data$XY == combis[1],"Sup_X.Y"] *
(1 - complement_data[complement_data$XY == combis[1],"Sup_X.Y"]) *
compl_tab[combis[u],combis[u]] * (1 - compl_tab[combis[u],combis[u]]))))
dataXYZ <- data.frame(
Substitute = paste("{", combis[1], "}"),
Product = paste("=>", "{", combis[u], "}"),
Support = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2],
Confidence = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2] /
complement_data[complement_data$XY == combis[1],"Sup_X.Y"],
Correlation = corXYZ)
# conditions
if (dataXYZ$Correlation < pMin && dataXYZ$Support >= MinSup && dataXYZ$Confidence >= MinConf) {
try(rules <- rbind(rules, dataXYZ))
}
}
}
if (nrow(rules) == 0) {
message("Sorry no rules could have been calculated. Maybe change input conditions.")
} else {
return(rules)
}
# end
}
I think a better explanation is in my blog:
http://mattimeyer.github.io/2016-12-21-Substitution-Rule-Mining/

Related

Dynamic Programming in R - Algorithm for Distance Between Warehouse Locations

I am trying to calculate the distance between warehouse locations in rStudio utilizing an algorithm from an academic paper. The formula accounts for dimensions of location width, depth, and side of the aisle. The added complexity comes in when calculating the shortest distance with multiple cross aisles. This is all based on this paper.
This is from a bird's eye view:
I have static values for α = 48, ß = 96, ϒ = 108, Ω = 75, S = 22.
I then have a data frame with i as the key for location number, X for aisle number, y for section number, z for side number, and Cross Aisle is a boolean (0 = not a cross-aisle, 1 = is a cross-aisle). Here is a sample:
i X Y Z Cross Aisle
1 1 1 1 0
2 1 2 1 0
....
357 12 20 2 0
These are the formulas between locations i and j if the warehouse had no cross-aisles and was one rectangular grid:
Items in the same aisle (xi = xj):
dij = |yi - yj| * ß + |zi - zj| * ϒ
If items are in different aisles, there are three different scenarios:
dij = {|xi - xj| * (2α + ϒ) + v} if zi = zj
dij = {(xj - xi) * (2α + ϒ) + ϒ + v} if zi = 1, zj = 2
dij = {(xj - xi) * (2α + ϒ) - ϒ + v} if zi = 2, zj = 1
where v is the "vertical" distance (bird's eye, up-down aisle):
v = min(ß * (2 * S - yi - yj), ß * (yi + yj)) + 2Ω
(*Note: the academic paper has a typo in the v formula. It states 2 - yi - yj in the first bit, but I found another, original source that had it correctly as 2 * S-yi - yj.)
This piece of the formula is not the tricky part. It is a fairly simple if/then function in R to compute. However, this next bit with the cross-aisle gets me.
This is from the academic paper:
The authors state essentially: There are two locations p1 and p2. There are two neighboring cross-aisles, a1 and a2. One is above p1 and the other is below p1. Also, cross-aisles b1 and b2 are found, which are neighboring p2 and lead left. The distance between p1 and p2 are as follows:
d(p1,p2) = min{d(p1,ai) + d(ai,bj) + d(bj,p2),i,j ∈ {1,2}}
I am unsure how to apply this algorithm to my data set and construct the necessary loops, and matrix to find the distances between my warehouse locations. I could really use some help here making sense of it.
Here is my actual data set.
Here is an image of my warehouse to give you a sense of the layout. The "X" locations are cross-aisles.
I was able to get a workable loop without the cross-aisles:
whse.data <- read.xlsx("data set.xlsx", sheet = 1) %>%
as.data.frame()
### CREATE COMBINATION OF LOCATIONS
require(tools)
cmbn.loc <- combinations(n = max(whse.data$i), r = 2, v = whse.data$i,
repeats.allowed = FALSE) %>%
as.data.frame()
### CALCULATE DISTANCE BETWEEN LOCATIONS
LocDst <- function(dc, wc, wa, tr, s, df, comb){
# Creates a distance between various locations
#
# Args:
# dc: Depth of cell (alpha)
# wc: Width of cell (beta)
# wa: Width of aisle (y)
# tr: turning radius (omega)
# s: number of sections (S)
# df: Data Frame with location i, x, y, z, Cross.Aisle data
# comb: Combination of locations to compare
#
# Returns:
# Data frame with distances between each location combination
#
dist.df_total <- data.frame()
for (n in 1:nrow(comb)){
i <- comb[n,1]
j <- comb[n,2]
xi <- df[df$i == i,2]
yi <- df[df$i == i,3]
zi <- df[df$i == i,4]
xj <- df[df$i == j,2]
yj <- df[df$i == j,3]
zj <- df[df$i == j,4]
v <- min(wc * (2 * s - yi - yj), wc * (yi + yj)) + 2 * tr
if(xi == xj){
dij <- abs(yi - yj) * wc + abs(zi - zj) * wa
} else if (zi == zj){
dij <- (abs(xi - xj) * (2 * dc + wa) + v)
} else if (zi == 1 & zj == 2){
dij <- ((xj - xi) * (2 * dc + wa) + wa + v)
} else {
dij <- ((xj - xi) * (2 * dc * wa) - wa + v)
}
dist.df <- data.frame(`i` = i, `j` = j, dist = dij)
dist.df_total <- rbind.data.frame(dist.df_total, dist.df)
}
return(dist.df_total)
}
dist <- LocDst(48, 96, 108, 75, 18, whse.data, cmbn.loc)
I need a workable for loop or something to be run Algorithm 1 above, please.
I was able to get something to work. If anyone has anything more straightforward, I am all ears. Maybe this will be helpful to someone out there!
I had to use Excel to calculate the distance between the cross-aisles. There's probably a code for it, but it wasn't value-add for me at this time. Here's a sample of that data:
V1 V2 Dist
7 18 672
7 19 780
7 33 204
....
341 342 108
where V1 represents the first location number and V2 the second for all cross-aisle combinations.
Everything else should be computed within the code (beyond what put above):
require(dplyr)
require(openxlsx)
require(tools)
whse.data <- read.xlsx("data set.xlsx", sheet = 1) %>%
as.data.frame()
### CREATE COMBINATION OF LOCATIONS
cmbn.loc <- combinations(n = max(whse.data$i), r = 2, v = whse.data$i,
repeats.allowed = FALSE) %>%
as.data.frame()
# CROSS-AISLES IN EACH SHELF
ca.shelf <- cross.aisles %>%
group_by(Shelf) %>%
summarise(No.Cross.Aisles = sum(Cross.Aisle)) %>%
as.data.frame()
# DISTANCE BETWEEN CROSS AISLES
cmbn.cross.aisle <- combinations(n = nrow(cross.aisles),
r = 2,
v = cross.aisles$i,
repeats.allowed = FALSE) %>%
as.data.frame()
dist.cross.aisle <- read.xlsx("Combination of Cross-Aisles v3.xlsx", sheet = 1) %>%
as.data.frame()
# CROSS AISLE FUNCTION
CrsAisDst <- function(dc, wc, wa, tr, s, no.sh, df, comb, ca.m, d.m){
# Creates a distance between various locations
#
# Args:
# dc: Depth of cell (alpha)
# wc: Width of cell (beta)
# wa: Width of aisle (y)
# tr: turning radius (omega)
# s: number of sections (S)
# no.sh: number of shelves
# df: Data Frame with location i, x, y, z, Cross.Aisle data
# comb: Combination of locations to compare
# ca.m: Cross-aisles matrix
# d.m: Distances between cross-aisles
#
# Returns:
# Data frame with distances between each location combination
#
dist.df_total <- data.frame()
for (n in 1:nrow(comb)){
i <- comb[n,1]
j <- comb[n,2]
xi <- df[df$i == i,2]
yi <- df[df$i == i,3]
zi <- df[df$i == i,4]
xj <- df[df$i == j,2]
yj <- df[df$i == j,3]
zj <- df[df$i == j,4]
v <- min(wc * (2 * s - yi - yj), wc * (yi + yj)) + 2 * tr
if(xi == xj){
min.dij <- abs(yi - yj) * wc + abs(zi - zj) * wa
} else {
shi <- df[df$i == i,6]
shj <- df[df$i == j,6]
### CROSS-AISLES
ca.i <- #ca.m[ca.m$Shelf == shi,1]
data.frame(`i` = ca.m[ca.m$Shelf == shi,1])
ca.j <- #ca.m[ca.m$Shelf == shj,1]
data.frame(`j` = ca.m[ca.m$Shelf == shj,1])
## JOIN DISTANCES
dist.df_total.i <- data.frame()
dist.df_total.j <- data.frame()
#
for (m in 1:nrow(ca.i)){
i.i <- i
j.i <- ca.i[m,]
xi.i <- df[df$i == i.i,2]
yi.i <- df[df$i == i.i,3]
zi.i <- df[df$i == i.i,4]
xj.i <- df[df$i == j.i,2]
yj.i <- df[df$i == j.i,3]
zj.i <- df[df$i == j.i,4]
dij.i <- abs(yi.i - yj.i) * wc + abs(zi.i - zj.i) * wa
dist.df.i <- data.frame(`i` = i.i, `j` = j.i, dist = dij.i)
dist.df_total.i <- rbind.data.frame(dist.df_total.i, dist.df.i)
}
for (l in 1:nrow(ca.j)){
i.j <- j
j.j <- ca.j[l,]
xi.j <- df[df$i == i.j,2]
yi.j <- df[df$i == i.j,3]
zi.j <- df[df$i == i.j,4]
xj.j <- df[df$i == j.j,2]
yj.j <- df[df$i == j.j,3]
zj.j <- df[df$i == j.j,4]
dij.j <- abs(yi.j - yj.j) * wc + abs(zi.j - zj.j) * wa
dist.df.j <- data.frame(`i` = i.j, `j` = j.j, dist = dij.j)
dist.df_total.j <- rbind.data.frame(dist.df_total.j, dist.df.j)
}
min.i <- dist.df_total.i %>% slice(which.min(dist))
min.j <- dist.df_total.j %>% slice(which.min(dist))
aisle <- data.frame(V1=min.i$j,V2=min.j$j)
dist.aisle <- semi_join(d.m, aisle, by = c("V1", "V2"))
# CALCULATING DISTANCE WITHOUT CROSS-AISLES
if (zi == zj){
dij <- (abs(xi - xj) * (2 * dc + wa) + v)
} else if (zi == 1 & zj == 2){
dij <- ((xj - xi) * (2 * dc + wa) + wa + v)
} else {
dij <- ((xj - xi) * (2 * dc * wa) - wa + v)
}
min.dij <- min(dij, (min.i$dist + min.j$dist + dist.aisle$Dist))
}
dist.df <- data.frame(`i` = i, `j` = j, dist = min.dij)
dist.df_total <- rbind.data.frame(dist.df_total, dist.df)
}
return(dist.df_total)
}
aisle.dist <- CrsAisDst(48, 96, 108, 75, 18, 23, whse.data, cmbn.loc, cross.aisles,
dist.cross.aisle)
Output looks like this:
i j dist
7 18 672
7 19 780
7 33 204
....
341 342 108
(Note: this last same I ran was just among cross-aisles, which is why the numbers look the same. I have tested it, though, and it will use the regular formula if it is less distance.)

Jaccard Analysis Similarity

I'm trying to do a Jaccard Analysis from R.
But, after the processing, my result columns are NULL.
I've tried to do a solution from many ways, but the problem still remains.
Can anyone help me please?
#######################PROCESSING JACCARD ANAYLSIS###################
Jaccard = function (x, y) {
M.11 = sum(x == 1 & y == 1)
M.10 = sum(x == 1 & y == 0)
M.01 = sum(x == 0 & y == 1)
return (M.11 / (M.11 + M.10 + M.01))
}
input.variables = data.frame(Q6_01, Q6_02, Q6_03, Q6_04, Q6_05, Q6_06, Q6_07, Q6_08)
m = matrix(data = NA, nrow = length(input.variables), ncol = length(input.variables))
for (r in 1:length(input.variables)) {
for (c in 1:length(input.variables)) {
if (c == r) {
m[r,c] = 1
} else if (c > r) {
m[r,c] = Jaccard(input.variables[,r], input.variables[,c])
}
}
}
variable.names = sapply(input.variables, attr, "label")
colnames(m) = variable.names
rownames(m) = variable.names
jaccards = m
If you are trying to calculate Jaccard similarity, I would suggest checking out the vegan package. The vegdist() function can be used to calculate lots of different similarity/dissimilarity measures. Check out ?vegdist.
For Jaccard, you would use vegdist(your_dataframe_name, method = "jaccard")

avoid R loop and parallelize with snow

I have a large loop that will take too long (~100 days). I'm hoping to speed it up with the snow library, but I'm not great with apply statements. This is only part of the loop, but if I can figure this part out, the rest should be straightforward. I'm ok with a bunch of apply statements or loops, but one apply statement using a function to get object 'p' would be ideal.
Original data
dim(m1) == x x # x >>> 0
dim(m2) == y x # y >>> 0, y > x, y > x-10
dim(mout) == x x
thresh == x-10 #specific to my data, actual number probably unimportant
len(v1) == y #each element is a random integer, min==1, max==thresh
len(v2) == y #each element is a random integer, min==1, max==thresh
Original loop
p <- rep(NA,y)
for (k in 1:y){
mout <- m1 * matrix(m2[k,],x,x)
mout <- mout/sum(mout)
if (v1[k] < thresh + 1){
if(v2[k] < thresh + 1){
p[k] <- out[v1[k],v2[k]]
}
if(v2[k] > thresh){
p[k] <- sum(mout[v1[k],(thresh+1):x])
}
}
#do stuff with object 'p'
}
library(snow)
dostuff <- function(k){
#contents of for-loop
mout <- m1 * matrix(m2[k,],x,x)
mout <- mout/sum(mout)
if (v1[k] < thresh + 1){
if(v2[k] < thresh + 1){
p <- out[v1[k],v2[k]]
}
if(v2[k] > thresh){
p <- sum(mout[v1[k],(thresh+1):x])
}
}
#etc etc
return(list(p,
other_vars))
}
exports = c('m1',
'm2',
'thresh',
'v1',
'x' ,
'v2')
cl = makeSOCKcluster(4)
clusterExport(cl,exports)
loop <- as.array(1:y)
out <- parApply(cl,loop,1,dostuff)
p <- rep(NA,y)
for(k in 1:y){
p[k] <- out[[k]][[1]]
other_vars[k] <- out[[k]][[2]]
}

Error in seq.default(a, length = max(0, b - a - 1)) : length must be non-negative number

I tried running the code below.
set.seed(307)
y<- rnorm(200)
h2=0.3773427
t=seq(-3.317670, 2.963407, length.out=500)
fit=density(y, bw=h2, n=1024, kernel="epanechnikov")
integrate.xy(fit$x, fit$y, min(fit$x), t[407])
However, i recived the following message:
"Error in seq.default(a, length = max(0, b - a - 1)) :
length must be non-negative number"
I am not sure what's wrong.
I do not encounter any problem when i use t[406] or t[408] as follow:
integrate.xy(fit$x, fit$y, min(fit$x), t[406])
integrate.xy(fit$x, fit$y, min(fit$x), t[408])
Does anyone know what's the problem and how to fix it? Appreciate your help please. Thanks!
I went through the source code for the integrate.xy function, and there seems to be a bug relating to the usage of the xtol argument.
For reference, here is the source code of integrate.xy function:
function (x, fx, a, b, use.spline = TRUE, xtol = 2e-08)
{
dig <- round(-log10(xtol))
f.match <- function(x, table) match(signif(x, dig), signif(table,
dig))
if (is.list(x)) {
fx <- x$y
x <- x$x
if (length(x) == 0)
stop("list 'x' has no valid $x component")
}
if ((n <- length(x)) != length(fx))
stop("'fx' must have same length as 'x'")
if (is.unsorted(x)) {
i <- sort.list(x)
x <- x[i]
fx <- fx[i]
}
if (any(i <- duplicated(x))) {
n <- length(x <- x[!i])
fx <- fx[!i]
}
if (any(diff(x) == 0))
stop("bug in 'duplicated()' killed me: have still multiple x[]!")
if (missing(a))
a <- x[1]
else if (any(a < x[1]))
stop("'a' must NOT be smaller than min(x)")
if (missing(b))
b <- x[n]
else if (any(b > x[n]))
stop("'b' must NOT be larger than max(x)")
if (length(a) != 1 && length(b) != 1 && length(a) != length(b))
stop("'a' and 'b' must have length 1 or same length !")
else {
k <- max(length(a), length(b))
if (any(b < a))
stop("'b' must be elementwise >= 'a'")
}
if (use.spline) {
xy <- spline(x, fx, n = max(1024, 3 * n))
if (xy$x[length(xy$x)] < x[n]) {
if (TRUE)
cat("working around spline(.) BUG --- hmm, really?\n\n")
xy$x <- c(xy$x, x[n])
xy$y <- c(xy$y, fx[n])
}
x <- xy$x
fx <- xy$y
n <- length(x)
}
ab <- unique(c(a, b))
xtol <- xtol * max(b - a)
BB <- abs(outer(x, ab, "-")) < xtol
if (any(j <- 0 == apply(BB, 2, sum))) {
y <- approx(x, fx, xout = ab[j])$y
x <- c(ab[j], x)
i <- sort.list(x)
x <- x[i]
fx <- c(y, fx)[i]
n <- length(x)
}
ai <- rep(f.match(a, x), length = k)
bi <- rep(f.match(b, x), length = k)
dfx <- fx[-c(1, n)] * diff(x, lag = 2)
r <- numeric(k)
for (i in 1:k) {
a <- ai[i]
b <- bi[i]
r[i] <- (x[a + 1] - x[a]) * fx[a] + (x[b] - x[b - 1]) *
fx[b] + sum(dfx[seq(a, length = max(0, b - a - 1))])
}
r/2
}
The value given to the xtol argument, is being overwritten in the line xtol <- xtol * max(b - a). But the value of the dig variable is calculated based on the original value of xtol, as given in the input to the function. Because of this mismatch, f.match function, in the line bi <- rep(f.match(b, x), length = k), returns no matches between x and b (i.e., NA). This results in the error that you have encountered.
A simple fix, at least for the case in question, would be to remove the xtol <- xtol * max(b - a) line. But, you should file a bug report with the maintainer of this package, for a more rigorous fix.

Implementing additional standard run rules with R and qcc

I am a newbie with R, and would like to understand what it can do for control charting. I have read articles on qcc and created sample charts in R studio based on my own datasets to generate graphics or simply the underlying data.
It appears that two out of the shewhart control/run rules are implemented in QCC (+/- 3 sigma and a string above/below center), but more have been defined and are frequently used in practice. e.g. Nelson rules
Is there an R library/function that implements these? In addition to implementing the rules, I want to support the option to specify the "constant" related to the rule. For example where the referenced article says "Eight points in a row.." I would like eight to be a parameter. I'm thinking that the $data output from the qcc command could be passed as an argument (along with vector of rule "constant" options), and in return would be a list of violation points and rule number violated.
Any thoughts / suggestions?
We're working on the implementation of Nelson Rules in R. I think this is exactly what you're looking for (happy to share, I couldn't find an R implementation anywhere else on the internet):
nelsonr1 <- function(x, m = mean(x), s = sd(x)) {
# Nelson's QC rule 1: detect values outside + or -3 sd
which(abs((x - m) / s) >= 3)
}
nelsonr2 <- function(x, m = mean(x), minrun = 9) {
# Nelson's QC rule 2: detect runs of >= 9 points on the same side of the mean
n <- length(x)
counts <- sign(x - m)
result <- counts
for (runlength in 2:minrun)
result <- result + c(counts[runlength:n], rep(0, runlength - 1))
which(abs(result) >= minrun)
}
nelsonr3 <- function(x, minrun = 6) {
# Nelson's QC rule 3: detect strict increase or decrease in >= 6 points in a row
# Between 6 points you have 5 instances of increasing or decreasing. Therefore minrun - 1.
n <- length(x)
signs <- sign(c(x[-1], x[n]) - x)
counts <- signs
for (rl in 2:(minrun - 1)) {
counts <- counts + c(signs[rl:n], rep(0, rl - 1))
}
which(abs(counts) >= minrun - 1)
}
nelsonr4 <- function(x, m = mean(x), minrun = 14, directing_from_mean = FALSE) {
# Nelson's QC rule 4: 14 points in a row alternating in direction from the mean,
# or 14 points in a row alternating in increase and decrease
n <- length(x)
if (directing_from_mean == TRUE) {
signs <- sign(x - m)
} else {
signs <- sign(c(x[-1],x[n]) - x)
}
counts <- signs
fac <- -1
for (rl in 2:minrun) {
counts <- counts + fac * c(signs[rl:n], rep(0, rl - 1))
fac <- -fac
}
counts <- abs(counts)
which(counts >= minrun)
}
nelsonr5 <- function(x, m = mean(x), s = sd(x), minrun = 3) {
# Nelson's QC rule 5: two out of 3 >2 sd from mean in the same direction
n <- length(x)
pos <- 1 * ((x - m) / s > 2)
neg <- 1 * ((x - m) / s < -2)
poscounts <- pos
negcounts <- neg
for (rl in 2:minrun) {
poscounts <- poscounts + c(pos[rl:n], rep(0, rl - 1))
negcounts <- negcounts + c(neg[rl:n], rep(0, rl - 1))
}
counts <- apply(cbind(poscounts, negcounts), 1, max)
which(counts >= minrun -1)
}
nelsonr6 <- function(x, m = mean(x), s = sd(x), minrun = 5) {
# Nelson's QC rule 6: four out of five > 1 sd from mean in the same direction
n <- length(x)
pos <- 1 * ((x - m) / s > 1)
neg <- 1 * ((x - m) / s < -1)
poscounts <- pos
negcounts <- neg
for (rl in 2:minrun) {
poscounts <- poscounts + c(pos[rl:n], rep(0, rl - 1))
negcounts <- negcounts + c(neg[rl:n], rep(0, rl - 1))
}
counts <- apply(cbind(poscounts, negcounts), 1, max)
which(counts >= minrun - 1)
}
nelsonr7 <- function(x, m = mean(x), s = sd(x), minrun = 15) {
# Nelson's QC rule 7: >= 15 points in a row within 1 sd from the mean
n <- length(x)
within <- 1 * (abs((x - m) / s) < 1)
counts <- within
for (rl in 2:minrun)
counts <- counts + c(within[rl:n], rep(0, rl - 1))
which(counts >= minrun)
}
nelsonr8 <- function(x, m = mean(x), s = sd(x), minrun = 8) {
# Nelson's QC rule 8: >= 8 points in a row all outside the m + -1s range
n <- length(x)
outofrange <- 1 * (abs((x - m) / s) > 1)
counts <- outofrange
for (rl in 2:minrun)
counts <- counts + c(outofrange[rl:n], rep(0, rl - 1))
which(counts >= minrun)
}
For example where the referenced article says "Eight points in a row.." I would like eight to be a parameter.
That's what this does too with the parameter minrun in some functions.

Resources