How to simplifying this R Code to detect repeated sequence - r

I found this below function to detect repeated sequence. I integrate the function into Monte Carlo Simulation to calculate the probability. The function I have is too long and takes too much time during the simulation. I would appreciate if anyone can help to simply the function and in turn fasten any simulation depends on it.
V1 <- c(68,71,72,69,80,78,80,81,84,82,67,73,65,68,66,70,69,72,74,73,68,75,70,72,75,73,69,75,74,79,80,78,80,81,79,82,69,73,67,66,70,72,69,72,75,80,68,69,71,77,70,73)
Check_repeat_Seq <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
return(max(dat$repeat_length))
}
Check_repeat_Seq(V1)
#### Can you please simplify the following also to calculate the sum of repeated.####
Check_repeat_Seq_no_overlap_sum <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
vec <- unlist(unname(L))
nms <- names(vec)
is_le <- function(i) any(grepl(nms[i], tail(nms, -i)) & (vec[i] <= tail(vec, -i)))
LL <- vec[ ! sapply(seq_along(nms), is_le) ]
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
dat$total_repeat <- dat$seq_rep*dat$repeat_length
return(sum(dat$total_repeat))
}
##### the original function should return data Frame as follows
Check_All_repeat_Seq<- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
vec <- unlist(unname(L))
nms <- names(vec)
is_le <- function(i) any(grepl(nms[i], tail(nms, -i)) & (vec[i] <= tail(vec, -i)))
LL <- vec[ ! sapply(seq_along(nms), is_le) ]
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
dat$total_repeat <- dat$seq_rep*dat$repeat_length
return(sum(dat))
}
please help simplifying the code with the same output

Update
An even faster iterative approach leveraging the Cantor pairing function:
allDup <- function(x) {
duplicated(x) | duplicated(x, fromLast = TRUE)
}
fPair <- function(i, j) {
# Cantor pairing function
k <- j + (i + j)*(i + j + 1L)/2L
match(k, unique(k))
}
Check_repeat_Seq3 <- function(v) {
v <- match(v, unique(v))
vPair <- fPair(head(v, -1), tail(v, -1))
blnKeep <- allDup(vPair)
idx <- which(blnKeep)
len <- 1L
while (length(idx)) {
len <- len + 1L
vPair <- fPair(vPair[blnKeep], v[idx + len])
blnKeep <- allDup(vPair)
idx <- idx[blnKeep]
}
return(len)
}
# benchmark against the rollaply solution
V1 <- c(68,71,72,69,80,78,80,81,84,82,67,73,65,68,66,70,69,72,74,73,68,75,70,72,75,73,69,75,74,79,80,78,80,81,79,82,69,73,67,66,70,72,69,72,75,80,68,69,71,77,70,73)
Check_repeat_Seq <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
return(max(dat$repeat_length))
}
Check_repeat_Seq(V1)
#> [1] 4
Check_repeat_Seq3(V1)
#> [1] 4
microbenchmark::microbenchmark(Check_repeat_Seq(V1), Check_repeat_Seq3(V1))
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> Check_repeat_Seq(V1) 38445.7 40860.95 43153.058 42249.25 44051.15 60593.2 100
#> Check_repeat_Seq3(V1) 103.9 118.65 150.713 149.05 160.05 465.2 100
Original Solution
Check_repeat_Seq2 <- function(v) {
m <- matrix(c(head(v, -1), tail(v, -1)), ncol = 2)
idx <- which(duplicated(m) | duplicated(m, fromLast = TRUE))
len <- 2L
while (length(idx)) {
len <- len + 1L
m <- matrix(v[sequence(rep(len, length(idx)), idx)], ncol = len, byrow = TRUE)
idx <- idx[duplicated(m) | duplicated(m, fromLast = TRUE)]
}
return(len - 1L)
}
UPDATE 2
This should return your dat data.frame:
Check_repeat_Seq3 <- function(v) {
v1 <- match(v, unique(v))
vPair <- fPair(head(v1, -1), tail(v1, -1))
blnKeep <- allDup(vPair)
idx <- which(blnKeep)
if (length(idx)) {
len <- 1L
seq_rep <- integer(length(v)/2)
while (length(idx)) {
len <- len + 1L
vPair <- fPair(vPair[blnKeep], v1[idx + len])
blnKeep <- allDup(vPair)
seq_rep[len] <- nrow(unique(matrix(v[sequence(rep(len, length(blnKeep)), idx)], ncol = len, byrow = TRUE)))
idx <- idx[blnKeep]
}
len <- 2:len
return(data.frame(seq_rep = seq_rep[len], repeat_length = len, total_repeat = seq_rep[len]*len))
} else {
return(data.frame(seq_rep = integer(0), repeat_length = integer(0), total_repeat = integer(0)))
}
}

Related

Why do I get the error "number of items to replace is not a multiple of replacement length" when running the sppba function of the WRS2 package?

I would be super grateful for some help. I don't have a coding background and I am confused by the error message I am getting when running the sppb functions of the WRS2 package. These functions perform a robust mixed ANOVA using bootstrapping.
sppba(formula = score ~ my_between_variable * my_within_variable, id = participant_code, data = df_long_T2)
Error in xmat[, k] <- x[[kv]] :
number of items to replace is not a multiple of replacement length
I get the same error for all three sppb functions. The functions look the same except that instead of sppba the others say sppbb and sppbi. I don't even know what the functions are trying to replace. The functions work for me with other data.
The classes of all the things involved seem fine: score is numeric, order_supplement and time are factors, participant_code is character, df_long_T2 is a dataframe. I have 120 participants, 61 in one group and 59 in the other, with two observations per participant. There are no NAs in the columns involved.
Traceback() just gives me the one line of code above and the error message.
Debug() gives me this and I don't know what to make of it:
"Debug location is approximate because location is not available"
function (formula, id, data, est = "mom", avg = TRUE, nboot = 500,
MDIS = FALSE, ...)
{
if (missing(data)) {
mf <- model.frame(formula)
}
else {
mf <- model.frame(formula, data)
}
cl <- match.call()
est <- match.arg(est, c("mom", "onestep", "median"), several.ok = FALSE)
mf1 <- match.call()
m <- match(c("formula", "data", "id"), names(mf1), 0L)
mf1 <- mf1[c(1L, m)]
mf1$drop.unused.levels <- TRUE
mf1[[1L]] <- quote(stats::model.frame)
mf1 <- eval(mf1, parent.frame())
random1 <- mf1[, "(id)"]
depvar <- colnames(mf)[1]
if (all(length(table(random1)) == table(mf[, 3]))) {
ranvar <- colnames(mf)[3]
fixvar <- colnames(mf)[2]
}
else {
ranvar <- colnames(mf)[2]
fixvar <- colnames(mf)[3]
}
MC <- FALSE
K <- length(table(mf[, ranvar]))
J <- length(table(mf[, fixvar]))
p <- J * K
grp <- 1:p
est <- get(est)
fixsplit <- split(mf[, depvar], mf[, fixvar])
indsplit <- split(mf[, ranvar], mf[, fixvar])
dattemp <- mapply(split, fixsplit, indsplit, SIMPLIFY = FALSE)
data <- do.call(c, dattemp)
x <- data
jp <- 1 - K
kv <- 0
kv2 <- 0
for (j in 1:J) {
jp <- jp + K
xmat <- matrix(NA, ncol = K, nrow = length(x[[jp]]))
for (k in 1:K) {
kv <- kv + 1
xmat[, k] <- x[[kv]]
}
xmat <- elimna(xmat)
for (k in 1:K) {
kv2 <- kv2 + 1
x[[kv2]] <- xmat[, k]
}
}
xx <- x
nvec <- NA
jp <- 1 - K
for (j in 1:J) {
jp <- jp + K
nvec[j] <- length(x[[jp]])
}
bloc <- matrix(NA, nrow = J, ncol = nboot)
mvec <- NA
ik <- 0
for (j in 1:J) {
x <- matrix(NA, nrow = nvec[j], ncol = K)
for (k in 1:K) {
ik <- ik + 1
x[, k] <- xx[[ik]]
if (!avg)
mvec[ik] <- est(xx[[ik]])
}
tempv <- apply(x, 2, est)
data <- matrix(sample(nvec[j], size = nvec[j] * nboot,
replace = TRUE), nrow = nboot)
bvec <- matrix(NA, ncol = K, nrow = nboot)
for (k in 1:K) {
temp <- x[, k]
bvec[, k] <- apply(data, 1, rmanogsub, temp, est)
}
if (avg) {
mvec[j] <- mean(tempv)
bloc[j, ] <- apply(bvec, 1, mean)
}
if (!avg) {
if (j == 1)
bloc <- bvec
if (j > 1)
bloc <- cbind(bloc, bvec)
}
}
if (avg) {
d <- (J^2 - J)/2
con <- matrix(0, J, d)
id <- 0
Jm <- J - 1
for (j in 1:Jm) {
jp <- j + 1
for (k in jp:J) {
id <- id + 1
con[j, id] <- 1
con[k, id] <- 0 - 1
}
}
}
if (!avg) {
MJK <- K * (J^2 - J)/2
JK <- J * K
MJ <- (J^2 - J)/2
cont <- matrix(0, nrow = J, ncol = MJ)
ic <- 0
for (j in 1:J) {
for (jj in 1:J) {
if (j < jj) {
ic <- ic + 1
cont[j, ic] <- 1
cont[jj, ic] <- 0 - 1
}
}
}
tempv <- matrix(0, nrow = K - 1, ncol = MJ)
con1 <- rbind(cont[1, ], tempv)
for (j in 2:J) {
con2 <- rbind(cont[j, ], tempv)
con1 <- rbind(con1, con2)
}
con <- con1
if (K > 1) {
for (k in 2:K) {
con1 <- push(con1)
con <- cbind(con, con1)
}
}
}
if (!avg)
bcon <- t(con) %*% t(bloc)
if (avg)
bcon <- t(con) %*% (bloc)
tvec <- t(con) %*% mvec
tvec <- tvec[, 1]
tempcen <- apply(bcon, 1, mean)
vecz <- rep(0, ncol(con))
bcon <- t(bcon)
temp = bcon
for (ib in 1:nrow(temp)) temp[ib, ] = temp[ib, ] - tempcen +
tvec
bcon <- rbind(bcon, vecz)
if (!MDIS) {
if (!MC)
dv = pdis(bcon, center = tvec)
}
if (MDIS) {
smat <- var(temp)
bcon <- rbind(bcon, vecz)
chkrank <- qr(smat)$rank
if (chkrank == ncol(smat))
dv <- mahalanobis(bcon, tvec, smat)
if (chkrank < ncol(smat)) {
smat <- ginv(smat)
dv <- mahalanobis(bcon, tvec, smat, inverted = T)
}
}
bplus <- nboot + 1
sig.level <- 1 - sum(dv[bplus] >= dv[1:nboot])/nboot
tvec1 <- data.frame(Estimate = tvec)
if (avg) {
tnames <- apply(combn(levels(mf[, fixvar]), 2), 2, paste0,
collapse = "-")
rownames(tvec1) <- tnames
}
else {
fixcomb <- apply(combn(levels(mf[, fixvar]), 2), 2,
paste0, collapse = "-")
rnames <- levels(mf[, ranvar])
tnames <- as.vector(t(outer(rnames, fixcomb, paste)))
rownames(tvec1) <- tnames
}
result <- list(test = tvec1, p.value = sig.level, contrasts = con,
call = cl)
class(result) <- c("spp")
result
}
I expected to get an output like this:
## Test statistics:
## Estimate
## time1-time2 0.3000
##
## Test whether the corrresponding population parameters are the same:
## p-value: 0.37

Sample function in R vector combining

I need your help, I need to combine two vectors(z and Num1 or Num2), so z will 10 in final vector and Num1(Num2) was 90 in final vector.
Code that I have now:
I <- seq(1:100)
NA1<-vector()
NA2<-vector()
z <- rep(NA, 10)
Num1 <- rnorm(100)
Num2 <- rnorm(100)
vect_1 <- sample(c(Num1, z))
vect_2 <- sample(c(Num2, z))
vect_1_NA <- is.na(vect_1)
vect_2_NA <- is.na(vect_2)
for(i in I){
if(vect_1_NA[i] == TRUE)
NA1 <- append(NA1, i)
}
for(i in I){
if(vect_2_NA[i] == TRUE)
NA2 <- append(NA2, i)
}

how to get each output of iteration

i have to do 1000 iteration for this SIMPLS function to get the value of the coefficient. my problem is how to get the value of the coefficient for each iteration? can I print the output for iteration?
n = 10
k = 20
a = 2
coef = matrix(0,nrow=20, ncol=10)
for (i in 1:1000) {
t[,i] = matrix(rnorm(n%*%a,0,1), ncol=a) # n x a
p[,i] = matrix(rnorm(k%*%a,0,1), ncol=a) # k x a
B[,i] = matrix(rnorm(k,0,0.001), nrow=k, ncol=1) # k x 1
e[,i] = matrix(rcauchy(n,location=0,scale=1), nrow=n, ncol=1)##standard cauchy
x[,i] = t%*%t(p) ## explanatary variable xi
y[,i] = (t%*%(t(p)%*%B)) + e ## response variable yi
simpls <- function(y, x, a) {
n <- nrow(x)
k <- ncol(x)
m <- NCOL(y)
y <- matrix(y)
Ps <- matrix(0, k, a)
Cs <- matrix(0, m, a)
Rs <- matrix(0, k, a)
Ts <- matrix(0, n, a)
mx <- apply(x, 2, mean)
sdx <- apply(x, 2, sd)
x <- sapply(1:k, function(i) (x[,i]-mx[i]))
my <- apply(y, 2, mean)
sdy <- apply(y, 2, sd)
y <- sapply(1:m, function(i) (y[,i]-my[i]))
S <- t(x)%*%y
Snew <- S
for (i in 1:a) {
rs <- svd(Snew)$u[,1,drop=FALSE]
rs <- rs/norm(rs,type="F")
ts <- x%*%rs
ts <- ts/norm(ts,type="F")
ps <- t(x)%*%ts
cs <- t(y)%*%ts
Rs[,i] <- rs
Ts[,i] <- ts
Ps[,i] <- ps
Cs[,i] <- cs
Snew <- Snew-Ps[,1:i]%*%solve(t(Ps[,1:i])%*%Ps[,1:i])%*%t(Ps[,1:i])%*%Snew
}
coef[,i] <- matrix(drop(Rs%*%(solve(t(Ps)%*%Rs)%*%t(Cs))))
yfit <- x%*%coef
orgyfit <- yfit+my
res <- y-yfit
SSE <- sum((y-yfit)^2)
scale <- sqrt(SSE/(n-a))
stdres <- sapply(1:m, function(i) (res[,i]-mean(res[,i]))/sqrt(var(res[,i])))
hatt <- diag(Ts%*%solve(t(Ts)%*%Ts)%*%t(Ts))
result <- list(coef=coef, fit=orgyfit, res=res, SSE=SSE,scale=scale, stdres=stdres, leverage=hatt,Ts=Ts,Rs=Rs,Ps=Ps,Cs=Cs)
}
}
print(coef)
You can just add your coef to a vector for every iteration. I've created an example here:
coef_vector <- NULL
for (i in 1:10) {
loop_coef <- i*2
coef_vector <- c(coef_vector, loop_coef)
}
Result:
> coef_vector
[1] 2 4 6 8 10 12 14 16 18 20
>
Of course, if your coef is more complex than a variable, you can add it to a list instead of a vector.

Sum value by Combine all variable using R

Can somebody help me with data manipulation using R? i have data (data.train) like this
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
i need all combination from variable and the final form is like this
I have tried this code:
### Library
library(dplyr)
## datex
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
names.group <- names(data.train)[1:length(data.train)-1]
data.group <- Map(combn, list(names.group), seq_along(names.group), simplify = F) %>% unlist(recursive = F)
find.index <- sapply(data.group, function(x, find.y){
any(find.y %in% x)
}, find.y = c("datex"))
index.group <- NULL
for(i in 2:length(find.index)){
if(find.index[i] == "TRUE"){
index.group[i] <- i
}
}
index.group[is.na(index.group)] <- 0
for(i in 1:length(data.group)){
if(index.group[i] == 0){
data.group[[i]] <- 0
} else {
data.group[[i]] <- data.group[[i]]
}
}
data.group2 <- data.group[sapply(data.group, function(x) any(x != 0))]
combination.result <- lapply(data.group2, FUN = function(x) {
do.call(what = group_by_, args = c(list(data.train), x)) %>% summarise(sumVar = sum(valx))
})
combination.result
but i don't produce what i want. Thanks
You can generate for combinations of length 1 then for combinations of length 2. Use paste to create your Variable column. Then rbindlist all your results to get the final output.
library(data.table)
setDT(data.train)
sumCombi <- function(x, mySep="_") {
data.train[ , sum(Val), by=c("Date", x)][,
list(Date,
Variable=do.call(paste, c(.SD[,x,with=FALSE], list(sep=mySep))),
SumVal=V1)]
}
rbindlist(c(
#combinations with 1 element in each combi
lapply(c("A", "B", "C"), sumCombi)
,
#combinations with 2 elements in each combi
lapply(combn(c("A","B","C"), 2, simplify=FALSE), sumCombi)
), use.names=FALSE)
or more generically/programmatically:
#assuming that your columns are in the middle of the columns while excl. first and last columns
myCols <- names(data.train)[-c(1, ncol(data.train))]
rbindlist(unlist(
lapply(seq_along(myCols), function(n)
combn(myCols, n, sumCombi, simplify=FALSE)
), recursive=FALSE),
use.names=FALSE)

prediction.strength in Package fpc

I am using the function prediction.strength in the r Package fpc with k-medoids algorithms.
here is my code
prediction.strength(data,2,6,M=10,clustermethod=pamkCBI,DIST,krange=2:6,diss=TRUE,usepam=TRUE)
somehow I get the error message
Error in switch(method, kmeans = kmeans(xdata[indvec[[l]][[i]], ], k, :
EXPR must be a length 1 vector
Does anybody have experience with this r command? There are simple examples like
iriss <- iris[sample(150,20),-5]
prediction.strength(iriss,2,3,M=3,method="pam")
but my problem is that I am using dissimilarity matrix instead of the data itself for the k-medoids algorithms. I don't know how should I correct my code in this case.
Please note that in the package help the following is stated for the prediction.strength:
xdats - data (something that can be coerced into a matrix). Note that this can currently
not be a dissimilarity matrix.
I'm afraid you'll have to hack the function to get it to handle a distance matrix. I'm using the following:
pred <- function (distance, Gmin = 2, Gmax = 10, M = 50,
classification = "centroid", cutoff = 0.8, nnk = 1, ...)
{
require(cluster)
require(class)
xdata <- as.matrix(distance)
n <- nrow(xdata)
nf <- c(floor(n/2), n - floor(n/2))
indvec <- clcenters <- clusterings <- jclusterings <- classifications <- list()
prederr <- list()
dist <- as.matrix(distance)
for (k in Gmin:Gmax) {
prederr[[k]] <- numeric(0)
for (l in 1:M) {
nperm <- sample(n, n)
indvec[[l]] <- list()
indvec[[l]][[1]] <- nperm[1:nf[1]]
indvec[[l]][[2]] <- nperm[(nf[1] + 1):n]
for (i in 1:2) {
clusterings[[i]] <- as.vector(pam(as.dist(dist[indvec[[l]][[i]],indvec[[l]][[i]]]), k, diss=TRUE))
jclusterings[[i]] <- rep(-1, n)
jclusterings[[i]][indvec[[l]][[i]]] <- clusterings[[i]]$clustering
centroids <- clusterings[[i]]$medoids
j <- 3 - i
classifications[[j]] <- classifdist(as.dist(dist), jclusterings[[i]],
method = classification, centroids = centroids,
nnk = nnk)[indvec[[l]][[j]]]
}
ps <- matrix(0, nrow = 2, ncol = k)
for (i in 1:2) {
for (kk in 1:k) {
nik <- sum(clusterings[[i]]$clustering == kk)
if (nik > 1) {
for (j1 in (1:(nf[i] - 1))[clusterings[[i]]$clustering[1:(nf[i] -
1)] == kk]) {
for (j2 in (j1 + 1):nf[i]) if (clusterings[[i]]$clustering[j2] ==
kk)
ps[i, kk] <- ps[i, kk] + (classifications[[i]][j1] ==
classifications[[i]][j2])
}
ps[i, kk] <- 2 * ps[i, kk]/(nik * (nik -
1))
}
}
}
prederr[[k]][l] <- mean(c(min(ps[1, ]), min(ps[2,
])))
}
}
mean.pred <- numeric(0)
if (Gmin > 1)
mean.pred <- c(1)
if (Gmin > 2)
mean.pred <- c(mean.pred, rep(NA, Gmin - 2))
for (k in Gmin:Gmax) mean.pred <- c(mean.pred, mean(prederr[[k]]))
optimalk <- max(which(mean.pred > cutoff))
out <- list(predcorr = prederr, mean.pred = mean.pred, optimalk = optimalk,
cutoff = cutoff, method = clusterings[[1]]$clustermethod,
Gmax = Gmax, M = M)
class(out) <- "predstr"
out
}

Resources