How to make my program faster for pattern matching?

I don't like the way I have coded this, and I think there should be far faster algorithms than this, or R might have built-in facilities to work this out.
four.in.a.row = function(player, v, debug=TRUE) {
  if (debug) {
    cat(sep="", "four.in.a.row(player=", player, ", v=", v, ")\n")
  }
  for (i in 1:(length(v)-4))
    if (v[i]==player)
      if (v[i]==v[i+1])
        if (v[i+1]==v[i+2])
          if (v[i+2]==v[i+3])
            return(TRUE)
  # ...
  return(FALSE)
}
# four.in.a.row("X", c("O","X","X","X","X","O"))
# is TRUE, while
# four.in.a.row("O", c("O","X","X","X","X","O"))
# is FALSE.
Can you tell me what the complexity of the current algorithm is, and how it can be improved?
Also, when I use the following, I receive an error:
four.in.a.row = function(player, v, debug=TRUE) {
  if (debug) {
    cat(sep="", "four.in.a.row(player=", player, ", v=", v, ")\n")
  }
  for (i in 1:(length(v)-4))
  {
    count=1
    if (v[i]==player)
      for (j in i+1:i+3)
        if (v[i]==v[j])
          count=count+1
    if (count==4)
      return(TRUE)
  }
  # ...
  return(FALSE) # correct this return() statement
}
Error in if (v[i] == v[j]) count = count + 1 :
missing value where TRUE/FALSE needed
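For reference, the error in the second version comes from operator precedence: the : operator binds tighter than +, so i+1:i+3 is parsed as i + (1:i) + 3 rather than (i+1):(i+3). For larger i this indexes past the end of v, v[j] becomes NA, and if (NA) throws exactly this error:
i <- 2
i + 1:i + 3      # 6 7 -- parsed as i + (1:i) + 3
(i + 1):(i + 3)  # 3 4 5 -- what was intended
v <- c("O", "X", "X", "X", "X", "O")
v[7]             # NA -- indexing past the end returns NA, so if (v[i] == v[j]) fails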

You can use rle:
four.in.a.row <- function(player, v) {
  with(rle(v), any(lengths >= 4 & values == player))
}
Here is a more cryptic approach that should be even faster:
four.in.a.row <- function(player, v) {
  x <- player == v
  in.a.row <- seq_along(x) - cummax(seq_along(x) * !x)
  any(in.a.row >= 4)
}
Finally, you could use regular expressions:
four.in.a.row = function(player, v) {
  x <- paste(v, collapse = "")
  pattern <- paste(rep(player, 4), collapse = "")
  grepl(pattern, x)
}
You would have to test on your data to see which approach is faster. If fast enough, I would go with the rle approach for its conciseness.
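As a quick sanity check, each of the three versions reproduces the results from the question:
four.in.a.row("X", c("O","X","X","X","X","O"))  # TRUE
four.in.a.row("O", c("O","X","X","X","X","O"))  # FALSE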


How to convert equation from R code into readable text?

I have defined multiple functions, each containing one equation, like so:
catalanFormula <- function(n){
  return( (factorial(2 * n)) / ((factorial(n + 1)) * factorial(n)) )
}
triangularFormula <- function(n){
  return( (n * (n+1)) / 2 )
}
I am then plotting each function using the plot() function from base R. What I want to do is be able to include the equation in my plot as a label or text, but in a way that is easily understandable. I know I can use LaTeX to write each equation manually, but I was wondering if there is a package or method anyone knows of that can deparse a mathematical equation in R and turn it into a standard readable mathematical equation. The only math occurring is factorials, multiplication, division, exponents, addition, and subtraction.
For example, I want to convert
(factorial(2 * n)) / ((factorial(n + 1)) * factorial(n))
into a properly typeset formula, i.e. (2n)! / ((n+1)! · n!) rendered as a fraction.
Then, be able to use the output within the text() function of base R plotting.
With the Ryacas package you can transform a math expression into its corresponding LaTeX code, but the factorial must be written with !:
library(Ryacas)
eq <- yac_symbol("(2 * n)! / ((n + 1)! * n!)")
tex(eq)
# "\\frac{\\left( 2 n\\right) !}{\\left( n + 1\\right) ! n!}"
Another possibility is to call Python from R and use the pytexit library:
from pytexit import py2tex
py2tex("math.factorial(2 * n) / ((factorial(n + 1)) * factorial(n))")
# $$\frac{\operatorname{factorial}\left(2n\right)}{\operatorname{factorial}\left(n+1\right) \operatorname{factorial}\left(n\right)}$$
You can also take a look at PyLaTeX and lax.
Here's a base R function that will walk the abstract syntax tree to replace the factorial and / functions with the corresponding ?plotmath markup so you can add them to R plots.
returnToPlotmath <- function(fun) {
  swap <- function(x) {
    if (class(x) %in% c("call", "(")) {
      x <- as.list(x)
      if (as.character(x[[1]]) == "/") {
        x[[1]] = quote(frac)
        x[[2]] = swap(x[[2]])
        x[[3]] = swap(x[[3]])
      } else if (as.character(x[[1]]) == "factorial") {
        x[[1]] = quote(`*`)
        if (is.call(x[[2]])) {
          x[[2]] = as.call(list(quote(`(`), x[[2]]))
        } else {
          x[[2]] = swap(x[[2]])
        }
        x[[3]] = "!"
      } else if (as.character(x[[1]]) == "*") {
        if (is.call(x[[2]]) | is.call(x[[3]])) {
          x[[1]] = quote(`%.%`)
          x[[2]] = swap(x[[2]])
          x[[3]] = swap(x[[3]])
        }
      } else {
        x[[2]] = swap(x[[2]])
        if (length(x) == 3) x[[3]] = swap(x[[3]])
      }
      return(as.call(x))
    } else {
      return(x)
    }
  }
  body_exprs <- body(fun)[[2]]
  swap(body_exprs[[length(body_exprs)]])
}
This does assume that the return is the last statement in the function body {} block. You can get the expression with
returnToPlotmath(catalanFormula)
# frac(((2 * n) * "!"), (((n + 1) * "!") %.% (n * "!")))
returnToPlotmath(triangularFormula)
# frac((n %.% (n + 1)), 2)
And you can add them to plot titles and such
plot(main=returnToPlotmath(catalanFormula), 1, 1)
plot(main=returnToPlotmath(triangularFormula), 1, 1)
This solution is highly specific to the functions you need to transform. But it could be extended if needed.
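Since the question mentions text() specifically, the same call objects can be passed there as labels too (a small usage note, not from the original answer):
plot(1:10, type = "n")
text(5, 5, labels = returnToPlotmath(catalanFormula))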
The expr2latex() function from simsalapar parses expressions to LaTeX, and can be extended to include the factorial symbol with a minor addition to the code:
expr2latex2 <- function(expr) {
  L <- length(expr)
  c.BinTable <- simsalapar:::c.BinTable
  c.RelTable <- simsalapar:::c.RelTable  # also needed for the relational-operator branch below
  if(!L) "" else {
    Symb <- is.symbol(expr)
    F <- if(Symb) expr else expr[[1]]
    cF <- simsalapar:::mDeparse(F)
    FF <- simsalapar:::renderAtom(F, Len = L, d.a = cF)
    if(Symb && L != 1)
      stop("is.symbol(.), but length(.) = ", L, " != 1")
    else if(!Symb && typeof(expr) != "language" && L != 1)
      stop("is not language nor symbol), but length(.) = ", L, " != 1")
    switch(L,
      ## length 1:
      FF,
      { ## length 2: e.g. "- 1", "+ x", "!TRUE", "~ ff",
        rhs <- expr2latex2(expr[[2]])
        if (cF == "bold") paste0("\\mathbf{", rhs, "}")
        else if(cF == "italic") paste0("\\mathit{", rhs, "}")
        else if(cF == "factorial") paste0("(", rhs, ")!")  # extra case added in for factorial
        else if(!simsalapar:::isOp(cF)) # not a binary operator ==> "function call":
          paste0(FF, "(", rhs, ")")     ## e.g. "O(n)"
        else if(cF == "{") paste0("{", rhs, "}")
        else if(cF == "(") paste0("(", rhs, ")")
        else paste(FF, rhs)
      },
      { ## length 3:
        lhs <- expr2latex2(expr[[2]])
        rhs <- expr2latex2(expr[[3]])
        if(cF == "[") ## subscript
          paste0(lhs, "_{", rhs, "}")
        else if(cF == "~") ## space
          paste(lhs, "\\", rhs)
        ## not treated, as plotmath() does neither:
        ## else if(cF == "[[")
        ##   paste0(lhs, "[[", rhs, "]]")
        else if(cF %in% c.BinTable)
          paste(lhs, simsalapar:::getTab(cF, simsalapar:::BinTable), rhs)
        else if(cF %in% c.RelTable)
          paste(lhs, simsalapar:::getTab(cF, simsalapar:::RelTable), rhs)
        else if(simsalapar:::isOp(cF)) ## e.g. U + x
          paste(lhs, FF, rhs)
        else ## log(x, 2)
          paste0(FF, "(", lhs, ",", rhs, ")")
      },
      ## length >= 4 : F(a, b, c, ...)
      stop("length(expr) = ", L, " (>= 4); not yet implemented") # TODO MM
    ) ## end{switch}
  }
}
The original function:
expr2latex( quote( (factorial(2 * n)) / ((factorial(n + 1)) * factorial(n)) ) )
#[1] "(factorial(2 n)) / ((factorial(n + 1)) factorial(n))"
The revised treatment of factorials:
expr2latex2( quote( (factorial(2 * n)) / ((factorial(n + 1)) * factorial(n)) ) )
#[1] "((2 n)!) / (((n + 1)!) (n)!)"

How to best combine unique and match in R?

I found myself often writing code such as
#' @param x input vector
#' @param ... passed to [slow_fun()]
fast_fun <- function(x, ...) {
  u <- unique(x)
  i <- match(x, u)
  v <- slow_fun(u, ...)
  v[i]
}
The goal is to accelerate a slow vectorized "pure" function where each entry of the input could in principle be computed individually and where the input is expected to contain many duplicates.
Now I wonder whether this is the best way to achieve such a speedup, or whether there is some function (preferably in base R or the tidyverse) that does something like unique and match at the same time.
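For concreteness, a toy run of the pattern with a stand-in slow_fun (not part of the benchmarks below):
slow_fun <- function(x) { Sys.sleep(0.01 * length(x)); toupper(x) }  # stand-in for the real slow function
x <- c("a", "b", "a", "c", "b", "a")
u <- unique(x)     # "a" "b" "c" -- slow_fun now only sees 3 elements instead of 6
i <- match(x, u)   # 1 2 1 3 2 1
slow_fun(u)[i]     # "A" "B" "A" "C" "B" "A"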
Benchmarks so far
Thanks for the provided answers. I've written a small benchmark suite to compare the approaches:
method <- list(
  brute = slow_fun,
  unique_match = function(x, ...) {
    u <- unique(x)
    i <- match(x, u)
    v <- slow_fun(u, ...)
    v[i]
  },
  unique_factor = function(x, ...) {
    if (is.character(x)) {
      x <- factor(x)
      i <- as.integer(x)
      u <- levels(x)
    } else {
      u <- unique(x)
      i <- as.integer(factor(x, levels = u))
    }
    v <- slow_fun(u, ...)
    v[i]
  },
  unique_match_df = function(x, ...) {
    u <- unique(x)
    i <- if (is.numeric(x)) {
      match(data.frame(t(round(x, 10))), data.frame(t(round(u, 10))))
    } else {
      match(data.frame(t(x)), data.frame(t(u)))
    }
    v <- slow_fun(u, ...)
    v[i]
  },
  rcpp_uniquify = function(x, ...) {
    iu <- uniquify(x)
    v <- slow_fun(iu[["u"]], ...)
    v[iu[["i"]]]
  }
)
exprs <- lapply(method, function(fun) substitute(fun(x), list(fun = fun)))
settings$bench <- lapply(seq_len(nrow(settings)), function(i) {
  cat("\rBenchmark ", i, " / ", nrow(settings), sep = "")
  x <- switch(
    settings$type[i],
    integer = sample.int(
      n = settings$n_distinct[i],
      size = settings$n_total[i],
      replace = TRUE
    ),
    double = sample(
      x = runif(n = settings$n_distinct[i]),
      size = settings$n_total[i],
      replace = TRUE
    ),
    character = sample(
      x = stringi::stri_rand_strings(
        n = settings$n_distinct[i],
        length = 20L
      ),
      size = settings$n_total[i],
      replace = TRUE
    )
  )
  microbenchmark::microbenchmark(
    list = exprs
  )
})
library(tidyverse)
settings %>%
  mutate(
    bench = map(bench, summary)
  ) %>%
  unnest(bench) %>%
  group_by(n_distinct, n_total, type) %>%
  mutate(score = median / min(median)) %>%
  group_by(expr) %>%
  summarise(mean_score = mean(score)) %>%
  arrange(mean_score)
Currently, the Rcpp-based approach is the best in all tested settings on my machine, but it barely manages to exceed the unique-then-match method.
I suspect the advantage will grow as x gets longer, because unique-then-match needs two passes over the data while uniquify() needs only one.
|expr | mean_score|
|:---------------|----------:|
|rcpp_uniquify | 1.018550|
|unique_match | 1.027154|
|unique_factor | 5.024102|
|unique_match_df | 36.613970|
|brute | 45.106015|
Maybe you can try factor + as.integer, like below:
as.integer(factor(x))
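One caveat (my note, not part of the answer): factor() sorts its levels, so levels(factor(x)) is generally not in the same order as unique(x), but the index/value pairing still reconstructs x correctly:
x <- c("b", "a", "b", "c")
u <- levels(factor(x))       # "a" "b" "c" -- sorted, unlike unique(x)
i <- as.integer(factor(x))   # 2 1 2 3
u[i]                         # "b" "a" "b" "c" -- matches x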
I found a cool and fast answer recently:
match(data.frame(t(x)), data.frame(t(y)))
As always, beware when working with floats. I recommend something like
match(data.frame(t(round(x, 10))), data.frame(t(round(y, 10))))
in such cases.
I've finally managed to beat unique() and match() by using Rcpp to hand-code the algorithm in C++, with std::unordered_map as the core bookkeeping data structure.
Here is the source code, which can be used in R by writing it into a file and running Rcpp::sourceCpp on it.
#include <Rcpp.h>
using namespace Rcpp;

template <int T>
List uniquify_impl(Vector<T> x) {
  IntegerVector idxes(x.length());
  typedef typename Rcpp::traits::storage_type<T>::type storage_t;
  std::unordered_map<storage_t, int> unique_map;
  int n_unique = 0;
  // 1. Single pass through x: give each value the 1-based index of its first appearance
  for (int i = 0; i < x.length(); i++) {
    storage_t curr = x[i];
    int idx = unique_map[curr];
    if (idx == 0) {
      unique_map[curr] = ++n_unique;
      idx = n_unique;
    }
    idxes[i] = idx;
  }
  // 2. Rebuild the unique values in order of first appearance
  Vector<T> uniques(unique_map.size());
  for (auto &pair : unique_map) {
    uniques[pair.second - 1] = pair.first;
  }
  return List::create(
    _["u"] = uniques,
    _["i"] = idxes
  );
}

// [[Rcpp::export]]
List uniquify(RObject x) {
  switch (TYPEOF(x)) {
    case INTSXP: {
      return uniquify_impl(as<IntegerVector>(x));
    }
    case REALSXP: {
      return uniquify_impl(as<NumericVector>(x));
    }
    case STRSXP: {
      return uniquify_impl(as<CharacterVector>(x));
    }
    default: {
      warning(
        "Invalid SEXPTYPE %d (%s).\n",
        TYPEOF(x), type2name(x)
      );
      return R_NilValue;
    }
  }
}
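A minimal usage sketch, assuming the C++ code above is saved as uniquify.cpp:
Rcpp::sourceCpp("uniquify.cpp")
x <- c(3L, 1L, 3L, 2L, 1L)
uniquify(x)
# $u contains 3 1 2 (unique values in order of first appearance)
# $i contains 1 2 1 3 2 (index of each element of x into u)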

using else if in a function in R

I'm trying to use else if within a function in R:
new<-function(a,b,c,d){
  if (a==1){
    var1<-100+100+100
    var2<-500+500+500
    var3<-500-100
  } else if (a==2){
    var1<-100+100
    var2<-500+500
    var3<-500-10
  } else if (a==3){
    var1<-100
    var2<-500
    var3<-500
  }
  b<-var1-var2
  c<-var2+var3
  d<-var3-var1
  if (b<100) {
    print("the value is good")
  } else if (b>100) {
    print("check the value")
  } else {
    print("repeat")
  }
}
output<-new(3,b,c,d)
I feel something fundamental is wrong with this that I'm missing. I'm trying to use else if to populate the values to be used as input when calling the same function.
Help is greatly appreciated.
You can simplify to:
new <- function(a) {
  if (a == 1) {
    b <- -1200
  } else if (a == 2) {
    b <- -800
  } else if (a == 3) {
    b <- -400
  }
  if (b < 100) {
    print("the value is good")
  } else if (b > 100) {
    print("check the value")
  } else {
    print("repeat")
  }
  return(b) # for fun, return the value of b
}
output <- new(2)
But this function will always have the side effect of printing "the value is good" because b always evaluates to a negative number. What's the purpose?
If you would want to return values a to d, you could do the intermediate calculations as per your example and do: return(list(a = a, b = b, c = c, d = d)) instead of return(b).
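For example, a sketch of that variant (new2 is just a hypothetical name to avoid clobbering the function above):
new2 <- function(a) {
  if (a == 1) { var1 <- 300; var2 <- 1500; var3 <- 400 }
  else if (a == 2) { var1 <- 200; var2 <- 1000; var3 <- 490 }
  else if (a == 3) { var1 <- 100; var2 <- 500;  var3 <- 500 }
  b <- var1 - var2
  c <- var2 + var3
  d <- var3 - var1
  list(a = a, b = b, c = c, d = d)
}
out <- new2(3)
out$b  # -400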

implement matrix determinant in R

I was asked to implement a function that calculates the determinant of an n x n matrix using Laplace expansion. This involves recursion. I developed this:
minor<-function(A,i,j) {
  return(A[c(1:(i-1),(i+1):dim(A)[1]), c(1:(j-1),(j+1):dim(A)[2])])
}
determinantRec<-function(X,k) {
  if (dim(X)[1] == 1 && dim(X)[2] == 1) return(X[1][1])
  else {
    s = 0
    for (i in 1:dim(X)[2]) {
      s = s + X[k][i]*(-1)^(k+i)*determinantRec(minor(X,k,i),k)
    }
    return(s)
  }
}
where k in determinantRec(X,k) indicates which row I want to expand along.
My problem is when I run determinantRec(matrix(c(1,2,3,4),nrow = 2,ncol = 2),1) this error appears:
C stack usage 7970628 is too close to the limit
What is wrong with my code?
@julia, there is one simple typo in your code. Just remove the '*' at the end of the definition of 's'. And don't indent the recursion.
determinantRek<-function(X,k) {
  if (dim(X)[1] == 1 && dim(X)[2] == 1)
    return(X[1,1])
  if (dim(X)[1] == 2 && dim(X)[2] == 2)
    return(X[1,1]*X[2,2]-X[1,2]*X[2,1])
  else
    s = 0
  for (i in 1:dim(X)[2]) {
    s = s + X[k,i]*(-1)^(k+i)
    determinantRek(X[-k,-i],k)
  }
  return(s)
}
I did it this way and it works just fine, although it is super slow compared to the det function in base R:
laplace_expansion <- function(mat){
  det1 <- function(mat){
    mat[1]*mat[4]-mat[2]*mat[3]
  }
  determinant <- 0
  for(j in 1:ncol(mat)){
    mat1 <- mat[-1,-j]
    if(nrow(mat1) == 2){
      determinant <- determinant+mat[1,j]*(-1)^(1+j)*det1(mat1)
    }else{
      val <- mat[1,j]*(-1)^(1+j)
      if(val != 0){
        determinant <- determinant+val*laplace_expansion(mat1)
      }
    }
  }
  return(determinant)
}
This is my approach, I think it's cleaner.
deter <- function(X) {
  stopifnot(is.matrix(X))
  stopifnot(identical(ncol(X), nrow(X)))
  if (all(dim(X) == c(1, 1))) return(as.numeric(X))
  i <- 1:nrow(X)
  out <- purrr::map_dbl(i, function(i){
    X[i, 1] * (-1)^(i + 1) * deter(X[-i, -1, drop = FALSE])
  })
  return(sum(out))
}
Thank you @ArtemSokolov and @MrFlick for pointing out the cause of the problem. I also discovered that this code did not calculate the determinant of a 2x2 matrix properly. After all the fixes it looks like this:
determinantRek<-function(X,k) {
  if (dim(X)[1] == 1 && dim(X)[2] == 1)
    return(X[1,1])
  if (dim(X)[1] == 2 && dim(X)[2] == 2)
    return(X[1,1]*X[2,2]-X[1,2]*X[2,1])
  else
    s = 0
  for (i in 1:dim(X)[2]) {
    s = s + X[k,i]*(-1)^(k+i)*
      determinantRek(X[-k,-i],k)
  }
  return(s)
}
Debugging with browser() was also helpful :)
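A quick check of the corrected version against the call from the question and base R's det():
m <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2)
determinantRek(m, 1)  # -2
det(m)                # -2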

Right (or left) side trimmed mean

Using:
mean(x, trim=0.05)
removes 5% of the observations from each end of the distribution, which is fine for symmetrical two-tailed data. But if I have one-tailed or highly asymmetric data, I would like to be able to remove observations from just one side of the distribution. Is there a function for this, or do I have to write one myself? If so, how?
Just create a modified mean.default. First look at mean.default:
mean.default
Then modify it to accept a new argument:
mean.default <- function(x, trim = 0, na.rm = FALSE, ..., side = "both")
{
  if (!is.numeric(x) && !is.complex(x) && !is.logical(x)) {
    warning("argument is not numeric or logical: returning NA")
    return(NA_real_)
  }
  if (na.rm)
    x <- x[!is.na(x)]
  if (!is.numeric(trim) || length(trim) != 1L)
    stop("'trim' must be numeric of length one")
  n <- length(x)
  if (trim > 0 && n) {
    if (is.complex(x))
      stop("trimmed means are not defined for complex data")
    if (any(is.na(x)))
      return(NA_real_)
    if (trim >= 0.5)
      return(stats::median(x, na.rm = FALSE))
    lo <- if( side=="both" || side=="right" ){ floor(n * trim) + 1 }else{ 1 }
    hi <- if( side=="both" || side=="left" ){ n + 1 - (floor(n * trim) + 1) }else{ n }
    x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi]
    cat(c(length(x), lo, hi))
  }
  .Internal(mean(x))
}
I don't know of a function. Something like the following would trim off the upper tail of the distribution before taking the mean.
upper.trim.mean <- function(x, trim) {
  x <- sort(x)
  mean(x[1:floor(length(x)*(1-trim))])
}
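For example, with a single large outlier in the right tail:
x <- c(1:9, 100)
mean(x)                  # 14.5
upper.trim.mean(x, 0.1)  # 5 -- the top 10% (here, just the value 100) is dropped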
This should account for trimming on either side, or on both sides.
trim.side.mean <- function(x, trim, type = "both"){
  if (type == "both") {
    mean(x, trim)
  } else if (type == "right") {
    x <- sort(x)
    mean(x[1:floor(length(x)*(1-trim))])
  } else if (type == "left"){
    x <- sort(x)
    mean(x[max(1, floor(length(x)*trim)):length(x)])
  }
}
one.sided.trim.mean <- function(x, trim, upper = TRUE) {
  if (upper) trim = 1 - trim
  data <- mean(x[x < quantile(x, trim)])
}
I found that none of the answers posted match when checked manually, so I created one of my own. It's long but simple enough to understand:
get_trim <- function(x, trim, type)
{
  x <- sort(x)
  ans <- 0
  if (type == "both")
  {
    for (i in (trim+1):(length(x)-trim))
    {
      ans = ans + x[i]
    }
    print(ans/(length(x)-(2*trim)))
  }
  else if (type == "left")
  {
    for (i in (trim+1):(length(x)))
    {
      ans = ans + x[i]
    }
    print(ans/(length(x)-trim))
  }
  else if (type == "right")
  {
    for (i in 1:(length(x)-trim))
    {
      ans = ans + x[i]
    }
    print(ans/(length(x)-trim))
  }
}
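Note that in this version trim is a count of observations to drop from each end, not a proportion. For example:
x <- c(1:9, 100)
get_trim(x, trim = 1, type = "both")   # prints 5.5 -- one observation dropped from each end
mean(x, trim = 0.1)                    # 5.5 -- the proportional equivalent in base R
get_trim(x, trim = 1, type = "right")  # prints 5 -- only the largest value (100) dropped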
