I have a specific need to "transform" a number in R. As an example,
A "floor" operation behave as:
138 -> 100
1233 -> 1000
A "ceiling" operation behave as:
138 -> 200
1233 -> 2000
Is there an easy way to accomplish this in R? thanks
You could extract the exponent separatly:
floorEx <- function(x) {
ex <- 10^trunc(log10(x))
return(trunc(x/ex)*ex)
}
ceilingEx <- function(x) {
ex <- 10^trunc(log10(x))
return(ceiling(x/ex)*ex)
}
Examples:
floorEx(123)
# [1] 100
ceilingEx(123)
# [1] 200
ceilingEx(c(123, 1234, 12345))
# [1] 200 2000 20000
EDIT:
using trunc instead of floor and integrate old ex function (ex <- function(x)floor(log10(x))) to speedup the calculation a little bit
add benchmark to compare against #eddi's floorR
benchmark:
## provided by #eddi
floorR <- function(x) {r <- signif(x, 1); r - (r > x) * 10^trunc(log10(x))}
library("microbenchmark")
x <- 123; microbenchmark(floorEx(x), floorR(x), signif(x), times=1e4)
# Unit: nanoseconds
# expr min lq median uq max neval
# floorEx(x) 2182 2414 2521 2683.0 704190 10000
# floorR(x) 2894 3150 3278 3505.5 22260 10000
# signif(x) 372 472 507 556.0 10963 10000
x <- 1:1000; microbenchmark(floorEx(x), floorR(x), signif(x), times=1e2)
# Unit: microseconds
# expr min lq median uq max neval
# floorEx(x) 100.560 101.2460 101.6945 115.6385 818.895 100
# floorR(x) 354.848 355.4705 356.0420 375.9210 1074.582 100
# signif(x) 114.608 115.2120 115.4695 119.1805 186.738 100
It does not directly answer your question, but you can also take a look at signif :
R> x <- 138
R> signif(x,1)
[1] 100
R> x <- 1712
R> signif(x,1)
[1] 2000
Another option:
floor2 <- function(x) {
mag <- 10^(nchar(round(x))-1)
(x %/% mag) * mag
}
ceil2 <- function(x) {
mag <- 10^(nchar(round(x))-1)
((x + mag) %/% mag) * mag
}
I played with regexing and the ceiling floor functions to get this one:
ceil <- function(x) {
ceiling(as.numeric(sub("([[:digit:]])", "\\1.", x))) * (10^(nchar(x)-1))
}
flr <- function(x) {
floor(as.numeric(sub("([[:digit:]])", "\\1.", x))) * (10^(nchar(x)-1))
}
ceil(1233)
ceil(138)
flr(1233)
flr(138)
## > ceil(1233)
## [1] 2000
## > ceil(138)
## [1] 200
## > flr(1233)
## [1] 1000
## > flr(138)
## [1] 100
Here's a different take using #juba's suggestion. To get from a rounded answer to the floor or ceil we simply need to correct it a little bit:
floorR = function(x) {
rounded = signif(x, 1);
rounded - (rounded > x) * 10^trunc(log10(x))
}
ceilR = function(x) {
rounded = signif(x, 1);
rounded + (rounded < x) * 10^trunc(log10(x))
}
edit2: after vectorizing, the functions are a little bit slower (see edit history for non-vectorized versions). They are still fast for small vectors, but don't scale as well as #sgibb's solution (partly because signif doesn't scale that well):
x = 156; microbenchmark(floorEx(x), flr(x), floor2(x), signif(x), floorR(x), times = 10000)
#Unit: nanoseconds
# expr min lq median uq max neval
# floorEx(x) 4008 8348 10018 12021 158934 10000
# flr(x) 84810 121204 135896 141571 6708248 10000
# floor2(x) 32055 46078 51086 54091 360606 10000
# signif(x) 0 1002 1336 1671 86813 10000
# floorR(x) 3006 6679 8348 10017 207683 10000
x = c(1:1000); microbenchmark(floorEx(x), signif(x), floorR(x), times = 100)
#Unit: microseconds
# expr min lq median uq max neval
# floorEx(x) 125.879 157.4315 158.934 161.4385 243.742 100
# signif(x) 147.581 216.6975 217.365 220.5375 395.998 100
# floorR(x) 252.758 360.6055 362.275 366.4485 619.373 100
Related
This is my vector
x <- c("1", "1", "PNP004", "10", "10", NA, NA)
I need to find a way to return the index/value of the final element that is not NA.
Does anyone know a good way of doing this?
Any help is appreciated!
Ind <- max(which(!is.na(yourvec)))
yourvec[Ind]
Using dplyr:
dplyr::last(which(!is.na(yourvec)))
You can use tail from the result of which looking for !is.na of your vector.
tail(which(!is.na(x)), 1)
#[1] 5
Or a for loop.
idxLNNA <- function(x) {
if(length(x) > 0) {
for(i in length(x):1) if(!is.na(x[i])) break
if(i == 1 & is.na(x[i])) {0} else {i}
} else {0}
}
idxLNNA(x)
#[1] 5
Or using cumsum with which.max, what will return 1 in case there are only NA's.
which.max(cumsum(!is.na(x)))
#[1] 5
Or subtracting the rev hit from the length of x.
length(x) - which.min(rev(is.na(x))) + 1 #Will fail in case on only NA
#length(x) - match(FALSE, is.na(rev(x))) + 1 #Alternative
#[1] 5
Benchmark:
library(microbenchmark)
fun <- alist(Dason = max(which(!is.na(x)))
, juljo = dplyr::last(which(!is.na(x)))
, GKiTail = tail(which(!is.na(x)), 1)
, GKiCum = which.max(cumsum(!is.na(x)))
, GKiRev = length(x) - which.min(rev(is.na(x))) + 1
, GKiMatch = length(x) - match(FALSE, is.na(rev(x))) + 1
, GKiFor = idxLNNA(x)
)
x <- numeric(1e6)
microbenchmark(list = fun, control=list(order="block"))
#Unit: microseconds
# expr min lq mean median uq max neval cld
# Dason 4855.744 5740.1355 7941.21809 7082.5535 7671.371 107165.201 100 bc
# juljo 4145.322 4616.7815 5756.38147 6134.9200 6625.008 7378.724 100 b
# GKiTail 4082.716 4434.3880 5576.70509 6051.1465 6489.966 7433.579 100 b
# GKiCum 6552.213 7445.1525 8632.12253 8988.8700 9420.481 16791.845 100 c
# GKiRev 4005.929 4138.4735 5845.70457 4212.7470 5851.034 101665.685 100 b
# GKiMatch 5180.600 5483.8545 7507.82723 5998.2760 7373.458 108327.967 100 bc
# GKiFor 1.541 1.5775 2.16462 1.6145 1.724 20.436 100 a
x <- rep(NA, 1e6) #Dason, GKiCum and GKiRev Fail
microbenchmark(list = fun[-c(1,4,5)], control=list(order="block"))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# juljo 3.011272 3.076340 3.525396 3.111676 3.494768 6.367839 100 a
# GKiTail 2.942336 3.014327 3.529691 3.063891 3.809653 6.136984 100 a
# GKiMatch 4.928626 4.975369 7.490588 5.039941 6.823780 98.194653 100 b
# GKiFor 155.078444 159.314918 163.706542 160.168266 163.464146 258.136977 100 c
x <- numeric(0) #Dason Fails
microbenchmark(list = fun[-1], control=list(order="block"))
Unit: nanoseconds
# expr min lq mean median uq max neval cld
# juljo 26794 27324.0 28694.75 27640.0 27933.0 120143 100 d
# GKiTail 6746 7027.5 7396.45 7206.5 7432.5 21898 100 c
# GKiCum 869 880.0 947.72 890.0 948.0 3403 100 a
# GKiRev 2466 2527.0 2657.99 2565.5 2652.0 8071 100 b
# GKiMatch 2739 2807.5 2919.78 2862.5 2935.5 5651 100 b
# GKiFor 492 512.5 671.74 537.5 604.5 9088 100 a
This question already has answers here:
Find position of first value greater than X in a vector
(7 answers)
Closed 2 years ago.
I am looking for a function similar to match, but it should have an inequality as test and stop at the first occurrence.
a <- c(1,3,5,7,9,12)
b <- 6
# the output should be 4
# i tried this one: match(min(a [b <= a ]),a ), but with much data it is slow
1) if you have large vectors, where the stop should happen in the beginning, like:
a <- c(1,3,5,7,9,12, 100:1e6)
b <- 6
other approaches could be faster.
2) as we are benchmarking such a fast functions also compilation could make a difference, so I defined the approaches separately:
v1 <- function() match(min(a[b <= a ]), a)
v2 <- function() min(which(a >= b))
v3 <- function() which(b <= a)[1]
v4 <- function() match(TRUE, b <= a)
v5 <- function() Position(function(x) b <= x, a)
microbenchmark::microbenchmark(
v1(), v2(), v3(), v4(), v5()
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# v1() 10109.4 13822.10 23205.556 23481.95 26661.30 152809.4 100 c
# v2() 5796.8 6475.75 10404.439 6839.30 16991.90 34333.7 100 b
# v3() 5144.8 5648.75 9381.664 5963.05 6801.25 142202.0 100 b
# v4() 3963.0 4497.35 6938.296 4799.55 5419.80 22011.4 100 b
# v5() 4.3 14.20 35.509 16.85 18.45 1858.8 100 a
For original data, the last approach isn't the fastest:
a <- c(1,3,5,7,9,12)
b <- 6
microbenchmark::microbenchmark(
v1(), v2(), v3(), v4(), v5()
)
# Unit: nanoseconds
# expr min lq mean median uq max neval cld
# v1() 800 900 1151 1000 1100 12600 100 b
# v2() 900 1000 1105 1000 1100 5600 100 b
# v3() 800 900 953 900 1000 2700 100 b
# v4() 400 500 584 600 600 1500 100 a
# v5() 3600 3700 3983 3800 3900 10600 100 c
Additional comments on this can been seen, here:
Find position of first value greater than X in a vector
Probably this is was fastest. #Gregor found a faster one.
which(b <= a)[1]
# [1] 4
Benchmark
set.seed(42)
a <- sort(sample(1:100, 1e6, replace=T))
b <- 6
microbenchmark::microbenchmark(match(min(a[b <= a ]), a),
min(which(a >= b)),
which(b <= a)[1],
match(TRUE, b <= a))
Unit: milliseconds
expr min lq mean median uq max neval cld
match(min(a[b <= a]), a) 14.273060 14.770065 27.228264 26.043620 29.675095 273.45221 100 b
min(which(a >= b)) 9.537230 9.781012 13.306348 10.207344 10.445258 27.28422 100 a
which(b <= a)[1] 8.192664 8.475993 14.774976 8.869157 19.935542 267.22919 100 a
match(TRUE, b <= a) 6.045952 6.274809 9.125632 6.404545 6.845165 24.42210 100 a
You can still use match: match(TRUE, b <= a)
findInterval gives the index of b in a or the index of the largest value in a less than b if there is no match; therefore, use findInterval and then add 1 if it is not an exact match since you want the next rather than prior index. a must be sorted in ascending order.
ix <- findInterval(b, a, all.inside = TRUE)
ix <- ix + (a[ix] < b)
ix
## [1] 4
or possibly the first line is sufficient if you can change the requirements slightly. There is some question regarding how edge cases are to be handled but I have added all.inside = TRUE to handle one such situation.
I have a dataframe where one of the columns is of type string.
I would like to count the number of unique/distinct characters in that string.
eg.
"banana" -> 3
'he' -> 2
A reproducible example:
I have a data frame where a column is type string. I would need to filter out those rows where the string has only one distinct character.
col1 col2 col3
new york
qqqq
melbourne
aaaaaa
I would need to have a final data frame like
col1 col2 col3
new york
melbourne
So delete those rows completely.
This makes no assumption about "characters" being in letters and avoids making R data structures:
library(inline)
.char_unique_code <- "
std::vector < std::string > s = as< std::vector < std::string > >(x);
unsigned int input_size = s.size();
std::vector < std::string > chrs(input_size);
for (unsigned int i=0; i<input_size; i++) {
std::string t = s[i];
for (std::string::iterator chr=t.begin();
chr != t.end(); ++chr) {
if (chrs[i].find(*chr) == std::string::npos) {
chrs[i] += *chr;
}
}
}
return(wrap(chrs));
"
char_unique <-
rcpp(sig=signature(x="std::vector < std::string >"),
body=.char_unique_code,
includes=c("#include <string>",
"#include <iostream>"))
nchar(char_unique("banana"))
## [1] 3
Why avoid making R lists?
library(stringr)
library(microbenchmark)
library(ggplot2)
str_char_ct_unique <- function(x) sum(!!str_count(x, letters))
char_ct_unique <- function(x) nchar(char_unique(x))
r_char_ct_unique <- function(x) length(unique(strsplit(x, "")[[1]]))
microbenchmark(stringr=str_char_ct_unique("banana"),
rcpp=char_ct_unique("banana"),
r=r_char_ct_unique("banana"),
times=1000) -> mb
## Unit: microseconds
## expr min lq mean median uq max neval cld
## stringr 125.978 129.1765 139.271061 130.9415 139.3870 334.563 1000 c
## rcpp 1.458 2.0160 3.002184 2.6345 3.1365 32.244 1000 a
## r 4.797 6.1070 8.292847 7.3380 8.0505 86.709 1000 b
Let's make a vectorized version of Cath's pure R solution (not bothering with the other one since it's way too constrained) and compare against a vector of small random strings:
library(random)
library(purrr)
char_ct_unique <- function(x) nchar(char_unique(x))
r_char_ct_unique <- function(x) map_int(map(x, function(x) unique(strsplit(x, "")[[1]])), length)
tst <- as.vector(randomStrings(n=100, len=20, unique=FALSE))
sum(char_ct_unique(tst) == r_char_ct_unique(tst))
## [1] 100
microbenchmark(rcpp=char_ct_unique(tst),
r=r_char_ct_unique(tst),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## rcpp 53.643 56.2375 66.69311 60.2740 68.178 250.992 1000 a
## r 683.420 759.4070 952.14407 822.8905 922.710 6513.508 1000 b
And, now for the 10,000 character random string:
dat <- readLines("https://gist.githubusercontent.com/hrbrmstr/f80b157b383134b37fb3/raw/534b4c79e7c51710c6db6961bc5dc5ec25c4242b/gistfile1.txt")
digest::digest(dat, "sha1", serialize=FALSE)
## [1] "6c6695dd2f314762c81e6e6891ec1c138a4f3a08"
nchar(dat)
## [1] 10000
char_ct_unique(dat) == r_char_ct_unique(dat)
## [1] TRUE
microbenchmark(rcpp=char_ct_unique(dat),
r=r_char_ct_unique(dat),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## rcpp 73.801 110.681 122.9091 118.330 139.373 308.602 1000 a
## r 377.556 430.703 533.9120 448.631 492.466 4275.568 1000 b
I forgot to do David's "fixed" version:
f_r_char_ct_unique <- function(x) map_int(map(x, function(x) unique(strsplit(x, "", fixed=TRUE)[[1]])), length)
and, let's make it more interesting:
dat <- c(dat, toupper(dat), tolower(dat))
microbenchmark(rcpp=char_ct_unique(dat),
r=r_char_ct_unique(dat),
fr=f_r_char_ct_unique(dat),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval
## rcpp 218.299 284.143 331.319 332.281 358.1215 696.907 1000
## r 1266.976 1442.460 1720.320 1494.167 1634.7870 5896.685 1000
## fr 1260.027 1444.298 1769.664 1501.416 1652.8895 78457.729 1000
We can use str_count
library(stringr)
sum(!!str_count(str1, letters))
#[1] 3
Update
Using the new dataset
i1 <- !sapply(df1$col1, function(x) any(str_count(x, letters)>1))
df1[i1,,drop=FALSE]
data
str1 <- "banana"
I have a list of several vectors. I would like to check whether all vectors in the list are equal. There's identical which only works for pairwise comparison. So I wrote the following function which looks ugly to me. Still I did not find a better solution. Here's my RE:
test_true <- list(a=c(1,2,3),b=c(1,2,3),d=c(1,2,3))
test_false <- list(a=c(1,2,3),b=c(1,2,3),d=c(1,32,13))
compareList <- function(li){
stopifnot(length(li) > 1)
l <- length(li)
res <- lapply(li[-1],function(X,x) identical(X,x),x=li[[1]])
res <- all(unlist(res))
res
}
compareList(test_true)
compareList(test_false)
Any suggestions? Are there any native checks for identical for more than just pairwise comparison?
How about
allSame <- function(x) length(unique(x)) == 1
allSame(test_true)
# [1] TRUE
allSame(test_false)
# [1] FALSE
As #JoshuaUlrich pointed out below, unique may be slow on lists. Also, identical and unique may use different criteria. Reduce is a function I recently learned about for extending pairwise operations:
identicalValue <- function(x,y) if (identical(x,y)) x else FALSE
Reduce(identicalValue,test_true)
# [1] 1 2 3
Reduce(identicalValue,test_false)
# [1] FALSE
This inefficiently continues making comparisons after finding one non-match. My crude solution to that would be to write else break instead of else FALSE, throwing an error.
I woud do:
all.identical <- function(l) all(mapply(identical, head(l, 1), tail(l, -1)))
all.identical(test_true)
# [1] TRUE
all.identical(test_false)
# [1] FALSE
To summarize the solutions. Data for the tests:
x1 <- as.list(as.data.frame(replicate(1000, 1:100)))
x2 <- as.list(as.data.frame(replicate(1000, sample(1:100, 100))))
Solutions:
comp_list1 <- function(x) length(unique.default(x)) == 1L
comp_list2 <- function(x) all(vapply(x[-1], identical, logical(1L), x = x[[1]]))
comp_list3 <- function(x) all(vapply(x[-1], function(x2) all(x[[1]] == x2), logical(1L)))
comp_list4 <- function(x) sum(duplicated.default(x)) == length(x) - 1L
Test on the data:
for (i in 1:4) cat(match.fun(paste0("comp_list", i))(x1), " ")
#> TRUE TRUE TRUE TRUE
for (i in 1:4) cat(match.fun(paste0("comp_list", i))(x2), " ")
#> FALSE FALSE FALSE FALSE
Benchmarks:
library(microbenchmark)
microbenchmark(comp_list1(x1), comp_list2(x1), comp_list3(x1), comp_list4(x1))
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> comp_list1(x1) 138.327 148.5955 171.9481 162.013 188.9315 269.342 100 a
#> comp_list2(x1) 1023.932 1125.2210 1387.6268 1255.985 1403.1885 3458.597 100 b
#> comp_list3(x1) 1130.275 1275.9940 1511.7916 1378.789 1550.8240 3254.292 100 c
#> comp_list4(x1) 138.075 144.8635 169.7833 159.954 185.1515 298.282 100 a
microbenchmark(comp_list1(x2), comp_list2(x2), comp_list3(x2), comp_list4(x2))
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> comp_list1(x2) 139.492 140.3540 147.7695 145.380 149.6495 218.800 100 a
#> comp_list2(x2) 995.373 1030.4325 1179.2274 1054.711 1136.5050 3763.506 100 b
#> comp_list3(x2) 977.805 1029.7310 1134.3650 1049.684 1086.0730 2846.592 100 b
#> comp_list4(x2) 135.516 136.4685 150.7185 139.030 146.7170 345.985 100 a
As we see the most efficient solutions based on the duplicated and unique functions.
PUtting in my self-promoting suggestion for cgwtools::approxeq which essentially does what all.equal does but returns a vector of logical values indicating equality or not.
So: depends whether you want exact equality or floating-point-representational equality.
UPDATE
The overall best solution:
all.identical.list <- function(l) identical(unname(l[-length(l)]), unname(l[-1]))
Implementing Frank's solution with a break:
all.identical <- function(l) class(try(Reduce(function(x, y) if(identical(x, y)) x else break, l), silent = TRUE)) != "try-error"
Continuing with Artem's benchmarking and adding the solution from Jake's comment, speeds are pretty dependent on the objects being compared, but all.identical.list is consistently the fastest (or very close to fastest):
library(microbenchmark)
all.identical.list <- function(l) identical(unname(l[-length(l)]), unname(l[-1]))
all.identical <- function(l) !is.null(Reduce(function(x, y) if(identical(x, y)) x else NULL, l))
all.identical.break <- function(l) class(try(Reduce(function(x, y) if(identical(x, y)) x else break, l), silent = TRUE)) != "try-error"
comp_list4 <- function(l) sum(duplicated.default(l)) == length(l) - 1L
comp_list5 <- function(l) all(duplicated.default(l)[-1])
x1 <- as.list(as.data.frame(replicate(1000, 1:100)))
x2 <- as.list(as.data.frame(replicate(1000, sample(100))))
microbenchmark(all.identical.list(x1), all.identical(x1), all.identical.break(x1), comp_list4(x1), comp_list5(x1), check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> all.identical.list(x1) 60.3 66.65 125.803 72.90 94.30 3271.5 100
#> all.identical(x1) 1134.0 1209.45 1484.864 1265.85 1655.95 5085.3 100
#> all.identical.break(x1) 1156.6 1226.75 1602.869 1337.25 1698.05 5030.4 100
#> comp_list4(x1) 170.5 179.35 234.169 184.75 200.40 2164.1 100
#> comp_list5(x1) 173.3 182.35 213.542 187.55 194.50 1704.0 100
microbenchmark(all.identical.list(x2), all.identical(x2), all.identical.break(x2), comp_list4(x2), comp_list5(x2), check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> all.identical.list(x2) 31.0 34.30 47.182 37.65 46.90 180.8 100
#> all.identical(x2) 1002.8 1059.85 1237.426 1106.65 1278.35 3404.4 100
#> all.identical.break(x2) 119.4 137.15 156.748 147.60 164.00 340.8 100
#> comp_list4(x2) 165.0 172.35 189.869 181.20 192.25 334.6 100
#> comp_list5(x2) 166.6 171.10 188.782 179.25 190.55 394.9 100
x1 <- as.list(as.data.frame(replicate(10, 1:1e5)))
x2 <- as.list(as.data.frame(replicate(10, sample(1e5))))
microbenchmark(all.identical.list(x1), all.identical(x1), all.identical.break(x1), comp_list4(x1), comp_list5(x1), check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> all.identical.list(x1) 211.4 217.25 264.978 229.5 258.00 711.4 100
#> all.identical(x1) 182.2 187.50 218.062 195.3 217.05 499.4 100
#> all.identical.break(x1) 194.8 207.25 258.043 222.7 266.70 1013.4 100
#> comp_list4(x1) 1457.3 1495.30 1659.118 1543.0 1806.75 2689.0 100
#> comp_list5(x1) 1457.7 1502.45 1685.194 1553.5 1769.10 3021.2 100
microbenchmark(all.identical.list(x2), all.identical(x2), all.identical.break(x2), comp_list4(x2), comp_list5(x2), check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> all.identical.list(x2) 3.1 4.45 7.894 6.35 9.85 48.5 100
#> all.identical(x2) 12.0 15.25 19.404 17.05 22.05 56.1 100
#> all.identical.break(x2) 114.3 128.80 172.876 144.90 190.45 511.5 100
#> comp_list4(x2) 1292.2 1342.35 1443.261 1397.00 1472.25 1908.5 100
#> comp_list5(x2) 1292.4 1364.90 1478.291 1409.50 1484.80 2467.2 100
this also works
m <- combn(length(test_true),2)
for(i in 1:ncol(m)){
print(all(test_true[[m[,i][1]]] == test_true[[m[,i][2]]]))
}
Here is a rather inefficient way to extract the number.
as.integer((x%%floor(x)*100)-(signif(x%%floor(x)*100,1)))
Does anyone else have a better way?
You could try
as.integer(x*100) %% 10
Not sure if it's faster after all the coercion to string and back, but you can try location-based character manipulation:
as.numeric(sub("[-0-9+].[0-9]([0-9]).+","\\1",as.character(x),perl=TRUE)),
as.numeric(substr(strsplit(as.character(x),".",fixed=TRUE)[[1]][2],2,2))
Edit: It turns out it's slower.
x <- runif(1)
re.fxn <- function(x) {
as.integer(sub("[0-9+].[0-9]([0-9]).+","\\1",as.character(x),perl=TRUE))
}
ss.fxn <- function(x) {
as.integer(substr(strsplit(as.character(x),".",fixed=TRUE)[[1]][2],2,2))
}
ai.fxn <- function(x) {
as.integer(x*100) %% 10
}
microbenchmark(
as.integer((x%%floor(x)*100)-(signif(x%%floor(x)*100,1))),
as.integer(x*100) %% 10,
re.fxn(x) ,
ss.fxn(x),
ai.fxn(x),
times=10
)
expr min
1 ai.fxn(x) 5989
2 as.integer((x%%floor(x) * 100) - (signif(x%%floor(x) * 100, 1))) 11121
3 as.integer(x * 100)%%10 4278
4 re.fxn(x) 103508
5 ss.fxn(x) 40206
lq median uq max
1 6844 8555.0 8556 10266
2 12832 12832.5 14543 17965
3 4278 4278.0 5134 11121
4 103509 104364.0 106075 207017
5 41062 42344.5 42773 65869
Seems like a speed win for the modulo. And the difference only narrows slightly if you're looking for higher-order digits in a generic function robust to negative x's:
ss.fxn <- function(x,d=2) {
as.integer(substr(strsplit(as.character(x),".",fixed=TRUE)[[1]][2],d,d))
}
ai.fxn <- function(x,d=2) {
as.integer(abs(x)*10^d) %% 10
}
d <- 8
microbenchmark(
ss.fxn(x,d),
ai.fxn(x,d),
times=1000
)
expr min lq median uq max
1 ai.fxn(x, d) 6845 8555 10266 10266 75280
2 ss.fxn(x, d) 40206 41061 41062 41917 284006