Test if a vector is contained in another vector, including repetitions

I've been struggling with this one for a while: given two vectors, each possibly containing repeated elements, how do I test whether one is entirely contained in the other? %in% does not account for repetitions. I can't think of an elegant solution that doesn't rely on something from the apply family.
x <- c(1, 2, 2, 2)
values <- c(1, 1, 1, 2, 2, 3, 4, 5, 6)
# returns TRUE, even though x has more 2s than values does
all(x %in% values)
# inelegant solution
"%contains%" <- function(values, x){
  n <- intersect(x, values)
  all(sapply(n, function(i) sum(values == i) >= sum(x == i)))
}
# which yields the following:
> values %contains% x
[1] FALSE
> values <- c(values, 2)
> values %contains% x
[1] TRUE
Benchmarking update
I may have found another solution in addition to the answer provided by Marat below.
# values and x must all be non-negative - can change the -1 below accordingly
"%contains%" <-
function(values, x){
t <- Reduce(function(.x, .values) .values[-which.max(.values == .x)]
, x = x
, init = c(-1, values))
t[1] == -1
}
Benchmarking all the answers so far, including thelatemail's modification of Marat's answer, using both a large and a small x:
library(microbenchmark)
set.seed(31415)
values <- sample(c(0:100), size = 100000, replace = TRUE)
set.seed(11235)
x_lrg <- sample(c(0:100), size = 1000, replace = TRUE)
x_sml <- c(1, 2, 2, 2)
lapply(list(x_sml, x_lrg), function(x){
  microbenchmark(hoho_sapply(values, x),
                 marat_table(values, x),
                 marat_tlm(values, x),
                 hoho_reduce(values, x),
                 unit = "relative")
})
# Small x
# [[1]]
# Unit: relative
#                    expr       min        lq      mean    median        uq       max neval
#  hoho_sapply(values, x)  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   100
#  marat_table(values, x) 12.718392 10.966770  7.487895  9.260099  8.648351  1.819833   100
#    marat_tlm(values, x)  1.354452  1.181094  1.026373  1.088879  1.266939  1.029560   100
#  hoho_reduce(values, x)  2.951577  2.748087  2.069830  2.487790  2.216625  1.097648   100
#
# Large x
# [[2]]
# Unit: relative
#                    expr       min        lq      mean    median        uq        max neval
#  hoho_sapply(values, x)  1.158303  1.172352  1.101410  1.177746  1.096661  0.6940260   100
#  marat_table(values, x)  1.000000  1.000000  1.000000  1.000000  1.000000  1.0000000   100
#    marat_tlm(values, x)  1.099669  1.059256  1.102543  1.071960  1.072881  0.9857229   100
#  hoho_reduce(values, x) 85.666549 81.391495 69.089366 74.173366 66.943621 27.9766047   100

Try using table, e.g.:
"%contain%" <- function(values,x) {
tx <- table(x)
tv <- table(values)
z <- tv[names(tx)] - tx
all(z >= 0 & !is.na(z))
}
Some examples:
> c(1, 1, 1, 2, 2, 3, 4, 5, 6) %contain% c(1,2,2,2)
[1] FALSE
> c(1, 1, 1, 2, 2, 3, 4, 5, 6, 2) %contain% c(1,2,2,2)
[1] TRUE
> c(1, 1, 1, 2, 2, 3, 4, 5, 6) %contain% c(1,2,2)
[1] TRUE
> c(1, 1, 1, 2, 2, 3, 4, 5, 6) %contain% c(1,2,2,7)
[1] FALSE

Related

Plotting CDF and PDF in R with custom function

I was wondering if there was any way to plot this PDF and CDF in R (the piecewise density worked with in the answers below: 0.5 on [0, 1), 1 on [1, 1.5), and 0 elsewhere). I found these on a different question a user asked, and was curious.
I know that I have to create a function and then plot it, but I'm struggling with the different parameters and am unsure how to translate them into R. I have only ever plotted a PDF/CDF using a normal distribution, or from datasets.
You can write the pdf and cdf as functions, using ifelse to specify the values within the ranges, then simply plot the functions over the desired range:
pdf <- function(x) {
  ifelse(x >= 0 & x < 1, 0.5, ifelse(x < 1.5 & x >= 1, 1, 0))
}
cdf <- function(x) {
  ifelse(x < 0, 0,
         ifelse(x >= 0 & x < 1, 0.5 * x,
                ifelse(x < 1.5 & x >= 1, x - 0.5, 1)))
}
plot(pdf, xlim = c(-1, 2), type = "s")
plot(cdf, xlim = c(-1, 2))
Created on 2022-10-27 with reprex v2.0.2
ifelse can be very slow; we can fill an output vector instead. numeric() creates a vector of zeroes of a specified length, and we then simply change everything that should not be zero.
pdf_vec <- function(x) {
  out <- numeric(length(x))
  out[x >= 0 & x < 1] <- .5
  out[x >= 1 & x < 1.5] <- 1
  out
}
cdf_vec <- function(x) {
  out <- numeric(length(x))
  tmp <- x >= 0 & x < 1
  out[tmp] <- .5*x[tmp]
  tmp <- x >= 1 & x < 1.5
  out[tmp] <- x[tmp] - .5
  tmp <- x >= 1.5
  out[tmp] <- 1
  out
}
set.seed(42)
x <- rnorm(1e6)
stopifnot(all.equal(cdf(x), cdf_vec(x)))
stopifnot(all.equal(pdf(x), pdf_vec(x)))
@Allan Camero already showed nicely how to plot it.
Microbenchmark
It's about three times faster than the ifelse solution.
microbenchmark::microbenchmark(
  cdf_ifelse = cdf(x), cdf_vec = cdf_vec(x), check = 'equal'
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# cdf_ifelse 110.66285 121.25428 133.0789 133.86041 142.53296 167.64401 100 b
# cdf_vec 43.56277 45.08759 48.8924 46.83869 49.46047 74.95487 100 a
microbenchmark::microbenchmark(
  pdf_ifelse = pdf(x), pdf_vec = pdf_vec(x), check = 'equal'
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# pdf_ifelse 71.39501 76.71747 101.17738 81.43037 87.82162 192.3919 100 b
# pdf_vec 27.82642 30.02056 34.55301 30.38457 31.29751 133.2798 100 a
We can try the code below, writing the pdf with indicator functions and obtaining the cdf by numerical integration:
f <- function(x) 0.5 * (x >= 0) + 0.5 * (x >= 1) - (x >= 1.5)
F <- Vectorize(function(x) integrate(f, -Inf, x)$value)
plot(f, -2, 2, col = "red")
curve(F, -2, 2, add = TRUE, col = "blue")
legend(-1, 0.8,
       legend = c("pdf", "cdf"),
       col = c("red", "blue"),
       lty = 1:2, cex = 2,
       box.col = "white"
)
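As a quick sanity check (an addition, not part of the original answer), the density defined by f should integrate to 1:
integrate(f, -2, 2)$value # should be (approximately) 1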

Alternative to mapply to select sample

I created a mapply function to select samples from a dataset, but are there faster ways to do it that avoid mapply? It is slow and I have a larger dataset. My goal is to use more matrix/vector operations and to rely less on lists.
# A list of sets of data to be sampled from
bl <- list(list(c(1, 2), c(2, 3), c(3, 4), c(4, 5), c(5, 6), c(6, 7), c(7, 8), c(8, 9)),
           list(c(1, 2, 3), c(2, 3, 4), c(3, 4, 5), c(4, 5, 6), c(5, 6, 7), c(6, 7, 8)),
           list(c(1, 2, 3, 4, 5), c(2, 3, 4, 5, 6), c(3, 4, 5, 6, 7), c(4, 5, 6, 7, 8), c(5, 6, 7, 8, 9)))
# Number of elements to be selected from each set
kn <- c(5, 4, 3)
# Total number of elements in each set
nb <- c(8, 6, 5)
# This outputs a list, but I would prefer a matrix
bl_func <- function() mapply(function(x, y, z) {
  x[sample.int(y, z, replace = TRUE)]
}, bl, nb, kn, SIMPLIFY = FALSE)
EDIT
As suggested by @LMc, I also tried parallel::mcmapply, though for this small example the forking overhead actually makes it slower:
mc.cores <- parallel::detectCores() - 1
bl_func <- function() parallel::mcmapply(function(x, y, z) {
  x[sample.int(y, z, replace = TRUE)]
}, bl, nb, kn, SIMPLIFY = FALSE)
bl_func.0 <- function() mapply(function(x, y, z) {
  x[sample.int(y, z, replace = TRUE)]
}, bl, nb, kn, SIMPLIFY = FALSE)
library(microbenchmark)
microbenchmark(
  para = bl_func(),
  nopara = bl_func.0(),
  times = 100
)
Unit: microseconds
   expr      min       lq  mean  median    uq   max neval
   para 11601.12 18176.46 19901 20402.4 21872 26457   100
 nopara    37.34    90.86  1275   246.5  1311  9159   100
I am still curious, though, about other ways to speed things up without the aid of parallel processing. Any ideas will be appreciated!
Use a tool designed for speed and large datasets, e.g. data.table.
To do this you would need to reshape your data from lists to a data.table, which is in any case a good idea.
Here is an attempt:
require(data.table)
x <- lapply(bl, function(x) data.table(t(data.frame(x))))
x <- lapply(x, melt)
for (i in 1:length(x)) x[[i]][, group := i]
x <- rbindlist(x)
Now the original list of lists is structured as a data.table with 3 columns: value containing the actual data, variable identifying the vector within each list, and group identifying the list ID.
> head(x)
   variable value group
1:       V1     1     1
2:       V1     2     1
3:       V1     3     1
4:       V1     4     1
5:       V1     5     1
6:       V1     6     1
data.table has a by argument, which means we can sample rows (.SD) by one or more columns of the data.table, like this:
x[, .SD[sample(.N, sample(nb, 1), replace = TRUE)], by = group]
   group variable value
1:     1       V2     6
2:     1       V2     5
3:     1       V1     6
4:     1       V1     7
5:     1       V1     3
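If you want exactly kn[i] draws from group i, as in the original mapply call, data.table's .GRP group counter can index kn; a minimal sketch along the same lines as the code above (note it samples rows of the melted table, i.e. individual values, rather than whole vectors):
x[, .SD[sample(.N, kn[.GRP], replace = TRUE)], by = group]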

Using optimisation function (optimise) together with dbinom in R (optimisation issue)

When p = 0.5, n = 5 and x = 3
dbinom(3,5,0.5) = 0.3125
Let's say I don't know p (n and x are known) and want to find it.
binp <- function(bp) dbinom(3,5,bp) - 0.3125
optimise(binp, c(0,1))
It does not return 0.5. Also, why is
dbinom(3,5,0.5) == 0.3125 #FALSE
But,
x <- dbinom(3,5,0.5)
x == dbinom(3,5,0.5) #TRUE
optimize() searches for the parameter that minimizes the output of the function. Your function can return a negative value (e.g., binp(0.1) is -0.3044). If you want the parameter that minimizes the difference from zero, it would be a good idea to use sqrt((...)^2), i.e. the absolute difference. If you want the parameter that makes the output zero, uniroot() will help you. Also, the parameter you want isn't uniquely determined. (Note: x <- dbinom(3, 5, 0.5); x == dbinom(3, 5, 0.5) is equivalent to dbinom(3, 5, 0.5) == dbinom(3, 5, 0.5).)
## check output of dbinom(3, 5, prob)
input <- seq(0, 1, 0.001)
output <- Vectorize(dbinom, "prob")(3, 5, input)
plot(input, output, type="l")
abline(h = dbinom(3, 5, 0.5), col = 2) # there are two answers
max <- optimize(function(x) dbinom(3, 5, x), c(0, 1), maximum = T)$maximum # [1] 0.6000006
binp <- function(bp) dbinom(3,5,bp) - 0.3125 # your function
uniroot(binp, c(0, max))$root # [1] 0.5000036
uniroot(binp, c(max, 1))$root # [1] 0.6946854
binp2 <- function(bp) sqrt((dbinom(3,5,bp) - 0.3125)^2)
optimize(binp2, c(0, max))$minimum # [1] 0.499986
optimize(binp2, c(max, 1))$minimum # [1] 0.6947186
dbinom(3, 5, 0.5) == 0.3125 # [1] FALSE
round(dbinom(3, 5, 0.5), 4) == 0.3125 # [1] TRUE
format(dbinom(3, 5, 0.5), digits = 16) # [1] "0.3124999999999999"
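For a tolerance-based comparison instead of exact equality, base R's all.equal() is the usual idiom (a general note, not from the original answer):
isTRUE(all.equal(dbinom(3, 5, 0.5), 0.3125)) # [1] TRUE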

extract information from a data frame parametrically (via a menu selection)

I would like to extract information from a data frame parametrically.
That is:
A <- c(3, 10, 20, 30, 40)
B <- c(30, 100, 200, 300, 400)
DF <- data.frame(A, B)
DF[A%in%c(1, 2, 3, 4, 5), ] # it works
# But what if this is the case,
# which comes for example out of a user-menu selection:
m <- "A%in%"
k <- c(1, 2, 3, 4, 5)
# How can we make something like that work:
DF[eval(parse(text=c(m, k))), ]
This works:
DF[eval(parse(text = paste0(m, deparse(k)))), ]
# A B
#1 3 30
However, eval(parse()) should be avoided. Maybe this would be an alternative for you?
x <- "A"
fun <- "%in%"
k <- c(1, 2, 3, 4, 5)
DF[getFunction(fun)(get(x), k), ]
# A B
#1 3 30
Also,
DF[eval(parse(text=paste(m, substitute(k)))),]
or
DF[eval(parse(text=paste(m, quote(k)))),]
or
DF[eval(parse(text=paste(m, "k"))),]
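If the menu can return the column name and the values separately (rather than the pre-built string "A%in%"), plain indexing avoids parse() altogether; a minimal sketch of that variant, with col and k standing in for the menu's output:
col <- "A"                 # column chosen in the menu
k <- c(1, 2, 3, 4, 5)      # values chosen in the menu
DF[DF[[col]] %in% k, ]
# A B
#1 3 30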

Is there a weighted.median() function?

I'm looking for something similar in form to weighted.mean(). I've found some solutions via search that write out the entire function but would appreciate something a bit more user friendly.
The following packages all have a function to calculate a weighted median: 'aroma.light', 'isotone', 'limma', 'cwhmisc', 'ergm', 'laeken', 'matrixStats', 'PSCBS', and 'bigvis' (on GitHub).
To find them I used the invaluable findFn() from the 'sos' package, which is an extension of R's built-in help.
findFn('weighted median')
Or,
???'weighted median'
since ??? is a shortcut for findFn() in the same way that ?some.function is a shortcut for help(some.function).
Some experience using the answers from @wkmor1 and @Jaitropmange.
I've checked 3 functions from 3 packages: isotone, laeken, and matrixStats. Only matrixStats works properly. The other two (just like the median(rep(x, times=w)) solution) give integer output. Since I was calculating the median age of populations, decimal places matter.
Reproducible example. Calculation of the median age of a population
df <- data.frame(age = 0:100,
                 pop = spline(c(4, 7, 9, 8, 7, 6, 4, 3, 2, 1), n = 101)$y)
library(isotone)
library(laeken)
library(matrixStats)
isotone::weighted.median(df$age,df$pop)
# [1] 36
laeken::weightedMedian(df$age,df$pop)
# [1] 36
matrixStats::weightedMedian(df$age,df$pop)
# [1] 36.164
median(rep(df$age, times=df$pop))
# [1] 35
Summary
matrixStats::weightedMedian() is the reliable solution
To calculate the weighted median of a vector x using a same-length vector of (integer) weights w:
median(rep(x, times=w))
This is just a simple solution, ready to use almost anywhere.
weighted.median <- function(x, w) {
  w <- w[order(x)]
  x <- x[order(x)]
  prob <- cumsum(w)/sum(w)
  ps <- which(abs(prob - .5) == min(abs(prob - .5)))
  return(x[ps])
}
Really old post, but I just came across it and did some testing of the different methods. spatstat::weighted.median() seemed to be about 14 times faster than median(rep(x, times=w)), and it's actually noticeable if you want to run the function more than a couple of times. Testing was with a relatively large survey, about 15,000 people.
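A minimal sketch of that kind of timing, using synthetic data in place of the survey (which isn't shown here) and spatstat.geom, where weighted.median() now lives (as in the comparison code further down); the exact speed-up will depend on the data:
library(microbenchmark)
set.seed(1)
x <- rnorm(15000)
w <- sample(1:5, length(x), replace = TRUE)
microbenchmark(
  spatstat = spatstat.geom::weighted.median(x, w),
  rep      = median(rep(x, times = w))
)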
One can also use stats::density to create a weighted PDF, then convert this to a CDF, as elaborated here:
my_wtd_q <- function(x, w, prob, n = 4096)
  with(density(x, weights = w/sum(w), n = n),
       x[which.max(cumsum(y*(x[2L] - x[1L])) >= prob)])
Then my_wtd_q(x, w, .5) will be the weighted median.
One could also be more careful to ensure that the total area under the density is one by re-normalizing.
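A sketch of that re-normalisation (an addition, not code from the original answer): accumulate the area under the density, then divide by the total so the CDF ends at exactly 1.
my_wtd_q_norm <- function(x, w, prob, n = 4096) {
  d <- density(x, weights = w / sum(w), n = n)
  cdf <- cumsum(d$y * (d$x[2L] - d$x[1L]))  # accumulated area under the density
  cdf <- cdf / cdf[length(cdf)]             # re-normalise so the last value is 1
  d$x[which.max(cdf >= prob)]
}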
A base R way to get a weighted median is to order by the values, build the cumulative sum of the weights, and take the value(s) at half of the total weight.
medianWeighted <- function(x, w, q = .5) {
  n <- length(x)
  i <- order(x)
  w <- cumsum(w[i])
  p <- w[n] * q
  j <- findInterval(p, w)
  Vectorize(function(p, j)
    if (w[n] <= 0) NA else
    if (j < 1) x[i[1]] else
    if (j == n) x[i[n]] else
    if (w[j] == p) (x[i[j]] + x[i[j+1]]) / 2 else
    x[i[j+1]])(p, j)
}
This gives the following results with simple input data:
medianWeighted(c(10, 40), c(1, 2))
#[1] 40
median(rep(c(10, 40), c(1, 2)))
#[1] 40
medianWeighted(c(10, 40), c(2, 1))
#[1] 10
median(rep(c(10, 40), c(2, 1)))
#[1] 10
medianWeighted(c(10, 40), c(1.5, 2))
#[1] 40
medianWeighted(c(10, 40), c(3, 4))
#[1] 40
median(rep(c(10, 40), c(3, 4)))
#[1] 40
medianWeighted(c(10, 40), c(1.5, 1.5))
#[1] 25
medianWeighted(c(10, 40), c(3, 3))
#[1] 25
median(rep(c(10, 40), c(3, 3)))
#[1] 25
medianWeighted(c(10, 40), c(0, 1))
#[1] 40
medianWeighted(c(10, 40), c(1, 0))
#[1] 10
medianWeighted(c(10, 40), c(0, 0))
#[1] NA
It can also be used for other quantiles:
medianWeighted(1:10, 10:1, seq(0, 1, 0.25))
[1] 1 2 4 6 10
Compare with other methods.
# Functions from other answers
weighted.median <- function(x, w) {
  w <- w[order(x)]
  x <- x[order(x)]
  prob <- cumsum(w)/sum(w)
  ps <- which(abs(prob - .5) == min(abs(prob - .5)))
  return(x[ps])
}
my_wtd_q <- function(x, w, prob, n = 4096)
  with(density(x, weights = w/sum(w), n = n),
       x[which.max(cumsum(y*(x[2L] - x[1L])) >= prob)])
weighted.quantile <- function(x, w, probs = seq(0, 1, 0.25),
                              na.rm = FALSE, names = TRUE) {
  if (any(probs > 1) | any(probs < 0)) stop("'probs' outside [0,1]")
  if (length(w) == 1) w <- rep(w, length(x))
  if (length(w) != length(x)) stop("w must have length 1 or be as long as x")
  if (isTRUE(na.rm)) {
    w <- w[!is.na(x)]   # keep weights aligned with the non-missing x values
    x <- x[!is.na(x)]
  }
  w <- w[order(x)] / sum(w)
  x <- x[order(x)]
  cum_w <- cumsum(w) - w * (1 - (seq_along(w) - 1) / (length(w) - 1))
  res <- approx(x = cum_w, y = x, xout = probs)$y
  if (isTRUE(names)) {
    res <- setNames(res, paste0(format(100 * probs, digits = 7), "%"))
  }
  res
}
Methods
M <- alist(
  medRep = median(rep(DF$x, DF$w)),
  isotone = isotone::weighted.median(DF$x, DF$w),
  laeken = laeken::weightedMedian(DF$x, DF$w),
  spatstat1 = spatstat.geom::weighted.median(DF$x, DF$w, type = 1),
  spatstat2 = spatstat.geom::weighted.median(DF$x, DF$w, type = 2),
  spatstat4 = spatstat.geom::weighted.median(DF$x, DF$w, type = 4),
  survey = survey::svyquantile(~x, survey::svydesign(id = ~1, weights = ~w, data = DF), 0.5)$x[1],
  RAndres = weighted.median(DF$x, DF$w),
  matrixStats = matrixStats::weightedMedian(DF$x, DF$w),
  MichaelChirico = my_wtd_q(DF$x, DF$w, .5),
  Leonardo = weighted.quantile(DF$x, DF$w, .5),
  GKi = medianWeighted(DF$x, DF$w)
)
Results
DF <- data.frame(x=c(10, 40), w=c(1, 2))
sapply(M, eval)
# medRep isotone laeken spatstat1 spatstat2
# 40.00000 40.00000 40.00000 40.00000 25.00000
# spatstat4 survey RAndres matrixStats MichaelChirico
# 17.50000 40.00000 10.00000 30.00000 34.15005
# Leonardo.50% GKi
# 25.00000 40.00000
DF <- data.frame(x=c(10, 40), w=c(1, 1))
sapply(M, eval)
# medRep isotone laeken spatstat1 spatstat2
# 25.00000 25.00000 40.00000 10.00000 10.00000
# spatstat4 survey RAndres matrixStats MichaelChirico
# 10.00000 10.00000 10.00000 25.00000 25.05044
# Leonardo.50% GKi
# 25.00000 25.00000
In those two cases, only isotone and GKi give results identical to what median(rep(x, w)) returns.
If you're working with the survey package, assuming you've defined your survey design and x is your variable of interest:
svyquantile(~x, mydesign, c(0.5))
I got here looking for weighted quantiles, so I thought I might as well leave for future readers what I ended up with. Naturally, using probs = 0.5 will return the weighted median.
I started with MichaelChirico's answer, which unfortunately was off at the edges. Then I decided to switch from density() to approx(). Finally, I believe I nailed the correction factor to ensure consistency with the default algorithm of the unweighted quantile().
weighted.quantile <- function(x, w, probs = seq(0, 1, 0.25),
                              na.rm = FALSE, names = TRUE) {
  if (any(probs > 1) | any(probs < 0)) stop("'probs' outside [0,1]")
  if (length(w) == 1) w <- rep(w, length(x))
  if (length(w) != length(x)) stop("w must have length 1 or be as long as x")
  if (isTRUE(na.rm)) {
    w <- w[!is.na(x)]   # keep weights aligned with the non-missing x values
    x <- x[!is.na(x)]
  }
  w <- w[order(x)] / sum(w)
  x <- x[order(x)]
  cum_w <- cumsum(w) - w * (1 - (seq_along(w) - 1) / (length(w) - 1))
  res <- approx(x = cum_w, y = x, xout = probs)$y
  if (isTRUE(names)) {
    res <- setNames(res, paste0(format(100 * probs, digits = 7), "%"))
  }
  res
}
When weights are uniform, the weighted quantiles are identical to regular unweighted quantiles:
x <- rnorm(100)
stopifnot(identical(weighted.quantile(x, w = 1), quantile(x)))
Example using the same data as in the weighted.mean() man page.
x <- c(3.7, 3.3, 3.5, 2.8)
w <- c(5, 5, 4, 1)/15
stopifnot(isTRUE(all.equal(
  weighted.quantile(x, w, 0:4/4, names = FALSE),
  c(2.8, 3.33611111111111, 3.46111111111111, 3.58157894736842, 3.7)
)))
And this is for whoever solely wants the weighted median value:
weighted.median <- function(x, w, ...) {
  weighted.quantile(x, w, probs = 0.5, names = FALSE, ...)
}
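Using the weighted.mean() man-page data from above, this returns the 50% value from the quantile check:
weighted.median(x, w)
# [1] 3.461111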
