Function for dpareto in R

I'm wondering if anybody has a function for dpareto written in R?
I'm not able to use the built-in function in R as I can't install the libraries.
Thanks!

It's relatively easy to rip out functions from R packages, for example, from:
http://cran.r-project.org/web/packages/VGAM/VGAM.pdf
dpareto <- function(x, location, shape, log = FALSE) {
  if (!is.logical(log.arg <- log) || length(log) != 1)
    stop("bad input for argument 'log'")
  rm(log)
  L <- max(length(x), length(location), length(shape))
  x <- rep(x, length.out = L)
  location <- rep(location, length.out = L)
  shape <- rep(shape, length.out = L)
  logdensity <- rep(log(0), length.out = L)
  xok <- (x > location)
  logdensity[xok] <- log(shape[xok]) + shape[xok] * log(location[xok]) -
    (shape[xok] + 1) * log(x[xok])
  if (log.arg) logdensity else exp(logdensity)
}
alpha <- 3; k <- exp(1); x <- seq(2.8, 8, len = 300)
plot(x, dpareto(x, location = alpha, shape = k), type = "l",
     main = "Pareto density split into 10 equal areas")

Related

How to calculate fuzzy performance index and normalized classification entropy in R

I am running fuzzy c-means clustering using the e1071 package. I want to decide the optimum number of clusters based on the fuzzy performance index (FPI) (extent of fuzziness) and the normalized classification entropy (NCE) (degree of disorganization of a specific class), given by the following formulas:
FPI = 1 - (c / (c - 1)) * (1 - sum_ik(μ_ik^2) / n)
NCE = (n / (n - c)) * (-sum_ik(μ_ik * log(μ_ik)) / n)
where c is the number of clusters, n is the number of observations, μ_ik is the fuzzy membership of observation i in cluster k, and log is the natural logarithm.
I am using the following code
library(e1071)
x <- rbind(matrix(rnorm(100,sd=0.3),ncol=2),
matrix(rnorm(100,mean=1,sd=0.3),ncol=2))
cl <- cmeans(x,2,20,verbose=TRUE,method="cmeans")
cl$membership
I have been able to extract the μ_ik, i.e. the fuzzy memberships. Now cmeans has to be run for different numbers of clusters, e.g. 2 to 6, and the FPI and NCE have to be calculated for each, to produce a plot of both indices against the number of clusters.
How can it be achieved in R?
Edit
I have tried the code provided by @nya on the iris dataset, using the following code
df <- scale(iris[-5])
FPI <- function(cmem){
  c <- ncol(cmem)
  n <- nrow(cmem)
  1 - (c / (c - 1)) * (1 - sum(cmem^2) / n)
}
NCE <- function(cmem){
  c <- ncol(cmem)
  n <- nrow(cmem)
  (n / (n - c)) * (- sum(cmem * log(cmem)) / n)
}
# prepare variables
cl <- list()
fpi <- nce <- NULL
# cycle through the desired number of clusters
for(i in 2:6){
  cl[[i]] <- cmeans(df, i, 20, method = "cmeans")
  fpi <- c(fpi, FPI(cl[[i]]$membership))
  nce <- c(nce, NCE(cl[[i]]$membership))
}
# add space for the second axis label
par(mar = c(5,4,1,4) + .1)
# plot FPI
plot(2:6, fpi, lty = 2, pch = 18, type = "b", xlab = "Number of clusters", ylab = "FPI")
# plot NCE, manually adding the second axis
par(new = TRUE)
plot(2:6, nce, lty = 1, pch = 15, type = "b", xlab = "", ylab = "", axes = FALSE)
axis(4, at = pretty(range(nce)))
mtext("NCE", side = 4, line = 3)
# add legend
legend("top", legend = c("FPI", "NCE"), pch = c(18,15), lty = c(2,1), horiz = TRUE)
The minimum values of the fuzzy performance index (FPI) and the normalized classification entropy (NCE) are meant to indicate the optimum number of clusters. In my results, however, NCE always increases and FPI always decreases, so neither curve shows a clear minimum.
With the available equations we can program our own functions. Here, the two functions use the equations from the paper you cited and from one of the references its authors cite.
FPI <- function(cmem, method = c("FuzME", "McBratney", "Rahul")){
  method <- match.arg(method)
  C <- ncol(cmem)
  N <- nrow(cmem)
  # Rahul et al. 2019. https://doi.org/10.1080/03650340.2019.1578345
  if(method == "Rahul"){
    res <- 1 - (C / (C - 1)) * (1 - sum(cmem^2) / N)
  }
  # McBratney & Moore 1985 https://doi.org/10.1016/0168-1923(85)90082-6
  if(method == "McBratney"){
    F <- sum(cmem^2) / N
    res <- 1 - (C * F - 1) / (F - 1)
  }
  # FuzME https://precision-agriculture.sydney.edu.au/resources/software/
  # MATLAB code file fvalidity.m, downloaded on 11 Nov, 2021
  if(method == "FuzME"){
    F <- sum(cmem^2) / N
    res <- 1 - (C * F - 1) / (C - 1)
  }
  return(res)
}
NCE <- function(cmem, method = c("FuzME", "McBratney", "Rahul")){
  method <- match.arg(method)
  C <- ncol(cmem)
  N <- nrow(cmem)
  if(method == "Rahul"){
    res <- (N / (N - C)) * (- sum(cmem * log(cmem)) / N)
  }
  if(method %in% c("FuzME", "McBratney")){
    H <- -1 / N * sum(cmem * log(cmem))
    res <- H / log(C)
  }
  return(res)
}
Then use those functions to calculate the indices from the degrees of membership returned by cmeans for the iris dataset.
# prepare variables
cl <- list()
fpi <- nce <- NULL
# cycle through the desired number of clusters
for(i in 2:6){
  cl[[i]] <- e1071::cmeans(iris[, -5], i, 20, method = "cmeans")
  fpi <- c(fpi, FPI(cl[[i]]$membership, method = "M"))
  nce <- c(nce, NCE(cl[[i]]$membership, method = "M"))
}
Last, plot with two different axes in one plot.
# add space for the second axis label
par(mar = c(5,4,1,4) + .1)
# plot FPI
plot(2:6, fpi, lty = 2, pch = 18, type = "b", xlab = "Number of clusters", ylab = "FPI")
# plot NCE, manually adding the second axis
par(new = TRUE)
plot(2:6, nce, lty = 1, pch = 15, type = "b", xlab = "", ylab = "", axes = FALSE)
axis(4, at = pretty(range(nce)))
mtext("NCE", side = 4, line = 3)
# add legend
legend("top", legend = c("FPI", "NCE"), pch = c(18,15), lty = c(2,1), horiz = TRUE)
EDIT1: Updated the functions according to optional equations from two different publications and calculated the example on the iris dataset.
EDIT2: Added code for the FPI and NCE calculations specified in the FuzME MATLAB code available here.
Hope this helps.
library(e1071)   # for cmeans()
library(dplyr)
library(tidyr)   # for pivot_longer()
library(ggplot2)

f <- function(cl) {
  C <- length(cl$size)
  N <- sum(cl$size)
  mu <- cl$membership
  fpi <- 1 - C / (C - 1) * (1 - sum(mu^2) / N)
  nce <- N / (N - C) * (-sum(log(mu) * mu) / N)
  c(FPI = fpi, NCE = nce)
}

# x is the simulated data from the question
K <- 2:6
data.frame(t(rbind(
  K = K,
  sapply(
    K,
    function(k) f(cmeans(x, k, 20, verbose = TRUE, method = "cmeans"))
  )
))) %>%
  pivot_longer(cols = FPI:NCE, names_to = "Index") %>%
  ggplot(aes(x = K, y = value, group = Index)) +
  geom_line(aes(linetype = Index, color = Index)) +
  geom_point() +
  scale_y_continuous(
    name = "FPI",
    sec.axis = sec_axis(~., name = "NCE")
  ) +
  theme(legend.position = "top")
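The same pipeline can also be pointed at the scaled iris data from the edit above (a quick sketch, assuming f() is defined as above and e1071 is loaded; cluster memberships are random, so exact values will vary):
df <- scale(iris[-5])
K <- 2:6
idx <- sapply(K, function(k) f(cmeans(df, k, 20, method = "cmeans")))
round(rbind(K, idx), 3)   # FPI and NCE for each number of clusters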

Why is my Monte Carlo Integration wrong by a factor of 2?

I am trying to integrate the following function using Monte Carlo integration. The region I want to integrate over is [0, 1] x [0, 1], i.e. x <- seq(0, 1, by = 0.01) and y <- seq(0, 1, by = 0.01).
my.f <- function(x, y){
  result = x^2 + sin(x) + exp(cos(y))
  return(result)
}
I calculated the integral using the cubature package.
library(cubature)
library(plotly)
# Rewriting the function, so it can be integrated
cub.function <- function(x){
  result = x[1]^2 + sin(x[1]) + exp(cos(x[2]))
  return(result)
}
cub.integral <- adaptIntegrate(f = cub.function, lowerLimit = c(0,0), upperLimit = c(1,1))
The result is 3.134606. But when I use my Monte Carlo Integration Code, see below, my result is about 1.396652. My code is wrong by more than a factor of 2!
What I did:
Since I need a volume to conduct a Monte Carlo Integration, I calculated the function values on the mentioned interval. This will give me an estimation of the maximum and minimum of the function.
# My data range
x <- seq(0, 1, by = 0.01)
y <- seq(0, 1, by = 0.01)
# The matrix, where I save the results
my.f.values <- matrix(0, nrow = length(x), ncol = length(y))
# Calculation of the function values
for(i in 1:length(x)){
  for(j in 1:length(y)){
    my.f.values[i,j] <- my.f(x = x[i], y = y[j])
  }
}
# The maximum and minimum of the function values
max(my.f.values)
min(my.f.values)
# Plotting the surface, but this is not necessary
plot_ly(y = x, x = y, z = my.f.values) %>% add_surface()
So, the volume that we need is simply the maximum of the function values, since 1 * 1 * 4.559753 is simply 4.559753.
# Now, the Monte Carlo Integration
# I found the code online and modified it a bit.
monte = function(x){
  tests = rep(0, x)
  hits = 0
  for(i in 1:x){
    y = c(runif(2, min = 0, max = 1),                  # y[1] is x; y[2] is y
          runif(1, min = 0, max = max(my.f.values)))   # y[3] is z
    if(y[3] < y[1]**2+sin(y[1])*exp(cos(y[2]))){
      hits = hits + 1
    }
    prop = hits / i
    est = prop * max(my.f.values)
    tests[i] = est
  }
  return(tests)
}
size = 10000
res = monte(size)
plot(res, type = "l")
lines(x = 1:size, y = rep(cub.integral$integral, size), col = "red")
So, the result is completely wrong. But if I change the function a bit, it suddenly works.
monte = function(x){
  tests = rep(0, x)
  hits = 0
  for(i in 1:x){
    x = runif(1)
    y = runif(1)
    z = runif(1, min = 0, max = max(my.f.values))
    if(z < my.f(x = x, y = y)){
      hits = hits + 1
    }
    prop = hits / i
    est = prop * max(my.f.values)
    tests[i] = est
  }
  return(tests)
}
size = 10000
res = monte(size)
plot(res, type = "l")
lines(x = 1:size, y = rep(cub.integral$integral, size), col = "red")
Can somebody explain why the result suddenly changes? To me, both functions seem to do the exact same thing.
In your (first) code for monte, this line is in error:
y[3] < y[1]**2+sin(y[1])*exp(cos(y[2]))
Given your definition of my.f, it should surely be
y[3] < y[1]**2 + sin(y[1]) + exp(cos(y[2]))
Or..., given that you shouldn't be repeating yourself unnecessarily:
y[3] < my.f(y[1], y[2])
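As a quick cross-check (a sketch added here, not part of the original answer, and assuming my.f and my.f.values from the question are in the workspace), a vectorized hit-or-miss estimate with the corrected condition lands near the cubature result of about 3.1346, and so does a plain mean-value Monte Carlo estimate:
set.seed(1)                          # any seed; results vary slightly
n <- 1e5
xs <- runif(n); ys <- runif(n)       # uniform points in [0, 1] x [0, 1]

# mean-value estimator: area (= 1) times the average of f
mean(my.f(x = xs, y = ys))

# hit-or-miss estimator with the corrected condition
zmax <- max(my.f.values)
zs <- runif(n, min = 0, max = zmax)
mean(zs < my.f(x = xs, y = ys)) * zmax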

R Programming other alternatives for plot

I wonder how you can simplify these two :
plot (payroll,wins)
id = identify(payroll, wins,labels = code, n = 5)
plot (payroll,wins)
with(data, text(payroll, wins, labels = code, pos = 1, cex=0.5))
using other alternatives - pch() and as.numeric()?
Not sure it's easier, but you can change pch during identification as below (taken from the R help for identify()). Every time you click a point, the empty circle changes to a filled-in dot.
# data simulation
data <- data.frame(payroll = rnorm(10), wins = rnorm(10), code = letters[1:10])
identifyPch <- function(x, y = NULL, n = length(x), plot = FALSE, pch = 19, ...)
{
  xy <- xy.coords(x, y)
  x <- xy$x
  y <- xy$y
  sel <- rep(FALSE, length(x))
  while (sum(sel) < n) {
    ans <- identify(x[!sel], y[!sel], labels = which(!sel), n = 1, plot = plot, ...)
    if(!length(ans)) {
      break
    }
    ans <- which(!sel)[ans]
    points(x[ans], y[ans], pch = pch)
    sel[ans] <- TRUE
  }
  ## return indices of selected points
  which(sel)
}
if(dev.interactive()) { ## use it
  with(data, plot(payroll, wins))
  id <- with(data, identifyPch(payroll, wins))
}
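For the pch()/as.numeric() idea from the question, one possible sketch (an assumption about what is wanted, using the simulated data above and assuming code has only a handful of levels) is to encode the codes as plotting symbols instead of text labels:
# one symbol per code instead of text labels
with(data, plot(payroll, wins, pch = as.numeric(factor(code))))
legend("topright", legend = levels(factor(data$code)),
       pch = seq_along(levels(factor(data$code))), cex = 0.7)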

Having a function output two quantities in looping structure in R?

In my R function below, I'm wondering how I can change my code such that I can get pe out of my fun function? Right now, fun only outputs L and U.
P.S. Of course, I want to keep the function working as it does right now, so replicate may also need to change as a result of having fun output pe in addition to L and U.
CI.bi = function(n, p, n.sim){
  fun <- function(n1 = n, p1 = p){
    x <- rbinom(1, size = n1, prob = p1)
    pe <- x/n1
    res <- binom.test(x, n1, p1)[[4]]
    c(L = res[1], U = res[2])
  }
  sim <- t(replicate(n.sim, fun()))
  y <- unlist(lapply(1:n.sim, function(x) c(x, x)))
  plot(sim, y, ty = "n", ylab = NA, yaxt = "n")
  segments(sim[, 1], 1:n.sim, sim[, 2], 1:n.sim, lend = 1)
}
# Example of use:
CI.bi(n = 15, p = .5, n.sim = 3)
You can have fun() return pe as an additional element of the return vector.
When referencing sim later on, just specify which columns you want to use. I believe the code sample below replicates your current functionality but has pe as an additional output of fun().
CI.bi = function(n, p, n.sim){
  fun <- function(n1 = n, p1 = p){
    x <- rbinom(1, size = n1, prob = p1)
    pe <- x/n1
    res <- binom.test(x, n1, p1)[[4]]
    c(L = res[1], U = res[2], pe = pe)
  }
  sim <- t(replicate(n.sim, fun()))
  y <- unlist(lapply(1:n.sim, function(x) c(x, x)))
  plot(sim[, 1:2], y, ty = "n", ylab = NA, yaxt = "n")
  segments(sim[, 1], 1:n.sim, sim[, 2], 1:n.sim, lend = 1)
}
CI.bi(n = 15, p = .5, n.sim = 3)
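If pe should also be usable outside CI.bi (an assumption about the goal, not something the answer above does), a further sketch is to have CI.bi return the simulation matrix invisibly:
# hypothetical extra line at the end of CI.bi, right after the segments() call:
#   invisible(sim)
# the caller can then inspect the point estimates:
out <- CI.bi(n = 15, p = .5, n.sim = 3)   # assumes CI.bi now ends with invisible(sim)
out[, "pe"]                               # point estimate from each simulation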

specClust() in kknn - arpack iteration limit increase

I am applying spectral clustering to a dataset with 4200 rows and 2 columns.
spec <- specClust(df1, centers=7, nn = 7, method = "symmetric")
I get the error below.
Error in .Call("R_igraph_arpack", func, extra, options, env, sym, PACKAGE = "igraph") :
  At arpack.c:944 : ARPACK error, Maximum number of iterations reached
In addition: Warning message:
In .Call("R_igraph_arpack", func, extra, options, env, sym, PACKAGE = "igraph") :
  At arpack.c:776 : ARPACK solver failed to converge (1001 iterations, 0/7 eigenvectors converged)
How do I increase the number of ARPACK iterations? This doesn't work:
spec <- specClust(df1, centers=7, nn = 7, method = "symmetric",iter.max=301000)
Digging into specClust, its ... argument does not pass anything on to the arpack call.
The simplest thing to do, I think, is to copy the specClust code, add a maxiter option to the arpack call, and source the modified function in your script.
specCLust2 <- function (data, centers = NULL, nn = 7, method = "symmetric",
                        gmax = NULL, max.iter = 10000, ...)
{
  call = match.call()
  if (is.data.frame(data))
    data = as.matrix(data)
  da = apply(data, 1, paste, collapse = "#")
  indUnique = which(!duplicated(da))
  indAll = match(da, da[indUnique])
  data2 = data
  data = data[indUnique, ]
  n <- nrow(data)
  data = scale(data, FALSE, TRUE)
  if (is.null(gmax)) {
    if (!is.null(centers))
      gmax = centers - 1L
    else gmax = 1L
  }
  test = TRUE
  while (test) {
    DC = mydist(data, nn)
    sif <- rbind(1:n, as.vector(DC[[2]]))
    g <- graph(sif, directed = FALSE)
    g <- decompose(g, min.vertices = 4)
    if (length(g) > 1) {
      if (length(g) >= gmax)
        nn = nn + 2
      else test = FALSE
    }
    else test = FALSE
  }
  W <- DC[[1]]
  n <- nrow(data)
  wi <- W[, nn]
  SC <- matrix(1, nrow(W), nn)
  SC[] <- wi[DC[[2]]] * wi
  W = W^2/SC
  alpha = 1/(2 * (nn + 1))
  qua = abs(qnorm(alpha))
  W = W * qua
  W = dnorm(W, sd = 1)
  DC[[1]] = W
  L = Laplacian(DC, nn, method)
  f <- function(x, extra) as.vector(extra %*% x)
  if (is.null(centers))
    kmax = 25
  else kmax = max(centers)
  ###
  # add the maxiter parameter to the arpack call, below
  ###
  U <- arpack(f, extra = L, options = list(n = n, which = "SM",
              nev = kmax, ncv = 2 * kmax, mode = 1, maxiter = max.iter),
              sym = TRUE)
  ind <- order(U[[1]])
  U[[2]] = U[[2]][indAll, ind]
  U[[1]] = U[[1]][ind]
  if (is.null(centers)) {
    tmp = which.max(diff(U[[1]])) + 1
    centers = which.min(AUC(U[[1]][1:tmp]))
  }
  if (method == "symmetric") {
    rs = sqrt(rowSums(U[[2]]^2))
    U[[2]] = U[[2]]/rs
  }
  result = kmeans(U[[2]], centers = centers, nstart = 20, ...)
  archeType = getClosest(U[[2]][indAll, ], result$centers)
  result$eigenvalue = U[[1]]
  result$eigenvector = U[[2]]
  result$data = data2
  result$indAll = indAll
  result$indUnique = indUnique
  result$L = L
  result$archetype = archeType
  result$call = call
  class(result) = c("specClust", "kmeans")
  result
}
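One way to use the copy (a sketch under a couple of assumptions: kknn and igraph are installed, and df1 is the 4200 x 2 dataset from the question). Because the copied function still calls unexported kknn helpers such as mydist, Laplacian, AUC and getClosest, its environment needs to be pointed at the kknn namespace so those names resolve:
library(kknn)
library(igraph)

# let the sourced copy see kknn's internal helpers
environment(specCLust2) <- asNamespace("kknn")

spec <- specCLust2(df1, centers = 7, nn = 7, method = "symmetric",
                   max.iter = 301000)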
