I have a small data set of locations and benzene concentrations in mg/kg
WELL.ID X Y BENZENE
1 MW-02 268.8155 282.83 0.00150
2 IW-06 271.6961 377.01 0.00050
3 IW-07 251.0236 300.41 0.01040
4 IW-08 278.9238 300.37 0.03190
5 MW-10 281.4008 414.15 2.04000
6 MW-12 391.3973 449.40 0.01350
7 MW-13 309.5307 335.55 0.01940
8 MW-15 372.8967 370.04 0.01620
9 MW-17 250.0000 428.04 0.01900
10 MW-24 424.4025 295.69 0.00780
11 MW-28 419.3205 250.00 0.00100
12 MW-29 352.9197 277.27 0.00031
13 MW-31 309.3174 370.92 0.17900
and I am trying to krig the values in a grid (the property these wells reside on) like so
setwd("C:/.....")
getwd()
require(geoR)
require(ggplot2)
a <- read.table("krigbenz_loc.csv", sep = ",", header = TRUE)
b <- data.matrix(a)
c <- as.geodata(b)
x.range <- as.integer(range(a[,2]))
y.range <- as.integer(range(a[,3]))
x = seq(from=x.range[1], to=x.range[2], by=1)
y = seq(from=y.range[1], to=y.range[2], by=1)
length(x)
length(y)
xv <- rep(x,length(y))
yv <- rep(y, each=length(x))
in_mat <- as.matrix(cbind(xv, yv))
this is when I start the Krig with
q <- ksline(c, cov.model="exp", cov.pars=c(10,3.33), nugget=0, locations=in_mat)
however, when looking at the output of this with
cbind(q$predict[1:10], q$krige.var[1:10])
i see
[,1] [,2]
[1,] 343.8958 10.91698
[2,] 343.8958 10.91698
[3,] 343.8958 10.91698
[4,] 343.8958 10.91698
[5,] 343.8958 10.91698
[6,] 343.8958 10.91698
[7,] 343.8958 10.91698
[8,] 343.8958 10.91698
[9,] 343.8958 10.91698
[10,] 343.8958 10.91698
these values do not change for the first 5000 rows... (cant view more because max.print = 5000... not sure how to change this either but that is a tangent..)
I am realizing that my
cov.pars = c(10,3.33)
being range and sill, are probably the issue.
the geoR.pdf, pg 19 describes what is expected from cov.pars however I am not sure how I should decide what these covariance parameters need to be.
Is there a method to find the appropriate values from my existing data or can I set these to generic values where my output will be similar to a kriging performed in the spatial analyst package of ESRI's ArcGIS?
ZR
::::EDIT:::
my geodata object was improperly converted...
here is the correct way to do this
c <- as.geodata(b, coords.col = 2:3, data.col = 4, )
also...for the variogram,
v1 <- variog(c)
length(v1$n)
v1.summary <- cbind(c(1:11), v1$v, v1$n)
colnames(v1.summary) <- c("lag", "semi-variance", "# of pairs")
v1.summary
One way to do this is to use the variofit function (also in the geoR package) to estimate the covariance parameters. For example, using your data and initial values:
vario <- variog(c) # See other options here for binning, etc
# Note that the order of the cov.pars is variance, then range, (see your question)
fitted_model <- variofit(vario=vario, ini.cov.pars=c(10, 3.33), cov.model='exp')
q <- ksline(c, cov.model=fitted_model$cov.model, cov.pars=fitted_model$cov.pars,
nugget=fitted_model$nugget, locations=in_mat)
It is worth your time to look at the variogram, by the way.
Related
I want to generat a veusing R.
Is there a way to generate a sequence of POSITIVE numbers that satisfy specific constraints
a mean of 13,
a standard deviation of 30.96 , and
a sample size of 6.
Thank you guys.
Another option (after echoing all the concerns about this being an XY problem):
We can transform n samples of practically any continuous distribution by finding a pair of scale and translation parameters that satisfies the desired constraints.
f <- function(n, mu, sigma) {
x <- rnorm(n) # substitute any continuous distribution here
fn <- function(par) {
y <- exp(x*par[2] + par[1])
log(abs(mean(y) - mu) + abs(sd(y) - sigma))
}
with(optim(c(0, 0), fn), exp(x*par[2] + par[1]))
}
# example usage
f(6L, 13, 30.96)
#> [1] 2.569263e-09 1.912637e-06 9.521086e-05 4.023787e-01 7.618698e+01 1.410541e+00
# take 20 sets of samples
m <- matrix(NA, 20, 8)
for (i in 1:nrow(m)) {
m[i, 1:6] <- sort(f(6L, 13, 30.96))
m[i, 7] <- mean(m[i, 1:6]) - 13 # difference from desired mean
m[i, 8] <- sd(m[i, 1:6]) - 30.96 # difference from desired SD
}
m
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,] 2.373175e-12 5.239577e-08 1.612212e-07 9.033024e-01 0.9063398 76.19036 -7.105427e-15 -1.776357e-14
#> [2,] 1.185131e-17 2.420683e-12 1.967461e-11 1.613739e-03 1.8189977 76.17939 -7.105427e-15 3.552714e-15
#> [3,] 4.178030e-03 6.449223e-03 3.329587e-02 7.889274e-02 1.6949839 76.18220 1.776357e-15 0.000000e+00
#> [4,] 1.134263e-19 8.582391e-13 3.675582e-12 7.909737e-06 1.8206428 76.17935 -1.776357e-15 -3.552714e-15
#> [5,] 3.875063e-11 5.453168e-06 1.005380e-05 7.971751e-02 1.7390532 76.18121 -3.552714e-15 0.000000e+00
#> [6,] 2.057442e-10 1.395120e-04 2.825930e-03 8.257653e-01 0.9809560 76.19031 1.065814e-14 0.000000e+00
#> [7,] 3.281160e-02 3.939408e-02 4.617334e-02 6.313399e-01 1.0591873 76.19109 -1.776357e-15 -1.776357e-14
#> [8,] 1.652719e-08 1.700647e-08 2.909621e-05 1.366993e-02 1.8066190 76.17968 -1.065814e-14 -1.065814e-14
#> [9,] 3.231989e-18 1.103402e-08 4.891128e-08 2.246015e-01 1.5912433 76.18416 1.776357e-15 0.000000e+00
#> [10,] 2.147959e-25 1.412579e-21 4.929303e-16 7.013902e-04 1.8199323 76.17937 0.000000e+00 -5.684342e-14
#> [11,] 8.239083e-08 1.516883e-05 4.268010e-01 6.580114e-01 0.7215226 76.19365 1.065814e-14 3.552714e-15
#> [12,] 8.837010e-05 7.983109e-04 3.712888e-03 9.311085e-03 1.8064017 76.17969 0.000000e+00 -7.105427e-15
#> [13,] 3.586152e-178 2.149918e-134 1.681243e-56 2.681863e-09 1.8206509 76.17935 -5.329071e-15 -9.592327e-14
#> [14,] 5.860182e-07 1.645025e-05 1.057840e-03 3.878328e-02 1.7798381 76.18030 -1.776357e-15 -3.552714e-15
#> [15,] 2.363474e-05 1.090204e-03 3.533081e-03 2.924378e-01 1.5174856 76.18543 0.000000e+00 1.421085e-14
#> [16,] 1.666746e-03 6.105670e-02 1.966066e-01 6.731973e-01 0.8746421 76.19283 1.776357e-15 0.000000e+00
#> [17,] 1.633101e-06 5.641356e-05 2.427083e-02 9.874914e-02 1.6947355 76.18219 -1.598721e-14 -7.105427e-15
#> [18,] 2.124617e-72 2.143486e-68 9.930707e-68 1.367184e-08 1.8206509 76.17935 2.842171e-14 -1.776357e-14
#> [19,] 5.846315e-19 1.528350e-06 2.030263e-06 7.959439e-04 1.8198318 76.17937 -1.776357e-15 -1.776357e-14
#> [20,] 5.630657e-03 1.462770e-01 2.914364e-01 6.119128e-01 0.7504916 76.19425 1.776357e-15 0.000000e+00
Inspired by #SamMason's comment, here is an empirical solution. Though I am concerned that providing OP with an answer may actually do them a disservice, since I suspect this may well be an XY problem.
First, establish if it is possible to construct a set of six numbers that satisfy OP's constraints:
f <- function(x) {
y <- c(rep(x, 5), 6*13 - 5*x)
sd(y) - 30.96
}
uniroot(f, c(0, 1))$root
[1] 0.3606329
> y
[1] 0.3606329 0.3606329 0.3606329 0.3606329 0.3606329 76.1968355
> mean(y)
[1] 13
> sd(y)
[1] 30.96
So, yes, it is possible to construct a set of six numbers that meets OP's constraints. Now introduce a (small) degree of variation into the first five elements of the sample, fix the sixth to satisfy the mean constraint and calculate the sample SD. If the SD isn't "close enough" to the target, throw the sample away and try again. Impose an upper limit to the number of tries to prevent infinite looping.
# For reproducibility
set.seed(1234)
f1 <- function(sd) {
y <- rnorm(5, mean=0.3606329, sd=sd)
y[6] <- 6*13 - sum(y)
y
}
findIt <- function(sd, epsilon=0.001, maxIter=1000) {
iteration <- 0
found <- FALSE
while(!found) {
z <- f1(sd)
if (abs(sd(z) - 30.96) < epsilon) {
found <- TRUE
return(z)
}
iteration <- iteration + 1
if (iteration == maxIter) {
warning(paste0("No solution found after ", maxIter, " iterations"))
return(NA)
}
}
}
z <- findIt(0.2)
> z
[1] 0.44505164 0.66907765 0.47566925 0.09247431 0.12141987 76.19630728
> mean(z)
[1] 13
> sd(z)
[1] 30.96053
Note that findIt() is sloppy in that it assumes that the SD is "small enough" that all generated values are positive.
This procedure generates a sample that is "random" in the sense that five of the values are iid N(0.3606329, sigma * sigma) and the fifth is (highly) correlated with the sum of the other five. The joint distribution of all six values in the sample is not obvious to me.
I can't imagine a scenario in which this is a sensible thing to do.
I am given an empirical distribution FXemp of a real-valued random variable X. Given now X1,..., Xn having the same distribution as X and dependencies given by a copula C. I would like now to produce random samples of X1,..., Xn element of R.
E.g. I am given a vector of samples and the corresponding cdf
x <- rnorm(1000)
df <- ecdf(x)
Assume that I pick for a example a t-student or Clayton copula C. How can I produce random samples of for example 10 copies of x, where their dependency is determined by C.
Is there an easy way?
Or are their any packages that can be used here?
You can sample from the copula (with uniform margins) by using the copula package, and then apply the inverse ecdf to each component:
library(copula)
x <- rnorm(100) # sample of X
d <- 5 # desired number of copies
copula <- claytonCopula(param = 2, dim = d)
nsims <- 25 # number of simulations
U <- rCopula(nsims, copula) # sample from the copula (with uniform margins)
# now sample the copies of X ####
Xs <- matrix(NA_real_, nrow = nsims, ncol = d)
for(i in 1:d){
Xs[,i] <- quantile(x, probs = U[,i], type = 1) # type=1 is the inverse ecdf
}
Xs
# [,1] [,2] [,3] [,4] [,5]
# [1,] -0.5692185 -0.9254869 -0.6821624 -1.2148041 -0.682162391
# [2,] -0.4680407 -0.4263257 -0.3456553 -0.6132320 -0.925486872
# [3,] -1.1322063 -1.2148041 -0.8115089 -1.0074435 -1.430405604
# [4,] 0.9760268 1.2600186 1.0731551 1.2369623 0.835024471
# [5,] -1.1280825 -0.8995429 -0.5761037 -0.8115089 -0.543125426
# [6,] -0.1848303 -1.2148041 -0.5692185 0.8974921 -0.613232036
# [7,] -0.5692185 -0.3070884 -0.8995429 -0.8115089 -0.007292346
# [8,] 0.1696306 0.4072428 0.7646646 0.4910863 1.236962330
# [9,] -0.7908557 -1.1280825 -1.2970952 0.3655081 -0.633521404
# [10,] -1.3226053 -1.0074435 -1.6857615 -1.3226053 -1.685761474
# [11,] -2.5410325 -2.3604936 -2.3604936 -2.3604936 -2.360493569
# [12,] -2.3604936 -2.2530003 -1.9311289 -2.2956444 -2.360493569
# [13,] 0.4072428 -0.2150035 -0.3564803 -0.1051930 -0.166434458
# [14,] -0.4680407 -1.0729763 -0.6335214 -0.8995429 -0.899542914
# [15,] -0.9143225 -0.1522242 0.4053462 -1.0729763 -0.158375658
# [16,] -0.4998761 -0.7908557 -0.9813504 -0.1763604 -0.283013334
# [17,] -1.2148041 -0.9143225 -0.5176347 -0.9143225 -1.007443492
# [18,] -0.2150035 0.5675260 0.5214050 0.8310799 0.464151265
# [19,] -1.2148041 -0.6132320 -1.2970952 -1.1685962 -1.132206305
# [20,] 1.4456635 1.0444720 0.7850181 1.0742214 0.785018119
# [21,] 0.3172811 1.2369623 -0.1664345 0.9440006 1.260018624
# [22,] 0.5017980 1.4068250 1.9950305 1.2600186 0.976026807
# [23,] 0.5675260 -1.0729763 -1.2970952 -0.3653535 -0.426325703
# [24,] -2.5410325 -2.2956444 -2.3604936 -2.2956444 -2.253000326
# [25,] 0.4053462 -0.5431254 -0.5431254 0.8350245 0.950891450
i am working with r language and on umbalanced dataset and i need to know how can get the k nearest neighbors of a dataset becaue i need them to create new synthetic examples .
set.seed(123)
test <- 1:100
train.gc <- gc.subset[-test,]
test.gc <- gc.subset[test,]
train.def <- gc$Default[-test]
test.def <- gc$Default[test]
library(class)
knn.5 <- knn(train.gc, test.gc, train.def, k=5)
#how can i get the five nearest neighbours????????
Although it doesn't seem to be documented, the help for knn hints that the attributes may store something:
train <- rbind(iris3[1:25,,1], iris3[1:25,,2], iris3[1:25,,3])
test <- rbind(iris3[26:50,,1], iris3[26:50,,2], iris3[26:50,,3])
cl <- factor(c(rep("s",25), rep("c",25), rep("v",25)))
k = knn(train, test, cl, k = 3, prob=TRUE)
names(attributes(k))
# [1] "levels" "class" "prob" "nn.index" "nn.dist"
and I'd hazard a guess that nn.index is the index of the neighbours:
> head(attr(k,"nn.index"))
[,1] [,2] [,3]
[1,] 10 2 13
[2,] 24 8 18
[3,] 1 18 8
[4,] 1 18 8
I'd guess those are the 3 nearest neighbours of the first four data points.
I'm using the following code to perform PCA:
PCA <- prcomp(Ret1, center = TRUE, scale. = TRUE)
summary(PCA)
I get the following result:
#Importance of components:
# PC1 PC2 PC3 PC4
#Standard deviation 1.6338 0.9675 0.60446 0.17051
#Proportion of Variance 0.6673 0.2340 0.09134 0.00727
#Cumulative Proportion 0.6673 0.9014 0.99273 1.00000
What I would like to do is a Rolling PCA for a specific window ( e.g. 180 days). The Result should be a matrix which shows the evolution of the "Proportion of Variance" of all principal components though time.
I tried it with
rollapply(Ret1, 180, prcomp)
but this doesn't work and I have no Idea how to save the "Proportion of Variance" for each time step in matrix.
The output matrix should look like this:
# PC1 PC2 PC3 PC4
#Period 1 0.6673 0.2340 0.09134 0.00727
#Period 2 0.7673 0.1340 0.09134 0.00727
# ....
Here is a mini subset of my data Ret1:
Cats Dogs Human Frogs
2016-12-13 0.0084041063 6.518479e-03 6.096295e-04 5.781271e-03
2016-12-14 -0.0035340384 -8.150321e-03 4.418382e-04 -5.978296e-03
2016-12-15 0.0107522782 3.875708e-03 -1.784663e-02 3.012253e-03
2016-12-16 0.0033034130 -1.752174e-03 -1.753624e-03 -4.448850e-04
2016-12-17 0.0000000000 0.000000e+00 0.000000e+00 0.000000e+00
2016-12-18 0.0000000000 0.000000e+00 0.000000e+00 0.000000e+00
2016-12-19 0.0019876743 1.973190e-03 -8.577261e-03 1.996151e-03
2016-12-20 0.0033235161 3.630921e-03 -4.757395e-03 4.594355e-03
2016-12-21 0.0003401156 -2.460351e-03 3.708875e-03 -1.636413e-03
2016-12-22 -0.0010940147 -1.864724e-03 -7.991572e-03 -1.158029e-03
2016-12-23 -0.0005387228 1.250898e-03 -2.843725e-03 7.492594e-04
2016-12-24 0.0000000000 0.000000e+00 0.000000e+00 0.000000e+00
2016-12-25 0.0000000000 0.000000e+00 0.000000e+00 0.000000e+00
2016-12-26 0.0000000000 0.000000e+00 0.000000e+00 0.000000e+00
2016-12-27 0.0019465877 2.245918e-03 0.000000e+00 5.632058e-04
2016-12-28 0.0002396803 -8.391658e-03 8.307552e-03 -5.598988e-03
2016-12-29 -0.0020884556 -2.933868e-04 1.661246e-03 -7.010738e-04
2016-12-30 0.0026172923 -4.647865e-03 9.574997e-03 -2.889166e-03
I tried the following:
PCA <- function(x){
Output=cumsum((apply((prcomp(x,center = TRUE, scale. = TRUE))$x, 2, var))/sum(vars))
return(Output)}
window <- 10
data <- Ret1
result <- rollapply(data, window,PCA)
plot(result)
#Gives you the Proportion of Variance = cumsum((apply((prcomp(x,center = TRUE, scale. = TRUE))$x, 2, var))/sum(vars))
First, the correct function for your purpose may be written as follow, using $sdev result of prcomp. I have left over center = TRUE and scale. = TRUE as they are function default.
PCA <- function(x){
oo <- prcomp(x)$sdev
oo / sum(oo)
}
Now, we can easily use sapply to do rolling operation:
## for your mini dataset of 18 rows
window <- 10
n <- nrow(Ret1)
oo <- sapply(seq_len(n - window + 1), function (i) PCA(Ret1[i:(i + window - 1), ]))
oo <- t(oo) ## an extra transposition as `sapply` does `cbind`
# [,1] [,2] [,3] [,4]
# [1,] 0.5206345 0.3251099 0.12789683 0.02635877
# [2,] 0.5722264 0.2493518 0.14588631 0.03253553
# [3,] 0.6051199 0.1973694 0.16151859 0.03599217
# [4,] 0.5195527 0.2874197 0.16497219 0.02805543
# [5,] 0.5682829 0.3100708 0.09456654 0.02707977
# [6,] 0.5344804 0.3149862 0.08912882 0.06140464
# [7,] 0.5954948 0.2542775 0.10434155 0.04588616
# [8,] 0.5627977 0.2581071 0.13068875 0.04840648
# [9,] 0.6089650 0.2559285 0.11022974 0.02487672
Each column is a PC, while each row gives proportional variance for each component in that period.
To further plot the result, you can use matplot:
matplot(oo, type = "l", lty = 1, col = 1:4,
xlab = "period", ylab = "proportional variance")
PCA 1-4 are sketched with colour 1:4, i.e., "black", "red", "green" and "blue".
Additional comments:
If you want to use zoo::rollapply, do
oo <- zoo::rollapply(Ret1, window, PCA, by.column = FALSE)
Precisely, I am reporting proportional standard deviation. If you really want proportional variance, chance PCA function to:
PCA <- function(x){
oo <- prcomp(x)$sdev ^ 2
oo / sum(oo)
}
Here is an excerpt of numeric matrix that I have
[1,] 30 -33.129487 3894754.1 -39.701738 -38.356477 -34.220534
[2,] 29 -44.289487 -8217525.9 -44.801738 -47.946477 -41.020534
[3,] 28 -48.439487 -4572815.9 -49.181738 -48.086477 -46.110534
[4,] 27 -48.359487 -2454575.9 -42.031738 -43.706477 -43.900534
[5,] 26 -38.919487 -2157535.9 -47.881738 -43.576477 -46.330534
[6,] 25 -45.069487 -5122485.9 -47.831738 -47.156477 -42.860534
[7,] 24 -46.207487 -2336325.9 -53.131738 -50.576477 -50.410534
[8,] 23 -51.127487 -2637685.9 -43.121738 -47.336477 -47.040534
[9,] 22 -45.645487 3700424.1 -56.151738 -47.396477 -50.720534
[10,] 21 -56.739487 1572594.1 -49.831738 -54.386577 -52.470534
[11,] 20 -46.319487 642214.1 -39.631738 -44.406577 -41.490534
What I want to do now, is to scale the values for each column to have values from 0 to 1.
I tried to accomplish this using the scale() function on my matrix (default parameters), and I got this
[1,] -0.88123100 0.53812440 -1.05963281 -1.031191482 -0.92872324
[2,] -1.17808251 -1.13538649 -1.19575096 -1.289013031 -1.11327085
[3,] -1.28847084 -0.63180980 -1.31265244 -1.292776849 -1.25141017
[4,] -1.28634287 -0.33914007 -1.12182012 -1.175023107 -1.19143220
[5,] -1.03524267 -0.29809911 -1.27795565 -1.171528133 -1.25738083
[6,] -1.19883019 -0.70775576 -1.27662116 -1.267774342 -1.16320727
[7,] -1.22910054 -0.32280189 -1.41807728 -1.359719044 -1.36810940
[8,] -1.35997055 -0.36443973 -1.15091204 -1.272613537 -1.27664977
[9,] -1.21415156 0.51127451 -1.49868058 -1.274226602 -1.37652260
[10,] -1.50924749 0.21727976 -1.33000083 -1.462151358 -1.42401647
[11,] -1.23207969 0.08873245 -1.05776452 -1.193844887 -1.12602635
Which is already close to what I want, but values from 0:1 were even better. I read the help manual of scale(), but I really don't understand how I would do that.
Try the following, which seems simple enough:
## Data to make a minimal reproducible example
m <- matrix(rnorm(9), ncol=3)
## Rescale each column to range between 0 and 1
apply(m, MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))
# [,1] [,2] [,3]
# [1,] 0.0000000 0.0000000 0.5220198
# [2,] 0.6239273 1.0000000 0.0000000
# [3,] 1.0000000 0.9253893 1.0000000
And if you were still to use scale:
maxs <- apply(a, 2, max)
mins <- apply(a, 2, min)
scale(a, center = mins, scale = maxs - mins)
Install the clusterSim package and run the following command:
normX = data.Normalization(x,type="n4");
scales package has a function called rescale:
set.seed(2020)
x <- runif(5, 100, 150)
scales::rescale(x)
#1.0000000 0.5053362 0.9443995 0.6671695 0.0000000
Not the prettiest but this just got the job done, since I needed to do this in a dataframe.
column_zero_one_range_scale <- function(
input_df,
columns_to_scale #columns in input_df to scale, must be numeric
){
input_df_replace <- input_df
columncount <- length(columns_to_scale)
for(i in 1:columncount){
columnnum <- columns_to_scale[i]
if(class(input_df[,columnnum]) !='numeric' & class(input_df[,columnnum])!='integer')
{print(paste('Column name ',colnames(input_df)[columnnum],' not an integer or numeric, will skip',sep='')) }
if(class(input_df[,columnnum]) %in% c('numeric','integer'))
{
vec <- input_df[,columnnum]
rangevec <- max(vec,na.rm=T)-min(vec,na.rm=T)
vec1 <- vec - min(vec,na.rm=T)
vec2 <- vec1/rangevec
}
input_df_replace[,columnnum] <- vec2
colnames(input_df_replace)[columnnum] <- paste(colnames(input_df)[columnnum],'_scaled')
}
return(input_df_replace)
}