Matrix version of rasterToPoints? - r

Anyone know of a non-raster method to achieve the following?
require(raster)
d = data.frame(rasterToPoints(raster(volcano)))
head(d)
x y layer
1 0.008196721 0.9942529 100
2 0.024590164 0.9942529 100
3 0.040983607 0.9942529 101
4 0.057377049 0.9942529 101
5 0.073770492 0.9942529 101
6 0.090163934 0.9942529 101
Cheers.

One way would be to use the row and col command:
library(raster)
data(volcano)
df <- data.frame(
x = as.vector(col(volcano)),
y = (yy <- as.vector(row(volcano)))[length(yy):1],
val = as.vector(volcano)
)
raster rescales the range to 0 - 1, if not specified differently, so we would to have to do this too:
## rescale
df$x <- with(df, (x - min(x)) / (max(x) - min(x)))
df$y <- with(df, (y - min(x)) / (max(y) - min(y)))
Finally lets check, that the results are the same:
## Using raster df1 <- data.frame(rasterToPoints(raster(volcano)))
cols <- colorRampPalette(c('white', "blue",'red')) df$col <-
cols(20)[as.numeric(cut(df$val, breaks = 20))] df1$col <-
cols(20)[as.numeric(cut(df1$layer, breaks = 20))]
par(mfrow = c(1, 2)) plot(df[, 1:2], col = df$col, pch = 20, main =
"matrix")
plot(df1[, 1:2], col = df1$col, pch = 20, main = "raster")
Note:
While the results appear the same visually, they are not. The resolution of the raster command is most likely different, and hence there are different nrows for df and df1.

Faster for large matrices:
data.frame(
x = rep(1:ncol(m), each=nrow(m)),
y = rep(nrow(m):1, ncol(m)),
val = as.vector(m)
)

Related

R Getting numeric matrix from predict()

I have the following code:
fit_lm=lm(z~x+y)
mix <- 2
max <- 12
miy <- 2
may <- 12
griddf <- expand.grid(x = seq(mix,max, length.out = 10),
y = seq( miy,may,length.out = 10))
Prediction_data <- data.frame(griddf)
colnames(Prediction_data) <- c("x", "y")
coordinates(Prediction_data ) <- ~ x + y
terrain_lm <- predict(fit_lm, Prediction_data)
I want that terrain_lm is a numeric matrix, in such a way, that I can use
fig <- plot_ly()
fig <- fig %>% add_surface(terrain_lm)
but I get a 1d array with 100 elements.
The result of predict is a vector. You need to add it to the x and y values and then use xtabs to transform into a suitable matrix for a surface plot.
library(plotly)
#test data
x <- runif(20, 4, 10)
y <- runif(20, 3, 6)
z <- 3*x+y +runif(20, 0, 2)
fit_lm <- lm(z~x+y)
mix <- 2
max <- 12
miy <- 2
may <- 12
griddf <- expand.grid(x = seq(mix,max, length.out = 10),
y = seq( miy,may,length.out = 10))
terrain_lm <- data.frame(griddf)
terrain_lm$z <- predict(fit_lm, terrain_lm)
fig <- plot_ly(z = ~xtabs(z ~ x + y, data = terrain_lm))
fig <- fig %>% add_surface()

mdply (or similar) a function instead of for loop

I want to generate data from a function iterating over a range of values. The setting is best explained in a small example:
myfun <- function(a, b, sims) {
x = 3/a*b
y = mean(a*rnorm(sims))
return(data.frame(x = x, y = y))
}
# Output I want:
d <- data.frame(x = 0, y= 0)
d[1,] <- myfun(a=4, b=2, sims = 100)
d[2,] <- myfun(a=4, b=3, sims = 100)
d[3,] <- myfun(a=4, b=4, sims = 100)
# --> With a for loop this is easy
# Using mdply, however, does not work
a <- expand.grid(a=1:3)
d <- plyr::mdply(a, myfun, b=seq(1,100, length=100), sims = 100)
You can use Map :
data <- expand.grid(a = 1:3, b = 1:100)
result <- do.call(rbind, Map(myfun, data$a, data$b, MoreArgs = list(sims = 100)))
head(result)
# x y
#1 3.0 -0.17846248
#2 1.5 0.06837716
#3 1.0 0.01034184
#4 6.0 -0.02898619
#5 3.0 0.10077290
#6 2.0 0.22321839
A similar way would be if you Vectorize myfun. Vectorize is a wrapper around mapply.
myfun_vec <- Vectorize(myfun)
t(myfun_vec(data$a, data$b, 100))
A purrr option :
result <- purrr::map2_df(data$a, data$b, myfun, sims = 100)

with ggplot in R

I need little help. I try to do plot with ggplot package. When I want to make plot, depends of more than 1 factor (for example here: plot changes when średnia1 and odchylenie1 change):
alpha = 0.05
N = 100
sample_l = 10
srednia1 = seq(-7, 7, by = 1)
odchylenie1 = seq(1, 10, by = 1)
srednia2 = 2
odchylenie2 = 2
prob = 0.7
params = expand.grid(sample_l, srednia1, odchylenie1, srednia2, odchylenie2, prob)
str(params)
names(params) = c("dlugość", "średnia1", "odchylenie1", "średnia2", "odchyelnie2", "prawdopodobienstwo")
set.seed(100)
now <- Sys.time()
powers <- sapply(1:nrow(params), function(p){
l <- params[p, 1]
par_1 <- c(params[p, 2],params[p, 3])
par_2 <- c(params[p, 4], params[p, 5])
p <- params[p,6]
p_sim <-sapply(rep(l, N), function(x){
my_sample <- rmix(l,"norm", par_1, "norm", par_2, p)
shapiro.test(my_sample)$p.value
})
mean(p_sim < alpha)
})
Sys.time() - now
power_df <- bind_cols(params, power = powers)
power_df %>% ggplot(aes(x = średnia1,
y = power,
col = factor(odchylenie1))) +
geom_line()
it work perfect, but now, when I want to make plot only depends of 1 factor - prob something goes wrong. I have error : Error: Aesthetics must be either length 1 or the same as the data (150): x, y. Here is a code:
alpha = 0.05
N = 100
sample_l = 10
srednia1 = 2
odchylenie1 = 2
srednia2 = 1
odchylenie2 = 1
prob = seq(0.1,0.9,by=0.1)
set.seed(100)
now <- Sys.time()
powers <- sapply(1:nrow(params), function(p){
l <- params[p, 1]
par_1 <- c(params[p, 2],params[p, 3])
par_2 <- c(params[p, 4], params[p, 5])
p <- params[p,6]
p_sim <-sapply(rep(l, N), function(x){
my_sample <- rmix(l,"norm", par_1, "norm", par_2, p)
shapiro.test(my_sample)$p.value
})
mean(p_sim < alpha)
})
Sys.time() - now
power_df <- bind_cols(params, power = powers)
power_df %>% ggplot(aes(x = prob, y = power)) + geom_line()
PLEASE HELP ME :(

How to associate variable values from a df to another

I have a dataframe with three values, x and y are coordinates and z is the value of the indipendent variable:
x.range <- c(1,10)
y.range <- c(20,50)
grid <- expand.grid(x = seq(x.range[1], x.range[2], by=0.5),
y = seq(y.range[1], y.range[2], by=0.5))
grid$z <- runif(nrow(grid),10, 70)
Now i have another dataframe like this with only x and y values:
x1 <- c(3.7,5.4,9.2)
y1 <- c(41.1,30.3,22.9)
df <- data.frame(x=x1,y=y1)
Now i want to associate to the points of dataframe df the z value of the nearest point of dataframe grid (with the shortest distance). Thanks.
This isn't the prettiest, but works
apply(df, 1,
function(x){
pythag <- sqrt((x[1] - grid$x)^2 +
(x[2] - grid$y)^2)
grid[which.min(pythag), "z"]
})
Simply returning the value for the nearest point using Pythagoras.
Edit
Recoding to adhere to coding standards:
pythag <- function(x, y, g){
which.min(((x - g$x)^2 + (y - g$y)^2)^0.5)
}
idx <- mapply(FUN = pythag,
x = df[["x"]],
y = df[["y"]],
MoreArgs = list(g = grid))
grid[idx,]

Is there a weighted.median() function?

I'm looking for something similar in form to weighted.mean(). I've found some solutions via search that write out the entire function but would appreciate something a bit more user friendly.
The following packages all have a function to calculate a weighted median: 'aroma.light', 'isotone', 'limma', 'cwhmisc', 'ergm', 'laeken', 'matrixStats, 'PSCBS', and 'bigvis' (on github).
To find them I used the invaluable findFn() in the 'sos' package which is an extension for R's inbuilt help.
findFn('weighted median')
Or,
???'weighted median'
as ??? is a shortcut in the same way ?some.function is for help(some.function)
Some experience using the answers from #wkmor1 and #Jaitropmange.
I've checked 3 functions from 3 packages, isotone, laeken, and matrixStats. Only matrixStats works properly. Other two (just as the median(rep(x, times=w) solution) give integer output. As long as I calculated median age of populations, decimal places matter.
Reproducible example. Calculation of the median age of a population
df <- data.frame(age = 0:100,
pop = spline(c(4,7,9,8,7,6,4,3,2,1),n = 101)$y)
library(isotone)
library(laeken)
library(matrixStats)
isotone::weighted.median(df$age,df$pop)
# [1] 36
laeken::weightedMedian(df$age,df$pop)
# [1] 36
matrixStats::weightedMedian(df$age,df$pop)
# [1] 36.164
median(rep(df$age, times=df$pop))
# [1] 35
Summary
matrixStats::weightedMedian() is the reliable solution
To calculate the weighted median of a vector x using a same length vector of (integer) weights w:
median(rep(x, times=w))
This is just a simple solution, ready to use almost anywhere.
weighted.median <- function(x, w) {
w <- w[order(x)]
x <- x[order(x)]
prob <- cumsum(w)/sum(w)
ps <- which(abs(prob - .5) == min(abs(prob - .5)))
return(x[ps])
}
Really old post but I just came across it and did some testing of the different methods. spatstat::weighted.median() seemed to be about 14 times faster than median(rep(x, times=w)) and its actually noticeable if you want to run the function more than a couple times. Testing was with a relatively large survey, about 15,000 people.
One can also use stats::density to create a weighted PDF, then convert this to a CDF, as elaborated here:
my_wtd_q = function(x, w, prob, n = 4096)
with(density(x, weights = w/sum(w), n = n),
x[which.max(cumsum(y*(x[2L] - x[1L])) >= prob)])
Then my_wtd_q(x, w, .5) will be the weighted median.
One could also be more careful to ensure that the total area under the density is one by re-normalizing.
A way in base to get a weighted median will be to order by the values and build the cumsum of the weights and get the value(s) at sum * 0.5 of the weights.
medianWeighted <- function(x, w, q=.5) {
n <- length(x)
i <- order(x)
w <- cumsum(w[i])
p <- w[n] * q
j <- findInterval(p, w)
Vectorize(function(p,j) if(w[n] <= 0) NA else
if(j < 1) x[i[1]] else
if(j == n) x[i[n]] else
if(w[j] == p) (x[i[j]] + x[i[j+1]]) / 2 else
x[i[j+1]])(p,j)
}
What will have the following results with simple input data.
medianWeighted(c(10, 40), c(1, 2))
#[1] 40
median(rep(c(10, 40), c(1, 2)))
#[1] 40
medianWeighted(c(10, 40), c(2, 1))
#[1] 10
median(rep(c(10, 40), c(2, 1)))
#[1] 10
medianWeighted(c(10, 40), c(1.5, 2))
#[1] 40
medianWeighted(c(10, 40), c(3, 4))
#[1] 40
median(rep(c(10, 40), c(3, 4)))
#[1] 40
medianWeighted(c(10, 40), c(1.5, 1.5))
#[1] 25
medianWeighted(c(10, 40), c(3, 3))
#[1] 25
median(rep(c(10, 40), c(3, 3)))
#[1] 25
medianWeighted(c(10, 40), c(0, 1))
#[1] 40
medianWeighted(c(10, 40), c(1, 0))
#[1] 10
medianWeighted(c(10, 40), c(0, 0))
#[1] NA
It can also be used for other qantiles
medianWeighted(1:10, 10:1, seq(0, 1, 0.25))
[1] 1 2 4 6 10
Compare with other methods.
#Functions from other Answers
weighted.median <- function(x, w) {
w <- w[order(x)]
x <- x[order(x)]
prob <- cumsum(w)/sum(w)
ps <- which(abs(prob - .5) == min(abs(prob - .5)))
return(x[ps])
}
my_wtd_q = function(x, w, prob, n = 4096)
with(density(x, weights = w/sum(w), n = n),
x[which.max(cumsum(y*(x[2L] - x[1L])) >= prob)])
weighted.quantile <- function(x, w, probs = seq(0, 1, 0.25),
na.rm = FALSE, names = TRUE) {
if (any(probs > 1) | any(probs < 0)) stop("'probs' outside [0,1]")
if (length(w) == 1) w <- rep(w, length(x))
if (length(w) != length(x)) stop("w must have length 1 or be as long as x")
if (isTRUE(na.rm)) {
w <- x[!is.na(x)]
x <- x[!is.na(x)]
}
w <- w[order(x)] / sum(w)
x <- x[order(x)]
cum_w <- cumsum(w) - w * (1 - (seq_along(w) - 1) / (length(w) - 1))
res <- approx(x = cum_w, y = x, xout = probs)$y
if (isTRUE(names)) {
res <- setNames(res, paste0(format(100 * probs, digits = 7), "%"))
}
res
}
Methods
M <- alist(
medRep = median(rep(DF$x, DF$w)),
isotone = isotone::weighted.median(DF$x, DF$w),
laeken = laeken::weightedMedian(DF$x, DF$w),
spatstat1 = spatstat.geom::weighted.median(DF$x, DF$w, type=1),
spatstat2 = spatstat.geom::weighted.median(DF$x, DF$w, type=2),
spatstat4 = spatstat.geom::weighted.median(DF$x, DF$w, type=4),
survey = survey::svyquantile(~x, survey::svydesign(id=~1, weights=~w, data=DF), 0.5)$x[1],
RAndres = weighted.median(DF$x, DF$w),
matrixStats = matrixStats::weightedMedian(DF$x, DF$w),
MichaelChirico = my_wtd_q(DF$x, DF$w, .5),
Leonardo = weighted.quantile(DF$x, DF$w, .5),
GKi = medianWeighted(DF$x, DF$w)
)
Results
DF <- data.frame(x=c(10, 40), w=c(1, 2))
sapply(M, eval)
# medRep isotone laeken spatstat1 spatstat2
# 40.00000 40.00000 40.00000 40.00000 25.00000
# spatstat4 survey RAndres matrixStats MichaelChirico
# 17.50000 40.00000 10.00000 30.00000 34.15005
# Leonardo.50% GKi
# 25.00000 40.00000
DF <- data.frame(x=c(10, 40), w=c(1, 1))
sapply(M, eval)
# medRep isotone laeken spatstat1 spatstat2
# 25.00000 25.00000 40.00000 10.00000 10.00000
# spatstat4 survey RAndres matrixStats MichaelChirico
# 10.00000 10.00000 10.00000 25.00000 25.05044
# Leonardo.50% GKi
# 25.00000 25.00000
In those two cases only isotone and GKi give identical results compared to what median(rep(x, w)) returns.
If you're working with the survey package, assuming you've defined your survey design and x is your variable of interest:
svyquantile(~x, mydesign, c(0.5))
I got here looking for weighted quantiles, so I thought I might as well leave for future readers what I ended up with. Naturally, using probs = 0.5 will return the weighted median.
I started with MichaelChirico's answer, which unfortunately was off at the edges. Then I decided to switch from density() to approx(). Finally, I believe I nailed the correction factor to ensure consistency with the default algorithm of the unweighted quantile().
weighted.quantile <- function(x, w, probs = seq(0, 1, 0.25),
na.rm = FALSE, names = TRUE) {
if (any(probs > 1) | any(probs < 0)) stop("'probs' outside [0,1]")
if (length(w) == 1) w <- rep(w, length(x))
if (length(w) != length(x)) stop("w must have length 1 or be as long as x")
if (isTRUE(na.rm)) {
w <- x[!is.na(x)]
x <- x[!is.na(x)]
}
w <- w[order(x)] / sum(w)
x <- x[order(x)]
cum_w <- cumsum(w) - w * (1 - (seq_along(w) - 1) / (length(w) - 1))
res <- approx(x = cum_w, y = x, xout = probs)$y
if (isTRUE(names)) {
res <- setNames(res, paste0(format(100 * probs, digits = 7), "%"))
}
res
}
When weights are uniform, the weighted quantiles are identical to regular unweighted quantiles:
x <- rnorm(100)
stopifnot(stopifnot(identical(weighted.quantile(x, w = 1), quantile(x)))
Example using the same data as in the weighted.mean() man page.
x <- c(3.7, 3.3, 3.5, 2.8)
w <- c(5, 5, 4, 1)/15
stopifnot(isTRUE(all.equal(
weighted.quantile(x, w, 0:4/4, names = FALSE),
c(2.8, 3.33611111111111, 3.46111111111111, 3.58157894736842,
3.7)
)))
And this is for whoever solely wants the weighted median value:
weighted.median <- function(x, w, ...) {
weighted.quantile(x, w, probs = 0.5, names = FALSE, ...)
}

Resources