Mahalanobis difference by group with dplyr - r

I want to get a Mahalanobis difference for each set of two scores, after being grouped by another variable. In this case, it would be a Mahalanobis difference for each Attribute (across each set of 2 scores). The output should be 3 Mahalanobis distances (one for A, B and C).
Currently I am working with (in my original dataframe, there are some NAs, hence I include one in the reprex):
library(tidyverse)
library(purrr)
df <- tibble(Attribute = unlist(map(LETTERS[1:3], rep, 5)),
Score1 = c(runif(7), NA, runif(7)),
Score2 = runif(15))
mah_db <- df %>%
dplyr::group_by(Attribute) %>%
dplyr::summarise(MAH = mahalanobis(Score1:Score2,
center = base::colMeans(Score1:Score2),
cov(Score1:Score2, use = "pairwise.complete.obs")))
This raises the error:
Caused by error in base::colMeans(): ! 'x' must be an array of at
least two dimensions
But as far as I can tell, I am giving colMeans two columns.
So what's going wrong here? And I wonder if even fixing this gives a complete solution?

It seems your question is more about the statistics than dplyr. So I just give a small example based on your data and an adapted example from ?mahalanobis. Perhaps also have a look here or here.
df <- subset(x = df0, Attribute == "A", select = c("Score1", "Score2"))
df$mahalanobis <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))
df$p <- pchisq(q = df$mahalanobis, df = 2, lower.tail = FALSE)
plot(density(df$mahalanobis, bw = 0.3), ylim = c(0, 0.8),
main="Squared Mahalanobis distances");
grid()
rug(df$mahalanobis)
df <- subset(x = df0, Attribute == "B", select = c("Score1", "Score2"))
df <- df[complete.cases(df), ]
df$mahalanobis <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))
df$p <- pchisq(q = df$mahalanobis, df = 2, lower.tail = FALSE)
lines(density(df$mahalanobis, bw = 0.3), col = "red",
main="Squared Mahalanobis distances");
rug(df$mahalanobis, col = "red")
df <- subset(x = df0, Attribute == "C", select = c("Score1", "Score2"))
df$mahalanobis <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))
df$p <- pchisq(q = df$mahalanobis, df = 2, lower.tail = FALSE)
lines(density(df$mahalanobis, bw = 0.3), col = "green",
main="Squared Mahalanobis distances");
rug(df$mahalanobis, col = "green")
Hope, that helps (and too long for a comment).
(Of course you can make to code much shorter, but it shows in each step what happens.)

Related

How to provide group-wise boundaries for parameters in modelling using R nls_multstart?

I am new to using the purrr package in R and I am struggling with trying to pass a further argument to a function inside nls_multstart.
I have a nested data frame that contains data for different combinations of grouping variables.
I want to fit the same model to the data of each combinations of groups in the nested data frame.
So far, I was able to fit the model to each data.
# model
my_model <- function(ymax, k, t) {
ymax * (1 - exp(-k*t))
}
# data
t = seq(from = 1, to = 100, by = 1)
y1 = unlist(lapply(t, my_model, ymax = 500, k = 0.04))
y2 = unlist(lapply(t, my_model, ymax = 800, k = 0.06))
y = c(y1, y2)
a <- rep(x = "a", times = 100)
b <- rep(x = "b", times = 100)
groups <- c(a, b)
df <- data.frame(groups, t, y)
nested <- df %>%
group_by(groups) %>%
nest() %>%
rowwise() %>%
ungroup() %>%
mutate(maximum = map_dbl(map(data, "y"), max))
# set staring values
l <- c(ymax = 100 , k = 0.02)
u <- c(ymax = 300, k = 0.03)
# works, but without group-specific lower and upper boundaries
# fit the model
fit <- nested %>%
mutate(res = map(.x = data,
~ nls_multstart(y ~ my_model(ymax, k, t = t),
data = .x,
iter = 20,
start_lower = l,
start_upper = u,
supp_errors = 'N',
na.action = na.omit)))
However, when trying to use the value in column maximum as a group-specific boundary, R throws the following error:
# using group-specific boundary does not work
# fit the model
fit2 <- nested %>%
mutate(res = map(.x = data,
~ nls_multstart(y ~ my_model(ymax, k, t = t),
data = .x,
iter = 20,
start_lower = l,
start_upper = u,
supp_errors = 'N',
na.action = na.omit,
lower = c(maximum, 0),
upper = c(maximum*1.2, 1))))
Error in nls.lm(par = start, fn = FCT, jac = jac, control = control, lower = lower, :
length(lower) must be equal to length(par)
Can anybody give a hint how to improve on that?

Avoid writing large number of column names in a model formula with bs() terms

I want to use bs function for numerical variables in my dataset when fitting a logistic regression model.
df <- data.frame(a = c(0,1), b = c(0,1), d = c(0,1), e = c(0,1),
f= c("m","f"), output = c(0,1))
library(splines)
model <- glm(output~ bs(a, df=2)+ bs(b, df=2)+ bs(d, df=2)+ bs(e, df=2)+
factor(f) ,
data = df,
family = "binomial")
In my actual dataset, I need to apply bs() to way more columns than this example. Is there a way I can do this without writing all the terms?
We can use some string manipulation with sprintf, together with reformulate:
predictors <- c("a", "b", "d", "e")
bspl.terms <- sprintf("bs(%s, df = 2)", predictors)
other.terms <- "factor(f)"
form <- reformulate(c(bspl.terms, other.terms), response = "output")
#output ~ bs(a, df = 2) + bs(b, df = 2) + bs(d, df = 2) + bs(e,
# df = 2) + factor(f)
If you want to use a different df and degree for each spline, it is also straightforward (note that df can not be smaller than degree).
predictors <- c("a", "b", "d", "e")
dof <- c(3, 4, 3, 6)
degree <- c(2, 2, 2, 3)
bspl.terms <- sprintf("bs(%s, df = %d, degree = %d)", predictors, dof, degree)
other.terms <- "factor(f)"
form <- reformulate(c(bspl.terms, other.terms), response = "output")
#output ~ bs(a, df = 3, degree = 2) + bs(b, df = 4, degree = 2) +
# bs(d, df = 3, degree = 2) + bs(e, df = 6, degree = 3) + factor(f)
Prof. Ben Bolker: I was going to something a little bit fancier, something like predictors <- setdiff(names(df)[sapply(df, is.numeric)], "output").
Yes. This is good for safety. And of course, an automatic way if OP wants to include all numerical variables other than "output" as predictors.

Perform an operation with complete cases without changing the original vectors

I would like to calculate a rank-biserial correlation. But the (only it seems) package can't handle missing values that well. It has no built in "na.omit = TRUE" function. I could remove the missings in the data frame, but that would be a hustle with many different calculations.
n <- 500
df <- data.frame(id = seq (1:n),
ord = sample(c(0:3), n, rep = TRUE),
sex = sample(c("m", "f"), n, rep = TRUE, prob = c(0.55, 0.45))
)
df <- as.data.frame(apply (df, 2, function(x) {x[sample( c(1:n), floor(n/10))] <- NA; x} ))
library(rcompanion)
wilcoxonRG(x = df$ord, g = df$sex, verbose = T)
I imagine something stupidly easy like "complete.cases(wilcoxonRG(x = df$ord, g = df$sex, verbose = T)). It's probably not that hard but I could only find comeplete data frame manipulations. Thanks in advance!

Plot heatmaps of multiple data frames using a slider in R

I have multiple data.frames and each one of them represent the pairwise interactions of individuals at different time points.
Here is an example of how my data.frames look.
df1 <- matrix(data = rexp(9, rate = 10), nrow = 3, ncol = 3)
df2 <- matrix(data = rexp(16, rate = 10), nrow = 4, ncol = 4)
df3 <- matrix(data = rexp(4, rate = 10), nrow = 2, ncol = 2)
I would like to plot them as it is pointed in this page (https://plotly.com/r/sliders/)
where with a slider I can move from one heatmap to the other.
I have tried so far with plotly but I have not succeeded. Any help is highly appreciated.
I am struggling for long with this issue. I might be a bit blind at this point so please forgive me if the question is stupid.
Following the Sine Wave Slider example on https://plotly.com/r/sliders/ this can be achieved like so. The first step of my approach involves converting the matrices to dataframes with columns x, y, z. Second instead of lines we plot heatmaps.
df1 <- matrix(data = rexp(9, rate = 10), nrow = 3, ncol = 3)
df2 <- matrix(data = rexp(16, rate = 10), nrow = 4, ncol = 4)
df3 <- matrix(data = rexp(4, rate = 10), nrow = 2, ncol = 2)
library(tibble)
library(tidyr)
library(plotly)
# Make dataframes
d <- lapply(list(df1, df2, df3), function(d) {
d %>%
as_tibble(.colnames = seq(ncol(.))) %>%
rowid_to_column("x") %>%
pivot_longer(-x, names_to = "y", values_to = "z") %>%
mutate(y = stringr::str_extract(y, "\\d"),
y = as.numeric(y))
})
aval <- list()
for(step in seq_along(d)){
aval[[step]] <-list(visible = FALSE,
name = paste0('v = ', step),
x = d[[step]]$x,
y = d[[step]]$y,
z = d[[step]]$z)
}
aval[1][[1]]$visible = TRUE
steps <- list()
fig <- plot_ly()
for (i in seq_along(aval)) {
fig <- add_trace(fig, x = aval[i][[1]]$x, y = aval[i][[1]]$y, z = aval[i][[1]]$z, visible = aval[i][[1]]$visible,
name = aval[i][[1]]$name, type = "heatmap")
fig
step <- list(args = list('visible', rep(FALSE, length(aval))), method = 'restyle')
step$args[[2]][i] = TRUE
steps[[i]] = step
}
fig <- fig %>%
layout(sliders = list(list(active = 0,
currentvalue = list(prefix = "Heatmap: "),
steps = steps)))
fig

Is dplyr's left_join correct way to attach a data.frame to a SpatialPolygonDataFrame in R?

Merging extra data (frames) to spatial objects in R can be tricky (as explained here, or here)
Searching for a solution on how to correctly do the job I found this SO question listing several methods. dplyr's left_join was not listed there. I spotted it being used in Robin's tutorial.
My question is - is this a correc method to use? Are there any use cases (different number of rows? different rows names? sorting? etc.) that this solution would fail?
Here is some reproducible code illustarting the methods I found / came across:
# libraries
library("spdep"); library("sp"); library("dplyr")
# sopatial data
c <- readShapePoly(system.file("etc/shapes/columbus.shp", package="spdep")[1])
m <- c#data
c#data <- subset(c#data, select = c("POLYID", "INC"))
c#data$INC2 <- c#data$INC
c#data$INC <- NULL
ex <- subset(c, c$POLYID <= 2) # polygons with messed up data in merged df
c <- subset(c, c$POLYID < 49) # remove one polygon from shape so that df has one poly too many
# messing up merge data
m <- subset(m, POLYID != 1) # exclude polygon
m <- subset(m, select = c("POLYID", "INC")) # only two vars
rownames(m) <- m$POLYID - 2 # change rownames
m$POLYID[m$POLYID == 2] <- 0 # wrong ID
m <- m[order(m$INC),] # different sort
m$POLYID2 <- m$POLYID # duplicated to check dplyr
# left_join solution
s1 <- c
s1#data <- left_join(s1#data, m)
plot(c)
plot(s1, col = "red", density = 40, angle = 0, add = TRUE)
plot(ex, col= NA, border = "green", add = TRUE)
View(s1#data)
# match solution
s2 <- c
s2#data = data.frame(s2#data, m[match(s2#data[,"POLYID"], m[,"POLYID"]),])
plot(c)
plot(s2, col = "red", density = 40, angle = 0, add = TRUE)
plot(ex, col= NA, border = "green", add = TRUE)
View(s2#data)
# sp solution
s3 <- c
s3 <- sp::merge(s3, m, by="POLYID")
plot(c)
plot(s3, col = "red", density = 40, angle = 0, add = TRUE)
plot(ex, col= NA, border = "green", add = TRUE)
View(s3#data)
# inner join solution
s4 <- c
s4#data <- inner_join(s4#data, m)
plot(c)
plot(s4, col = "red", density = 40, angle = 0, add = TRUE)
plot(ex, col= NA, border = "green", add = TRUE)
View(s4#data)
# rebuild solution???
s5 <- c
s5.df <- as(s5, "data.frame")
s5.df1 <- merge(s5.df, m, sort=FALSE, by.x="POLYID", by.y="POLYID", all.x=TRUE, all.y=TRUE)
s51 <- SpatialPolygonsDataFrame(as(s5, "SpatialPolygons"), data=s5.df1)
plot(c)
plot(s51, col = "red", density = 40, angle = 0, add = TRUE)
plot(ex, col= NA, border = "green", add = TRUE)
Left join seems to do the job. Same as sp::merge and match ( I do hope there is no messing up the order so for instance plotted polygons are associated with different vales after the merge?). None of the solutions actually removes two polygons withmissing data, but I presume this is correct behaviour in R?

Resources