cor() with missing values - r

I want a matrix with only the correlation coefficients which are bigger than 0.2. I came up with the following solution.
mts.data <- ts(data.frame(a=arima.sim(model=list(1,0,0), n=10),
b=arima.sim(model=list(1,0,1), n=10), c=arima.sim(model=list(1,0,0),
n=10), d=arima.sim(model=list(1,0,2), n=10),
e=arima.sim(model=list(2,0,1), n=10)), start=c(2007,1), frequency=12)
critcor <- function(x) {
crit.mat <- matrix(0, nrow=ncol(x), ncol=ncol(x))
for(j in 1:ncol(x)) {
for(i in 1:ncol(x)) {
if(abs(cor(x[,i], x[,j])) > 0.2) {
crit.mat[i,j] <- cor(x[,i], x[,j])
}
}
}
return(crit.mat)
}
This works fine. Unfortunately, my data set contains missing values.
mts.data[1:3, 4] <- NA
mts.data[9:10, 5] <- NA
When I run my function, I got an error.
critcor(mts.data)
# Error in if (abs(cor(x[, i], x[, j])) > 0.2) { :
# missing value where TRUE/FALSE needed
I'm browsing the Internet for several hours now and I have absolutely no idea how I could fix this. If a correlation is not possible because of the missing values, I want my function just print a 0 instead.

You can greatly simplify your code like this:
cm = cor(mts.data, use = "p")
cm[abs(cm) <= 0.2] = 0
which gives:
> cm
a b c d e
a 1.0000000 0.0000000 -0.4667718 -0.5241904 -0.6864418
b 0.0000000 1.0000000 0.0000000 -0.3270387 0.0000000
c -0.4667718 0.0000000 1.0000000 0.4708803 0.5222566
d -0.5241904 -0.3270387 0.4708803 1.0000000 0.0000000
e -0.6864418 0.0000000 0.5222566 0.0000000 1.0000000
The snippet use = "p" is short for "pairwise complete observations", i.e. NAs will be omitted when necessary. For more options and details see ?cor.
The error you received was when you had a value that was NA. Then also the comparison NA > 0.2 will be NA and if does not accept NA as its input, thus the error.

Related

How to find the optimal cut-off point to minimize both the FNR and FPR in R?

I should find the optimal threshold to minimize both the false positive rate and false negative rate. An equal weight between these two rates should be assumed. I write the following code:
data=read.csv( url("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"), sep=",")
library(ROCR)
pred=prediction(data$decile_score/10, data$two_year_recid)
perf=performance(pred, measure="fnr",x.measure="fpr")
opt.cut = function(perf, pred)
{
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(False_negative_rate = 1-y[[ind]], False_positive_rate = x[[ind]],
cutoff = p[[ind]])
}, perf#x.values, perf#y.values, pred#cutoffs)
}
print(opt.cut(perf, pred))
It throws out this result:
[,1]
False_negative_rate 0
False_positive_rate 0
cutoff Inf
However, I think there is something wrong with my code.
Well, I think your code is flawed from a logical point of view. You said You want to
minimize both the false positive rate and false negative rate
But then you minimize
d = (x - 0)^2 + (y-1)^2
which is 1 - FNR which is the True Positive Rate.
Thus, assuming you want to minimize FPR and FNR you could simply do:
pred#cutoffs[[1]][which.min(sqrt(perf#x.values[[1]] ^ 2 + perf#y.values[[1]] ^ 2))]
# [1] 0.5
(no need to use extra loops as R is nicely vectorized)
To verify this result, you can simply calculate FPR and FNR yourself for different cutoffs (which will give you the same results as performance of course, but it is a good exercise to understand the principles):
t(sapply(pred#cutoffs[[1]], function(co) {
prediction <- factor(ifelse(data$decile_score / 10 < co, 0, 1), 0:1)
confusion_matrix <- table(data$two_year_recid, prediction)
fpr <- confusion_matrix[1, 2] / sum(confusion_matrix[1, ])
fnr <- confusion_matrix[2, 1] / sum(confusion_matrix[2, ])
c(cutoff = co, fpr = fpr, fnr = fnr, dist = sqrt(fpr ^ 2 + fnr ^2))
}))
# cutoff fpr fnr dist
# [1,] Inf 0.00000000 1.00000000 1.0000000
# [2,] 1.0 0.02195307 0.90895109 0.9092162
# [3,] 0.9 0.06056018 0.79975392 0.8020436
# [4,] 0.8 0.10143830 0.69209474 0.6994890
# [5,] 0.7 0.16250315 0.58443556 0.6066071
# [6,] 0.6 0.23391370 0.47431560 0.5288581
# [7,] 0.5 0.32349230 0.37403876 0.4945223 #### <<- Minimum
# [8,] 0.4 0.43325763 0.27130114 0.5111912
# [9,] 0.3 0.55084532 0.18486620 0.5810388
# [10,] 0.2 0.71435781 0.09474008 0.7206128
# [11,] 0.1 1.00000000 0.00000000 1.0000000
The first values in perf#x.values, perf#y.values, pred#cutoffs are causing your results, they are 1, 0 and Inf, respectively. In order to remove them, loop
through each list member and extract the vectors without the 1st element.
library(ROCR)
opt.cut = function(perf, pred) {
#
x.values <- lapply(perf#x.values, `[`, -1)
y.values <- lapply(perf#y.values, `[`, -1)
cutoffs <- lapply(pred#cutoffs, `[`, -1)
#
cut.ind <- mapply(FUN=function(x, y, p){
d <- x^2 + y^2
ind <- which.min(d)
c(False_negative_rate = y[[ind]],
False_positive_rate = x[[ind]],
cutoff = p[[ind]])
}, x.values, y.values, cutoffs)
cut.ind
}
pred <- prediction(data$decile_score/10, data$two_year_recid)
perf <- performance(pred, measure = "fnr", x.measure = "fpr")
opt.cut(perf, pred)
# [,1]
#False_negative_rate 0.3740388
#False_positive_rate 0.3234923
#cutoff 0.5000000

Looping regression over increasing sample sizes in R

I am trying to store the coefficients & SEs of a linear regression in R. The regression starts with a sample size of 10 and needs to add 1 for each loop up to 1000. I have generated random variables (using rnorm), created variables to store the values in and can get the code to store the first regression, but it stops after 1 loop (sample size 10). What am I missing in my code here? Thank you for your help.
matrix_coef <-NULL
df <- data.frame(yi, x1, x2)
for (i in 10:1000) {
lm(df)
matrix_coef <- summary(lm(df))
b0[i]<- coef(matrix_coef)[1:1, 1:1]
bx1[i] <- coef(matrix_coef)[2:2, 1:1 ]
bx2[i] <- coef(matrix_coef)[3:3, 1:1]
sd0[i] <- coef(matrix_coef)[1:1, 2:2]
sdx1[i] <- coef(matrix_coef)[2:2, 2:2]
sdx2[i] <- coef(matrix_coef)[3:3, 2:2]
}
You've got a few issues:
Inside the loop, df doesn't change, you're always fitting lm to the full data. You need to use i to change the data that the model trains on. Something like lm(df[1:i, ]), perhaps (assuming your full data has 1000 rows and you want the iterations to fit the first 10, then 11, then 12, ... 1000 rows, rather than, say, resampling each time)
The line lm(df) by itself fits a model, but you don't do store the results and your next line summary(lm(df)) fits the exact same model again.
You initialize the wrong object. matrix_coef <- NULL essentially does nothing. The line matrix_coef <- summary(lm(df)) inside your loop will have no problem creating the matrix_coef object. (And we can save a bunch of typing by defining it as coef(summary(lm(df))) and not typing coef() on all the subsequent lines.) However, the objects that you index should be initialized, and preferably to the correct length.
1:1 is the same as 1. 2:2 is the same as 2, etc.
Addressing all of these gives us this:
set.seed(47)
n <- 15 ## small n for demonstration purposes
df <- data.frame(yi = rnorm(n), x1 = rnorm(n), x2 = rnorm(n))
b0 <- numeric(n)
bx1 <- numeric(n)
bx2 <- numeric(n)
sd0 <- numeric(n)
sdx1 <- numeric(n)
sdx2 <- numeric(n)
for (i in 10:n) {
matrix_coef <- coef(summary(lm(df[1:i, ])))
b0[i] <- matrix_coef[1, 1]
bx1[i] <- matrix_coef[2, 1]
bx2[i] <- matrix_coef[3, 1]
sd0[i] <- matrix_coef[1, 2]
sdx1[i] <- matrix_coef[2, 2]
sdx2[i] <- matrix_coef[3, 2]
}
b0
# [1] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
# [9] 0.00000000 -0.17267676 -0.17490218 -0.08251370 -0.04048010 -0.04976162 -0.03881043
sdx2
# [1] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.4888940
# [11] 0.4470402 0.4569932 0.4724890 0.4669553 0.4323498

Create multiple confusion matrices in R using loops

I am trying to create multiple confusion matrices from one dataframe, with each matrix generated based off a different condition in the dataframe.
So for the dataframe below, I want a confusion matrix for when Value = 1, Value = 2, Value =3
observed predicted Value
1 1 1
0 1 1
1 0 2
0 0 2
1 1 3
0 0 3
and see the results like:
Value Sensitivity Specificity PPV NPV
1 .96 .71 .84 .95
2 .89 .63 .30 .45
3 .88 .95 .28 .80
This is what I tried with a reproducible example. I am trying to write a loop that looks at every row, determines if Age = 1, and then pulls the values from the predicted and observed columns to generate a confusion matrix. Then I manually pull out the values from the confusion matrix to write out sen, spec, ppv, and npv and tried to combine all the matrices together. And then the loop starts again with Age = 2.
data(scat)
df<-scat %>% transmute(observed=ifelse(Site=="YOLA","case", "control"), predicted=ifelse(Location=="edge","case", "control"),Age)
x<-1 #evaluate at ages 1 through 5
for (i in dim(df)[1]) { #for every row in df
while(x<6) { #loop stops at Age=5
if(x=df$Age) {
q<-confusionMatrix(data = df$predicted, reference = df$observed, positive = "case")
sensitivity = q$table[1,1]/(q$table[1,1]+q$table[2,1])
specificity = q$table[2,2]/(q$table[2,2]+q$table[1,2])
ppv = q$table[1,1]/(q$table[1,1]+q$table[1,2])
npv = q$table[2,2]/(q$table[2,2]+q$table[2,1])
matrix(c(sensitivity, specificity, ppv, npv),ncol=4,byrow=TRUE)
}
}
x <- x + 1 #confusion matrix at next Age value
}
final<- rbind(matrix) #combine all the matrices together
However, this loop is completely non-functional. I'm not sure where the error is.
Your code can be simplified and the desired output achieved like this:
library(caret)
library(dplyr)
data(scat)
df <- scat %>%
transmute(observed = factor(ifelse(Site == "YOLA","case", "control")),
predicted = factor(ifelse(Location == "edge","case", "control")),
Age)
final <- t(sapply(sort(unique(df$Age)), function(i) {
q <- confusionMatrix(data = df$predicted[df$Age == i],
reference = df$observed[df$Age == i],
positive = "case")$table
c(sensitivity = q[1, 1] / (q[1, 1] + q[2, 1]),
specificity = q[2, 2] / (q[2, 2] + q[1, 2]),
ppv = q[1, 1] / (q[1, 1] + q[1, 2]),
npv = q[2, 2] / (q[2, 2] + q[2, 1]))
}))
Resulting in
final
#> sensitivity specificity ppv npv
#> [1,] 0.0 0.5625000 0.00000000 0.8181818
#> [2,] 0.0 1.0000000 NaN 0.8000000
#> [3,] 0.2 0.5882353 0.06666667 0.8333333
#> [4,] 0.0 0.6923077 0.00000000 0.6923077
#> [5,] 0.5 0.6400000 0.25000000 0.8421053
However, it's nice to know why your own code didn't work, so here are a few issues that might be useful to consider:
You need factor columns rather than character columns for confusionMatrix
You were incrementing through the rows of df, but you need one iteration for each unique age, not each row in your data frame.
Your line to increment x happens outside of the while loop, so x never increments and the loop never terminates, so the console just hangs.
You are doing if(x = df$Age), but you need a == to test equality.
It doesn't make sense to compare x to df$Age anyway, because x is length 1 and df$Age is a long vector.
You have unnecessary repetition by doing q$table each time. You can just make q equal to q$table to make your code more readable and less error-prone.
You call matrix at the end of the loop, but you don't store it anywhere, so the whole loop doesn't actually do anything.
You are trying to rbind an object called matrix in the last line which doesn't exist
Your lack of spaces between math operators, commas and variables make the code less readable and harder to debug. I'm not just saying this as a stylistic point; it is a major source of errors I see frequently here on SO.

Find the maximum value in decision tree

I created decision tree with Party package in R.
I'm trying to get the route/branch with the maximum value.
It can be mean value that came from box-plot
and it can be probability value that came from binary tree
(source: rdatamining.com)
This can be done pretty easily actually, though while your definition of maximum value is clear for a regression tree, it is not very clear for a classification tree, as in each node different level can have it's own maximum
Either way, here's a pretty simple helper function that will return you the predictions for each type of tree
GetPredicts <- function(ct){
f <- function(ct, i) nodes(ct, i)[[1]]$prediction
Terminals <- unique(where(ct))
Predictions <- sapply(Terminals, f, ct = ct)
if(is.matrix(Predictions)){
colnames(Predictions) <- Terminals
return(Predictions)
} else {
return(setNames(Predictions, Terminals))
}
}
Now luckily you've took your trees from the examples of ?ctree, so we can test them (next time, please provide the code you used yourself)
Regression Tree (your frist tree)
## load the package and create the tree
library(party)
airq <- subset(airquality, !is.na(Ozone))
airct <- ctree(Ozone ~ ., data = airq,
controls = ctree_control(maxsurrogate = 3))
plot(airct)
Now, test the function
res <- GetPredicts(airct)
res
# 5 3 6 9 8
# 18.47917 55.60000 31.14286 48.71429 81.63333
So we've got the predictions per each terminal node. You can easily proceed with which.max(res) from here (I'll leave it for you to decide)
Classification tree (your second tree)
irisct <- ctree(Species ~ .,data = iris)
plot(irisct, type = "simple")
Run the function
res <- GetPredicts(irisct)
res
# 2 5 6 7
# [1,] 1 0.00000000 0.0 0.00000000
# [2,] 0 0.97826087 0.5 0.02173913
# [3,] 0 0.02173913 0.5 0.97826087
Now, the output is a bit harder to read because each class has it's own probabilities. You could make this a bit more readable using
row.names(res) <- levels(iris$Species)
res
# 2 5 6 7
# setosa 1 0.00000000 0.0 0.00000000
# versicolor 0 0.97826087 0.5 0.02173913
# virginica 0 0.02173913 0.5 0.97826087
The, you could do something like the following in order to get the overall maximum value
which(res == max(res), arr.ind = TRUE)
# row col
# setosa 1 1
For column/row maxes, you could do
matrixStats::colMaxs(res)
# [1] 1.0000000 0.9782609 0.5000000 0.9782609
matrixStats::rowMaxs(res)
# [1] 1.0000000 0.9782609 0.9782609
But, again, I'll leave to you to decide on how to proceed from here.

Obtaining k-order statistics by row from a proximity matrix in R

After obtaining a proximity matrix from randomForest, for each row/observation in the data set I would like to find the k-nearest points (excluding the observation itself, or, equivalently, the diagonal elements of the proximity matrix) and then their true class labels. I know how to find the indices of the max or min of each row in a matrix but I do not know how to program R to exclude the diagonal elements and to identify the k greatest entries by row.
> set.seed(1234)
> d <- iris[sample(nrow(iris), 6, replace = FALSE),]
> iris.rf <- randomForest(Species ~ ., data=d, ntree=2000, proximity=TRUE, oob.prox=TRUE)
> m <- iris.rf$proximity
> m
18 93 91 92 126 149
18 1.0000000 0.7486911 0.7653631 0.7500000 0.2620690 0.4723926
93 0.7486911 1.0000000 0.7836257 0.7329545 0.5497076 0.2763819
91 0.7653631 0.7836257 1.0000000 0.6795580 0.5371429 0.2289157
92 0.7500000 0.7329545 0.6795580 1.0000000 0.3107345 0.4535519
126 0.2620690 0.5497076 0.5371429 0.3107345 1.0000000 0.8115942
149 0.4723926 0.2763819 0.2289157 0.4535519 0.8115942 1.0000000
> which(m[1,] == max(m[1,]), arr.ind = TRUE)
18
1
I was able to solve it on my own, although if there is a way to improve the code (or if there is an error) I would appreciate the feedback.
f = function(d,k){
m <- randomForest(y ~ ., data=d, ntree=2000, proximity=TRUE, oob.prox=TRUE)$proximity
diag(m) <-0
tmp <- lapply(as.list(as.data.frame(m)),order,decreasing = TRUE)
tmp1 <- t(as.data.frame(tmp))[,c(1:k)]
class.mat <- matrix(data=NA,nrow(d), k, byrow=TRUE)
for(i in 1:nrow(d)){
class.mat[i,] = d$y[tmp1[i,]]
}
return(class.mat)
}

Resources