I want to perform regressions across matrices in a list, and in each case use the last 48 months of data.
Setup
Mat1 <- matrix(nrow = 310, ncol = 48, data = rnorm(310*48, 0, 0.1))
Mat2 <- matrix(nrow = 310, ncol = 51, data = rnorm(310*51, 0, 0.1))
Mat1[1:300, 48] <- NA
Mat2[1:40, 51] <- NA
ind1 <- matrix(nrow = 48, ncol = 3, data = rnorm(48*3, 0, 0.1))
ind2 <- matrix(nrow = 51, ncol = 3, data = rnorm(51*3, 0, 0.1))
list1 <- list(Mat1, Mat2)
ind.list <- list(ind1, ind2)
I want to regress the rows of list1[[i]] against the columns of ind.list[[i]], using the most recent 48 months of data. For instance, the first regression regresses row 1 of Mat1 against column 1, column 2, and column 3 of ind1. The second regression regresses row 2 of Mat1 against the same three columns of ind1. Repeat this process for all rows in Mat1 and store the coefficients.
However, I only want a regression to run if I have data in the last column of that row of Mat1. So, to be clear: Step 1: check whether data is present in Mat1[1, 48]. If yes, run the regression over the 48 months of data; if no, store NA in the coefficient matrix. Step 2: is Mat1[2, 48] NA? If no, run the regression; if yes, store NA in the coefficient matrix.
Then move to Mat2: regress Mat2[1, 3:51] against column 1 of ind2 (ind2[3:51, 1]), column 2 of ind2 (ind2[3:51, 2]), and column 3 of ind2 (ind2[3:51, 3]).
Repeat the process for all rows in Mat2 and store the beta coefficients in a matrix.
Overall: regress the rows of the matrices in list1 against the corresponding columns of the matrices in ind.list.
What I've tried:
for (i in 1:2) {
  for (j in 1:310) {
    coefficients1 = matrix(nrow = 310, ncol = 2)
    coefficients2 = matrix(nrow = 310, ncol = 2)
    coefficients3 = matrix(nrow = 310, ncol = 2)
    if (is.na(list1[[i]][j, ncol(list1[[i]])])) next
    coefficients1[j, i] = coefficients(lm(list1[[i]][j, ] ~ ind.list[[i]][, 1] + ind.list[[i]][, 2] + ind.list[[i]][, 3]))[1]
    coefficients2[j, i] = coefficients(lm(list1[[i]][j, ] ~ ind.list[[i]][, 1] + ind.list[[i]][, 2] + ind.list[[i]][, 3]))[2]
    coefficients3[j, i] = coefficients(lm(list1[[i]][j, ] ~ ind.list[[i]][, 1] + ind.list[[i]][, 2] + ind.list[[i]][, 3]))[3]
  }
}
The output should store all the beta coefficients for each corresponding regression. The regression of Mat1 against ind1 should yield 310 beta1 coefficients stored in coefficients1, 310 beta2 coefficients stored in coefficients2, and 310 beta3 coefficients stored in coefficients3.
You can try using apply which works well on matrices.
total_mat1 <- t(apply(Mat1, 1, function(x) coefficients(lm(x~ind1))))
#Change the values to NA when the last value of Mat1 is NA
total_mat1[is.na(Mat1[, ncol(Mat1)]), ] <- NA
total_mat1 is a matrix of 310 rows x 4 columns (an intercept plus three slopes).
Now do the same for Mat2, after subsetting the selected rows and columns.
total_mat2 <- t(apply(Mat2[, 3:51], 1, function(x) coefficients(lm(x~ind2[3:51,]))))
To check for the NA values first and only then perform the analysis, we can do:
total_mat1 <- matrix(NA, nrow = nrow(Mat1), ncol = 4)
inds <- !is.na(Mat1[, ncol(Mat1)])
total_mat1[inds, ] <- t(apply(Mat1[inds, ],1, function(x) coefficients(lm(x~ind1))))
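To process the whole list in one pass, as the question asks, here is a minimal sketch along the same lines. It assumes list1 <- list(Mat1, Mat2) and ind.list <- list(ind1, ind2) as in the setup, and takes the strict last 48 columns of each matrix (adjust last48 to 3:51 if you want to match the 49-month window used for Mat2 above); coef.list is an illustrative name:
# Loop over the list, keeping one coefficient matrix per input matrix
coef.list <- vector("list", length(list1))
for (k in seq_along(list1)) {
  mat <- list1[[k]]
  ind <- ind.list[[k]]
  last48 <- (ncol(mat) - 47):ncol(mat)        # most recent 48 months
  out <- matrix(NA_real_, nrow = nrow(mat), ncol = 4)  # intercept + 3 slopes
  ok <- !is.na(mat[, ncol(mat)])              # only rows with data in the last column
  out[ok, ] <- t(apply(mat[ok, last48, drop = FALSE], 1,
                       function(x) coefficients(lm(x ~ ind[last48, ]))))
  coef.list[[k]] <- out
}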
I want to quantify group similarity by computing the mean of the distance between all sets of (multidimensional) points in each pair.
I can do this easily enough for each pair of groups manually, like so:
library(dplyr)
library(tibble)
library(proxy)
# dummy data
set.seed(123)
df1 <- data.frame(x = rnorm(100, 0, 4),
                  y = rnorm(100, 1, 5),
                  z = rbinom(100, 1, 0.1))
df2 <- data.frame(x = rnorm(100, -1, 3),
                  y = rnorm(100, 0, 6),
                  z = rbinom(100, 1, 0.1))
df3 <- data.frame(x = rnorm(100, -30, 4),
                  y = rnorm(100, 10, 2),
                  z = rbinom(100, 1, 0.9))
# compute distance (unscaled, uncentred data)
dist(df1, df2, method = "gower") %>% mean
dist(df1, df3, method = "gower") %>% mean
dist(df2, df3, method = "gower") %>% mean
But I'd like to somehow vectorise this as my actual data has 30+ groups. A simple for loop can achieve this like so:
# combine data and scale, centre
df <- rbind(df1, df2, df3) %>%
  mutate(id = rep(1:3, each = 100))
df <- df %>%
  select(-id) %>%
  transmute_all(scale) %>%
  add_column(id = df$id)
# create empty matrix for comparisons
n <- df$id %>% unique %>% length
m <- matrix(nrow = n, ncol = n)
# loop through each pair once
for (i in 1:n) {
  for (j in 1:i) { # omit top right corner
    if (i == j) {
      m[i, j] <- NA # omit diagonal
    } else {
      m[i, j] <- dist(df[df$id == i, 1:3], df[df$id == j, 1:3], method = "gower") %>% mean
    }
  }
}
m
[,1] [,2] [,3]
[1,] NA NA NA
[2,] 0.2217443 NA NA
[3,] 0.8446070 0.8233932 NA
However, this method scales predictably badly; a quick benchmark suggests this will take 90+ hours with my actual data which has 30+ groups with 1000+ rows per group.
Can anyone suggest a more efficient solution, or perhaps a fundamentally different way to frame the problem which I'm missing?
I'm not sure how well this will perform, but here's another approach: use ls to obtain the names of the data frames, combn to generate all pairs, and then get to retrieve the objects for calculating dist:
do.call(rbind,
        combn(ls(pattern = "df\\d+"), 2, FUN = function(x)
          data.frame(pair = toString(x),
                     dist = mean(dist(get(x[1]), get(x[2]), method = "gower")),
                     stringsAsFactors = FALSE),
          simplify = FALSE
        ))
# pair dist
#1 df1, df2 0.2139304
#2 df1, df3 0.8315169
#3 df2, df3 0.8320911
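If you want the comparison-matrix layout from the question, you can fill the lower triangle from these pair results. A minimal sketch, assuming the data frame above is assigned to res and the group number is the digit in each name:
# Fill an n x n lower-triangular matrix from the pairwise results;
# 'res' is the data frame produced by the do.call(rbind, combn(...)) call above
n <- 3
m <- matrix(NA_real_, nrow = n, ncol = n)
ij <- do.call(rbind, strsplit(res$pair, ", "))   # e.g. c("df1", "df2")
m[cbind(as.integer(sub("df", "", ij[, 2])),      # larger index -> row
        as.integer(sub("df", "", ij[, 1])))] <- res$dist
m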
You could take each pair of groups, concatenate them, and then calculate the dissimilarity matrix within that combined set. This means you're comparing each group with itself to an extent, but it may still work for your use case, and with daisy it is reasonably quick for data of your size.
library(cluster)
n <- 30
df_list <- vector("list", n)
# dummy data
set.seed(123)
for (i in 1:n) {
  df_list[[i]] <- data.frame(x = rnorm(1000, ceiling(runif(1, -10, 10)), ceiling(runif(1, 2, 4))),
                             y = rnorm(1000, ceiling(runif(1, -10, 10)), ceiling(runif(1, 2, 4))),
                             z = rbinom(1000, 1, runif(1, 0.1, 0.9)))
}
m <- matrix(nrow = n, ncol = n)
# loop through each pair once
for (i in 1:n) {
  for (j in 1:i) { # omit top right corner
    if (i == j) {
      m[i, j] <- NA # omit diagonal
    } else {
      # concatenate groups
      dat <- rbind(df_list[[i]], df_list[[j]])
      # compute all distances (between groups and within groups), return matrix
      mm <- dat %>%
        daisy(metric = "gower") %>%
        as.matrix
      # retain only distances between groups
      mm <- mm[(nrow(df_list[[i]]) + 1):nrow(dat), 1:nrow(df_list[[i]])]
      # write mean distance to global comparison matrix
      m[i, j] <- mean(mm)
    }
  }
}
proxy can work with lists of matrices or data frames as input; you only need to define a wrapper function that does what you want:
nested_gower <- function(x, y, ...) {
mean(proxy::dist(x, y, ..., method = "gower"))
}
proxy::pr_DB$set_entry(
FUN = nested_gower,
names = c("ngower"),
distance = TRUE,
loop = TRUE
)
df_list <- list(df1, df2, df3)
proxy::dist(df_list, df_list, method = "ngower")
[,1] [,2] [,3]
[1,] 0.1978306 0.2139304 0.8315169
[2,] 0.2139304 0.2245903 0.8320911
[3,] 0.8315169 0.8320911 0.2139049
This will still be slow, but it should be faster than for loops in plain R (proxy uses C in the background).
Important: note that the diagonal of the resulting cross-distance matrix doesn't have zeros. If you were to call dist like proxy::dist(df_list, method = "ngower"), proxy would assume that distance(x, y) = distance(y, x) (symmetry) and that distance(x, x) = 0, the latter of which is not true in this case. Passing two arguments to dist prevents this assumption.
If you really don't care about the diagonal, pass only one argument to save some extra time by avoiding the calculation of the upper triangular. Alternatively, if you do care about the diagonal but still want to avoid calculating the upper triangular, call dist first with one argument and then call proxy::dist(df_list, df_list, method = "ngower", pairwise = TRUE).
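Combining both calls could look like this; a minimal sketch under the registration above (dm is an illustrative name):
# Lower triangular (mirrored to the upper) once, then only the diagonal
# via pairwise = TRUE, which compares df_list[[i]] with df_list[[i]]
d <- proxy::dist(df_list, method = "ngower")   # skips the upper triangular
diag_d <- proxy::dist(df_list, df_list, method = "ngower", pairwise = TRUE)
dm <- as.matrix(d)
diag(dm) <- diag_d
dm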
Side note: if you want to imitate this behavior with the gower package (as suggested by d.b),
you could define the wrapper function as:
nested_gower <- function(x, y, ...) {
  distmat <- sapply(seq_len(nrow(y)), function(y_row) {
    gower::gower_dist(x, y[y_row, , drop = FALSE], ...)
  })
  mean(distmat)
}
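To run this version through proxy as well, re-register it under the same name; as the note below says, the previous entry must be deleted first. A sketch:
# Replace the previously registered entry with the gower-based wrapper
proxy::pr_DB$delete_entry("ngower")
proxy::pr_DB$set_entry(FUN = nested_gower, names = "ngower",
                       distance = TRUE, loop = TRUE)
proxy::dist(df_list, df_list, method = "ngower")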
However, the values returned seem to change depending on how many records are passed to the functions, so it's hard to tell what would be the best approach.
*Use proxy::pr_DB$delete_entry("ngower") first if you want to redefine a function in proxy.
If you prefer proxy's version of the Gower cross-distance matrix, it occurs to me that you could leverage some of the functionality of my dtwclust package to do the calculations in parallel:
library(dtwclust)
library(doParallel)
custom_dist <- new("tsclustFamily", dist = "ngower", control = list(symmetric = TRUE))@dist
workers <- makeCluster(detectCores())
registerDoParallel(workers)
distmat <- custom_dist(df_list)
stopCluster(workers); registerDoSEQ()
This might be faster for your actual use case (not so much for the small sample data here). The same caveat about the diagonal applies (so use custom_dist(df_list, df_list) or custom_dist(df_list, pairwise = TRUE)). See section 3.2 here and the documentation of tsclustFamily if you'd like more info.
I am trying to build various regression models with different columns (independent variables in my dataset).
set.seed(0)
True = rnorm(20, 100, 10)
v = matrix(rnorm(120, 10, 3), nrow = 20)
dt = data.frame(cbind(True, v))
colnames(dt) = c('True', paste0('ABC', 1:6))
So the independent variable I want to use in model i is "ABCi", i.e. when i = 1, use ABC1, etc. Each model is fit on the first 80% of the observations; then I make predictions on the remaining 20%.
I tried this:
reg.pred = rep(0, ncol(dt))
for (i in 1:nrow(dt)){
  reg = lm(True ~ paste0('ABC', i), data = dt[(1:(0.8*nrow(dt))), ])
  reg.pred[i] = predict(reg, data = dt[(0.8*nrow(dt)):nrow(dt), ])
}
Not working... giving errors like:
Error in model.frame.default(formula = True ~ paste0("ABC", i), data = dt[(1:(0.8 * :
variable lengths differ (found for 'paste0("ABC", i)')
Not sure how can I retrieve the variable name in a loop... Any suggestion is appreciated!
You do not technically need to use as.formula() as #Sonny suggests, but you cannot mix a character representation of the formula with formula notation, so you need to fix that. Once you do, you'll notice there are other issues with your code that #Sonny either did not notice or opted not to address.
Most notably, the line
reg.pred = rep(0, ncol(dt))
implies you want a single prediction from each model, but
predict(reg, data = dt[(0.8*nrow(dt)):nrow(dt),])
implies you want a prediction for each of the observations not in the training set (you'll need a +1 after 0.8*nrow(dt) for that by the way).
I think the following should fix all your issues:
set.seed(0)
True = rnorm(20, 100, 10)
v = matrix(rnorm(120, 10, 3), nrow = 20)
dt = data.frame(cbind(True, v))
colnames(dt) = c('True', paste0('ABC', 1:6))
# Make a matrix for the predicted values; each column is for a model
reg.pred = matrix(0, nrow = 0.2*nrow(dt), ncol = ncol(dt)-1)
for (i in 1:(ncol(dt)-1)){
  # Get the name of the predictor we want here
  this_predictor <- paste0("ABC", i)
  # Make a character representation of the lm formula
  lm_formula <- paste("True", this_predictor, sep = "~")
  # Run the model
  reg = lm(lm_formula, data = dt[(1:(0.8*nrow(dt))), ])
  # Get the appropriate test data
  newdata <- data.frame(dt[(0.8*nrow(dt)+1):nrow(dt), this_predictor])
  names(newdata) <- this_predictor
  # Store predictions
  reg.pred[ , i] = predict(reg, newdata = newdata)
}
reg.pred
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 100.2150 100.8394 100.7915 99.88836 97.89952 105.7201
# [2,] 101.2107 100.8937 100.9110 103.52487 102.13965 104.6283
# [3,] 100.0426 101.0345 101.2740 100.95785 102.60346 104.2823
# [4,] 101.1055 100.9686 101.5142 102.56364 101.56400 104.4447
In this matrix of predictions, each column is from a different model, and the rows correspond to the last four rows of your data (the rows not in your training set).
You can use as.formula:
f <- as.formula(
  paste("True",
        paste0('ABC', i),
        sep = " ~ "))
reg = lm(f, data = dt[(1:(0.8*nrow(dt))), ])
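Base R's reformulate() builds the same formula without pasting strings together; a small equivalent sketch:
# Equivalent formula construction with stats::reformulate()
f <- reformulate(paste0("ABC", i), response = "True")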
I need a suggestion on how to get the results of my regression analysis into an object. I want to perform the regression analysis row-wise with a window of 20 days. The object Slope should save the results (slopes) of each day's regression over the window.
#Loading Library
require(quantmod)
#Initiation of Example
mc_result <- matrix(sample(c(1:200)), ncol = 200, nrow = 1)
mc_result1 <- matrix(sample(c(1:200)), ncol = 200, nrow = 1)
mc_result <- rbind(mc_result, mc_result1)
a <- c(1:200)
Slope <- matrix(ncol = 2, nrow = 181)
Caution: this loop does not work. It should apply rollapply row-wise and save the results for each day in the object Slope. This is how the result should look, but with changing slope values; at the moment the slope value is constant and I don't know why.
for (i in 1:2) {
  Slope[, i] <- rollapply(data = mc_result[i, ], width = 20,
                          FUN = function(z)
                            summary(lm(mc_result[i, ] ~ a, data = as.data.frame(z)))$coefficients[2],
                          by.column = FALSE)
}
I think what you want is the following. In your code, neither mc_result[i,] nor a rolls over the indices in the data; that's why the regression coefficients don't change: you are fitting on the same data every time, and only z changes. Change the code to something like this:
#Loading Library
require(quantmod)
#Initiation of Example
mc_result <- matrix(sample(c(1:200)), ncol = 200, nrow = 1)
mc_result1 <- matrix(sample(c(1:200)), ncol = 200, nrow = 1)
mc_result <- rbind(mc_result, mc_result1)
a <- c(1:200)
Slope <- matrix(ncol = 2, nrow = 181)
for (i in 1:2) {
  Slope[, i] <- rollapply(data = 1:200, width = 20,
                          FUN = function(z) {
                            summary(lm(mc_result[i, z] ~ a[z]))$coefficients[2]
                          }, by.column = FALSE)
}
head(Slope)
[,1] [,2]
[1,] 1.3909774 2.0278195
[2,] 1.0315789 2.8421053
[3,] 1.5082707 2.8571429
[4,] 0.0481203 1.6917293
[5,] 0.2969925 0.2060150
[6,] 1.3526316 0.6842105
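If you prefer not to index into the outer objects from inside FUN, you can roll over a two-column matrix instead; a minimal sketch using zoo's rollapply (zoo is loaded by quantmod), where Slope2 is an illustrative name:
# Roll directly over a (response, time) matrix for each row;
# each window z is a 20 x 2 sub-matrix with columns y and a
library(zoo)
Slope2 <- sapply(1:2, function(i) {
  rollapply(cbind(y = mc_result[i, ], a = a), width = 20,
            FUN = function(z) coef(lm(y ~ a, data = as.data.frame(z)))[2],
            by.column = FALSE)
})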
I want to calculate correlation statistics using cor.test(). I have a data matrix where the two members of each pair to be tested are on consecutive rows (I have more than a thousand pairs, so I also need to correct for multiple testing later). I was thinking I could loop through the matrix two rows at a time and perform the test (i.e. first test the correlation between row 1 and row 2, then row 3 and row 4, row 5 and row 6, etc.), but I don't know how to write this kind of loop.
This is how I do the test on a single pair:
d = read.table(file="cor-test-sample-data.txt", header=T, sep="\t", row.names = 1)
d = as.matrix(d)
cor.test(d[1,], d[2,], method = "spearman")
You could try
res <- lapply(split(seq_len(nrow(mat1)), (seq_len(nrow(mat1))-1) %/% 2 + 1),
              function(i) {
                m1 <- mat1[i, ]
                if (NROW(m1) == 2) {
                  cor.test(m1[1, ], m1[2, ], method = "spearman")
                } else NA
              })
To get the p-values
resP <- sapply(res, function(x) x$p.value)
indx <- t(`dim<-`(seq_len(nrow(mat1)), c(2, nrow(mat1)/2)))
names(resP) <- paste(indx[,1], indx[,2], sep="_")
resP
# 1_2 3_4 5_6 7_8 9_10 11_12 13_14
#0.89726818 0.45191660 0.14106085 0.82532260 0.54262680 0.25384239 0.89726815
# 15_16 17_18 19_20 21_22 23_24 25_26 27_28
#0.02270217 0.16840791 0.45563229 0.28533447 0.53088721 0.23453161 0.79235990
# 29_30 31_32
#0.01345768 0.01611903
Or using mapply (assuming the number of rows is even):
ind <- seq(1, nrow(mat1), by = 2) # similar to the one used by #CathG in the for loop
mapply(function(i, j) cor.test(mat1[i, ], mat1[j, ],
                               method = 'spearman')$p.value, ind, ind + 1)
data
set.seed(25)
mat1 <- matrix(sample(0:100, 20*32, replace=TRUE), ncol=20)
Try
d = matrix(rep(1:9, 3), ncol = 3, byrow = T)
sapply(2*(1:(nrow(d)/2)), function(pair)
  unname(cor.test(d[pair-1, ], d[pair, ], method = "spearman")$estimate))
pvalues <- c()
for (i in seq(1, nrow(d), by = 2)) {
  pvalues <- c(pvalues, cor.test(d[i, ], d[i+1, ], method = "spearman")$p.value)
}
names(pvalues) <- paste(row.names(d)[seq(1, nrow(d), by = 2)],
                        row.names(d)[seq(2, nrow(d), by = 2)], sep = "_")