Vectorized calculation of adjacency matrix - r

I have the following function:
CFC_GLM <- function(data, frequency_bins){
adj_mat <- matrix(0, nrow = dim(data)[1], ncol = dim(data)[1])
bf_filters <- list()
combs <- combinations(length(frequency_bins), 2, repeats.allowed = T)
all_adj_mat <- list()
for(z in 1:length(frequency_bins)){
bf_filters[[z]] <- butter(3, c(frequency_bins[[z]][1]/1200,
frequency_bins[[z]][2]/1200), type = "pass")
}
for(f in 1:nrow(combs)){
for(i in 1:dim(data)[1]){
for(j in 1:dim(data)[1]){
sensor_1 <- data[i,]
sensor_2 <- data[j,]
sensor_1_filt = filtfilt(bf_filters[[combs[f,1]]], sensor_1)
sensor_2_filt = filtfilt(bf_filters[[combs[f,2]]], sensor_2)
a_y <- abs(hilbert(sensor_2_filt, 1200))
a_x <- abs(hilbert(sensor_1_filt, 1200))
theta_x <- angle(hilbert(sensor_1_filt, 1200)) %% 2*pi
a_x_norm <- (a_x - mean(a_x))/std(a_x)
a_y_norm <- (a_y - mean(a_y))/std(a_y)
theta_x_norm <- (theta_x - mean(theta_x))/std(theta_x)
fit <- lm(a_y_norm ~ sin(theta_x_norm) + cos(theta_x_norm) +
a_x_norm)
summ <- summary(fit)
r <- sqrt(summ$r.squared)
adj_mat[i,j] <- r
}
}
all_adj_mat[[f]] <- adj_mat
}
return(all_adj_mat)
}
Just to summarize, the function takes a matrix of signals (246 sensors by 2400 samples), performs some signal processing, and then performs a GLM between every possible pairs of sensors. This process is repeated for 4 frequency bandwidths and their combinations (within and cross-frequency coupling). Right now, this code seems terribly inefficient and takes a really long time to run. Is there a way to vectorize/parallelize this function? I have researched this question extensively and cannot seem to find an answer.
I am not sure whether to make some of the tasks within the function parallel or just make the whole function able to be called by parApply (vectorized). My intuition is the latter but I am not sure how to approach this. Any help is greatly appreciated.
Reproducible Example
test_data <- c(-347627.104358097, 821947.421444641, 496824.676355433,
-178091.364312102, -358842.250713998, 234666.210462063,
-1274153.04141668,
1017066.42839987, -158388.137875357, 191691.279588641,
-16231.2106151229,
378249.600546794, 1080850.88212858, -688841.640871254,
-616713.991288002,
639401.465180969, -1625802.44142751, 472370.867686569,
-631863.239075449,
-598755.248911174, 276422.966753179, -44010.9403226763,
1569374.08537143,
-1138797.2585617, -824232.849278583, 955783.332556046,
-1943384.98409094,
-54443.829280377, -1040354.44654998, -1207674.05255178,
496481.331429747,
-417435.356472725, 1886817.1254085, -1477199.59091112,
-947353.716505171,
1116336.49812969, -2173805.84111182, -574875.152250742,
-1343996.2219146,
-1492260.06197604, 626856.67540728, -713761.48191904, 1987730.27341334,
-1673384.77863935, -968522.886481198, 1089458.71433614,
-2274932.19262517,
-1096749.79392427, -1520842.86946059, -1390794.61065106,
669864.477272507,
-906096.822125892, 1863506.59188299, -1720956.06310511,
-889359.420058576,
885300.628410276, -2224340.54992297, -1619386.88041896,
-1570131.07127786,
-934848.556063722, 644671.113108699, -973418.329437102,
1541962.53750178,
-1636863.31666018, -728992.972371437, 551297.997356909,
-2026413.5471505,
-2129730.49230266, -1511423.25789691, -236962.889589694,
580683.399845852,
-906261.700784793, 1080101.95011954, -1455931.89179814,
-518630.187846405,
158846.288141661, -1715610.22092989, -2601349.5081924,
-1380068.64260811,
541310.557194977, 509125.333244057, -711696.682554995,
551748.792106809,
-1222430.29467688, -293847.487823853, -215078.751157158,
-1354005.89576504,
-2997647.23289805, -1220136.14918605, 1231169.98678596,
455388.081391798,
-415489.975542684, 32724.7895795912, -980848.930757441,
-86618.5594163355,
-506333.915891838, -1022235.58829567, -3279232.01820961,
-1076344.95091665,
1696655.88400158), .Dim = c(10L, 10L))
frequency_bins <- list(band1 = c(2,4), band2 = c(4,12), band3 =
c(12,30), band4 = c(30,100))
system.time(test_result <- CFC_GLM(test_data, frequency_bins))
user system elapsed
1.839 0.009 1.849
I'm not sure how to include the result in a manageable way. Sorry for the naivety. This is only with 10 sensors by 10 samples, to have a manageable test set.

Right off the bat I would suggest predeclaring the length of your lists.
bf_filters <- rep(list(NA), length(frequency_bins))
all_adj_mat <- rep(list(NA), nrow(combos))
#this is your function to be applied
i_j_fun <- function ( perms ) {
sensor_1_filt = filtfilt(bf_filters[[combos[f,1]]], data[perms[1],])
sensor_2_filt = filtfilt(bf_filters[[combos[f,2]]], data[persm[2],])
a_y <- abs(hilbert(sensor_2_filt, 1200))
a_x <- abs(hilbert(sensor_1_filt, 1200))
theta_x <- angle(hilbert(sensor_1_filt, 1200)) %% 2*pi
a_x_norm <- (a_x - mean(a_x))/std(a_x)
a_y_norm <- (a_y - mean(a_y))/std(a_y)
theta_x_norm <- (theta_x - mean(theta_x))/std(theta_x)
fit <- lm(a_y_norm ~ sin(theta_x_norm) + cos(theta_x_norm) +
a_x_norm)
summ <- summary(fit)
r <- sqrt(summ$r.squared)
return(r)
}
Your i and j for loops can be turned into a function and used with apply.
#perms acts like the for loop
perms <- permuations(dim(data)[1], 2, seq_len(dim(data)[1]))
for(f in 1:nrow(combs)){
all_adj_mat[[f]] <- matrix(apply(perms, 1, i_j_fun),
nrow = dim(data)[1], ncol = dim(data[2]), byrow = TRUE)
}
That should do it.

Related

How to create multiple repeating data frame or matrices in R?

I am trying to make multiple data frames (df_1,,, df_N) with the same structure.
For now, I have made them all individually, but I imagine there should be more efficient way of writing the codes.
Below are the matrices I created (only three for now, but can be more than 100 later on)
quantileMatrix_1 <- matrix(NA,nrow=ncol(outDf_1), ncol = 3)
for(jj in 1:ncol(outDf_1)){
quantiles <- outDf_1[,jj] %>% quantile(probs=c(.5,.025,.975))
quantileMatrix_1[jj,] <- quantiles
}
quantileMatrix_2 <- matrix(NA,nrow=ncol(outDf_2), ncol = 3)
for(jj in 1:ncol(outDf_2)){
quantiles <- outDf_2[,jj] %>% quantile(probs=c(.5,.025,.975))
quantileMatrix_2[jj,] <- quantiles
}
quantileMatrix_3 <- matrix(NA,nrow=ncol(outDf_3), ncol = 3)
for(jj in 1:ncol(outDf_3)){
quantiles <- outDf_3[,jj] %>% quantile(probs=c(.5,.025,.975))
quantileMatrix_3[jj,] <- quantiles
}
I would use another for loop, to put every df in a list.
my_matrix <- list()
for (d in 1:100) {
quantileMatrix_d <- matrix(NA,nrow=ncol(outDf_1), ncol = 3)
for(jj in 1:ncol(outDf_1)){
quantiles <- outDf_1[,jj] %>% quantile(probs=c(.5,.025,.975))
quantileMatrix_d[jj,] <- quantiles
}
my_matrix[[d]] <- quantileMatrix_d
}

Forecasting More than One Column

How do I run this for each of the 50 columns I have instead of one at a time?
#Chosen vector creation
IMBFM <- as.numeric(data$IMBFM)
#Hidden layers creation
alpha <- 1.5^(-10)
hn <- length(IMBFM)/(alpha*(length(IMBFM)+30))
#Fitting nnetar
lambda <- BoxCox.lambda(IMBFM)
dnn_pred <- nnetar(IMBFM, size= hn, lambda = lambda)
#Fitting nnetar
dnn_forecast <- forecast(dnn_pred, h= 30, PI = TRUE)
dnn_forecast
plot(dnn_forecast)
Create a function that takes your column, and returns a list of the forecast and the plot
f <- function(x) {
x <- as.numeric(x)
alpha <- 1.5^(-10)
hn <- length(x)/(alpha*(length(x)+30))
lambda <- BoxCox.lambda(x)
dnn_pred <- nnetar(x, size= hn, lambda = lambda)
dnn_forecast <- forecast(dnn_pred, h= 30, PI = TRUE)
return(
list("forecast" = dnn_forecast, "plot" = plot(dnn_forecast))
)
}
Create a vector of your columns of interest / many ways to do this; this is the manual way, but your might be able to use a regex on colnames(data) to select the ones of interest, depending on the names
mycols = c("IBMF", "col2", "col3", ... "col50")
Use lapply to apply the function to each element of mycols
result = lapply(mycols, function(col) data[[col]])

Faster way to select top values and rowmeans

Hello I am trying to speed up a block of code that is currently working, but is quite slow with the amount of data that I have. I need to identify the top n% highest value in a row and subsequently use this to make an average by subsetting a dataframe and averaging the values of the subset. Any help or suggestions would be appreciated. This is my current approach:
corrMat <- matrix(runif(944*9843), nrow=944, ncol = 9843)
GeneExpression <- matrix(runif(11674*9843, min=0, max=100), nrow = 11674, ncol=9843)
cutOff <- apply(corrMat, MARGIN = 1, FUN = quantile, 0.99)
topCells <- corrMat > cutOff
data <- matrix(, nrow = nrow(topCells), ncol = nrow(GeneExpression))
colnames(data) <- rownames(GeneExpression)
for(i in colnames(data)){
for(j in 1:nrow(topCells)){
data[j,i] <- mean(t(GeneExpression[i, topCells[j,]]))
}
}
data
Here's a smaller version of your example along with my base R solution. Chances are there's also a neat tidyverse way of doing this but I wouldn't know.
corrMat <- matrix(runif(24*18), nrow=24)
GeneExpression <- matrix(runif(36*18, min=0, max=100), nrow = 36)
cutOff <- apply(corrMat, MARGIN = 1, FUN = quantile, 0.99)
topCells <- corrMat > cutOff
data <- data2 <- matrix(, nrow = nrow(topCells), ncol = nrow(GeneExpression))
colnames(data) <- rownames(GeneExpression) # rownames are NULL so this is not needed
start <- Sys.time() # benchmarking
for(i in 1:ncol(data)){ # iterate by column rather than colname
for(j in 1:nrow(topCells)){
data[j,i] <- mean(t(GeneExpression[i, topCells[j,]]))
}
}
eric <- Sys.time() - start
start <- Sys.time()
# apply over rows of topCells to take row means of GeneExpression
# per row of topCells
# then just transpose
data2 <- t(apply(topCells, 1, function(x) rowMeans(GeneExpression[, x, drop = F])))
milan <- Sys.time() - start
all(data == data2)
[1] TRUE
eric
Time difference of 0.08776498 secs
milan
Time difference of 0.02593184 secs
Using your original example data, my solution takes 6.43s to run.
Hope this helps.

How to extract the p.value and estimate from cor.test() in a data.frame?

In this example, I have temperatures values from 50 different sites, and I would like to correlate the Site1 with all the 50 sites. But I want to extract only the components "p.value" and "estimate" generated with the function cor.test() in a data.frame into two different columns.
I have done my attempt and it works, but I don't know how!
For that reason I would like to know how can I simplify my code, because the problem is that I have to run two times a Loop "for" to get my results.
Here is my example:
# Temperature data
data <- matrix(rnorm(500, 10:30, sd=5), nrow = 100, ncol = 50, byrow = TRUE,
dimnames = list(c(paste("Year", 1:100)),
c(paste("Site", 1:50))) )
# Empty data.frame
df <- data.frame(label=paste("Site", 1:50), Estimate="", P.value="")
# Extraction
for (i in 1:50) {
df1 <- cor.test(data[,1], data[,i] )
df[,2:3] <- df1[c("estimate", "p.value")]
}
for (i in 1:50) {
df1 <- cor.test(data[,1], data[,i] )
df[i,2:3] <- df1[c("estimate", "p.value")]
}
df
I will appreciate very much your help :)
I might offer up the following as well (masking the loops):
result <- do.call(rbind,lapply(2:50, function(x) {
cor.result<-cor.test(data[,1],data[,x])
pvalue <- cor.result$p.value
estimate <- cor.result$estimate
return(data.frame(pvalue = pvalue, estimate = estimate))
})
)
First of all, I'm guessing you had a typo in your code (you should have rnorm(5000 if you want unique values. Otherwise you're going to cycle through those 500 numbers 10 times.
Anyway, a simple way of doing this would be:
data <- matrix(rnorm(5000, 10:30, sd=5), nrow = 100, ncol = 50, byrow = TRUE,
dimnames = list(c(paste("Year", 1:100)),
c(paste("Site", 1:50))) )
# Empty data.frame
df <- data.frame(label=paste("Site", 1:50), Estimate="", P.value="")
estimates = numeric(50)
pvalues = numeric(50)
for (i in 1:50){
test <- cor.test(data[,1], data[,i])
estimates[i] = test$estimate
pvalues[i] = test$p.value
}
df$Estimate <- estimates
df$P.value <- pvalues
df
Edit: I believe your issue was is that in the line df <- data.frame(label=paste("Site", 1:50), Estimate="", P.value="") if you do typeof(df$Estimate), you see it's expecting an integer, and typeof(test$estimate) shows it spits out a double, so R doesn't know what you're trying to do with those two values. you can redo your code like thus:
df <- data.frame(label=paste("Site", 1:50), Estimate=numeric(50), P.value=numeric(50))
for (i in 1:50){
test <- cor.test(data[,1], data[,i])
df$Estimate[i] = test$estimate
df$P.value[i] = test$p.value
}
to make it a little more concise.
similar to the answer of colemand77:
create a cor function:
cor_fun <- function(x, y, method){
tmp <- cor.test(x, y, method= method)
cbind(r=tmp$estimate, p=tmp$p.value) }
apply through the data.frame. You can transpose the result to get p and r by row:
t(apply(data, 2, cor_fun, data[, 1], "spearman"))

Strange bahaviour with lapply, lazy evaluation?

can someone explain me what is going on here?
I have a list of lists that I need to match with a table, and I am using lapply with fmatch (package fastmatch http://cran.r-project.org/web/packages/fastmatch/index.html) for that (which I think uses matching by hashing the table to be matched with, in contrast to match).
However, this is rather slow if table values have to be evaluated in the function (at least that's what I suspect), but I am not entirely sure.
I have found a workaround which speeds up the computation from 5.5 to 0.01s, but would like a more elegant solution.
Here is a reproducible example:
set.seed(10)
matchFeatures <- replicate(n = 1000, paste0("a", sample(x = 1:10000, size = sample(x = 1:10, size = 1))))
matchTable <- 1:10000
system.time(m1 <- lapply(matchFeatures, function(features) fmatch(features, paste0("a", 1:10000))))
system.time(m2 <- lapply(matchFeatures, function(features) force(fmatch(features, paste0("a", 1:10000)))))
system.time({tempTable <- paste0("a", 1:10000); m3 <- lapply(matchFeatures, function(features) fmatch(features, tempTable))})
identical(m1, m3)
Thanks Justin, just to follow up, I was looking for something like this:
system.time(m4 <- lapply(matchFeatures, fmatch, table = paste0("a", 1:10000)))
In the first two functions, you're running the paste command once for each iteration (i.e. 10000 times). In the third, it only happens once. If you use matchTable <- paste('a', 1:10000) and pass matchTable to all three versions you get a substantial speed up as expected.
matchFeatures <- replicate(n = 1000,
paste0("a",
sample(x = 1:10000,
size = sample(x = 1:10, size = 1))))
matchTable <- paste('a', 1:10000)
system.time(m1 <- lapply(matchFeatures,
function(features) fmatch(features, matchTable)))
system.time(m2 <- lapply(matchFeatures,
function(features) force(fmatch(features, matchTable))))
system.time(m3 <- lapply(matchFeatures,
function(features) fmatch(features, matchTable)))
identical(m1, m3)

Resources