Broken stick (or piecewise) regression with 2 breakpoints - r

I want to estimate two breakpoints of a function from the following data:
df = data.frame (x = 1:180,
y = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 2, 2, 4, 2, 2, 3, 2, 1, 2,0, 1, 0, 1, 4, 0, 1, 2, 3, 1, 1, 1, 0, 2, 0, 3, 2, 1, 1, 1, 1, 5, 4, 2, 1, 0, 2, 1, 1, 2, 0, 0, 2, 2, 1, 1, 1, 0, 0, 0, 0,
2, 3, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
# plotting y ~ x
plot(df)
I know that the function has two breakpoints, such that:
y = y1 if x < b1;
y = y2 if b1 < x < b2;
y = y3 if b2 < x.
I want to find b1 and b2 in order to fit this kind of rectangular (step) function to the data.
Can anyone help me or point me in the right direction? Thanks!
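For reference, the three-level step model above can be written out explicitly; a minimal sketch (the breakpoints and levels used below are eyeballed placeholders, not estimates):
# piecewise-constant mean with two breakpoints: y1 before b1, y2 between b1 and b2, y3 after b2
step_mean <- function(x, b1, b2, y1, y2, y3) {
  ifelse(x < b1, y1, ifelse(x < b2, y2, y3))
}
# overlay a guessed step function on the data
plot(df)
lines(df$x, step_mean(df$x, b1 = 40, b2 = 100, y1 = 0, y2 = 1.4, y3 = 0), col = "red")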

1) kmeans Try kmeans like this:
set.seed(123)
km <- kmeans(df, 3, nstart = 25)
> fitted(km, "classes") # or equivalently km$cluster
[1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[112] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
> unique(fitted(km, "centers")) # or except for order km$centers
x y
3 30.5 0.5166667
1 90.5 0.9000000
2 150.5 0.0000000
> # groups are x = 1-60, 61-120 and 121-180
> simplify2array(tapply(df$x, km$cluster, range))
1 2 3
[1,] 61 121 1
[2,] 120 180 60
plot(df, col = km$cluster)
lines(fitted(km)[, "y"] ~ x, df)
2) brute force Another option is a brute-force search: fit a linear model for every possible pair of breakpoints and choose the pair whose residual sum of squares is smallest.
grid <- subset(expand.grid(b1 = 1:180, b2 = 1:180), b1 < b2)
# the groups are [1, b1], (b1, b2], (b2, Inf)
fit <- function(b1, b2, x, y) {
  grp <- factor((x > b1) + (x > b2))
  lm(y ~ grp)
}
dv <- function(...) deviance(fit(...))
wx <- which.min(mapply(dv, grid$b1, grid$b2, MoreArgs = df))
grid[wx, ]
## b1 b2
## 14264 44 80
plot(df)
lines(fitted(fit(grid$b1[wx], grid$b2[wx], df$x, df$y)) ~ x, df)

I can see that y consists of integer counts, so this is perhaps best modelled with a Poisson or binomial likelihood. Here is a solution using the R package mcp:
# Three intercept segments
model = list(
  y ~ 1,
  ~ 1,
  ~ 1
)
library(mcp)
fit = mcp(model, df, family = poisson(), par_x = "x", adapt = 2000)
plot(fit)
Notice that mcp is one of the only packages that estimates the uncertainty around the change points and the parameter estimates. The summary shows where the change points are estimated to be (cp_1 and cp_2) as well as the other parameters (on a log scale, since that is the default link function for Poisson models):
summary(fit)
Family: poisson(link = 'log')
Iterations: 9000 from 3 chains.
Segments:
1: y ~ 1
2: y ~ 1 ~ 1
3: y ~ 1 ~ 1
Population-level parameters:
name mean lower upper Rhat n.eff
cp_1 39.57 37.8 45.00 1 54
cp_2 99.82 99.0 101.21 1 2211
int_1 -4.00 -6.5 -1.88 1 577
int_2 0.32 0.1 0.54 1 6288
int_3 -11.02 -20.9 -3.56 1 2487
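Since these intercepts are reported on the log (link) scale, a quick sanity check is to back-transform the posterior means with exp() to get the expected counts per observation in each segment:
exp(c(int_1 = -4.00, int_2 = 0.32, int_3 = -11.02))
# approximately 0.018, 1.38 and 0.000016 expected events per observation in segments 1, 2 and 3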

Related

compare rows and fill gaps with previous values in a data table

I have data table that looks like this:
library(data.table)
data <- data.table(time = seq(0, 14),
                   anom = c(0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0),
                   gier = c(0, 0, 0, 4, 9, 7, 2, 0, 3, 1, 4, 2, 0, 0, 0))
Now I want to fill the gaps (zeros) with ones in column anom so that the result looks like this:
res <- data.table(time = seq(0, 14),
                  anom = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0),
                  gier = c(0, 0, 0, 4, 9, 7, 2, 0, 3, 1, 4, 2, 0, 0, 0))
In addition, I want to be flexible about the gap size, so I can choose how big a gap may be and still get filled. There should be an easy way to do something like LOCF for real values (perhaps filling with the previous value of the row rather than only ones or zeros), not just for NAs as the functions fill or na.locf do.
An example using the maxgap argument to select the maximum gap size:
library(zoo)
na.fill(
  na.locf(
    replace(data$anom, data$anom == 0, NA),  # treat zeros as missing
    na.rm = FALSE,
    maxgap = 2                               # only fill runs of at most 2 zeros
  ),
  0                                          # anything left unfilled goes back to 0
)
[1] 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0
Here is another option using rolling join:
maxgap <- 1L
data[, c("rn", "lu") := .(.I, anom)]
# for each zero row, carry the nearest preceding non-zero anom forward by at most maxgap rows
data[anom == 0L, lu := fcoalesce(
  data[anom != 0L][.SD, on = .(rn = rn), roll = maxgap, rollends = c(FALSE, FALSE), x.anom],
  anom)
]
output:
time anom gier rn lu
1: 0 0 0 1 0
2: 1 0 0 2 0
3: 2 0 0 3 0
4: 3 1 4 4 1
5: 4 1 9 5 1
6: 5 1 7 6 1
7: 6 0 2 7 1
8: 7 0 0 8 0
9: 8 1 3 9 1
10: 9 0 1 10 1
11: 10 1 4 11 1
12: 11 1 2 12 1
13: 12 0 0 13 0
14: 13 0 0 14 0
15: 14 0 0 15 0

Issues with plotting network in igraph

I am having some issues realizing a bipartite network in R with the igraph library. Here is my script:
library(igraph)
library(reshape2)
setwd("....")
getwd()
library(readxl)
network=read_excel("network1.xlsx")
print(network)
subjects=as.character(unlist(network[,1]))
agents=colnames(network[-1])
print(network)
network = network[,-1]
g=graph.incidence(network, weighted = T)
V(g)$type
V(g)$name=c(subjects,agents)
V(g)$color = V(g)$type
V(g)$color=gsub("FALSE","red",V(g)$color)
V(g)$color=gsub("TRUE","lightblue",V(g)$color)
plot(g, edge.arrow.width = 0.3,
     vertex.size = 5,
     edge.arrow.size = 0.5,
     vertex.size2 = 5,
     vertex.label.cex = 1,
     vertex.label.color = "black",
     asp = 0.35,
     margin = 0,
     edge.color = "grey",
     edge.width = E(g)$weight,
     layout = layout_as_bipartite)
The network is plotted properly, as you can see in the resulting figure; however, I have two issues:
(1) I don't understand the order in which the vertices are shown in the plot. They are neither in the same order as in the Excel file nor in alphabetical or numerical order; they seem to be in random order. How can I choose the order in which the vertices are placed?
(2) I don't understand why some vertices are closer together and some are farther apart. I would like all vertices to be equally spaced. How can I do that?
Thank you a lot for your invaluable help.
Since you do not provide your data, I will illustrate with a made-up example.
Sample graph data
library(igraph)
set.seed(123)
EL = matrix(c(sample(8, 18, replace = TRUE),
              sample(LETTERS[1:6], 18, replace = TRUE)), ncol = 2)
g = simplify(graph_from_edgelist(EL))
V(g)$type = bipartite_mapping(g)$type
VCol = c("#FF000066", "#0000FF66")[as.numeric(V(g)$type)+1]
plot(g, layout=layout_as_bipartite(g), vertex.color=VCol)
As with your graph, this has two problems. The nodes are ordered arbitrarily
and the lower row is oddly spaced. Let's address those problems one at a time.
To do so, we will need to take control of the layout instead of using any of
the automated layout functions. A layout is simply a vcount(g) * 2 matrix
giving the x-y coordinates of the vertices for plotting. Here, I will put one
type of nodes in the top row by specifying the y coordinate as 1 and the other
nodes in a lower row by specifying y=0. We want to specify the order horizontally
by rank (alphabetically) within each group. So
LO = matrix(0, nrow=vcount(g), ncol=2)
LO[!V(g)$type, 2] = 1
LO[V(g)$type, 1] = rank(V(g)$name[V(g)$type])
LO[!V(g)$type, 1] = rank(V(g)$name[!V(g)$type])
plot(g, layout=LO, vertex.color=VCol)
Now both rows are ordered and evenly spaced, but because there are fewer
vertices in the bottom row, there is an unattractive, unbalanced look. We
can fix that by stretching the bottom row. I find it easier to make the right
scale factor if the coordinates go from 0 to (number of nodes) - 1 rather than
1 to (number of nodes) as above. Doing this, we get
LO[V(g)$type, 1] = rank(V(g)$name[V(g)$type]) - 1
LO[!V(g)$type, 1] = (rank(V(g)$name[!V(g)$type]) - 1) *
(sum(V(g)$type) - 1) / (sum(!V(g)$type) - 1)
plot(g, layout=LO, vertex.color=VCol)
Thank you a lot. I ran your very helpful example, and with step one it worked properly with my data, keeping the different edge thicknesses and everything else as in my plot, but now with the proper order. This is very important, thank you. However, I have some trouble understanding how to rescale the top and bottom rows properly with my data, because they always seem to be too close together; I probably have not completely understood which coordinates I have to work on. Here are my data.
> network = read_excel("network1.xlsx", 2)
> dput(network)
structure(list(`NA` = c(2333, 2439, 2450, 2451, 2452, 2453, 2454,
2455, 2456, 2457, 2458, 2459, 2460, 2461, 2480, 2490, 2491, 2492,
2493, 2494, 2495), A = c(12, 2, 2, 5, 2, 0, 5, 3, 0, 0, 7, 0,
0, 0, 6, 2, 10, 7, 1, 2, 5), B = c(0, 1, 0, 1, 0, 0, 2, 0, 0,
0, 0, 0, 1, 0, 5, 0, 2, 0, 0, 0, 0), C = c(0, 0, 0, 0, 1, 0,
4, 0, 0, 0, 0, 1, 0, 0, 2, 0, 4, 4, 2, 1, 0), D = c(2, 0, 0,
0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 7, 0, 4, 0, 1, 4, 0), E = c(11,
2, 3, 3, 3, 8, 3, 6, 4, 1, 1, 0, 12, 0, 5, 0, 4, 6, 4, 8, 9),
F = c(2, 0, 0, 3, 1, 0, 10, 1, 0, 0, 0, 1, 0, 0, 9, 0, 0,
1, 1, 3, 3), G = c(0, 3, 1, 1, 0, 0, 0, 0, 0, 3, 2, 0, 0,
0, 1, 0, 0, 2, 0, 1, 0), H = c(0, 0, 2, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1), I = c(0, 0, 0, 0, 0,
0, 3, 0, 6, 3, 0, 0, 1, 0, 7, 0, 0, 4, 1, 2, 0), J = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-21L), .Names = c(NA, "A", "B", "C", "D", "E", "F", "G", "H",
"I", "J"))
> print(network)
NA A B C D E F G H I J
1 2333 12 0 0 2 11 2 0 0 0 0
2 2439 2 1 0 0 2 0 3 0 0 0
3 2450 2 0 0 0 3 0 1 2 0 0
4 2451 5 1 0 0 3 3 1 0 0 0
5 2452 2 0 1 0 3 1 0 0 0 0
6 2453 0 0 0 0 8 0 0 0 0 1
7 2454 5 2 4 2 3 10 0 1 3 0
8 2455 3 0 0 0 6 1 0 0 0 0
9 2456 0 0 0 0 4 0 0 0 6 0
10 2457 0 0 0 0 1 0 3 0 3 0
11 2458 7 0 0 0 1 0 2 0 0 0
12 2459 0 0 1 0 0 1 0 0 0 0
13 2460 0 1 0 0 12 0 0 0 1 0
14 2461 0 0 0 0 0 0 0 0 0 0
15 2480 6 5 2 7 5 9 1 2 7 1
16 2490 2 0 0 0 0 0 0 0 0 0
17 2491 10 2 4 4 4 0 0 0 0 0
18 2492 7 0 4 0 6 1 2 0 4 0
19 2493 1 0 2 1 4 1 0 0 1 0
20 2494 2 0 1 4 8 3 1 0 2 0
21 2495 5 0 0 0 9 3 0 1 0 0
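As a follow-up, here is a sketch (untested against the actual spreadsheet) of how the layout above can be applied to the data just posted; it assumes the network tibble printed above, with the first column holding the subject ids. Also note that plot.igraph rescales the layout to the plotting region by default, so when the two rows look too close together it is usually the asp argument (the aspect ratio of the plot) that needs to be increased rather than the layout's y coordinates:
library(igraph)
inc <- as.matrix(network[, -1])                      # incidence matrix: subjects x agents
rownames(inc) <- as.character(unlist(network[, 1]))  # subject ids become vertex names
g <- graph.incidence(inc, weighted = TRUE)
LO <- matrix(0, nrow = vcount(g), ncol = 2)
LO[!V(g)$type, 2] <- 1                               # subjects on the top row, agents on the bottom
LO[V(g)$type, 1] <- rank(V(g)$name[V(g)$type]) - 1
LO[!V(g)$type, 1] <- (rank(V(g)$name[!V(g)$type]) - 1) *
  (sum(V(g)$type) - 1) / (sum(!V(g)$type) - 1)       # stretch so both rows span the same width
plot(g, layout = LO, vertex.label.color = "black",
     edge.color = "grey", edge.width = E(g)$weight,
     asp = 0.7)                                      # a larger asp gives more vertical separation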

Conditional probabilities in a binary matrix given a pair of values as the condition

My data comes from a multiple-choice question where respondents could choose more than one selection (the five selections are different roles they held, such as Role1 is a participant on the IT Committee, or Role2 is a participant on the Budget Committee, etc.). I converted the roles into binary variables where a "1" indicates the respondent selected that role and a "0" indicates they did not select it.
Here is sample data:
structure(list(Role1 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 1), Role2 = c(0, 1, 1, 1, 1, 0, 1, 1, 1,
1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1), Role3 = c(1, 0, 0, 0, 0, 1,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1), Role4 = c(0, 1, 0,
1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0), Role5 = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
Using the bindata package, its condprob function calculates the probability that a respondent held any of the roles given that they held another role.
library(bindata)
# Returns a matrix containing the conditional probabilities, and converts the matrix to a data frame
condlTable.df <- as.data.frame(condprob(SOdata))
My programming question: how can I take any pair of roles and calculate the conditional probability of each of the other roles? For example, if a respondent selected Role1 and Role2 (a pair of roles), what is the probability that they also selected Role3, Role4, or Role5? The ideal output would be like the output of condprob, but conditioned on pairs of selections.
Thank you for your help.
Would the prop.table function give you what you need?
For example:
prop.table(condlTable.df)
This gives you each cell as a proportion of the table's grand total:
Role1 Role2 Role3 Role4 Role5
Role1 0.07097829 0.05977119 0.02241420 0.01867850 0.007471399
Role2 0.07097829 0.07097829 0.01774457 0.01330843 0.008872286
Role3 0.06083853 0.04055902 0.07097829 0.02027951 0.020279510
Role4 0.07097829 0.04258697 0.02839131 0.07097829 0.000000000
Role5 0.04731886 0.04731886 0.04731886 0.00000000 0.070978286
Adding condlTable.df per my comment/question below:
Role1 Role2 Role3 Role4 Role5
Role1 1.0000000 0.8421053 0.3157895 0.2631579 0.1052632
Role2 1.0000000 1.0000000 0.2500000 0.1875000 0.1250000
Role3 0.8571429 0.5714286 1.0000000 0.2857143 0.2857143
Role4 1.0000000 0.6000000 0.4000000 1.0000000 0.0000000
Role5 0.6666667 0.6666667 0.6666667 0.0000000 1.0000000
Here is your original data:
Role1 Role2 Role3 Role4 Role5
1 1 0 1 0 0
2 1 1 0 1 0
3 1 1 0 0 0
4 1 1 0 1 0
5 1 1 0 0 0
6 1 0 1 1 0
7 1 1 0 0 0
8 1 1 0 0 1
9 1 1 0 0 0
10 1 1 1 0 0
11 1 1 0 0 0
12 1 0 0 1 0
13 1 1 1 0 0
14 1 1 0 0 0
15 1 1 0 0 0
16 1 1 0 0 0
17 1 1 1 1 0
18 1 1 0 0 0
19 0 0 1 0 1
20 1 1 1 0 1
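To get at the pairwise condition asked about in the question directly (e.g. the probability of Role3 given that both Role1 and Role2 were selected), a plain tabulation over the conditioning rows works. A minimal sketch, assuming the sample data above is stored in a data frame called SOdata (as in the condprob call):
# P(other role | Role1 = 1 and Role2 = 1): column means over the rows meeting the condition
cond <- SOdata$Role1 == 1 & SOdata$Role2 == 1
colMeans(SOdata[cond, c("Role3", "Role4", "Role5")])
# generalised to every pair of roles
pairs <- combn(names(SOdata), 2, simplify = FALSE)
res <- lapply(pairs, function(p) {
  cond <- SOdata[[p[1]]] == 1 & SOdata[[p[2]]] == 1
  colMeans(SOdata[cond, setdiff(names(SOdata), p), drop = FALSE])
})
names(res) <- sapply(pairs, paste, collapse = " & ")
res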

cumsum according to certain restrictions in r

I have a large dataset of car accidents, and a sample of it is provided below.
accident is a binary variable indicating whether an accident happened.
shift_number is the number of the shift; 0 means the driver is taking a rest and is not on a shift.
time_diff is the amount of time at each observation.
df <- data.frame(
  accident = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1),
  shift_number = c(1, 1, 0, 0, 0, 2, 2, 2, 0, 0, 3, 3, 3, 3, 3),
  time_diff = 3:17
)
I want to measure, for each accident, the total amount of working time since the driver started the current shift. The desired output is:
wanted <- data.frame(
  accident = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1),
  shift_number = c(1, 1, 0, 0, 0, 2, 2, 2, 0, 0, 3, 3, 3, 3, 3),
  time_diff = 3:17,
  cum_time = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 0, 75)
)
Does anyone have ideas on how to solve this problem in R? A data.table or vectorised solution would be preferred, because I have a huge amount of data to deal with.
df$cum_time = 0
accident = which(df$accident == 1)
df$cum_time[accident] <- sapply(accident, function(x) {
  sum(df$time_diff[(which.max(cumsum(df$shift_number[1:x] == 0)) + 1):x])
})
df
# accident shift_number time_diff cum_time
#1 0 1 3 0
#2 0 1 4 0
#3 0 0 5 0
#4 0 0 6 0
#5 0 0 7 0
#6 0 2 8 0
#7 0 2 9 0
#8 0 2 10 0
#9 0 0 11 0
#10 0 0 12 0
#11 0 3 13 0
#12 1 3 14 27
#13 0 3 15 0
#14 0 3 16 0
#15 1 3 17 75
We first set all values of the cum_time variable to 0 and find the indices where an accident occurred. For each of those indices we locate the latest 0 in shift_number (the last rest before index x), sum the time_diff values from the observation after that 0 up to x, and assign the result to the corresponding index.
Use the ave function to compute the cumulative sum of time_diff by shift_number:
cumsum_by_shift <- ave(df$time_diff, df$shift_number, FUN=cumsum)
#[1] 3 7 5 11 18 8 17 27 29 41 13 27 42 58 75
Pick out elements of cumsum_by_shift where accidents occur:
cum_time <- ifelse(df$accident == 1, cumsum_by_shift, 0)
#[1] 0 0 0 0 0 0 0 0 0 0 0 27 0 0 75
Note the use of the vectorized ifelse function.
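Since the question explicitly asks for a data.table or vectorised solution, the same cumulative-sum-by-shift idea translates directly; a minimal data.table sketch, assuming df as defined in the question:
library(data.table)
setDT(df)
# cumulative working time within each shift, reported only on accident rows
df[, cum_time := cumsum(time_diff) * (accident == 1), by = shift_number]
This reproduces the wanted cum_time column (27 and 75 on the two accident rows); the shift_number == 0 group never contributes because no accidents occur during rests.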

Match information from a correlation matrix according to a p-value cutoff

I have used the rcorr function from the Hmisc package to calculate correlations and p-values, then extracted the p-values into the matrix Pvalue and the correlation coefficients into the matrix Rvalue.
Rvalue<-structure(c(1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1), .Dim = c(10L,
10L), .Dimnames = list(c("41699", "41700", "41701", "41702",
"41703", "41704", "41705", "41707", "41708", "41709"), c("41699",
"41700", "41701", "41702", "41703", "41704", "41705", "41707",
"41708", "41709")))
Pvalue<-structure(c(NA, 0, 0, 0, 0.0258814351024321, 0, 0, 0, 0, 0, 0,
NA, 6.70574706873595e-14, 0, 0, 2.1673942640632e-09, 1.08217552696743e-07,
0.0105345133269157, 0, 0, 0, 6.70574706873595e-14, NA, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, NA, 2.22044604925031e-15, 0, 0, 0, 0,
0, 0.0258814351024321, 0, 0, 2.22044604925031e-15, NA, 0, 0,
0, 0.000322310440723728, 0.00298460759118657, 0, 2.1673942640632e-09,
0, 0, 0, NA, 0, 0, 0, 0, 0, 1.08217552696743e-07, 0, 0, 0, 0,
NA, 0, 0, 0, 0, 0.0105345133269157, 0, 0, 0, 0, 0, NA, 0, 0,
0, 0, 0, 0, 0.000322310440723728, 0, 0, 0, NA, 0, 0, 0, 0, 0,
0.00298460759118657, 0, 0, 0, 0, NA), .Dim = c(10L, 10L), .Dimnames = list(
c("41699", "41700", "41701", "41702", "41703", "41704", "41705",
"41707", "41708", "41709"), c("41699", "41700", "41701",
"41702", "41703", "41704", "41705", "41707", "41708", "41709"
)))
Then I converted the correlation matrix to a Boolean (0/1) matrix, in which 1 means a good correlation. Now I want to match the good correlations with the significant p-values; I need an edge list that includes the p-value. I implemented the following code:
n = 1
m = list()
for (i in 1:nrow(Rvalue)) {
  for (j in 1:nrow(Rvalue)) {
    if (i < j & Pvalue[i, j] < 0.05 & Rvalue[i, j] == 1) {
      m[[n]] <- c(rownames(Rvalue)[i], colnames(Rvalue)[j], signif(Pvalue[i, j], digits = 4))
      n = n + 1
    }
  }
  print(i)
}
then the output is:
> m
[[1]]
[1] "41699" "41700" "0"
[[2]]
[1] "41699" "41701" "0"
[[3]]
[1] "41699" "41702" "0"
[[4]]
[1] "41699" "41704" "0"
...
The result is OK, but since the matrices are very big it takes a lot of time. How can I speed up this process? Please note that I need the node names. Are there any related functions?
I have also found two similar questions, but they are not exactly what I need. Thanks in advance.
You could try
indx <- which(Rvalue==1 & Pvalue < 0.05 & !is.na(Pvalue), arr.ind=TRUE)
d1 <- data.frame(rN = row.names(Rvalue)[indx[, 1]],
                 cN = colnames(Rvalue)[indx[, 2]],
                 Pval = signif(Pvalue[indx], digits = 4))
head(d1,2)
# rN cN Pval
#1 41700 41699 0
#2 41701 41699 0
Update
Not sure why you are getting the same result when you change the cutoff. It may be that the p-values are so small that they fall below every cutoff you tried. Here is an example to show that it does return different values. Suppose I create a function from the above code,
f1 <- function(Rmat, Pmat, cutoff) {
  indx <- which(Rmat == 1 & Pmat < cutoff & !is.na(Pmat), arr.ind = TRUE)
  d1 <- data.frame(rN = row.names(Rmat)[indx[, 1]],
                   cN = colnames(Rmat)[indx[, 2]],
                   Pval = signif(Pmat[indx], digits = 4))
  d1
}
f1(R1, P1, 0.05)
# rN cN Pval
#1 B A 0.021
#2 C A 0.018
#3 D A 0.001
#4 A B 0.021
#5 A C 0.018
#6 E C 0.034
#7 A D 0.001
#8 C E 0.034
f1(R1, P1, 0.01)
# rN cN Pval
#1 D A 0.001
#2 A D 0.001
f1(R1, P1, 0.001)
#[1] rN cN Pval
#<0 rows> (or 0-length row.names)
data
set.seed(24)
R1 <- matrix(sample(c(0,1), 5*5, replace=TRUE), 5,5,
dimnames=list(LETTERS[1:5], LETTERS[1:5]))
R1[lower.tri(R1)] <- 0
R1 <- R1+t(R1)
diag(R1) <- 1
set.seed(49)
P1 <- matrix(sample(seq(0,0.07, by=0.001), 5*5, replace=TRUE), 5, 5,
dimnames=list(LETTERS[1:5], LETTERS[1:5]))
P1[lower.tri(P1)] <- 0
P1 <- P1+t(P1)
diag(P1) <- NA
Since your matrix has a large number of rows and columns, it is a good idea to avoid nested for loops. You can instead use the mapply function, which is handier. As FUN, pass a small function that compares an element with the threshold, along the lines of:
myf = function(x) { x < threshold }
Use mapply(FUN = myf, yourMatrix) twice to check whether the elements of the correlation and p-value matrices pass their thresholds. Store the results in two Boolean vectors, P1 and P2, then multiply them element-wise:
myf1 = function(x) { x < 0.05 }
myf2 = function(x) { x > 0.7 }
P1 = mapply(FUN = myf1, matP)
P2 = mapply(FUN = myf2, matR)
P = P1 * P2
The elements of P that equal 1 (TRUE for both conditions) mark the desired pairs of nodes. It will work fine!
And here is the result for your sample:
P1 = mapply(FUN = myf1, Pvalue)
P2 = mapply(FUN = myf2, Rvalue)
P = P1 * P2
 [1] NA  1  1  1  0  1  1  0  1  1  1 NA  0  0  0  0  0  0  1  1  1  0 NA  1  0
[26]  1  1  1  1  1  1  0  1 NA  0  1  1  0  1  1  0  0  0  0 NA  1  0  1  0  0
[51]  1  0  1  1  1 NA  1  1  1  1  1  0  1  1  0  1 NA  1  1  1  0  0  1  0  1
[76]  1  1 NA  0  0  1  1  1  1  0  1  1  0 NA  1  1  1  1  1  0  1  1  0  1 NA
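Note that mapply drops the matrix dimensions, so P above comes back as a plain vector. A small follow-up sketch to recover the node names from it, reusing the Rvalue and Pvalue matrices from the question:
# restore the matrix shape, then look up the row/column names of the retained pairs
dim(P) <- dim(Rvalue)
dimnames(P) <- dimnames(Rvalue)
idx <- which(P == 1, arr.ind = TRUE)
data.frame(rN = rownames(P)[idx[, 1]],
           cN = colnames(P)[idx[, 2]],
           Pval = signif(Pvalue[idx], digits = 4))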
