Related
I have this dataframe and would like to compute a count of zero sequences every time they appear in a row so that the output would be A: 2 4, B:1 2 1, C:2 5, D: 2 3, E: 1 1
df <- data.frame(
A=c(1, 0, 0, 1, 1, 0, 0, 0, 0),
B=c(0, 1, 1, 0, 0, 1, 0, 1, 1),
C=c(0, 0, 1, 1, 0, 0, 0, 0, 0),
D=c(0, 0, 1, 1, 1, 1, 0, 0, 0),
E=c(1, 0, 1, 1, 1, 1, 0, 1, 1)
)
We may use rle by looping over the columns of the data.frame and get the lengths of the 0 values in base R
lapply(df1, function(x) with(rle(x), lengths[!values]))
-output
$A
[1] 2 4
$B
[1] 1 2 1
$C
[1] 2 5
$D
[1] 2 3
$E
[1] 1 1
data
df1 <- structure(list(A = c(1, 0, 0, 1, 1, 0, 0, 0, 0), B = c(0, 1,
1, 0, 0, 1, 0, 1, 1), C = c(0, 0, 1, 1, 0, 0, 0, 0, 0), D = c(0,
0, 1, 1, 1, 1, 0, 0, 0), E = c(1, 0, 1, 1, 1, 1, 0, 1, 1)), row.names = c(NA,
-9L), class = "data.frame")
I got data like this
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
I would like to get the cumulative count of each column for each id and get the data like this
structure(list(id2 = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1_b = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2_b = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
You can get a cumulative sum with cumsum.
To split data.frame into subsets, you can use split and then lapply cumsum over the list of the data.frames and again over the list of the columns, or you can use the ave function which does exactly that:
data = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
data[-1] = ave(data[-1], data$id, FUN=cumsum)
edit:
I assumed that the cumulative sum is requested (as per instructions) and that there is a mistake in the example data. If the example data is correct, then the condition is If the count is zero, don't do cumulative sum and leave at zero or ifelse(x == 0, 0, cumsum(x)) (as per #r2evans). However, this construct doesn't work when applied for the data.frame. A more complex helper function is required:
data[-1] = ave(data[-1], data$id, FUN=function(x){
y = cumsum(x)
y[x == 0] = 0
y
})
We can now compare it with the requested (renamed) data:
result = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2 = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
identical(data, result)
Base R,
ave(df$drug_2, df$id, FUN = function(z) ifelse(z == 0, z, cumsum(z)))
# [1] 0 1 2 3 4 0 5 0 0 1 0 2
Edit Simplified the solution after reading r2evans' approach.
You could use
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with("drug"),
~ifelse(.x == 0, 0, cumsum(.x)))) %>%
ungroup()
This returns
# A tibble: 12 x 3
id drug_1 drug_2
<dbl> <dbl> <dbl>
1 1 0 0
2 1 0 1
3 1 0 2
4 1 0 3
5 1 0 4
6 1 1 0
7 1 2 5
8 2 0 0
9 2 0 0
10 2 1 1
11 2 0 0
12 2 2 2
Base R solution:
# Resolve the names of vectors we want to cumulatively sum:
# drug_vec_names => character vector
drug_vec_names <- grep( "^drug\\_", colnames(df), value = TRUE)
# Resolve the names of vectors we want to keep:
# not_drug_vec_names => character vector
not_drug_vec_names <- names(df)[!(names(df) %in% drug_vec_names)]
# Calculate the result: res => data.frame
res <- setNames(
cbind(
df[,not_drug_vec_names],
replace(
ave(
df[,drug_vec_names],
df[,not_drug_vec_names],
FUN = cumsum
),
df[,drug_vec_names] == 0,
0
)
),
c(not_drug_vec_names, drug_vec_names)
)
If you have binary values (1/0) in drug columns, you can multiply the cumulative sum with itself to get 0 for 0 values.
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with('drug'), ~cumsum(.) * .)) %>%
ungroup
# id drug_1 drug_2
# <dbl> <dbl> <dbl>
# 1 1 0 0
# 2 1 0 1
# 3 1 0 2
# 4 1 0 3
# 5 1 0 4
# 6 1 1 0
# 7 1 2 5
# 8 2 0 0
# 9 2 0 0
#10 2 1 1
#11 2 0 0
#12 2 2 2
I need to transform main diagonal
{matrix(
1 1 1 1,
0 2 2 2,
0 0 3 3,
0 0 0 4)
}
into:
{matrix(
0 0 0 1,
0 0 1 2,
0 1 2 3,
1 2 3 4)
}
I tried all operators I could find t(), arev(), flipud(), apply(x,2,rev) and so on. Without a positive result. Hope you can help me.
Does this work for you? Takes each column and 'rotates' (for lack of a better word) x places, where x is the column index.
res <- sapply(1:ncol(input),function(x){
#get relevant column
base <- input[,x]
n <- length(base)
indices <- 1:n
#reshuffle indices: first above x, then below x
out <- base[c(indices[indices>x],indices[indices<=x])]
out
})
all(res==output)
[1] TRUE
data used:
input <- structure(c(1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 2, 3, 4), .Dim = c(4L,
4L))
output <- structure(c(0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4), .Dim = c(4L,
4L))
I have used rcorr function of Hmisc library for calculation of correlations and p-values. Then extracted pvalues to Pval matrix and correlation coefficients to corr matrix.
Rvalue<-structure(c(1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1), .Dim = c(10L,
10L), .Dimnames = list(c("41699", "41700", "41701", "41702",
"41703", "41704", "41705", "41707", "41708", "41709"), c("41699",
"41700", "41701", "41702", "41703", "41704", "41705", "41707",
"41708", "41709")))
> Pvalue<-structure(c(NA, 0, 0, 0, 0.0258814351024321, 0, 0, 0, 0, 0, 0,
NA, 6.70574706873595e-14, 0, 0, 2.1673942640632e-09, 1.08217552696743e-07,
0.0105345133269157, 0, 0, 0, 6.70574706873595e-14, NA, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, NA, 2.22044604925031e-15, 0, 0, 0, 0,
0, 0.0258814351024321, 0, 0, 2.22044604925031e-15, NA, 0, 0,
0, 0.000322310440723728, 0.00298460759118657, 0, 2.1673942640632e-09,
0, 0, 0, NA, 0, 0, 0, 0, 0, 1.08217552696743e-07, 0, 0, 0, 0,
NA, 0, 0, 0, 0, 0.0105345133269157, 0, 0, 0, 0, 0, NA, 0, 0,
0, 0, 0, 0, 0.000322310440723728, 0, 0, 0, NA, 0, 0, 0, 0, 0,
0.00298460759118657, 0, 0, 0, 0, NA), .Dim = c(10L, 10L), .Dimnames = list(
c("41699", "41700", "41701", "41702", "41703", "41704", "41705",
"41707", "41708", "41709"), c("41699", "41700", "41701",
"41702", "41703", "41704", "41705", "41707", "41708", "41709"
)))
Then I converted corr matrix to Boolean matrix (0,1) which number one means good correlation. Then I want to math good correlations with significant pvalues. I need an edge list including the p-value. I implemented following code:
n=1
m=list()
for(i in 1:nrow(Rvalue))
{
for (j in 1:nrow(Rvalue))
{
if (i<j & Pvalue[i,j]<0.05 & Rvalue[i,j]==1)
{
m[[n]]<-c(rownames(Rvalue)[i], colnames(Rvalue)[j], signif(Pvalue[i,j], digits = 4))
n=n+1
}
}
print(i)
}
then, then output is:
> m
[[1]]
[1] "41699" "41700" "0"
[[2]]
[2] "41699" "41701" "0"
[[3]]
[3] "41699" "41702" "0"
[[4]]
[4] "41699" "41704" "0"
...
Result is OK, but since the matrices are very big, it needs much time. How can I speed up this process? Please note that I need node names. Is there any related functions?
I also have found two similar questions but not exactly what I needed (+ and +). Thanks in advance.
You could try
indx <- which(Rvalue==1 & Pvalue < 0.05 & !is.na(Pvalue), arr.ind=TRUE)
d1 <- data.frame(rN=row.names(Rvalue)[indx[,1]],
cN=colnames(Rvalue)[indx[,2]], Pval=signif(Pvalue[indx],
digits=4))
head(d1,2)
# rN cN Pval
#1 41700 41699 0
#2 41701 41699 0
Update
Not sure why you are getting the same result when you change the cutoff. It may be possible that the P values may be too small that it would be TRUE in the cutoffs you tried. Here is an example to show that it does return different values. Suppose, I create a function from the above code,
f1 <- function(Rmat, Pmat, cutoff){
indx <- which(Rmat==1 & Pmat < cutoff & !is.na(Pmat), arr.ind=TRUE)
d1 <- data.frame(rN=row.names(Rmat)[indx[,1]],
cN=colnames(Rmat)[indx[,2]], Pval=signif(Pmat[indx],
digits=4))
d1}
f1(R1, P1, 0.05)
# rN cN Pval
#1 B A 0.021
#2 C A 0.018
#3 D A 0.001
#4 A B 0.021
#5 A C 0.018
#6 E C 0.034
#7 A D 0.001
#8 C E 0.034
f1(R1, P1, 0.01)
# rN cN Pval
#1 D A 0.001
#2 A D 0.001
f1(R1, P1, 0.001)
#[1] rN cN Pval
#<0 rows> (or 0-length row.names)
data
set.seed(24)
R1 <- matrix(sample(c(0,1), 5*5, replace=TRUE), 5,5,
dimnames=list(LETTERS[1:5], LETTERS[1:5]))
R1[lower.tri(R1)] <- 0
R1 <- R1+t(R1)
diag(R1) <- 1
set.seed(49)
P1 <- matrix(sample(seq(0,0.07, by=0.001), 5*5, replace=TRUE), 5, 5,
dimnames=list(LETTERS[1:5], LETTERS[1:5]))
P1[lower.tri(P1)] <- 0
P1 <- P1+t(P1)
diag(P1) <- NA
Since your matrix has a large number of columns and rows, that would be a good idea to avoid simultaneous "for loop". You can instead use mapply function which is more handy.
mapply(FUN = NULL , ...)
instead of FUN use the following function:
myf= function(x){ x "les then threshold"}
You can use mapply(FUN = myf , "Your Matrix") twice to check if the elements of two correlation and pvalue matrices agree with threshold.
Store the results in two boolean matrices, P1 and P2. Then multiply P1 and P2 (direct multiplication).
myf1 = function(x) {x<0.05}
myf2 = function(x) {x>0.7}
P1 = mapply(FUN = myf1 , matP)
P2 = mapply(FUN = myf2 , matR)
P = P1 * P2
The elements in P which are labeled as "True" are the desired nodes. It will work fine!
And here there is the result for your smaple:
P1 = mapply(FUN = myf1 , Pvalue)
P2 = mapply(FUN = myf2 , Rvalue)
P = P1 * P2
NA 1 1 1 0 1 1 0 1 1 1 NA 0 0 0 0 0 0 1 1 1 0 NA 1 0
1 1 1 1 1 1 0 1 NA 0 1 1 0 1 1 0 0 0 0 NA 1 0 1 0 0
1 0 1 1 1 NA 1 1 1 1 1 0 1 1 0 1 NA 1 1 1 0 0 1 0 1
1 1 NA 0 0 1 1 1 1 0 1 1 0 NA 1 1 1 1 1 0 1 1 0 1 NA
I have a data frame with some columns:
that I want to transform into a factor,
in which the different levels are coded as -2, -1, 0, 1, 2, 3, 4
for which I want the levels to be labeled as 0 or 1 following this convention:
-2 = 1
-1 = 1
0 = 0
1 = 1
2 = 1
3 = 1
4 = 0
I have the following code:
#Convert to factor
dat[idx] <- lapply(dat[idx], factor, levels = -2:4, labels = c(1, 1, 0, 1, 1, 1, 0))
#Drop unused factor levels
dat <- droplevels(dat)
This works, but it gives me the following warning:
In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, :
duplicated levels in factors are deprecated
I tried the following code (per Ananda Mahto's suggestion) but no luck:
levels(dat[idx]) <- list(`0` = c(0, 4), `1` = c(-2, -1, 1, 2, 3))
I figured there has to be a better way to do this, any suggestions?
My data looks like this:
structure(list(Timestamp = structure(c(1380945601, 1380945603,
1380945605, 1380945607, 1380945609, 1380945611, 1380945613, 1380945615,
1380945617, 1380945619), class = c("POSIXct", "POSIXt"), tzone = ""),
FCB2C01 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), RCB2C01 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C02 = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1), RCB2C02 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C03 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), RCB2C03 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), FCB2C04 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C04 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C05 = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1), RCB2C05 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C06 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C06 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), FCB2C07 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C07 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C08 = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1), RCB2C08 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C09 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C09 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), FCB2C10 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C10 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("Timestamp", "FCB2C01",
"RCB2C01", "FCB2C02", "RCB2C02", "FCB2C03", "RCB2C03", "FCB2C04",
"RCB2C04", "FCB2C05", "RCB2C05", "FCB2C06", "RCB2C06", "FCB2C07",
"RCB2C07", "FCB2C08", "RCB2C08", "FCB2C09", "RCB2C09", "FCB2C10",
"RCB2C10"), row.names = c(NA, 10L), class = "data.frame")
And the column index:
idx <- seq(2,21,2)
If I correctly understand what you want to do, the "right" way would be to use the levels function to specify your levels. Compare the following:
set.seed(1)
x <- sample(-2:4, 10, replace = TRUE)
YourApproach <- factor(x, levels = -2:4, labels = c(1, 1, 0, 1, 1, 1, 0))
# Warning message:
# In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, :
# duplicated levels in factors are deprecated
YourApproach
# [1] 1 0 1 0 1 0 0 1 1 1
# Levels: 1 1 0 1 1 1 0
xFac <- factor(x, levels = -2:4)
levels(xFac) <- list(`0` = c(0, 4), `1` = c(-2, -1, 1, 2, 3))
xFac
# [1] 1 0 1 0 1 0 0 1 1 1
# Levels: 0 1
Note the difference in the "Levels" in each of those. This also means that the underlying numeric representation is going to be different:
> as.numeric(YourApproach)
[1] 2 3 5 7 2 7 7 5 5 1
> as.numeric(xFac)
[1] 2 1 2 1 2 1 1 2 2 2