Related
I hope I can express my question clearly; here is an example that I made up:
# Ten patients, all positive (Result = 1), plus five yes/no condition
# variables coded 1 = yes, 2 = no.
result <- rep(1, 10)
con1 <- c(1, 2, 2, 2, 1, 1, 2, 2, 2, 2)
con2 <- c(2, 1, 2, 2, 1, 1, 2, 2, 2, 1)
con3 <- c(2, 2, 1, 1, 1, 2, 2, 2, 2, 1)
con4 <- c(2, 1, 2, 2, 1, 1, 2, 1, 1, 2)
con5 <- c(1, 2, 2, 2, 1, 2, 2, 2, 2, 1)
# One row per patient.
a <- tibble(
  Result = result,
  Con1 = con1, Con2 = con2, Con3 = con3, Con4 = con4, Con5 = con5
)
The above code gives me the following tibble, where each row is a patient:
> a
# A tibble: 10 x 6
Result Con1 Con2 Con3 Con4 Con5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 2 2 1
2 1 2 1 2 1 2
3 1 2 2 1 2 2
4 1 2 2 1 2 2
5 1 1 1 1 1 1
6 1 1 1 2 1 2
7 1 2 2 2 2 2
8 1 2 2 2 1 2
9 1 2 2 2 1 2
10 1 2 1 1 2 1
The Result column holds cases that are positive for a major illness (that's why they are all 1's), while the Con_i columns are yes/no questions for the patient, where 1 = yes and 2 = no. I want to get the number of patients that said yes to: 0 questions, 1 question, 2-3 questions, and 4 or more questions.
So far I've tried to do this:
# Add a result column X, then enumerate the yes/no combinations explicitly:
# X = 0 when all five answers are "no" (2), X = 1 when exactly one is "yes".
# NOTE(review): combinations with 2+ "yes" answers are never listed, so those
# rows fall through case_when() and become NA -- that is the NA row with
# n = 5 in the output below.
a1 <-a %>% add_column(X=1)
a1$X <- case_when(a$Con1==2 & a$Con2==2 & a$Con3==2 & a$Con4==2 & a$Con5==2 ~ 0,
a$Con1==1 & a$Con2==2 & a$Con3==2 & a$Con4==2 & a$Con5==2 |
a$Con1==2 & a$Con2==1 & a$Con3==2 & a$Con4==2 & a$Con5==2|
a$Con1==2 & a$Con2==2 & a$Con3==1 & a$Con4==2 & a$Con5==2|
a$Con1==2 & a$Con2==2 & a$Con3==2 & a$Con4==1 & a$Con5==2|
a$Con1==2 & a$Con2==2 & a$Con3==2 & a$Con4==2 & a$Con5==1 ~ 1)
# Tabulate patients by X; the NA group collects every unhandled combination.
table <- a1 %>% group_by(X) %>% count(X,Result)
table
> table
# A tibble: 3 x 3
# Groups: X [3]
X Result n
<dbl> <dbl> <int>
1 0 1 1
2 1 1 4
3 NA 1 5
But I know this is not the most efficient way; I would also need to write out all the combinations for the 2-3 and 4+ cases, so it is not scalable. I'm looking for a much easier way to do this that scales. I hope you can help — thanks in advance!
Perhaps the simplest:
table(rowSums(a[,-1] < 2))
# 0 1 2 3 5 <--- counts of "1" in each row
# 1 4 2 2 1 <--- number of patients with that count
Since you need to group 2-3 and 4+, then
table(cut(rowSums(a[,-1] < 2), c(0, 1, 2, 4, Inf), include.lowest = TRUE))
# [0,1] (1,2] (2,4] (4,Inf]
# 5 2 2 1
While the logic is using < 2, it's just as easy to check for == 1L or similar equality.
Tracing this, step by step:
a[,-1] == 1
# Con1 Con2 Con3 Con4 Con5
# [1,] TRUE FALSE FALSE FALSE TRUE
# [2,] FALSE TRUE FALSE TRUE FALSE
# [3,] FALSE FALSE TRUE FALSE FALSE
# [4,] FALSE FALSE TRUE FALSE FALSE
# [5,] TRUE TRUE TRUE TRUE TRUE
# [6,] TRUE TRUE FALSE TRUE FALSE
# [7,] FALSE FALSE FALSE FALSE FALSE
# [8,] FALSE FALSE FALSE TRUE FALSE
# [9,] FALSE FALSE FALSE TRUE FALSE
# [10,] FALSE TRUE TRUE FALSE TRUE
rowSums(a[,-1] == 1)
# [1] 2 2 1 1 5 3 0 1 1 3
That last is the number of 1s for each "patient" (row).
From this, I count one 0, four 1s, two 2s plus two 3s, zero 4s plus one 5. This should total 5, 2, 2, 1 ... so #andrew_reece is correct, let's use cut(...,right=FALSE):
table(cut(rowSums(a[,-1] < 2), c(0, 1, 2, 4, Inf), right = FALSE))
# [0,1) [1,2) [2,4) [4,Inf)
# 1 4 4 1
I should have caught the [0,1] (in the previous answer) earlier: it indicates that the 0-1 bin is closed at both ends, meaning both 0 and 1 are included in the same bin.
An option with Reduce and table
table(Reduce(`+`, lapply(a[-1], `<`, 2)))
# 0 1 2 3 5
#1 4 2 2 1
Pivot your data so all the Con vars are a column, and the yes/no values for each Con sit in a separate column. Then you can use group_by and summarise operations to get your grouping:
a %>%
mutate(patient = letters[row_number()]) %>%
pivot_longer(starts_with("Con")) %>%
group_by(patient) %>%
summarise(yes = sum(value == 1),
no = sum(value == 2)) %>%
group_by(yes) %>%
summarise(yes_ct = n()) %>%
mutate(yes_grp = case_when(
yes %in% 2:3 ~ "2-3",
yes >= 4 ~ "ge4",
TRUE ~ as.character(yes)
)) %>%
group_by(yes_grp) %>%
summarise(ct = sum(yes_ct))
# A tibble: 4 x 2
yes_grp ct
<chr> <int>
1 0 1
2 1 4
3 2-3 4
4 ge4 1
I made an explicit patient variable (just row numbers, basically) to make pivot and group operations easier.
Try this:
library(data.table)
# Recode the 1/2 answers to 0/1 by subtracting 1 (yes -> 0, no -> 1);
# setDT() first converts the tibble `a` to a data.table by reference.
df <- setDT(a) - 1
# Five questions per row, so yes-count = 5 minus the number of "no" (1) values.
df$sum <- 5 - rowSums( df[,2:6] )
# Frequency table: how many patients gave each yes-count.
# table() names are character, so Questions_Yes is a character column.
freq <- data.table(table(df$sum))
names(freq) <- c('Questions_Yes', 'Patients')
# Collapse the counts into the requested bins, then re-aggregate.
# fcase() is data.table's native case_when(), so no hidden dplyr dependency
# is needed (the original called dplyr::case_when without loading dplyr).
freq <- freq[, Questions_Yes := fcase(
  Questions_Yes %chin% c("2", "3"), "2-3",
  as.integer(Questions_Yes) >= 4, "4+",
  default = Questions_Yes
)][, .(Patients = sum(Patients)), by = Questions_Yes]
Questions_Yes Patients
1: 0 1
2: 1 4
3: 2-3 4
4: 4+ 1
I have a dataset with repeating sequences of TRUE that I would like to label based on some conditions - by id, and by the sequence's incremental value. A FALSE breaks the sequence of TRUEs and the first FALSE that breaks any given sequence of TRUE should be included in that sequence. Consecutive FALSEs in between TRUEs are irrelevant and are labeled 0.
For example:
> test
id logical sequence
1 1 TRUE 1
2 1 TRUE 1
3 1 FALSE 1
4 1 TRUE 2
5 1 TRUE 2
6 1 FALSE 2
7 1 TRUE 3
8 2 TRUE 1
9 2 TRUE 1
10 2 TRUE 1
11 2 FALSE 1
12 2 TRUE 2
13 2 TRUE 2
14 2 TRUE 2
15 3 FALSE 0
16 3 FALSE 0
17 3 FALSE 0
18 3 TRUE 1
19 3 FALSE 1
20 3 TRUE 2
21 3 FALSE 2
22 3 FALSE 0
23 3 FALSE 0
24 3 FALSE 0
25 3 TRUE 3
And so on. I have considered using rle() which produces
> rle(test$logical)
Run Length Encoding
lengths: int [1:13] 2 1 2 1 4 1 3 3 1 1 ...
values : logi [1:13] TRUE FALSE TRUE FALSE TRUE FALSE ...
But I am not sure how to map this back on the data frame. Any suggestions on how to approach this problem?
Here are the sample data:
> dput(test)
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), logical = c(TRUE, TRUE,
FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE,
TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE,
FALSE, FALSE, TRUE)), .Names = c("id", "logical"), class = "data.frame", row.names = c(NA,
-25L))
A pure data.table solution:
# load the 'data.table'-package & convert 'test' to a data.table with 'setDT'
library(data.table)
setDT(test)
# calculate the new sequence
test[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id
][new_seq != 0, new_seq := rleid(new_seq), by = id][]
which gives:
id logical new_seq
1: 1 TRUE 1
2: 1 TRUE 1
3: 1 FALSE 1
4: 1 TRUE 2
5: 1 TRUE 2
6: 1 FALSE 2
7: 1 TRUE 3
8: 2 TRUE 1
9: 2 TRUE 1
10: 2 TRUE 1
11: 2 FALSE 1
12: 2 TRUE 2
13: 2 TRUE 2
14: 2 TRUE 2
15: 3 FALSE 0
16: 3 FALSE 0
17: 3 FALSE 0
18: 3 TRUE 1
19: 3 FALSE 1
20: 3 TRUE 2
21: 3 FALSE 2
22: 3 FALSE 0
23: 3 FALSE 0
24: 3 FALSE 0
25: 3 TRUE 3
What this does:
rleid(logical) - !logical creates a numeric run length id and substracts 1 for where logical is equal to FALSE
The result of the previous step is then multiplied by the result of !(!logical & !shift(logical, fill = FALSE)), a TRUE/FALSE vector that is FALSE for consecutive FALSE values other than the first one of a FALSE-sequence.
Finally, we create a new run length id for only the rows where new_seq is not equal to 0 and have your desired result.
A slightly improved alternative (as suggested by #jogo in the comments):
test[, new_seq := (rleid(logical) - !logical) * (logical | shift(logical, fill = FALSE)), by = id
][new_seq != 0, new_seq := rleid(new_seq), by = id][]
There is surely a better implementation of the makeSeq function, but this one works.
This one uses libraries data.table, magrittr and dplyr
Function
# Label TRUE-runs per the question: each run of TRUEs plus the first FALSE
# that breaks it shares one incrementing id; the remaining FALSEs get 0.
# Requires dplyr (lag), magrittr (%>%) and data.table (rleidv) to be loaded.
makeSeq <- function(x) {
  # Fold the first FALSE after a TRUE into the preceding TRUE-run.
  # (Computed once here; the original evaluated this expression twice
  # and used the non-idiomatic T/F shorthand.)
  extended <- ifelse(!x & !lag(x, default = FALSE), TRUE, x)
  # Run id that increments at the start of every extended run.
  res <- extended %>% {!.} %>% lag(default = TRUE) %>% cumsum
  # Rows where `extended` disagrees with x are the irrelevant FALSEs.
  IND2F <- extended != x
  res[IND2F] <- 0
  # Re-number the kept rows so the ids stay consecutive.
  res[!IND2F] <- rleidv(res[!IND2F])
  return(res)
}
data.table solution
setDT(df)[,yourSEQ:=makeSeq(logical),by="id"]
df
tidyverse fans use
df %>% group_by(id) %>% mutate(yourSEQ = makeSeq(logical)) %>% ungroup
Result
> df
id logical yourSEQ
1: 1 TRUE 1
2: 1 TRUE 1
3: 1 FALSE 1
4: 1 TRUE 2
5: 1 TRUE 2
6: 1 FALSE 2
7: 1 TRUE 3
8: 2 TRUE 1
9: 2 TRUE 1
10: 2 TRUE 1
11: 2 FALSE 1
12: 2 TRUE 2
13: 2 TRUE 2
14: 2 TRUE 2
15: 3 FALSE 0
16: 3 FALSE 0
17: 3 FALSE 0
18: 3 TRUE 1
19: 3 FALSE 1
20: 3 TRUE 2
21: 3 FALSE 2
22: 3 FALSE 0
23: 3 FALSE 0
24: 3 FALSE 0
25: 3 TRUE 3
id logical yourSEQ
without using rle in dtmtd2 and also some timings:
# dplyr baseline for the benchmark: cumulative run id via rle(), then zero
# out FALSE rows that follow another FALSE. Reads `test` from the enclosing
# environment. (Original used the unsafe F shorthand; spelled out as FALSE.)
dplyrmtd0 <- function() {
  test %>%
    group_by(id) %>%
    # One id per run of identical values, repeated out to the group length.
    mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>%
    # lag() is NA on each group's first row; `missing = 0L` covers that.
    mutate(sequence2 = if_else(logical == FALSE & lag(logical) == FALSE, 0L, sum_rle, missing = 0L))
}
setDT(test)
# Copy of the earlier answer's helper, used by dtmtd0 in the benchmark.
# Fixed here too: TRUE/FALSE instead of T/F, and the shared ifelse()
# expression is computed once instead of twice.
makeSeq <- function(x) {
  # Fold the first FALSE after a TRUE into the preceding TRUE-run.
  extended <- ifelse(!x & !lag(x, default = FALSE), TRUE, x)
  # Run id that increments at the start of every extended run.
  res <- extended %>% {!.} %>% lag(default = TRUE) %>% cumsum
  # Rows where `extended` disagrees with x are the irrelevant FALSEs.
  IND2F <- extended != x
  res[IND2F] <- 0
  res[!IND2F] <- rleidv(res[!IND2F])
  return(res)
}
# Each candidate works on its own copy of `test` so the in-place :=
# mutations don't leak between benchmark runs.
dt0 <- copy(test)
# Method 0: the makeSeq() helper applied per id.
dtmtd0 <- function() {
dt0[,yourSEQ:=makeSeq(logical),by="id"]
}
dt1 <- copy(test)
# Method 1: the rleid-based expression from the accepted data.table answer.
dtmtd1 <- function() {
dt1[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id
][new_seq != 0, new_seq := rleid(new_seq), by = id][]
}
dt4 <- copy(test)
# Method 2: no rle at all. idx increments at each FALSE -> TRUE transition;
# mask keeps TRUE rows plus the first FALSE after a TRUE and zeroes the rest.
dtmtd2 <- function() {
dt4[, sequence := {
idx <- cumsum(diff(c(FALSE, logical))==1L)
mask <- shift(logical, fill=FALSE) | logical
idx * mask
}, by=id]
}
microbenchmark(dplyrmtd0(), dtmtd0(), dtmtd1(), dtmtd2(), times=5L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval
dplyrmtd0() 375.6089 376.7271 433.1885 380.7428 443.8844 588.9791 5
dtmtd0() 481.5189 487.1245 492.9527 495.6855 500.1588 500.2759 5
dtmtd1() 146.0376 147.0163 154.7501 152.7157 154.2976 173.6831 5
dtmtd2() 106.3401 107.7728 112.7580 108.5239 119.4398 121.7131 5
data:
library(data.table)
library(dplyr)
library(microbenchmark)
M <- 1e6
test <- data.frame(id=sample(LETTERS, M, replace=TRUE) ,
logical=sample(c(TRUE, FALSE), M, replace=TRUE))
test <- test[order(test$id),]
You could use the cumsum for your rle values, then you have to go back and fix the sequential FALSE values.
library(dplyr)
test %>%
group_by(id) %>%
mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>%
mutate(sequence2 = if_else(logical == F & lag(logical) == F, 0L, sum_rle, missing = 0L)) %>%
print(n = 25)
# # A tibble: 25 x 5
# # Groups: id [3]
# id logical sequence sum_rle sequence2
# <int> <lgl> <int> <int> <int>
# 1 1 TRUE 1 1 1
# 2 1 TRUE 1 1 1
# 3 1 FALSE 1 1 1
# 4 1 TRUE 2 2 2
# 5 1 TRUE 2 2 2
# 6 1 FALSE 2 2 2
# 7 1 TRUE 3 3 3
# 8 2 TRUE 1 1 1
# 9 2 TRUE 1 1 1
# 10 2 TRUE 1 1 1
# 11 2 FALSE 1 1 1
# 12 2 TRUE 2 2 2
# 13 2 TRUE 2 2 2
# 14 2 TRUE 2 2 2
# 15 3 FALSE 0 0 0
# 16 3 FALSE 0 0 0
# 17 3 FALSE 0 0 0
# 18 3 TRUE 1 1 1
# 19 3 FALSE 1 1 1
# 20 3 TRUE 2 2 2
# 21 3 FALSE 2 2 2
# 22 3 FALSE 0 2 0
# 23 3 FALSE 0 2 0
# 24 3 FALSE 0 2 0
# 25 3 TRUE 3 3 3
if you prefer a really concise version of the same thing...
library(dplyr)
group_by(test, id) %>%
mutate(sequence = if_else(!logical & !lag(logical), 0L,
with(rle(logical), rep(cumsum(values), lengths)),
missing = 0L))
I am exporting data from R with the command:
# Tab-separated export with NA encoded as -9999. The original was missing
# the `=` after `na` (a syntax error).
write.table(output, file = "data.raw", na = "-9999", sep = "\t",
            row.names = FALSE, col.names = FALSE)
It exports my data correctly, but it exports all of the logical variables as TRUE and FALSE.
I need to read the data into another program that can only process numeric values. Is there an efficient way to convert logical columns to numeric 1s and 0s during the export? I have a large number of numeric variables, so I was hoping to automatically loop through all the variables in the data.table
Alternatively, my output object is a data.table. Is there an efficient way to convert all the logical variables in a data.table into numeric variables?
In case it is helpful, here is some code to generate a data.table with a logical variable in it (it is not a large number of logical variables, but enough to use on example code):
# Example data.table with logical columns. The original first line was
# missing its closing parenthesis; also use <- for assignment.
# cbind() coerces the logical column to numeric 0/1, so V2 is numeric.
DT <- data.table(cbind(1:100, rnorm(100) > 0))
DT[ , V3:= V2 == 1 ]
DT[ , V4:= V2 != 1 ]
For a data.frame, you could convert all logical columns to numeric with:
# The data
set.seed(144)
dat <- data.frame(V1=1:100,V2=rnorm(100)>0)
dat$V3 <- dat$V2 == 1
head(dat)
# V1 V2 V3
# 1 1 FALSE FALSE
# 2 2 TRUE TRUE
# 3 3 FALSE FALSE
# 4 4 FALSE FALSE
# 5 5 FALSE FALSE
# 6 6 TRUE TRUE
# Convert all to numeric
cols <- sapply(dat, is.logical)
dat[,cols] <- lapply(dat[,cols], as.numeric)
head(dat)
# V1 V2 V3
# 1 1 0 0
# 2 2 1 1
# 3 3 0 0
# 4 4 0 0
# 5 5 0 0
# 6 6 1 1
In data.table syntax:
# Data
set.seed(144)
DT = data.table(cbind(1:100,rnorm(100)>0))
DT[,V3 := V2 == 1]
DT[,V4 := FALSE]
head(DT)
# V1 V2 V3 V4
# 1: 1 0 FALSE FALSE
# 2: 2 1 TRUE FALSE
# 3: 3 0 FALSE FALSE
# 4: 4 0 FALSE FALSE
# 5: 5 0 FALSE FALSE
# 6: 6 1 TRUE FALSE
# Converting
(to.replace <- names(which(sapply(DT, is.logical))))
# [1] "V3" "V4"
for (var in to.replace) DT[, (var):= as.numeric(get(var))]
head(DT)
# V1 V2 V3 V4
# 1: 1 0 0 0
# 2: 2 1 1 0
# 3: 3 0 0 0
# 4: 4 0 0 0
# 5: 5 0 0 0
# 6: 6 1 1 0
Simplest way of doing this!
Multiply your matrix by 1
For example:
A <- matrix(c(TRUE,FALSE,TRUE,TRUE,TRUE,FALSE,FALSE,TRUE),ncol=4)
A
# [,1] [,2] [,3] [,4]
# [1,] TRUE TRUE TRUE FALSE
# [2,] FALSE TRUE FALSE TRUE
B <- 1*A
B
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 0 1 0 1
(You could also add zero: B <- 0 + A)
What about just a:
dat <- data.frame(le = letters[1:10], lo = rep(c(TRUE, FALSE), 5))
dat
le lo
1 a TRUE
2 b FALSE
3 c TRUE
4 d FALSE
5 e TRUE
6 f FALSE
7 g TRUE
8 h FALSE
9 i TRUE
10 j FALSE
dat$lo <- as.numeric(dat$lo)
dat
le lo
1 a 1
2 b 0
3 c 1
4 d 0
5 e 1
6 f 0
7 g 1
8 h 0
9 i 1
10 j 0
or another approach could be with dplyr in order to retain the previous column if the case (no one knows) your data will be imported in R.
library(dplyr)
dat <- dat %>% mutate(lon = as.numeric(lo))
dat
Source: local data frame [10 x 3]
le lo lon
1 a TRUE 1
2 b FALSE 0
3 c TRUE 1
4 d FALSE 0
5 e TRUE 1
6 f FALSE 0
7 g TRUE 1
8 h FALSE 0
9 i TRUE 1
10 j FALSE 0
Edit: Loop
I do not know whether my code here is performant, but it checks every column and converts to numeric only those that are logical. Of course, if your TRUE and FALSE values are not logicals but character strings (which is remotely possible), my code won't work.
# Convert every logical column of `dat` to numeric (0/1) in place.
# seq_len() is safe when ncol(dat) is 0 (1:ncol(dat) would yield c(1, 0)),
# and is.logical() already returns TRUE/FALSE, so `== TRUE` was redundant.
for (i in seq_len(ncol(dat))) {
  if (is.logical(dat[, i])) dat[, i] <- as.numeric(dat[, i])
}
If there are multiple columns, you could use set (using #josilber's example)
library(data.table)
Cols <- which(sapply(dat, is.logical))
setDT(dat)
for(j in Cols){
set(dat, i=NULL, j=j, value= as.numeric(dat[[j]]))
}
As Ted Harding pointed out in the R-help mailing list, one easy way to convert logical objects to numeric is to perform an arithmetic operation on them. Convenient ones would be * 1 and + 0, which will keep the TRUE/FALSE == 1/0 paradigm.
For your mock data (I've changed the code a bit to use regular R packages and to reduce size):
df <- data.frame(cbind(1:10, rnorm(10) > 0))
df$X3 <- df$X2 == 1
df$X4 <- df$X2 != 1
The dataset you get has a mixture of numeric and boolean variables:
X1 X2 X3 X4
1 1 0 FALSE TRUE
2 2 0 FALSE TRUE
3 3 1 TRUE FALSE
4 4 1 TRUE FALSE
5 5 1 TRUE FALSE
6 6 0 FALSE TRUE
7 7 0 FALSE TRUE
8 8 1 TRUE FALSE
9 9 0 FALSE TRUE
10 10 1 TRUE FALSE
Now let
df2 <- 1 * df
(If your dataset contains character or factor variables, you will need to apply this operation to a subset of df filtering out those variables)
df2 is equal to
X1 X2 X3 X4
1 1 0 0 1
2 2 0 0 1
3 3 1 1 0
4 4 1 1 0
5 5 1 1 0
6 6 0 0 1
7 7 0 0 1
8 8 1 1 0
9 9 0 0 1
10 10 1 1 0
Which is 100% numeric, as str(df2) will show you.
Now you can safely export df2 to your other program.
One line solution
Using the following code we take all the logical columns and make them numeric.
library(magrittr)
dat %<>% mutate_if(is.logical,as.numeric)
The same as #saebod but with usual pipe.
library(dplyr)
dat <- dat %>% mutate_if(is.logical, as.numeric)
Question :
I want to create a dummy variable, `first`, in R which is 1 if the value of another dummy changed from 0 to 1, under the condition that it is not the first observation for an id number. The motivation is that I want to recognise firms which entered a market during the observed time period in a panel setting.
As an example I tried to create this with a small sample set:
id <- c(1,1,1,2,2,3,3,3)
dummy <- c(0,1,1,0,1,1,0,1)
df <- data.frame(id,dummy)
df[,"id"]
# NOTE(review): x[-1, "id"] has length n-1 (rows shifted up by one) while
# x[, "id"] has length n, so R recycles the shorter vector and the
# comparisons are misaligned -- which is why the single TRUE lands on the
# wrong row in the output below. The == "1" also compares numbers against
# a character string, relying on implicit coercion.
first.dum <- function(x)
c( x[-1,"id"] == x[,"id"]
& x[-1,"dummy"] != x[,"dummy"]
& x[,"dummy"] == "1")
df$first <- first.dum(df)
df
The result comes like ...
id dummy first
1 1 0 FALSE
2 1 1 FALSE
3 1 1 FALSE
4 2 0 FALSE
5 2 1 FALSE
6 3 1 TRUE
7 3 0 FALSE
8 3 1 FALSE
I think I did not understand how that dataframe manipulation really works.
Any help would be appreciated.
Here's how I would approach this using data.table package
library(data.table)
setDT(df)[, first := c(0, diff(dummy)) == 1, id][]
# id dummy first
# 1: 1 0 FALSE
# 2: 1 1 TRUE
# 3: 1 1 FALSE
# 4: 2 0 FALSE
# 5: 2 1 TRUE
# 6: 3 1 FALSE
# 7: 3 0 FALSE
# 8: 3 1 TRUE
Basically we are checking per group, if dummy is bigger by one than the previous observation (starting from the second observation).
You can do it similarly using dplyr
library(dplyr)
df %>% group_by(id) %>% mutate(first = c(0, diff(dummy)) == 1)
Or using base R
unlist(tapply(df$dummy, df$id, function(x) c(0, diff(x)) == 1))
Try something like
df$first <- df$id == c(NA, df$id[-nrow(df)]) &
df$dummy > c(1, df$dummy[-nrow(df)])
to give
> df
id dummy first
1 1 0 FALSE
2 1 1 TRUE
3 1 1 FALSE
4 2 0 FALSE
5 2 1 TRUE
6 3 1 FALSE
7 3 0 FALSE
8 3 1 TRUE
If you want something like your function, consider
# Build y = x shifted down one row (pad row prepended), then flag rows where
# the id matches the previous row's id and dummy increased.
# The c(NA, 1) pad makes the first row evaluate FALSE: NA id gives NA for
# the first comparison, and dummy > 1 is FALSE, so NA & FALSE is FALSE.
first.dum <- function(x) {
y <- rbind(c(NA,1),x[-nrow(x),])
x[,"id"] == y[,"id"] & x[,"dummy"] > y[,"dummy"]
}
This is proving to be a monster for me with zero experience in R script. I have a data frame with 57 columns, 30 rows of data
Here is what I am trying to do:
1) Go to each column:
2) Count the number of times 2/3/4/5/6/7/8/9 consecutive values are less than -1
3) Print the result as a text file
4) Repeat step 2 and 3 for the second column and so on
I looked around and also on r stackoverflow
check number of times consecutive value appear based on a certain criteria
This is one column of my data:
data<-c(-0.996,-1.111,-0.638,0.047,0.694,1.901,2.863,2.611,2.56,2.016,0.929,-0.153,-0.617,-0.143
0.199,0.556,0.353,-0.638,0.347,0.045,-0.829,-0.882,-1.143,-0.869,0.619,0.923,-0.474,0.227
0.394,0.789,1.962,1.132,0.1,-0.278,-0.303,-0.606,-0.705,-0.858,-0.723,-0.081,1.206,2.329
1.863,2.1,1.547,2.026,0.015,-0.441,-0.371,-0.304,-0.668,-0.953,-1.256,-1.185,-0.891,-0.569
0.485,0.421,-0.004,0.024,-0.39,-0.58,-1.178,-1.101,-0.882,0.01,0.052,-0.166,-1.703,-1.048
-0.718,-0.036,-0.561,-0.08,0.272,-0.041,-0.811,-0.929,-0.853,-1.047,0.431,0.576,0.642,1.62
2.324,1.251,1.384,0.195,-0.081,-0.335,-0.176,1.089,-0.602,-1.134,-1.356,-1.203,-0.795,-0.752
-0.692,-0.813,-1.172,-0.387,-0.079,-0.374,-0.157,0.263,0.313,0.975,2.298,1.71,0.229,-0.313
-0.779,-1.12,-1.102,-1.01,-0.86,-1.118,-1.211,-1.081,-1.156,-0.972)
When I run the following code:
for (col in 1:ncol(data)) {
# NOTE(review): rle() here encodes runs of the raw numeric values, and since
# consecutive measurements are essentially never identical, every run has
# length 1 -- hence the printed string of 1s. The condition must go inside
# rle(), i.e. rle(data[, col] < -1), as the answer below shows.
runs <- rle(data[,col])
print(runs$lengths[which(runs$values < -1)])
}
It gives me this:
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
It has counted the number of values < -1, but not the runs. Is there something I am doing wrong here?
(massive edit)
Fixed data vector (was missing commas):
data <- c(-0.996,-1.111,-0.638,0.047,0.694,1.901,2.863,2.611,2.56,2.016,0.929,-0.153,-0.617,-0.143,
0.199,0.556,0.353,-0.638,0.347,0.045,-0.829,-0.882,-1.143,-0.869,0.619,0.923,-0.474,0.227,
0.394,0.789,1.962,1.132,0.1,-0.278,-0.303,-0.606,-0.705,-0.858,-0.723,-0.081,1.206,2.329,
1.863,2.1,1.547,2.026,0.015,-0.441,-0.371,-0.304,-0.668,-0.953,-1.256,-1.185,-0.891,-0.569,
0.485,0.421,-0.004,0.024,-0.39,-0.58,-1.178,-1.101,-0.882,0.01,0.052,-0.166,-1.703,-1.048,
-0.718,-0.036,-0.561,-0.08,0.272,-0.041,-0.811,-0.929,-0.853,-1.047,0.431,0.576,0.642,1.62,
2.324,1.251,1.384,0.195,-0.081,-0.335,-0.176,1.089,-0.602,-1.134,-1.356,-1.203,-0.795,-0.752,
-0.692,-0.813,-1.172,-0.387,-0.079,-0.374,-0.157,0.263,0.313,0.975,2.298,1.71,0.229,-0.313,
-0.779,-1.12,-1.102,-1.01,-0.86,-1.118,-1.211,-1.081,-1.156,-0.972)
Doing data < -1 gives you a logical vector, and we can count runs of TRUE & FALSE:
runs <- rle(data < -1)
print(runs)
## Run Length Encoding
## lengths: int [1:21] 1 1 20 1 29 2 8 2 4 2 ...
## values : logi [1:21] FALSE TRUE FALSE TRUE FALSE TRUE ...
Then extract the length of only the TRUE runs:
print(runs$lengths[which(runs$values)])
## [1] 1 1 2 2 2 1 3 1 3 4
and, iterate over columns of a data frame as previously shown:
# make a data frame from sampled versions of data
set.seed(1492) # repeatable
df <- data.frame(V1=data,
V2=sample(data, length(data), replace=TRUE),
V3=sample(data, length(data), replace=TRUE),
V4=sample(data, length(data), replace=TRUE))
# do the extraction
# For each column, count runs of the condition (value < -1), not runs of
# the raw values. seq_len() is safer than 1:ncol(df): it yields an empty
# loop (not c(1, 0)) if df somehow has zero columns.
for (col in seq_len(ncol(df))) {
  # rle() on the logical vector gives lengths of TRUE and FALSE stretches.
  runs <- rle(df[, col] < -1)
  # Keep only the lengths of the TRUE runs.
  print(runs$lengths[which(runs$values)])
}
## [1] 1 1 2 2 2 1 3 1 3 4
## [1] 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1