Incremental sequences with interruptions - r

I have a dataset with repeating sequences of TRUE that I would like to label based on some conditions - by id, and by the sequence's incremental value. A FALSE breaks the sequence of TRUEs and the first FALSE that breaks any given sequence of TRUE should be included in that sequence. Consecutive FALSEs in between TRUEs are irrelevant and are labeled 0.
For example:
> test
id logical sequence
1 1 TRUE 1
2 1 TRUE 1
3 1 FALSE 1
4 1 TRUE 2
5 1 TRUE 2
6 1 FALSE 2
7 1 TRUE 3
8 2 TRUE 1
9 2 TRUE 1
10 2 TRUE 1
11 2 FALSE 1
12 2 TRUE 2
13 2 TRUE 2
14 2 TRUE 2
15 3 FALSE 0
16 3 FALSE 0
17 3 FALSE 0
18 3 TRUE 1
19 3 FALSE 1
20 3 TRUE 2
21 3 FALSE 2
22 3 FALSE 0
23 3 FALSE 0
24 3 FALSE 0
25 3 TRUE 3
And so on. I have considered using rle() which produces
> rle(test$logical)
Run Length Encoding
lengths: int [1:13] 2 1 2 1 4 1 3 3 1 1 ...
values : logi [1:13] TRUE FALSE TRUE FALSE TRUE FALSE ...
But I am not sure how to map this back on the data frame. Any suggestions on how to approach this problem?
Here are the sample data:
> dput(test)
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), logical = c(TRUE, TRUE,
FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE,
TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE,
FALSE, FALSE, TRUE)), .Names = c("id", "logical"), class = "data.frame", row.names = c(NA,
-25L))

A pure data.table solution:
# load the 'data.table'-package & convert 'test' to a data.table with 'setDT'
library(data.table)
setDT(test)
# calculate the new sequence
test[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id
][new_seq != 0, new_seq := rleid(new_seq), by = id][]
which gives:
id logical new_seq
1: 1 TRUE 1
2: 1 TRUE 1
3: 1 FALSE 1
4: 1 TRUE 2
5: 1 TRUE 2
6: 1 FALSE 2
7: 1 TRUE 3
8: 2 TRUE 1
9: 2 TRUE 1
10: 2 TRUE 1
11: 2 FALSE 1
12: 2 TRUE 2
13: 2 TRUE 2
14: 2 TRUE 2
15: 3 FALSE 0
16: 3 FALSE 0
17: 3 FALSE 0
18: 3 TRUE 1
19: 3 FALSE 1
20: 3 TRUE 2
21: 3 FALSE 2
22: 3 FALSE 0
23: 3 FALSE 0
24: 3 FALSE 0
25: 3 TRUE 3
What this does:
rleid(logical) - !logical creates a numeric run length id and substracts 1 for where logical is equal to FALSE
The result of the previous step is then multiplied with the result of !(!logical & !shift(logical, fill = FALSE)), which is a TRUE/FALSE vector for consequtive FALSE values except the first one of a FALSE-sequence.
Finally, we create a new run length id for only the rows where new_seq is not equal to 0 and have your desired result.
A slightly improved alternative (as suggested by #jogo in the comments):
test[, new_seq := (rleid(logical) - !logical) * (logical | shift(logical, fill = FALSE)), by = id
][new_seq != 0, new_seq := rleid(new_seq), by = id][]

There is for sure a better implementation of makeSeq function but this works.
This one uses libraries data.table, magrittr and dplyr
Function
makeSeq <- function(x) {
res <- ifelse(!x&!lag(x,default = F),T,x) %>% {!.} %>% lag(default=T) %>% cumsum
IND2F<- ifelse(!x&!lag(x,default = F),T,x) != x
res[IND2F] <- 0
res[!IND2F] <- rleidv(res[!IND2F])
return(res)
}
data.table solution
setDT(df)[,yourSEQ:=makeSeq(logical),by="id"]
df
tidyverse fans use
df %>% group_by(id) %>% mutate(yourSEQ = makeSeq(logical)) %>% ungroup
Result
> df
id logical yourSEQ
1: 1 TRUE 1
2: 1 TRUE 1
3: 1 FALSE 1
4: 1 TRUE 2
5: 1 TRUE 2
6: 1 FALSE 2
7: 1 TRUE 3
8: 2 TRUE 1
9: 2 TRUE 1
10: 2 TRUE 1
11: 2 FALSE 1
12: 2 TRUE 2
13: 2 TRUE 2
14: 2 TRUE 2
15: 3 FALSE 0
16: 3 FALSE 0
17: 3 FALSE 0
18: 3 TRUE 1
19: 3 FALSE 1
20: 3 TRUE 2
21: 3 FALSE 2
22: 3 FALSE 0
23: 3 FALSE 0
24: 3 FALSE 0
25: 3 TRUE 3
id logical yourSEQ

without using rle in dtmtd2 and also some timings:
dplyrmtd0 <- function() {
test %>%
group_by(id) %>%
mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>%
mutate(sequence2 = if_else(logical == F & lag(logical) == F, 0L, sum_rle, missing = 0L))
}
setDT(test)
makeSeq <- function(x) {
res <- ifelse(!x&!lag(x,default = F),T,x) %>% {!.} %>% lag(default=T) %>% cumsum
IND2F<- ifelse(!x&!lag(x,default = F),T,x) != x
res[IND2F] <- 0
res[!IND2F] <- rleidv(res[!IND2F])
return(res)
}
dt0 <- copy(test)
dtmtd0 <- function() {
dt0[,yourSEQ:=makeSeq(logical),by="id"]
}
dt1 <- copy(test)
dtmtd1 <- function() {
dt1[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id
][new_seq != 0, new_seq := rleid(new_seq), by = id][]
}
dt4 <- copy(test)
dtmtd2 <- function() {
dt4[, sequence := {
idx <- cumsum(diff(c(FALSE, logical))==1L)
mask <- shift(logical, fill=FALSE) | logical
idx * mask
}, by=id]
}
microbenchmark(dplyrmtd0(), dtmtd0(), dtmtd1(), dtmtd2(), times=5L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval
dplyrmtd0() 375.6089 376.7271 433.1885 380.7428 443.8844 588.9791 5
dtmtd0() 481.5189 487.1245 492.9527 495.6855 500.1588 500.2759 5
dtmtd1() 146.0376 147.0163 154.7501 152.7157 154.2976 173.6831 5
dtmtd2() 106.3401 107.7728 112.7580 108.5239 119.4398 121.7131 5
data:
library(data.table)
library(dplyr)
library(microbenchmark)
M <- 1e6
test <- data.frame(id=sample(LETTERS, M, replace=TRUE) ,
logical=sample(c(TRUE, FALSE), M, replace=TRUE))
test <- test[order(test$id),]

You could use the cumsum for your rle values, then you have to go back and fix the sequential FALSE values.
library(dplyr)
test %>%
group_by(id) %>%
mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>%
mutate(sequence2 = if_else(logical == F & lag(logical) == F, 0L, sum_rle, missing = 0L)) %>%
print(n = 25)
# # A tibble: 25 x 5
# # Groups: id [3]
# id logical sequence sum_rle sequence2
# <int> <lgl> <int> <int> <int>
# 1 1 TRUE 1 1 1
# 2 1 TRUE 1 1 1
# 3 1 FALSE 1 1 1
# 4 1 TRUE 2 2 2
# 5 1 TRUE 2 2 2
# 6 1 FALSE 2 2 2
# 7 1 TRUE 3 3 3
# 8 2 TRUE 1 1 1
# 9 2 TRUE 1 1 1
# 10 2 TRUE 1 1 1
# 11 2 FALSE 1 1 1
# 12 2 TRUE 2 2 2
# 13 2 TRUE 2 2 2
# 14 2 TRUE 2 2 2
# 15 3 FALSE 0 0 0
# 16 3 FALSE 0 0 0
# 17 3 FALSE 0 0 0
# 18 3 TRUE 1 1 1
# 19 3 FALSE 1 1 1
# 20 3 TRUE 2 2 2
# 21 3 FALSE 2 2 2
# 22 3 FALSE 0 2 0
# 23 3 FALSE 0 2 0
# 24 3 FALSE 0 2 0
# 25 3 TRUE 3 3 3
if you prefer a really concise version of the same thing...
library(dplyr)
group_by(test, id) %>%
mutate(sequence = if_else(!logical & !lag(logical), 0L,
with(rle(logical), rep(cumsum(values), lengths)),
missing = 0L))

Related

How to count cases at intervarls with conditions? for a tibble

So I hope I can express my question, here I have the following example that I made up:
result <- c(1,1,1,1,1,1,1,1,1,1)
con1 <- c(1,2,2,2,1,1,2,2,2,2)
con2 <- c(2,1,2,2,1,1,2,2,2,1)
con3 <- c(2,2,1,1,1,2,2,2,2,1)
con4 <- c(2,1,2,2,1,1,2,1,1,2)
con5 <- c(1,2,2,2,1,2,2,2,2,1)
a <- tibble(Result=result,Con1=con1,Con2=con2,Con3=con3,Con4=con4,Con5=con5)
The above code gives me the following tibble, where each row is a patient:
> a
# A tibble: 10 x 6
Result Con1 Con2 Con3 Con4 Con5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 2 2 1
2 1 2 1 2 1 2
3 1 2 2 1 2 2
4 1 2 2 1 2 2
5 1 1 1 1 1 1
6 1 1 1 2 1 2
7 1 2 2 2 2 2
8 1 2 2 2 1 2
9 1 2 2 2 1 2
10 1 2 1 1 2 1
The Result are cases that are positive for a mayor illnes (thats why all are 1's) while the Con_i are yes or no question for the patient where 1=yes and 2=no, I want to get the number of patients that said yes to: 0 questions, 1 questions, 2-3 questions and 4 or more questions.
So far I've tried to do this:
a1 <-a %>% add_column(X=1)
a1$X <- case_when(a$Con1==2 & a$Con2==2 & a$Con3==2 & a$Con4==2 & a$Con5==2 ~ 0,
a$Con1==1 & a$Con2==2 & a$Con3==2 & a$Con4==2 & a$Con5==2 |
a$Con1==2 & a$Con2==1 & a$Con3==2 & a$Con4==2 & a$Con5==2|
a$Con1==2 & a$Con2==2 & a$Con3==1 & a$Con4==2 & a$Con5==2|
a$Con1==2 & a$Con2==2 & a$Con3==2 & a$Con4==1 & a$Con5==2|
a$Con1==2 & a$Con2==2 & a$Con3==2 & a$Con4==2 & a$Con5==1 ~ 1)
table <- a1 %>% group_by(X) %>% count(X,Result)
table
> table
# A tibble: 3 x 3
# Groups: X [3]
X Result n
<dbl> <dbl> <int>
1 0 1 1
2 1 1 4
3 NA 1 5
But I know is not the most efficient way, plus i would need to make all the combinations for 2-3 cases and 4+ cases and is not scalable, so I'm looking for a much easier way to do it and scale it, hope I can get your help and thanks in advance!
Perhaps the simplest:
table(rowSums(a[,-1] < 2))
# 0 1 2 3 5 <--- counts of "1" in each row
# 1 4 2 2 1 <--- number of patients with that count
Since you need to group 2-3 and 4+, then
table(cut(rowSums(a[,-1] < 2), c(0, 1, 2, 4, Inf), include.lowest = TRUE))
# [0,1] (1,2] (2,4] (4,Inf]
# 5 2 2 1
While the logic is using < 2, it's just as easy to check for == 1L or similar equality.
Tracing this, step by step:
a[,-1] == 1
# Con1 Con2 Con3 Con4 Con5
# [1,] TRUE FALSE FALSE FALSE TRUE
# [2,] FALSE TRUE FALSE TRUE FALSE
# [3,] FALSE FALSE TRUE FALSE FALSE
# [4,] FALSE FALSE TRUE FALSE FALSE
# [5,] TRUE TRUE TRUE TRUE TRUE
# [6,] TRUE TRUE FALSE TRUE FALSE
# [7,] FALSE FALSE FALSE FALSE FALSE
# [8,] FALSE FALSE FALSE TRUE FALSE
# [9,] FALSE FALSE FALSE TRUE FALSE
# [10,] FALSE TRUE TRUE FALSE TRUE
rowSums(a[,-1] == 1)
# [1] 2 2 1 1 5 3 0 1 1 3
That last is the number of 1s for each "patient" (row).
From this, I count one 0, four 1s, two 2s plus two 3s, zero 4s plus one 5. This should total 5, 2, 2, 1 ... so #andrew_reece is correct, let's use cut(...,right=FALSE):
table(cut(rowSums(a[,-1] < 2), c(0, 1, 2, 4, Inf), right = FALSE))
# [0,1) [1,2) [2,4) [4,Inf)
# 1 4 4 1
I should have caught earlier the [0,1] (previous answer), indicating 0 and 1 are close-ended, meaning both 0 and 1 are included in the same bin.
An option with Reduce and table
table(Reduce(`+`, lapply(a[-1], `<`, 2)))
# 0 1 2 3 5
#1 4 2 2 1
Pivot your data so all the Con vars are a column, and the yes/no values for each Con sit in a separate column. Then you can use group_by and summarise operations to get your grouping:
a %>%
mutate(patient = letters[row_number()]) %>%
pivot_longer(starts_with("Con")) %>%
group_by(patient) %>%
summarise(yes = sum(value == 1),
no = sum(value == 2)) %>%
group_by(yes) %>%
summarise(yes_ct = n()) %>%
mutate(yes_grp = case_when(
yes %in% 2:3 ~ "2-3",
yes >= 4 ~ "ge4",
TRUE ~ as.character(yes)
)) %>%
group_by(yes_grp) %>%
summarise(ct = sum(yes_ct))
# A tibble: 4 x 2
yes_grp ct
<chr> <int>
1 0 1
2 1 4
3 2-3 4
4 ge4 1
I made an explicit patient variable (just row numbers, basically) to make pivot and group operations easier.
Try this:
library(data.table)
df <- setDT(a) - 1
df$sum <- 5 - rowSums( df[,2:6] )
freq <- data.table(table(df$sum))
names(freq) <- c('Questions_Yes', 'Patients')
freq <- freq[,`:=`(
Questions_Yes = case_when(
Questions_Yes %in% c(2:3) ~ "2-3",
Questions_Yes >= 4 ~ "4+",
TRUE ~ as.character(Questions_Yes)
))
][, .(Patients = sum(Patients)), by = Questions_Yes]
Questions_Yes Patients
1: 0 1
2: 1 4
3: 2-3 4
4: 4+ 1

R: creating variable conditionally based off two shift statements

I have a data set like the following:
id age mod
1 1 1
1 5 0
1 6 1
1 7 1
1 9 1
2 3 0
2 4 1
And I'd like to create the variable first and give it a value of true only for the first occurrence of each episode of mod (each episode begins when mod==1). An episode can be defined as a series (or standalone day) of mod==1 in which age is incrementing by 1 or age is incrementing by 2. In other words, if age== 2 and mod==1, age==3 and mod==0 and age==4 and mod==1, ages 2-4 are still apart of the same series because they are still within 2 days of each other.
So ideally the final data set would look like this:
id age mod first
1 1 1 TRUE
1 5 0 FALSE
1 6 1 TRUE
1 7 1 FALSE
1 9 1 FALSE
2 3 0 FALSE
2 4 1 TRUE
I've tried using lag statements within data.table and have not be successful.
The simple condition is that mod[i]==1 & mod[i-1]==0. That is, if a row has a mod value of 1 and the previous row has a mod value of 0, then it is flagged as first.
This should work:
d = read.table(text='id age mod
1 1 1
1 5 0
1 6 1
1 7 1
1 9 1
2 3 0
2 4 1', header=T)
d$first[1] = (d$mod[1]==1)
d$first[2:nrow(d)] = (d$mod[2:nrow(d)]==1 & d$mod[1:nrow(d)-1]==0)
I believe this should meet your criteria. This could probably be done in a long, convoluted one-liner, but I think breaking it into multiple steps for the sake of clarity will meet the same ends without a meaningful performance hit.
library(data.table)
## Note - I added a couple more sample rows here
DT <- fread("id age mod
1 1 1
1 5 0
1 6 1
1 7 1
1 9 1
2 3 0
2 4 1
3 1 1
3 5 0
3 9 1
3 10 1")
## Create a column to track jumps in age
DT[, agejump := age - shift(age, n = 1L, fill = NA, type = "lag") > 2L, by = .(id)]
## Create a column to define continued sequences
DT[, continued := mod == 1 & shift(mod, n = 1L, fill = NA, type = "lag") == 1L, by = .(id)]
## backfill the first row of NA's for each id for both variables with FALSE
DT[DT[, .I[1], by = .(id)]$V1, c("agejump","continued") := FALSE]
## define first
DT[,first := (mod == 1 & continued == FALSE | mod == 1 & continued == TRUE & agejump == TRUE)]
print(DT)
# id age mod agejump continued first
# 1: 1 1 1 FALSE FALSE TRUE
# 2: 1 5 0 TRUE FALSE FALSE
# 3: 1 6 1 FALSE FALSE TRUE
# 4: 1 7 1 FALSE TRUE FALSE
# 5: 1 9 1 FALSE TRUE FALSE
# 6: 2 3 0 FALSE FALSE FALSE
# 7: 2 4 1 FALSE FALSE TRUE
# 8: 3 1 1 FALSE FALSE TRUE
# 9: 3 5 0 TRUE FALSE FALSE
# 10: 3 9 1 TRUE FALSE TRUE
# 11: 3 10 1 FALSE TRUE FALSE

How to remove a tail with NA in r?

I have a vector:
a <- c(NA,1:5,NA,NA,1:3, rep(NA,round(runif(1,0,100))))
I need to remove the trailing NAs. Desired result:
c(NA, 1:5, NA, NA, 1:3)
You can do
a[1:max(which(!is.na(a)))]
# [1] NA 1 2 3 4 5 NA NA 1 2 3
We subset the vector from position 1 to the last non NA value.
One option would be
a[rev(cumprod(rev(is.na(a)))) == 0]
# [1] NA 1 2 3 4 5 NA NA 1 2 3
Here are the steps:
(a <- c(NA, 1:5, NA, NA, 1:3, NA, NA))
# [1] NA 1 2 3 4 5 NA NA 1 2 3 NA NA
is.na(a)
# [1] TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
rev(is.na(a))
# [1] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
cumprod(rev(is.na(a)))
# [1] 1 1 0 0 0 0 0 0 0 0 0 0 0
rev(cumprod(rev(is.na(a))))
# [1] 0 0 0 0 0 0 0 0 0 0 0 1 1
You can find the maximum position which is not an NA and subset accordingly
> a[1:max(which(!is.na(a)))]
[1] NA 1 2 3 4 5 NA NA 1 2 3
Also a possibility:
a[cumsum(!is.na(a)) != max(cumsum(!is.na(a))) * is.na(a)]
[1] NA 1 2 3 4 5 NA NA 1 2 3
In idividual steps:
is.na(a)
[1] TRUE FALSE FALSE FALSE FALSE
cumsum(!is.na(a))
[1] 0 1 2 3 4
cumsum(!is.na(a)) != max(cumsum(!is.na(a)))
[1] TRUE TRUE TRUE TRUE TRUE
cumsum(!is.na(a)) != max(cumsum(!is.na(a))) * is.na(a)
[1] TRUE TRUE TRUE TRUE TRUE
Just for fun, a little benchmarking:
library(microbenchmark)
a <- rep(a, 1e5)
microbenchmark(
markus = a[1:max(which(!is.na(a)))],
Julius_Vainora = a[rev(cumprod(rev(is.na(a)))) == 0],
Kim = rm_NA_tail(a),
tmfmnk = a[cumsum(!is.na(a)) != max(cumsum(!is.na(a))) * is.na(a)],
nsinghs = a[1:(length(a) - rle(is.na(rev(a)))$lengths[1])],
times = 5
)
Unit: milliseconds
expr min lq mean median uq max neval cld
markus 150.7346 153.0674 156.4194 153.3031 159.4718 165.5201 5 a
Julius_Vainora 393.8520 418.8186 616.3269 703.4022 749.6600 815.9018 5 bc
Kim 370.7680 382.1826 536.0828 632.0031 642.1882 653.2720 5 bc
tmfmnk 390.2626 415.2378 466.4245 415.8310 423.3828 687.4082 5 b
nsinghs 537.0404 781.1403 798.6929 793.1027 842.6777 1039.5033 5 c
I think this works:
rm_NA_tail <- function(a) {
if (is.na(a[length(a)])) {
return(a[is.na(match(data.table::rleid(a), max(data.table::rleid(a))))])
} else {
return(a)
}
}
This can be done using rle()
a[1:(length(a) - rle(is.na(rev(a)))$lengths[1])]
# [1] NA 1 2 3 4 5 NA NA 1 2 3
rle(is.na(rev(a)))$lengths[1] gets the count of trailing NA in the vector, then subtract it from the total vector length to get the index up to which you want to keep the vector.

One Hot Encoding for top categories, NA, and remaining subsumed as 'others' in R

I want to one hot encode my variables only for the top categories and NA and 'others'.
So in this simplified example, hot encoding b where freq > 1 and NA:
id <- c(1, 2, 3, 4, 5, 6)
b <- c(NA, "A", "C", "A", "B", "C")
c <- c(2, 3, 6, NA, 4, 7)
df <- data.frame(id, b, c)
id b c
1 1 <NA> 2
2 2 A 3
3 3 C 6
4 4 A NA
5 5 B 4
6 6 C 7
table <- as.data.frame(table(df$b))
Var1 Freq
1 A 2
2 B 1
3 C 2
table_top <- table[table$Freq > 1,]
Var1 Freq
1 A 2
3 C 2
Now, I would like to have something like this
id b_NA c b_A b_C b_Others
1 1 2 0 0 0
2 0 3 1 0 0
3 0 6 0 1 0
4 0 NA 1 0 0
5 0 4 0 0 1
6 0 7 0 1 0
I have tried with subsetting df
table_top <- as.vector(table_top$Var1)
table_only_top <- subset(df, b %in% table_top)
table_only_top
a b c
2 1 A 3
3 2 C 6
4 2 A NA
6 3 C 7
However, now I am stuck how to get to the output. In my real data I have many more categories than here, so using the names from the output is not an option. Also the others category in my real output exists of many categories.
Any hint is highly appreciated :)
Fast and sexy with data.table and mltools:
> one_hot(dt, naCols = TRUE, sparsifyNAs = TRUE)
id cat_NA cat_A cat_C cat_Others freq
1: 1 1 0 0 0 2
2: 2 0 1 0 0 3
3: 3 0 0 1 0 6
4: 4 0 1 0 0 NA
5: 5 0 0 0 1 4
6: 6 0 0 1 0 7
Code
Load libraries
library(dplyr)
library(data.table)
library(mltools)
Transform data
# Kick out all with freq == 1 and below
df <- df %>%
# Group by variables that will be onehotted
group_by(cat) %>%
# Add a count per group item column
mutate(count = n()) %>%
# Ungroup for next steps
ungroup() %>%
# Change all that have a count of 1 or below to "Others".
# If cat was a factor, we would get numeric results at this step.
mutate(cat = ifelse(!is.na(cat) & count <= 1, "Others", cat),
# Only now we turn it into a factor for the one_hot function
cat = as.factor(cat)) %>%
# Drop the count column
select(id, cat, freq)
# Turn into data.table
dt <- as.data.table(df)
Check intermediate result
> dt
id cat freq
1: 1 <NA> 2
2: 2 A 3
3: 3 C 6
4: 4 A NA
5: 5 Others 4
6: 6 C 7
Data
id <- c(1, 2, 3, 4, 5, 6)
cat <- c(NA, "A", "C", "A", "B", "C")
freq <- c(2, 3, 6, NA, 4, 7)
# It is important to have no other factor variables other
# than the variable(s) you one want to one hot. For that reason
# the automatic factoring is turned off.
df <- data.frame(id, cat, freq,
stringsAsFactors = FALSE)
> df
id cat freq
1 1 <NA> 2
2 2 A 3
3 3 C 6
4 4 A NA
5 5 B 4
6 6 C 7
Definitely not an elegant solution but it should work:
library(tideverse)
library(reshape2)
df %>%
gather(var, val, -id) %>%
add_count(var, val) %>%
mutate(res = ifelse(var == "b" & n > 1, 1, 0),
val = paste("b_", val, sep = "")) %>%
filter(var == "b" & n != 1) %>%
dcast(id ~ val, value.var = "res") %>%
full_join(df, by = c("id" = "id")) %>%
mutate(b_NA = ifelse(is.na(b), 1, 0)) %>%
mutate_at(vars(contains("b_")), funs(replace(., is.na(.), 0))) %>%
mutate(b_OTHERS = ifelse(rowSums(.[grep("b_", names(.))]) != 0, 0, 1))
id b_A b_C b c b_NA b_OTHERS
1 2 1 0 A 3 0 0
2 3 0 1 C 6 0 0
3 4 1 0 A NA 0 0
4 6 0 1 C 7 0 0
5 1 0 0 <NA> 2 1 0
6 5 0 0 B 4 0 1
You could cbind data.frames based on your different criteria.
# simple conditions -------------------------------------------------------
df <- df_orig[,-1]
df_na <- is.na(df)
colnames(df_na) <- paste0(colnames(df),"_NA")
df_A <- df=="A"
colnames(df_A) <- paste0(colnames(df),"_A")
df_C <- df=="C"
colnames(df_C) <- paste0(colnames(df),"_C")
# for counts you can use sapply with one loop -----------------------------
df_counts <- df
for(j in 1:ncol(df)) {
counts <- sapply(1:nrow(df), function(x) sum(df[x,j]==df[,j], na.rm=T) )
df_counts[,j] <- counts
}
df_counts <- df
# or avoid explicit loops altogether --------------------------------------
df_counts2 <- sapply(1:ncol(df), function(y) sapply(1:nrow(df), function(x) sum(df[x,y]==df[,y], na.rm=T) ) )
colnames(df_counts2 ) <- paste0(colnames(df),"_counts")
# cbind df's -------------------------------------------------------------
df_full <- cbind(df_orig, df_na, df_A, df_C, df_counts2)
# check if frequency greater then 1 or NA ---------------------------------
df_full$result <- df_full[,10:11] >=2 | df_full[,4:5]
df_full
The harder part is I suppose to compute the frequencies, here I included two ways. the result is:
id b c b_NA c_NA b_A c_A b_C c_C b_counts c_counts result.b_NA result.c_NA
1 1 <NA> 2 FALSE FALSE FALSE FALSE FALSE FALSE 1 1 FALSE FALSE
2 2 A 3 FALSE FALSE TRUE FALSE FALSE FALSE 2 1 TRUE FALSE
3 3 C 6 FALSE FALSE FALSE FALSE TRUE FALSE 2 1 TRUE FALSE
4 4 A NA FALSE TRUE TRUE NA FALSE NA 2 0 TRUE TRUE
5 5 B 4 FALSE FALSE FALSE FALSE FALSE FALSE 1 1 FALSE FALSE
6 6 C 7 FALSE FALSE FALSE FALSE TRUE FALSE 2 1 TRUE FALSE
You can modify the columns based on your conditions. Hope that helps

Replace logical values (TRUE / FALSE) with numeric (1 / 0)

I am exporting data from R with the command:
write.table(output,file = "data.raw", na "-9999", sep = "\t", row.names = FALSE, col.names = FALSE)
It exports my data correctly, but it exports all of the logical variables as TRUE and FALSE.
I need to read the data into another program that can only process numeric values. Is there an efficient way to convert logical columns to numeric 1s and 0s during the export? I have a large number of numeric variables, so I was hoping to automatically loop through all the variables in the data.table
Alternatively, my output object is a data.table. Is there an efficient way to convert all the logical variables in a data.table into numeric variables?
In case it is helpful, here is some code to generate a data.table with a logical variable in it (it is not a large number of logical variables, but enough to use on example code):
DT = data.table(cbind(1:100, rnorm(100) > 0)
DT[ , V3:= V2 == 1 ]
DT[ , V4:= V2 != 1 ]
For a data.frame, you could convert all logical columns to numeric with:
# The data
set.seed(144)
dat <- data.frame(V1=1:100,V2=rnorm(100)>0)
dat$V3 <- dat$V2 == 1
head(dat)
# V1 V2 V3
# 1 1 FALSE FALSE
# 2 2 TRUE TRUE
# 3 3 FALSE FALSE
# 4 4 FALSE FALSE
# 5 5 FALSE FALSE
# 6 6 TRUE TRUE
# Convert all to numeric
cols <- sapply(dat, is.logical)
dat[,cols] <- lapply(dat[,cols], as.numeric)
head(dat)
# V1 V2 V3
# 1 1 0 0
# 2 2 1 1
# 3 3 0 0
# 4 4 0 0
# 5 5 0 0
# 6 6 1 1
In data.table syntax:
# Data
set.seed(144)
DT = data.table(cbind(1:100,rnorm(100)>0))
DT[,V3 := V2 == 1]
DT[,V4 := FALSE]
head(DT)
# V1 V2 V3 V4
# 1: 1 0 FALSE FALSE
# 2: 2 1 TRUE FALSE
# 3: 3 0 FALSE FALSE
# 4: 4 0 FALSE FALSE
# 5: 5 0 FALSE FALSE
# 6: 6 1 TRUE FALSE
# Converting
(to.replace <- names(which(sapply(DT, is.logical))))
# [1] "V3" "V4"
for (var in to.replace) DT[, (var):= as.numeric(get(var))]
head(DT)
# V1 V2 V3 V4
# 1: 1 0 0 0
# 2: 2 1 1 0
# 3: 3 0 0 0
# 4: 4 0 0 0
# 5: 5 0 0 0
# 6: 6 1 1 0
Simplest way of doing this!
Multiply your matrix by 1
For example:
A <- matrix(c(TRUE,FALSE,TRUE,TRUE,TRUE,FALSE,FALSE,TRUE),ncol=4)
A
# [,1] [,2] [,3] [,4]
# [1,] TRUE TRUE TRUE FALSE
# [2,] FALSE TRUE FALSE TRUE
B <- 1*A
B
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 0 1 0 1
(You could also add zero: B <- 0 + A)
What about just a:
dat <- data.frame(le = letters[1:10], lo = rep(c(TRUE, FALSE), 5))
dat
le lo
1 a TRUE
2 b FALSE
3 c TRUE
4 d FALSE
5 e TRUE
6 f FALSE
7 g TRUE
8 h FALSE
9 i TRUE
10 j FALSE
dat$lo <- as.numeric(dat$lo)
dat
le lo
1 a 1
2 b 0
3 c 1
4 d 0
5 e 1
6 f 0
7 g 1
8 h 0
9 i 1
10 j 0
or another approach could be with dplyr in order to retain the previous column if the case (no one knows) your data will be imported in R.
library(dplyr)
dat <- dat %>% mutate(lon = as.numeric(lo))
dat
Source: local data frame [10 x 3]
le lo lon
1 a TRUE 1
2 b FALSE 0
3 c TRUE 1
4 d FALSE 0
5 e TRUE 1
6 f FALSE 0
7 g TRUE 1
8 h FALSE 0
9 i TRUE 1
10 j FALSE 0
Edit: Loop
I do not know if my code here is performing but it checks all column and change to numerical only those that are logical. Of course if your TRUE and FALSE are not logical but character strings (which might be remotely) my code won't work.
for(i in 1:ncol(dat)){
if(is.logical(dat[, i]) == TRUE) dat[, i] <- as.numeric(dat[, i])
}
If there are multiple columns, you could use set (using #josilber's example)
library(data.table)
Cols <- which(sapply(dat, is.logical))
setDT(dat)
for(j in Cols){
set(dat, i=NULL, j=j, value= as.numeric(dat[[j]]))
}
As Ted Harding pointed out in the R-help mailing list, one easy way to convert logical objects to numeric is to perform an arithmetic operation on them. Convenient ones would be * 1 and + 0, which will keep the TRUE/FALSE == 1/0 paradigm.
For your mock data (I've changed the code a bit to use regular R packages and to reduce size):
df <- data.frame(cbind(1:10, rnorm(10) > 0))
df$X3 <- df$X2 == 1
df$X4 <- df$X2 != 1
The dataset you get has a mixture of numeric and boolean variables:
X1 X2 X3 X4
1 1 0 FALSE TRUE
2 2 0 FALSE TRUE
3 3 1 TRUE FALSE
4 4 1 TRUE FALSE
5 5 1 TRUE FALSE
6 6 0 FALSE TRUE
7 7 0 FALSE TRUE
8 8 1 TRUE FALSE
9 9 0 FALSE TRUE
10 10 1 TRUE FALSE
Now let
df2 <- 1 * df
(If your dataset contains character or factor variables, you will need to apply this operation to a subset of df filtering out those variables)
df2 is equal to
X1 X2 X3 X4
1 1 0 0 1
2 2 0 0 1
3 3 1 1 0
4 4 1 1 0
5 5 1 1 0
6 6 0 0 1
7 7 0 0 1
8 8 1 1 0
9 9 0 0 1
10 10 1 1 0
Which is 100% numeric, as str(df2) will show you.
Now you can safely export df2 to your other program.
One line solution
Using the following code we take all the logical columns and make them numeric.
library(magrittr)
dat %<>% mutate_if(is.logical,as.numeric)
The same as #saebod but with usual pipe.
library(dplyr)
dat <- dat %>% mutate_if(is.logical, as.numeric)

Resources