How do I create this variable in R? - r

Consider the following test data set using R:
testdat<-data.frame("id"=c(rep(1,5),rep(2,5),rep(3,5)),
"period"=rep(seq(1:5),3),
"treat"=c(c(0,1,1,1,0),c(0,0,1,1,1),c(0,0,1,1,1)),
"state"=c(rep(0,5),c(0,1,1,1,1),c(0,0,0,1,1)),
"int"=c(rep(0,13),1,1))
testdat
id period treat state int
1 1 1 0 0 0
2 1 2 1 0 0
3 1 3 1 0 0
4 1 4 1 0 0
5 1 5 0 0 0
6 2 1 0 0 0
7 2 2 0 1 0
8 2 3 1 1 0
9 2 4 1 1 0
10 2 5 1 1 0
11 3 1 0 0 0
12 3 2 0 0 0
13 3 3 1 0 0
14 3 4 1 1 1
15 3 5 1 1 1
The first 4 variables are what I have, int is the variable I want to make. It is similar to an interaction between treat and state, but that would include 1s in rows 8-10 which is not desired. Essentially, I only want an interaction when state changes during treat but not otherwise. Any thoughts on how to create this (especially on a large scale for a dataset with a million observations)?
Edit: For clarification on why I want this measure. I want to run something like the following regression:
lm(outcome~treat+state+I(treat*state))
But I'm really interested in the interaction only when treat straddles a change in state. If I were to run the above regression, I(treat*state) pools the effect of the interaction I'm interested in and when treat is 1 entirely when state is 1. In theory, I think these will have two different effects so I need to disaggregate them. I hope this makes sense and I am happy to provide additional details.

I'm sure this is possible in base R, but here's a tidyversion:
library(dplyr)
testdat %>%
group_by(grp = cumsum(c(FALSE, diff(treat) > 0))) %>%
mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
ungroup() %>%
select(-grp)
# # A tibble: 15 x 6
# id period treat state int int2
# <dbl> <int> <dbl> <dbl> <dbl> <int>
# 1 1 1 0 0 0 0
# 2 1 2 1 0 0 0
# 3 1 3 1 0 0 0
# 4 1 4 1 0 0 0
# 5 1 5 0 0 0 0
# 6 2 1 0 0 0 0
# 7 2 2 0 1 0 0
# 8 2 3 1 1 0 0
# 9 2 4 1 1 0 0
# 10 2 5 1 1 0 0
# 11 3 1 0 0 0 0
# 12 3 2 0 0 0 0
# 13 3 3 1 0 0 0
# 14 3 4 1 1 1 1
# 15 3 5 1 1 1 1
Alternative logic for grouping uses run-length encoding, effectively the same (suggested you https://stackoverflow.com/a/35313426):
testdat %>%
group_by(grp = { yy <- rle(treat); rep(seq_along(yy$lengths), yy$lengths); }) %>%
# ...
And as in that answer, I wish dplyr had an equivalent to data.table's rleid. The expected logic is to be able to group by consecutive same-values in a column, but not the same value across all rows. If you look at this mid-pipe (before cleaning up grp), you'd see
testdat %>%
group_by(grp = { yy <- rle(treat); rep(seq_along(yy$lengths), yy$lengths); }) %>%
mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
ungroup()
# # A tibble: 15 x 7
# id period treat state int grp int2
# <dbl> <int> <dbl> <dbl> <dbl> <int> <int>
# 1 1 1 0 0 0 1 0
# 2 1 2 1 0 0 2 0
# 3 1 3 1 0 0 2 0
# 4 1 4 1 0 0 2 0
# 5 1 5 0 0 0 3 0
# 6 2 1 0 0 0 3 0
# 7 2 2 0 1 0 3 0
# 8 2 3 1 1 0 4 0
# 9 2 4 1 1 0 4 0
# 10 2 5 1 1 0 4 0
# 11 3 1 0 0 0 5 0
# 12 3 2 0 0 0 5 0
# 13 3 3 1 0 0 6 0
# 14 3 4 1 1 1 6 1
# 15 3 5 1 1 1 6 1
But that's just wishful thinking. I guess I could also do
my_rleid <- function(x) { yy <- rle(x); rep(seq_along(yy$lengths), yy$lengths); }
testdat %>%
group_by(grp = my_rleid(treat)) %>%
# ...

Here is a base R way using rle and ave.
r <- rle(testdat$treat)
r$values <- cumsum(r$values) + seq_along(r$values)
int2 <- +(ave(testdat$state, inverse.rle(r), FUN = function(x) x != x[1]) & testdat$treat == 1)
testdat <- cbind(testdat, int2)
testdat
# id period treat state int int2
#1 1 1 0 0 0 0
#2 1 2 1 0 0 0
#3 1 3 1 0 0 0
#4 1 4 1 0 0 0
#5 1 5 0 0 0 0
#6 2 1 0 0 0 0
#7 2 2 0 1 0 0
#8 2 3 1 1 0 0
#9 2 4 1 1 0 0
#10 2 5 1 1 0 0
#11 3 1 0 0 0 0
#12 3 2 0 0 0 0
#13 3 3 1 0 0 0
#14 3 4 1 1 1 1
#15 3 5 1 1 1 1
Timings
Since the question mentions performance as an issue, the real use case data set has 1 million rows, here are the timings of my solution and the one by r2evans.
Write both solutions as functions.
library(dplyr)
f1 <- function(X){
r <- rle(X$treat)
r$values <- cumsum(r$values) + seq_along(r$values)
int2 <- +(ave(X$state, inverse.rle(r), FUN = function(x) x != x[1]) & testdat$treat == 1)
cbind(X, int2)
}
f2 <- function(X){
X %>%
group_by(grp = cumsum(c(FALSE, diff(treat) > 0))) %>%
mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
ungroup() %>%
select(-grp)
}
How many copies of testdat are needed.
log2(1e6/nrow(testdat))
#[1] 16.02468
df1 <- testdat
for(i in 1:15) df1 <- rbind(df1, df1)
nrow(df1)
#[1] 491520
That is half a million, should be enough for a test.
mb <- microbenchmark::microbenchmark(
base = f1(df1),
dplyr = f2(df1),
times = 10
)
rm(df1) # tidy up
print(mb, unit = "relative", order = "median")
#Unit: relative
# expr min lq mean median uq max neval
# base 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
# dplyr 1.283237 1.359772 1.331494 1.369062 1.316815 1.256968 10
The base R solution is around 36% faster.

Another base version using also ave.
testdat$treat & c(0, diff(testdat$state))==1 goes to TRUE when state changes from 0 to 1 when treat is 1. testdat$treat & testdat$state goes to 1 when both are 1.
testdat$int2 <- +ave(testdat$treat & c(0, diff(testdat$state))==1,
cumsum(c(0, abs(diff(testdat$treat & testdat$state)))),
FUN=function(x) rep(x[1], length(x)))
testdat
# id period treat state int int2
#1 1 1 0 0 0 0
#2 1 2 1 0 0 0
#3 1 3 1 0 0 0
#4 1 4 1 0 0 0
#5 1 5 0 0 0 0
#6 2 1 0 0 0 0
#7 2 2 0 1 0 0
#8 2 3 1 1 0 0
#9 2 4 1 1 0 0
#10 2 5 1 1 0 0
#11 3 1 0 0 0 0
#12 3 2 0 0 0 0
#13 3 3 1 0 0 0
#14 3 4 1 1 1 1
#15 3 5 1 1 1 1
Or using Reduce:
testdat$int2 <- Reduce(function(x,y) {if(y==-1) 0 else if(x==1 || y==1) 1 else 0},
(testdat$treat & c(0, diff(testdat$state))==1) -c(0, diff(testdat$treat &
testdat$state) == -1), accumulate = TRUE)
Timings (continue from #Rui-Barradas):
f3 <- function(testdat) {cbind(testdat, int2=+ave(testdat$treat &
c(0, diff(testdat$state))==1, cumsum(c(0, abs(diff(testdat$treat &
testdat$state)))), FUN=function(x) rep(x[1], length(x))))}
f4 <- function(testdat) {cbind(testdat, int2=Reduce(function(x,y) {
if(y==-1) 0 else if(x==1 || y==1) 1 else 0}, (testdat$treat & c(0,
diff(testdat$state))==1) -c(0, diff(testdat$treat & testdat$state) == -1),
accumulate = TRUE))}
microbenchmark::microbenchmark(base = f1(df1), dplyr = f2(df1),
GKi1 = f3(df1), GKi2 = f4(df1), times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# base 1132.7269 1188.7439 1233.106 1226.8532 1293.9901 1364.8358 10 c
# dplyr 1376.0856 1436.4027 1466.418 1458.7240 1509.8990 1559.7976 10 d
# GKi1 960.5438 1006.8803 1029.105 1022.6114 1065.7427 1074.6027 10 b
# GKi2 588.0484 667.2482 694.415 699.0845 739.5523 786.1819 10 a

Related

Create an index variable for blocks of values

I have a dataframe "data" with a grouping variable "grp" and a binary classification variable "classif". For each group in grp, I want to create a "result" variable creating an index of separate blocks of 0 in the classif variable. For the time being, I don't know how to reset the count for each level of the grouping variable and I don't find a way to only create the index for blocks of 0s (ignoring the 1s).
Example data:
grp <- c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3)
classif <- c(0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,1,0)
result <- c(1,0,2,2,0,3,3,0,0,1,1,1,1,0,2,0,0,0,3,3,0,0,1,1,1,0,2,0,3)
wrong_result <- c(1,2,3,3,4,5,5,1,1,2,2,2,2,3,4,5,5,5,6,6,1,1,2,2,2,3,4,5,6)
Data <- data.frame(grp,classif,result, wrong_result)
I have tried using rleid but the following command produces "wrong_result", which is not what I'm after.
data[, wrong_result:= rleid(classif)]
data[, wrong_result:= rleid(classif), by=grp]
With dplyr, use cumsum() and lag() to find blocks of zeroes .by group. (Make sure you’re using the latest version of dplyr to use the .by argument).
library(dplyr)
Data %>%
mutate(
result2 = ifelse(
classif == 0,
cumsum(classif == 0 & lag(classif, default = 1) == 1),
0
),
.by = grp
)
grp classif result result2
1 1 0 1 1
2 1 1 0 0
3 1 0 2 2
4 1 0 2 2
5 1 1 0 0
6 1 0 3 3
7 1 0 3 3
8 2 1 0 0
9 2 1 0 0
10 2 0 1 1
11 2 0 1 1
12 2 0 1 1
13 2 0 1 1
14 2 1 0 0
15 2 0 2 2
16 2 1 0 0
17 2 1 0 0
18 2 1 0 0
19 2 0 3 3
20 2 0 3 3
21 3 1 0 0
22 3 1 0 0
23 3 0 1 1
24 3 0 1 1
25 3 0 1 1
26 3 1 0 0
27 3 0 2 2
28 3 1 0 0
29 3 0 3 3
Use rle and sequentially number the runs produced and then convert back and zero out the runs of 1's. No packages are used.
seq0 <- function(x) {
r <- rle(x)
is0 <- r$values == 0
r$values[is0] <- seq_len(sum(is0))
inverse.rle(r) * !x
}
transform(Data, result2 = ave(classif, grp, FUN = seq0))

R: Replace string with consecutive 0 less then three with 1

I have a vector like this in R:
dt = data.frame(input=c(0,0,1,1,0,0,1,0,0,0,1,1,1,0,1) )
dt
input
# 1 0
# 2 0
# 3 1
# 4 1
# 5 0
# 6 0
# 7 1
# 8 0
# 9 0
# 10 0
# 11 1
# 12 1
# 13 1
# 14 0
# 15 1
I want to replace the consecutive 0, in which the length is less than three, with 1, and save it to a new column.
#update:
I also hope that the replacement only happens when less than three 0 are sandwiched between 1. So in this condition, I will ignore the two 0 in raw 1 and 2. (or also when happening in the tail or meet NA)
For example, I want to output:
input output
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 0 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
How can I write it in the foreach loop? (I have the data with thousands of rows)
Thanks.
Create a grouping column with rleid on the 'input' column, and if the number of rows is less than 3 and all values are 0, replace with 1 or else return input
library(dplyr)
library(data.table)
dt %>%
mutate(new = cumsum(input)) %>%
group_by(grp = rleid(input)) %>%
mutate(output = if(n() <3 & all(input == 0) & all(new > 0)) 1 else input) %>%
ungroup %>%
select(-grp, -new)
-output
# A tibble: 15 × 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
Or use base R with rle
dt$output <- inverse.rle(within.list(rle(dt$input),
values[!values & lengths < 3 & seq_along(values) != 1] <- 1))
dt$output
#[1] 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1
Update after clarification:
We could now ungroup() and check if the sequence is wrapped by a 1 with lag(input==1):
dt %>%
mutate(
x= cumsum(input != lag(input, def = first(input)))
) %>%
group_by(x) %>%
mutate(x = seq_along(input),
x = last(x)) %>%
ungroup() %>%
mutate(output = case_when(input == 0 &
lag(input==1) &
x<=2 ~ 1,
TRUE ~ as.numeric(input))) %>%
select(-x)
Output:
A tibble: 15 x 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
First answer:
Here is a suggestion. But I don't understand the rows 1 and 2 in your output. "replace consecutive 0, in which the length is less than three, with 1" this is the case for row 1 and 2.
dt %>%
mutate(
x= cumsum(input != lag(input, def = first(input)))
) %>%
group_by(x) %>%
mutate(x = seq_along(input),
x = last(x)) %>%
mutate(output = case_when(input == 0 & x<=2 ~ 1,
TRUE ~ as.numeric(input))) %>%
ungroup() %>%
select(-x)
input output
<dbl> <dbl>
1 0 1
2 0 1
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
Update following reformulation of the question: This tidyverse approach simply makes use of case_when().
library(dplyr)
mutate(dt, inputX = case_when(input == 0 &
lag(input) == 1 &
lead(input) == 1 ~ 1,
input == 0 &
lag(input) == 0 &
lag(input, n = 2) == 1 &
lead(input) == 1 ~ 1,
T ~ input))
# input inputX
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 1 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
Previous solution: Having understood the requirements like Tarjae did, a tidyverse option could look as follows.
library(dplyr)
dt %>%
mutate(x = cumsum(input)) %>%
group_by(x) %>%
mutate(y = +(n() %in% 2:3)) %>%
ungroup() %>%
transmute(input = input,
inputX = if_else(y == 1, 1, input))
# # A tibble: 15 x 2
# input inputX
# <dbl> <dbl>
# 1 0 1
# 2 0 1
# 3 1 1
# 4 1 1
# 5 1 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1

How to count number of columns that have a value by a grouping variable in R?

I have data like this:
repetition Ob1 Ob2 Ob3 Ob4
1 0 0 0 1
1 0 0 3 0
1 1 3 3 0
1 2 3 3 0
2 4 0 2 2
2 4 0 3 0
2 0 0 0 0
3 0 0 0 0
3 4 0 4 0
3 0 0 0 0
I want to count the number of columns per repetition that have a certain value e.g. 1.
So in this case repetition 1 should return a 2 because Ob1 and Ob4 have a value of 1. Everything else gets a 0 because there are no other repetitions with a 1.
you can get count using dplyr package below code:
df$count <- rowSums(df[,2:5] == df$repetition)
df %>% select(repetition, count) %>% group_by(repetition) %>% summarise(count = sum(count))
# A tibble: 3 x 2
repetition count
<int> <dbl>
1 1 2
2 2 2
3 3 0
You can use by like:
by(x[-1]==1, x$repetition, function(y) sum(colSums(y) > 0))
#INDICES: 1
#[1] 2
#------------------------------------------------------------
#INDICES: 2
#[1] 0
#------------------------------------------------------------
#INDICES: 3
#[1] 0
or to return a named vector
c(by(x[-1]==1, x$repetition, function(y) sum(colSums(y) > 0)))
#1 2 3
#2 0 0

Create new column when when values repeat 3 or more times

Problem
I'm trying to create a new column (b) based on values from a previous column (a). Column a is binary, consisting of either 0's or 1's. If there are three or more 1's in a row in column a, then keep them in column b. I'm close to the desired output, but when there are two 1's in a row, the ifelse grabs the second value because it's meeting the first condition.
Desired Output–Column b
df <- data.frame(a = c(1,1,1,0,0,1,0,1,1,0,1,1,1,0,1,1,0,1,1,1,1),
b = c(1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,1))
df
a b
1 1 1
2 1 1
3 1 1
4 0 0
5 0 0
6 1 0
7 0 0
8 1 0 #
9 1 0 #
10 0 0
11 1 1
12 1 1
13 1 1
14 0 0
15 1 0 #
16 1 0 #
17 0 0
18 1 1
19 1 1
20 1 1
21 1 1
Failed Attempt...s
require(dplyr)
df_fail <- df %>% mutate(b=ifelse((lag(df$a) + df$a) > 1 |(df$a + lead(df$a) + lead(df$a,2)) >= 3, df$a,NA))
df_fail
a b
1 1 1
2 1 1
3 1 1
4 0 0
5 0 0
6 1 0
7 0 0
8 1 0
9 1 1 # should be 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 0
15 1 0
16 1 1 # should be 0
17 0 0
18 1 1
19 1 1
20 1 1
21 1 1
We can use rle from base R to change the elements that have less than 3 repeating 1s to 0
inverse.rle(within.list(rle(df$a), values[values == 1 & lengths <3] <- 0))
#[1] 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1
Or use rleid from data.table
library(data.table)
library(dplyr)
df %>%
group_by(grp = rleid(a)) %>%
mutate(b1 = if(n() <3 & all(a == 1)) 0 else a) %>%
ungroup %>%
select(-grp)

Creating a new variable by detecting max value for each id

My data set contains three variables:
id <- c(1,1,1,1,1,1,2,2,2,2,5,5,5,5,5,5)
ind <- c(0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1)
price <- c(1,2,3,4,5,6,1,2,3,4,1,2,3,4,5,6)
mdata <- data.frame(id,ind,price)
I need to create a new variable (ind2) that is if ind=0, then ind2=0.
also, if ind=1, then ind2=0, unless the price value is max, then ind2=1.
The new data looks like:
id ind ind2 price
1 0 0 1
1 0 0 2
1 0 0 3
1 0 0 4
1 0 0 5
1 0 0 6
2 1 0 1
2 1 0 2
2 1 0 3
2 1 1 4
5 1 0 1
5 1 0 2
5 1 0 3
5 1 0 4
5 1 0 5
5 1 1 6
library(dplyr)
mdata %>%
group_by(id) %>%
mutate(ind2 = +(ind == 1L & price == max(price)))
# id ind price ind2
# 1 1 0 1 0
# 2 1 0 2 0
# 3 1 0 3 0
# 4 1 0 4 0
# 5 1 0 5 0
# 6 1 0 6 0
# 7 2 1 1 0
# 8 2 1 2 0
# 9 2 1 3 0
# 10 2 1 4 1
# 11 5 1 1 0
# 12 5 1 2 0
# 13 5 1 3 0
# 14 5 1 4 0
# 15 5 1 5 0
# 16 5 1 6 1
Or if you prefer data.table
setDT(mdata)[, ind2 := +(ind == 1L & price == max(price)), by = id]
Or with base R
mdata$ind2 <- unlist(lapply(split(mdata,mdata$id),
function(x) +(x$ind == 1L & x$price == max(x$price))))

Resources