Counting conditional frequencies in R

I have a table with only one column and more than 200 rows. It contains three values: 0, 1 and 3. I'm interested only in those incidents where a 1 follows a 0. Can R count all rows where X = 1 and the value in the previous row is 0 (where X denotes the value of a row)?
It would be great if someone could help!
Best, Anna

Do you mean something like this?
# Create some sample data
set.seed(2020)
df <- data.frame(incident = sample(c(0, 1, 3), 10, replace = TRUE))
#   incident
#1         3
#2         1
#3         0
#4         0
#5         1
#6         1
#7         0
#8         0
#9         1
#10        1
sum(c(df$incident[-1] == 1, FALSE) * (df$incident == 0))
# Or: with(df, sum(c(incident[-1] == 1, FALSE) * (incident == 0)))
#[1] 2
Here, c(incident[-1] == 1, FALSE) * (incident == 0) is the logical AND of x[i] = 0 and x[i+1] = 1: position i is TRUE exactly when a 0 is immediately followed by a 1. sum then counts the occurrences (in this case there are 2: one in rows 4/5 and one in rows 8/9).
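An equivalent way to read the same condition in base R pairs each element with its successor; a minimal sketch, assuming the same df as above:
x <- df$incident
sum(head(x, -1) == 0 & tail(x, -1) == 1)
#> [1] 2
head(x, -1) is the current row and tail(x, -1) is the next row, so the sum counts every 0 that is immediately followed by a 1.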

library(tidyverse)
set.seed(123)
(df <- tibble(value = sample(c(0, 1, 3), size = 200, replace = TRUE)))
#> # A tibble: 200 x 1
#> value
#> <dbl>
#> 1 3
#> 2 3
#> 3 3
#> 4 1
#> 5 3
#> 6 1
#> 7 1
#> 8 1
#> 9 3
#> 10 0
#> # … with 190 more rows
count <- 0
# use map() instead of walk() to view the process row by row
walk(2:nrow(df), ~ {
  if (df$value[[.x - 1]] == 0 && df$value[[.x]] == 1) count <<- count + 1
})
count
#> [1] 26
#some rows where the pattern is happening
df[86:87, ]
#> # A tibble: 2 x 1
#> value
#> <dbl>
#> 1 0
#> 2 1
df[93:94, ]
#> # A tibble: 2 x 1
#> value
#> <dbl>
#> 1 0
#> 2 1
Created on 2021-06-28 by the reprex package (v2.0.0)
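The same count can also be obtained without an explicit loop; a vectorized sketch with dplyr::lag(), assuming the same df (it reproduces the 26 found by the loop above):
with(df, sum(dplyr::lag(value) == 0 & value == 1, na.rm = TRUE))
#> [1] 26
na.rm = TRUE discards the NA that lag() produces for the first row.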
Using dplyr (note that diff(value) == 1 can only arise from a 0 followed by a 1 here, because the possible values are 0, 1 and 3):
transmute(df, dif = c(NA, diff(value))) %>%
  count(dif) %>%
  filter(dif == 1)
#> # A tibble: 1 x 2
#> dif n
#> <dbl> <int>
#> 1 1 26
Created on 2021-06-28 by the reprex package (v2.0.0)

Related

The Most Efficient Way of Forming Groups using R

I have a tibble dt given as follows:
library(tidyverse)
dt <- tibble(x=as.integer(c(0,0,1,0,0,0,1,1,0,1))) %>%
mutate(grp = as.factor(c(rep("A",3), rep("B",4), rep("C",1), rep("D",2))))
dt
As one can observe, the rule for grouping is:
a group starts with 0 and ends with 1 (e.g., groups A, B, D), or
it solely contains 1 (e.g., group C).
Problem: Given a tibble with an integer column x of zeros and ones that starts with 0 and ends with 1, what is the most efficient way to obtain such a grouping in R? (You can use any grouping symbols/factors.)
We can get the cumulative sum of 'x' (assuming it is binary), take its lag, add 1, and use the result as an index into LETTERS. (Note that LETTERS is used only to match the expected output; it provides at most 26 labels, so for more groups use a different labelling.)
library(dplyr)
dt %>%
  mutate(grp2 = LETTERS[lag(cumsum(x), default = 0) + 1])
Output:
# A tibble: 10 x 3
x grp grp2
<int> <fct> <chr>
1 0 A A
2 0 A A
3 1 A A
4 0 B B
5 0 B B
6 0 B B
7 1 B B
8 1 C C
9 0 D D
10 1 D D
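The same indexing idea also works without dplyr; a base R sketch, assuming the dt created above:
LETTERS[cumsum(c(0, head(dt$x, -1))) + 1]
#> [1] "A" "A" "A" "B" "B" "B" "B" "C" "D" "D"
c(0, head(dt$x, -1)) plays the role of lag(x, default = 0): each 1 opens a new group starting at the following row.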
Though the strategy proposed by akrun is fantastic, here is a way to show that the grouping can also be managed with accumulate():
library(tidyverse)
dt <- tibble(x=as.integer(c(0,0,1,0,0,0,1,1,0,1))) %>%
mutate(grp = as.factor(c(rep("A",3), rep("B",4), rep("C",1), rep("D",2))))
dt %>%
  mutate(GRP = accumulate(lag(x, default = 0), .init = 1,
                          ~ if (.y != 1) .x else .x + 1)[-1])
#> # A tibble: 10 x 3
#> x grp GRP
#> <int> <fct> <dbl>
#> 1 0 A 1
#> 2 0 A 1
#> 3 1 A 1
#> 4 0 B 2
#> 5 0 B 2
#> 6 0 B 2
#> 7 1 B 2
#> 8 1 C 3
#> 9 0 D 4
#> 10 1 D 4
Created on 2021-06-13 by the reprex package (v2.0.0)
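Both answers produce the same group indices; a quick check (a sketch, assuming the same dt):
all.equal(dplyr::lag(cumsum(dt$x), default = 0) + 1,
          purrr::accumulate(dplyr::lag(dt$x, default = 0), .init = 1,
                            ~ if (.y != 1) .x else .x + 1)[-1])
#> [1] TRUE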

How can I ask R to flag a specific pattern (i.e., column value changes from 1 to 0) each time it occurs within a group of observations?

The below reprex mimics my data: for each person, I have different values of 'res' at different times. I need an indicator variable ('flag') to tell me each time that 'res' changes from 1 to 0 within a given person, and I want 'flag' to equal 1 at the first time (and the first time only) that 'res'= 0 after 'res' = 1. Lastly, I want to count the number of times 'flag' = 1 for each person.
My code has two problems:
It flags every time after 'res'= 1 that 'res' = 0 (but I need 'flag'= 1 only the first time 'res'=0).
Counting the number of times 'flag' = 1 does not work.
Note: The last 'res_next_time' is inevitably NA. By definition in my data, I would never have 'flag'=1 here, so it's okay that it defaults to 0.
Thanks for your help!
#Load packages
library(Hmisc)
#> Loading required package: lattice
#> Loading required package: survival
#> Loading required package: Formula
#> Loading required package: ggplot2
#>
#> Attaching package: 'Hmisc'
#> The following objects are masked from 'package:base':
#>
#> format.pval, units
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.0.4
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:Hmisc':
#>
#> src, summarize
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
#> Warning: package 'tidyr' was built under R version 4.0.5
#Create data set
person <- c(1, 1, 1, 2, 3, 3, 3, 3, 3, 3)
time <- c(1, 2, 3, 1, 2, 1, 2, 3, 4, 5)
res <- c(1, 0, 1, 1, 1, 0, 0, 1, 0, 1)
#Populate data frame
d <- cbind(person, time, res)
d <- as.data.frame(d)
#Create new variable equal to 'res' at the person's next time point
d$res_next_time <- Lag(d$res, -1)
#Group times by person
d %>%
  group_by(person) %>%
  #Create a new variable 'flag' = 1 when a person's 'res' changes from 1 to 0, and 'flag' = 0 otherwise
  mutate(flag = case_when(res_next_time < 1 ~ 1, TRUE ~ 0)) %>%
  #Because 'flag' = 1 is at the time of 'res' = 1 before 'res' = 0, we lag it to have 'flag' = 1 at 'res' = 0
  mutate(flag_res0 = Lag(flag, +1)) %>%
  #Replace the NAs in 'flag_res0' with 0
  replace_na(list(flag_res0 = 0)) %>%
  #mutate(flag_res0 = as.numeric(flag_res0 & cumsum(flag_res0) <= 1)) %>%
  #Count number of flags per person
  mutate(mig_freq = sum(flag_res0)) %>%
  #Limit the data to only include the final indicator
  select('person', 'time', 'res', 'flag_res0')
#> # A tibble: 10 x 4
#> # Groups: person [3]
#> person time res flag_res0
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 0
#> 2 1 2 0 1
#> 3 1 3 1 0
#> 4 2 1 1 0
#> 5 3 2 1 0
#> 6 3 1 0 1
#> 7 3 2 0 1
#> 8 3 3 1 0
#> 9 3 4 0 1
#> 10 3 5 1 0
Created on 2021-04-15 by the reprex package (v0.3.0)
You won't need the column res_next_time with my solution. I think that @Paul PR's answer is more concise.
# using your data d
d %>%
  group_by(person) %>%
  mutate(flag2 = if_else(lag(res) == 1 & res == 0 &
                           !(duplicated(lag(res) == 1 & res == 0)), 1, 0, 0))
You could add ungroup() at the end; that might be important depending on what comes next. The condition basically reads: 'if the previous res is 1 and the current res is 0 and that combination has not occurred earlier in the group, then flag it'.
Your comment indicated that you aren't looking for the first occurrence but any occurrence within the group.
That's actually much simpler.
(d %>%
  group_by(person) %>%
  mutate(flag = if_else(lag(res) == 1 & res == 0, 1, 0, 0)))
The output looks like this. (I added rows at the end of your example data to show more occurrences.)
# # A tibble: 13 x 4
# # Groups: person [3]
# person time res flag
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 0
# 2 1 2 0 1
# 3 1 3 1 0
# 4 2 1 1 0
# 5 3 2 1 0
# 6 3 1 0 1
# 7 3 2 0 0
# 8 3 3 1 0
# 9 3 4 0 1
# 10 3 5 1 0
# 11 3 6 0 1
# 12 1 7 1 0
# 13 1 8 0 1
Here's a solution that solves the problem in two steps:
Use dplyr's lag function to calculate the previous value of res, rather than the next value of res. We do this in the grouped data frame so the first value of res_last_time is NA for each person.
Use cumsum in the grouped data frame to only keep the first flag = 1 for each person.
d %>%
  group_by(person) %>%
  mutate(res_last_time = lag(res, 1)) %>%
  mutate(flag = res == 0 & res_last_time == 1) %>%
  mutate(flag = as.numeric(flag & cumsum(flag) <= 1))
Using your same d data.frame, here are the results I get:
#> # A tibble: 10 x 5
#> # Groups: person [3]
#> person time res res_last_time flag
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 NA 0
#> 2 1 2 0 1 1
#> 3 1 3 1 0 0
#> 4 2 1 1 NA 0
#> 5 3 2 1 NA 0
#> 6 3 1 0 1 1
#> 7 3 2 0 0 0
#> 8 3 3 1 0 0
#> 9 3 4 0 1 0
#> 10 3 5 1 0 0
Created on 2021-04-15 by the reprex package (v1.0.0)
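Since the question also asks for the number of flags per person, here is a minimal sketch that builds on the lag()-based flag above, using the same d (summarise() collapses the grouped data to one row per person):
d %>%
  group_by(person) %>%
  mutate(flag = if_else(lag(res) == 1 & res == 0, 1, 0, missing = 0)) %>%
  summarise(mig_freq = sum(flag))
#> # A tibble: 3 x 2
#>   person mig_freq
#>    <dbl>    <dbl>
#> 1      1        1
#> 2      2        0
#> 3      3        2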

Add column to grouped data that assigns 1 to individuals and randomly assigns 1 or 0 to pairs

I have a dataframe...
df <- tibble(
  id = 1:7,
  family = c("a", "a", "b", "b", "c", "d", "e")
)
Families will only contain 2 members at most (so they're either individuals or pairs).
I need a new column 'random' that assigns the number 1 to families where there is only one member (e.g. c, d and e) and randomly assigns 0 or 1 to families containing 2 members (a and b in the example).
By the end the data should look like the following (depending on the random assignment of 0/1)...
df <- tibble(
  id = 1:7,
  family = c("a", "a", "b", "b", "c", "d", "e"),
  random = c(1, 0, 0, 1, 1, 1, 1)
)
I would like to be able to do this with a combination of group_by and mutate since I am mostly using Tidyverse.
I tried the following (but this didn't randomly assign 0/1 within families)...
df %>%
  group_by(family) %>%
  mutate(
    random = if_else(
      condition = n() == 1,
      true = 1,
      false = as.double(sample(0:1, 1, replace = TRUE))
    )
  )
You could sample along the sequence length of the family group and take the answer modulo 2:
df %>%
  group_by(family) %>%
  mutate(random = sample(seq(n())) %% 2)
#> # A tibble: 7 x 3
#> # Groups: family [5]
#> id family random
#> <int> <chr> <dbl>
#> 1 1 a 0
#> 2 2 a 1
#> 3 3 b 0
#> 4 4 b 1
#> 5 5 c 1
#> 6 6 d 1
#> 7 7 e 1
We can use if/else
library(dplyr)
df %>%
  group_by(family) %>%
  mutate(random = if (n() == 1) 1 else sample(rep(0:1, length.out = n())))
# A tibble: 7 x 3
# Groups: family [5]
# id family random
# <int> <chr> <dbl>
#1 1 a 0
#2 2 a 1
#3 3 b 1
#4 4 b 0
#5 5 c 1
#6 6 d 1
#7 7 e 1
Another option
df %>%
  group_by(family) %>%
  mutate(random = 2 - sample(1:n()))
# A tibble: 7 x 3
# Groups: family [5]
# id family random
# <int> <chr> <dbl>
# 1 1 a 1
# 2 2 a 0
# 3 3 b 1
# 4 4 b 0
# 5 5 c 1
# 6 6 d 1
# 7 7 e 1
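If a non-tidyverse route is acceptable, the same assignment can be sketched in base R with ave() (a hypothetical variant, not taken from the answers above; results vary without a seed):
df$random <- ave(rep(1, nrow(df)), df$family,
                 FUN = function(v) if (length(v) == 1) 1 else sample(0:1))
ave() applies the function to each family separately: single-member families get 1, pairs get a random permutation of 0 and 1.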

Strange behavior with a conditional mutate with dplyr

My apologies if this topic has been discussed somewhere; I was not able to find it.
I was trying to apply a fairly simple conditional mutate() with dplyr when I noticed something that seems quite weird to me. Let me explain:
Let's say that in a data.frame I want to modify a variable (here VALUE) according to the value of a specific row in each group (here COND).
The modification is: "if the last value of COND within the current group is 0, then set VALUE to 99 for that group, otherwise do nothing".
Here's what I naturally wrote:
tab <- data.frame(
  ID = c(rep(1, 3), rep(2, 3)),
  COND = c(c(1, 0, 0), rep(1, 3)),
  VALUE = 1:6
)
tab %>%
  group_by(ID) %>%
  mutate(VALUE = ifelse(COND[n()] == 0,
                        99,
                        VALUE))
# ID COND VALUE
# <dbl> <dbl> <dbl>
# 1 1 1 99
# 2 1 0 99
# 3 1 0 99
# 4 2 1 4
# 5 2 1 4 <
# 6 2 1 4 <
The propagation went well for the first group: VALUE is now 99, which is legitimate (COND == 0 in row 3). However, I was surprised to see that VALUE also changed for the other group, where the first value of VALUE was propagated to the whole group even though the condition is not fulfilled.
Can someone enlighten me on what I am misunderstanding here?
Expected result was:
# ID COND VALUE
# <dbl> <dbl> <dbl>
# 1 1 1 99
# 2 1 0 99
# 3 1 0 99
# 4 2 1 4
# 5 2 1 5 <
# 6 2 1 6 <
[edit] I also tried using case_when(), which apparently I do not handle well either:
tab %>%
  group_by(ID) %>%
  mutate(VALUE = case_when(
    COND[n()] == 0 ~ 99,
    TRUE ~ VALUE
  ))
# Error: must be a double vector, not an integer vector
One workaround would be to calculate an intermediate variable, but I am quite surprised at having to do that.
Possible solution:
tab %>%
  group_by(ID) %>%
  mutate(TEST_COND = COND[n()] == 0,
         VALUE = ifelse(TEST_COND, 99, VALUE))
# ID COND VALUE TEST_COND
# <dbl> <dbl> <dbl> <lgl>
# 1 1 1 99 TRUE
# 2 1 0 99 TRUE
# 3 1 0 99 TRUE
# 4 2 1 4 FALSE
# 5 2 1 5 FALSE
# 6 2 1 6 FALSE
# Yeepee
Try this: use last(COND) to refer to the final value of COND in each group, and the integer literal 99L so that both case_when() outputs have the same type as the integer VALUE (the mismatch between the double 99 and the integer VALUE is what triggered the error above).
library(dplyr)
tab <- data.frame(
  ID = c(rep(1, 3), rep(2, 3)),
  COND = c(1, rep(0, 2), rep(1, 3)),
  VALUE = 1:6
)
tab %>%
  group_by(ID) %>%
  mutate(VALUE = case_when(last(COND) == 0 ~ 99L,
                           TRUE ~ VALUE))
#> # A tibble: 6 x 3
#> # Groups: ID [2]
#> ID COND VALUE
#> <dbl> <dbl> <int>
#> 1 1 1 99
#> 2 1 0 99
#> 3 1 0 99
#> 4 2 1 4
#> 5 2 1 5
#> 6 2 1 6
Created on 2020-05-12 by the reprex package (v0.3.0)
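For reference, the behaviour in the original question comes from how ifelse() sizes its result: the result has the length of the test, so the length-1 test COND[n()] == 0 yields a single value, which mutate() then recycles across the whole group. A minimal illustration of that rule:
ifelse(FALSE, 99, 4:6)
#> [1] 4
ifelse(c(FALSE, FALSE, FALSE), 99, 4:6)
#> [1] 4 5 6
That is why group 2 ended up with VALUE 4 in every row: the single FALSE test returned only the first element of VALUE for that group.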

compress / summarize string start and length data in R

I have a data.frame of (sub)string positions within a larger string. The data contains the start of a (sub)string and its length. The end position of the (sub)string can easily be calculated.
data1 <- data.frame(start = c(1, 3, 4, 9, 10, 13),
                    length = c(2, 1, 3, 1, 2, 1))
data1$end <- data1$start + data1$length - 1
data1
#> start length end
#> 1 1 2 2
#> 2 3 1 3
#> 3 4 3 6
#> 4 9 1 9
#> 5 10 2 11
#> 6 13 1 13
Created on 2019-12-10 by the reprex package (v0.3.0)
I would like to 'compress' this data.frame by merging contiguous (sub)strings (substrings that directly connect to each other), so that my new data looks like this:
data2 <- data.frame(start = c(1, 9, 13),
                    length = c(6, 3, 1))
data2$end <- data2$start + data2$length - 1
data2
#> start length end
#> 1 1 6 6
#> 2 9 3 11
#> 3 13 1 13
Created on 2019-12-10 by the reprex package (v0.3.0)
Is there a solution, preferably in base R, which gets me from data1 to data2?
# a new group starts whenever the gap between a start and the previous end is not exactly 1
f <- cumsum(with(data1, c(0, start[-1] - head(end, -1))) != 1)
# collapse each run into one row: first start, last end, and the resulting length
do.call(rbind, lapply(split(data1, f), function(x) {
  with(x, data.frame(start = start[1],
                     length = tail(end, 1) - start[1] + 1,
                     end = tail(end, 1)))
}))
# start length end
#1 1 6 6
#2 9 3 11
#3 13 1 13
Using dplyr we can do the following:
library(dplyr)
data1 %>%
  group_by(consecutive = cumsum(start != lag(end, default = 0) + 1)) %>%
  summarise(start = min(start), length = sum(length), end = max(end)) %>%
  ungroup() %>%
  select(-consecutive)
#> # A tibble: 3 x 3
#> start length end
#> <dbl> <dbl> <dbl>
#> 1 1 6 6
#> 2 9 3 11
#> 3 13 1 13
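To see what the grouping key does, here is the consecutive column evaluated for data1 (a quick check): it starts a new group whenever a start is not exactly one position past the previous end.
with(data1, cumsum(start != dplyr::lag(end, default = 0) + 1))
#> [1] 0 0 0 1 1 2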
