compress / summarize string start and length data in R - r

I have a data.frame of (sub)string positions within a larger string. The data contains the start of a (sub)string and it's length. The end position of the (sub)string can be easily calculated.
data1 <- data.frame(start = c(1,3,4,9,10,13),
length = c(2,1,3,1,2,1)
)
data1$end <- (data1$start + data1$length - 1)
data1
#> start length end
#> 1 1 2 2
#> 2 3 1 3
#> 3 4 3 6
#> 4 9 1 9
#> 5 10 2 11
#> 6 13 1 13
Created on 2019-12-10 by the reprex package (v0.3.0)
I would like to 'compress' this data.frame by summarizing continuous (sub)strings (strings that are connected with each other) so that my new data looks like this:
data2 <- data.frame(start = c(1,9,13),
length = c(6,3,1)
)
data2$end <- (data2$start + data2$length - 1)
data2
#> start length end
#> 1 1 6 6
#> 2 9 3 11
#> 3 13 1 13
Created on 2019-12-10 by the reprex package (v0.3.0)
Is there preferably a base R solution which gets me from data1 to data2?

f = cumsum(with(data1, c(0, start[-1] - head(end, -1))) != 1)
do.call(rbind, lapply(split(data1, f), function(x){
with(x, data.frame(start = start[1],
length = tail(end, 1) - start[1] + 1,
end = tail(end, 1)))}))
# start length end
#1 1 6 6
#2 9 3 11
#3 13 1 13

Using dplyr we can do the following:
library(dplyr)
data1 %>%
group_by(consecutive = cumsum(start != lag(end, default = 0) + 1)) %>%
summarise(start = min(start), length=sum(length), end=max(end)) %>%
ungroup %>% select(-consecutive)
#> # A tibble: 3 x 3
#> start length end
#> <dbl> <dbl> <dbl>
#> 1 1 6 6
#> 2 9 3 11
#> 3 13 1 13

Related

Counting of conditional frequency in R

I have a table with only one column and more than 200 rows. It includes three values, 0, 1 and 3. I´m interested in only these incidents, where an 1 follwos a 0. Can R count all X=1 if X-1 = =, given that X is the value of any row.
It would be great, if someone could help !
Best, Anna
Do you mean something like this?
# Create some sample data
set.seed(2020)
df <- data.frame(incident = sample(c(0, 1, 3), 10, replace = TRUE))
# incident
#1 3
#2 1
#3 0
#4 0
#5 1
#6 1
#7 0
#8 0
#9 1
#10 1
sum(c(df$incident[-1] == 1, FALSE) * (df$incident == 0))
# Or: with(df, sum(c(incident[-1] == 1, FALSE) * (incident == 0)))
#[1] 2
Here, c(incident[-1] == 1, FALSE) * (incident == 0) is the logical AND of x[i-1] = 0 and x[i] = 1. sum then sums the number of occurrences (in this case there are 2: one in rows 4/5 and one in rows 8/9).
library(tidyverse)
set.seed(123)
(df <- tibble(value = sample(c(0, 1, 3),size = 200, replace = TRUE)))
#> # A tibble: 200 x 1
#> value
#> <dbl>
#> 1 3
#> 2 3
#> 3 3
#> 4 1
#> 5 3
#> 6 1
#> 7 1
#> 8 1
#> 9 3
#> 10 0
#> # … with 190 more rows
count <- 0
#use map instead of walk to view the process row by row
walk(2:nrow(df), ~ {
if (df$value[[.x - 1]] == 0 && df$value[[.x]] == 1) count <<- count + 1
})
count
#> [1] 26
#some rows where the pattern is happening
df[86:87, ]
#> # A tibble: 2 x 1
#> value
#> <dbl>
#> 1 0
#> 2 1
df[93:94, ]
#> # A tibble: 2 x 1
#> value
#> <dbl>
#> 1 0
#> 2 1
Created on 2021-06-28 by the reprex package (v2.0.0)
Using dplyr:
transmute(df, dif = c(NA, diff(value))) %>%
count(dif) %>%
filter(dif == 1)
#> # A tibble: 1 x 2
#> dif n
#> <dbl> <int>
#> 1 1 26
Created on 2021-06-28 by the reprex package (v2.0.0)

The Most Efficient Way of Forming Groups using R

I have a tibble dt given as follows:
library(tidyverse)
dt <- tibble(x=as.integer(c(0,0,1,0,0,0,1,1,0,1))) %>%
mutate(grp = as.factor(c(rep("A",3), rep("B",4), rep("C",1), rep("D",2))))
dt
As one can observe the rule for grouping is:
starts 0 and ends with 1 (e.g., groups A, B, D) or
it solely contains 1 (e.g., group C)
Problem: Given a tibble with column integer vector x of zeros and 1 that starts with 0 and ends in 1, what is the most efficient way to obtain a grouping using R? (You can use any grouping symbols/factors.)
We can get the cumulative sum of 'x' (assuming it is binary), take the lag add 1 and use that index to replace it with LETTERS (Note that LETTERS was used only as part of matching with the expected output - it can take go up to certain limit)
library(dplyr)
dt %>%
mutate(grp2 = LETTERS[lag(cumsum(x), default = 0)+ 1])
-output
# A tibble: 10 x 3
x grp grp2
<int> <fct> <chr>
1 0 A A
2 0 A A
3 1 A A
4 0 B B
5 0 B B
6 0 B B
7 1 B B
8 1 C C
9 0 D D
10 1 D D
Though the strategy proposed by Akrun is fantastic, yet to show that it can be managed through accumulate also
library(tidyverse)
dt <- tibble(x=as.integer(c(0,0,1,0,0,0,1,1,0,1))) %>%
mutate(grp = as.factor(c(rep("A",3), rep("B",4), rep("C",1), rep("D",2))))
dt %>%
mutate(GRP = accumulate(lag(x, default = 0),.init =1, ~ if(.y != 1) .x else .x+1)[-1])
#> # A tibble: 10 x 3
#> x grp GRP
#> <int> <fct> <dbl>
#> 1 0 A 1
#> 2 0 A 1
#> 3 1 A 1
#> 4 0 B 2
#> 5 0 B 2
#> 6 0 B 2
#> 7 1 B 2
#> 8 1 C 3
#> 9 0 D 4
#> 10 1 D 4
Created on 2021-06-13 by the reprex package (v2.0.0)

High and low objects using R's car library

I'm trying to add a column which reads my dataframe's column and outputs a 1 if the element is bigger than a certain number (and a zero if the condition isn't met). However, this code doesn't seem to work: df is an existing dataframe.
df2 <- data.frame(df2, C=Recode(df$numbers, "hi:200=1; else=0")) ##C = numbers > 200 = 1
I'm using R's car library.
Does this achieve what you need?
df2 <- tibble(numbers = c(1, 200, 201))
df2$recoded <- ifelse(df2$numbers > 200, 1, 0)
df2
# # A tibble: 3 x 2
# numbers recoded
# <dbl> <dbl>
# 1 1 0
# 2 200 0
# 3 201 1
In base R we can also do
df2$recoded <- as.integer(df2$numbers > 200)
In data.table we could do:
library(data.table)
df <- datasets::cars
setDT(df)
df[, numbers := ifelse(df$dist > 10, 1, 0)][1:10, ]
#> speed dist numbers
#> 1: 4 2 0
#> 2: 4 10 0
#> 3: 7 4 0
#> 4: 7 22 1
#> 5: 8 16 1
#> 6: 9 10 0
#> 7: 10 18 1
#> 8: 10 26 1
#> 9: 10 34 1
#> 10: 11 17 1
Created on 2021-03-17 by the reprex package (v0.3.0)

filter all rows smaller than x with all following values also smaller than x

I am looking for a concise way to filter a data.frame for all rows smaller than a value x with all following values also smaller than x. I found a way but it is somehwat verbose. I tried to do it with dplyr::cumall and cumany, but was not able to figure it out.
Here is a small reprex including my actual approach. Ideally I would only have one filter line or mutate + filter, but with the current approach it takes two rounds of mutate/filter.
library(dplyr)
# Original data
tbl <- tibble(value = c(100,100,100,10,10,5,10,10,5,5,5,1,1,1,1))
# desired output:
# keep only rows, where value is smaller than 5 and ...
# no value after that is larger than 5
tbl %>%
mutate(id = row_number()) %>%
filter(value <= 5) %>%
mutate(id2 = lead(id, default = max(id) + 1) - id) %>%
filter(id2 == 1)
#> # A tibble: 7 x 3
#> value id id2
#> <dbl> <int> <dbl>
#> 1 5 9 1
#> 2 5 10 1
#> 3 5 11 1
#> 4 1 12 1
#> 5 1 13 1
#> 6 1 14 1
#> 7 1 15 1
Created on 2020-04-20 by the reprex package (v0.3.0)
You could combine cummin with a reversed reverse cummax:
tbl %>% filter(rev(cummax(rev(value))) <= 5 & cummin(value) <= 5)
# A tibble: 7 x 1
value
<dbl>
1 5
2 5
3 5
4 1
5 1
6 1
7 1
A base R option is to use subset + rle
tblout <- subset(tbl,
with(rle(value<=5 & c(0,diff(value))<=0),
rep(lengths>1 & values,lengths)))
such that
> tblout
# A tibble: 7 x 1
value
<dbl>
1 5
2 5
3 5
4 1
5 1
6 1
7 1

R dplyr - select values from one column based on position of a specific value in another column

I am working with gait-cycle data. I have 8 events marked for each id and gait trial. The values "LFCH" and "RFCH" occurs twice in each trial, as these represent the beginning and the end of the gait cycles from left and right leg.
Sample Data Frame:
df <- data.frame(ID = rep(1:5, each = 16),
Gait_nr = rep(1:2, each = 8, times=5),
Frame = rep(c(1,5,7,9,10,15,22,25), times = 10),
Marks = rep(c("LFCH", "LHL", "RFCH", "LTO", "RHL", "LFCH", "RTO", "RFCH"), times =10)
head(df,8)
ID Gait_nr Frame Marks
1 1 1 1 LFCH
2 1 1 5 LHL
3 1 1 7 RFCH
4 1 1 9 LTO
5 1 1 10 RHL
6 1 1 15 LFCH
7 1 1 22 RTO
8 1 1 25 RFCH
I wold like to create something like
Total_gait_left = Frame[The last time Marks == "LFCH"] - Frame[The first time Marks == "LFCH"]
My current code solves the problem, but depends on the position of the Frame values rather than actual values in Marks. Any individual not following the normal gait pattern will have wrong values produced by the code.
library(tidyverse)
l <- df %>% group_by(ID, Gait_nr) %>% filter(grepl("L.+", Marks)) %>%
summarize(Total_gait = Frame[4] - Frame[1],
Side = "left")
r <- df %>% group_by(ID, Gait_nr) %>% filter(grepl("R.+", Marks)) %>%
summarize(Total_gait = Frame[4] - Frame[1],
Side = "right")
val <- union(l,r, by=c("ID", "Gait_nr", "Side")) %>% arrange(ID, Gait_nr, Side)
Can you help me make my code more stable by helping me change e.g. Frame[4] to something like Frame[Marks=="LFCH" the last time ]?
If both LFCH and RFCH happen exactly twice, you can filter and then use diff in summarize:
df %>%
group_by(ID, Gait_nr) %>%
summarise(
left = diff(Frame[Marks == 'LFCH']),
right = diff(Frame[Marks == 'RFCH'])
)
# A tibble: 10 x 4
# Groups: ID [?]
# ID Gait_nr left right
# <int> <int> <dbl> <dbl>
# 1 1 1 14 18
# 2 1 2 14 18
# 3 2 1 14 18
# 4 2 2 14 18
# 5 3 1 14 18
# 6 3 2 14 18
# 7 4 1 14 18
# 8 4 2 14 18
# 9 5 1 14 18
#10 5 2 14 18
We can use first and last from the dplyr package.
library(dplyr)
df2 <- df %>%
filter(Marks %in% "LFCH") %>%
group_by(ID, Gait_nr) %>%
summarise(Total_gait = last(Frame) - first(Frame)) %>%
ungroup()
df2
# # A tibble: 10 x 3
# ID Gait_nr Total_gait
# <int> <int> <dbl>
# 1 1 1 14
# 2 1 2 14
# 3 2 1 14
# 4 2 2 14
# 5 3 1 14
# 6 3 2 14
# 7 4 1 14
# 8 4 2 14
# 9 5 1 14
# 10 5 2 14

Resources