Find the length of consecutive numbers in R

I have a data.frame that looks like this:
data <- data.frame(position = c(1, 2, 3, 1, 1, 4, 5, 6, 7, 8, 2, 2),
                   name = c("A", "B", "C", "A", "A", "D", "E", "F", "G", "H", "B", "B"))
   position name
1         1    A
2         2    B
3         3    C
4         1    A
5         1    A
6         4    D
7         5    E
8         6    F
9         7    G
10        8    H
11        2    B
12        2    B
I would like to identify all the runs of consecutive values in the column "position" and record the length of each run in a new column. The result should look something like this:
position length
1 - 3 3
4 - 8 5
Any help and comments are highly appreciated.

Here is a base R solution.
Create a column, sequence, which flags each row whose position continues the previous row's (i.e. is exactly one greater).
data$sequence <- c(FALSE, diff(data$position) == 1)
data
#>    position name sequence
#> 1         1    A    FALSE
#> 2         2    B     TRUE
#> 3         3    C     TRUE
#> 4         1    A    FALSE
#> 5         1    A    FALSE
#> 6         4    D    FALSE
#> 7         5    E     TRUE
#> 8         6    F     TRUE
#> 9         7    G     TRUE
#> 10        8    H     TRUE
#> 11        2    B    FALSE
#> 12        2    B    FALSE
Use rle() to find the runs of TRUE values. A run of k TRUEs continues an interval that began one row earlier, so the interval starts on the row just before the run and spans k + 1 rows.
run_lengths <- rle(data$sequence)
i_ends <- cumsum(run_lengths$lengths)[run_lengths$values]
i_starts <- i_ends - run_lengths$lengths[run_lengths$values]
data.frame(
  position = paste0(data$position[i_starts], " - ", data$position[i_ends]),
  length = i_ends - i_starts + 1
)
#>   position length
#> 1    1 - 3      3
#> 2    4 - 8      5
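For comparison, here is a minimal alternative sketch of the same idea (our addition, not part of the original answer; grp and runs are names we introduce): cumsum() gives every run of consecutive positions its own id, and tapply() summarises each run.

# A run id that increments whenever a row does not continue the previous one
grp <- cumsum(c(TRUE, diff(data$position) != 1))
runs <- data.frame(
  position = tapply(data$position, grp, function(p) paste(min(p), max(p), sep = " - ")),
  length   = as.vector(tapply(data$position, grp, length))
)
runs[runs$length > 1, ]  # keep only true intervals, dropping isolated positions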

Does this work?
library(dplyr)
library(tidyr)
library(data.table)

data %>%
  mutate(ID = case_when(position == lead(position) - 1 ~ 1, TRUE ~ 0)) %>%
  mutate(ID = case_when(position == lag(position) + 1 ~ 1, TRUE ~ ID)) %>%
  mutate(r = rleid(ID)) %>%
  filter(ID == 1) %>%
  group_by(r) %>%
  mutate(position = paste(min(position), max(position), sep = '-'),
         length = length(unique(name))) %>%
  ungroup() %>%
  select(1, 5) %>%
  distinct()
# A tibble: 2 x 2
  position length
  <chr>     <int>
1 1-3           3
2 4-8           5


fill NA values per group based on first value of a group

I am trying to fill the NA values in my data frame based on the first value of each group.
df <- data.frame(
  group = c(rep("A", 4), rep("B", 4)),
  val = c(1, 2, NA, NA, 4, 3, NA, NA)
)
df
  group val
1     A   1
2     A   2
3     A  NA
4     A  NA
5     B   4
6     B   3
7     B  NA
8     B  NA
fill(df, val, .direction = "down")
  group val
1     A   1
2     A   2
3     A   2  # -> should be 1
4     A   2  # -> should be 1
5     B   4
6     B   3
7     B   3  # -> should be 4
8     B   3  # -> should be 4
Can I do this with tidyr::fill()? Or is there another (more or less elegant) way to do this? I need to use it within a longer chain of %>% operations.
Thank you very much!
Use tidyr::replace_na() and dplyr::first() (or val[[1]]) inside a grouped mutate():
library(dplyr)
library(tidyr)
df %>%
  group_by(group) %>%
  mutate(val = replace_na(val, first(val))) %>%
  ungroup()
#> # A tibble: 8 × 2
#>   group   val
#>   <chr> <dbl>
#> 1 A         1
#> 2 A         2
#> 3 A         1
#> 4 A         1
#> 5 B         4
#> 6 B         3
#> 7 B         4
#> 8 B         4
PS: @richarddmorey points out the case where the first value for a group is NA. The code above would keep all of that group's NA values as NA. If you'd like to instead replace with the first non-missing value per group, you could subset the vector using !is.na():
df %>%
  group_by(group) %>%
  mutate(val = replace_na(val, first(val[!is.na(val)]))) %>%
  ungroup()
Created on 2022-11-17 with reprex v2.0.2
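For those avoiding the tidyverse here, a base R sketch of the same per-group fill (our own equivalent, not from the original answer; note it modifies df in place):

# Replace each group's NAs with that group's first value
df$val <- ave(df$val, df$group,
              FUN = function(x) replace(x, is.na(x), x[1]))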
This should work; it uses dplyr's case_when():
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(val = case_when(
    is.na(val) ~ val[1],
    TRUE ~ val
  ))
Output:
# A tibble: 8 × 2
# Groups:   group [2]
  group   val
  <chr> <dbl>
1 A         1
2 A         2
3 A         1
4 A         1
5 B         4
6 B         3
7 B         4
8 B         4
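A data.table sketch of the same per-group fill (our addition, assuming the data.table package is available; fifelse() is data.table's fast vectorised ifelse):

library(data.table)
dt <- as.data.table(df)
dt[, val := fifelse(is.na(val), val[1], val), by = group]  # val[1] is the group's first value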

Keep only the second observation per group in R

I have a data frame ordered by an id variable ("city"), and I want to keep the second observation of the cities that have more than one observation, and the single observation of those that appear only once.
For example, here's an example data set:
city <- c(1, 1, 2, 3, 3, 4, 5, 6, 7, 7, 8)
value <- c(3, 5, 7, 8, 2, 5, 4, 2, 3, 2, 3)
mydata <- data.frame(city, value)
Then we have:
city value
1 1 3
2 1 5
3 2 7
4 3 8
5 3 2
6 4 5
7 5 4
8 6 2
9 7 3
10 7 2
11 8 3
The ideal outcome would be:
city value
2 1 5
3 2 7
5 3 2
6 4 5
7 5 4
8 6 2
10 7 2
11 8 3
Any help is appreciated!
dplyr
library(dplyr)
mydata %>%
  group_by(city) %>%
  filter(n() == 1L | row_number() == 2L) %>%
  ungroup()
# # A tibble: 8 x 2
# city value
# <dbl> <dbl>
# 1 1 5
# 2 2 7
# 3 3 2
# 4 4 5
# 5 5 4
# 6 6 2
# 7 7 2
# 8 8 3
or, slightly different, with slice(), where min(n(), 2) picks row 2 when the group has at least two rows and row 1 otherwise:
mydata %>%
  group_by(city) %>%
  slice(min(n(), 2)) %>%
  ungroup()
base R
ind <- ave(rep(TRUE, nrow(mydata)), mydata$city,
           FUN = function(z) length(z) == 1L | seq_along(z) == 2L)
ind
# [1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
mydata[ind, ]
# city value
# 2 1 5
# 3 2 7
# 5 3 2
# 6 4 5
# 7 5 4
# 8 6 2
# 10 7 2
# 11 8 3
data.table
Since you mentioned the real data "is way bigger", you might consider data.table at some point for its speed and referential semantics. (And it doesn't hurt that this code is much more terse :-)
library(data.table)
DT <- as.data.table(mydata)  # normally one might use setDT(mydata) instead ...
DT[, .SD[min(.N, 2)], by = city]
# city value
# <num> <num>
# 1: 1 5
# 2: 2 7
# 3: 3 2
# 4: 4 5
# 5: 5 4
# 6: 6 2
# 7: 7 2
# 8: 8 3
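A common data.table refinement (our note, not part of the original answer): computing the wanted row numbers with .I and subsetting once is usually faster than materialising .SD for every group.

# Same result; .I holds the global row numbers within each group,
# and the by-result stores the picked index in column V1
DT[DT[, .I[min(.N, 2)], by = city]$V1]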
Here is logic that uses pmin() to choose index 2 or 1, depending on the length of each group's vector of value:
aggregate(value ~ city, mydata, function(x) x[pmin(2, length(x))])
city value
1 1 5
2 2 7
3 3 2
4 4 5
5 5 4
6 6 2
7 7 2
8 8 3
The aggregate() function splits value into one vector per city and passes each to the function; the sketch below spells that step out.
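An equivalent base R sketch to make the split-apply step explicit (our illustration; vals is a name we introduce):

vals <- sapply(split(mydata$value, mydata$city),
               function(x) x[pmin(2, length(x))])  # pick the 2nd value, or the 1st for singletons
data.frame(city = as.numeric(names(vals)), value = vals, row.names = NULL)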
You may try
library(dplyr)
mydata %>%
  group_by(city) %>%
  filter(case_when(n() > 1 ~ row_number() == 2,
                   TRUE ~ row_number() == 1))
# A tibble: 8 x 2
# Groups:   city [8]
   city value
  <dbl> <dbl>
1     1     5
2     2     7
3     3     2
4     4     5
5     5     4
6     6     2
7     7     2
8     8     3
Another dplyr solution:
mydata %>%
  group_by(city) %>%
  summarize(value = value[pmin(2, n())])
Or:
mydata %>%
  group_by(city) %>%
  summarize(value = ifelse(n() >= 2, value[2], value[1]))
Both output:
city value
<dbl> <dbl>
1 1 5
2 2 7
3 3 2
4 4 5
5 5 4
6 6 2
7 7 2
8 8 3
If base R is OK, try this:
EDIT (since performance really seems to be important):
Using `if` as a function should give a 100-fold speed-up in some cases.
aggregate(value ~ city, mydata, function(x) `if`(!is.na(x[2]), x[2], x[1]))
city value
1 1 5
2 2 7
3 3 2
4 4 5
5 5 4
6 6 2
7 7 2
8 8 3
Benchmarks
Here are some benchmarks, because I was curious. I gathered all the solutions and ran them through microbenchmark.
Bottom line: `if`(cond, yes, no) is fastest (22.3% faster than ifelse() and 17 times faster than the slowest), followed by ifelse() and aggregate() with pmin(). Keep in mind that the data.table solution ran on a single core, so any further speed-up from that package would have to come from parallelization. No real shocker, but interesting nonetheless.
library(microbenchmark)
lengths(mydata)
#  city value
# 20000 20000
c(class(mydata$city), class(mydata$value))
# [1] "integer" "integer"
microbenchmark("aggr_if_function" = { res <- aggregate( value ~ city, mydata, function(x) `if`(!is.na(x[2]),x[2],x[1]) )},
"aggr_ifelse" = { res <- aggregate( value ~ city, mydata, function(x) ifelse(!is.na(x[2]),x[2],x[1]) ) },
"dplyr_filter" = { res <- mydata %>% group_by(city) %>% filter(n() == 1L | row_number() == 2L) %>% ungroup() },
"dplyr_slice" = { res <- mydata %>% group_by(city) %>% slice(min(n(), 2)) %>% ungroup() },
"data.table_single_core" = { res <- DT[, .SD[min(.N, 2),], by = city] },
"aggr_pmin" = { res <- aggregate( value ~ city, mydata, function(x) x[ pmin(2, length(x))] ) },
"dplyr_filter_case_when" = { res <- mydata %>% group_by(city) %>% filter(case_when(n()> 1 ~ row_number() == 2, TRUE ~ row_number()== 1)) },
"group_split_purrr" = { res <- group_split(mydata, city) %>% map_if(~nrow(.) > 1, ~.[2, ]) %>% bind_rows() }, times=50)
Unit: milliseconds
                   expr       min        lq      mean    median        uq       max neval cld
       aggr_if_function  175.5104  179.3273  184.5157  182.1778  186.8963  212.6006    50 a
            aggr_ifelse  214.5846  220.7074  229.2062  228.0688  234.1087  253.0433    50 a
           dplyr_filter  585.5275  607.7011  643.6320  632.0794  660.8184 1066.6018    50 c
            dplyr_slice  713.4047  762.9887  792.7491  780.8475  803.7191 1304.4045    50 d
 data.table_single_core 2080.3869 2164.3829 2240.8578 2229.5310 2298.9002 2702.4201    50 f
              aggr_pmin  321.5265  330.5491  343.2752  341.7866  352.2880  457.3435    50 b
 dplyr_filter_case_when 3171.4859 3337.1669 3492.6915 3500.7783 3608.1809 4195.0774    50 g
      group_split_purrr 1466.4527 1543.2597 1590.9994 1588.0186 1630.5590 1786.5310    50 e
Combining group_split and map_if:
library(tidyverse)
city <- c(1, 1, 2, 3, 3, 4, 5, 6, 7, 7, 8)
value <- c(3, 5, 7, 8, 2, 5, 4, 2, 3, 2, 3)
mydata <- data.frame(city, value)

group_split(mydata, city) %>%
  map_if(~ nrow(.) > 1, ~ .[2, ]) %>%
  bind_rows()
#> # A tibble: 8 × 2
#> city value
#> <dbl> <dbl>
#> 1 1 5
#> 2 2 7
#> 3 3 2
#> 4 4 5
#> 5 5 4
#> 6 6 2
#> 7 7 2
#> 8 8 3
Created on 2021-11-30 by the reprex package (v2.0.1)

Add column to grouped data that assigns 1 to individuals and randomly assigns 1 or 0 to pairs

I have a dataframe...
df <- tibble(
  id = 1:7,
  family = c("a", "a", "b", "b", "c", "d", "e")
)
Families will only contain 2 members at most (so they're either individuals or pairs).
I need a new column 'random' that assigns the number 1 to families where there is only one member (e.g. c, d and e) and randomly assigns 0 or 1 to families containing 2 members (a and b in the example).
By the end the data should look like the following (depending on the random assignment of 0/1)...
df <- tibble(
  id = 1:7,
  family = c("a", "a", "b", "b", "c", "d", "e"),
  random = c(1, 0, 0, 1, 1, 1, 1)
)
I would like to do this with a combination of group_by() and mutate(), since I am mostly using the tidyverse.
I tried the following, but it doesn't randomly assign 0/1 within two-member families (sample(0:1, 1) is drawn once per group and recycled, so both members get the same number):
df %>%
  group_by(family) %>%
  mutate(
    random = if_else(
      condition = n() == 1,
      true = 1,
      false = as.double(sample(0:1, 1, replace = TRUE))
    )
  )
You could sample along the sequence length of the family group and take the result modulo 2. For a pair, sample(seq(2)) is either c(1, 2) or c(2, 1), which mod 2 gives c(1, 0) or c(0, 1); for a single member, sample(seq(1)) is 1, and 1 %% 2 is 1:
df %>%
  group_by(family) %>%
  mutate(random = sample(seq(n())) %% 2)
#> # A tibble: 7 x 3
#> # Groups: family [5]
#> id family random
#> <int> <chr> <dbl>
#> 1 1 a 0
#> 2 2 a 1
#> 3 3 b 0
#> 4 4 b 1
#> 5 5 c 1
#> 6 6 d 1
#> 7 7 e 1
We can use if/else
library(dplyr)
df %>%
  group_by(family) %>%
  mutate(random = if (n() == 1) 1 else sample(rep(0:1, length.out = n())))
# A tibble: 7 x 3
# Groups: family [5]
# id family random
# <int> <chr> <dbl>
#1 1 a 0
#2 2 a 1
#3 3 b 1
#4 4 b 0
#5 5 c 1
#6 6 d 1
#7 7 e 1
Another option (it works here because families have at most two members; with larger families 2 - sample(1:n()) would go negative):
df %>%
  group_by(family) %>%
  mutate(random = 2 - sample(1:n()))
# A tibble: 7 x 3
# Groups:   family [5]
#      id family random
#   <int> <chr>   <dbl>
# 1     1 a           1
# 2     2 a           0
# 3     3 b           1
# 4     4 b           0
# 5     5 c           1
# 6     6 d           1
# 7     7 e           1
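All of these draw random numbers, so the 0/1 assignment differs between runs; a set.seed() call up front makes it reproducible (our usage note; the seed value 42 is arbitrary):

set.seed(42)  # any fixed seed gives a repeatable assignment
df %>%
  group_by(family) %>%
  mutate(random = sample(seq(n())) %% 2) %>%
  ungroup()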

Filter (subset) by conditions in 2 columns in R (dplyr or otherwise)

Given a dataset such as:
set.seed(134)
df <- data.frame(ID = rep(LETTERS[1:5], each = 2),
                 condition = rep(0:1, 5),
                 value = rpois(10, 3))
df
ID condition value
1 A 0 2
2 A 1 3
3 B 0 5
4 B 1 2
5 C 0 3
6 C 1 1
7 D 0 2
8 D 1 4
9 E 0 1
10 E 1 5
For each ID, when the value for condition==0 is less than the value for condition==1, I want to keep both observations. When the value for condition==0 is greater than condition==1, I want to keep only the row for condition==0.
The subset returned should be this:
ID condition value
1 A 0 2
2 A 1 3
3 B 0 5
5 C 0 3
7 D 0 2
8 D 1 4
9 E 0 1
10 E 1 5
Using dplyr, the first step is:
df %>% group_by(ID) %>%
but I am not sure where to go from there.
Translating fairly literally,
library(dplyr)
set.seed(134)
df <- data.frame(ID = rep(LETTERS[1:5], each = 2),
                 condition = rep(0:1, 5),
                 value = rpois(10, 3))

df %>%
  group_by(ID) %>%
  filter(condition == 0 |
           (condition == 1 & value > value[condition == 0]))
#> # A tibble: 8 x 3
#> # Groups: ID [5]
#> ID condition value
#> <fct> <int> <int>
#> 1 A 0 2
#> 2 A 1 3
#> 3 B 0 5
#> 4 C 0 3
#> 5 D 0 2
#> 6 D 1 4
#> 7 E 0 1
#> 8 E 1 5
This depends on each group having a single observation with condition == 0, but should otherwise be fairly robust.
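If that assumption might not hold in the real data, a defensive variant (our addition) drops malformed groups before comparing:

df %>%
  group_by(ID) %>%
  filter(sum(condition == 0) == 1) %>%   # keep only groups with exactly one condition == 0 row
  filter(condition == 0 |
           (condition == 1 & value > value[condition == 0]))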
This may not be the easiest way, but it should work as you want.
library(reshape2)
df %>%
  dcast(ID ~ condition, value.var = 'value') %>%  # cast to wide format
  mutate(`1` = ifelse(`1` > `0`, `1`, NA)) %>%    # NA out condition-1 values that are not larger
  melt('ID') %>%                                  # melt back to long format
  arrange(ID) %>%                                 # sort by ID
  filter(complete.cases(.))                       # remove NA rows
Output:
ID variable value
1 A 0 2
2 A 1 3
3 B 0 5
4 C 0 3
5 D 0 2
6 D 1 4
7 E 0 1
8 E 1 5
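reshape2 is retired; the same reshape written with tidyr's pivot_wider()/pivot_longer() would look roughly like this (a sketch on our part, assuming tidyr >= 1.0; note condition comes back as character):

library(dplyr)
library(tidyr)
df %>%
  pivot_wider(names_from = condition, values_from = value) %>%  # wide: columns `0` and `1`
  mutate(`1` = ifelse(`1` > `0`, `1`, NA)) %>%
  pivot_longer(-ID, names_to = "condition", values_to = "value") %>%
  arrange(ID) %>%
  filter(!is.na(value))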
You always want the value from the first row in each group. You only want the value from the second row in each group if it's larger than the first.
This works:
df %>%
  group_by(ID) %>%
  filter(row_number() == 1 | value > lag(value))
Edit: as @alistaire points out, this method depends on a particular row order, which it might be a good idea to guarantee, as follows:
df %>%
  arrange(ID, condition) %>%
  group_by(ID) %>%
  filter(row_number() == 1 | value > lag(value))

Finding the maximum value of the currently mutating variable in dplyr

While trying to work out the question "Identify duplicates of one value with different values in another column", I felt I was close to a solution, but when I use max(ID) in the code below, dplyr's mutate() refers to the pre-mutated state's maximum rather than the value as it is being mutated (i.e. not recursively).
The objective is to assign a new unique ID to each row where the current Address does not match the previous Address for the same ID.
The code I tried:
df <- read.table(text = 'ID Address
1 X
1 X
1 Y
2 Z
2 Z
3 A
3 B
4 C
4 D
4 E
5 F
5 F
5 F
', header= T, stringsAsFactors = F)
df %>%
  group_by(ID) %>%
  mutate(flag = ifelse(lag(Address) == Address, F, T)) %>%
  mutate(flag = ifelse(is.na(flag), F, flag)) %>%
  ungroup() %>%
  mutate(newID = ifelse(flag | is.na(flag), max(ID) + 1, ID)) %>%
  select(ID = newID, Address)
Received Output:
# A tibble: 13 x 2
ID Address
<dbl> <chr>
1 1 X
2 1 X
3 6 Y
4 2 Z
5 2 Z
6 3 A
7 6 B
8 4 C
9 6 D
10 6 E
11 5 F
12 5 F
13 5 F
Expected Output:
ID Address
1 X
1 X
6 Y
2 Z
2 Z
3 A
7 B
4 C
8 D
9 E
5 F
5 F
5 F
Any help would be appreciated!
Edit:
Ideally I would be able to refer to newID, the column currently being created, inside its own mutate():
df %>%
  group_by(ID) %>%
  mutate(flag = ifelse(lag(Address) == Address, F, T)) %>%
  mutate(flag = ifelse(is.na(flag), F, flag)) %>%
  ungroup() %>%
  mutate(newID = ifelse(flag | is.na(flag), max(newID) + 1, ID)) %>%
  select(ID = newID, Address)
One problem is max(ID) + 1, which gives a constant value; the other is ifelse itself, which requires 'yes' and 'no' vectors of the same length as the condition. In the solution below we replace max(ID) + 1 with max(ID) + seq_len(sum(flag)), so every flagged row gets its own new ID, and use replace() instead of ifelse():
df %>%
  group_by(ID) %>%
  mutate(flag = lag(Address, default = Address[1]) != Address) %>%
  ungroup() %>%
  mutate(newID = replace(ID, flag, max(ID) + seq_len(sum(flag)))) %>%
  select(ID = newID, Address)
# A tibble: 13 x 2
# ID Address
# <dbl> <chr>
# 1 1 X
# 2 1 X
# 3 6 Y
# 4 2 Z
# 5 2 Z
# 6 3 A
# 7 7 B
# 8 4 C
# 9 8 D
#10 9 E
#11 5 F
#12 5 F
#13 5 F
In addition, the two ifelse() statements used to create 'flag' are collapsed above into the single statement lag(Address, default = Address[1]) != Address.
