recode character to numeric for specific conditions in r - r

I had an original dataset that looks like this.:
> df.1
id score
1 13_B 1
2 13_C 4
3 133_D 5
4 141 2
5 145 3
6 143 4
7 12_B 6
8 12_C 7
9 12_D 9
I needed to do some process that needs all the ids numeric therefore I recoded _B|_C|_D into 1|2|3.
After I finished some processed on the dataset having an extra group column, Here is how my sample dataset looks like:
df.2 <- data.frame(id = c("131","132","133", "141", "145", "143", "121","122","123"),
score = c(1,4,5,2,3,4,6,7,9),
group = c(5,5,5,4,4,4,3,3,3))
> df.2
id score group
1 131 1 5
2 132 4 5
3 133 5 5
4 141 2 4
5 145 3 4
6 143 4 4
7 121 6 3
8 122 7 3
9 123 9 3
At this point, I need to convert the ids back to the original for those items = c(12,13,15). So 15 is not in this dataset but need something that works globally. My desired output is:
> df.3
id score group
1 13_B 1 5
2 13_C 4 5
3 13_D 5 5
4 141 2 4
5 145 3 4
6 143 4 4
7 12_B 6 3
8 12_C 7 3
9 12_D 9 3
Any ideas?
Thanks!

Use str_replace_all to recode the substring replacement by passing a named vector (setNames)
library(dplyr)
library(stringr)
df.1 %>%
mutate(id1 = as.numeric(str_replace_all(str_replace(id, "^(\\d{2})\\d+_(.*)",
"\\1_\\2"), setNames(as.character(c(1, 2, 3)), c("_B", "_C", "_D")))))
-output
id score id1
1 13_B 1 131
2 13_C 4 132
3 133_D 5 133
4 141 2 141
5 145 3 145
6 143 4 143
7 12_B 6 121
8 12_C 7 122
9 12_D 9 123
For replacing from 'df.2'
df.2 %>%
mutate(id2 = case_when(substr(id, 1, 2) %in% c(12, 13, 15) ~
str_replace_all(as.character(id), setNames(c("_B", "_C", "_D"),
str_c(1:3, "$"))), TRUE ~as.character(id)))
-output
id score group id2
1 131 1 5 13_B
2 132 4 5 13_C
3 133 5 5 13_D
4 141 2 4 141
5 145 3 4 145
6 143 4 4 143
7 121 6 3 12_B
8 122 7 3 12_C
9 123 9 3 12_D
data
df.1 <- structure(list(id = c("13_B", "13_C", "133_D", "141", "145",
"143", "12_B", "12_C", "12_D"), score = c(1L, 4L, 5L, 2L, 3L,
4L, 6L, 7L, 9L)), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9"), class = "data.frame")

You may try this:
df.2 %>%
group_by(group) %>%
mutate(group_id=row_number(),
x= paste0("_", LETTERS[2:4])) %>%
mutate(id2 = ifelse(!str_detect(id,"14"), paste0(str_sub(id,1,2),x),id)) %>%
select(id, id2, score, group)
id id2 score group
<chr> <chr> <dbl> <dbl>
1 131 13_B 1 5
2 132 13_C 4 5
3 133 13_D 5 5
4 141 141 2 4
5 145 145 3 4
6 143 143 4 4
7 121 12_B 6 3
8 122 12_C 7 3
9 123 12_D 9 3

Related

Fill in values between start and end value in R

W (blue line below) in my data.frame represents where the water level in the river intersects the elevation profile.
In my data.frame, for each group in ID, I need to fill in values between the start and end value (W)
My data
> head(df, 23)
ID elevation code
1 1 150 <NA>
2 1 140 <NA>
3 1 130 W
4 1 120 <NA>
5 1 110 <NA>
6 1 120 <NA>
7 1 130 W
8 1 140 <NA>
9 1 150 <NA>
10 2 90 <NA>
11 2 80 <NA>
12 2 70 <NA>
13 2 66 W
14 2 60 <NA>
15 2 50 <NA>
16 2 66 W
17 2 70 <NA>
18 2 72 <NA>
19 2 68 W
20 2 65 <NA>
21 2 60 <NA>
22 2 68 W
23 2 70 <NA>
I want the final result to look like below
ID elevation code
1 1 150 <NA>
2 1 140 <NA>
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 <NA>
9 1 150 <NA>
10 2 90 <NA>
11 2 80 <NA>
12 2 70 <NA>
13 2 66 W
14 2 60 W
15 2 50 W
16 2 66 W
17 2 70 <NA>
18 2 72 <NA>
19 2 68 W
20 2 65 W
21 2 60 W
22 2 68 W
23 2 70 <NA>
I tried many things but my trials were not successful. Your help will be appreciated.
DATA
> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), elevation = c(150L,
140L, 130L, 120L, 110L, 120L, 130L, 140L, 150L, 90L, 80L, 70L,
66L, 60L, 50L, 66L, 70L, 72L, 68L, 65L, 60L, 68L, 70L), code = c(NA,
NA, "W", NA, NA, NA, "W", NA, NA, NA, NA, NA, "W", NA, NA, "W",
NA, NA, "W", NA, NA, "W", NA)), class = "data.frame", row.names = c(NA,
-23L))
You could do:
df %>%
group_by(ID)%>%
mutate(code = coalesce(code, c(NA, "W")[cumsum(!is.na(code)) %% 2 + 1]))
ID elevation code
1 1 150 <NA>
2 1 140 <NA>
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 <NA>
9 1 150 <NA>
10 2 90 <NA>
11 2 80 <NA>
12 2 70 <NA>
13 2 66 W
14 2 60 W
15 2 50 W
16 2 66 W
17 2 70 <NA>
18 2 72 <NA>
19 2 68 W
20 2 65 W
21 2 60 W
22 2 68 W
23 2 70 <NA>
We can try replace + cumsum
df %>%
group_by(ID) %>%
mutate(code = replace(code, cumsum(!is.na(code)) %% 2 == 1, "W")) %>%
ungroup()
which gives
# A tibble: 23 x 3
ID elevation code
<int> <int> <chr>
1 1 150 NA
2 1 140 NA
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 NA
9 1 150 NA
10 2 90 NA
# ... with 13 more rows
You can create a helper function that creates a sequence between each start and end and assigns 'W' to it.
assign_w <- function(code) {
inds <- which(code == 'W')
code[unlist(Map(seq, inds[c(TRUE, FALSE)], inds[c(FALSE, TRUE)]))] <- 'W'
code
}
and apply it for each ID :
library(dplyr)
df %>%
group_by(ID) %>%
mutate(result = assign_w(code)) %>%
ungroup
# ID elevation code result
#1 1 150 <NA> <NA>
#2 1 140 <NA> <NA>
#3 1 130 W W
#4 1 120 <NA> W
#5 1 110 <NA> W
#6 1 120 <NA> W
#7 1 130 W W
#8 1 140 <NA> <NA>
#9 1 150 <NA> <NA>
#10 2 90 <NA> <NA>
#11 2 80 <NA> <NA>
#12 2 70 <NA> <NA>
#13 2 66 W W
#14 2 60 <NA> W
#15 2 50 <NA> W
#16 2 66 W W
#17 2 70 <NA> <NA>
#18 2 72 <NA> <NA>
#19 2 68 W W
#20 2 65 <NA> W
#21 2 60 <NA> W
#22 2 68 W W
#23 2 70 <NA> <NA>
library(dplyr)
df %>%
group_by(ID) %>%
mutate(water_flag = (1 * !is.na(code)) * if_else(elevation < lag(elevation, default = 0), 1, -1),
water = if_else(cumsum(water_flag) == 1, "W", NA_character_))
First I tried to use fill but had no success. Then I learned here about the benefit of R's recycling property Rename first and second occurence of the same specific value in a column iteratively (Thanks to Ronak!)
# prepare data with renaming `start` and `stop` sequence
df$code[is.na(df$code)] <- "NA"
df$code[df$code == 'W'] <- c('start', 'end')
df$code[df$code=="NA"]<-NA
# Now with different names of start and stop sequence I was able to implement `cumsum`
library(tidyverse)
df <- df %>%
group_by(grp = cumsum(!is.na(code))) %>%
dplyr::mutate(code = replace(code, first(code) == 'start', 'W'),
code = replace(code, code=='end', 'W')) %>%
ungroup() %>%
select(-grp)
Output:
# A tibble: 23 x 3
ID elevation code
<int> <int> <chr>
1 1 150 NA
2 1 140 NA
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 NA
9 1 150 NA
10 2 90 NA
11 2 80 NA
12 2 70 NA
13 2 66 W
14 2 60 W
15 2 50 W
16 2 66 W
17 2 70 NA
18 2 72 NA
19 2 68 W
20 2 65 W
21 2 60 W
22 2 68 W
23 2 70 NA
This answer is similar to #Onyambu's: create an 'index' (ind) that increases by one each time a non-NA is encountered in the 'code' column. If the index value is divisible by 2 (i.e. it is an even number) insert "NA" into the new column. If the index is not divisible by 2, add a "W" into the new column. Then if there is a "W" in the 'code' or 'new' columns, replace the NA in the 'code' column with W and drop the 'new' column from the dataframe.
df %>%
mutate(ind = ifelse(cumsum(!is.na(code)) %% 2 == 0, NA, "W")) %>%
mutate(code = ifelse(ind == "W" | code == "W", "W", NA)) %>%
select(-c(ind))
#> ID elevation code
#>1 1 150 <NA>
#>2 1 140 <NA>
#>3 1 130 W
#>4 1 120 W
#>5 1 110 W
#>6 1 120 W
#>7 1 130 W
#>8 1 140 <NA>
#>9 1 150 <NA>
#>10 2 90 <NA>
#>11 2 80 <NA>
#>12 2 70 <NA>
#>13 2 66 W
#>14 2 60 W
#>15 2 50 W
#>16 2 66 W
#>17 2 70 <NA>
#>18 2 72 <NA>
#>19 2 68 W
#>20 2 65 W
#>21 2 60 W
#>22 2 68 W
#>23 2 70 <NA>
Though the question has been marked as solved(answer accepted) yet for further/future reference, there is a function fill_run in library runner which does exactly this.
fill_run replaces NA values if they were surrounded by pair of identical values. Since our additional requirement is to look at elevation too we can do something like this
df %>% group_by(ID) %>%
mutate(code = runner::fill_run(ifelse(!is.na(code), paste(elevation,code), code), only_within = T))
# A tibble: 23 x 3
# Groups: ID [2]
ID elevation code
<int> <int> <chr>
1 1 150 NA
2 1 140 NA
3 1 130 130 W
4 1 120 130 W
5 1 110 130 W
6 1 120 130 W
7 1 130 130 W
8 1 140 NA
9 1 150 NA
10 2 90 NA
# ... with 13 more rows
Needless to say, you can again mutate non-NA values from code to W very easily, if required.

Cleaning a data.frame in a semi-reshape/semi-aggregate fashion

First time posting something here, forgive any missteps in my question.
In my example below I've got a data.frame where the unique identifier is the tripID with the name of the vessel, the species code, and a catch metric.
> testFrame1 <- data.frame('tripID' = c(1,1,2,2,3,4,5),
'name' = c('SS Anne','SS Anne', 'HMS Endurance', 'HMS Endurance','Salty Hippo', 'Seagallop', 'Borealis'),
'SPP' = c(101,201,101,201,102,102,103),
'kept' = c(12, 22, 14, 24, 16, 18, 10))
> testFrame1
tripID name SPP kept
1 1 SS Anne 101 12
2 1 SS Anne 201 22
3 2 HMS Endurance 101 14
4 2 HMS Endurance 201 24
5 3 Salty Hippo 102 16
6 4 Seagallop 102 18
7 5 Borealis 103 10
I need a way to basically condense the data.frame so that all there is only one row per tripID as shown below.
> testFrame1
tripID name SPP kept SPP.1 kept.1
1 1 SS Anne 101 12 201 22
2 2 HMS Endurance 101 14 201 24
3 3 Salty Hippo 102 16 NA NA
4 4 Seagallop 102 18 NA NA
5 5 Borealis 103 10 NA NA
I've looked into tidyr and reshape but neither of those are can deliver quite what I'm asking for. Is there anything out there that does this quasi-reshaping?
Here are two alternatives using base::reshape and data.table::dcast:
1) base R
reshape(transform(testFrame1,
timevar = ave(tripID, tripID, FUN = seq_along)),
idvar = cbind("tripID", "name"),
timevar = "timevar",
direction = "wide")
# tripID name SPP.1 kept.1 SPP.2 kept.2
#1 1 SS Anne 101 12 201 22
#3 2 HMS Endurance 101 14 201 24
#5 3 Salty Hippo 102 16 NA NA
#6 4 Seagallop 102 18 NA NA
#7 5 Borealis 103 10 NA NA
2) data.table
library(data.table)
setDT(testFrame1)
dcast(testFrame1, tripID + name ~ rowid(tripID), value.var = c("SPP", "kept"))
# tripID name SPP_1 SPP_2 kept_1 kept_2
#1: 1 SS Anne 101 201 12 22
#2: 2 HMS Endurance 101 201 14 24
#3: 3 Salty Hippo 102 NA 16 NA
#4: 4 Seagallop 102 NA 18 NA
#5: 5 Borealis 103 NA 10 NA
Great reproducible post considering it's your first. Here's a way to do it with dplyr and tidyr -
testFrame1 %>%
group_by(tripID, name) %>%
summarise(
SPP = toString(SPP),
kept = toString(kept)
) %>%
ungroup() %>%
separate("SPP", into = c("SPP", "SPP.1"), sep = ", ", extra = "drop", fill = "right") %>%
separate("kept", into = c("kept", "kept.1"), sep = ", ", extra = "drop", fill = "right")
# A tibble: 5 x 6
tripID name SPP SPP.1 kept kept.1
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1.00 SS Anne 101 201 12 22
2 2.00 HMS Endurance 101 201 14 24
3 3.00 Salty Hippo 102 <NA> 16 <NA>
4 4.00 Seagallop 102 <NA> 18 <NA>
5 5.00 Borealis 103 <NA> 10 <NA>

Fill NAs with next columns for moving average

set.seed(123)
df <- data.frame(loc.id = rep(c(1:3), each = 4*10),
year = rep(rep(c(1980:1983), each = 10), times = 3),
day = rep(1:10, times = 3*4),
x = sample(123:200, 4*3*10, replace = T))
I want to add one more column x.mv which is 3 days moving average of x for each loc.id and year combination
df %>% group_by(loc.id,year) %>% mutate(x.mv = zoo::rollmean(x, 3, fill = "NA", align = "right"))
loc.id year day x x.mv
<int> <int> <int> <int> <dbl>
1 1 1980 1 145 NA
2 1 1980 2 184 NA
3 1 1980 3 154 161
4 1 1980 4 191 176.
5 1 1980 5 196 180.
6 1 1980 6 126 171
7 1 1980 7 164 162
8 1 1980 8 192 161.
9 1 1980 9 166 174
10 1 1980 10 158 172
What I want to do is to replace the NAs in the x.mv column with x. I tried this:
df %>% group_by(loc.id,year) %>% mutate(x.mv = zoo::rollmean(x, 3, fill = x[1:2], align = "right"))
loc.id year day x x.mv
<int> <int> <int> <int> <dbl>
1 1 1980 1 145 145
2 1 1980 2 184 145
3 1 1980 3 154 161
4 1 1980 4 191 176.
5 1 1980 5 196 180.
6 1 1980 6 126 171
7 1 1980 7 164 162
8 1 1980 8 192 161.
9 1 1980 9 166 174
10 1 1980 10 158 172
But what it is doing instead is filling the NAs with the first value of x instead of the corresponding value of x. How do I fix it?
skip the fill argument and pad manually:
df %>%
group_by(loc.id,year) %>%
mutate(x.mv = c(x[1:2],zoo::rollmean(x, 3, align = "right"))) %>%
ungroup
# # A tibble: 120 x 5
# loc.id year day x x.mv
# <int> <int> <int> <int> <dbl>
# 1 1 1980 1 145 145.0000
# 2 1 1980 2 184 184.0000
# 3 1 1980 3 154 161.0000
# 4 1 1980 4 191 176.3333
# 5 1 1980 5 196 180.3333
# 6 1 1980 6 126 171.0000
# 7 1 1980 7 164 162.0000
# 8 1 1980 8 192 160.6667
# 9 1 1980 9 166 174.0000
# 10 1 1980 10 158 172.0000
# # ... with 110 more rows
You might want to use dplyr::cummean(x[1:2]) instead of x[1:2], to have an average for the second value already, or in this case, use #g-grothendieck's suggestion in the comments and rewrite your mutate call as mutate(x.mv = rollapplyr(x, 3, mean, partial = TRUE)).

lapply alternative to for loop to append to data frame

I have a data frame:
df<-structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
pos = c(10L, 200L, 134L, 400L, 600L, 1000L, 20L, 33L, 40L,
45L, 50L, 55L, 100L, 123L)), .Names = c("chrom", "pos"), row.names = c(NA, -14L), class = "data.frame")
> head(df)
chrom pos
1 1 10
2 1 200
3 1 134
4 1 400
5 1 600
6 1 1000
And I want to calculate pos[i+1] - pos[i] on the sample chromosome (chrom)
By using a for loop over each chrom level, and another over each row I get the expected results:
for (c in levels(df$chrom)){
df_chrom<-filter(df, chrom == c)
df_chrom<-arrange(df_chrom, df_chrom$pos)
for (i in 1:nrow(df_chrom)){
dist<-(df_chrom$pos[i+1] - df_chrom$pos[i])
logdist<-log10(dist)
cat(c, i, df_chrom$pos[i], dist, logdist, "\n")
}
}
However, I want to save this to a data frame, and think that lapply or apply is the right way to go about this. I can't work out how to make the pos[i+1] - pos[i] calculation though (seeing as lapply works on each row/column.
Any pointers would be appreciated
Here's the output from my solution:
chrom index pos dist log10dist
1 1 10 124 2.093422
1 2 134 66 1.819544
1 3 200 200 2.30103
1 4 400 200 2.30103
1 5 600 400 2.60206
1 6 1000 NA NA
2 1 20 13 1.113943
2 2 33 NA NA
3 1 40 5 0.69897
3 2 45 NA NA
4 1 50 5 0.69897
4 2 55 45 1.653213
4 3 100 23 1.361728
4 4 123 NA NA
We could do this using a group by difference. Convert the 'data.frame' to 'data.table' (setDT(df)), grouped by 'chrom', order the 'pos', get the difference of 'pos' (diff) and also log of the difference
library(data.table)
setDT(df)[order(pos), {v1 <- diff(pos)
.(index = seq_len(.N), pos = pos,
dist = c(v1, NA), logdiff = c(log10(v1), NA))}
, by = chrom]
# chrom index pos dist logdiff
# 1: 1 1 10 124 2.093422
# 2: 1 2 134 66 1.819544
# 3: 1 3 200 200 2.301030
# 4: 1 4 400 200 2.301030
# 5: 1 5 600 400 2.602060
# 6: 1 6 1000 NA NA
# 7: 2 1 20 13 1.113943
# 8: 2 2 33 NA NA
# 9: 3 1 40 5 0.698970
#10: 3 2 45 NA NA
#11: 4 1 50 5 0.698970
#12: 4 2 55 45 1.653213
#13: 4 3 100 23 1.361728
#14: 4 4 123 NA NA
Upon running the OP's code the output printed are
#1 1 10 124 2.093422
#1 2 134 66 1.819544
#1 3 200 200 2.30103
#1 4 400 200 2.30103
#1 5 600 400 2.60206
#1 6 1000 NA NA
#2 1 20 13 1.113943
#2 2 33 NA NA
#3 1 40 5 0.69897
#3 2 45 NA NA
#4 1 50 5 0.69897
#4 2 55 45 1.653213
#4 3 100 23 1.361728
#4 4 123 NA NA
We split df by df$chrom (Note that we reorder both df and df$chrom before splitting). Then we go through each of the subgroups (the subgroups are called a in this example) using lapply. On the pos column of each subgroup, we calculate difference (diff) of consecutive elements and take log10. Since diff decreases the number of elements by 1, we add a NA to the end. Finally, we rbind all the subgroups together using do.call.
do.call(rbind, lapply(split(df[order(df$chrom, df$pos),], df$chrom[order(df$chrom, df$pos)]),
function(a) data.frame(a, dist = c(log10(diff(a$pos)), NA))))
# chrom pos dist
#1.1 1 10 2.093422
#1.3 1 134 1.819544
#1.2 1 200 2.301030
#1.4 1 400 2.301030
#1.5 1 600 2.602060
#1.6 1 1000 NA
#2.7 2 20 1.113943
#2.8 2 33 NA
#3.9 3 40 0.698970
#3.10 3 45 NA
#4.11 4 50 0.698970
#4.12 4 55 1.653213
#4.13 4 100 1.361728
#4.14 4 123 NA

Remove dates which are not continuous in the data in R

I have a dataframe and I want to filter out the entries that are not continuous in date. In other words, I am looking at the cluster of continuous dates.
a %>% group_by(day) %>% summarise(count = n()) %>% mutate(day_dif = day - lag(day))
Source: local data frame [20 x 3]
day count day_dif
(date) (int) (dfft)
1 2016-02-02 12 NA days
2 2016-02-03 80 1 days
3 2016-02-04 102 1 days
4 2016-02-05 97 1 days
5 2016-02-06 118 1 days
6 2016-02-07 115 1 days
7 2016-02-08 4 1 days
8 2016-02-20 13 12 days
9 2016-02-21 136 1 days
10 2016-02-22 114 1 days
11 2016-02-23 134 1 days
12 2016-02-24 126 1 days
13 2016-02-25 128 1 days
14 2016-02-26 63 1 days
15 2016-02-27 118 1 days
16 2016-03-06 1 8 days
17 2016-03-29 28 23 days
18 2016-04-03 18 5 days
19 2016-04-08 18 5 days
20 2016-04-27 23 19 days
In this, I want to filter out entries that are not continuous in date. For ex, 2016-03-06, 2016-03-29, 2016-04-03 are single day entries which needs to be removed. I am looking only for continuous days entries. entries that occur with multiple days. My ideal output which I am looking is,
day count day_dif Cluster
(date) (int) (dfft)
1 2016-02-02 12 NA days 1
2 2016-02-03 80 1 days 1
3 2016-02-04 102 1 days 1
4 2016-02-05 97 1 days 1
5 2016-02-06 118 1 days 1
6 2016-02-07 115 1 days 1
7 2016-02-08 4 1 days 1
8 2016-02-20 13 12 days 2
9 2016-02-21 136 1 days 2
10 2016-02-22 114 1 days 2
11 2016-02-23 134 1 days 2
12 2016-02-24 126 1 days 2
13 2016-02-25 128 1 days 2
14 2016-02-26 63 1 days 2
15 2016-02-27 118 1 days 2
Where cluster column indicates the date clusters and also the output removes the single dates. Here 1 in the cluster column indicates, first group of dates and 2 indicates second group of dates. If there are more than 3 continuous days, I want to consider as on cluster.
I am trying to do this by using lag functions and all. But without much success. Can anybody help me in doing this? Any idea would be appreciated.
Thanks
We can use rle to subset the rows
i1 <- c(TRUE, a1$day_dif[-1] >=3)
i2 <- inverse.rle(within.list(rle(i1), {values1 <- values
values[values1 &lengths >3] <- FALSE
values[!values1]<- TRUE}))
a1$Cluster <- cumsum(i1)
a1[i2,]
# day count day_dif Cluster
#1 2016-02-02 12 NA days 1
#2 2016-02-03 80 1 days 1
#3 2016-02-04 102 1 days 1
#4 2016-02-05 97 1 days 1
#5 2016-02-06 118 1 days 1
#6 2016-02-07 115 1 days 1
#7 2016-02-08 4 1 days 1
#8 2016-02-20 13 12 days 2
#9 2016-02-21 136 1 days 2
#10 2016-02-22 114 1 days 2
#11 2016-02-23 134 1 days 2
#12 2016-02-24 126 1 days 2
#13 2016-02-25 128 1 days 2
#14 2016-02-26 63 1 days 2
#15 2016-02-27 118 1 days 2
The above code can be also be chained (%>%)
a1 %>%
mutate(i1 = c(TRUE, day_dif[-1] >=3)) %>%
do(data.frame(., i2 = inverse.rle(within.list(rle(.$i1), {
values1 <- values
values[values1 & lengths >3] <- FALSE
values[!values1] <- TRUE
})))) %>%
mutate(Cluster = cumsum(i1)) %>%
filter(i2) %>%
select(-i1, -i2)
# day count day_dif Cluster
#1 2016-02-02 12 NA days 1
#2 2016-02-03 80 1 days 1
#3 2016-02-04 102 1 days 1
#4 2016-02-05 97 1 days 1
#5 2016-02-06 118 1 days 1
#6 2016-02-07 115 1 days 1
#7 2016-02-08 4 1 days 1
#8 2016-02-20 13 12 days 2
#9 2016-02-21 136 1 days 2
#10 2016-02-22 114 1 days 2
#11 2016-02-23 134 1 days 2
#12 2016-02-24 126 1 days 2
#13 2016-02-25 128 1 days 2
#14 2016-02-26 63 1 days 2
#15 2016-02-27 118 1 days 2
data
a <- structure(list(day = structure(c(16833, 16834, 16835, 16836,
16837, 16838, 16839, 16851, 16852, 16853, 16854, 16855, 16856,
16857, 16858, 16866, 16889, 16894, 16899, 16918), class = "Date"),
count = c(12L, 80L, 102L, 97L, 118L, 115L, 4L, 13L, 136L,
114L, 134L, 126L, 128L, 63L, 118L, 1L, 28L, 18L, 18L, 23L
)), .Names = c("day", "count"), row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20"), class = "data.frame")
a1 <- a %>%
mutate(day_dif = day - lag(day))
There is probably a better way to deal with the first NA values. Here, I manually assigned it to 0. Then, because the difference of continuous date will be 1, you can take advantage of this property to create a boolean vector and then use cumsum to get the results. Finally, you can remove those groups which their length are equal to 1.
# Let the first NA equal to 0
df[which(is.na(df), arr.ind=TRUE)] <- 0
df %>% mutate(cluster=cumsum(day_dif !=1)) %>%
group_by(cluster) %>% filter(length(cluster) > 1) %>% ungroup()
# Source: local data frame [15 x 4]
# day count day_dif cluster
# (date) (int) (dfft) (int)
# 1 2016-02-02 12 0 days 1
# 2 2016-02-03 80 1 days 1
# 3 2016-02-04 102 1 days 1
# 4 2016-02-05 97 1 days 1
# 5 2016-02-06 118 1 days 1
# 6 2016-02-07 115 1 days 1
# 7 2016-02-08 4 1 days 1
# 8 2016-02-20 13 12 days 2
# 9 2016-02-21 136 1 days 2
# 10 2016-02-22 114 1 days 2
# 11 2016-02-23 134 1 days 2
# 12 2016-02-24 126 1 days 2
# 13 2016-02-25 128 1 days 2
# 14 2016-02-26 63 1 days 2
# 15 2016-02-27 118 1 days 2

Resources