How to remove subsequent values after first instance only when other values are absent - r

I am trying to remove zeros after the first instance of a zero, when all future values are 0. Eventually I would love to do this group_by species but baby steps. Here's an example;
# Sample
library(tidyverse)
id<-c("a","b","c","d","e","f","g","h","i","j")
time<-c(1,2,3,4,5,6,7,8,9,10)
value<-c(90, 50, 40, 0, 30, 30, 0, 10, 0, 0)
df<-data.frame(id, time, value)
df
id time value
1 a 1 90
2 b 2 50
3 c 3 40
4 d 4 0
5 e 5 30
6 f 6 30
7 g 7 0
8 h 8 10
9 i 9 0
10 j 10 0
I would like to see observation id "j" and only observation id "j" removed. I am not even sure where to start. Any suggestions are much appreciated!

In base R only.It uses rle to get the number of trailing zeros, if any. Then subsets the dataframe with head.
r <- rle(df$value == 0)
if(r$values[length(r$values)]) head(df, -(r$lengths[length(r$values)] - 1))
# id time value
#1 a 1 90
#2 b 2 50
#3 c 3 40
#4 d 4 0
#5 e 5 30
#6 f 6 30
#7 g 7 0
#8 h 8 10
#9 i 9 0
You can write a function with the code above, and maybe *apply it to groups.
trailingZeros <- function(DF, col = "value"){
r <- rle(DF[[col]] == 0)
if(r$values[length(r$values)] && r$lengths[length(r$values)] > 1)
head(DF, -(r$lengths[length(r$values)] - 1))
else
DF
}
trailingZeros(df)
Note that this also works with a larger number of trailing zeros.
id2 <- c("a","b","c","d","e","f","g","h","i","j","k")
time2 <- c(1,2,3,4,5,6,7,8,9,10,11)
value2 <- c(90, 50, 40, 0, 30, 30, 0, 10, 0, 0, 0) # One more zero at end
df2 <- data.frame(id = id2, time = time2, value = value2)
trailingZeros(df2)

here is a solution within the tidyverse which also works on a larger number of trailing zeros:
df <- tibble(id = letters[1:11], time = 1:11,
value = c(90,50,40,0,30,30,0,10,0,0,0))
df %>%
slice(n():1) %>%
slice(c(which(cumsum(value > 0) > 0)[1] - 1, which(cumsum(value > 0) > 0))) %>%
slice(n():1)

Tidyverse solution that also works with groups
based on sample data (without grouping)
code can be shortened, but this looks very readable ;-)
df %>%
#arrange by id
arrange( id ) %>%
#no grouping valiable in sample data.. so don't use group_by here
#group_by( group) %>%
#create dummy's: position in group, last value of group, position of last non-zero in group, previous value (within group)
mutate( pos_in_group = 1:n() ) %>%
mutate( last_value = last( value ) ) %>%
mutate( pos_last_not_zero = max( which( value != 0) ) ) %>%
mutate( prev_value = lag( value ) ) %>%
#filter all rows where:
# the last value of the group != 0 AND
# the previous row (within the group) != 0 AND
# the position of the row is 'below' the last non-zero measurement (in the group)
filter( !(last_value == 0 & prev_value == 0 & pos_in_group >= pos_last_not_zero + 1 ) ) %>%
#throw away the dummy's
select( -c( pos_in_group, last_value, pos_last_not_zero, prev_value ) )
# id time value
# 1 a 1 90
# 2 b 2 50
# 3 c 3 40
# 4 d 4 0
# 5 e 5 30
# 6 f 6 30
# 7 g 7 0
# 8 h 8 10
# 9 i 9 0
Example with some grouping involved
# Sample
library(tidyverse)
id<-c("a","b","c","d","e","f","g","h","i","j","k")
group<-c(1,1,1,1,1,1,2,2,2,2,2)
time<-c(1,2,3,4,5,6,7,8,9,10,11)
value = c(90,0,0,40,0,0,30,30,0,0,0)
df<-data.frame(id, group, time, value)
df
# id group time value
# 1 a 1 1 90
# 2 b 1 2 0
# 3 c 1 3 0
# 4 d 1 4 40
# 5 e 1 5 0
# 6 f 1 6 0
# 7 g 2 7 30
# 8 h 2 8 30
# 9 i 2 9 0
# 10 j 2 10 0
# 11 k 2 11 0
df %>%
#arrange by id
arrange( id ) %>%
#group
group_by( group) %>%
#create dummy's: position in group, last value of group, position of last non-zero in group, previous value (within group)
mutate( pos_in_group = 1:n() ) %>%
mutate( last_value = last( value ) ) %>%
mutate( pos_last_not_zero = max( which( value != 0) ) ) %>%
mutate( prev_value = lag( value ) ) %>%
#filter all rows where:
# the last value of the group != 0 AND
# the previous row (within the group) != 0 AND
# the position of the row is 'below' the last non-zero measurement (in the group)
filter( !(last_value == 0 & prev_value == 0 & pos_in_group >= pos_last_not_zero + 1 ) ) %>%
#throuw away the dummy's
select( -c( pos_in_group, last_value, pos_last_not_zero, prev_value ) )
# # A tibble: 8 x 4
# # Groups: group [2]
# id group time value
# <fct> <dbl> <dbl> <dbl>
# 1 a 1 1 90
# 2 b 1 2 0
# 3 c 1 3 0
# 4 d 1 4 40
# 5 e 1 5 0
# 6 g 2 7 30
# 7 h 2 8 30
# 8 i 2 9 0

Related

Create countdown and counter before and after a certain value

I want to create a variable that counts down a given number of steps before certain value and counts up a given number steps after the value.
In the example below, I want to have a counters before and after the grp == "b". When grp == "b" the value should be 0, and before and after the counters it should be 100.
To do this I can use case_when function as follows:
library(dplyr)
n<-10;k<-3
test_df<-data.frame(id=1:(k*n),grp=rep(letters[1:k],each=n))
before=5;after=3;index=2
test_df2<-test_df %>%
mutate(before_after=
case_when(
grp == letters[index] ~ 0
,dplyr::lag(grp,1) == letters[index] ~ 1
,dplyr::lag(grp,2) == letters[index] ~ 2
,dplyr::lag(grp,3) == letters[index] ~ 3
,dplyr::lead(grp,1) == letters[index] ~ -1
,dplyr::lead(grp,2) == letters[index] ~ -2
,dplyr::lead(grp,3) == letters[index] ~ -3
,dplyr::lead(grp,4) == letters[index] ~ -4
,dplyr::lead(grp,5) == letters[index] ~ -5
,TRUE~100
)
)
The problem is the variables "before" and "after" are not static! So I should make a function to do it as follows:
b_a<-function(before=15,after=6,others=100,grp_f=letters[2]){
}
How to make a function to do it?
Edit:
I created a function but if there us a better and easy solution please feel free to add an answer:
b_a<-unction(test_df,before=15,after=6,others=100,grp_f=letters[2]){
ltr<-grp_f
test_df2<- test_df %>%
group_by(grp) %>%
dplyr::mutate( fr=ifelse(grp==ltr,dplyr::first(id),0 )) %>%
ungroup() %>%
dplyr::mutate( fr=max(fr)) %>%
mutate(fr=-(fr-id)) %>%
mutate(fr=ifelse(fr<0,fr,0 ))
test_df2<- test_df2 %>%
group_by(grp) %>%
dplyr::mutate( lst=ifelse(grp==ltr,dplyr::last(id),0 )) %>%
ungroup() %>%
dplyr::mutate( lst=max(lst)) %>%
mutate(lst=-(lst-id)) %>%
mutate(lst=ifelse(lst>0,lst,0 ))
test_df2<- test_df2 %>% ungroup() %>%
mutate(before_after=lst+fr)
test_df2 %>% mutate(ifelse(before_after >= -before &
before_after <=after,before_after,others))
test_df2
}
This may help:
test_df3<-test_df %>%
group_by(grp) %>%
mutate(is_group_b=ifelse(grp==letters[index],1,0)) %>%
ungroup() %>%
mutate(nr=row_number()) %>%
mutate(nr=nr*is_group_b) %>%
mutate(max_nr=max(nr))%>%
mutate(nr=ifelse(nr==0,NA,nr)) %>%
mutate(min_nr=min(nr,na.rm = TRUE)) %>%
mutate(nr=ifelse(is.na(nr),0,nr)) %>%
mutate(vl=(row_number()-min_nr)) %>%
mutate(vl=vl*(1-is_group_b)) %>%
mutate(vl=if_else(vl>0,row_number()-max_nr,vl))
Try this:
library(dplyr)
test_df %>%
group_by(grp) %>%
mutate(value = ifelse(grp == letters[index-1] & row_number() <= 5, 100, (before:1)*-1),
value1 = ifelse(grp == letters[index+1] & row_number() > 3, 100, 1:after)) %>%
mutate(value = case_when(grp == letters[index] ~ 0,
grp == letters[index-1] ~ value,
grp == letters[index+1] ~ value1), .keep="unused") %>%
print(n=50)
id grp value
<int> <chr> <dbl>
1 1 a 100
2 2 a 100
3 3 a 100
4 4 a 100
5 5 a 100
6 6 a -5
7 7 a -4
8 8 a -3
9 9 a -2
10 10 a -1
11 11 b 0
12 12 b 0
13 13 b 0
14 14 b 0
15 15 b 0
16 16 b 0
17 17 b 0
18 18 b 0
19 19 b 0
20 20 b 0
21 21 c 1
22 22 c 2
23 23 c 3
24 24 c 100
25 25 c 100
26 26 c 100
27 27 c 100
28 28 c 100
29 29 c 100
30 30 c 100
Set result column to 100. Get indices of the focal group. Create replacement values (countdown - zeros - countup) of given lengths. Replace result at relevant indices.
d = data.frame(id = 1:12, grp = rep(letters[1:3], each = 4))
before = 2
after = 3
g = "b"
d$x = 100
i = which(d$grp == g)
v = c(-before:-1, rep(0, length(i)), seq(after))
d$x = replace(d$x, i[1] - before + 0:(length(v) - 1), v)
d
# id grp x
# 1 1 a 100
# 2 2 a 100
# 3 3 a -2
# 4 4 a -1
# 5 5 b 0
# 6 6 b 0
# 7 7 b 0
# 8 8 b 0
# 9 9 c 1
# 10 10 c 2
# 11 11 c 3
# 12 12 c 100
Depending on the data, you may want to add a check so that indices of before- and after-sequences are kept within the bounds of the data, e.g. using %in%:
d = data.frame(id = 1:8, grp = rep(letters[1:3], c(1, 5, 2)))
before = 2
after = 3
g = "b"
d$x = 100
i = which(d$grp == g)
v = c(-before:-1, rep(0, length(i)), seq(after))
i2 = i[1] - before + 0:(length(v) - 1)
ok = i2 %in% seq_len(nrow(d)) # <~ check if indices are within range of data
d$x = replace(d$x, i2[ok], v[ok])
d
# id grp x
# 1 1 a -1 # leading countdown truncated
# 2 2 b 0
# 3 3 b 0
# 4 4 b 0
# 5 5 b 0
# 6 6 b 0
# 7 7 c 1
# 8 8 c 2 # trailing counter truncated

Filter Rows Between with Multiple Events per Subject

I have a large data set and I'm trying to filter the days following a specific event for each subject. This issue is that the "event" of interest may happen multiple times for some subjects and for a few subjects the event doesn't happen at all (in which case they could just be removed from the summarized data).
Here is an example of the data and what I've tried:
library(tidyverse)
set.seed(355)
subject <- c(rep(LETTERS[1:4], each = 40), rep("E", times = 40))
event <- c(sample(0:1, size = length(subject)-40, replace = T, prob = c(0.95, 0.05)), rep(0, times = 40))
df <- data.frame(subject, event)
df %>%
filter(event == 1) %>%
count(subject, event, sort = T)
# A tibble: 4 x 3
subject event n
<fct> <dbl> <int>
1 D 1 3
2 A 1 2
3 B 1 2
4 C 1 2
So we see that subject D has had the event 3 times while subjects A, B, and C have had the event 2 times. Subject E has not had the event at all.
My next step was to create an "event" tag that identifies where each event happened and then produced an NA for all over rows. I also created an event sequence, which sequences along between events, because I thought it might be useful, but I didn't end up trying to use it.
df_cleaned <- df %>%
group_by(subject, event) %>%
mutate(event_seq = seq_along(event == 1),
event_detail = ifelse(event == 1, "event", NA)) %>%
as.data.frame()
I tried two different approaches using a filter() and between() to get each event and the 2 rows following each event. Both of these approaches create an error because of the multiple events within subject. I can't figure out a good workaround for it.
Approach 1:
df_cleaned %>%
group_by(subject) %>%
filter(., between(row_number(),
left = which(!is.na(event_detail)),
right = which(!is.na(event_detail)) + 1))
Approach 2:
df_cleaned %>%
group_by(subject) %>%
mutate(event_group = cumsum(!is.na(event_detail))) %>%
filter(., between(row_number(), left = which(event_detail == "event"), right = which(event_detail == "event") + 2))
If you want to get rows with 1 in event and the following two rows, you can do the following. Thanks to Ananda Mahto who is the author of splitstackshape package, we can handle this type of operation with getMyRows(), which returns a list. You can specify a range of rows in the function. Here I said 0:2. So I am asking R to take each row with 1 in event and the following two rows. I used bind_rows() to return a data frame. But if you need to work with a list, you do not have to do that.
install_github("mrdwab/SOfun")
library(SOfun)
library(dplyr)
ind <- which(x = df$event == 1)
bind_rows(getMyRows(data = df, pattern = ind, range = 0:2))
subject event
1 A 1
2 A 0
3 A 0
4 A 1
5 A 0
6 A 0
7 B 1
8 B 0
9 B 0
10 B 1
11 B 0
12 B 0
13 C 1
14 C 0
15 C 0
16 C 1
17 C 0
18 C 0
19 D 1
20 D 0
21 D 0
22 D 1
23 D 0
24 D 0
25 D 1
26 D 0
27 D 0
Here is a tidyverse approach which uses cumsum() to create groups of rows after (and including) an event and which picks the top 3 rows of each group:
df %>%
group_by(subject) %>%
mutate(event_group = cumsum(event == 1L)) %>%
group_by(event_group, add = TRUE) %>%
filter(event_group > 0 & row_number() <= 3L)
# A tibble: 27 x 3
# Groups: subject, event_group [9]
subject event event_group
<fct> <dbl> <int>
1 A 1 1
2 A 0 1
3 A 0 1
4 A 1 2
5 A 0 2
6 A 0 2
7 B 1 1
8 B 0 1
9 B 0 1
10 B 1 2
# … with 17 more rows
For testing an edge case, here is a modified data set where subject A starts with three subsequent events. Furthermore, I have added row numbers rn in order to check that the correct rows are picked:
df2 <- df %>%
mutate(event = ifelse(row_number() <= 2L, 1L, event),
rn = row_number())
Now we get
df2 %>%
group_by(subject) %>%
mutate(event_group = cumsum(event == 1L)) %>%
group_by(event_group, add = TRUE) %>%
filter(event_group > 0 & row_number() <= 3L)
# A tibble: 29 x 4
# Groups: subject, event_group [11]
subject event rn event_group
<fct> <dbl> <int> <int>
1 A 1 1 1
2 A 1 2 2
3 A 1 3 3
4 A 0 4 3
5 A 0 5 3
6 A 1 22 4
7 A 0 23 4
8 A 0 24 4
9 B 1 59 1
10 B 0 60 1
# … with 19 more rows
which is in line with my expectations for this edge case.
Here is a base R option which looks similar to #jazzurro's attempt. We get the row indices where event == 1, then select next two rows from each index, use unique so in case there are overlapping indices we select only the unique ones and subset it from the original df.
inds <- which(df$event == 1)
df[unique(c(sapply(inds, `+`, 0:2))), ]
# subject event
#3 A 1
#4 A 0
#5 A 0
#22 A 1
#23 A 0
#24 A 0
#59 B 1
#60 B 0
#61 B 0
#62 B 1
#63 B 0
#64 B 0
#....
Another option using dplyr, could be using lag
library(dplyr)
df %>%
group_by(subject) %>%
filter(event == 1 | lag(event) == 1 | lag(event, 2) == 1)

check tables exceeding certain values and count number of times exceed respective threshold by respective id and label

I have a dataframe df
df <- data.frame(id =c(1,2,1,4,1,5,6),
label=c("a","b", "a", "a","a", "e", "a"),
color = c("g","a","g","g","a","a","a"),
threshold = c(12, 10, 12, 12, 12, 35, 40),
value =c(32.1,0,15.0,10,1,50,45),stringsAsFactors = F
)
Threshold value is based on the label
I should get a table below like this by considering each id,with respective label how many times exceeding its threshold by the value
Color is independent in consideration for calculating the exceed values
I tried like this
final_df <- df %>%
mutate(check = if_else(value > threshold, 1, 0)) %>%
group_by(id, label) %>%
summarise(exceed = sum(check))
But instead of getting with respective id i have got the number of total in exceed
With base R only, use aggregate.
aggregate(seq.int(nrow(df)) ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
# id label seq.int(nrow(df))
#1 1 a 2
#2 4 a 0
#3 6 a 1
#4 2 b 0
#5 5 e 1
In order to match the expected output posted in the question, it will take a little extra work.
exceed <- seq.int(nrow(df))
agg <- aggregate(exceed ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
res <- merge(df[1:3], agg)
unique(res)
# id label color exceed
#1 1 a g 2
#3 1 a a 2
#4 2 b a 0
#5 4 a g 0
#6 5 e a 1
#7 6 a a 1
By a small modification of your code:
df %>%
group_by(id, label) %>%
mutate(check = if_else(value > threshold, 1, 0)) %>%
summarise(exceed = sum(check)) %>%
group_by(id, label)
id label exceed
<dbl> <chr> <dbl>
1 1 a 2
2 2 b 0
3 4 a 0
4 5 e 1
5 6 a 1
To match the expected output more closely:
df %>%
group_by(id, label) %>%
mutate(exceed = sum(if_else(value > threshold, 1, 0))) %>%
group_by(id, label, color) %>%
filter(row_number() == 1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 1 a a 12 1 2
5 5 e a 35 50 1
6 6 a a 40 45 1
library(dplyr)
df %>%
group_by(id, label) %>%
mutate(exceed = sum(value > threshold)) %>%
slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 5 e a 35 50 1
5 6 a a 40 45 1
If you like the output to contain a separate row for each combination, of ID, label and color, just add a new group_by before the slice function:
df %>%
group_by(id, label) %>%
mutate(exceed = sum(value > threshold)) %>%
group_by(id, label, color) %>%
slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a a 12 1 2
2 1 a g 12 32.1 2
3 2 b a 10 0 0
4 4 a g 12 10 0
5 5 e a 35 50 1
6 6 a a 40 45 1
A little change in your code
final_df <- df %>% mutate(check = if_else(value > threshold, 1, 0)) %>% group_by(id, label) %>% filter(check==1)
unique(final_df$id)
We could use table and merge :
table_ <- table(subset(df,value>threshold, c("id","label")))
df2 <- merge(unique(df[c("id","label","color")]),table_,all.x=TRUE)
df2$Freq[is.na(df2$Freq)] <- 0
# id label color Freq
# 1 1 a g 2
# 2 1 a a 2
# 3 2 b a 0
# 4 4 a g 0
# 5 5 e a 1
# 6 6 a a 1

Filter (subset) by conditions in 2 columns in R (dplyr or otherwise)

Given a dataset such as:
set.seed(134)
df<- data.frame(ID= rep(LETTERS[1:5], each=2),
condition=rep(0:1, 5),
value=rpois(10, 3)
)
df
ID condition value
1 A 0 2
2 A 1 3
3 B 0 5
4 B 1 2
5 C 0 3
6 C 1 1
7 D 0 2
8 D 1 4
9 E 0 1
10 E 1 5
For each ID, when the value for condition==0 is less than the value for condition==1, I want to keep both observations. When the value for condition==0 is greater than condition==1, I want to keep only the row for condition==0.
The subset returned should be this:
ID condition value
1 A 0 2
2 A 1 3
3 B 0 5
5 C 0 3
7 D 0 2
8 D 1 4
9 E 0 1
10 E 1 5
Using dplyr the first step is:
df %>% group_by(ID) %>%
But not sure where to go from there.
Translating fairly literally,
library(dplyr)
set.seed(134)
df <- data.frame(ID = rep(LETTERS[1:5], each = 2),
condition = rep(0:1, 5),
value = rpois(10, 3))
df %>% group_by(ID) %>%
filter(condition == 0 |
(condition == 1 & value > value[condition == 0]))
#> # A tibble: 8 x 3
#> # Groups: ID [5]
#> ID condition value
#> <fct> <int> <int>
#> 1 A 0 2
#> 2 A 1 3
#> 3 B 0 5
#> 4 C 0 3
#> 5 D 0 2
#> 6 D 1 4
#> 7 E 0 1
#> 8 E 1 5
This depends on each group having a single observation with condition == 0, but should otherwise be fairly robust.
This is may not be the easiest way, but should work as you want.
library(reshape2)
df %>%
dcast(ID ~ condition, value.var = 'value') %>% # cast to wide format
mutate(`1` = ifelse(`1` > `0`, `1`, NA)) %>% # turn 0>1 values as NA
melt('ID') %>% # melt as long format
arrange(ID) %>% # sort by ID
filter(complete.cases(.)) # remove NA rows
Output:
ID variable value
1 A 0 2
2 A 1 3
3 B 0 5
4 C 0 3
5 D 0 2
6 D 1 4
7 E 0 1
8 E 1 5
You always want the value from the first row in each group. You only want the value from the second row in each group if it's larger than the first.
This works:
df %>%
group_by(ID) %>%
filter(row_number() == 1 | value > lag(value))
Edit: as #alistaire points out, this method depends on a particular order in, which is might be a good idea to guarantee as follows:
df %>%
arrange(ID, condition) %>%
group_by(ID) %>%
filter(row_number() == 1 | value > lag(value))

Extracting group dependent results from a dataframe

I have a dataframe made from different groups, and for each group real and predicted values. I want to extract values of tests on these values :
library(dplyr)
d = data.frame(group = c(rep(5,x="a"),rep(5,x="b")), real = c(rep(2, x=1:5)), pred = c(2,1,3,4,5,1,2,4,3,5))
group real pred
1 a 1 2
2 a 2 1
3 a 3 3
4 a 4 4
5 a 5 5
6 b 1 1
7 b 2 2
8 b 3 4
9 b 4 3
10 b 5 5
d <- d %>% group_by(group) %>% mutate( sg = ifelse(real == 1 & real == pred, 1, 0))
d <- d %>% group_by(group) %>% mutate( sp = ifelse(real <= 3 & pred <= 3, 1, 0))
d %>% distinct(sg, sp)
sg sp group
1 0 1 a
2 0 0 a
3 1 1 b
4 0 1 b
5 0 0 b
But I want something like this (only 1 result per group)
sg sp group
1 0 1 a
3 1 1 b
I am pretty sure dplyr, data.table or tidyr can do something but I cannot find how.
If it is always the first row of each group that you want to extract, you could use the do function:
d %>% do(.[1,])
Another option is to use the filter function like this:
d %>% filter(seq_along(sp) == 1)

Resources