Using dplyr: Within groups, select the first value meeting a condition - r

I need assistance obtaining a solution that will scan backwards in time and obtain the first value meeting a condition. I have data similar to:
set.seed(42)
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]
id time.var x
A 5 2
A 14 8
A 19 7
A 20 1
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10
For the last member of each group defined in time order by time.var, I'd like to obtain the first value from x less than 5 by scanning in descending time order.
I have tried:
test <- df %>%
group_by(id) %>%
arrange(id, time.var) %>%
mutate(less.5 = which.max(x[x < 5]) )
What strategy can I use to obtain this type of output:
id time.var x previous.less.5
A 5 2
A 14 8
A 19 7
A 20 1 2
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3 4
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10 4

Using library(dplyr):
# For the last row of every id, report the most recent earlier x that is
# below 5; every other row gets NA.
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(
    previous.less.5 = if_else(
      row_number() == n(),
      # x[-n()] drops the group's final observation; last() returns NA when no
      # earlier value qualifies, so groups without a match do not error.
      last(x[-n()][x[-n()] < 5]),
      # if_else() needs a typed vector here (NULL is rejected by current
      # dplyr); x is integer in this data, use NA_real_ for a double column.
      NA_integer_
    )
  )
or
# Sort once and reuse the sorted table on both sides of the join, so the
# "last row per id" test below always refers to the latest time.var.
df_sorted <- df %>%
  arrange(id, time.var)

df_sorted %>%
  group_by(id) %>%
  # Drop each id's final row. slice(-n()) is safe for one-row groups, whereas
  # 1:(n() - 1) expands to c(1, 0) and would keep that single row.
  slice(-n()) %>%
  filter(x < 5) %>%
  # Most recent remaining value below 5 for each id.
  slice(n()) %>%
  select(id, x) %>%
  # left_join keeps ids that have no earlier value below 5 (they get NA).
  left_join(df_sorted, ., by = "id", suffix = c("", ".y")) %>%
  group_by(id) %>%
  mutate(previous.less.5 = if_else(row_number() == n(), x.y, NA_integer_)) %>%
  select(-x.y)
giving:
#> # A tibble: 20 x 4
#> # Groups: id [3]
#> id time.var x previous.less.5
#> <fct> <int> <int> <int>
#> 1 A 3 10 NA
#> 2 A 4 8 NA
#> 3 A 4 6 NA
#> 4 A 5 2 NA
#> 5 A 5 8 NA
#> 6 A 5 7 NA
#> 7 A 11 6 NA
#> 8 A 13 3 NA
#> 9 A 15 2 3
#> 10 B 2 1 NA
#> 11 B 4 3 NA
#> 12 B 4 6 NA
#> 13 B 8 5 NA
#> 14 B 8 4 NA
#> 15 B 20 7 4
#> 16 C 1 2 NA
#> 17 C 2 10 NA
#> 18 C 10 6 NA
#> 19 C 13 2 NA
#> 20 C 18 5 2
Update:
If a group has no record less than 5 (or only its last record is less than 5), then the following works:
# Return the last element of `v` that is below `limit`, or NA when there is
# none (including when `v` is empty). Missing values never count as a match.
last_below <- function(v, limit) {
  hit <- v[!is.na(v) & v < limit]
  if (length(hit) > 0) hit[length(hit)] else v[NA_integer_]
}

df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(
    # Start from an all-NA column and fill in only the group's last row.
    # x[-n()] excludes that last row from the backwards search. No if_else()
    # or -Inf clean-up is needed, so a group without a match just stays NA.
    previous.less.5 = replace(rep(NA, n()), n(), last_below(x[-n()], 5))
  )
Data:
set.seed(42) # I am getting different data than what you've shown with this seed
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]

We can reverse the values of x within each id and use which() to get the first number that is less than 5. The final replace() assigns NA to every value of previous.less.5 except the last one in each group.
library(dplyr)
# `df` is already sorted by `id` and `time.var`; if yours is not, add
# arrange(id, time.var) %>% as the first step of this pipe.
df %>%
  group_by(id) %>%
  mutate(
    # x in reverse time order, with the group's last observation blanked out
    rev_x = c(NA, rev(x)[-1]),
    # first value below 5 met when scanning backwards (NA if there is none)
    previous.less.5 = rev_x[match(TRUE, rev_x < 5)],
    # keep the answer on the last row of each group only
    previous.less.5 = replace(previous.less.5, row_number() < n(), NA)
  ) %>%
  select(-rev_x)
# id time.var x previous.less.5
# <fct> <int> <int> <int>
# 1 A 5 2 NA
# 2 A 14 8 NA
# 3 A 19 7 NA
# 4 A 20 1 2
# 5 B 1 1 NA
# 6 B 2 5 NA
# 7 B 9 10 NA
# 8 B 11 10 NA
# 9 B 13 6 NA
#10 B 15 4 NA
#11 B 19 3 4
#12 C 1 7 NA
#13 C 3 5 NA
#14 C 8 9 NA
#15 C 8 4 NA
#16 C 17 7 NA
#17 C 17 4 NA
#18 C 17 8 NA
#19 C 19 4 NA
#20 C 19 10 4
This also handles the case where an id has no value less than 5: previous.less.5 is then NA for every row of that id.

Related

Get mean column values every n rows grouped by value in other column

I have a dataframe df that looks like this
time object
1 1 A
2 2 A
3 3 A
4 4 A
5 5 A
6 6 A
7 7 B
8 8 B
9 9 B
10 10 B
11 11 B
12 12 C
13 13 C
14 14 C
15 15 C
16 16 C
17 17 C
18 18 C
I would like to get the mean of the time column for every 3 rows within each value of the object column
df_mean
time object
1 2 A
2 5 A
3 8 B
4 13 C
5 16 C
I thought about using dplyr
df%>%
mutate(grp = 1+ (row_number()-1) %/% 3) %>%
group_by(grp) %>%
summarise(across(c("time"), mean, na.rm = TRUE)) %>%
select(-grp)
but I do not know how to integrate the control for the object.
Another option would be to use aggregate
aggregate(.~object, data=df, mean)
but in this case I do not know how to get the mean every 3 rows.
Your dplyr attempt is on the right track. With a few modifications it will work.
library(dplyr)
df <- tibble(time = 1:18, object = rep(c('A', 'B', 'C'), each = 6))
# Mean of `time` over consecutive 3-row chunks, counted separately inside each
# object so that no chunk mixes rows from two objects.
df %>%
  group_by(object) %>%
  # 0-based chunk index within the object: rows 1-3 -> 0, rows 4-6 -> 1, ...
  mutate(grp = (row_number() - 1) %/% 3) %>%
  group_by(object, grp) %>%
  summarise(time = mean(time, na.rm = TRUE), .groups = "drop") %>%
  select(-grp)
#> # A tibble: 6 × 2
#> object time
#> <chr> <dbl>
#> 1 A 2
#> 2 A 5
#> 3 B 8
#> 4 B 11
#> 5 C 14
#> 6 C 17
Here is an alternative dplyr way:
library(dplyr)
# Chunk size: number of consecutive rows averaged together.
n <- 3
df %>%
  group_by(object) %>%
  # Within each object, label rows 1..n as chunk 1, the next n as chunk 2, and
  # so on. `n` is the chunk size above; `n()` is the current group's row count.
  mutate(Col2 = rep(row_number(), each = n, length.out = n())) %>%
  group_by(object, Col2) %>%
  summarise(time = mean(time, na.rm = TRUE), .groups = "drop") %>%
  select(-Col2)
object time
<chr> <dbl>
1 A 2
2 A 5
3 B 8
4 B 11
5 C 14
6 C 17
You can do a slight modification, creating your grp variable within object group first, and then filtering where the size of the joint grouping is >=3, and then summarize:
# Average `time` over complete 3-row chunks within each object; a trailing
# chunk with fewer than three rows is discarded.
df %>%
  group_by(object) %>%
  # 1-based chunk index inside the object
  mutate(grp = ceiling(row_number() / 3)) %>%
  group_by(object, grp) %>%
  # drop incomplete chunks
  filter(n() > 2) %>%
  summarize(time = mean(time), .groups = "drop") %>%
  select(-grp)
Output:
object time
<chr> <dbl>
1 A 2
2 A 5
3 B 8
4 C 13
5 C 16
data.table option:
library(data.table)
# Convert `df` to a data.table by reference.
setDT(df)
# Tag every run of three rows within each object (adds column n3 in place).
df[, n3 := gl(.N, 3, length = .N), by = "object"]
# Average each chunk, then return only object and time.
df[, .(time = mean(time)), by = c("object", "n3")][, .(object, time)]
Output:
object time
1: A 2
2: A 5
3: B 8
4: B 11
5: C 14
6: C 17
in base R:
# Flag the first row of every 3-row chunk, restarting the count at each new
# object so a chunk never straddles two objects; cumsum() then numbers the
# chunks 1, 2, 3, ... down the table.
chunk_start <- ave(
  seq_len(nrow(df)), df$object,
  FUN = function(i) (seq_along(i) - 1L) %% 3L
) == 0L
# `time ~ .` groups by every remaining column, i.e. object and gr.
aggregate(time ~ ., cbind(df, gr = cumsum(chunk_start)), mean)
object gr time
1 A 1 2
2 A 2 5
3 B 3 8
4 B 4 11
5 C 5 14
6 C 6 17

Only rows where difference between them is less than 'n' in groups

Let's say we have the below dataset where values in V2 are ordered ascending in groups V1:
Input =(" V1 V2
1 A 3
2 A 4
3 A 5
4 A 6
5 A 12
6 A 13
7 B 4
8 B 5
9 B 6
10 B 12
11 C 13
12 C 14
13 C 18")
df = as.data.frame(read.table(textConnection(Input), header = T, row.names = 1))
Now I want to keep rows where the difference between consecutive ones is <= 1, so my desired output:
V1 V2
1 A 3
2 A 4
3 A 5
4 A 6
5 A 12
6 A 13
7 B 4
8 B 5
9 B 6
11 C 13
12 C 14
However when I use:
df %>%
group_by(V1) %>%
filter(c(0,diff(V2)) <= 1)
I have:
V1 V2
1 A 3
2 A 4
3 A 5
4 A 6
5 A 13
6 B 4
7 B 5
8 B 6
9 C 13
10 C 14
The row with V2 value 12 is missing and it should be in dataset. I tried also with lag() but result is same.
df %>%
group_by(V1) %>%
filter(V2 - lag(V2) <= 1 | is.na(V2 - lag(V2)))
Could you point my mistake?
You need to compare each value with both of its neighbours, the previous one and the next one. Try lag and lead:
library(dplyr)
# Keep a row when it lies within 1 of the previous OR the next value in its
# group (V2 is ascending within V1).
df %>%
  group_by(V1) %>%
  filter(
    # The gap to the next row must be lead(V2) - V2. The reverse, V2 - lead(V2),
    # is never positive for ascending data, so that test would always pass.
    # At the group edges lag()/lead() give NA; filter() drops rows whose
    # condition is NA, so an isolated first or last row is removed.
    V2 - lag(V2) <= 1 | lead(V2) - V2 <= 1
  )
# V1 V2
# <chr> <int>
# 1 A 3
# 2 A 4
# 3 A 5
# 4 A 6
# 5 A 12
# 6 A 13
# 7 B 4
# 8 B 5
# 9 B 6
#10 C 13
#11 C 14
Here is another idea where we create groups with a tolerance of 1, and filter out those groups with only one observation, i.e.
# Split each V1 group into runs of rows whose consecutive gap is at most 1,
# then keep only runs with more than one row.
df %>%
  group_by(V1) %>%
  # A new run starts at the first row and wherever the gap exceeds 1. Using
  # `> 1` rather than `!= 1` keeps tied values (gap 0) in the same run.
  mutate(grp = cumsum(c(TRUE, diff(V2) > 1))) %>%
  group_by(V1, grp) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 11 x 2
# V1 V2
# <fct> <int>
# 1 A 3
# 2 A 4
# 3 A 5
# 4 A 6
# 5 A 12
# 6 A 13
# 7 B 4
# 8 B 5
# 9 B 6
#10 C 13
#11 C 14

Restructure data based on row numbers in R

I am having troubles restructuring the data as I need to.
My df looks like this:
id <- (1:20)
author <- c("A","A","A","A","A","B","B","B","A","A","A","B","B","B","B"
,"B","B","B","A","A")
df <- data.frame(id, author)
> print(df)
id author
1 1 A
2 2 A
3 3 A
4 4 A
5 5 A
6 6 B
7 7 B
8 8 B
9 9 A
10 10 A
11 11 A
12 12 B
13 13 B
14 14 B
15 15 B
16 16 B
17 17 B
18 18 B
19 19 A
20 20 A
And I'm trying to get a data structure where the columns are the authors and the rows indicate the first and last id values of each sequence of A or B values. So in this case the first row with author A is id = 1, and the last one of that series is id = 5, and so forth.
Something like this:
A <- c(1, 5, 9, 11, 19,20)
B <- c(6, 8, 12, 18, NA, NA)
df.desired <- data.frame(A, B)
print(df.desired)
A B
1 1 6
2 5 8
3 9 12
4 11 18
5 19 NA
6 20 NA
Any ideas?
Thanks a lot!
We can create groups using data.table rleid, select 1st and last row in each group and get data in wide format.
library(dplyr)
df %>%
  # one group per uninterrupted run of the same author
  group_by(grp = data.table::rleid(author)) %>%
  # first and last row of every run
  slice(c(1L, n())) %>%
  # renumber the kept rows within each author; this becomes the output row key
  group_by(author) %>%
  mutate(grp = seq_len(n())) %>%
  # one column per author, holding the run endpoints
  tidyr::pivot_wider(names_from = author, values_from = id) %>%
  select(-grp)
# A tibble: 6 x 2
# A B
# <int> <int>
#1 1 6
#2 5 8
#3 9 12
#4 11 18
#5 19 NA
#6 20 NA
For the updated request in comments we can do :
# One output row per run of identical authors: column `1` holds the run's
# first id and column `2` its last id.
df %>%
  group_by(grp = data.table::rleid(author)) %>%
  slice(1L, n()) %>%
  # position inside the run (1 = first row, 2 = last row); this is the column
  # that pivot_wider() spreads on, so it has to exist under the name `row`
  mutate(row = row_number()) %>%
  ungroup() %>%
  # author is not wanted in the result and would otherwise be kept as an id
  select(-author) %>%
  tidyr::pivot_wider(names_from = row, values_from = id) %>%
  select(-grp)
# A tibble: 5 x 2
# `1` `2`
# <int> <int>
#1 1 5
#2 6 8
#3 9 11
#4 12 18
#5 19 20
Here is a base R option
# Run-length encode the author column: one entry per uninterrupted run.
runs <- rle(df$author)
# Run number of every row (1 for the first run, 2 for the second, ...).
run_id <- rep(seq_along(runs$lengths), runs$lengths)
# Smallest and largest id of each run, in run order.
run_span <- lapply(split(df$id, run_id), range)
# Which runs belong to which author.
runs_by_author <- split(seq_along(runs$values), runs$values)
# Concatenate the endpoints of each author's runs.
endpoints <- lapply(
  runs_by_author,
  function(k) unlist(run_span[k], use.names = FALSE)
)
# Pad the shorter vectors with NA so they fit into one data frame.
df.desired <- as.data.frame(
  lapply(endpoints, `length<-`, max(lengths(endpoints)))
)
which gives
> df.desired
A B
1 1 6
2 5 8
3 9 12
4 11 18
5 19 NA
6 20 NA
An option using data.table:
library(data.table)
# Convert by reference and add a run id for each uninterrupted author run.
setDT(df)
df[, ri := rleid(author)]
# First and last id of every run; the unnamed result column is called V1.
run_ends <- df[, id[c(1L, .N)], by = .(author, ri)]
# One column per author, one row per endpoint position within that author.
dcast(run_ends, rowid(author) ~ author, value.var = "V1")
output:
author A B
1: 1 1 6
2: 2 5 8
3: 3 9 12
4: 4 11 18
5: 5 19 NA
6: 6 20 NA
If there is a possibility of an author having a single row, you will need unique(c(1L, .N))

Add a new row for each id in dataframe for ALL variables

I want to add a new row after each id. I found a solution on a Stack Overflow page (Inserting a new row to data frame for each group id)
but there is one thing I want to change and I don't know how. I want to make a new row for all variables without writing out every variable by name (as the Stack Overflow example does). The numbers in the new row don't matter; I will change them later. If it is possible to add "base" in the new row for trt, that would be good. I want the code to work for many ids and variables, since the data I'm working with has a lot of both. Many thanks if someone can help me with this!
The example code:
set.seed(1)
> id <- rep(1:3,each=4)
> trt <- rep(c("A","OA", "B", "OB"),3)
> pointA <- sample(1:10,12, replace=TRUE)
> pointB<- sample(1:10,12, replace=TRUE)
> pointC<- sample(1:10,12, replace=TRUE)
> df <- data.frame(id,trt,pointA, pointB,pointC)
> df
id trt pointA pointB pointC
1 1 A 3 7 3
2 1 OA 4 4 4
3 1 B 6 8 1
4 1 OB 10 5 4
5 2 A 3 8 9
6 2 OA 9 10 4
7 2 B 10 4 5
8 2 OB 7 8 6
9 3 A 7 10 5
10 3 OA 1 3 2
11 3 B 3 7 9
12 3 OB 2 2 7
I want it to look like:
df <- rbind(df[1:4,], df1, df[5:8,], df2, df[9:12,],df3)
> df
id trt pointA pointB pointC
1 1 A 3 7 3
2 1 OA 4 4 4
3 1 B 6 8 1
4 1 OB 10 5 4
5 1 base
51 2 A 3 8 9
6 2 OA 9 10 4
7 2 B 10 4 5
8 2 OB 7 8 6
13 2 base
9 3 A 7 10 5
10 3 OA 1 3 2
11 3 B 3 7 9
12 3 OB 2 2 7
14 3 base
>
I'm trying this code:
df %>%
+ group_by(id) %>%
+ summarise(week = "base") %>%
+ mutate_all() %>% #want tomutate allvariables
+ bind_rows(df, .) %>%
+ arrange(id)
You could bind_rows directly, it will add NAs to all other columns by default.
library(dplyr)
# One "base" row per id; columns missing from it are filled with NA by
# bind_rows().
base_rows <- df %>%
  group_by(id) %>%
  summarise(trt = "base")
# The base rows are bound ahead of df, so after sorting by id each one sits
# first within its id.
bind_rows(base_rows, df) %>%
  arrange(id)
# id trt pointA pointB pointC
# <int> <chr> <int> <int> <int>
# 1 1 base NA NA NA
# 2 1 A 3 7 3
# 3 1 OA 4 4 4
# 4 1 B 6 8 1
# 5 1 OB 10 5 4
# 6 2 base NA NA NA
# 7 2 A 3 8 9
# 8 2 OA 9 10 4
# 9 2 B 10 4 5
#10 2 OB 7 8 6
#11 3 base NA NA NA
#12 3 A 7 10 5
#13 3 OA 1 3 2
#14 3 B 3 7 9
#15 3 OB 2 2 7
If you want empty strings instead of NA, we can give a range of columns in mutate_at and replace NA values with empty string.
# Same as binding a "base" row per id, but showing empty strings instead of
# NA in the point columns.
df %>%
  group_by(id) %>%
  summarise(trt = "base") %>%
  bind_rows(df) %>%
  # across() replaces the superseded mutate_at(vars(...)). Inserting "" turns
  # the selected columns into character vectors.
  mutate(across(pointA:pointC, ~ replace(.x, is.na(.x), ""))) %>%
  arrange(id)
library(dplyr)
library(purrr)
# Append a "base" row (same id, other columns NA) after one id's rows.
append_base_row <- function(group_rows) {
  base_row <- data.frame(
    id = group_rows$id[1],
    trt = "base",
    stringsAsFactors = FALSE
  )
  bind_rows(group_rows, base_row)
}

df %>%
  # factors would clash with the character "base" value when binding
  mutate_if(is.factor, as.character) %>%
  group_split(id) %>%
  map_dfr(append_base_row)
# Note that group_modify() is experimental.
df %>%
  mutate_if(is.factor, as.character) %>%
  group_by(id) %>%
  # `rows` holds one id's rows without the id column; group_modify() adds the
  # grouping key back, so the appended row only needs trt.
  group_modify(function(rows, key) {
    bind_rows(rows, data.frame(trt = "base", stringsAsFactors = FALSE))
  })

tidyr spread does not aggregate data

I have data of the following:
> data <- data.frame(unique=1:9, grouping=rep(c('a', 'b', 'c'), each=3), value=sample(1:30, 9))
> data
unique grouping value
1 1 a 15
2 2 a 21
3 3 a 26
4 4 b 8
5 5 b 6
6 6 b 4
7 7 c 17
8 8 c 1
9 9 c 3
I would like to create a table that looks like this:
a b c
1 15 8 17
2 21 6 1
3 26 4 3
I am using tidyr::spread and not getting the correct result:
> data %>% spread(grouping, value)
unique a b c
1 1 15 NA NA
2 2 21 NA NA
3 3 26 NA NA
4 4 NA 8 NA
5 5 NA 6 NA
6 6 NA 4 NA
7 7 NA NA 17
8 8 NA NA 1
9 9 NA NA 3
Or
> data %>% select(grouping, value) %>% spread(grouping, value)
Error: Duplicate identifiers for rows (1, 2, 3), (4, 5, 6), (7, 8, 9)
Is there a way to do this also when one group (c) has a different length than the others?
We need to create a sequence column to avoid the duplicate identifiers row Error.
library(tidyr)
library(dplyr)
# Put each grouping level in its own column, lining values up by their
# position within the group. Shorter groups are padded with NA.
data %>%
  group_by(grouping) %>%
  # within-group sequence number: gives every output row a unique identifier,
  # which avoids the duplicate-identifiers error
  mutate(id = row_number()) %>%
  ungroup() %>%
  select(-unique) %>%
  # pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
  # spread()'s sorted column order
  pivot_wider(
    names_from = grouping,
    values_from = value,
    names_sort = TRUE
  ) %>%
  select(-id)
# a b c
# (int) (int) (int)
#1 15 8 17
#2 21 6 1
#3 26 4 3

Resources