Restructure data based on row numbers in R - r

I am having troubles restructuring the data as I need to.
My df looks like this:
id <- (1:20)
author <- c("A","A","A","A","A","B","B","B","A","A","A","B","B","B","B"
,"B","B","B","A","A")
df <- data.frame(id, author)
> print(df)
id author
1 1 A
2 2 A
3 3 A
4 4 A
5 5 A
6 6 B
7 7 B
8 8 B
9 9 A
10 10 A
11 11 A
12 12 B
13 13 B
14 14 B
15 15 B
16 16 B
17 17 B
18 18 B
19 19 A
20 20 A
And I'm trying to get a data structure where the columns are the authors and the rows indicate the first and last id values of each sequence of A or B values. So in this case the first row with author A is id = 1, and the last one of that series is id = 5, and so forth.
Something like this:
A <- c(1, 5, 9, 11, 19,20)
B <- c(6, 8, 12, 18, NA, NA)
df.desired <- data.frame(A, B)
print(df.desired)
A B
1 1 6
2 5 8
3 9 12
4 11 18
5 19 NA
6 20 NA
Any ideas?
Thanks a lot!

We can create groups using data.table rleid, select 1st and last row in each group and get data in wide format.
library(dplyr)
df %>%
group_by(grp = data.table::rleid(author)) %>%
slice(1L, n()) %>%
group_by(author) %>%
mutate(grp = row_number()) %>%
tidyr::pivot_wider(names_from = author, values_from = id) %>%
select(-grp)
# A tibble: 6 x 2
# A B
# <int> <int>
#1 1 6
#2 5 8
#3 9 12
#4 11 18
#5 19 NA
#6 20 NA
For the updated request in comments we can do :
# For every run of consecutive identical authors, put the id of the run's
# first row and the id of its last row side by side (columns `1` and `2`),
# giving one output row per run.
df %>%
  # rleid() assigns each run of consecutive equal `author` values its own id
  group_by(grp = data.table::rleid(author)) %>%
  # keep the first and the last row of each run
  slice(1L, n()) %>%
  # position inside the run: 1 = first row, 2 = last row
  mutate(row = row_number()) %>%
  # id_cols = grp drops `author`, so each run collapses to a single row
  tidyr::pivot_wider(id_cols = grp, names_from = row, values_from = id) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 5 x 2
# `1` `2`
# <int> <int>
#1 1 5
#2 6 8
#3 9 11
#4 12 18
#5 19 20

Here is a base R option
# Run-length encode the authors: one entry per run of consecutive equal values
z <- rle(df$author)
# Label every row with the (0-based) index of the run it belongs to.
# left.open = TRUE keeps the last row of a run inside that run.
# seq_len() is used instead of 1:nrow(df) so a zero-row df yields no labels.
run_id <- findInterval(seq_len(nrow(df)), cumsum(z$lengths), left.open = TRUE)
lst <- split(df, run_id)
# Smallest and largest id within each run
u <- lapply(lst, function(v) range(v$id))
# Positions of the runs that belong to each author
idx <- split(seq_along(z$values), z$values)
# Concatenate the id ranges of each author's runs, in run order
x <- lapply(idx, function(v) unlist(u[v], use.names = FALSE))
# Pad the shorter vectors with NA so every column has the same length
df.desired <- as.data.frame(lapply(x, `length<-`, max(lengths(x))))
which gives
> df.desired
A B
1 1 6
2 5 8
3 9 12
4 11 18
5 19 NA
6 20 NA

An option using data.table:
library(data.table)
dcast(
setDT(df)[, ri := rleid(author)][, id[c(1L, .N)], .(author, ri)],
rowid(author) ~ author, value.var="V1")
output:
author A B
1: 1 1 6
2: 2 5 8
3: 3 9 12
4: 4 11 18
5: 5 19 NA
6: 6 20 NA
If there is a possibility of an author having a single row, you will need unique(c(1L, .N))

Related

Get mean column values every n rows grouped by value in other column

I have a dataframe df that looks like this
time object
1 1 A
2 2 A
3 3 A
4 4 A
5 5 A
6 6 A
7 7 B
8 8 B
9 9 B
10 10 B
11 11 B
12 12 C
13 13 C
14 14 C
15 15 C
16 16 C
17 17 C
18 18 C
I would like to get the mean of the time column every 3 rows based on the object column
df_mean
time object
1 2 A
2 5 A
3 8 B
4 13 C
5 16 C
I thought about using dplyr
df%>%
mutate(grp = 1+ (row_number()-1) %/% 3) %>%
group_by(grp) %>%
summarise(across(c("time"), mean, na.rm = TRUE)) %>%
select(-grp)
but I do not know how to integrate the control for the object.
Another option would be to use aggregate
aggregate(.~object, data=df, mean)
but in this case I do not know how to get the mean every 3 rows.
Your dplyr attempt is on the right track. With a few modifications it will work.
library(dplyr)
df <- tibble(time = 1:18, object = rep(c('A', 'B', 'C'), each = 6))
# Mean of `time` over consecutive chunks of 3 rows, restarting the chunk
# counter inside every `object`.
df %>%
  group_by(object) %>%
  # row_number() must be evaluated per object; computing it on the ungrouped
  # data lets a chunk straddle two objects when a group size is not a
  # multiple of 3
  mutate(grp = (row_number() - 1) %/% 3) %>%
  group_by(object, grp) %>%
  summarise(time = mean(time, na.rm = TRUE), .groups = "drop") %>%
  select(-grp)
#> # A tibble: 6 × 2
#> object time
#> <chr> <dbl>
#> 1 A 2
#> 2 A 5
#> 3 B 8
#> 4 B 11
#> 5 C 14
#> 6 C 17
Here is an alternative dplyr way:
library(dplyr)
# Chunk size: number of consecutive rows averaged together
n <- 3
df %>%
  group_by(object) %>%
  # Build the chunk label per object (1,1,1,2,2,2,...) so that a chunk never
  # mixes rows of two different objects
  mutate(Col2 = rep(row_number(), each = n, length.out = n())) %>%
  group_by(object, Col2) %>%
  # "drop_last" keeps the result grouped by `object`, as summarise() does by
  # default, without the regrouping message
  summarise(time = mean(time, na.rm = TRUE), .groups = "drop_last") %>%
  select(-Col2)
object time
<chr> <dbl>
1 A 2
2 A 5
3 B 8
4 B 11
5 C 14
6 C 17
You can do a slight modification, creating your grp variable within object group first, and then filtering where the size of the joint grouping is >=3, and then summarize:
df%>%
group_by(object) %>%
mutate(grp = 1+ (row_number()-1) %/% 3) %>%
group_by(object,grp) %>%
filter(n()>=3) %>%
summarize(time=mean(time), .groups="drop") %>%
select(-grp)
Output:
object time
<chr> <dbl>
1 A 2
2 A 5
3 B 8
4 C 13
5 C 16
data.table option:
library(data.table)
setDT(df)
df[, n3:=gl(.N, 3, length=.N), by=object]
df[, .(time=mean(time)), by=.(object, n3)][, !"n3"]
Output:
object time
1: A 2
2: A 5
3: B 8
4: B 11
5: C 14
6: C 17
in base R:
aggregate(time~., cbind(df, gr=gl(nrow(df),3, nrow(df))), mean)
object gr time
1 A 1 2
2 A 2 5
3 B 3 8
4 B 4 11
5 C 5 14
6 C 6 17

Using dplyr: Within groups, select the first value meeting a condition

I need assistance obtaining a solution that will scan backwards in time and obtain the first value meeting a condition. I have data similar to:
set.seed(42)
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]
id time.var x
A 5 2
A 14 8
A 19 7
A 20 1
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10
For the last member of each group defined in time order by time.var, I'd like to obtain the first value from x less than 5 by scanning in descending time order.
I have tried:
test <- df %>%
group_by(id) %>%
arrange(id, time.var) %>%
mutate(less.5 = which.max(x[x < 5]) )
What strategy can I use to obtain this type of output:
id time.var x previous.less.5
A 5 2
A 14 8
A 19 7
A 20 1 2
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3 4
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10 4
Using library(dplyr):
# For the last row of each id (in time order), report the most recent earlier
# x that is below 5; every other row gets NA.
# Assumes each id has at least one earlier value below 5; otherwise tail()
# returns a zero-length vector and mutate() fails.
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(
    # x[-n()] excludes the last row itself; the trailing FALSE pads the mask
    # back to the group length
    previous.less.5 = tail(x[c(x[-n()] < 5, FALSE)], 1),
    # keep the value on the last row only; replace() is used because
    # if_else() does not accept NULL as its `false` argument in current dplyr
    previous.less.5 = replace(previous.less.5, row_number() != n(), NA)
  )
or
# Same result via a join: find, per id, the latest x below 5 among all rows
# except the last one, then attach it to the last row of that id.
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  # drop the last row of each id; slice(1:(n() - 1)) would keep row 1 of a
  # single-row group because 1:0 is c(1, 0)
  slice(-n()) %>%
  filter(x < 5) %>%
  # latest remaining row = most recent earlier value below 5
  slice(n()) %>%
  select(-time.var) %>%
  # left_join keeps ids that have no earlier value below 5 (they get NA)
  left_join(df, ., by = "id", suffix = c("", ".y")) %>%
  group_by(id) %>%
  # show the value on the last row of each id only
  mutate(previous.less.5 = replace(x.y, row_number() != n(), NA)) %>%
  select(-x.y)
giving:
#> # A tibble: 20 x 4
#> # Groups: id [3]
#> id time.var x previous.less.5
#> <fct> <int> <int> <int>
#> 1 A 3 10 NA
#> 2 A 4 8 NA
#> 3 A 4 6 NA
#> 4 A 5 2 NA
#> 5 A 5 8 NA
#> 6 A 5 7 NA
#> 7 A 11 6 NA
#> 8 A 13 3 NA
#> 9 A 15 2 3
#> 10 B 2 1 NA
#> 11 B 4 3 NA
#> 12 B 4 6 NA
#> 13 B 8 5 NA
#> 14 B 8 4 NA
#> 15 B 20 7 4
#> 16 C 1 2 NA
#> 17 C 2 10 NA
#> 18 C 10 6 NA
#> 19 C 13 2 NA
#> 20 C 18 5 2
Update:
If there's a group with no record less than 5 (or only last record less than 5) then following works:
# Variant that also works when an id has no earlier value below 5 (or only
# its last record is below 5): such ids get NA on their last row.
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(
    # most recent earlier x below 5; appending NA and taking the first element
    # yields NA when there is no such value, without max() warnings or -Inf
    previous.less.5 = c(tail(x[c(x[-n()] < 5, FALSE)], 1), NA)[1],
    # keep the value on the last row of each id only
    previous.less.5 = replace(previous.less.5, row_number() != n(), NA)
  )
Data:
set.seed(42) # I am getting different data than what you've shown with this seed
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]
We can reverse the values of x within each id and get the first number that is less than 5 using which. The last replace assigns NA to all the values in previous.less.5 except the last one.
library(dplyr)
df %>%
#Data is already sorted by `id` and `time.var` but if your still need use
#arrange(id, time.var) %>%
group_by(id) %>%
mutate(rev_x = c(NA, rev(x)[-1]), previous.less.5 = rev_x[which(rev_x < 5)[1]],
previous.less.5 = replace(previous.less.5, row_number() != n(), NA)) %>%
select(-rev_x)
# id time.var x previous.less.5
# <fct> <int> <int> <int>
# 1 A 5 2 NA
# 2 A 14 8 NA
# 3 A 19 7 NA
# 4 A 20 1 2
# 5 B 1 1 NA
# 6 B 2 5 NA
# 7 B 9 10 NA
# 8 B 11 10 NA
# 9 B 13 6 NA
#10 B 15 4 NA
#11 B 19 3 4
#12 C 1 7 NA
#13 C 3 5 NA
#14 C 8 9 NA
#15 C 8 4 NA
#16 C 17 7 NA
#17 C 17 4 NA
#18 C 17 8 NA
#19 C 19 4 NA
#20 C 19 10 4
This should also handle the case and return NA's if there is no value less than 5 in an id.

Add a new row for each id in dataframe for ALL variables

I want to add a new row after each id. I found a solution on a Stack Overflow page (Inserting a new row to data frame for each group id)
but there is one thing I want to change and I don't know how. I want to make a new row for all variables, but I don't want to write down all the variables (as in the Stack Overflow example). The numbers in the new row don't matter; I will change them later. If it is possible to add "base" in the new row for trt, that would be good. I want the code to work for many ids and variables, since the data I'm working with has a lot of those. Many thanks if someone can help me with this!
The example code:
set.seed(1)
> id <- rep(1:3,each=4)
> trt <- rep(c("A","OA", "B", "OB"),3)
> pointA <- sample(1:10,12, replace=TRUE)
> pointB<- sample(1:10,12, replace=TRUE)
> pointC<- sample(1:10,12, replace=TRUE)
> df <- data.frame(id,trt,pointA, pointB,pointC)
> df
id trt pointA pointB pointC
1 1 A 3 7 3
2 1 OA 4 4 4
3 1 B 6 8 1
4 1 OB 10 5 4
5 2 A 3 8 9
6 2 OA 9 10 4
7 2 B 10 4 5
8 2 OB 7 8 6
9 3 A 7 10 5
10 3 OA 1 3 2
11 3 B 3 7 9
12 3 OB 2 2 7
I want it to look like:
df <- rbind(df[1:4,], df1, df[5:8,], df2, df[9:12,],df3)
> df
id trt pointA pointB pointC
1 1 A 3 7 3
2 1 OA 4 4 4
3 1 B 6 8 1
4 1 OB 10 5 4
5 1 base
51 2 A 3 8 9
6 2 OA 9 10 4
7 2 B 10 4 5
8 2 OB 7 8 6
13 2 base
9 3 A 7 10 5
10 3 OA 1 3 2
11 3 B 3 7 9
12 3 OB 2 2 7
14 3 base
>
I'm trying this code:
df %>%
+ group_by(id) %>%
+ summarise(week = "base") %>%
+ mutate_all() %>% #want tomutate allvariables
+ bind_rows(df, .) %>%
+ arrange(id)
You could bind_rows directly, it will add NAs to all other columns by default.
library(dplyr)
df %>% group_by(id) %>% summarise(trt = 'base') %>% bind_rows(df) %>% arrange(id)
# id trt pointA pointB pointC
# <int> <chr> <int> <int> <int>
# 1 1 base NA NA NA
# 2 1 A 3 7 3
# 3 1 OA 4 4 4
# 4 1 B 6 8 1
# 5 1 OB 10 5 4
# 6 2 base NA NA NA
# 7 2 A 3 8 9
# 8 2 OA 9 10 4
# 9 2 B 10 4 5
#10 2 OB 7 8 6
#11 3 base NA NA NA
#12 3 A 7 10 5
#13 3 OA 1 3 2
#14 3 B 3 7 9
#15 3 OB 2 2 7
If you want empty strings instead of NA, we can give a range of columns in mutate_at and replace NA values with empty string.
df %>%
group_by(id) %>%
summarise(trt = 'base') %>%
bind_rows(df) %>%
mutate_at(vars(pointA:pointC), ~replace(., is.na(.) , '')) %>%
arrange(id)
library(dplyr)
library(purrr)
df %>% mutate_if(is.factor, as.character) %>%
group_split(id) %>%
map_dfr(~bind_rows(.x, data.frame(id=.x$id[1], trt="base", stringsAsFactors = FALSE)))
#Note that group_modify is Experimental
df %>% mutate_if(is.factor, as.character) %>%
group_by(id) %>%
group_modify(~bind_rows(.x, data.frame(trt="base", stringsAsFactors = FALSE)))

How to call columns implicitly in certain R functions

There are some functions in R where you have to call columns explicitly by name, such as pmin. My question is how to get around this, preferably using tidyverse.
Here's some sample data.
library(tidyverse)
df <- tibble(a = c(1:5),
b = c(6:10),
d = c(11:15),
e = c(16:20))
# A tibble: 5 x 4
a b d e
<int> <int> <int> <int>
1 1 6 11 16
2 2 7 12 17
3 3 8 13 18
4 4 9 14 19
5 5 10 15 20
Now I'd like to find the minimum of all the columns except for "e". I can do this:
df %>%
mutate(min = pmin(a, b, d))
# A tibble: 5 x 5
a b d e min
<int> <int> <int> <int> <int>
1 1 6 11 16 1
2 2 7 12 17 2
3 3 8 13 18 3
4 4 9 14 19 4
5 5 10 15 20 5
But what if I have many columns and would like to call every column except "e" without having to type out each column's name? I've made several attempts but none successful. I used the column index in my examples but I'd prefer excluding "e" by name. See below.
df %>%
mutate(min = pmin(-e))
df %>%
mutate(min = pmin(names(. %>% select(.))[-4]))
df %>%
mutate(min = pmin(names(.)[-4]))
df %>%
mutate(min = pmin(noquote(paste0(names(.)[-4], collapse = ","))))
df %>%
mutate(min = pmin(!!ensyms(names(.)[-4])))
None of these worked and I'm a bit at a loss.
One option using dplyr and purrr could be:
df %>%
mutate(min = exec(pmin, !!!select(., -e)))
a b d e min
<int> <int> <int> <int> <int>
1 1 6 11 16 1
2 2 7 12 17 2
3 3 8 13 18 3
4 4 9 14 19 4
5 5 10 15 20 5
For those not reading the comments, a nice option proposed by @IceCreamToucan and involving only dplyr could be:
df %>%
mutate(min = do.call(pmin, select(., -e)))
We can also use reduce with pmin
library(dplyr)
library(purrr)
df %>%
mutate(min = select(., -e) %>%
reduce(pmin))
# A tibble: 5 x 5
# a b d e min
# <int> <int> <int> <int> <int>
#1 1 6 11 16 1
#2 2 7 12 17 2
#3 3 8 13 18 3
#4 4 9 14 19 4
#5 5 10 15 20 5
Or with syms and !!!. Note that the en- prefixed variants (ensym, ensyms) are meant for use inside a function, where they capture the function's arguments.
df %>%
mutate(min = pmin(!!! syms(names(.)[-4])))
# A tibble: 5 x 5
# a b d e min
# <int> <int> <int> <int> <int>
#1 1 6 11 16 1
#2 2 7 12 17 2
#3 3 8 13 18 3
#4 4 9 14 19 4
#5 5 10 15 20 5
I would do this by reshaping long, calculating the min by group, and then reshaping back to wide:
df %>%
rowid_to_column() %>%
pivot_longer(cols = -c(e, rowid)) %>%
group_by(rowid) %>%
mutate(min = min(value)) %>%
ungroup() %>%
pivot_wider() %>%
select(-rowid, -min, min)

How to use group_by with summarise and summarise_all?

x y
1 1 1
2 3 2
3 2 3
4 3 4
5 2 5
6 4 6
7 5 7
8 2 8
9 1 9
10 1 10
11 3 11
12 4 12
The above is part of the input.
Let's suppose that it also has a bunch of other columns
I want to:
group_by x
summarise y by sum
And for all other columns, I want to summarise_all by just taking the first value
Here's an approach that breaks it into two problems and combines them:
library(dplyr)
left_join(
# Here we want to treat column y specially
df %>%
group_by(x) %>%
summarize(sum_y = sum(y)),
# Here we exclude y and use a different summation for all the remaining columns
df %>%
group_by(x) %>%
select(-y) %>%
summarise_all(first)
)
# A tibble: 5 x 3
x sum_y z
<int> <int> <int>
1 1 20 1
2 2 16 3
3 3 17 2
4 4 18 2
5 5 7 3
Sample data:
df <- read.table(
header = T,
stringsAsFactors = F,
text="x y z
1 1 1
3 2 2
2 3 3
3 4 4
2 5 1
4 6 2
5 7 3
2 8 4
1 9 1
1 10 2
3 11 3
4 12 4")
library(dplyr)
df1 %>%
group_by(x) %>%
summarise_each(list(avg = mean), -y) %>%
bind_cols(.,{df1 %>%
group_by(x) %>%
summarise_at(vars(y), funs(sum)) %>%
select(-x)
})
#> # A tibble: 5 x 4
#> x r_avg r.1_avg y
#> <int> <dbl> <dbl> <int>
#> 1 1 6.67 6.67 20
#> 2 2 5.33 5.33 16
#> 3 3 5.67 5.67 17
#> 4 4 9 9 18
#> 5 5 7 7 7
Created on 2019-06-20 by the reprex package (v0.3.0)
Data:
df1 <- read.table(text="
r x y
1 1 1
2 3 2
3 2 3
4 3 4
5 2 5
6 4 6
7 5 7
8 2 8
9 1 9
10 1 10
11 3 11
12 4 12", header=T)
df1 <- df1[,c(2,3,1,1)]
library(tidyverse)
df <- tribble(~x, ~y, # making a sample data frame
1, 1,
3, 2,
2, 3,
3, 4,
2, 5,
4, 6,
5, 7,
2, 8,
1, 9,
1, 10,
3, 11,
4, 12)
df <- df %>%
add_column(z = sample(1:nrow(df))) #add another column for the example
df
# If there is only one additional column and you need the first value
df %>%
group_by(x) %>%
summarise(sum_y = sum(y), z_1st = z[1])
# otherwise use summarise_at to address all the other columns
f <- function(x){x[1]} # function to extract the first value
df %>%
group_by(x) %>%
summarise_at(.vars = vars(-c('y')), .funs = f) # exclude column y from the calculations

Resources