Related
The code below should group the data by year and then create two new columns with the first and last value of each year.
library(dplyr)
set.seed(123)
d <- data.frame(
group = rep(1:3, each = 3),
year = rep(seq(2000,2002,1),3),
value = sample(1:9, r = T))
d %>%
group_by(group) %>%
mutate(
first = dplyr::first(value),
last = dplyr::last(value)
)
However, it does not work as it should. The expected result would be
group year value first last
<int> <dbl> <int> <int> <int>
1 1 2000 3 3 4
2 1 2001 8 3 4
3 1 2002 4 3 4
4 2 2000 8 8 1
5 2 2001 9 8 1
6 2 2002 1 8 1
7 3 2000 5 5 5
8 3 2001 9 5 5
9 3 2002 5 5 5
Yet, I get this (it takes the first and the last value over the entire data frame, not just the groups):
group year value first last
<int> <dbl> <int> <int> <int>
1 1 2000 3 3 5
2 1 2001 8 3 5
3 1 2002 4 3 5
4 2 2000 8 3 5
5 2 2001 9 3 5
6 2 2002 1 3 5
7 3 2000 5 3 5
8 3 2001 9 3 5
9 3 2002 5 3 5
dplyr::mutate() did the trick
d %>%
group_by(group) %>%
dplyr::mutate(
first = dplyr::first(value),
last = dplyr::last(value)
)
You can also try by using summarise function within dpylr to get the first and last values of unique groups
d %>%
group_by(group) %>%
summarise(first_value = first(na.omit(values)),
last_value = last(na.omit(values))) %>%
left_join(d, ., by = 'group')
If you are from the future and dplyr has stopped supporting the first and last functions or want a future-proof solution, you can just index the columns like you would a list:
> d %>%
group_by(group) %>%
mutate(
first = value[[1]],
last = value[[length(value)]]
)
# A tibble: 9 × 5
# Groups: group [3]
group year value first last
<int> <dbl> <int> <int> <int>
1 1 2000 3 3 4
2 1 2001 8 3 4
3 1 2002 4 3 4
4 2 2000 8 8 1
5 2 2001 9 8 1
6 2 2002 1 8 1
7 3 2000 5 5 5
8 3 2001 9 5 5
9 3 2002 5 5 5
I have two tables. I would like to update the first table using a second table using multiple conditions. In base R I would use if...else type constructs to do this but would like to know how to achieve this using dplyr.
The table to be updated (have a field added) looks like this:
> Intvs
# A tibble: 12 x 3
Group From To
<chr> <dbl> <dbl>
1 A 0 1
2 A 1 2
3 A 2 3
4 A 3 4
5 A 4 5
6 A 5 6
7 B 0 1
8 B 1 2
9 B 2 3
10 B 3 4
11 B 4 5
12 B 5 6
The tibble that I would like to use to make the update looks like this:
>Zns
# A tibble: 2 x 4
Group From To Zone
<chr> <chr> <dbl> <dbl>
1 A X 1 5
2 B Y 3 4
I would like to update the Intvs tibble with the Zns tibble using the fields == Group, >= From, and <= To to control the update. The expected output should look like this
> Intvs
# A tibble: 12 x 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
What is the most efficient way to do this using dplyr?
The code below should make the dummy tables Intv and Zns
# load packages
require(tidyverse)
# Intervals table
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
names(Zns) <- c("Group", "From", "To", "Zone")
Using non-equi join from data.table
library(data.table)
setDT(Intvs)[Zns, Zone := Zone, on = .(Group, From >= From, To <= To)]
-output
> Intvs
Group From To Zone
<char> <num> <num> <char>
1: A 0 1 <NA>
2: A 1 2 X
3: A 2 3 X
4: A 3 4 X
5: A 4 5 X
6: A 5 6 <NA>
7: B 0 1 <NA>
8: B 1 2 <NA>
9: B 2 3 <NA>
10: B 3 4 Y
11: B 4 5 <NA>
12: B 5 6 <NA>
This is the closest I get. It is not giving the expected output:
library(dplyr)
left_join(Intvs, Zns, by="Group") %>%
group_by(Group) %>%
mutate(Zone1 = case_when(From.x <= Zone & From.x >= To.y ~ From.y)) %>%
select(Group, From=From.x, To=To.x, Zone = Zone1)
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 X
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 Y
12 B 5 6 NA
Not sure why your first row does not give NA, since 0 - 1 is not in the range of 1 - 5.
First left_join the two dataframes using the Group column. Here I assign the suffix "_Zns" to values from the Zns dataframe. Then use a single case_when or (ifelse) statement to assign NA to rows that do not fit the range. Finally, drop the columns that end with Zns.
library(dplyr)
left_join(Intvs, Zns, by = "Group", suffix = c("", "_Zns")) %>%
mutate(Zone = case_when(From >= From_Zns & To <= To_Zns ~ Zone,
TRUE ~ NA_character_)) %>%
select(-ends_with("Zns"))
# A tibble: 12 × 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
Data
Note that I have changed your column name order in the Zns dataframe.
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
colnames(Zns) <- c("Group", "Zone", "From", "To")
I would like to calculate max value from first row to current row
df <- data.frame(id = c(1,1,1,1,2,2,2), value = c(2,5,3,2,4,5,4), result = c(NA,2,5,5,NA,4,5))
I have tried grouping by id with dplyr and using rollmax function from zoo but did not success
1) rollmax is used with a fixed width but here we have a variable width so using rollapplyr, which seems close to the approach of the question, we have:
library(dplyr)
library(zoo)
df %>%
group_by(id) %>%
mutate(out = lag(rollapplyr(value, 1:n(), max))) %>%
ungroup
giving:
# A tibble: 7 x 4
# Groups: id [2]
id value result out
<dbl> <dbl> <dbl> <dbl>
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
2) It is also possible to perform the grouping via the width (second) argument of rollapplyr like this eliminating dplyr. In this case the widths are 1, 2, 3, 4, 1, 2, 3 and Max is like max except it does not use the last element of its argument x. (An alternate expression for the width would be seq_along(id) - match(id, id) + 1).
library(zoo)
Max <- function(x) if (length(x) == 1) NA else max(head(x, -1))
transform(df, out = rollapplyr(value, sequence(rle(id)$lengths), Max))
giving:
id value result out
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
A data.table option using shift + cummax
> setDT(df)[, result2 := shift(cummax(value)), id][]
id value result result2
1: 1 2 NA NA
2: 1 5 2 2
3: 1 3 5 5
4: 1 2 5 5
5: 2 4 NA NA
6: 2 5 4 4
7: 2 4 5 5
library(dplyr)
df |>
group_by(id) |>
mutate(result = lag(cummax(value)))
# # A tibble: 7 x 3
# # Groups: id [2]
# id value result
# <dbl> <dbl> <dbl>
# 1 1 2 NA
# 2 1 5 2
# 3 1 3 5
# 4 1 2 5
# 5 2 4 NA
# 6 2 5 4
# 7 2 4 5
Here is a base R solution. This would just get you the cumulative maximum:
df$result = ave(df$value, df$i, FUN=cummax)
To get the cumulative maximum with the lag you wanted:
df$result = ave(df$value, df$i, FUN=function(x) c(NA,cummax(x[-(length(x))])))
For the following Panel data (Tracking the Value for unit "ID" over "Time" :
ID=c(1,1,1,1,1,2,2,2,2,2)
Time=c(1,2,3,4,5,1,2,3,4,5)
Value=c(1,9,4,8,5,2,5,9,7,6)
I would like to create a vector which is a maximum value for each "ID" over the last two days (assuming that the unit of Time is a day)
Output vector "Max_Value" would be as follows:
Max_Value=c(1,9,9,8,8,2,5,9,9,7)
To clarify, here's how Max_Value is computed for ID "1".
For ID "1", the maximum value by the "Time=1" is 1, which is a maximum of {1}.
Similarly, for ID "1", the maximum value at the "Time 2" is 9, which is a maximum of {1,9}.
Again, for ID "1", the maximum value at the "Time 3" is 9, which is a maximum of {9,4}.
For ID "1", the maximum value at the "Time 4" is 8, which is a maximum of {4,8}.
For ID "1", the maximum value at the "Time 5" is 8, which is a maximum of {8,5}.
If you just have vectors and Time is complete and sorted, slide + ave could work well for you:
ave(Value, ID, FUN = function(x) slider::slide_dbl(x, max, .before=1))
#> [1] 1 9 9 8 8 2 5 9 9 7
Or even a full Base R solution:
Value[ave(Value, ID, FUN = function(x) c(0, -(diff(x)<0))) + seq_along(Value)]
#> [1] 1 9 9 8 8 2 5 9 9 7
Otherwise you can solve it with dplyr + slider:
library(dplyr)
data.frame(ID, Time, Value) %>%
group_by(ID) %>%
mutate(Max_Value = slider::slide_index_dbl(Value, Time, max, .before=1)) %>%
ungroup()
#> # A tibble: 10 x 4
#> ID Time Value Max_Value
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1 1
#> 2 1 2 9 9
#> 3 1 3 4 9
#> 4 1 4 8 8
#> 5 1 5 5 8
#> 6 2 1 2 2
#> 7 2 2 5 5
#> 8 2 3 9 9
#> 9 2 4 7 9
#> 10 2 5 6 7
Try this:
library(data.table)
dt <- data.table(ID=c(1,1,1,1,1,2,2,2,2,2),
Time=c(1,2,3,4,5,1,2,3,4,5),
Value=c(1,9,4,8,5,2,5,9,7,6))
max_v <- function(x) max(dt[ID==x$ID & Time <= x$Time & Time > (x$Time-2) ,Value])
sapply(split(dt,1:nrow(dt)),max_v)
I believe you can use a rollapply() style function from zoo setting a width of 2:
library(dplyr)
library(tidyr)
library(zoo)
#Data
df <- data.frame(ID,Time,Value)
#Code
newdf <- df %>% group_by(ID) %>%
mutate(Max=rollapply(Value,width=2,FUN=function(x) max(x, na.rm=TRUE),
by=1, by.column=TRUE,partial=TRUE,fill=NA, align="right"))
Output:
# A tibble: 10 x 4
# Groups: ID [2]
ID Time Value Max
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 1 2 9 9
3 1 3 4 9
4 1 4 8 8
5 1 5 5 8
6 2 1 2 2
7 2 2 5 5
8 2 3 9 9
9 2 4 7 9
10 2 5 6 7
With data.table you also can try frollapply (fast rolling function). Note that fill is set to first(Value) in initial row of ID group where there is only one element available instead of two.
dt <- data.frame(ID,Time,Value)
setDT(dt)
dt[, ValueMax := frollapply(x = Value,
n = 2,
max,
fill = first(Value),
align = "right",
na.rm = TRUE),
by = ID]
Output
ID Time Value ValueMax
1: 1 1 1 1
2: 1 2 9 9
3: 1 3 4 9
4: 1 4 8 8
5: 1 5 5 8
6: 2 1 2 2
7: 2 2 5 5
8: 2 3 9 9
9: 2 4 7 9
10: 2 5 6 7
This question already has answers here:
Why are my dplyr group_by & summarize not working properly? (name-collision with plyr)
(5 answers)
Closed 2 years ago.
I would like to calculate a rolling sum (or a custom function) of 3 previous values, treating each group separately. I have tried this:
require(dplyr)
# Build dataframe
df <- data.frame(person = c(rep("Peter", 5), rep("James", 5)),
score1 = c(1,3,2,5,4,6,8,4,5,3),
score2 = c(1,1,1,5,1,3,4,8,9,0))
# Attempt rolling sum by group
df %>%
group_by(person) %>%
mutate(s1_rolling = rollsumr(score1, k = 3, fill = NA),
s2_rolling = rollsumr(score2, k = 3, fill = NA))
But the new columns do not treat each group separately, instead continuing down the whole dataset:
person score1 score2 s1_rolling s2_rolling
<chr> <dbl> <dbl> <dbl> <dbl>
1 Peter 1 1 NA NA
2 Peter 3 1 NA NA
3 Peter 2 1 6 3
4 Peter 5 5 10 7
5 Peter 4 1 11 7
6 James 6 3 15 9
7 James 8 4 18 8
8 James 4 8 18 15
9 James 5 9 17 21
10 James 3 0 12 17
I would like row 6 and 7 to show NA in the two new columns, because until row 8 there is insufficient James data to sum 3 rows.
How can I do this?
It could be that plyr was also loaded and the mutate from plyr masked the mutate from dplyr. We could use dplyr::mutate
library(dplyr)
library(zoo)
df %>%
group_by(person) %>%
dplyr::mutate(s1_rolling = rollsumr(score1, k = 3, fill = NA),
s2_rolling = rollsumr(score2, k = 3, fill = NA))
# A tibble: 10 x 5
# Groups: person [2]
# person score1 score2 s1_rolling s2_rolling
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 Peter 1 1 NA NA
# 2 Peter 3 1 NA NA
# 3 Peter 2 1 6 3
# 4 Peter 5 5 10 7
# 5 Peter 4 1 11 7
# 6 James 6 3 NA NA
# 7 James 8 4 NA NA
# 8 James 4 8 18 15
# 9 James 5 9 17 21
#10 James 3 0 12 17
If there are more than one column, we can also use across
df %>%
group_by(person) %>%
dplyr::mutate(across(starts_with('score'),
~ rollsumr(., k = 3, fill = NA), .names = '{col}_rolling'))
For a faster version, use RcppRoll::roll_sumr
df %>%
group_by(person) %>%
dplyr::mutate(across(starts_with('score'),
~ RcppRoll::roll_sumr(., 3, fill = NA), .names = '{col}_rolling'))
The behavior can be reproduced with plyr::mutate
df %>%
group_by(person) %>%
plyr::mutate(s1_rolling = rollsumr(score1, k = 3, fill = NA),
s2_rolling = rollsumr(score2, k = 3, fill = NA))
# A tibble: 10 x 5
# Groups: person [2]
# person score1 score2 s1_rolling s2_rolling
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 Peter 1 1 NA NA
# 2 Peter 3 1 NA NA
# 3 Peter 2 1 6 3
# 4 Peter 5 5 10 7
# 5 Peter 4 1 11 7
# 6 James 6 3 15 9
# 7 James 8 4 18 8
# 8 James 4 8 18 15
# 9 James 5 9 17 21
#10 James 3 0 12 17
I would suggest a slider approach with slide_dbl() function with works similar to zoo and it is compatible with dplyr:
library(slider)
library(dplyr)
#Code
# Build dataframe
df <- data.frame(person = c(rep("Peter", 5), rep("James", 5)),
score1 = c(1,3,2,5,4,6,8,4,5,3),
score2 = c(1,1,1,5,1,3,4,8,9,0))
# Attempt rolling sum by group
df %>%
group_by(person) %>%
mutate(s1_rolling = slide_dbl(score1, sum, .before = 2, .complete = TRUE),
s2_rolling = slide_dbl(score2, sum, .before = 2, .complete = TRUE))
Output:
# A tibble: 10 x 5
# Groups: person [2]
person score1 score2 s1_rolling s2_rolling
<fct> <dbl> <dbl> <dbl> <dbl>
1 Peter 1 1 NA NA
2 Peter 3 1 NA NA
3 Peter 2 1 6 3
4 Peter 5 5 10 7
5 Peter 4 1 11 7
6 James 6 3 NA NA
7 James 8 4 NA NA
8 James 4 8 18 15
9 James 5 9 17 21
10 James 3 0 12 17