How to find max values in each group in R using dplyr - r

How could I generate a new summary data.frame with only the largest values of a data.frame
I tried the code below, but the result was not what I expected.
n<-c('A','B','C','A','B','C','A','B','C')
x1<-c(1,2,3,4,5,6,7,8,9)
x2<-c(9,8,7,4,5,6,1,2,3)
df<-data.frame(n, x1, x2)
df%>%
group_by(n)%>%
mutate('x1'=max('x1'), 'x2'=max('x2'))
i wanted this
n X1 X2
A 7 9
B 8 8
C 9 7

Does this work:
library(dplyr)
df %>% group_by(n ) %>% summarise(across(x1:x2, max))
# A tibble: 3 x 3
n x1 x2
<chr> <dbl> <dbl>
1 A 7 9
2 B 8 8
3 C 9 7

We can use across with group_by
library(dplyr)
df %>%
group_by(n) %>%
summarise(across(everything(), max, na.rm = TRUE))
-output
# A tibble: 3 x 3
n x1 x2
<chr> <dbl> <dbl>
1 A 7 9
2 B 8 8
3 C 9 7
Or use aggregate in base R
aggregate(.~ n, df, max)

Just change it to x1 = max(x1) without the quotes and change mutate into summarise.
df%>%
group_by(n)%>%
summarise(x1=max(x1), x2=max(x2))

A data.table option
library(data.table)
setDT(df)[, Map(max, .SD), n]
gives
n x1 x2
1: A 7 9
2: B 8 8
3: C 9 7

Related

How to pass multiple functions with multiple arguments using purr::map?

I'm trying to evaluate multiple functions on my dataset using purr library. I tried spliting the data and then apply map with the functions list but I don't seem to find the right way to introduce my varaibles as arguments. What I am missing?
library(tidyverse)
df <- tibble(f1 = c('a','a','a','b','b','c','c','c','d','d'),
f2 = c('z','x','y','z','x','z','x','y','z','x'),
obs = sample(1:10, 10),
pred = sample(1:10,10))
#RMSE
#------------------------------------------------------------------------------
rmse <- function(obs, model){
n = length(obs)
diff = (model - obs)**2
rmse = sqrt(sum(diff)) / n
return(rmse)
}
#Mean Bias
#------------------------------------------------------------------------------
mean_bias <- function(obs, model){
n =length(obs)
diff=model-obs
mean_bias=sum(diff)/n
return(mean_bias)
}
#Mean error
#------------------------------------------------------------------------------
mean_error <- function(obs, model){
n =length(obs)
abs=abs(model-obs)
mean_error=sum(abs)/n
return(mean_error)
}
mod_eval <- list(rmse, mean_bias,mean_error)
test <- df%>%
split(list(c(.$f1,.$f2)))%>%
invoke_map(.f=mod_eval)##????
I'll expect to get something like this
f1 f2 rmse mean_bias mean_error
1 a x 17 11 5
2 b x 17 11 5
3 c x 17 11 5
4 d x 17 11 5
5 a y 17 11 5
6 b y 17 11 5
You can pass a named list of functions and use map to apply each function to each combination.
library(dplyr)
library(purrr)
mod_eval <- lst(rmse, mean_bias,mean_error)
df %>%
split(.$f1) %>%
map_df(function(x) map_dbl(mod_eval, ~.x(x$obs, x$pred)), .id = 'f1')
# f1 rmse mean_bias mean_error
# <chr> <dbl> <dbl> <dbl>
#1 a 2.33 -1.67 3.67
#2 b 3.91 5.5 5.5
#3 c 2.92 -2.33 4.33
#4 d 2.5 0.5 3.5
We could use tidyverse approaches
library(dplyr)
library(tidyr)
mod_eval <- dplyr::lst(rmse, mean_bias, mean_error)
df %>%
nest_by(f1) %>%
crossing(mod_eval) %>%
mutate(nm1 = names(mod_eval)) %>%
rowwise %>%
transmute(f1, nm1, out = list(mod_eval(data$obs, data$pred))) %>%
unnest(out) %>%
pivot_wider(names_from = nm1, values_from = out)
-output
f1 rmse mean_bias mean_error
<chr> <dbl> <dbl> <dbl>
1 a 1.80 0.333 3
2 b 3.35 -1.5 4.5
3 c 3.09 4 4.67
4 d 4.53 -5 5
if it is grouped by 'f1', 'f2'
df %>%
nest_by(f1, f2) %>%
crossing(mod_eval) %>%
mutate(nm1 = names(mod_eval)) %>%
rowwise %>%
transmute(f1, f2, nm1, out = list(mod_eval(data$obs, data$pred))) %>%
unnest(out) %>%
pivot_wider(names_from = nm1, values_from = out)
-output
# A tibble: 10 x 5
f1 f2 rmse mean_bias mean_error
<chr> <chr> <dbl> <dbl> <dbl>
1 a x 4 -4 4
2 a y 2 2 2
3 a z 3 3 3
4 b x 3 3 3
5 b z 6 -6 6
6 c x 1 -1 1
7 c y 7 7 7
8 c z 6 6 6
9 d x 9 -9 9
10 d z 1 -1 1

dplyr: getting grouped min and max of columns in a for loop [duplicate]

This question already has answers here:
Apply several summary functions (sum, mean, etc.) on several variables by group in one call
(7 answers)
Closed 3 years ago.
I am trying to get the grouped min and max of several columns using a for loop:
My data:
df <- data.frame(a=c(1:5, NA), b=c(6:10, NA), c=c(11:15, NA), group=c(1,1,1,2,2,2))
> df
a b c group
1 1 6 11 1
2 2 7 12 1
3 3 8 13 1
4 4 9 14 2
5 5 10 15 2
6 NA NA NA 2
My attempt:
cols <- df %>% select(a,b) %>% names()
for(i in seq_along(cols)) {
output <- df %>% dplyr::group_by(group) %>%
dplyr::summarise_(min=min(.dots=i, na.rm=T), max=max(.dots=i, na.rm=T))
print(output)
}
Desired output for column a:
group min max
<dbl> <int> <int>
1 1 1 3
2 2 4 5
Using dplyr package, you can get:
df %>%
na.omit() %>%
pivot_longer(-group) %>%
group_by(group, name) %>%
summarise(min = min(value),
max = max(value)) %>%
arrange(name, group)
# group name min max
# <dbl> <chr> <int> <int>
# 1 1 a 1 3
# 2 2 a 4 5
# 3 1 b 6 8
# 4 2 b 9 10
# 5 1 c 11 13
# 6 2 c 14 15
We can use summarise_all after grouping by 'group' and if it needs to be in a particular order, then use select to select based on the column names
library(dplyr)
library(stringr)
df %>%
group_by(group) %>%
summarise_all(list(min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE))) %>%
select(group, order(str_remove(names(.), "_.*")))
# A tibble: 2 x 7
# group a_min a_max b_min b_max c_min c_max
# <dbl> <int> <int> <int> <int> <int> <int>
#1 1 1 3 6 8 11 13
#2 2 4 5 9 10 14 15
Without to use for loop but using dplyr and tidyr from tidyverse, you can get the min and max of each columns by 1) pivoting the dataframe in a longer format, 2) getting the min and max value per group and then 3) pivoting wider the dataframe to get the expected output:
library(tidyverse)
df %>% pivot_longer(., cols = c(a,b,c), names_to = "Names",values_to = "Value") %>%
group_by(group,Names) %>% summarise(Min = min(Value, na.rm =TRUE), Max = max(Value,na.rm = TRUE)) %>%
pivot_wider(., names_from = Names, values_from = c(Min,Max)) %>%
select(group,contains("_a"),contains("_b"),contains("_c"))
# A tibble: 2 x 7
# Groups: group [2]
group Min_a Max_a Min_b Max_b Min_c Max_c
<dbl> <int> <int> <int> <int> <int> <int>
1 1 1 3 6 8 11 13
2 2 4 5 9 10 14 15
Is it what you are looking for ?
In base R, we can use aggregate and get min and max for multiple columns by group.
aggregate(.~group, df, function(x)
c(min = min(x, na.rm = TRUE),max= max(x, na.rm = TRUE)))
# group a.min a.max b.min b.max c.min c.max
#1 1 1 3 6 8 11 13
#2 2 4 5 9 10 14 15

invoke_map has the difficulty on finding arguments

I am exploring the tidyverse package. So I am interested in how to get the following task down in the tidy way. One can easily circumvent the problem using *apply functions.
Consider the following data
tb <-
lapply(matrix(c("a", "b", "c")), function(x)
rep(x, 3)) %>% unlist %>% c(rep(c(1, 2, 3), 6)) %>% matrix(ncol = 3) %>%
as_tibble(.name_repair = ~ c("tag", "x1", "x2")) %>% type.convert()
# A tibble: 9 x 3
tag x1 x2
<fct> <int> <int>
1 a 1 1
2 a 2 2
3 a 3 3
4 b 1 1
5 b 2 2
6 b 3 3
7 c 1 1
8 c 2 2
9 c 3 3
I group them using nest() function and for each group I want to apply a different function from a list of functions f_1, f_2, f_3
f_1 <- function(x)
x[,1] + x[,2]
f_2 <- function(x)
x[,1] - x[,2]
f_3 <- function(x)
x[,1] * x[,2]
tb_func_attached <-
tb %>% group_by(tag) %>% nest() %>% mutate(func = c(f_0, f_1, f_2))
# A tibble: 3 x 3
tag data func
<fct> <list> <list>
1 a <tibble [3 x 2]> <fn>
2 b <tibble [3 x 2]> <fn>
3 c <tibble [3 x 2]> <fn>
I try to use invoke_map to apply the functions
tb_func_attached %>% {invoke_map(.$func, .$data)}
invoke_map(tb_func_attached$func, tb_func_attached$data)
But I get the error Error in (function (x) : unused arguments (x1 = 1:3, x2 = 1:3), while the following code runs
> tb_func_attached$func[[1]](tb_func_attached$data[[1]])
x1
1 2
2 4
3 6
> tb_func_attached$func[[2]](tb_func_attached$data[[2]])
x1
1 0
2 0
3 0
> tb_func_attached$func[[3]](tb_func_attached$data[[3]])
x1
1 1
2 4
3 9
But invoke_map still does not work.
So the question is, given a nested data tb_func_attached, how to apply the functions tb_func_attached$func 'rowwisely' to tb_func_attached$data?
And a side question, what is the reason for the retirement of invoke_map? It fits quitely well in the concept of vetorisation, IMHO.
Update:
The previous version dealt with single column data (tb has only tag and x1 columns) and #A. Suliman's comment provides a solution.
However when the data column in the nested tibble has a matrix structure, the code stops running again.
Use map2 to iterate over the list of functions first, and over the data column second. Like this:
tb_func_attached %>%
mutate(output = map2(func, data, ~ .x(.y))) %>%
unnest(data, output)
The output looks this way:
# A tibble: 9 x 4
tag x1 x2 x11
<fct> <int> <int> <int>
1 a 1 1 2
2 a 2 2 4
3 a 3 3 6
4 b 1 1 0
5 b 2 2 0
6 b 3 3 0
7 c 1 1 1
8 c 2 2 4
9 c 3 3 9

R: dplyr and row_number() does not enumerate as expected

I want to enumerate each record of a dataframe/tibble resulted from a grouping. The index is according a defined order. If I use row_number() it does enumerate but within group. But I want that it enumerates without considering the former grouping.
Here is an example. To make it simple I used the most minimal dataframe:
library(dplyr)
df0 <- data.frame( x1 = rep(LETTERS[1:2],each=2)
, x2 = rep(letters[1:2], 2)
, y = floor(abs(rnorm(4)*10))
)
df0
# x1 x2 y
# 1 A a 12
# 2 A b 24
# 3 B a 0
# 4 B b 12
Now, I group this table:
df1 <- df0 %>% group_by(x1,x2) %>% summarize(y=sum(y))
This gives me a object of class tibble:
# A tibble: 4 x 3
# Groups: x1 [?]
# x1 x2 y
# <fct> <fct> <dbl>
# 1 A a 12
# 2 A b 24
# 3 B a 0
# 4 B b 12
I want to add a row number to this table using row_numer():
df2 <- df1 %>% arrange(desc(y)) %>% mutate(index = row_number())
df2
# A tibble: 4 x 4
# Groups: x1 [2]
# x1 x2 y index
# <fct> <fct> <dbl> <int>
# 1 A b 24 1
# 2 A a 12 2
# 3 B b 12 1
# 4 B a 0 2
row_number() does enumerate within the former grouping. This was not my intention. This can be avoid converting tibble to a dataframe first:
df2 <- df2 %>% as.data.frame() %>% arrange(desc(y)) %>% mutate(index = row_number())
df2
# x1 x2 y index
# 1 A b 24 1
# 2 A a 12 2
# 3 B b 12 3
# 4 B a 0 4
My question is: is this behaviour intended?
If yes: is it not very dangerous to incorporate former data processing into tibble? Which type of processing is incorporated?
At the moment I will convert tibble into dataframe to avoid this kind of unexpected results.
To elaborate on my comment: yes, retaining grouping is intended, and in many cases useful. It's only dangerous if you don't understand how group_by works—and that's true of any function. To undo group_by, you call ungroup.
Take a look at the group_by docs, as they're very thorough and explain how this function interacts with others, how grouping is layered, etc. The docs also explain how each call to summarise removes a layer of grouping—it might be there that you got confused about what's going on.
For example, you can group by x1 and x2, summarize y, and create a row number, which will give you the rows according to x1 (summarise removed a layer of grouping, i.e. drops the x2 grouping). Then ungrouping allows you to get row numbers based on the entire data frame.
library(dplyr)
df0 %>%
group_by(x1, x2) %>%
summarise(y = sum(y)) %>%
mutate(group_row = row_number()) %>%
ungroup() %>%
mutate(all_df_row = row_number())
#> # A tibble: 4 x 5
#> x1 x2 y group_row all_df_row
#> <fct> <fct> <dbl> <int> <int>
#> 1 A a 12 1 1
#> 2 A b 2 2 2
#> 3 B a 10 1 3
#> 4 B b 23 2 4
A use case—I do this for work probably every day—is to get sums within multiple groups (again, x1 and x2), then to find the shares of those values within their larger group (after peeling away a layer of grouping, this is x1) with mutate. Again, here I ungroup to show the shares instead of the entire data frame.
df0 %>%
group_by(x1, x2) %>%
summarise(y = sum(y)) %>%
mutate(share_in_group = y / sum(y)) %>%
ungroup() %>%
mutate(share_all_df = y / sum(y))
#> # A tibble: 4 x 5
#> x1 x2 y share_in_group share_all_df
#> <fct> <fct> <dbl> <dbl> <dbl>
#> 1 A a 12 0.857 0.255
#> 2 A b 2 0.143 0.0426
#> 3 B a 10 0.303 0.213
#> 4 B b 23 0.697 0.489
Created on 2018-10-11 by the reprex package (v0.2.1)
As camille nicely showed, there are good reasons for wanting to have the result of summarize() retain additional layers of grouping and it's a documented behaviour so not really dangerous or unexpected per se.
However one additional tip is that if you are just going to call ungroup() after summarize() you might as well use summarize(.groups = "drop") which will return an ungrouped tibble and save you a line of code.
library(tidyverse)
df0 <- data.frame(
x1 = rep(LETTERS[1:2], each = 2),
x2 = rep(letters[1:2], 2),
y = floor(abs(rnorm(4) * 10))
)
df0 %>%
group_by(x1,x2) %>%
summarize(y=sum(y), .groups = "drop") %>%
arrange(desc(y)) %>%
mutate(index = row_number())
#> # A tibble: 4 x 4
#> x1 x2 y index
#> <chr> <chr> <dbl> <int>
#> 1 A b 8 1
#> 2 A a 2 2
#> 3 B a 2 3
#> 4 B b 1 4
Created on 2022-02-06 by the reprex package (v2.0.1)

Short code to extract fitted value from tbl_df objects

I have a data set containing groups of data and I performed regression on on each group of data. I used dplyr to do the regression and get a tbl_df object with all results. Then I want to extract fitted value vector for each group of regression and put them in a data frame. I used to use summarise() to extract relevant information conveniently. But it only works for scalars. Here is some sample code with lapply I used to extract the information and I feel it kind of cumbersome:
library(dplyr)
library(reshape2)
df1 = data.frame(type1 = c(rep('a',5),rep('b',5)),
x = 1:10,
y = 11:20)
df1 %>%
group_by(type1) %>%
do(model = lm(y~x,.)) -> model1
names(model1$model) = model1$type1
lapply(model1$model,function(mod) mod$fit) %>%
melt
library(broom)
model1 %>% augment(model)
# A tibble: 10 x 10
# Groups: type1 [2]
type1 y x .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
<fctr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 11 1 11 2.482534e-16 3.567051e-19 0.6 3.925229e-16 2.322633e-06 0.001759785
2 a 12 2 12 1.755417e-16 3.026750e-16 0.3 2.977199e-16 2.730293e-01 1.128776594
3 a 13 3 13 1.433292e-16 -3.857170e-16 0.2 2.471607e-16 2.263176e-01 -1.345563357
4 a 14 4 14 1.755417e-16 -1.380180e-16 0.3 3.747906e-16 5.677113e-02 -0.514715401
5 a 15 5 15 2.482534e-16 2.207032e-16 0.6 3.052655e-16 8.891591e-01 1.088827560
6 b 16 6 16 1.709167e-15 -2.416065e-15 0.6 8.008132e-17 2.248024e+00 -1.731290167
7 b 17 7 17 1.208563e-15 2.359219e-15 0.3 1.824137e-15 3.499565e-01 1.277939838
8 b 18 8 18 9.867878e-16 1.265324e-15 0.2 2.510473e-15 5.138141e-02 0.641132787
9 b 19 9 19 1.208563e-15 5.595623e-17 0.3 2.702016e-15 1.968677e-04 0.030310330
10 b 20 10 20 1.709167e-15 -1.264434e-15 0.6 2.303179e-15 6.157097e-01 -0.906060815
Use the tidyverse package, which contains dplyr, purrr, tidyr
library(tidyverse)
Use nest and map
df1 %>%
group_by(type1) %>%
nest() %>%
mutate(data = map(data, ~lm(y~x,.x)$fit)) %>% # combined lm with $fit
unnest()
Output
type1 data
1 a 11
2 a 12
3 a 13
4 a 14
5 a 15
6 b 16
# etc
We can use modelr together with tidyverse. The add_predictions function is handy. Here is an example.
# Load package
library(tidyverse)
library(modelr)
# Create example data frame
df1 = data.frame(type1 = c(rep('a',5),rep('b',5)),
x = 1:10,
y = 11:20)
# Created nested data frame
df2 <- df1 %>%
group_by(type1) %>%
nest()
# A function to fit lm model to y ~ x
lm_model <- function(df) {
lm(y ~ x, data = df)
}
# Fit model
df3 <- df2 %>%
mutate(model = map(data, lm_model))
# Add prediction
df4 <- df3 %>%
mutate(Pred = map2(data, model, add_predictions))
# Unnest the data frame
df5 <- df4 %>% unnest(Pred)
df5
# A tibble: 10 x 4
type1 x y pred
<fctr> <int> <int> <dbl>
1 a 1 11 11
2 a 2 12 12
3 a 3 13 13
4 a 4 14 14
5 a 5 15 15
6 b 6 16 16
7 b 7 17 17
8 b 8 18 18
9 b 9 19 19
10 b 10 20 20
I'd like provide a slick answer using data.table package:
library(data.table)
df1 %>%
data.table %>%
.[,
.(x,
y,
fit = lm(y~x)$fit),
by = type1]

Resources