mutate or summarise across rows by variable containing string - r

I'd like to create a new data table which is the sum across rows from variables which contain a string. I have been trying to keep this within the tidyverse as a noob using new dplyr across. Help much appreciated.
dat<- data.frame("Image" = c(1,2,3,4),
"A" = c(1,2,3,4),
"A:B"= c(5,6,7,8),
"A:B:C"= c(9,10,11,12))
to obtain the sums across the rows of variables containing "A", "B", or "C".
datsums<- data.frame("Image" = c(1,2,3,4),
"Asum"= c(15,18,21,24),
"Bsum"=c(14,16,18,20),
"Csum"=c(9,10,11,12))
I have been unsuccessful using the newer dplyr verbs:
datsums<- dat %>% summarise(across(str_detect("A")), sum, .names ="Asum",
across(str_detect("B")), sum, .names="Bsum",
across(str_detect("C")), sum, .names"Csum")

use rowwise and c_across:
library(tidyverse)
# Row-wise sums over the columns whose names contain "A", "B" or "C".
# contains() ignores case by default, so contains("A") would also pick up the
# "Image" column; ignore.case = FALSE restricts the match to the capital letter.
dat %>%
  rowwise() %>%
  summarise(
    Asum = sum(c_across(contains("A", ignore.case = FALSE))),
    Bsum = sum(c_across(contains("B", ignore.case = FALSE))),
    Csum = sum(c_across(contains("C", ignore.case = FALSE)))
  )
Returns:
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 4 x 3
Asum Bsum Csum
<dbl> <dbl> <dbl>
1 15 14 9
2 18 16 10
3 21 18 11
4 24 20 12
To add columns to the original data.frame, use mutate instead of summarise:
# Same row-wise sums, appended to the original columns.
# contains() ignores case by default, so contains("A") would also pick up the
# "Image" column; ignore.case = FALSE restricts the match to the capital letter.
dat %>%
  rowwise() %>%
  mutate(
    Asum = sum(c_across(contains("A", ignore.case = FALSE))),
    Bsum = sum(c_across(contains("B", ignore.case = FALSE))),
    Csum = sum(c_across(contains("C", ignore.case = FALSE)))
  )
# A tibble: 4 x 7
# Rowwise:
Image A A.B A.B.C Asum Bsum Csum
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 5 9 15 14 9
2 2 2 6 10 18 16 10
3 3 3 7 11 21 18 11
4 4 4 8 12 24 20 12

Since you want row-wise sum you could use :
library(dplyr)
dat %>%
transmute(Asum = rowSums(select(., contains('A', ignore.case = FALSE))),
Bsum = rowSums(select(., contains('B', ignore.case = FALSE))),
Csum = rowSums(select(., contains('C', ignore.case = FALSE))))
Or for many variables use :
cols <- c('A', 'B', 'C')
purrr::map_dfc(cols, ~dat %>%
transmute(!!paste0(.x, 'sum') :=
rowSums(select(., contains(.x, ignore.case = FALSE)))))
# Asum Bsum Csum
#1 15 14 9
#2 18 16 10
#3 21 18 11
#4 24 20 12

use pivot_longer and pivot_wider
library(tidyverse)
# Reshape to long form, split the dot-separated column names into one row per
# letter, then sum the values per Image and letter while widening again.
dat %>%
  pivot_longer(-Image) %>%
  separate_rows(name, sep = "\\.") %>%
  pivot_wider(
    # Named explicitly: passing id_cols by position warns in tidyr >= 1.3.0.
    id_cols = Image,
    names_from = name,
    values_from = value,
    values_fn = sum,
    names_prefix = "sum"
  )
#> # A tibble: 4 x 4
#> Image sumA sumB sumC
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 15 14 9
#> 2 2 18 16 10
#> 3 3 21 18 11
#> 4 4 24 20 12
Created on 2020-12-07 by the reprex package (v0.3.0)

Related

Concisely assign vector output of a function to multiple variables in dplyr

I am trying to assign the vector output (i.e. greater than length 1) of a function to multiple columns in a single operation (or at least as concisely as possible).
Take the range() function for example which returns as output a numeric vector of length 2 denoting the minimum and maximum, respectively. Let's say I want to compute the range() per group and assign the output to two columns min and max.
My current approach is combining summarize followed by manually adding a key and then re-shaping to wide format:
library(magrittr)
# create data
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
df
#> # A tibble: 9 x 2
#> group x
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 a 8
#> 4 b 9
#> 5 b 14
#> 6 b 9
#> 7 c 11
#> 8 c 6
#> 9 c 12
# summarize gives two lines per group
range_df <- df %>%
dplyr::group_by(group) %>%
dplyr::summarize(range = range(x)) %>%
dplyr::ungroup()
range_df
#> # A tibble: 6 x 2
#> group range
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 b 9
#> 4 b 14
#> 5 c 6
#> 6 c 12
# add key and reshape
range_df %>%
dplyr::mutate(key = rep(c("min", "max"), 3)) %>%
tidyr::pivot_wider(names_from = key, values_from = range)
#> # A tibble: 3 x 3
#> group min max
#> <chr> <int> <int>
#> 1 a 8 12
#> 2 b 9 14
#> 3 c 6 12
Is there a more elegant / concise alternative to this?
Edit:
Ideally the alternative solution could handle an arbitrary number of outputs (e.g. if the function returns an output with length 3 then 3 variables should be created).
# Write a small function that does the job:
library(tidyverse)
# Return the range of `x` as a one-row data frame with columns `min` and `max`.
f <- function(x) {
  rng <- range(x)
  out <- data.frame(t(rng))
  names(out) <- c("min", "max")
  out
}
df %>%
summarise(across(x, f, .unpack = TRUE), .by=group)
#> # A tibble: 3 × 3
#> group x_min x_max
#> <chr> <int> <int>
#> 1 a 10 13
#> 2 b 7 10
#> 3 c 10 12
If you are using an older version of dplyr:
df %>%
group_by(group)%>%
summarise(across(x, f))%>%
unpack(x)
#> # A tibble: 3 × 3
#> group min max
#> <chr> <int> <int>
#> 1 a 6 9
#> 2 b 7 12
#> 3 c 6 10
Based on onyambu's answer, I built a small generic function for this. There will probably be some edge cases where this does not work.
# Apply `fun` to `x` and return its vector result as a one-row data frame, one
# column per output element.
#
# x         Input passed to `fun` as its first argument.
# fun       Function to call (anything do.call() accepts).
# out_names Optional column names for the result; when given, its length must
#           equal the number of elements `fun` returns, otherwise an error is
#           raised. When empty, the names produced by data.frame() are kept.
# add_args  List of further arguments forwarded to `fun`.
out2col <- function(x, fun, out_names = c(), add_args = list()) {
  result <- do.call(fun, c(list(x), add_args))
  wide <- data.frame(t(result))
  if (length(out_names) == 0) {
    return(wide)
  }
  if (length(out_names) != length(result)) {
    stop("provided names did not match the number of outputs")
  }
  names(wide) <- out_names
  wide
}
Examples without any additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = range),
.by=group)
Output:
# A tibble: 3 × 3
group x_X1 x_X2
<chr> <int> <int>
1 a 7 10
2 b 11 14
3 c 9 14
Examples with additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = quantile,
out_names = c("min", "max", "Q25"),
add_args = list(probs = c(0, 1, 0.25))
),
.by=group)
Output:
# A tibble: 3 × 4
group x_min x_max x_Q25
<chr> <dbl> <dbl> <dbl>
1 a 7 10 7.5
2 b 11 14 11.5
3 c 9 14 10
set.seed(1)
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
function
# Return the minimum and maximum of `x` as a one-row data frame with columns
# `min` and `max`.
g <- function(x) {
  lo <- min(x)
  hi <- max(x)
  data.frame(min = lo, max = hi)
}
calling g:
df %>%
group_by(group) %>%
summarise(across(x, g, .unpack = TRUE))

Count unique values by group

DATA = data.frame("TRIMESTER" = c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3),
"STUDENT" = c(1,2,3,4,5,6,7,1,2,3,5,9,10,11,3,7,10,6,12,15,17,16,21))
WANT = data.frame("TRIMESTER" = c(1,2,3),
"NEW_ENROLL" = c(7,3,5),
"TOTAL_ENROLL" = c(7,10,15))
I Have 'DATA' and want to make 'WANT' which has three columns and for every 'TRIMESTER' you count the number of NEW 'STUDENT' and then for 'TOTAL_ENROLL' you just count the total number of unique 'STUDENT' every trimester.
My attempt only counts the number for each TRIMESTER.
library(dplyr)
DATA %>%
group_by(TRIMESTER) %>%
count()
Here is a way.
suppressPackageStartupMessages(library(dplyr))
DATA <- data.frame("TRIMESTER" = c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3),
"STUDENT" = c(1,2,3,4,5,6,7,1,2,3,5,9,10,11,3,7,10,6,12,15,17,16,21))
DATA %>%
mutate(NEW_ENROLL = !duplicated(STUDENT)) %>%
group_by(TRIMESTER) %>%
summarise(NEW_ENROLL = sum(NEW_ENROLL)) %>%
ungroup() %>%
mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
#> # A tibble: 3 × 3
#> TRIMESTER NEW_ENROLL TOTAL_ENROLL
#> <dbl> <int> <int>
#> 1 1 7 7
#> 2 2 3 10
#> 3 3 5 15
Created on 2022-08-14 by the reprex package (v2.0.1)
For variety we can use Base R aggregate with transform
# Keep only each student's first appearance, count the remaining rows per
# TRIMESTER, then add a running total of those counts as TOTAL_ENROLL.
transform(
  aggregate(
    . ~ TRIMESTER,
    data = DATA[!duplicated(DATA$STUDENT), ],
    FUN = length
  ),
  TOTAL_ENROLL = cumsum(STUDENT)
)
Output
TRIMESTER STUDENT TOTAL_ENROLL
1 1 7 7
2 2 3 10
3 3 5 15
We replace the duplicated elements in 'STUDENT' with NA, group by TRIMESTER, count the non-NA elements, and finally take the cumulative sum (cumsum).
library(dplyr)
DATA %>%
mutate(STUDENT = replace(STUDENT, duplicated(STUDENT), NA)) %>%
group_by(TRIMESTER) %>%
summarise(NEW_ENROLL = sum(!is.na(STUDENT)), .groups= 'drop') %>%
mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
-output
# A tibble: 3 × 3
TRIMESTER NEW_ENROLL TOTAL_ENROLL
<dbl> <int> <int>
1 1 7 7
2 2 3 10
3 3 5 15
Or with distinct
distinct(DATA, STUDENT, .keep_all = TRUE) %>%
group_by(TRIMESTER) %>%
summarise(NEW_ENROLL = n(), .groups = 'drop') %>%
mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
# A tibble: 3 × 3
TRIMESTER NEW_ENROLL TOTAL_ENROLL
<dbl> <int> <int>
1 1 7 7
2 2 3 10
3 3 5 15

Divide group sum by total sum

I am using the dplyr package. Let's suppose I have the below table.
Group
count
A
20
A
10
B
30
B
35
C
50
C
60
My goal is to create a summary table that contains the mean per each group, and also, the percentage of the mean of each group compared to the total means added together. So the final table will look like this:
Group
avg
prcnt_of_total
A
15
.14
B
32.5
.31
C
55
.53
For example, 0.14 is the result of the following calculation: 15/(15+32.5+55)
Right now, I was only able to produce the first column code that calculates the mean for each group:
summary_df<- df %>%
group_by(Group)%>%
summarise(avg=mean(count))
I still don't know how to produce the prcnt_of_total column. Any suggestions?
You can use the following code:
df <- read.table(text="Group count
A 20
A 10
B 30
B 35
C 50
C 60", header = TRUE)
library(dplyr)
df %>%
group_by(Group) %>%
summarise(avg = mean(count)) %>%
ungroup() %>%
mutate(prcnt_of_total = prop.table(avg))
#> # A tibble: 3 × 3
#> Group avg prcnt_of_total
#> <chr> <dbl> <dbl>
#> 1 A 15 0.146
#> 2 B 32.5 0.317
#> 3 C 55 0.537
Created on 2022-07-14 by the reprex package (v2.0.1)
We can drop the group in summarise itself.
library(dplyr)
df1 %>%
group_by(Group) %>%
summarise(avg = mean(count), .groups = "drop") %>%
mutate(prcnt_of_total = avg/sum(avg))
#> # A tibble: 3 x 3
#> Group avg prcnt_of_total
#> <chr> <dbl> <dbl>
#> 1 A 15 0.146
#> 2 B 32.5 0.317
#> 3 C 55 0.537
On another note, I am not sure if getting the average divided by the sum of averages is a meaningful metric unless we are sure to have the same number of entries per group. Given that, I suggested another solution as well.
## if you always have the same number of rows between the groups
df1 %>%
group_by(Group) %>%
summarise(avg = mean(count),
prcnt_of_total = sum(count)/sum(.$count))
#> # A tibble: 3 x 3
#> Group avg prcnt_of_total
#> <chr> <dbl> <dbl>
#> 1 A 15 0.146
#> 2 B 32.5 0.317
#> 3 C 55 0.537
Data:
# Example data: columns Group and count, two rows per group.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned.
df1 <- read.table(
  text = "Group count
A 20
A 10
B 30
B 35
C 50
C 60",
  header = TRUE,
  stringsAsFactors = FALSE
)
You can do this:
df %>%
group_by(Group) %>%
summarize(avg = mean(count), prcent_of_total = sum(count)/sum(df$count))
Output:
Group avg prcent_of_total
<chr> <dbl> <dbl>
1 A 15 0.146
2 B 32.5 0.317
3 C 55 0.537
data.table is similar:
library(data.table)
setDT(df)[,.(avg = mean(count), prcent_of_total = sum(count)/sum(df$count)),Group]

R: How to summarize and group by variables as column names

I have a wide dataframe with about 200 columns and want to summarize it over various columns. I cannot figure out the syntax for this; I think it should work with .data$ and .env$, but I don't get it. Here's an example:
> library(dplyr)
> df = data.frame('A'= c('X','X','X','Y','Y'), 'B'= 1:5, 'C' = 6:10)
> df
A B C
1 X 1 6
2 X 2 7
3 X 3 8
4 Y 4 9
5 Y 5 10
> df %>% group_by(A) %>% summarise(sum(B), sum(C))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 2 x 3
A `sum(B)` `sum(C)`
<chr> <int> <int>
1 X 6 21
2 Y 9 19
But I want to be able to do something like this:
columns_to_sum = c('B','C')
columns_to_group = c('A')
df %>% group_by(colums_to_group)%>% summarise(sum(columns_to_sum))
We can use across from the new version of dplyr
library(dplyr)
# Group by the columns named in `columns_to_group` and sum the columns named in
# `columns_to_sum`. all_of() marks both as external character vectors of column
# names; the lambda passes na.rm explicitly instead of through across()'s `...`.
df %>%
  group_by(across(all_of(columns_to_group))) %>%
  summarise(
    across(all_of(columns_to_sum), ~ sum(.x, na.rm = TRUE)),
    .groups = "drop"
  )
# A tibble: 2 x 3
# A B C
# <chr> <int> <int>
#1 X 6 21
#2 Y 9 19
In the previous version, we could use group_by_at along with summarise_at
# Legacy (pre-across) form: the *_at verbs accept character vectors of column
# names directly, so no vars() wrapper is needed.
df %>%
  group_by_at(columns_to_group) %>%
  summarise_at(columns_to_sum, sum, na.rm = TRUE)

dplyr: getting grouped min and max of columns in a for loop [duplicate]

This question already has answers here:
Apply several summary functions (sum, mean, etc.) on several variables by group in one call
(7 answers)
Closed 3 years ago.
I am trying to get the grouped min and max of several columns using a for loop:
My data:
df <- data.frame(a=c(1:5, NA), b=c(6:10, NA), c=c(11:15, NA), group=c(1,1,1,2,2,2))
> df
a b c group
1 1 6 11 1
2 2 7 12 1
3 3 8 13 1
4 4 9 14 2
5 5 10 15 2
6 NA NA NA 2
My attempt:
cols <- df %>% select(a,b) %>% names()
for(i in seq_along(cols)) {
output <- df %>% dplyr::group_by(group) %>%
dplyr::summarise_(min=min(.dots=i, na.rm=T), max=max(.dots=i, na.rm=T))
print(output)
}
Desired output for column a:
group min max
<dbl> <int> <int>
1 1 1 3
2 2 4 5
Using the dplyr and tidyr packages, you can get:
df %>%
na.omit() %>%
pivot_longer(-group) %>%
group_by(group, name) %>%
summarise(min = min(value),
max = max(value)) %>%
arrange(name, group)
# group name min max
# <dbl> <chr> <int> <int>
# 1 1 a 1 3
# 2 2 a 4 5
# 3 1 b 6 8
# 4 2 b 9 10
# 5 1 c 11 13
# 6 2 c 14 15
We can use summarise_all after grouping by 'group' and if it needs to be in a particular order, then use select to select based on the column names
library(dplyr)
library(stringr)
df %>%
group_by(group) %>%
summarise_all(list(min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE))) %>%
select(group, order(str_remove(names(.), "_.*")))
# A tibble: 2 x 7
# group a_min a_max b_min b_max c_min c_max
# <dbl> <int> <int> <int> <int> <int> <int>
#1 1 1 3 6 8 11 13
#2 2 4 5 9 10 14 15
Without using a for loop, but using dplyr and tidyr from the tidyverse, you can get the min and max of each column by 1) pivoting the dataframe into a longer format, 2) getting the min and max value per group, and then 3) pivoting the dataframe wider to get the expected output:
library(tidyverse)
df %>% pivot_longer(., cols = c(a,b,c), names_to = "Names",values_to = "Value") %>%
group_by(group,Names) %>% summarise(Min = min(Value, na.rm =TRUE), Max = max(Value,na.rm = TRUE)) %>%
pivot_wider(., names_from = Names, values_from = c(Min,Max)) %>%
select(group,contains("_a"),contains("_b"),contains("_c"))
# A tibble: 2 x 7
# Groups: group [2]
group Min_a Max_a Min_b Max_b Min_c Max_c
<dbl> <int> <int> <int> <int> <int> <int>
1 1 1 3 6 8 11 13
2 2 4 5 9 10 14 15
Is this what you are looking for?
In base R, we can use aggregate and get min and max for multiple columns by group.
# For every column other than `group`, compute the per-group minimum and
# maximum, ignoring missing values.
aggregate(
  . ~ group,
  data = df,
  FUN = function(x) {
    c(min = min(x, na.rm = TRUE), max = max(x, na.rm = TRUE))
  }
)
# group a.min a.max b.min b.max c.min c.max
#1 1 1 3 6 8 11 13
#2 2 4 5 9 10 14 15

Resources