conditionally summarize several variables by group - r

I want to conditionally summarize several variables by group. The following code does that, but I'm not sure how to do this without specifying each variable and the conditions in the summarize step.
library(tidyverse)
dat <- data.frame(group = c("A", "A", "A", "B", "B", "B"),
indicator = c(1, 2, 3, 1, 2, 3),
var1 = c(1, 0, 1, 2, 1, 2),
var2 = c(1, 0, 1, 1, 2, 1))
# dat
# group indicator var1 var2
#1 A 1 1 1
#2 A 2 0 0
#3 A 3 1 1
#4 B 1 2 1
#5 B 2 1 2
#6 B 3 2 1
dat %>%
group_by(group) %>%
summarise(var1 = sum(var1[indicator==1 | indicator==2]),
var2 = sum(var2[indicator==1 | indicator==2]))
# A tibble: 2 x 3
# group var1 var2
#* <chr> <dbl> <dbl>
#1 A 1 1
#2 B 3 3

Use across :
library(dplyr)
dat %>%
group_by(group) %>%
summarise(across(starts_with('var'), ~sum(.[indicator %in% 1:2])))
# group var1 var2
#* <chr> <dbl> <dbl>
#1 A 1 1
#2 B 3 3

Related

how to recode dummy column with the column name?

I have the input dataset, and I'm looking for generating the output dataset by recoding 1 as the name of the columns and 0 as NA. I managed to do it manually see Not optional solution below. But I have a dataset with hundreds of columns, so I'm looking for a way to automatize this process.
Packages
library(tibble)
library(dplyr)
Input
input <- tibble( a = c(1, 0, 0, 1, 0),
b = c(0, 0, 0, 1, 1),
c = c(1, 1, 1, 1, 1),
d = c(0, 0, 0, 0, 0))
# # A tibble: 5 × 4
# a b c d
# <dbl> <dbl> <dbl> <dbl>
# 1 1 0 1 0
# 2 0 0 1 0
# 3 0 0 1 0
# 4 1 1 1 0
# 5 0 1 1 0
Output
output <- tibble( a = c("a", NA, NA, "a", NA),
b = c(NA, NA, NA, "b", NA),
c = c("c", "c", "c", "c", "c"),
d = c(NA, NA, NA, NA, NA))
# # A tibble: 5 × 4
# a b c d
# <chr> <chr> <chr> <lgl>
# 1 a NA c NA
# 2 NA NA c NA
# 3 NA NA c NA
# 4 a b c NA
# 5 NA NA c NA
Not optional solution
input %>%
mutate(a = case_when(a == 1 ~ "a",
T ~ NA_character_),
b = case_when(b == 1 ~ "b",
T ~ NA_character_),
c = case_when(c == 1 ~ "c",
T ~ NA_character_),
d = case_when(d == 1 ~ "d",
T ~ NA_character_))
We could use across with an ifelse statement:
library(dplyr)
input %>%
mutate(across(everything(), ~ifelse(. == 1, cur_column(), NA)))
a b c d
<chr> <chr> <chr> <lgl>
1 a NA c NA
2 NA NA c NA
3 NA NA c NA
4 a b c NA
5 NA b c NA

Group when values in two columns are identical and calculate the mean

I got a dataset like this:
df1 <- data.frame(
var1 = c(1, 1, 1, 2),
var2 = c(1, 2, 2, 1),
value = c(1, 2, 3, 4))
I want to group rows in var1 and var2 and calculate the mean of value, and the condition is that when rows with the same var1 and var2 values will be grouped together (so it is not simply grouped by unique values in var1 and var2).
The output dataset will be this:
df2 <- data.frame(
var1 = c(1, 1, 2),
var2 = c(1, 2, 1),
value = c(1, 2.5, 4))
How can I do this?
Using aggregate().
aggregate(value ~ var1 + var2, df1, mean)
# var1 var2 value
# 1 1 1 1.0
# 2 2 1 4.0
# 3 1 2 2.5
You may try
library(dplyr)
df1 %>%
group_by(var1, var2) %>%
summarise(value = mean(value))
var1 var2 value
<dbl> <dbl> <dbl>
1 1 1 1
2 1 2 2.5
3 2 1 4
group_by(var1, var2) will group both variable together.

filter infinite values and NAs in same call using dplyr::c_across and filter_if

I'm looking to filter dataframe rows with Inf and NA in the same call using filter with c_across and deprecated filter_if:
library(dplyr)
df <- tibble(a = c(1, 2, 3, NA, 1), b = c(5, Inf, 8, 8, 3), c = c(9, 10, Inf, 11, 12), d = c('a', 'b', 'c', 'd', 'e'), e = c(1, 2, 3, 4, -Inf))
# # A tibble: 5 x 5
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
# 2 2 Inf 10 b 2
# 3 3 8 Inf c 3
# 4 NA 8 11 d 4
# 5 1 3 12 e -Inf
I could do this in two calls using either c_across or filter_if:
df %>%
rowwise %>%
filter(!any(is.infinite(c_across(where(is.numeric))))) %>%
filter(!any(is.na(c_across(where(is.numeric)))))
# # A tibble: 1 x 5
# # Rowwise:
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
#OR filter_if:
df %>%
filter_if(~is.numeric(.), all_vars(!is.infinite(.))) %>%
filter_if(~is.numeric(.), all_vars(!is.na(.)))
# # A tibble: 1 x 5
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
How would I do both approaches in one call to filter (and filter_if)? There may be an across approach too?
thanks
Try this. Use the where to identify your numeric columns.
df %>%
filter(across(.cols = where(is.numeric),
.fns = ~!is.infinite(.x) & !is.na(.x)))
I would suggest an approach with across() from dplyr:
library(dplyr)
#Data
df <- tibble(a = c(1, 2, 3, NA, 1),
b = c(5, Inf, 8, 8, 3),
c = c(9, 10, Inf, 11, 12),
d = c('a', 'b', 'c', 'd', 'e'),
e = c(1, 2, 3, 4, -Inf))
#Mutate
df %>% filter(across(c(a:e), ~ !is.na(.) & !is.infinite(.)))
Output:
# A tibble: 1 x 5
a b c d e
<dbl> <dbl> <dbl> <chr> <dbl>
1 1 5 9 a 1

How do you use dot referencing (for the data frame) and groups in dplyr?

Consider the following example data:
tmp_df_dplyr <- data.frame(groups = rep(c("C", "B", "A"), each = 3),
a = c(-2, 0, -1, -1, 0, 1, 0, 1, 2),
b = rep(c(-1, 0, 1), each = 3))
I wish to do the following, except using colSums:
tmp_df_dplyr %>%
group_by(groups) %>%
summarise(min_group = min(c(sum(a), sum(b))))
# produces:
# A tibble: 3 × 2
groups min_group
<fctr> <dbl>
1 A 3
2 B 0
3 C -3
Using dot referencing, I get an unexpected result:
tmp_df_dplyr %>%
group_by(groups) %>%
summarise(min_group = min(colSums(.[, c('a', 'b')])))
# produces
# A tibble: 3 × 2
groups min_group
<fctr> <dbl>
1 A 0
2 B 0
3 C 0
that is, it looks like the groups are not being applied.

R: how to use table to tabulate m x n data

mydat = data.frame(Q1 = c(0, 1, 0, 1), Q2 = c(0, 1, 1, 1),
Q3 = c(1, 1, 1, 1), Gender = c("M", "M", "F", "F"))
> mydat
Q1 Q2 Q3 Gender
1 0 0 1 M
2 1 1 1 M
3 0 1 1 F
4 1 1 1 F
> table(mydat[,1:3], mydat$Gender)
Error in sort.list(y) : 'x' must be atomic for 'sort.list'
Have you called 'sort' on a list?
I have a very simple data set with 3 binary questions and a gender variable. I'm interested to see if there is any association between the 3 questions and gender, so I would like to tabulate my data into a 3 (questions) x 2 (gender) count table. I want my table to look something like this
Q1 Q2 Q3
M 1 1 2
F 1 2 2
Edit:
mydat = data.frame(Q1 = c(0, 1, NA, 1), Q2 = c(0, 1, 1, 1),
Q3 = c(1, NA, 1, 1), Gender = c("M", "M", "F", "F"))
> rowsum(mydat[1:3], mydat$Gender)
Q1 Q2 Q3
F NA 2 2
M 1 1 NA
We can do a group by operation and sum the elements of other columns
library(dplyr)
mydat %>%
group_by(Gender) %>%
summarise_all(funs(sum(., na.rm = TRUE)))
# A tibble: 2 x 4
# Gender Q1 Q2 Q3
# <fctr> <int> <int> <int>
#1 F 1 2 2
#2 M 1 1 2
Or using base R
rowsum(mydat[-4], mydat$Gender, na.rm = TRUE)
# Q1 Q2 Q3
#F 1 2 2
#M 1 1 2

Resources