group by and conditional summarize in R

group by and conditional summarize in R - r

My code is dirty.
if condition smaller than two, names = unpopular.
df <- data.frame(vote=c("A","A","A","B","B","B","B","B","B","C","D"),
val=c(rep(1,11))
)
df %>% group_by(vote) %>% summarise(val=sum(val))
out
vote val
<fct> <dbl>
1 A 3
2 B 6
3 C 1
4 D 1
but I need
vote val
<fct> <dbl>
1 A 3
2 B 6
3 unpopular 2
my idea is
df2 <- df %>% group_by(vote) %>% summarise(val=sum(val))
df2$vote[df2$val < 2] <- "unpop"
df2 %>% group_by....
it's not cool.
do you know any cool & helpful function ?

We can do a double grouping
library(dplyr)
df %>%
group_by(vote) %>%
summarise(val=sum(val)) %>%
group_by(vote = replace(vote, val <2, 'unpop')) %>%
summarise(val = sum(val))
-output
# A tibble: 3 x 2
# vote val
# <chr> <dbl>
#1 A 3
#2 B 6
#3 unpop 2
Or another option with rowsum
df %>%
group_by(vote = replace(vote, vote %in%
names(which((rowsum(val, vote) < 2)[,1])), 'unpopular')) %>%
summarise(val = sum(val))
Or using fct_lump_n from forcats
library(forcats)
df %>%
group_by(vote = fct_lump_n(vote, 2, other_level = "unpop")) %>%
summarise(val = sum(val))
# A tibble: 3 x 2
# vote val
# <fct> <dbl>
#1 A 3
#2 B 6
#3 unpop 2
Or using table
df %>%
group_by(vote = replace(vote,
vote %in% names(which(table(vote) < 2)), 'unpop')) %>%
summarise(val = sum(val))

If you want to vote based on sum of val in base R you can do this as :
aggregate(val~vote, transform(aggregate(val~vote, df, sum),
vote = replace(vote, val < 2, 'unpop')), sum)
# vote val
#1 A 3
#2 B 6
#3 unpop 2

Related

How to filter out groups empty for 1 column in Tidyverse

tibble(
A = c("A","A","B","B"),
x = c(NA,NA,NA,1),
y = c(1,2,3,4),
) %>% group_by(A) -> df
desired output:
tibble(
A = c("B","B"),
x = c(NA,1)
y = c(3,4),
)
I want to find all groups for which all elements of x and x only are all NA, then remove those groups. "B" is filtered in because it has at least 1 non NA element.
I tried:
df %>%
filter(all(!is.na(x)))
but it seems that filters out if it finds at least 1 NA; I need the correct word, which is not all.

This will remove groups of column A if all elements of x are NA:
library(dplyr)
df %>%
group_by(A) %>%
filter(! all(is.na(x)))
# A tibble: 2 × 3
# Groups: A [1]
# A x y
# <chr> <dbl> <dbl>
#1 B NA 3
#2 B 1 4
Note that group "A" was removed because both cells in the column x are not defined.

We can use any with complete.cases
library(dplyr)
df %>%
group_by(A) %>%
filter(any(complete.cases(x))) %>%
ungroup
-output
# A tibble: 2 × 3
A x y
<chr> <dbl> <dbl>
1 B NA 3
2 B 1 4
In the devel version of dplyr, we could use .by in filter thus we don't need to group_by/ungroup
df %>%
filter(any(complete.cases(x)), .by = 'A')
# A tibble: 2 × 3
A x y
<chr> <dbl> <dbl>
1 B NA 3
2 B 1 4

Apply the same function with multiple columns as inputs to multiple columns in R with tidyverse

As an example, I have the following data frame:
df <- data.frame(a1=1,a2=2,a3=3,b1=1,b2=2,b3=3)
I have a function:
fn <- function(x,y,z) x^y+(z-x)^(y-x)
I want the following:
df <- df %>% mutate(a=fn(a1,a2,a3),b=fn(b1,b2,b3))
The problem is, I have tons of triplets in my dataset, so it is not ideal to write them out one by one.

Here are base R options using:
split.default + lapply + do.call
cbind(
df,
lapply(
split.default(df, gsub("\\d+", "", names(df))),
function(x) do.call(fn, unname(x))
)
)
reshape + lapply + do.call
cbind(
df,
lapply(
subset(
reshape(
setNames(df, gsub("(\\d+)$", "\\.\\1", names(df))),
direction = "long",
varying = 1:length(df)
),
select = -c(time, id)
),
function(x) do.call(fn, as.list(x))
)
)
Output
a1 a2 a3 b1 b2 b3 a b
1 1 2 3 1 2 3 3 3

I would convert df to long format then use lag to create 3 columns then apply fn() on them
library(tidyverse)
df_long <- df %>%
pivot_longer(everything(),
names_to = c(".value", "set"),
names_pattern = "(.)(.)")
df_longer <- df_long %>%
pivot_longer(-c(set),
names_to = "key",
values_to = "val") %>%
arrange(key)
df_longer
#> # A tibble: 6 x 3
#> set key val
#> <chr> <chr> <dbl>
#> 1 1 a 1
#> 2 2 a 2
#> 3 3 a 3
#> 4 1 b 1
#> 5 2 b 2
#> 6 3 b 3
lag then apply fn(), keep only non-NA val_fn
df_longer <- df_longer %>%
group_by(key) %>%
mutate(val_lag1 = lag(val, n = 1),
val_lag2 = lag(val, n = 2)) %>%
mutate(val_fn = fn(val_lag2, val_lag1, val)) %>%
filter(!is.na(val_fn))
df_longer
#> # A tibble: 2 x 6
#> # Groups: key [2]
#> set key val val_lag1 val_lag2 val_fn
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 3 a 3 2 1 3
#> 2 3 b 3 2 1 3
Created on 2020-12-03 by the reprex package (v0.3.0)

I think it would be easier/shorter to combine columns into their separate group and apply the function to each column.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything(),
names_to = '.value',
names_pattern = '([a-z]+)') %>%
summarise(across(.fns = ~do.call(fn, as.list(.)))) -> result
result
# a b
# <dbl> <dbl>
#1 3 3
You can bind the result to your original dataset if needed.
bind_cols(df, result)
# a1 a2 a3 b1 b2 b3 a b
#1 1 2 3 1 2 3 3 3

turn pivot_wider() into spread()

I love the new tidyr pivot_wider function but since it hasn't been officially added to the CRAN package I was wondering how to convert the following code into the older spread() function (I do not have access to the server to DL tidyr from github)
test <- data.frame(x = c(1,1,2,2,2,2,3,3,3,4),
y = c(rep("a", 5), rep("b", 5)))
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
pivot_wider(id_cols = x, names_from = y, values_from = v1)
Desired Output:
# A tibble: 4 x 3
# Groups: x [4]
x a b
<dbl> <chr> <chr>
1 1 2 (1) NA
2 2 3 (0.75) 1 (0.25)
3 3 NA 3 (1)
4 4 NA 1 (1)
I tried (but is not quite right):
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
spread(y, v1) %>%
select(-n, -prop)
Any help appreciated!

One option is to remove the columns 'n', 'prop' before the spread statement as including them would create unique rows with that column values as well
library(dplyr)
library(tidyr)
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
select(-n, -prop) %>%
spread(y, v1)
# A tibble: 4 x 3
# Groups: x [4]
# x a b
# <dbl> <chr> <chr>
#1 1 2 (1) <NA>
#2 2 3 (0.75) 1 (0.25)
#3 3 <NA> 3 (1)
#4 4 <NA> 1 (1)
Or using base R
tbl <- table(test)
tbl[] <- paste0(tbl, "(", prop.table(tbl, 1), ")")

You can use data.table package:
> library(data.table)
> setDT(test)[,.(n=.N),by=.(x,y)][,.(y=y,n=n,final=gsub('\\(1\\)','',paste0(n,'(',round(prop.table(n),2), ')'))),by=x]
x y n final
1: 1 a 2 2
2: 2 a 3 3(0.75)
3: 2 b 1 1(0.25)
4: 3 b 3 3
5: 4 b 1 1

How to keep real values of grouped variable within dplyr package in R

My data is something like this:
group <- c(21, 21, 21, 9, 9, 9, 25, 25, 25)
a <- c(8,3,5,6,8,3,3,9,3)
b <- c(4,9,0,1,3,5,6,1,1)
c <- c(1,7,2,5,6,8,4,8,6)
value <- c(23,34,43,52,65,21,12,89,76)
df <- data.frame(group,a,b,c,value)
I applied following function to it.
out <- df %>%
select(group, a, b, value) %>%
group_by(group = gl(n()/3, 3)) %>%
summarise(res = mean(value), a=a[1], b=b[1])
print(out)
Then I am getting following result.
group res a b
<fct> <dbl> <dbl> <dbl>
1 1 33.3 8 4
2 2 46 6 1
3 3 59 3 6
>
My question is how to keep the orgiignal values of ID as they were in the output df like this
group res a b
<fct> <dbl> <dbl> <dbl>
1 21 33.3 8 4
2 9 46 6 1
3 25 59 3 6
>
Thanks in advance!

The issue is you are overwriting your group variable in group_by call hence you are not getting the original variable. You need to use some other name in group_by and then do the calculations.
We can use two options -
1) With summarise
library(dplyr)
df %>%
group_by(group1 = gl(n()/3, 3)) %>%
summarise(res = mean(value), a=a[1], b=b[1], group = group[1])
# group1 res a b group
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 1 33.3 8 4 21
#2 2 46 6 1 9
#3 3 59 3 6 25
2) With mutate
df %>%
select(group, a, b, value) %>%
group_by(group1 = gl(n()/3, 3)) %>%
mutate(res = mean(value), a=a[1], b=b[1]) %>%
slice(1)
In both the case, if you are no longer interested in keeping the grouping variable do ungroup() %>% select(-group1) to remove it.

R dplyr: summarise complete cases by group for all variables

I want to summarise variables by group for every variable in a dataset using dplyr. The summarised variables should be stored under a new name.
An example:
df <- data.frame(
group = c("A", "B", "A", "B"),
a = c(1,1,NA,2),
b = c(1,NA,1,1),
c = c(1,1,2,NA),
d = c(1,2,1,1)
)
df %>% group_by(group) %>%
mutate(complete_a = sum(complete.cases(a))) %>%
mutate(complete_b = sum(complete.cases(b))) %>%
mutate(complete_c = sum(complete.cases(c))) %>%
mutate(complete_d = sum(complete.cases(d))) %>%
group_by(group, complete_a, complete_b, complete_c, complete_d) %>% summarise()
results in my expected output:
# # A tibble: 2 x 5
# # Groups: group, complete_a, complete_b, complete_c [?]
# group complete_a complete_b complete_c complete_d
# <fct> <int> <int> <int> <int>
# A 1 2 2 2
# B 2 1 1 2
How can I generate the same output without duplicating the mutate statements per variable?
I tried:
df %>% group_by(group) %>% summarise_all(funs(sum(complete.cases(.))))
which works but does not rename the variables.

You are almost there. You have to use rename_all
library(dplyr)
df %>%
group_by(group) %>%
summarise_all(funs(sum(complete.cases(.)))) %>%
rename_all(~paste0("complete_", colnames(df)))
# A tibble: 2 x 5
# complete_group complete_a complete_b complete_c complete_d
# <fct> <int> <int> <int> <int>
#1 A 1 2 2 2
#2 B 2 1 1 2
Edit
Or as pointed all by #symbolrush, more directly without colnames:
df %>%
group_by(group) %>%
summarise_all(funs(sum(complete.cases(.)))) %>%
rename_all(~paste0("complete_", .))
## A tibble: 2 x 5
# complete_group complete_a complete_b complete_c complete_d
# <fct> <int> <int> <int> <int>
#1 A 1 2 2 2
#2 B 2 1 1 2

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

group by and conditional summarize in R - r

If you want to vote based on sum of val in base R you can do this as : aggregate(val~vote, transform(aggregate(val~vote, df, sum), vote = replace(vote, val < 2, 'unpop')), sum) # vote val #1 A 3 #2 B 6 #3 unpop 2

Related

How to filter out groups empty for 1 column in Tidyverse

Apply the same function with multiple columns as inputs to multiple columns in R with tidyverse

turn pivot_wider() into spread()

How to keep real values of grouped variable within dplyr package in R

R dplyr: summarise complete cases by group for all variables

Categories

Resources