r calculate grouped counts for multiple factor variables - r

For demonstration purpose, I have dataset as following:
# Toy data: A, B and C are factors whose levels are whole numbers from 1 to 5
# (floor of a uniform draw on [1, 6)); D is a grouping column holding
# 3 "X", 7 "Y" and 10 "Z" values.
df <- data.frame(
  A = factor(floor(runif(20, min = 1, max = 6))),
  B = factor(floor(runif(20, min = 1, max = 6))),
  C = factor(floor(runif(20, min = 1, max = 6))),
  D = rep(c("X", "Y", "Z"), times = c(3, 7, 10))
)
How can I iterate through column A, B and C to get counts for
count(df, D, A), count(df, D, B) and count(df, D, C)
This is a simplified version, if I need to do this for 20 or more variables, how can I automate the process?
I have tried:
# Attempted helper: count() looks for a column literally named `x` instead of
# using the value passed in, which is what produces the error quoted below.
f <- function(x) count(df, D, x)
# lapply() over a data frame hands each column's values (not its name) to f.
result <- bind_rows(lapply(df[ , c('A','B','C')], f))
and I got the following error:
Error in grouped_df_impl(data, unname(vars), drop) :
Column `x` is unknown

Would using tidyr::gather first work for you, so that you can do the count all at once for the different variables? As @alistaire noted in the comments, this can be done using
df %>% gather(key, value, -D) %>% count(D, key, value)
which results in the same output as my unnecessary extra use of group_by
df %>% gather(key, value, -D) %>% group_by(D, key) %>% count(value)
Worked Solution
library(tidyverse)
# Stack every column except D into key (column name) / value (level) pairs,
# then count each value within every D/key group.
df %>% gather(key, value, -D) %>% group_by(D, key) %>% count(value)
#> # A tibble: 34 x 4
#> # Groups: D, key [9]
#> D key value n
#> <fctr> <chr> <chr> <int>
#> 1 X A 2 1
#> 2 X A 3 1
#> 3 X A 4 1
#> 4 X B 4 2
#> 5 X B 5 1
#> 6 X C 1 1
#> 7 X C 3 2
#> 8 Y A 1 1
#> 9 Y A 3 3
#> 10 Y A 5 3
#> # ... with 24 more rows
Source data
# Reproducible version of the example data. The three factor columns are
# generated one after another (A, then B, then C), so they consume the random
# stream in the same order as three separate runif() calls would.
set.seed(123)
df <- data.frame(
  lapply(
    c(A = "A", B = "B", C = "C"),
    function(col) factor(floor(runif(20, min = 1, max = 6)))
  ),
  D = rep(c("X", "Y", "Z"), times = c(3, 7, 10))
)

We can use map2 to do the individual count of the subset of columns that involve columns other than 'D' with that of 'D'
library(tidyverse)
# For each of the first three columns, count the combinations of that column
# with the fourth column (D). The result is a list holding one count table
# per column.
lst <- map(names(df)[1:3], function(col) {
  pair <- c(col, names(df)[4])
  count(df[pair], !!!rlang::syms(pair))
})
lst
#[[1]]
# A tibble: 11 x 3
# A D n
# <fctr> <fctr> <int>
# 1 1 Z 2
# 2 2 X 1
# 3 2 Y 1
# 4 2 Z 2
# 5 3 X 2
# 6 3 Y 2
# 7 3 Z 4
# 8 4 Y 2
# 9 4 Z 1
#10 5 Y 2
#11 5 Z 1
#[[2]]
# A tibble: 11 x 3
# B D n
# <fctr> <fctr> <int>
# 1 1 Y 2
# 2 1 Z 2
# 3 2 Y 1
# 4 2 Z 1
# 5 3 Y 1
# 6 3 Z 2
# 7 4 X 3
# 8 4 Y 2
# 9 4 Z 3
#10 5 Y 1
#11 5 Z 2
#[[3]]
# A tibble: 12 x 3
# C D n
# <fctr> <fctr> <int>
# 1 1 Y 1
# 2 1 Z 1
# 3 2 X 2
# 4 2 Y 1
# 5 2 Z 4
# 6 3 X 1
# 7 3 Y 2
# 8 3 Z 1
# 9 4 Y 2
#10 4 Z 3
#11 5 Y 1
#12 5 Z 1
It is not clear whether to have a single dataset or a list of datasets

Related

Multiply every new rows created by `separate_rows`

I am using the separate_rows function from tidyr.
Essentially, I would like to change the value of the data that is copied -- in the example below, it would read: "everytime a new row is created, multiply z by 0.5"
I already added an index in the default df. so it could be "everytime the index N is the same as [-1], multiply z by 0.5"
# Example input. x is the row index and z the value to be halved.
# All three columns must have the same length: tibble() only recycles
# length-one vectors, so 1:4 next to a 3-element y would be an error
# (and the printed data below has 3 rows).
df <- tibble(
  x = 1:3,
  y = c("a", "b,c,d", "e,f"),
  z = 1:3
)
# A tibble: 3 x 3
x y z
<int> <chr> <int>
1 1 a 1
2 2 b,c,d 2
3 3 e,f 3
what we get:
> separate_rows(df, y)
# A tibble: 6 x 3
x y z
<int> <chr> <int>
1 1 a 1
2 2 b 2
3 2 c 2
4 2 d 2
5 3 e 3
6 3 f 3
what I would need (the z values of rows that were split are multiplied by 0.5):
# A tibble: 6 x 3
x y z
<int> <chr> <int>
1 1 a 1
2 2 b 1
3 2 c 1
4 2 d 1
5 3 e 1.5
6 3 f 1.5
You can group by z and multiply if n > 1.
# Group by the original row index x rather than by z: dplyr does not allow a
# grouping column to be modified inside mutate(), and x still identifies the
# source row when two different rows happen to share the same z.
df %>%
  separate_rows(y) %>%
  group_by(x) %>%
  # n() > 1 means the source row was split into several rows; multiplying by
  # 0.5 or 1 keeps z a double in every group.
  mutate(z = z * if (n() > 1) 0.5 else 1) %>%
  ungroup()
x y z
<int> <chr> <dbl>
1 1 a 1
2 2 b 1
3 2 c 1
4 2 d 1
5 3 e 1.5
6 3 f 1.5
An option is also to multiply 'z' by 0.5, get the pmax with 1 and then use separate_rows
library(dplyr)
library(tidyr)
# Halve z and floor the result at 1 *before* splitting, so every row created
# by separate_rows() carries the already-adjusted value.
# NOTE(review): the adjustment is applied to every row, whether or not y is
# split afterwards; it reproduces the expected output for z = 1, 2, 3 here --
# confirm it suits your data.
df %>%
mutate(z = pmax(1, z * 0.5)) %>%
separate_rows(y)
-output
# A tibble: 6 × 3
x y z
<int> <chr> <dbl>
1 1 a 1
2 2 b 1
3 2 c 1
4 2 d 1
5 3 e 1.5
6 3 f 1.5

Is there a way to get subdataframes with purrr in magrittr pipes workflow without using data.frame name?

That is, I was interested in doing the same as in the example, but with purrr functions.
tibble(a, b = a * 2, c = 1) %>%
{lapply(X = names(.), FUN = function(.x) select(., 1:.x))}
[[1]]
# A tibble: 5 x 1
a
<int>
1 1
2 2
3 3
4 4
5 5
[[2]]
# A tibble: 5 x 2
a b
<int> <dbl>
1 1 2
2 2 4
3 3 6
4 4 8
5 5 10
[[3]]
# A tibble: 5 x 3
a b c
<int> <dbl> <dbl>
1 1 2 1
2 2 4 1
3 3 6 1
4 4 8 1
5 5 10 1
I only could do it if I named foo <- tibble(a, b = a * 2, c = 1) and inside map I did select(foo, ...), but I wanted to avoid that, since I wanted to mutate the named dataframe in pipe workflow.
Thank you!
You can use map in the following way :
library(dplyr)
library(purrr)
# Build cumulative sub-tibbles: columns 1 through each named column in turn.
# The braces stop magrittr from also inserting the tibble as map()'s first
# argument; inside them `.` is the piped tibble.
# all_of() marks .x as an external character variable holding a column name,
# which avoids tidyselect's deprecation warning and any clash with a column
# that might itself be called `.x`.
tibble(a = 1:5, b = a * 2, c = 1) %>%
  {map(names(.), function(.x) select(., 1:all_of(.x)))}
Depending on your actual use case, you can also use imap, which passes each column's values (.x) along with its name (.y).
# imap() iterates over the columns, passing the values as .x and the column
# name as .y. `.` only appears inside the nested function, so the pipe still
# supplies the tibble as imap()'s first argument.
# all_of() marks .y as an external character variable holding a column name,
# which avoids tidyselect's deprecation warning for bare external vectors.
tibble(a = 1:5, b = a * 2, c = 1) %>%
  imap(function(.x, .y) select(., 1:all_of(.y)))
#$a
# A tibble: 5 x 1
# a
# <int>
#1 1
#2 2
#3 3
#4 4
#5 5
#$b
# A tibble: 5 x 2
# a b
# <int> <dbl>
#1 1 2
#2 2 4
#3 3 6
#4 4 8
#5 5 10
#$c
# A tibble: 5 x 3
# a b c
# <int> <dbl> <dbl>
#1 1 2 1
#2 2 4 1
#3 3 6 1
#4 4 8 1
#5 5 10 1

How do I sum values from different rows in the tidyverse?

I have a data frame that looks as follows:
WORD CATEGORY n
<fct> <fct> <int>
1 A X 4
2 B X 3
3 C X 6
4 C Y 3
5 D X 2
6 E X 2
7 F Y 2
I want to add a column sum that adds together values in the column n based on CATEGORY. So in rows 3 and 4, for instance, the value of the sum column would be 9.
Here is what the full dataset would look like:
WORD CATEGORY n sum
<fct> <fct> <int> <int>
1 A X 4 4
2 B X 3 3
3 C X 6 9
4 C Y 3 9
5 D X 2 2
6 E X 2 2
7 F Y 2 2
How do I do this in the tidyverse?
If we count the number of unique values in CATEGORY and add it to the grouping variables we can directly sum up the n's:
# Tag every WORD with its number of distinct CATEGORY values (uni), total n
# within each WORD/uni pair, then drop the helper column again.
dt %>%
  group_by(WORD) %>%
  mutate(uni = n_distinct(CATEGORY)) %>%
  group_by(WORD, uni) %>%
  mutate(sum = sum(n)) %>%
  ungroup() %>%
  select(-uni)
# A tibble: 7 x 4
WORD CATEGORY n sum
<fct> <fct> <int> <int>
1 A X 4 4
2 B X 3 3
3 C X 6 9
4 C Y 3 9
5 D X 2 2
6 E X 2 2
7 F Y 2 2

How to create columns from anothers columns?

I want to build a dataframe like df2 from df1, always looking for the name of the column whose value is closest to 0, where: clossets_1 is the column with the value closest to 0 among x, y and z; clossets_2 is the column with the value closest to 0 among x and a, because x is the most frequent value in clossets_1; clossets_3 is the column with the value closest to 0 among a and b, because a is the most frequent value in clossets_2.
df1
df1
# x y z a b
#1 1 2 3 4 3
#2 2 3 4 1 2
#3 3 2 4 2 1
#4 4 3 2 3 6
Desire output:
df2
# x y z clossets_1 a clossets_2 b clossets_3
#1 1 2 3 x 4 x 3 b
#2 2 3 4 x 1 a 2 a
#3 3 2 4 y 2 a 1 b
#4 4 3 2 z 3 a 2 b
Here is the first step to get you started:
# Step 1: for every row, record which of x, y, z holds the value closest to 0.
# abs() measures distance from zero, so negative values are handled too.
# which.min() returns exactly one index per row (the first on ties), whereas
# which(x == min(x)) returns several on ties and breaks the indexing.
cols <- c("x", "y", "z")
df2 <- df1
df2$clossets_1 <- cols[apply(abs(as.matrix(df1[cols])), 1, which.min)]
df2
## x y z a b clossets_1
## 1 1 2 3 4 3 x
## 2 2 3 4 1 2 x
## 3 3 2 4 2 1 y
## 4 4 3 2 3 6 z
I solved it this way, using the first step of @BigFinger's answer and the mlv() function from the modeest package to find the most repeated value in the closest columns:
library(DescTools)
library(modeest)
library(tibble)
# Builds the data and the three clossest_* columns in a single tibble() call.
# Each clossest_* column refers to columns defined earlier in the same call,
# so the order of the arguments matters.
df1 = tibble(x = c(1,2,3,4),
y = c(2,3,2,3),
z = c(3,4,4,2),
# per row: name of whichever of x, y, z equals Closest(x, 0)
# (presumably the value nearest to 0 -- confirm against DescTools::Closest)
clossest_1 = c("x","y","z")[apply(data.frame(x,y,z),1,function(x){which(x == Closest(x,0))})],
a = c(4,1,2,3),
# mlv(clossest_1) is used as the most repeated name in clossest_1; get()
# looks up the object with that name so it can be compared with a
clossest_2 = c(mlv(clossest_1),"a")[apply(data.frame(get(mlv(clossest_1)),a),1,function(x){which(x == Closest(x,0))})],
b = c(3,2,1,2),
# same comparison, between the most repeated name in clossest_2 and b
clossest_3 = c(mlv(clossest_2),"b")[apply(data.frame(get(mlv(clossest_2)),b),1,function(x){which(x == Closest(x,0))})])
df1
# A tibble: 4 x 8
# x y z clossest_1 a clossest_2 b clossest_3
# <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <chr>
#1 1 2 3 x 4 x 3 b
#2 2 3 4 x 1 a 2 a
#3 3 2 4 y 2 a 1 b
#4 4 3 2 z 3 a 2 b

Programmatically rename data frame columns using lookup data frame

What is the best way to batch rename columns using a lookup data frame?
Can I do it as part of a pipe?
library(tidyverse)
# Example data. data_frame() is deprecated in tibble; tibble() is the drop-in
# replacement.
df <- tibble(
  a = seq(1, 10),
  b = seq(10, 1),
  c = rep(1, 10)
)
# Lookup table: one row per column to rename (current name -> new name).
df_lookup <- tibble(
  old_name = c("b", "c", "a"),
  new_name = c("y", "z", "x")
)
I know how to do it manually
df %>%
rename(x = a
, y = b
, z = c)
I am seeking a solution in tidyverse / dplyr packages.
Use rlang; Firstly build up a list of names using syms, and then splice the arguments to rename with UQS or !!! operator:
library(rlang); library(dplyr)
df %>% rename(!!!syms(with(df_lookup, setNames(old_name, new_name))))
# A tibble: 10 x 3
# x y z
# <int> <int> <dbl>
# 1 1 10 1
# 2 2 9 1
# 3 3 8 1
# 4 4 7 1
# 5 5 6 1
# 6 6 5 1
# 7 7 4 1
# 8 8 3 1
# 9 9 2 1
#10 10 1 1
You could write your own helper to make it easier
#' Rename columns using parallel vectors of old and new names
#'
#' @param data A data frame or tibble.
#' @param old Vector of existing column names.
#' @param new Vector of replacement names; `new[i]` replaces `old[i]`.
#' @return `data` with the columns listed in `old` renamed. Columns not listed
#'   in `old` are left untouched.
rename_to <- function(data, old, new) {
  old <- as.character(old)
  new <- as.character(new)
  stopifnot(length(old) == length(new))
  # rename() accepts a named character vector (names = new, values = old)
  # through all_of(), which pairs every old name with its new name explicitly.
  # The earlier `new[old == x]` lookup compared the two vectors element by
  # element and therefore depended on them being positionally aligned;
  # rename_at() is also superseded.
  rename(data, all_of(setNames(old, new)))
}
df %>% rename_to(df_lookup$old_name, df_lookup$new_name)
In base-R:
names(df)[match(df_lookup$old_name,names(df))] <- df_lookup$new_name
# # A tibble: 10 x 3
# x y z
# <int> <int> <dbl>
# 1 1 10 1
# 2 2 9 1
# 3 3 8 1
# 4 4 7 1
# 5 5 6 1
# 6 6 5 1
# 7 7 4 1
# 8 8 3 1
# 9 9 2 1
# 10 10 1 1
Using data.table:
library(data.table)
setnames(setDT(df), old = df_lookup$old_name, new = df_lookup$new_name)
# x y z
# 1: 1 10 1
# 2: 2 9 1
# 3: 3 8 1
# 4: 4 7 1
# 5: 5 6 1
# 6: 6 5 1
# 7: 7 4 1
# 8: 8 3 1
# 9: 9 2 1
# 10: 10 1 1

Resources