R data imputation from group_by table based on count - r

group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)
test = data.frame(group, animal, sleep)
print(test)
group_animal = test %>% group_by(`group`, `animal`) %>% count(sleep)
print(group_animal)
I would like to replace the NA values in the test df's sleep column by the highest count of sleep answer based on group and animal.
Such that Group 1, Animal a with NAs in the sleep column should have a sleep value of 'y' because that is the value with the highest count among Group 1 Animal a.
Group 4 animal c with NAs for sleep should have 'n' as the sleep value as well.

Another option is replacing the NAs with the Mode. You can use the Mode function from this post in the na.aggregate function from zoo to replace these NAs like this:
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)
test = data.frame(group, animal, sleep)
library(dplyr)
library(zoo)
test %>%
group_by(group, animal) %>%
mutate(sleep = na.aggregate(sleep , FUN=Mode)) %>%
ungroup()
#> # A tibble: 24 × 3
#> group animal sleep
#> <dbl> <chr> <chr>
#> 1 1 a y
#> 2 1 b n
#> 3 4 c y
#> 4 4 c y
#> 5 4 d y
#> 6 5 a n
#> 7 5 b n
#> 8 6 c y
#> 9 1 b n
#> 10 4 d y
#> # … with 14 more rows
#> # ℹ Use `print(n = ...)` to see more rows
Created on 2022-07-26 by the reprex package (v2.0.1)
Here is tail of output:
> tail(test)
# A tibble: 6 × 3
group animal sleep
<dbl> <chr> <chr>
1 4 c n
2 4 c n
3 1 a y
4 4 c n
5 5 a n
6 6 c y

Update now with group_by(group, animal) thnx #Quinten, removed prior answer:
group by animal
use replace_na with the replace argument as sleep[n==max(n)]
new: in case of ties like in group 5 add !is.na(sleep) to avoid conflicts:
library(dplyr)
library(tidyr)
group_animal %>%
group_by(group, animal) %>%
arrange(desc(sleep), .by_group = TRUE) %>%
mutate(sleep = replace_na(sleep, sleep[n==max(n) & !is.na(sleep)]))
group animal sleep n
<dbl> <chr> <chr> <int>
1 1 a y 3
2 1 a n 1
3 1 a m 1
4 1 a y 1
5 1 b n 2
6 4 c y 2
7 4 c n 4
8 4 c n 1
9 4 d y 2
10 5 a n 1
11 5 a n 1
12 5 b n 1
13 6 c y 2
14 6 c n 1
15 6 c y 1

Try this.
This method essential creates a custom column to coalesce with sleep, it subsets sleep based on the max count values obtained from str_count
library(dplyr)
test |>
group_by(group, animal) |>
mutate(sleep = coalesce(sleep, sleep[max(stringr::str_count(paste(sleep, collapse = ""), pattern = sleep), na.rm = TRUE)])) |>
ungroup()
group animal sleep
1 1 a y
2 1 b n
3 4 c y
4 4 c y
5 4 d y
6 5 a n
7 5 b n
8 6 c y
9 1 b n
10 4 d y
11 6 c n
12 1 a y
13 1 a y
14 1 a n
15 1 a m
16 6 c y
17 4 c n
18 4 c n
19 4 c n
20 4 c n
21 1 a y
22 4 c n
23 5 a n
24 6 c n

Related

Re-ordering a factor (in a table) with hierarchical groups in R

Suppose the following table with two factor variabels and one numerical variable:
df <- tibble(
x = as_factor(c("a", "a", "a", "b", "b", "b")),
y = as_factor(1:6),
val = c(10, 3, 8, 2, 6, 1)
)
> df
# A tibble: 6 x 3
x y val
<fct> <fct> <dbl>
1 a 1 10
2 a 2 3
3 a 3 8
4 b 4 2
5 b 5 6
6 b 6 1
I would like to re-order y such that the sum of val, when grouped by x, takes precedent, but y is still ordered by val. To illustrate the goal:
# A tibble: 6 x 4
# Groups: x [2]
x y val sum
<fct> <fct> <dbl> <dbl>
1 a 1 10 21 # all y for which x=="a" come first, because
2 a 3 8 21 # the sum of val for x=="a" is greater than
3 a 2 3 21 # for x=="b"
4 b 5 6 9 # within each group, y is ordered by val
5 b 4 2 9
6 b 6 1 9
But how do I get there? Within tidyverse, I tried to solve it with forcats::fct_reorder(), thinking that grouping might help (df |> group_by(x) |> mutate(y = fct_reorder(y, val))), but it doesn't.
Can fct_reorder() do that at all? What other approaches could work?
Edit: I have found a solution, but it feels rather hacky:
df |>
group_by(x) |>
mutate(sum = sum(val)) |>
arrange(desc(sum), desc(val)) |> ungroup() |>
tibble::rowid_to_column() |>
mutate(across(c(x, y), \(x) fct_reorder(x, rowid)))
Perhaps, we need to arrange
library(dplyr)
library(forcats)
df %>%
arrange(desc(ave(val, x, FUN = sum)), desc(val)) %>%
mutate(across(where(is.factor), fct_inorder))
-output
# A tibble: 6 × 3
x y val
<fct> <fct> <dbl>
1 a 1 10
2 a 3 8
3 a 2 3
4 b 5 6
5 b 4 2
6 b 6 1
Or use fct_reorder/reorder in arrange
df %>%
arrange(desc(fct_reorder(x, val, .fun = sum)), desc(val)) %>%
mutate(across(where(is.factor), fct_inorder)
Probably we can use the following data.table option along with fct_inorder
setorder(
setDT(df)[
,
sum := sum(val), x
],
-sum, -val
)[
,
lapply(
.SD,
function(x) ifelse(is.factor(x), fct_inorder, c)(x)
)
]
and you will obtain
x y val sum
1: a 1 10 21
2: a 3 8 21
3: a 2 3 21
4: b 5 6 9
5: b 4 2 9
6: b 6 1 9

dplyr mutate: create column using first occurrence of another column

I was wondering if there's a more elegant way of taking a dataframe, grouping by x to see how many x's occur in the dataset, then mutating to find the first occurrence of every x (y)
test <- data.frame(x = c("a", "b", "c", "d",
"c", "b", "e", "f", "g"),
y = c(1,1,1,1,2,2,2,2,2))
x y
1 a 1
2 b 1
3 c 1
4 d 1
5 c 2
6 b 2
7 e 2
8 f 2
9 g 2
Current Output
output <- test %>%
group_by(x) %>%
summarise(count = n())
x count
<fct> <int>
1 a 1
2 b 2
3 c 2
4 d 1
5 e 1
6 f 1
7 g 1
Desired Output
x count first_seen
<fct> <int> <dbl>
1 a 1 1
2 b 2 1
3 c 2 1
4 d 1 1
5 e 1 2
6 f 1 2
7 g 1 2
I can filter the test dataframe for the first occurrences then use a left_join but was hoping there's a more elegant solution using mutate?
# filter for first occurrences of y
right <- test %>%
group_by(x) %>%
filter(y == min(y)) %>%
slice(1) %>%
ungroup()
# bind to the output dataframe
left_join(output, right, by = "x")
We can use first after grouping by 'x' to create a new column, use that also in group_by and get the count with n()
library(dplyr)
test %>%
group_by(x) %>%
group_by(first_seen = first(y), add = TRUE) %>%
summarise(count = n())
# A tibble: 7 x 3
# Groups: x [7]
# x first_seen count
# <fct> <dbl> <int>
#1 a 1 1
#2 b 1 2
#3 c 1 2
#4 d 1 1
#5 e 2 1
#6 f 2 1
#7 g 2 1
I have a question. Why not keep it simple? for example
test %>%
group_by(x) %>%
summarise(
count = n(),
first_seen = first(y)
)
#> # A tibble: 7 x 3
#> x count first_seen
#> <chr> <int> <dbl>
#> 1 a 1 1
#> 2 b 2 1
#> 3 c 2 1
#> 4 d 1 1
#> 5 e 1 2
#> 6 f 1 2
#> 7 g 1 2

Sum of individual elements in a vector

I would like to determine the sum for each individual element in a vector.
For example, suppose I have the vector
x <- c(2,3,2,2,5,5,3,3)
and I want to find the sum for each element.
The answer would be something like
2: 6
3: 9
5: 10
This is because there are three 2's (2+2+2 or 2*), etc.
In other words, I want to essentially multiply the number times the number of times that element is found in the vector.
Using base R tapply
tapply(x, x, sum)
# 2 3 5
# 6 9 10
If you need it as dataframe wrap it in stack
stack(tapply(x, x, sum))
# values ind
#1 6 2
#2 9 3
#3 10 5
If you convert this to a dataframe then this becomes (How to sum a variable by group)
library(dplyr)
tibble::tibble(x) %>%
group_by(x) %>%
summarise(n = sum(x))
# A tibble: 3 x 2
# x n
# <dbl> <dbl>
#1 2 6
#2 3 9
#3 5 10
A method with dplyr:
x <- c(2,3,2,2,5,5,3,3)
a = tibble(x)
a %>% count(x) %>% mutate(xn = x*n)
# A tibble: 3 x 3
x n xn
<dbl> <int> <dbl>
1 2 3 6
2 3 3 9
3 5 2 10
Lots of ways to do this. A couple of base approaches:
with(rle(sort(x)), data.frame(val = values, freq = lengths, prod = lengths*values))
val freq prod
1 2 3 6
2 3 3 9
3 5 2 10
Or:
transform(as.data.frame(table(x), stringsAsFactors = FALSE), sum = as.numeric(x) * Freq)
x Freq sum
1 2 3 6
2 3 3 9
3 5 2 10
library(tidyverse)
x <- c(2,3,2,2,5,5,3,3)
tibble(x) %>%
count(x) %>%
mutate(xn = x*n ) %>%
pull(xn)
We can use rowsum from base R
rowsum(x, group = x)
# [,1]
#2 6
#3 9
#5 10
Or with by
by(x, x, FUN = sum)
Or with split
sapply(split(x, x), sum)
# 2 3 5
# 6 9 10
Or another option with xtabs
xtabs(x1 ~ x, cbind(x1 = x, x))
# 2 3 5
# 6 9 10
Or with ave
unique(data.frame(x, Sum = ave(x, x, FUN = sum)))
# x Sum
#1 2 6
#2 3 9
#5 5 10
Or using data.table
library(data.table)
data.table(grp = x, x=x)[, .(Sum = sum(x)), grp]
# grp Sum
#1: 2 6
#2: 3 9
#3: 5 10

dplyr - How to obtain the order of one column within a group?

Example data:
tibbly = tibble(age = c(10,30,50,10,30,50,10,30,50,10,30,50),
grouping1 = c("A","A","A","A","A","A","B","B","B","B","B","B"),
grouping2 = c("X", "X", "X","Y","Y","Y","X","X","X","Y","Y","Y"),
value = c(1,2,3,4,4,6,2,5,3,6,3,2))
> tibbly
# A tibble: 12 x 4
age grouping1 grouping2 value
<dbl> <chr> <chr> <dbl>
1 10 A X 1
2 30 A X 2
3 50 A X 3
4 10 A Y 4
5 30 A Y 4
6 50 A Y 6
7 10 B X 2
8 30 B X 5
9 50 B X 3
10 10 B Y 6
11 30 B Y 3
12 50 B Y 2
Question:
How to obtain the order of rows for each group in a dataframe? I can use dplyr to arrange the data in the an appropriate form to visualize what I am interested in:
> tibbly %>%
group_by(grouping1, grouping2) %>%
arrange(grouping1, grouping2, desc(value))
# A tibble: 12 x 4
# Groups: grouping1, grouping2 [4]
age grouping1 grouping2 value
<dbl> <chr> <chr> <dbl>
1 50 A X 3
2 30 A X 2
3 10 A X 1
4 50 A Y 6
5 10 A Y 4
6 30 A Y 4
7 30 B X 5
8 50 B X 3
9 10 B X 2
10 10 B Y 6
11 30 B Y 3
12 50 B Y 2
In the end I am interested in the order of the age column, for each group based on the value column. Is there a elegant way to do this with dplyr? Something like summarise() based on the order of rows and not actual values
library(dplyr)
tibbly = tibble(age = c(10,30,50,10,30,50,10,30,50,10,30,50),
grouping1 = c("A","A","A","A","A","A","B","B","B","B","B","B"),
grouping2 = c("X", "X", "X","Y","Y","Y","X","X","X","Y","Y","Y"),
value = c(1,2,3,4,4,6,2,5,3,6,3,2))
tibbly %>%
group_by(grouping1, grouping2) %>% # for each group
arrange(desc(value)) %>% # arrange value descending
summarise(order = paste0(age, collapse = ",")) %>% # get the order of age as a strings
ungroup() # forget the grouping
# # A tibble: 4 x 3
# grouping1 grouping2 order
# <chr> <chr> <chr>
# 1 A X 50,30,10
# 2 A Y 50,10,30
# 3 B X 30,50,10
# 4 B Y 10,30,50
With data.table
library(data.table)
setDT(tibbly)[order(-value), .(order = toString(age)),.(grouping1, grouping2)]

summarise and group_by using two different columns consecutively

I have a dataframe df with three columns a,b,c.
df <- data.frame(a = c('a','b','c','d','e','f','g','e','f','g'),
b = c('X','Y','Z','X','Y','Z','X','X','Y','Z'),
c = c('cat','dog','cat','dog','cat','cat','dog','cat','cat','dog'))
df
# output
a b c
1 a X cat
2 b Y dog
3 c Z cat
4 d X dog
5 e Y cat
6 f Z cat
7 g X dog
8 e X cat
9 f Y cat
10 g Z dog
I have to group_by using the column b followed by summarise using the column c with counts of available values in it.
df %>% group_by(b) %>%
summarise(nCat = sum(c == 'cat'),
nDog = sum(c == 'dog'))
#output
# A tibble: 3 × 3
b nCat nDog
<fctr> <int> <int>
1 X 2 2
2 Y 2 1
3 Z 2 1
However, before doing the above task, I should remove the rows belonging to a value in a which has more than one value in b.
df %>% group_by(a) %>% summarise(count = n())
#output
# A tibble: 7 × 2
a count
<fctr> <int>
1 a 1
2 b 1
3 c 1
4 d 1
5 e 2
6 f 2
7 g 2
For example, in this dataframe, all the rows having value e(values: Y,X), f(values: Z,Y), g(values: X,Z) in column a.
# Expected output
# A tibble: 3 × 3
b nCat nDog
<fctr> <int> <int>
1 X 1 1
2 Y 0 1
3 Z 1 0
We can use filter with n_distinct to filter the values in 'b' that have only one unique element for each 'a' group, then grouped by 'b', we do the summarise
df %>%
group_by(a) %>%
filter(n_distinct(b)==1) %>%
group_by(b) %>%
summarise(nCat =sum(c=='cat'), nDog = sum(c=='dog'), Total = n())
# A tibble: 3 × 4
# b nCat nDog Total
# <fctr> <int> <int> <int>
#1 X 1 1 2
#2 Y 0 1 1
#3 Z 1 0 1

Resources