Related
I'm using dplyr distinct() for the first time and I'm trying to figure out how to use it with multiple variables and how to handle "ties". For example, when I run the code shown at the bottom of this post against example data frame label_18, I get the below correct results as shown and explained here (note that there are no ties in the eleCnt and grpID columns in this example):
Element Group eleCnt grpID grpRnk Explain grpRnk column...
<chr> <dbl> <int> <int> <int>
1 B 2 1 3 1 Ranked 1st since it has lowest eleCnt & lowest grpID
2 R 3 1 6 2 Ranked 2nd since it has lowest elecCnt & 2nd lowest grpID
3 X 4 1 10 3 Same pattern as above
4 R 1 4 9 4 Same pattern as above
5 R 2 6 13 5 Same pattern as above
Now when I run the code against label_7, there is a tie between eleCnt and grpID, and I get these results:
Element Group eleCnt grpID grpRnk
<chr> <dbl> <int> <int> <int>
1 R 1 1 3 1
2 R 2 3 7 2
Expected output: I would like the results for label_7 to be (while retaining the output for label_18 shown above):
Element Group eleCnt grpID grpRnk Explain grpRnk column...
<chr> <dbl> <int> <int> <int>
1 R 1 1 3 1 Ranked 1st since it has lowest eleCnt & lowest grpID
2 X 3 1 3 1 Also ranked 1st since it ties with above
3 R 2 3 7 2 Ranked 2nd since its eleCnt is 2nd and its grpRnk is 2nd
How do I modify distinct() for handling ties, so I can get the desired results for label_7 while keeping the same results for label_18? Maybe there's a better way to do this completely, some function other than distinct() for this sort of thing.
Code:
library(dplyr)
# Example with a tie: an R row and an X row both have eleCnt 1 and grpID 3.
label_7 <- data.frame(
  Element = rep(c("B", "R", "B", "X"), times = c(1, 4, 1, 5)),
  Group = rep(c(0, 1, 2, 0, 3, 0), times = c(1, 2, 2, 1, 2, 3)),
  eleCnt = c(1, 1, 2, 3, 4, 2, 1, 2, 3, 4, 5),
  grpID = rep(c(0, 3, 7, 0, 3, 0), times = c(1, 2, 2, 1, 2, 3))
)

# Example without ties among the rows that start each (Element, Group).
label_18 <- data.frame(
  Element = rep(c("R", "X", "B", "R"), times = c(3, 4, 2, 4)),
  Group = rep(c(3, 4, 2, 1, 2), times = c(3, 4, 2, 2, 2)),
  eleCnt = c(1, 2, 3, 1, 2, 3, 4, 1, 2, 4, 5, 6, 7),
  grpID = rep(c(6, 10, 3, 9, 13), times = c(3, 4, 2, 2, 2))
)
# Asker's attempt: keep the first row of every (Element, Group), drop repeated
# (eleCnt, grpID) pairs, then number the remaining rows in sorted order.
label_7 %>%
  select(Element, Group, eleCnt, grpID) %>%
  filter(Group > 0) %>%
  group_by(Element, Group) %>%
  slice(which.min(Group)) %>%
  ungroup() %>%
  # distinct() keeps only the first row of each (eleCnt, grpID) pair, which is
  # why tied rows are missing from the result.
  distinct(eleCnt, grpID, .keep_all = TRUE) %>%
  arrange(eleCnt, grpID) %>%
  # row_number() rather than 1:n(): 1:n() becomes c(1, 0) when no rows remain.
  mutate(grpRnk = row_number())
Edit: adding another data frame to test against, label_15 --
> label_15
Element Group eleCnt grpID
1 B 0 1 0
2 R 1 1 3
3 R 1 2 3
4 R 0 3 0
5 X 2 1 3
6 X 2 2 3
7 X 3 3 7
8 X 3 4 7
Expected results would be similar to label_7, because of a tie between Elements R and X in rows 2 and 5 of the above data frame:
Element Group eleCnt grpID grpRank
<chr> <dbl> <dbl> <dbl> <int>
1 R 1 1 3 1
2 X 2 1 3 1
3 X 3 3 7 2
Code for label_15 data frame:
# Second tie example: the first R row and the first X row share eleCnt 1 and
# grpID 3.
label_15 <- data.frame(
  Element = rep(c("B", "R", "X"), times = c(1, 3, 4)),
  Group = rep(c(0, 1, 0, 2, 3), times = c(1, 2, 1, 2, 2)),
  eleCnt = c(1, 1, 2, 3, 1, 2, 3, 4),
  grpID = rep(c(0, 3, 0, 3, 7), times = c(1, 2, 1, 2, 2))
)
We could try
library(dplyr)
library(data.table)
# Rank the first row of every (Element, Group) by eleCnt, then grpID; rows that
# share a grpID receive the same rank.
label_7 %>%
  select(Element, Group, eleCnt, grpID) %>%
  filter(Group > 0) %>%
  group_by(Element, Group) %>%
  slice(which.min(Group)) %>%
  ungroup() %>%
  # No distinct() on rleid(eleCnt, grpID) here: run-length ids merge tied rows
  # whenever they happen to be adjacent, which silently drops one of the ties
  # (the "X 2 1 3" row when this pipeline is run on label_15). Ties must
  # survive so they can share a rank.
  arrange(eleCnt, grpID) %>%
  mutate(grpRank = match(grpID, unique(grpID)))
-output
# A tibble: 3 × 5
Element Group eleCnt grpID grpRank
<chr> <dbl> <dbl> <dbl> <int>
1 R 1 1 3 1
2 X 3 1 3 1
3 R 2 3 7 2
For the second case
# Rank the first row of every (Element, Group) by eleCnt, then grpID; rows that
# share a grpID receive the same rank.
label_18 %>%
  select(Element, Group, eleCnt, grpID) %>%
  filter(Group > 0) %>%
  group_by(Element, Group) %>%
  slice(which.min(Group)) %>%
  ungroup() %>%
  # No distinct() on rleid(eleCnt, grpID) here: run-length ids merge tied rows
  # whenever they happen to be adjacent, which would silently drop one of the
  # ties. label_18 has no ties, so its result is unaffected either way.
  arrange(eleCnt, grpID) %>%
  mutate(grpRank = match(grpID, unique(grpID)))
-output
# A tibble: 5 × 5
Element Group eleCnt grpID grpRank
<chr> <dbl> <dbl> <dbl> <int>
1 B 2 1 3 1
2 R 3 1 6 2
3 X 4 1 10 3
4 R 1 4 9 4
5 R 2 6 13 5
Here's another possibility that correctly processes all 3 scenarios in the post:
# Sort by (eleCnt, grpID), keep for every grpID only the rows of its earliest
# (eleCnt, grpID) run, then rank the surviving rows by grpID run.
label_15 %>%
  filter(Group != 0) %>%
  arrange(eleCnt, grpID) %>%
  mutate(grpRnk = data.table::rleid(eleCnt, grpID)) %>%
  group_by(grpID) %>%
  filter(grpRnk == min(grpRnk)) %>%
  ungroup() %>%
  mutate(grpRnk = data.table::rleid(grpID))
Output:
# A tibble: 3 x 5
Element Group eleCnt grpID grpRnk
<chr> <dbl> <dbl> <dbl> <int>
1 R 1 1 3 1
2 X 2 1 3 1
3 X 3 3 7 2
# Enrolment records: one row per (TRIMESTER, STUDENT) pair.
DATA <- data.frame(
  TRIMESTER = rep(c(1, 2, 3), times = c(7, 7, 9)),
  STUDENT = c(
    1, 2, 3, 4, 5, 6, 7,
    1, 2, 3, 5, 9, 10, 11,
    3, 7, 10, 6, 12, 15, 17, 16, 21
  )
)

# Desired summary: new students per trimester and the running total.
WANT <- data.frame(
  TRIMESTER = c(1, 2, 3),
  NEW_ENROLL = c(7, 3, 5),
  TOTAL_ENROLL = c(7, 10, 15)
)
I Have 'DATA' and want to make 'WANT' which has three columns and for every 'TRIMESTER' you count the number of NEW 'STUDENT' and then for 'TOTAL_ENROLL' you just count the total number of unique 'STUDENT' every trimester.
My attempt only counts the number for each TRIMESTER.
library(dplyr)
# Asker's attempt: counts the rows of DATA within each TRIMESTER, so returning
# students are counted again in every trimester they appear in.
DATA %>%
group_by(TRIMESTER) %>%
count()
Here is a way.
suppressPackageStartupMessages(library(dplyr))
DATA <- data.frame(
  TRIMESTER = rep(c(1, 2, 3), times = c(7, 7, 9)),
  STUDENT = c(
    1, 2, 3, 4, 5, 6, 7,
    1, 2, 3, 5, 9, 10, 11,
    3, 7, 10, 6, 12, 15, 17, 16, 21
  )
)

# Flag the first record of every student, count the flags per trimester, then
# accumulate them into the running enrolment total.
DATA %>%
  mutate(NEW_ENROLL = !duplicated(STUDENT)) %>%
  group_by(TRIMESTER) %>%
  summarise(NEW_ENROLL = sum(NEW_ENROLL), .groups = "drop") %>%
  mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
#> # A tibble: 3 × 3
#> TRIMESTER NEW_ENROLL TOTAL_ENROLL
#> <dbl> <int> <int>
#> 1 1 7 7
#> 2 2 3 10
#> 3 3 5 15
Created on 2022-08-14 by the reprex package (v2.0.1)
For variety we can use Base R aggregate with transform
# Keep each student's first record, count those records per trimester, then add
# the cumulative total as a new column.
aggregate(
  . ~ TRIMESTER,
  data = DATA[!duplicated(DATA$STUDENT), ],
  FUN = length
) |>
  transform(TOTAL_ENROLL = cumsum(STUDENT))
Output
TRIMESTER STUDENT TOTAL_ENROLL
1 1 7 7
2 2 3 10
3 3 5 15
We replace the duplicated elements in 'STUDENT' to NA, grouped by TRIMESTER, get the sum of nonNA elements and finally do the cumulative sum (cumsum)
library(dplyr)
# Blank out repeat appearances of a student, count the remaining non-missing
# entries per trimester, then accumulate the counts.
DATA %>%
  mutate(STUDENT = if_else(duplicated(STUDENT), NA_real_, STUDENT)) %>%
  group_by(TRIMESTER) %>%
  summarise(NEW_ENROLL = sum(!is.na(STUDENT)), .groups = "drop") %>%
  mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
-output
# A tibble: 3 × 3
TRIMESTER NEW_ENROLL TOTAL_ENROLL
<dbl> <int> <int>
1 1 7 7
2 2 3 10
3 3 5 15
Or with distinct
# Keep the first record of every student, then count records per trimester and
# accumulate the counts.
DATA %>%
  distinct(STUDENT, .keep_all = TRUE) %>%
  group_by(TRIMESTER) %>%
  summarise(NEW_ENROLL = n(), .groups = "drop") %>%
  mutate(TOTAL_ENROLL = cumsum(NEW_ENROLL))
# A tibble: 3 × 3
TRIMESTER NEW_ENROLL TOTAL_ENROLL
<dbl> <int> <int>
1 1 7 7
2 2 3 10
3 3 5 15
This is probably a dumb question, but how do I create a new group ID based on a string column in R? The values of the ID are arbitrary.
ID: the column I want to create
Name ID
A09john 1
J43mary 2
B7you 3
A09john 1
J43mary 2
B7you 3
I was hoping to use simple codes like below, but I don't know how to do it. Thank you!
df1 %>%
group_by(Name) %>%
mutate(ID = row_number(as.numeric(????)))
Here is a tidyverse approach that uses dplyr::cur_group_id() (current group identifier)
library(tidyverse)
d <- data.frame(
  Name = rep(c("A09john", "J43mary", "B7you"), times = 2)
)

# Tag every row with the identifier of its Name group.
new_data <- d |>
  dplyr::group_by(Name) |>
  dplyr::mutate(ID = dplyr::cur_group_id()) |>
  dplyr::ungroup()

new_data
#> # A tibble: 6 x 2
#> Name ID
#> <chr> <int>
#> 1 A09john 1
#> 2 J43mary 3
#> 3 B7you 2
#> 4 A09john 1
#> 5 J43mary 3
#> 6 B7you 2
# To number the names by order of first appearance instead, turn Name into a
# factor whose levels follow that order before grouping.
new_data2 <- d |>
  dplyr::mutate(Name = forcats::fct_inorder(Name)) |>
  dplyr::group_by(Name) |>
  dplyr::mutate(ID = dplyr::cur_group_id()) |>
  dplyr::ungroup()

new_data2
#> # A tibble: 6 x 2
#> # Groups: Name [3]
#> Name ID
#> <fct> <int>
#> 1 A09john 1
#> 2 J43mary 2
#> 3 B7you 3
#> 4 A09john 1
#> 5 J43mary 2
#> 6 B7you 3
Created on 2022-06-16 by the reprex package (v2.0.1)
row_number() is not the solution as it will compute the row number in each group.
I would use sequence and dplyr in this way:
# sequence(n()) restarts at 1 inside every group, so it numbers the rows within
# each Name instead of identifying the Name. Matching each Name against the
# distinct names gives one ID per Name, in order of first appearance.
df1 %>%
  mutate(ID = match(Name, unique(Name)))
I have a dataframe that looks like this:
library(tidyverse)
# Two batches of ten rows each; exp_id labels the experiment of every row.
x <- tibble(
  batch = rep(c(1, 2), each = 10),
  exp_id = rep(c("a", "b", "c", "d", "e"), times = c(3, 2, 5, 6, 4))
)
I can run the code below to get the count per exp_id:
# Number of rows for every (batch, exp_id) combination.
x %>% group_by(batch,exp_id) %>%
summarise(count=n())
which generates:
batch exp_id count
<dbl> <chr> <dbl>
1 1 a 3
2 1 b 2
3 1 c 5
4 2 d 6
5 2 e 4
A really ugly way to generate the mean of these counts is:
# Two-step version: first count rows per (batch, exp_id), then regroup by batch
# and average those counts.
x %>% group_by(batch,exp_id) %>%
summarise(count=n()) %>%
ungroup() %>%
group_by(batch) %>%
summarise(avg_exp = mean(count))
which generates:
batch avg_exp
<dbl> <dbl>
1 1 3.33
2 2 5
Is there a more succinct and "tidy" way to generate this?
library(dplyr)
# table(exp_id) gives the per-experiment counts inside each batch; their mean is
# the average number of rows per experiment.
x %>%
  group_by(batch) %>%
  summarise(avg_exp = mean(table(exp_id)))
# # A tibble: 2 x 2
# batch avg_exp
# <dbl> <dbl>
# 1 1 3.33
# 2 2 5
Here's another way -
library(dplyr)
# Count rows per (batch, exp_id), then average those counts within each batch.
count(x, batch, exp_id, name = "count") %>%
  group_by(batch) %>%
  summarise(count = mean(count))
# batch count
# <dbl> <dbl>
#1 1 3.33
#2 2 5
I have this tibble:
library(tibble)
library(dplyr)
# One row per id with two numeric columns.
df <- tribble(
  ~id,     ~A, ~B,
  "one",    1,  4,
  "two",    2,  5,
  "three",  3,  6
)
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 two 2 5
3 three 3 6
I want to add a row to each group AND assign values to the new row, BUT with a function (here the new row in each group should get A = 4 and B = the first value of column B in that group, using first(B)) -> desired output:
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 6
5 two 2 5
6 two 4 5
I have tried so far:
If I add a row in a ungrouped tibble with add_row -> this works perfect!
# Appends a single row to the ungrouped tibble; id is not supplied, so it is NA
# in the new row.
df %>%
add_row(A=4, B=4)
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 two 2 5
3 three 3 6
4 NA 4 4
If I try to use add_row in a grouped tibble -> this does not work:
# Fails on purpose: add_row() rejects grouped data frames
# ("Can't add rows to grouped data frames.").
df %>%
group_by(id) %>%
add_row(A=4, B=4)
Error: Can't add rows to grouped data frames.
Run `rlang::last_error()` to see where the error occurred.
According to this post Add row in each group using dplyr and add_row() we could use group_modify -> this works great:
# group_modify() applies add_row() to each group's rows (.x), so every group
# gains one row with the constants A = 4 and B = 4.
df %>%
group_by(id) %>%
group_modify(~ add_row(A=4, B=4, .x))
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 4
5 two 2 5
6 two 4 4
I want to assign to column B the first value of column B (or it can be any function: min(B), max(B), etc.) -> this does not work:
# Fails with "object 'B' not found": inside the formula a bare B is not looked
# up in the group's data; the group has to be referenced through .x.
df %>%
group_by(id) %>%
group_modify(~ add_row(A=4, B=first(B), .x))
Error in h(simpleError(msg, call)) :
Fehler bei der Auswertung des Argumentes 'x' bei der Methodenauswahl für Funktion 'first': object 'B' not found
library(tidyverse)
# Example data: one row per id.
df <- tibble(id = c("one", "two", "three"),
A = c(1,2,3),
B = c(4,5,6))
# For each id, take the group's own rows via cur_data() and append a row with
# A = 4 and B = the first B of that group; summarise() then returns the extended
# rows of every group.
# NOTE(review): cur_data() and multi-row summarise() results may be deprecated
# in newer dplyr releases -- confirm against the installed version.
df %>%
group_by(id) %>%
summarise(add_row(cur_data(), A = 4, B = first(cur_data()$B)))
#> `summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
#> # A tibble: 6 × 3
#> # Groups: id [3]
#> id A B
#> <chr> <dbl> <dbl>
#> 1 one 1 4
#> 2 one 4 4
#> 3 three 3 6
#> 4 three 4 6
#> 5 two 2 5
#> 6 two 4 5
Or
# Split the tibble into one piece per id, append the extra row to every piece
# (copying the piece's own id and first B), and row-bind the pieces again.
df %>%
  group_by(id) %>%
  group_split() %>%
  map_dfr(\(piece) add_row(
    piece,
    id = first(piece$id),
    A = 4,
    B = first(piece$B)
  ))
#> # A tibble: 6 × 3
#> id A B
#> <chr> <dbl> <dbl>
#> 1 one 1 4
#> 2 one 4 4
#> 3 three 3 6
#> 4 three 4 6
#> 5 two 2 5
#> 6 two 4 5
Created on 2022-01-02 by the reprex package (v2.0.1)
Maybe this is an option
library(dplyr)
# Within each id, extend A with the constant 4 and B with the group's first B;
# summarise() returns one row per element of the extended vectors.
df %>%
  group_by(id) %>%
  summarise(A = c(A, 4), B = c(B, B[1])) %>%
  ungroup()
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
# A tibble: 6 x 3
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 6
5 two 2 5
6 two 4 5
According to the documentation of the function group_modify, if you use a formula, you must use ". or .x to refer to the subset of rows of .tbl for the given group;" that's why you used .x inside the add_row function. To be entirely consistent, you have to do it also within the first function.
# Inside the formula, .x is the current group's rows, so first(.x$B) is that
# group's first B.
df %>%
  group_by(id) %>%
  group_modify(~ add_row(.x, A = 4, B = first(.x$B)))
# A tibble: 6 x 3
# Groups: id [3]
id A B
<chr> <dbl> <dbl>
1 one 1 4
2 one 4 4
3 three 3 6
4 three 4 6
5 two 2 5
6 two 4 5
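Using first(.$B) will provide the same results, because . and .x both refer to the current group's rows. first(df$B), by contrast, always returns the first B of the whole data frame (4 here), so every new row would get B = 4.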
A possible solution:
library(tidyverse)
df <- tibble(
  id = c("one", "two", "three"),
  A = c(1, 2, 3),
  B = c(4, 5, 6)
)

df %>%
  group_by(id) %>%
  # Keep every row of the group and append a copy of its first row; the copy
  # already carries the group's first B. (slice(rep(1, 2)) would discard all
  # rows but the first when a group has more than one row.)
  slice(c(seq_len(n()), 1L)) %>%
  # Only the appended (last) row gets the constant A = 4. The earlier
  # first(df$B) produced 4 only because the first B of the whole table is 4.
  mutate(A = if_else(row_number() == n(), 4, A)) %>%
  ungroup()
#> # A tibble: 6 × 3
#> id A B
#> <chr> <dbl> <dbl>
#> 1 one 1 4
#> 2 one 4 4
#> 3 three 3 6
#> 4 three 4 6
#> 5 two 2 5
#> 6 two 4 5