Applying adjusted boxplot method adjboxStats() with group_by in R?

I am a beginner and would like to:
i) produce adjboxStats() for every code in my data (see below), and
ii) eliminate the outliers for every code.
Some dummy data:
data <- data.frame(
  code = c("A1","A2","A3","A1","A2","A3","A1","A2","A3","A1","A2","A3","A1","A2","A3","A1","A2","A3","A2","A3","A1","A2","A3","A1","A2"),
  duration = c(100,100,100,200,200,200,23523,213123,12,23213,968,37253,573012,472662,3846516,233,262,5737,3038,2,5,123,969,6,40582)
)
At the moment, I am able to produce the results across all codes, see below. But I have problems i) running the statistics for every code (would group_by(code) work?) and then ii) excluding the found outliers ($out) for every code.
library(robustbase)
adjboxStats(data$duration, coef = 1.5, a = -4, b = 3, do.conf = TRUE, do.out = TRUE)
$stats
[1] 2 100 262 23523 573012
$n
[1] 50
$conf
[1] -4971.77 5495.77
$fence
[1] -571.2153 707257.8400
$out
[1] 3846516 3846516
Thank you very much in advance for your help!

We can do a group_by and summarise the results in a list column:
library(dplyr)
library(robustbase)
data1 <- data %>%
  group_by(code) %>%
  summarise(out = list(adjboxStats(duration, coef = 1.5,
                                   a = -4, b = 3, do.conf = TRUE, do.out = TRUE)))
data1
# A tibble: 3 x 2
# code out
# <chr> <list>
#1 A1 <named list [5]>
#2 A2 <named list [5]>
#3 A3 <named list [5]>
data1$out[[1]]
#$stats
#[1] 5.0 53.0 216.5 23368.0 573012.0
#$n
#[1] 8
#$conf
#[1] -12807.59 13240.59
#$fence
#[1] -624.4143 696935.1967
#$out
#numeric(0)
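Each list element carries the full adjboxStats() output. If only the outlier values are wanted, one option (a sketch using purrr, which the answer above does not load) is to extract the $out component from each list element:
library(purrr)
# pull just the per-group outlier vectors out of the list column
data1 %>%
  mutate(outliers = map(out, "out"))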
If we are interested in filtering out the outliers, use %in% with ! after extracting the 'out' component:
data %>%
  group_by(code) %>%
  filter(!duration %in% adjboxStats(duration, coef = 1.5,
                                    a = -4, b = 3, do.conf = TRUE, do.out = TRUE)$out)
# A tibble: 24 x 2
# Groups: code [3]
# code duration
# <chr> <dbl>
# 1 A1 100
# 2 A2 100
# 3 A3 100
# 4 A1 200
# 5 A2 200
# 6 A3 200
# 7 A1 23523
# 8 A2 213123
# 9 A3 12
#10 A1 23213
# … with 14 more rows
Data:
data <- structure(list(code = c("A1", "A2", "A3", "A1", "A2", "A3", "A1",
"A2", "A3", "A1", "A2", "A3", "A1", "A2", "A3", "A1", "A2", "A3",
"A2", "A3", "A1", "A2", "A3", "A1", "A2"), duration = c(100,
100, 100, 200, 200, 200, 23523, 213123, 12, 23213, 968, 37253,
573012, 472662, 3846516, 233, 262, 5737, 3038, 2, 5, 123, 969,
6, 40582)), class = "data.frame", row.names = c(NA, -25L))
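For comparison, here is a base R sketch of the same per-group filtering (my addition; it assumes the same adjboxStats() parameters, with split()/lapply() standing in for group_by()):
library(robustbase)
# drop each group's adjusted-boxplot outliers, then stitch the groups back together
data_clean <- do.call(rbind, lapply(split(data, data$code), function(d) {
  d[!d$duration %in% adjboxStats(d$duration, coef = 1.5, a = -4, b = 3)$out, ]
}))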

Related

tidyverse: Simulating random sample with nested factor

I want to simulate a random sample with a nested factor. Factor Dept has two levels, A and B. Level A has two nested levels, A1 and A2. Level B has three nested levels, B1, B2, and B3. I want to simulate a random sample from 2022-01-01 to 2022-01-31 using R. Part of the desired output is given below (2022-01-01 to 2022-01-03 only, for reference).
library(tibble)
set.seed(12345)
df1 <-
  tibble(
    Date = c(rep("2022-01-01", 5), rep("2022-01-02", 4), rep("2022-01-03", 4))
    , Dept = c("A", "A", "B", "B", "B", "A", "B", "B", "B", "A", "A", "B", "B")
    , Prog = c("A1", "A2", "B1", "B2", "B3", "A1", "B1", "B2", "B3", "A1", "A2", "B2", "B3")
    , Amount = runif(n = 13, min = 50000, max = 100000)
  )
df1
#> # A tibble: 13 x 4
#> Date Dept Prog Amount
#> <chr> <chr> <chr> <dbl>
#> 1 2022-01-01 A A1 86045.
#> 2 2022-01-01 A A2 93789.
#> 3 2022-01-01 B B1 88049.
#> 4 2022-01-01 B B2 94306.
#> 5 2022-01-01 B B3 72824.
#> 6 2022-01-02 A A1 58319.
#> 7 2022-01-02 B B1 66255.
#> 8 2022-01-02 B B2 75461.
#> 9 2022-01-02 B B3 86385.
#> 10 2022-01-03 A A1 99487.
#> 11 2022-01-03 A A2 51727.
#> 12 2022-01-03 B B2 57619.
#> 13 2022-01-03 B B3 86784.
If we want to sample randomly, create the expanded data with crossing and then filter/slice to return random rows for each 'Date':
library(dplyr)
library(tidyr)
library(stringr)
crossing(Date = seq(as.Date("2022-01-01"), as.Date("2022-01-31"), by = "1 day"),
         Dept = c("A", "B"), Prog = 1:3) %>%
  mutate(Prog = str_c(Dept, Prog)) %>%
  filter(Prog != "A3") %>%
  mutate(Amount = runif(n = n(), min = 50000, max = 100000)) %>%
  group_by(Date) %>%
  slice(seq_len(sample(row_number(), 1))) %>%
  ungroup()
Output:
# A tibble: 102 × 4
Date Dept Prog Amount
<date> <chr> <chr> <dbl>
1 2022-01-01 A A1 83964.
2 2022-01-01 A A2 93428.
3 2022-01-01 B B1 85187.
4 2022-01-01 B B2 79144.
5 2022-01-01 B B3 65784.
6 2022-01-02 A A1 86014.
7 2022-01-03 A A1 76060.
8 2022-01-03 A A2 56412.
9 2022-01-03 B B1 87365.
10 2022-01-03 B B2 66169.
# … with 92 more rows
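Note that slice(seq_len(sample(row_number(), 1))) keeps the first k rows of each Date for a random k, so the kept rows are always a leading block. If a genuinely random subset per day is wanted instead, slice_sample() is one option. A sketch (my addition, using the packages loaded above; the n = 3 is an arbitrary stand-in):
crossing(Date = seq(as.Date("2022-01-01"), as.Date("2022-01-31"), by = "1 day"),
         Dept = c("A", "B"), Prog = 1:3) %>%
  mutate(Prog = str_c(Dept, Prog)) %>%
  filter(Prog != "A3") %>%
  mutate(Amount = runif(n = n(), min = 50000, max = 100000)) %>%
  group_by(Date) %>%
  slice_sample(n = 3) %>% # random rows per Date, not just the leading ones
  ungroup()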

Set values to zero in specific columns in a data.table

In a data.table, I am trying to set the values in the columns 2018, 2019, 2020, and 2021 to zero when rows have identical values in the ID and Score columns.
This would be an example data.table:
library(data.table)
data <- data.table(
  ID = c("a1", "a2", "a2", "a1", "a2", "a1", "a1"),
  Score = c("A", "B", "C", "A", "B", "C", "A"),
  "2018" = c(3, 5, 1, 3, 5, 6, 6),
  "2019" = c(3, 5, 6, 2, 1, 4, 2),
  "2020" = c(9, 6, 6, 9, 6, 9, 9),
  "2021" = c(4, 0, 3, 8, 5, 4, 6))
data <- data[order(ID, Score)]
I tried duplicated(), but that deletes the whole row. I only want to set the values to zero, as shown in the example solution data.table below.
solution <- data.table(
  ID = c("a1", "a2", "a2", "a1", "a2", "a1", "a1"),
  Score = c("A", "B", "C", "A", "B", "C", "A"),
  "2018" = c(3, 5, 1, 0, 0, 6, 6),
  "2019" = c(3, 5, 6, 2, 1, 4, 0),
  "2020" = c(9, 6, 6, 0, 0, 9, 0),
  "2021" = c(4, 0, 3, 8, 5, 4, 6))
solution <- solution[order(ID, Score)]
Thanks a lot!
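(Presumably the duplicated() attempt looked something like the line below; this is a reconstruction, not code from the question. It keeps only the first row per ID/Score pair, which is why whole rows disappear.)
# keeps the first row per (ID, Score) pair and drops the rest
data[!duplicated(data, by = c("ID", "Score"))]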
You can use replace with duplicated to turn the repeated values to 0 in each group.
library(data.table)
cols <- 3:6
data[, (cols) := lapply(.SD, function(x) replace(x, duplicated(x), 0)),
     .SDcols = cols, by = .(ID, Score)]
data
# ID Score 2018 2019 2020 2021
#1: a1 A 3 3 9 4
#2: a1 A 0 2 0 8
#3: a1 A 6 0 0 6
#4: a1 C 6 4 9 4
#5: a2 B 5 5 6 0
#6: a2 B 0 1 0 5
#7: a2 C 1 6 6 3
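A minimal illustration of the replace()/duplicated() idiom on a single vector (my addition, not from the original answer):
x <- c(3, 3, 9, 3)
replace(x, duplicated(x), 0)
#> [1] 3 0 9 0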

For each ID, separate groups into columns and collapse multiple value strings in R

I have a dataframe that looks like this:
in.dat <- data.frame(ID = c("A1", "A1", "A1", "A1", "B1", "B1", "B1", "B1"),
DB = rep(c("bio", "bio", "func", "loc"), 2),
val = c("IPR1", "IPR2", "s43", "333-456",
"IPR7", "IPR8", "q87", "566-900"))
ID DB val
1 A1 bio IPR1
2 A1 bio IPR2
3 A1 func s43
4 A1 loc 333-456
5 B1 bio IPR7
6 B1 bio IPR8
7 B1 func q87
8 B1 loc 566-900
I want to turn "DB" into columns and collapse the string values with ";":
out.dat <- data.frame(ID = c("A1", "B1"),
                      bio = c("IPR1;IPR2", "IPR7;IPR8"),
                      func = c("s43", "q87"),
                      loc = c("333-456", "566-900"))
> out.dat
  ID       bio func     loc
1 A1 IPR1;IPR2  s43 333-456
2 B1 IPR7;IPR8  q87 566-900
I've played around with pivot_wider and group_by using dplyr but am not quite getting what I want, since a group can have multiple values per ID that I want to collapse into one cell (e.g., "IPR1;IPR2").
Any solution would be appreciated!
pivot_wider in recent tidyr versions takes an argument values_fn for a function that aggregates values before reshaping. This lets you do your operation in one function call.
library(tidyr)
in.dat %>%
  pivot_wider(names_from = DB, values_from = val,
              values_fn = list(val = ~paste(., collapse = ";")))
#> # A tibble: 2 x 4
#> ID bio func loc
#> <fct> <chr> <chr> <chr>
#> 1 A1 IPR1;IPR2 s43 333-456
#> 2 B1 IPR7;IPR8 q87 566-900
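As an aside (my addition, not part of the original answer), later tidyr versions also accept a single function for values_fn, so a plain function works as well:
in.dat %>%
  pivot_wider(names_from = DB, values_from = val,
              values_fn = function(x) paste(x, collapse = ";"))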
We can collapse val by ID and DB and then use pivot_wider.
library(dplyr)
in.dat %>%
  group_by(ID, DB) %>%
  summarise(val = paste0(val, collapse = ";")) %>%
  tidyr::pivot_wider(names_from = DB, values_from = val)
# ID bio func loc
# <fct> <chr> <chr> <chr>
#1 A1 IPR1;IPR2 s43 333-456
#2 B1 IPR7;IPR8 q87 566-900
You can use dcast to do this.
in.dat <- data.frame(ID = c("A1", "A1", "A1", "A1", "B1", "B1", "B1", "B1"),
                     DB = rep(c("bio", "bio", "func", "loc"), 2),
                     val = c("IPR1", "IPR2", "s43", "333-456",
                             "IPR7", "IPR8", "q87", "566-900"))
library(reshape2)
dcast(in.dat, ID ~ DB, paste0, collapse = ";")
# ID bio func loc
#1 A1 IPR1;IPR2 s43 333-456
#2 B1 IPR7;IPR8 q87 566-900
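The same call pattern should also work with data.table's dcast (a hedged aside of mine, not from the original answer):
library(data.table)
dcast(as.data.table(in.dat), ID ~ DB, fun.aggregate = paste0, collapse = ";")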
We can also use spread with str_c:
library(dplyr)
library(tidyr)
library(stringr)
in.dat %>%
  group_by(ID, DB) %>%
  summarise(val = str_c(val, collapse = ";")) %>%
  spread(DB, val)
# A tibble: 2 x 4
# Groups: ID [2]
# ID bio func loc
# <fct> <chr> <chr> <chr>
#1 A1 IPR1;IPR2 s43 333-456
#2 B1 IPR7;IPR8 q87 566-900

How to create new variables from one variable using two rules

I would appreciate any help to create new variables from one variable.
Specifically, I need help to simultaneously create one row per ID and several columns of E, where each of the new columns of E (that is, E1, E2, E3) contains the values of E for each row of ID. I tried melt followed by spread, but I am getting the error:
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
Additionally, I tried the solutions discussed here and here, but these did not work for my case because I need to be able to create row identifiers for rows (4, 1, 2), (7, 3, 5), and (9, 6, 8). That is, E for rows (4, 1, 2) should be named E1, E for rows (7, 3, 5) should be named E2, E for rows (9, 6, 8) should be named E3, and so on.
#data
dT<-structure(list(A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1",
"a2", "a1"), B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1",
"b2", "b1"), ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"
), E = c(0.621142094943352, 0.742109450696123, 0.39439152996948,
0.40694392882818, 0.779607277916503, 0.550579323666347, 0.352622183880119,
0.690660491345867, 0.23378944873769)), class = c("data.table",
"data.frame"), row.names = c(NA, -9L))
#my attempt
A B ID E
1: a1 b2 3 0.6211421
2: a2 b2 4 0.7421095
3: a1 b2 3 0.3943915
4: a1 b1 1 0.4069439
5: a2 b2 4 0.7796073
6: a1 b2 3 0.5505793
7: a1 b1 1 0.3526222
8: a2 b2 4 0.6906605
9: a1 b1 1 0.2337894
aTempDF <- melt(dT, id.vars = c("A", "B", "ID"))
A B ID variable value
1: a1 b2 3 E 0.6211421
2: a2 b2 4 E 0.7421095
3: a1 b2 3 E 0.3943915
4: a1 b1 1 E 0.4069439
5: a2 b2 4 E 0.7796073
6: a1 b2 3 E 0.5505793
7: a1 b1 1 E 0.3526222
8: a2 b2 4 E 0.6906605
9: a1 b1 1 E 0.2337894
aTempDF %>% spread(variable, value)
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
#expected output
A B ID E1 E2 E3
1: a1 b2 3 0.6211421 0.3943915 0.5505793
2: a2 b2 4 0.7421095 0.7796073 0.6906605
3: a1 b1 1 0.4069439 0.3526222 0.2337894
Thanks in advance for any help.
You can use dcast from data.table
library(data.table)
dcast(dT, A + B + ID ~ paste0("E", rowid(ID)))
# A B ID E1 E2 E3
#1 a1 b1 1 0.4069439 0.3526222 0.2337894
#2 a1 b2 3 0.6211421 0.3943915 0.5505793
#3 a2 b2 4 0.7421095 0.7796073 0.6906605
You need to create the correct 'time variable' first, which is what rowid(ID) does.
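For reference (an illustration I've added, not part of the original answer), rowid() returns a within-group occurrence counter:
library(data.table)
rowid(dT$ID)
#> [1] 1 1 2 1 2 3 2 3 3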
For those looking for a tidyverse solution:
library(tidyverse)
dT <- structure(
  list(
    A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1", "a2", "a1"),
    B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1", "b2", "b1"),
    ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"),
    E = c(0.621142094943352, 0.742109450696123, 0.39439152996948,
          0.40694392882818, 0.779607277916503, 0.550579323666347,
          0.352622183880119, 0.690660491345867, 0.23378944873769)),
  class = c("data.table",
            "data.frame"),
  row.names = c(NA, -9L))
dT %>%
  as_tibble() %>% # since the dataset is a data.table object
  group_by(A, B, ID) %>%
  # just so columns are "E1", "E2", etc.
  mutate(rn = glue::glue("E{row_number()}")) %>%
  ungroup() %>%
  spread(rn, E) %>%
  # not necessary, just making output in the same order as your expected output
  arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234
As mentioned in the accepted answer, you need a "key" variable to spread on first. This is created using row_number() and glue, where glue just gives you the proper E1, E2, etc. variable names.
The group_by piece makes sure that the row numbers are with respect to A, B, and ID.
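A quick illustration of the glue step (my addition): glue vectorizes over the interpolated values, so one call yields all the names.
glue::glue("E{1:3}")
#> E1
#> E2
#> E3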
EDIT for tidyr >= 1.0.0
The (not-so) new pivot_ functions supersede gather and spread and eliminate the need to glue the new variable names together in a mutate.
dT %>%
  as_tibble() %>% # since the dataset is a data.table object
  group_by(A, B, ID) %>%
  # no longer need to glue (or paste) the names together, but still need a row number
  mutate(rn = row_number()) %>%
  ungroup() %>%
  # the names_glue argument allows easy transforming of the new variable names
  pivot_wider(names_from = rn, values_from = E, names_glue = "E{rn}") %>%
  # not necessary, just making output in the same order as your expected output
  arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234

Pipe output of one data.frame to another using dplyr

I have two data.frames: one is a look-up table that tells me the set of products included in each group. Each group has at least one product of Type 1 and one of Type 2.
The second data.frame tells me details about the transaction. Each transaction can have one of the following products:
a) Only products of Type 1 from one of the groups
b) Only products of Type 2 from one of the groups
c) Product of Type 1 and Type 2 from the same group
For my analysis, I am interested in finding out c) above, i.e. how many transactions have products of Type 1 and Type 2 from the same group. We will ignore a transaction altogether if a product of Type 1 and a product of Type 2 from different groups are sold in that transaction.
Thus, each product of Type 1 or Type 2 MUST belong to the same group.
Here's my look up table:
> P_Lookup
Group ProductID1 ProductID2
Group1 A 1
Group1 B 2
Group1 B 3
Group2 C 4
Group2 C 5
Group2 C 6
Group3 D 7
Group3 C 8
Group3 C 9
Group4 E 10
Group4 F 11
Group4 G 12
Group5 H 13
Group5 H 14
Group5 H 15
For instance, I won't have Product G and Product 15 in one transaction because they belong to different group.
Here are the transactions:
TransactionID ProductID ProductType
a1 A 1
a1 B 1
a1 1 2
a2 C 1
a2 4 2
a2 5 2
a3 D 1
a3 C 1
a3 7 2
a3 8 2
a4 H 1
a5 1 2
a5 2 2
a5 3 2
a5 3 2
a5 1 2
a6 H 1
a6 15 2
My Code:
Now, I was able to write code using dplyr for shortlisting transactions from one group. However, I am not sure how I can vectorize my code for all groups.
Here's my code:
P_Groups <- unique(P_Lookup$Group)
Chosen_Group <- P_Groups[5]
P_Group_Ind <- P_Trans %>%
  group_by(TransactionID) %>%
  dplyr::filter((ProductID %in% unique(P_Lookup[P_Lookup$Group == Chosen_Group, ]$ProductID1)) |
                (ProductID %in% unique(P_Lookup[P_Lookup$Group == Chosen_Group, ]$ProductID2))) %>%
  mutate(No_of_PIDs = n_distinct(ProductType)) %>%
  mutate(Group_Name = Chosen_Group)
P_Group_Ind <- P_Group_Ind[P_Group_Ind$No_of_PIDs > 1, ]
This works well as long as I manually select each group, i.e. by setting Chosen_Group, but I am not sure how to automate this. One way would be a for loop, but I know that the beauty of R is vectorization, so I want to stay away from for loops.
I'd sincerely appreciate any help; I have spent almost two days on this. I looked at "using dplyr in for loop in r", but it seems that thread is talking about a different issue.
DATA:
Here's dput for P_Trans:
structure(list(TransactionID = c("a1", "a1", "a1", "a2", "a2",
"a2", "a3", "a3", "a3", "a3", "a4", "a5", "a5", "a5", "a5", "a5",
"a6", "a6"), ProductID = c("A", "B", "1", "C", "4", "5", "D",
"C", "7", "8", "H", "1", "2", "3", "3", "1", "H", "15"), ProductType = c(1,
1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2)), .Names = c("TransactionID",
"ProductID", "ProductType"), row.names = c(NA, 18L), class = "data.frame")
Here's dput for P_Lookup:
structure(list(Group = c("Group1", "Group1", "Group1", "Group2",
"Group2", "Group2", "Group3", "Group3", "Group3", "Group4", "Group4",
"Group4", "Group5", "Group5", "Group5"), ProductID1 = c("A",
"B", "B", "C", "C", "C", "D", "C", "C", "E", "F", "G", "H", "H",
"H"), ProductID2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15)), .Names = c("Group", "ProductID1", "ProductID2"), row.names = c(NA,
15L), class = "data.frame")
Here's the dput() after adding a product to P_Trans that doesn't exist in the look-up table:
structure(list(TransactionID = c("a1", "a1", "a1", "a2", "a2",
"a2", "a3", "a3", "a3", "a3", "a4", "a5", "a5", "a5", "a5", "a5",
"a6", "a6", "a7"), ProductID = c("A", "B", "1", "C", "4", "5",
"D", "C", "7", "8", "H", "1", "2", "3", "3", "1", "H", "15",
"22"), ProductType = c(1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2,
2, 2, 2, 1, 2, 3)), .Names = c("TransactionID", "ProductID",
"ProductType"), row.names = c(NA, 19L), class = "data.frame")
Below is a tidyverse (dplyr, tidyr, and purrr) solution that I hope will help.
Note that the use of map_df in the last line returns all results as a data frame. If you'd prefer it to be a list object for each group, then simply use map.
library(dplyr)
library(tidyr)
library(purrr)
# Save unique groups for later use
P_Groups <- unique(P_Lookup$Group)
# Convert lookup table to product IDs and Groups
P_Lookup <- P_Lookup %>%
  gather(ProductIDn, ProductID, ProductID1, ProductID2) %>%
  select(ProductID, Group) %>%
  distinct() %>%
  nest(-ProductID, .key = Group)

# Bind Group information to transactions
# and group for next analysis
P_Trans <- P_Trans %>%
  left_join(P_Lookup) %>%
  filter(!map_lgl(Group, is.null)) %>%
  unnest(Group) %>%
  group_by(TransactionID)

# Iterate through Groups to produce results
map(P_Groups, ~ filter(P_Trans, Group == .)) %>%
  map(~ mutate(., No_of_PIDs = n_distinct(ProductType))) %>%
  map_df(~ filter(., No_of_PIDs > 1))
#> Source: local data frame [12 x 5]
#> Groups: TransactionID [4]
#>
#> TransactionID ProductID ProductType Group No_of_PIDs
#> <chr> <chr> <dbl> <chr> <int>
#> 1 a1 A 1 Group1 2
#> 2 a1 B 1 Group1 2
#> 3 a1 1 2 Group1 2
#> 4 a2 C 1 Group2 2
#> 5 a2 4 2 Group2 2
#> 6 a2 5 2 Group2 2
#> 7 a3 D 1 Group3 2
#> 8 a3 C 1 Group3 2
#> 9 a3 7 2 Group3 2
#> 10 a3 8 2 Group3 2
#> 11 a6 H 1 Group5 2
#> 12 a6 15 2 Group5 2
Here is a single pipe dplyr solution:
P_DualGroupTransactions <-
  P_Lookup %>%                                         # data needing a single-column map of keys
  gather(IDnum, ProductID, ProductID1:ProductID2) %>%  # long single map of keys to Group (tidyr)
  right_join(P_Trans) %>%                              # join transactions to group info
  group_by(TransactionID, Group) %>%                   # organize by same transaction & same group
  mutate(DualGroup = n_distinct(ProductType) == 2) %>% # flag groups with both product types in one transaction
  filter(DualGroup) %>%                                # keep only the dual-type cases
  select(TransactionID, Group) %>%                     # remove excess columns
  distinct()                                           # remove excess rows

P_DualGroupTransactionsCount <- nrow(P_DualGroupTransactions) # count of unique transaction IDs
# P_DualGroupTransactions
# Source: local data frame [4 x 2]
# Groups: TransactionID, Group [4]
#
#   TransactionID  Group
#           <chr>  <chr>
# 1            a1 Group1
# 2            a2 Group2
# 3            a3 Group3
# 4            a6 Group5

# P_DualGroupTransactionsCount
# [1] 4
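For comparison, a compact variant of the same idea (a sketch of mine, not from the original answers): join the long-format lookup to the transactions and count the transaction/group pairs that contain both product types.
library(dplyr)
library(tidyr)

P_Lookup %>%
  gather(IDnum, ProductID, ProductID1:ProductID2) %>%
  mutate(ProductID = as.character(ProductID)) %>% # ProductID2 is numeric in the dput
  inner_join(P_Trans, by = "ProductID") %>%
  group_by(TransactionID, Group) %>%
  summarise(both = n_distinct(ProductType) == 2, .groups = "drop") %>%
  filter(both) %>%
  nrow()
# [1] 4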
