I have a data frame (df) with the turnover for each subgroup of a main group. Now I want to get the two main groups with the highest total turnover.
# Example data: 5 main groups (A-E), each with 5 subgroups (1-5) and a
# normally distributed turnover (mean 100, sd 15). No seed is set here, so
# the turnover values differ between runs.
df <- data.frame(
  grp = factor(rep(c("A", "B", "C", "D", "E"), each = 5)),
  sub_grp = factor(rep(1:5, times = 5)),
  turnover = rnorm(n = 25, mean = 100, sd = 15)
)
> df
grp sub_grp turnover
1 A 1 98.14430
2 A 2 107.90811
3 A 3 103.93973
4 A 4 95.78222
5 A 5 63.19635
6 B 1 97.85688
7 B 2 92.65572
8 B 3 86.02872
9 B 4 101.88177
10 B 5 120.66959
11 C 1 125.93533
12 C 2 98.49771
13 C 3 77.28770
14 C 4 101.44822
15 C 5 107.08171
16 D 1 77.73252
17 D 2 107.49374
18 D 3 87.46436
19 D 4 101.49984
20 D 5 99.13047
21 E 1 91.48636
22 E 2 115.63716
23 E 3 99.34567
24 E 4 104.65408
25 E 5 121.41820
I know how to get the two main groups with the highest turnover, but not how to keep the subgroup rows, with turnover still split by subgroup.
# Total turnover per main group, keeping only the two largest totals.
# `wt` is given explicitly so top_n() does not have to guess the ranking
# column (it would otherwise pick the last column and emit a message).
df %>%
  group_by(grp) %>%
  summarise(total.turnover = sum(turnover)) %>%
  top_n(n = 2, wt = total.turnover)
grp total.turnover
(fctr) (dbl)
1 C 510.2507
2 E 532.5415
Result I want from this example.
grp sub_grp turnover
1 C 1 125.93533
2 C 2 98.49771
3 C 3 77.28770
4 C 4 101.44822
5 C 5 107.08171
6 E 1 91.48636
7 E 2 115.63716
8 E 3 99.34567
9 E 4 104.65408
10 E 5 121.41820
Here are a couple of different approaches with dplyr.
Rejoining the original object
# 1. Sum turnover per main group and keep the two largest totals.
# 2. Join back to the original rows so every subgroup of those groups returns.
# `wt` is explicit so the ranking column does not depend on column order.
df %>%
  group_by(grp) %>%
  summarise(total.turnover = sum(turnover)) %>%
  top_n(n = 2, wt = total.turnover) %>%
  inner_join(df, by = "grp") %>%
  select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
Using a windowing function (dense_rank)
# Attach each group's total to its rows, rank the totals over the whole table
# (dense ranking, largest total = rank 1) and keep the rows ranked 1 or 2.
df %>%
  group_by(grp) %>%
  mutate(total.turnover = sum(turnover)) %>%
  ungroup() %>%
  mutate(grp.rank = dense_rank(desc(total.turnover))) %>%
  filter(grp.rank <= 2) %>%
  select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
Using data.table (similar to the dplyr windowing function approach)
library(data.table)
dt <- data.table(df)
# Per-group total, added by reference to every row of the group.
dt[, total.turnover := sum(turnover), by = grp]
# Dense rank of the totals over the whole table; largest total gets rank 1.
dt[, rank := frank(-total.turnover, ties.method = "dense")]
# Rows belonging to the two best-ranked groups.
dt[rank <= 2, .(grp, sub_grp, turnover)]
# grp sub_grp turnover
# 1: A 1 91.59287
# 2: A 2 96.54734
# 3: A 3 123.38062
# 4: A 4 101.05763
# 5: A 5 101.93932
# 6: C 1 118.36123
# 7: C 2 105.39721
# 8: C 3 106.01157
# 9: C 4 101.66024
# 10: C 5 91.66238
library(dplyr)
# Reproducible example data: fixing the seed makes rnorm() return the same
# 25 turnover values on every run.
set.seed(123)
df <- data.frame(
  grp = rep(factor(c("A", "B", "C", "D", "E")), each = 5),
  sub_grp = rep(factor(1:5), times = 5),
  turnover = rnorm(n = 25, mean = 100, sd = 15)
)
One option is dplyr where we use filter on the summarised output object
# Build the summarised object first: one row per main group, restricted to the
# two groups with the largest total turnover.
df1 <- df %>%
  group_by(grp) %>%
  summarise(val = sum(turnover)) %>%
  top_n(2, wt = val)
# Keep only the original rows whose group appears in that summary.
df %>%
  filter(grp %in% df1$grp)
where 'df1' is the summarised output object
Or if we wanted in the same chain
# Same idea in one chain: summarise, keep the top two totals, then use the
# summary as a filter on the original data. `.` is the summary, so
# semi_join(df, .) returns the rows of df that have a matching grp.
# The ranking column and the join key are spelled out instead of being guessed.
df %>%
  group_by(grp) %>%
  summarise(val = sum(turnover)) %>%
  top_n(2, wt = val) %>%
  semi_join(df, ., by = "grp")
# grp sub_grp turnover
#1 C 1 125.93533
#2 C 2 98.49771
#3 C 3 77.28770
#4 C 4 101.44822
#5 C 5 107.08171
#6 E 1 91.48636
#7 E 2 115.63716
#8 E 3 99.34567
#9 E 4 104.65408
#10 E 5 121.41820
Or another one-line option is data.table
library(data.table)
# setDT() converts df to a data.table by reference. The inner expression sums
# turnover per group, orders the totals descending and takes the first two
# group labels; the outer subset keeps the rows of those groups.
setDT(df)[
  grp %in% df[, .(total = sum(turnover)), by = grp][order(-total), head(grp, 2)]
]
# grp sub_grp turnover
# 1: C 1 125.93533
# 2: C 2 98.49771
# 3: C 3 77.28770
# 4: C 4 101.44822
# 5: C 5 107.08171
# 6: E 1 91.48636
# 7: E 2 115.63716
# 8: E 3 99.34567
# 9: E 4 104.65408
#10: E 5 121.41820
Or we can do this easily with base R
# xtabs() sums turnover per group, sort() orders the totals ascending, and
# tail(n = 2) keeps the two largest; their names select the rows to keep.
subset(
  df,
  grp %in% names(tail(sort(xtabs(turnover ~ grp, data = df)), n = 2))
)
# grp sub_grp turnover
#11 C 1 125.93533
#12 C 2 98.49771
#13 C 3 77.28770
#14 C 4 101.44822
#15 C 5 107.08171
#21 E 1 91.48636
#22 E 2 115.63716
#23 E 3 99.34567
#24 E 4 104.65408
#25 E 5 121.41820
data
# Example data as a plain data.frame: 5 main groups x 5 subgroups with one
# turnover value each. Built with data.frame() so no leftover data.table
# `index` attribute is attached to the object.
df <- data.frame(
  grp = rep(c("A", "B", "C", "D", "E"), each = 5L),
  sub_grp = rep(1:5, times = 5L),
  turnover = c(
    98.1443, 107.90811, 103.93973, 95.78222, 63.19635,
    97.85688, 92.65572, 86.02872, 101.88177, 120.66959,
    125.93533, 98.49771, 77.2877, 101.44822, 107.08171,
    77.73252, 107.49374, 87.46436, 101.49984, 99.13047,
    91.48636, 115.63716, 99.34567, 104.65408, 121.4182
  ),
  stringsAsFactors = FALSE
)
Related
I have this dataframe:
# Input data: one row per observed (id, var) combination.
df <- data.frame(
  id = c(1L, 1L, 1L, 2L, 2L, 3L),
  var = c("A", "B", "C", "B", "C", "C"),
  stringsAsFactors = FALSE
)
id var
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 3 C
I would like to get this dataframe:
id var
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C
I would like to learn how to use complete or expand.grid in this situation
I have tried several ways but was not successful: One of my tries:
# Attempt from the question: complete() is called directly on id and var.
# NOTE(review): `fill` is passed an unnamed list here; tidyr documents `fill`
# as a named list of per-column replacement values - confirm against the docs.
df %>%
complete(id, var, fill=list(NA))
Create a duplicate of the 'var' column and call complete() on the duplicate instead of on 'var' itself. The rows that complete() adds then have NA in 'var'; finally, remove the duplicate column.
library(dplyr)
library(tidyr)
df %>%
  # var1 is a copy of var that serves as the key to complete on.
  mutate(var1 = var) %>%
  # Expand to every (id, var1) pair; newly created rows get NA in var.
  complete(id, var1) %>%
  # The helper key is no longer needed.
  select(-var1)
-output
# A tibble: 9 × 2
id var
<int> <chr>
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C
The solution to this simple problem has eluded me for several hours. I have a data table in which a value is identified by several classification variables (A, B, L). Where there are observations characterized by duplicate classification variables A & B, I want to retain the one that has the highest 'L'. So, if I have a table generated with this code
# Example table: classification columns A, B and L plus a random count `val`.
# The seed is fixed before rnbinom() draws the 10 values.
set.seed(17)
DT <- data.table(
  A = rep(c("a", "b"), each = 5),
  B = c("a", "b", "c", "d", "d", "a", "b", "b", "c", "d"),
  L = c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1),
  val = rnbinom(n = 10, size = 2, mu = 3)
)
Making the following:
A B L val
1: a a 1 1
2: a b 1 10
3: a c 1 3
4: a d 1 5
5: a d 2 2
6: b a 1 8
7: b b 1 7
8: b b 2 1
9: b c 1 2
10: b d 1 2
I have tried commands such as
# Attempt from the question: key DT by A, B and L, then try to keep the last
# row of each group through `mult`.
# NOTE(review): no `i` (join) argument is supplied in this call, and `mult` is
# documented as choosing among matches of `i` - presumably why it has no
# effect here; confirm against the data.table documentation.
setkey(DT,A,B,L)
DT[ , .(A,B,L,val) , mult="last"]
but I'm just not getting something.
I want a resulting table that looks like this
A B L val
1: a a 1 1
2: a b 1 10
3: a c 1 3
5: a d 2 2
6: b a 1 8
8: b b 2 1
9: b c 1 2
10: b d 1 2
# For every (A, B) pair keep the row with the highest L. Rows are ordered by L
# first, so last() picks the maximum regardless of how DT is currently sorted;
# keyby returns the result sorted by A and B.
DT[order(L), lapply(.SD, last), keyby = .(A, B)]
This should also work, and it seems to be a bit faster than the merge solution.
solution option
library(data.table)
# Example data: classification columns A, B, L and a value column.
dt <- structure(list(A = c("a", "a", "a", "a", "a", "b", "b", "b",
"b", "b"), B = c("a", "b", "c", "d", "d", "a", "b", "b", "c",
"d"), L = c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L), val = c(1L,
10L, 3L, 5L, 2L, 8L, 7L, 1L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-10L))
# Convert to a data.table by reference.
setDT(dt)
# Highest L per (A, B) pair, merged back on the shared columns (A, B, L) to
# recover the matching rows. max() is used rather than last() so the result
# does not depend on the rows already being sorted by L within each pair.
merge(dt[, list(L = max(L)), by = list(A, B)], dt)
#> A B L val
#> 1: a a 1 1
#> 2: a b 1 10
#> 3: a c 1 3
#> 4: a d 2 2
#> 5: b a 1 8
#> 6: b b 2 1
#> 7: b c 1 2
#> 8: b d 1 2
Created on 2021-03-24 by the reprex package (v1.0.0)
set.seed(17)
library(data.table)
# Example table; `val` is drawn after the seed is fixed.
DT <- data.table(
  A = rep(c("a", "b"), times = c(5, 5)),
  B = c("a", "b", "c", "d", "d", "a", "b", "b", "c", "d"),
  L = c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1),
  val = rnbinom(n = 10, size = 2, mu = 3)
)
# Inner call: for each (A, B) pair, the row numbers (.I) where L equals the
# pair's maximum. Outer call: subset DT to exactly those rows.
result <- DT[DT[, .I[L == max(L)], by = .(A, B)][["V1"]]]
> result
A B L val
1: a a 1 1
2: a b 1 1
3: a c 1 3
4: a d 2 12
5: b a 1 6
6: b b 2 2
7: b c 1 3
8: b d 1 5
Here's how I'd do it (without mult)
# Sort by L descending, then take the first row of every (A, B) group.
DT[order(-L), head(.SD, 1L), by = .(A, B)]
With mult, something like this would do it - note that I'm doing an actual join here:
# Join the L-sorted table against the distinct (A, B) pairs; for each pair,
# mult = "last" keeps only the last matching row.
DT[order(L)][
  unique(DT[, .(A, B)]),
  on = c("A", "B"),
  mult = "last"
]
#> A B L val
#> 1: a a 1 1
#> 2: a b 1 1
#> 3: a c 1 3
#> 4: a d 2 12
#> 5: b a 1 6
#> 6: b b 2 2
#> 7: b c 1 3
#> 8: b d 1 5
I have a dataframe in R that has a large number of bank_account_IDs and Vendor_Codes. Bank_account_IDs should not be shared between Vendor_Codes, but sometimes a fraudulent vendor exists that shares another vendor's bank_account_ID.
I want to add a new field to the dataframe that provides a count for the number of times an account_ID exists with more than 1 Vendor_Code.
My sample dataframe is as follows:
# Sample data: account ids are character labels (quoted), vendor codes are
# numeric, and the data frame is built from these two vectors.
bank_account_ID <- c("a", "b", "c", "a", "a", "d", "e", "f", "b", "c")
Vendor_Code <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
df <- data.frame(bank_account_ID, Vendor_Code, stringsAsFactors = FALSE)
My ideal new dataframe should look something like this:
bank_account_ID Vendor_Code duplicate_count
a 1 2
b 2 1
c 3 1
a 4 2
a 5 2
d 6 0
e 7 0
f 8 0
b 9 1
c 10 1
Thanks in advance!
We can get the number of distinct elements with n_distinct grouped by the 'bank_account_ID' and subtract 1
library(dplyr)
# For each account, count the distinct vendor codes that use it and subtract
# one, so an account used by a single vendor gets 0.
df %>%
  group_by(bank_account_ID) %>%
  mutate(dupe_count = n_distinct(Vendor_Code) - 1) %>%
  ungroup()
-output
# A tibble: 10 x 4
# bank_account_ID Vendor_Code duplicate_count dupe_count
# <chr> <int> <int> <dbl>
# 1 a 1 2 2
# 2 b 2 1 1
# 3 c 3 1 1
# 4 a 4 2 2
# 5 a 5 2 2
# 6 d 6 0 0
# 7 e 7 0 0
# 8 f 8 0 0
# 9 b 9 1 1
#10 c 10 1 1
data
# Sample data including the expected `duplicate_count` column from the question.
df <- data.frame(
  bank_account_ID = c("a", "b", "c", "a", "a", "d", "e", "f", "b", "c"),
  Vendor_Code = 1:10,
  duplicate_count = c(2L, 1L, 1L, 2L, 2L, 0L, 0L, 0L, 1L, 1L),
  stringsAsFactors = FALSE
)
I have a dataframe of the following type
ID case1 case2 case3 case4
1 A B C D
2 B A
3 E F
4 G C A
5 T
I need to change its format, to a long shape, similar as the below:
ID col1 col2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 B A
3 E F
4 G C
4 G A
4 C A
5 T
As you can see, I need to maintain the ID and ignore empty columns. There are some cases like T that need to remain in the dataset, but without a col2.
I am honestly not sure how to approach this, so that is why there are no examples of what I have tried.
You can get the data in long format and create all combination of values for each ID if the number of rows is greater than 1 in that ID.
library(dplyr)
library(tidyr)
df %>%
  # One row per non-missing case value.
  pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
  group_by(ID) %>%
  # Per ID, build a small data frame: every pair of values when there are at
  # least two, otherwise the single value with a missing col2.
  summarise(
    value = list(
      if (n() > 1) {
        setNames(as.data.frame(t(combn(value, 2))), c("col1", "col2"))
      } else {
        data.frame(col1 = value[1], col2 = NA_character_)
      }
    )
  ) %>%
  # Expand the per-ID data frames into regular rows and columns.
  unnest(value)
# A tibble: 12 x 3
# ID col1 col2
# <int> <chr> <chr>
# 1 1 A B
# 2 1 A C
# 3 1 A D
# 4 1 B C
# 5 1 B D
# 6 1 C D
# 7 2 B A
# 8 3 E F
# 9 4 G C
#10 4 G A
#11 4 C A
#12 5 T NA
data
# Wide input: one row per ID with up to four case columns (NA = empty cell).
df <- data.frame(
  ID = 1:5,
  case1 = c("A", "B", "E", "G", "T"),
  case2 = c("B", "A", "F", "C", NA),
  case3 = c("C", NA, NA, "A", NA),
  case4 = c("D", NA, NA, NA, NA),
  stringsAsFactors = FALSE
)
I have this data frame:
# Four ids with four status observations each; NA marks a missing status.
df <- data.frame(
  id = rep(seq_len(4), each = 4),
  status = c(NA, "a", "c", "a", NA, "b", "c", "c",
             NA, NA, "a", "c", NA, NA, "b", "b"),
  stringsAsFactors = FALSE
)
For each group (id), I aim to remove the rows with one or multiple leading NA in front of an "a" (in the column "status") but not in front of a "b".
The final data frame should look like this:
# Expected result: the NA rows that precede an "a" are gone (ids 1 and 3),
# while the NA rows that precede a "b" stay (ids 2 and 4).
data.frame(
  id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L),
  status = c("a", "c", "a", NA, "b", "c", "c", "a", "c", NA, NA, "b", "b"),
  stringsAsFactors = FALSE
)
How do I do that?
Edit: alternatively, how would I do it to preserve other variables in the data frame such as the variable otherVar in the following example:
# Same ids and statuses as before, plus an extra column (one letter per row)
# that has to survive the row removal.
df2 <- data.frame(
  id = rep(seq_len(4), each = 4),
  status = c(NA, "a", "c", "a", NA, "b", "c", "c",
             NA, NA, "a", "c", NA, NA, "b", "b"),
  otherVar = letters[seq_len(16)],
  stringsAsFactors = FALSE
)
We can group by 'id', summarise 'status' by pasting its elements together into one string, then use gsub to remove the NAs that come directly before an 'a', and finally convert the result back to 'long' format with separate_rows.
library(dplyr)
library(tidyr)
df %>%
  group_by(id) %>%
  # Collapse each id's statuses into "x, y, z" and delete every run of "NA, "
  # that is directly followed by an "a" (lookahead, hence perl = TRUE).
  summarise(
    status = gsub(
      "(NA, ){1,}(?=a)", "",
      paste(status, collapse = ", "),
      perl = TRUE
    )
  ) %>%
  # Split the string back into one row per value, converting column types.
  separate_rows(status, convert = TRUE)
# A tibble: 13 x 2
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 NA
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
#10 4 NA
#11 4 NA
#12 4 b
#13 4 b
Or using data.table with the same methodology
library(data.table)
# Per id: collapse the statuses into one string, strip every run of "NA, "
# that directly precedes an "a", and split back into one row per value.
out1 <- setDT(df)[, {
  kept <- strsplit(
    gsub("(NA, ){1,}(?=a)", "", toString(status), perl = TRUE),
    ", "
  )[[1]]
  # toString() turned missing values into the literal text "NA"; restore them
  # as real missing values so the column is not polluted with "NA" strings.
  .(status = replace(kept, kept == "NA", NA_character_))
}, by = id]
out1[]
# id status
# 1: 1 a
# 2: 1 c
# 3: 1 a
# 4: 2 NA
# 5: 2 b
# 6: 2 c
# 7: 2 c
# 8: 3 a
# 9: 3 c
#10: 4 NA
#11: 4 NA
#12: 4 b
#13: 4 b
Update
For the updated dataset 'df2'
# Row numbers (in df2) of the leading NA statuses of every id whose first
# non-missing status is "a".
i1 <- setDT(df2)[, {
  # TRUE for the rows before the first non-missing status of this id.
  leading_na <- cumsum(!is.na(status)) == 0
  # First non-missing status (NA when the whole group is missing).
  first_seen <- status[!leading_na][1]
  .I[leading_na & first_seen %in% "a"]
}, by = id]$V1
# Drop those rows with a logical mask: negative indexing with an empty i1
# would select zero rows instead of keeping everything.
df2[!(seq_len(nrow(df2)) %in% i1)]
# id status otherVar
# 1: 1 a b
# 2: 1 c c
# 3: 1 a d
# 4: 2 NA e
# 5: 2 b f
# 6: 2 c g
# 7: 2 c h
# 8: 3 a k
# 9: 3 c l
#10: 4 NA m
#11: 4 NA n
#12: 4 b o
#13: 4 b p
Using na.locf from zoo together with is.na; note that this assumes your data is ordered.
# For every row, look up the next non-missing status within the same id.
# na.rm = FALSE keeps NAs that cannot be filled, so the result has the same
# length as the input; ave() applies the fill separately per id so values do
# not leak across group boundaries.
next_status <- ave(df$status, df$id, FUN = function(s) {
  zoo::na.locf(s, fromLast = TRUE, na.rm = FALSE)
})
# Drop the NA rows whose next observed status is "a"; %in% never returns NA,
# so no NA row indices are produced.
df[!(is.na(df$status) & next_status %in% "a"), ]
id status
2 1 a
3 1 c
4 1 a
5 2 <NA>
6 2 b
7 2 c
8 2 c
11 3 a
12 3 c
13 4 <NA>
14 4 <NA>
15 4 b
16 4 b
Here's a dplyr solution and a not as pretty base translation :
dplyr
library(dplyr)
# Within each id, drop a row only when its status is missing AND the first
# non-missing status of the id is "a".
df %>%
  group_by(id) %>%
  filter(!(is.na(status) & status[!is.na(status)][1] == "a"))
# # A tibble: 13 x 2
# # Groups: id [4]
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 <NA>
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
# 10 4 <NA>
# 11 4 <NA>
# 12 4 b
# 13 4 b
base
# Split by id, filter each piece with the same rule as the dplyr version
# (drop NA rows when the id's first non-missing status is "a"), then stack
# the pieces back together.
do.call(
  rbind,
  lapply(split(df, df$id), function(piece) {
    first_seen <- piece$status[!is.na(piece$status)][1]
    drop_row <- is.na(piece$status) & first_seen == "a"
    piece[!drop_row, ]
  })
)
# id status
# 1.2 1 a
# 1.3 1 c
# 1.4 1 a
# 2.5 2 <NA>
# 2.6 2 b
# 2.7 2 c
# 2.8 2 c
# 3.11 3 a
# 3.12 3 c
# 4.13 4 <NA>
# 4.14 4 <NA>
# 4.15 4 b
# 4.16 4 b
note
This will fail if not all NAs are leading, because it removes all NAs from any group whose first non-NA value is "a".