How do I eliminate duplicates in data.table using mult - r

The solution to this simple problem has eluded me for several hours. I have a data table in which a value is identified by several classification variables (A, B, L). Where there are observations characterized by duplicate classification variables A & B, I want to retain the one that has the highest 'L'. So, if I have a table generated with this code
set.seed(17)
DT <- data.table(A=rep(c("a","b"),each=5),
B=c("a","b","c","d","d","a","b","b","c","d"),
L=c(1,1,1,2,1,1,1,2,1,1),
val=rnbinom(10, size=2, mu=3))
Making the following:
A B L val
1: a a 1 1
2: a b 1 10
3: a c 1 3
4: a d 1 5
5: a d 2 2
6: b a 1 8
7: b b 1 7
8: b b 2 1
9: b c 1 2
10: b d 1 2
I have tried commands such as
setkey(DT,A,B,L)
DT[ , .(A,B,L,val) , mult="last"]
but I'm just not getting something.
I want a resulting table that looks like this
A B L val
1: a a 1 1
2: a b 1 10
3: a c 1 3
5: a d 2 2
6: b a 1 8
8: b b 2 1
9: b c 1 2
10: b d 1 2

# Take the last row within each (A, B) group; relies on the earlier
# setkey(DT, A, B, L) so that "last" is the row with the highest L.
# (Original had a stray trailing ")" which made it a syntax error.)
DT[, lapply(.SD, last), .(A, B)]
should also work and seems to be a bit faster than the merge solution

solution option
library(data.table)
# Reconstruct the example data (rows already ordered by A, B, L).
dt <- structure(list(A = c("a", "a", "a", "a", "a", "b", "b", "b",
"b", "b"), B = c("a", "b", "c", "d", "d", "a", "b", "b", "c",
"d"), L = c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L), val = c(1L,
10L, 3L, 5L, 2L, 8L, 7L, 1L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-10L))
setDT(dt)
# For each (A, B) keep only the last (highest) L, then merge back against
# the full table to recover the matching 'val' for that surviving row.
merge(dt[, list(L = last(L)), by =list(A, B)], dt)
#> A B L val
#> 1: a a 1 1
#> 2: a b 1 10
#> 3: a c 1 3
#> 4: a d 2 2
#> 5: b a 1 8
#> 6: b b 2 1
#> 7: b c 1 2
#> 8: b d 1 2
Created on 2021-03-24 by the reprex package (v1.0.0)

set.seed(17)
library(data.table)
DT <- data.table(A=rep(c("a","b"),each=5),
B=c("a","b","c","d","d","a","b","b","c","d"),
L=c(1,1,1,2,1,1,1,2,1,1),
val=rnbinom(10, size=2, mu=3))
# .I holds the original row numbers; per (A, B) group keep the indices of
# the rows where L is at its group maximum, then subset DT by those indices.
result <- DT[DT[, .I[L == max(L)], by = list(A, B)]$V1]
> result
A B L val
1: a a 1 1
2: a b 1 10
3: a c 1 3
4: a d 2 2
5: b a 1 8
6: b b 2 1
7: b c 1 2
8: b d 1 2

Here's how I'd do it (without mult)
DT[order(-L), .SD[1], .(A,B)]
With mult something like this would do it - note that I'm doing an actual join here
DT[order(L)][unique(DT[, .(A, B)]), on = c('A', 'B'), mult = 'last']
#> A B L val
#> 1: a a 1 1
#> 2: a b 1 10
#> 3: a c 1 3
#> 4: a d 2 2
#> 5: b a 1 8
#> 6: b b 2 1
#> 7: b c 1 2
#> 8: b d 1 2

Related

Remove NA in front of one specific string but leave in front of another specific string, by group

I have this data frame:
df <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
stringsAsFactors = FALSE)
For each group (id), I aim to remove the rows with one or multiple leading NA in front of an "a" (in the column "status") but not in front of a "b".
The final data frame should look like this:
structure(list(
id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L),
status = c("a", "c", "a", NA, "b", "c", "c", "a", "c", NA, NA, "b", "b")),
.Names = c("id", "status"), row.names = c(NA, -13L), class = "data.frame")
How do I do that?
Edit: alternatively, how would I do it to preserve other variables in the data frame such as the variable otherVar in the following example:
df2 <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
otherVar = letters[1:16],
stringsAsFactors = FALSE)
We can group by 'id', summarise the 'status' by pasteing the elements together, then use gsub to remove the NA before the 'a' and convert it to 'long' format with separate_rows
library(dplyr)
library(tidyr)
df %>%
group_by(id) %>%
summarise(status = gsub("(NA, ){1,}(?=a)", "", toString(status),
perl = TRUE)) %>%
separate_rows(status, convert = TRUE)
# A tibble: 13 x 2
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 NA
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
#10 4 NA
#11 4 NA
#12 4 b
#13 4 b
Or using data.table with the same methodology
library(data.table)
out1 <- setDT(df)[, strsplit(gsub("(NA, ){1,}(?=a)", "",
toString(status), perl = TRUE), ", "), id]
setnames(out1, 'V1', "status")[]
# id status
# 1: 1 a
# 2: 1 c
# 3: 1 a
# 4: 2 NA
# 5: 2 b
# 6: 2 c
# 7: 2 c
# 8: 3 a
# 9: 3 c
#10: 4 NA
#11: 4 NA
#12: 4 b
#13: 4 b
Update
For the updated dataset 'df2'
i1 <- setDT(df2)[, .I[seq(which(c(diff((status %in% "a") +
rleid(is.na(status))) > 1), FALSE))] , id]$V1
df2[-i1]
# id status otherVar
# 1: 1 a b
# 2: 1 c c
# 3: 1 a d
# 4: 2 NA e
# 5: 2 b f
# 6: 2 c g
# 7: 2 c h
# 8: 3 a k
# 9: 3 c l
#10: 4 NA m
#11: 4 NA n
#12: 4 b o
#13: 4 b p
Using zoo's na.locf together with is.na; note that this assumes your data is already ordered within each group.
# Drop rows that are NA *and* whose next non-NA status (na.locf carried
# backwards with fromLast) is "a"; assumes rows are ordered within each id.
# Use TRUE rather than T — T is an ordinary variable and can be reassigned.
df[!(na.locf(df$status, fromLast = TRUE) == "a" & is.na(df$status)), ]
id status
2 1 a
3 1 c
4 1 a
5 2 <NA>
6 2 b
7 2 c
8 2 c
11 3 a
12 3 c
13 4 <NA>
14 4 <NA>
15 4 b
16 4 b
Here's a dplyr solution and a not as pretty base translation :
dplyr
library(dplyr)
df %>% group_by(id) %>%
filter(status[!is.na(status)][1]!="a" | !is.na(status))
# # A tibble: 13 x 2
# # Groups: id [4]
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 <NA>
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
# 10 4 <NA>
# 11 4 <NA>
# 12 4 b
# 13 4 b
base
do.call(rbind,
lapply(split(df,df$id),
function(x) x[x$status[!is.na(x$status)][1]!="a" | !is.na(x$status),]))
# id status
# 1.2 1 a
# 1.3 1 c
# 1.4 1 a
# 2.5 2 <NA>
# 2.6 2 b
# 2.7 2 c
# 2.8 2 c
# 3.11 3 a
# 3.12 3 c
# 4.13 4 <NA>
# 4.14 4 <NA>
# 4.15 4 b
# 4.16 4 b
note
Will fail if not all NAs are leading, because it will remove all NAs from groups whose first non-NA value is "a".

Getting top_n when multiple groups and keep subgroups

I have a df with turnover for each subgroup from main. Now I want to get the two main groups with highest turnover.
df <- data.frame(
grp = gl(5, 5, labels = c("A", "B", "C", "D", "E")),
sub_grp = gl(5, 1),
turnover = rnorm(25, mean = 100, sd = 15))
> df
grp sub_grp turnover
1 A 1 98.14430
2 A 2 107.90811
3 A 3 103.93973
4 A 4 95.78222
5 A 5 63.19635
6 B 1 97.85688
7 B 2 92.65572
8 B 3 86.02872
9 B 4 101.88177
10 B 5 120.66959
11 C 1 125.93533
12 C 2 98.49771
13 C 3 77.28770
14 C 4 101.44822
15 C 5 107.08171
16 D 1 77.73252
17 D 2 107.49374
18 D 3 87.46436
19 D 4 101.49984
20 D 5 99.13047
21 E 1 91.48636
22 E 2 115.63716
23 E 3 99.34567
24 E 4 104.65408
25 E 5 121.41820
I know how to get two main groups with highest turnover, but not how to keep my subgroups and turnover still split on subgroup.
df %>%
group_by(grp) %>%
summarise(total.turnover = sum(turnover)) %>%
top_n(n = 2)
grp total.turnover
(fctr) (dbl)
1 C 510.2507
2 E 532.5415
Result I want from this example.
grp sub_grp turnover
1 C 1 125.93533
2 C 2 98.49771
3 C 3 77.28770
4 C 4 101.44822
5 C 5 107.08171
6 E 1 91.48636
7 E 2 115.63716
8 E 3 99.34567
9 E 4 104.65408
10 E 5 121.41820
Here are a couple of different approaches with dplyr. (Note: the outputs below were generated from a different random draw of df than the one shown in the question, so the selected groups and turnover values differ.)
Rejoining the original object
df %>%
group_by(grp) %>%
summarise(total.turnover = sum(turnover)) %>%
top_n(n = 2) %>%
inner_join(df, by = "grp") %>%
select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
Using a windowing function (dense_rank)
df %>%
group_by(grp) %>%
mutate(total.turnover = sum(turnover)) %>%
ungroup() %>%
filter(dense_rank(desc(total.turnover)) < 3) %>%
select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
Using data.table (similar to the dplyr windowing function approach)
library(data.table)
dt <- data.table(df)
dt[,total.turnover := sum(turnover), by = .(grp)
][,rank := frank(-total.turnover, ties.method = "dense")
][rank < 3, .(grp, sub_grp, turnover)]
# grp sub_grp turnover
# 1: A 1 91.59287
# 2: A 2 96.54734
# 3: A 3 123.38062
# 4: A 4 101.05763
# 5: A 5 101.93932
# 6: C 1 118.36123
# 7: C 2 105.39721
# 8: C 3 106.01157
# 9: C 4 101.66024
# 10: C 5 91.66238
library(dplyr)
set.seed(123)
df <- data.frame(
grp = gl(5, 5, labels = c("A", "B", "C", "D", "E")),
sub_grp = gl(5, 1),
turnover = rnorm(25, mean = 100, sd = 15)
)
One option is dplyr where we use filter on the summarised output object
df %>%
filter(grp %in% df1$grp)
where 'df1' is the summarised output object
Or if we wanted in the same chain
df %>%
group_by(grp) %>%
summarise(val = sum(turnover)) %>%
top_n(2) %>%
semi_join(df, .)
# grp sub_grp turnover
#1 C 1 125.93533
#2 C 2 98.49771
#3 C 3 77.28770
#4 C 4 101.44822
#5 C 5 107.08171
#6 E 1 91.48636
#7 E 2 115.63716
#8 E 3 99.34567
#9 E 4 104.65408
#10 E 5 121.41820
Or another one-line option is data.table
library(data.table)
setDT(df)[grp %in% df[, sum(turnover), grp][order(-V1), head(grp, 2)]]
# grp sub_grp turnover
# 1: C 1 125.93533
# 2: C 2 98.49771
# 3: C 3 77.28770
# 4: C 4 101.44822
# 5: C 5 107.08171
# 6: E 1 91.48636
# 7: E 2 115.63716
# 8: E 3 99.34567
# 9: E 4 104.65408
#10: E 5 121.41820
Or we can do this easily with base R
subset(df, grp %in% names(tail(sort(xtabs(turnover~grp , df)),2)))
# grp sub_grp turnover
#11 C 1 125.93533
#12 C 2 98.49771
#13 C 3 77.28770
#14 C 4 101.44822
#15 C 5 107.08171
#21 E 1 91.48636
#22 E 2 115.63716
#23 E 3 99.34567
#24 E 4 104.65408
#25 E 5 121.41820
data
df <- structure(list(grp = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "B", "C", "C", "C", "C", "C", "D", "D", "D", "D", "D", "E",
"E", "E", "E", "E"), sub_grp = c(1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L), turnover = c(98.1443, 107.90811, 103.93973, 95.78222,
63.19635, 97.85688, 92.65572, 86.02872, 101.88177, 120.66959,
125.93533, 98.49771, 77.2877, 101.44822, 107.08171, 77.73252,
107.49374, 87.46436, 101.49984, 99.13047, 91.48636, 115.63716,
99.34567, 104.65408, 121.4182)), .Names = c("grp", "sub_grp",
"turnover"),
class = "data.frame", row.names = c(NA, -25L),
index = structure(integer(0), "`__grp`" = integer(0)))

Preserving missing fields when row-wise binding elements of a list in R

I have a named list, and I want to bind its elements row-wise. I am a big fan of data.table::rbindlist(), but it drops the zero-length elements. Is there any way I can preserve them as NA rows?
Here's my code:
dput(Result)
structure(list(a = c(1L, 3L), b = c(2L, 4L), c = 4L, d = integer(0),
e = integer(0), f = integer(0)), .Names = c("a", "b", "c",
"d", "e", "f"))
Here's what I tried for data.table
Attempt1 : Using data.table
Result1<-data.table::rbindlist(lapply(Result, as.data.frame),use.names=TRUE, fill=TRUE, idcol="Name")
However, I lost d and e.
Attempt2 : Using dplyr
dplyr::bind_rows(lapply(Result, as.data.frame))
Again, I lost d and e.
Expected Output:
Result1
Name X[[i]]
1: a 1
2: a 3
3: b 2
4: b 4
5: c 4
6: d NA
7: e NA
8: f NA
I'd appreciate any help.
Here you go:
# The original named list; d, e and f are zero-length integer vectors,
# which rbindlist would silently drop.
Result = structure(list(a = c(1L, 3L), b = c(2L, 4L), c = 4L, d = integer(0),
e = integer(0), f = integer(0)), .Names = c("a", "b", "c",
"d", "e", "f"))
# Replace every zero-length element with NA so each keeps a row when bound.
Result2 = lapply(Result, function(x){
if(length(x)==0){NA}else{x}
})
# Bind the per-element data frames; idcol = "Name" records which list
# element each row came from.
Result3 = data.table::rbindlist(lapply(Result2,
as.data.frame),use.names=TRUE, fill=TRUE, idcol="Name")
The problem is that integer(0) is not NA, so you must convert them to NA as shown for Result2.
Result:
> Result3
Name X[[i]]
1: a 1
2: a 3
3: b 2
4: b 4
5: c 4
6: d NA
7: e NA
8: f NA
Replace the zero length elements with NA, then use rbindlist.
Result[!lengths(Result)] <- NA
## or
## is.na(Result) <- !lengths(Result)
rbindlist(lapply(Result, as.data.table), id = "Name")
# Name V1
# 1: a 1
# 2: a 3
# 3: b 2
# 4: b 4
# 5: c 4
# 6: d NA
# 7: e NA
# 8: f NA
You could also do this in base R with
is.na(Result) <- !lengths(Result)
data.frame(
Name = rep(names(Result), lengths(Result)),
V1 = unlist(Result, use.names = FALSE)
)
# Name V1
# 1 a 1
# 2 a 3
# 3 b 2
# 4 b 4
# 5 c 4
# 6 d NA
# 7 e NA
# 8 f NA

R: frequency with group by ID [duplicate]

This question already has answers here:
Frequency count of two column in R
(8 answers)
Closed 6 years ago.
I have a data frame like this:
ID Cont
1 a
1 a
1 b
2 a
2 c
2 d
I need to report the frequency of "Cont" by ID. The output should be
ID Cont Freq
1 a 2
1 b 1
2 a 1
2 c 1
2 d 1
Using dplyr, you can group_by both ID and Cont and summarise using n() to get Freq:
library(dplyr)
res <- df %>% group_by(ID,Cont) %>% summarise(Freq=n())
##Source: local data frame [5 x 3]
##Groups: ID [?]
##
## ID Cont Freq
## <int> <fctr> <int>
##1 1 a 2
##2 1 b 1
##3 2 a 1
##4 2 c 1
##5 2 d 1
Data:
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L), Cont = structure(c(1L,
1L, 2L, 1L, 3L, 4L), .Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("ID",
"Cont"), class = "data.frame", row.names = c(NA, -6L))
## ID Cont
##1 1 a
##2 1 a
##3 1 b
##4 2 a
##5 2 c
##6 2 d
library(data.table)
# Count rows per (ID, Cont) combination; .N is the size of each group.
# (The question's data frame is named 'df' — the original called setDT(x)
# on an undefined object 'x'.)
setDT(df)[, .(Freq = .N), by = .(ID, Cont)]
# ID Cont Freq
# 1: 1 a 2
# 2: 1 b 1
# 3: 2 a 1
# 4: 2 c 1
# 5: 2 d 1
With base R:
df1 <- subset(as.data.frame(table(df)), Freq != 0)
if you want to order by ID, add this line:
# Order rows by ID. df1 is a data.frame here, so the two-index form
# df1[rows, ] is required; df1[order(df1$ID)] would try to select
# *columns* by those indices and fail with "undefined columns selected".
df1[order(df1$ID), ]
ID Cont Freq
1 1 a 2
3 1 b 1
2 2 a 1
6 2 c 1
8 2 d 1

R: Subsetting a data.table with repeated column names with numerical positions

I have a data.table that looks like this
> dput(DT)
A B C A B C D
1: 1 2 3 3 5 6 7
2: 2 1 3 2 1 3 4
Here's the dput
DT <- structure(list(A = 1:2, B = c(2L, 1L), C = c(3L, 3L), A = c(3L,
2L), B = c(5L, 1L), C = c(6L, 3L), D = c(7L, 4L)), .Names = c("A",
"B", "C", "A", "B", "C", "D"), row.names = c(NA, -2L), class = c("data.table",
"data.frame"))
Basically, I want to subset them according to their headers. So for header "B", I would do this:
subset(DT,,grep(unique(names(DT))[2],names(DT)))
B B
1: 2 2
2: 1 1
As you can see, the values are wrong as the second column is simply a repeat of the first. I want to get this instead:
B B
1: 2 5
2: 1 1
Can anyone help me please?
The following alternatives work for me:
pos <- grep("B", names(DT))
DT[, ..pos]
# B B
# 1: 2 5
# 2: 1 1
DT[, .SD, .SDcols = patterns("B")]
# B B
# 1: 2 5
# 2: 1 1
DT[, names(DT) %in% unique(names(DT))[2], with = FALSE]
# B B
# 1: 2 5
# 2: 1 1

Resources