I can get duplicated rows in R on a data.table dt using
dt[duplicated(dt, by=someColumns)]
However, I would like to get the duplicated rows together with their "non-duplicate" counterparts. For example, consider dt:
col1, col2, col3
A B C1
A B C2
A B1 C1
Now, dt[duplicated(dt, by = c("col1", "col2"))] would give me something along the lines of
col1, col2, col3
A B C2
I would like to get this together with the row that was not chosen as the duplicate, that is
col1, col2, col3
A B C1
A B C2
Speed comparison of answers:
> system.time(dt[duplicated(dt, by = t) | duplicated(dt, by = t, fromLast = TRUE)])
user system elapsed
0.008 0.000 0.009
> system.time(dt[, .SD[.N > 1], by = t])
user system elapsed
77.555 0.100 77.703
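For context, a minimal sketch of how such a comparison could be set up (an assumption: t holds the character vector of columns that define a duplicate, and the data here is synthetic, not the original table):
library(data.table)
set.seed(1)
# synthetic data with many groups of repeated (col1, col2) pairs
dt <- data.table(col1 = sample(1e5, 1e6, replace = TRUE),
                 col2 = sample(100, 1e6, replace = TRUE),
                 col3 = seq_len(1e6))
t <- c("col1", "col2")
system.time(dt[duplicated(dt, by = t) | duplicated(dt, by = t, fromLast = TRUE)])
system.time(dt[, .SD[.N > 1], by = t])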
I believe this is essentially a duplicate of this question, though I can see how you may not have found it...
...here's an answer building off the logic outlined in the referenced question:
dt <- read.table(text = "col1 col2 col3
A B C1
A B C2
A B1 C1", header = TRUE, stringsAsFactors = FALSE)
idx <- duplicated(dt[, 1:2]) | duplicated(dt[, 1:2], fromLast = TRUE)
dt[idx, ]
#---
col1 col2 col3
1 A B C1
2 A B C2
Since you are using data.table, this is probably what you want:
library(data.table)
dt <- data.table(dt)
dt[duplicated(dt, by = c("col1", "col2")) | duplicated(dt, by = c("col1", "col2"), fromLast = TRUE)]
#---
col1 col2 col3
1: A B C1
2: A B C2
You can easily achieve this just by using .N:
dt[, .SD[.N > 1], by = list(col1, col2)]
## col1 col2 col3
## 1: A B C1
## 2: A B C2
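Given the speed comparison at the top (the .SD approach can be slow on large tables), a commonly used faster variant of the same idea, not part of the original answer, is to collect the row indices with .I instead of materialising .SD in each group:
dt[dt[, .I[.N > 1], by = list(col1, col2)]$V1]
## col1 col2 col3
## 1: A B C1
## 2: A B C2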
Edit:
You can also try using binary search, which is very efficient, though it seems duplicated is still more efficient here:
setkey(dt[, indx := .N, by = list(col1, col2)], indx)[!J(1)]
## col1 col2 col3
## 1: A B C1
## 2: A B C2
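Note that this approach adds a helper column indx to dt (and the subset above would include it, even though the printout omits it); a small sketch to drop it afterwards:
res <- setkey(dt[, indx := .N, by = list(col1, col2)], indx)[!J(1)]
res[, indx := NULL]   # drop the helper column from the extracted subset
dt[, indx := NULL]    # and from dt itself, if you no longer need it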
Related
I've got two large data.tables, DT1 (2M rows x 300 cols) and DT2 (50M rows x 2 cols), and I would like to merge the values of DT1's columns into a new column in DT2, based on the column name specified in a DT2 column. I'd like to achieve this without having to melt DT1, and by using data.table operations only, if possible.
Here's a sample dataset.
> require(data.table)
> DT1 <- data.table(ID = c('A', 'B', 'C', 'D'), col1 = (1:4), col2 = (5:8), col3 = (9:12), col4 = (13:16))
> DT1
ID col1 col2 col3 col4
1: A 1 5 9 13
2: B 2 6 10 14
3: C 3 7 11 15
4: D 4 8 12 16
> DT2
ID col
1: A col1
2: B col2
3: B col3
4: C col1
5: A col4
#desired output
> DT2_merge
ID col col_value
1: A col1 1
2: B col2 6
3: B col3 10
4: C col1 3
5: A col4 13
Since I'm dealing with two large data.tables, I'm hoping to find the most efficient way of doing this.
Maybe there is a pure data.table way to do this, but one option is to use matrix subsetting:
library(data.table)
# convert DT1 to a data.frame, since data.table does not support matrix subsetting
setDF(DT1)
DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
Using set():
setkey(DT1, "ID")
setkey(DT2, "ID")
for (k in names(DT1)[-1]) {
rows <- which(DT2[["col"]] == k)
set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
}
ID col col_value
1: A col1 1
2: A col4 13
3: B col2 6
4: B col3 10
5: C col1 3
Note: Setting the key up front speeds up the process but reorders the rows.
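If the original row order of DT2 matters, one option (a sketch) is to remember the order before keying and restore it afterwards:
DT2[, orig_order := .I]      # remember the current row order
setkey(DT1, "ID")
setkey(DT2, "ID")
for (k in names(DT1)[-1]) {
  rows <- which(DT2[["col"]] == k)
  set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
}
setorder(DT2, orig_order)    # restore the original order
DT2[, orig_order := NULL]    # drop the helper column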
You can use lookup tables to find the indices for subsetting like:
setDF(DT1)
DT2[, col_value := DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
Using a matrix for subsetting is currently not supported in data.table, so if you have a data.frame instead of a data.table you can do it in base R with:
DT2$col_value <- DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]
You can also change your data structure beforehand and switch from matrix- to vector-subsetting:
DT1ID <- setNames(seq_len(nrow(DT1)), DT1$ID)
DT1 <- as.matrix(DT1[,-1])
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(DT1ID[DT2$ID] + (DT2$col-1)*nrow(DT1))]
Maybe also try fastmatch:
library(fastmatch)
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID)
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1)) + (DT2$col-1)*nrow(DT1))]
Or you can avoid the lookup during subsetting and use levels when creating the factors:
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID, colnames=colnames(DT1)[-1])
DT2$ID <- factor(DT2$ID, levels=rownames(DT1))
DT2$col <- factor(DT2$col, levels=colnames(DT1))
DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))]
Here are some solutions that also work on a plain data.frame:
Solution 1
DT2$col_value <- apply(DT2, 1, function(v) DT1[which(DT1$ID==v[1]),which(colnames(DT1)==v[2])])
Solution 2 (the same as the solution by @Ronak Shah), likely much faster than Solution 1 on a large dataset
DT2$col_value <- DT1[cbind(match(DT2$ID,DT1$ID),match(DT2$col,colnames(DT1)))]
Solution 3 (maybe the fastest)
m <- as.matrix(DT1[-1])
rownames(m) <- DT1$ID
DT2$col_value <- m[as.matrix(DT2)]
Testing some of the methods on a larger dataset to show their performance:
#sindri_baldur
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setkey(DT1, "ID")
setkey(DT2, "ID")
system.time(for (k in names(DT1)[-1]) {
rows <- which(DT2[["col"]] == k)
set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
})
#User: 6.696
#Ronak Shah
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setDF(DT1)
system.time(DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]])
#User: 5.210
#Using fastmatch
library(fastmatch)
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1))
+ (fmatch(DT2$col, colnames(DT1))-1)*nrow(DT1))])
#User: 0.061
#Using factors
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))])
#User: 0.024
Data:
set.seed(7)
nrows <- 1e5
ncols <- 300
x1 <- matrix(sample(0:20, nrows*ncols, replace=TRUE), ncol=ncols
, dimnames=list(sample(do.call("paste0", expand.grid(rep(list(letters)
, ceiling(log(nrows, length(letters)))))), nrows), seq_len(ncols)))
x2 <- data.frame(ID=factor(sample(rownames(x1), nrows*10, replace=TRUE)
, levels=rownames(x1))
, col=factor(sample(colnames(x1), nrows*10, replace=TRUE), levels=colnames(x1)))
library(data.table)
df <- data.table(col1 = c('B', 'A', 'A', 'B', 'B', 'B'), col2 = c("2015-03-06 01:37:57", "2015-03-06 01:39:57", "2015-03-06 01:45:28", "2015-03-06 02:31:44", "2015-03-06 03:55:45", "2015-03-06 04:01:40"))
For each row, I want to count the number of rows with the same value of col1 and a time within the window of the past 10 minutes before (and including) the time of that row.
I run the following code:
library(lubridate)   # for as_datetime()
df$col2 <- as_datetime(df$col2)
window = 10L
(counts = setDT(df)[.(t1=col2-window*60L, t2=col2), on=.((col2>=t1) & (col2<=t2)),
.(counts=.N), by=col1]$counts)
df[, counts := counts]
and got the following error:
Error in `[.data.table`(setDT(df), .(t1 = col2 - window * 60L, t2 = col2), : Column(s) [(col2] not found in x
I want a result like this:
col1 col2 counts
B 2015-03-06 01:37:57 1
A 2015-03-06 01:39:57 1
A 2015-03-06 01:45:28 2
B 2015-03-06 02:31:44 1
B 2015-03-06 03:55:45 1
B 2015-03-06 04:01:40 2
A possible solution:
df[.(col1 = col1, t1 = col2 - window * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(counts = .N), by = .EACHI][, (2) := NULL][]
which gives:
col1 col2 counts
1: B 2015-03-06 01:37:57 1
2: A 2015-03-06 01:39:57 1
3: A 2015-03-06 01:45:28 2
4: B 2015-03-06 02:31:44 1
5: B 2015-03-06 03:55:45 1
6: B 2015-03-06 04:01:40 2
A couple of notes about your approach:
You don't need setDT because you already constructed df with data.table(...).
Your on statement isn't specified correctly: you need to separate the join conditions with a comma, not with &. For example: on = .(col1, col2 >= t1, col2 <= t2)
Use by = .EACHI to get the result for each row (a corrected version of your snippet is sketched below).
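Putting these notes together, your original two-step approach could be written like this (a sketch, reusing the window variable from your code):
counts <- df[.(col1 = col1, t1 = col2 - window * 60L, t2 = col2)
             , on = .(col1, col2 >= t1, col2 <= t2)
             , .N, by = .EACHI]$N
df[, counts := counts]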
An alternative approach:
df[, counts := .SD[.(col1 = col1, t1 = col2 - window * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .N, by = .EACHI]$N][]
which gives the same result.
I have a data table like
col1 col2 col3 col4
1: a a a 2
2: b b a 4.4
3: w w s 6.3
I want to get something like below without running a for loop.
col1 col2 col3 col4 count
1: a a a 2 1
2: b b a 4.4 2
3: w w s 6.3 2
I am counting the unique values of col1, col2, and col3 in each row and storing the result in the count column. How do I do this in one line?
This might be of use:
data.frame solution
df <- read.table(header=T, text=' col1 col2 col3 col4
1 a a a 2
2 b b a 4.4
3 w w s 6.3')
#one line using apply
df$count <- apply(df[1:3], 1, function(x) {length(unique(x))})
Output:
> df
col1 col2 col3 col4 count
1 a a a 2.0 1
2 b b a 4.4 2
3 w w s 6.3 2
data.table solution
It will be a bit trickier to use the data.table syntax in this example.
First I create an id column to group by:
#convert original df to data.table
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
and then I use my self-made luna function to calculate the length of the unique elements:
luna <- function(x) length(unique(unlist(strsplit(x,''))))
df2[, count := luna(paste0(col1, col2, col3)), by=id ]
Output:
> df2
col1 col2 col3 col4 id count
1: a a a 2.0 1 1
2: b b a 4.4 2 2
3: w w s 6.3 3 2
Or, as @Tensibai mentions in the comments, this is much faster:
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
luna <- function(x) length(unique(x))
df2[, count2 := luna(c(col1, col2, col3)), by=id ]
> df2
col1 col2 col3 col4 id count2
1: a a a 2.0 1 1
2: b b a 4.4 2 2
3: w w s 6.3 3 2
And if we combine @Frank's and @Tensibai's comments, this should be the fastest (data.table 1.9.5+):
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
#not run
#works only in data.table >= 1.9.5
df2[, count2 := uniqueN(c(col1, col2, col3)), by=id ]
#not run
How about the following:
# CJ() builds the cross join (all combinations) of the supplied vectors
dt <- CJ(1:5, 1:3, 1:4, 1:2)
dt[, cnt := apply(dt, 1, function(r) length(unique(r)))]
or if you only want to keep the rows with unique entries, you can try
dt <- CJ(1:5,1:3,1:4,1:2)
dt[apply(dt, 1, function(r) length(unique(r))==ncol(dt))]
col1 <- c('A','B','C', 'D')
col2 <- c('B','A','C', 'C')
col3 <- c('B','C','C', 'A')
dat <- data.frame(cbind(col1, col2, col3))
dat
col1 col2 col3
1 A B B
2 B A C
3 C C C
4 D C A
I would like to remove rows 1 and 3 from dat as the letter B is present more than once in row 1 and the letter C is present more than once in row 3.
EDIT:
My actual data contains over 1 million rows and 14 columns, all of which contain character data. The solution that runs the fastest is preferred as I am using the dataframe in a live setting to make decisions, and the underlying data is changing every few minutes.
You could try this (but I'm sure there is a better way)
cols <- ncol(dat)
indx <- apply(dat, 1, function(x) length(unique(x)) == cols)
dat[indx, ]
# col1 col2 col3
# 2 B A C
# 4 D C A
Another way (if your columns are characters and if you don't have too many columns) is something like the following (which is vectorized)
indx <- with(dat, (col1 == col2) | (col1 == col3) | (col2 == col3))
dat[!indx, ]
# col1 col2 col3
# 2 B A C
# 4 D C A
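With the 14 character columns mentioned in the edit, writing out every pairwise comparison by hand gets unwieldy; a sketch that builds the same vectorized check over all column pairs (assuming every column should be compared):
pairs <- combn(names(dat), 2, simplify = FALSE)
indx <- Reduce(`|`, lapply(pairs, function(p) dat[[p[1]]] == dat[[p[2]]]))
dat[!indx, ]
#   col1 col2 col3
# 2    B    A    C
# 4    D    C    A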
You could do this in dplyr, if you don't mind specifying the columns:
library(dplyr)
dat %>%
rowwise() %>%
mutate(repeats = max(table(c(col1, col2, col3))) - 1) %>%
filter(repeats == 0) %>%
select(-repeats) # if you don't want that column to appear in results
Source: local data frame [2 x 3]
col1 col2 col3
1 B A C
2 D C A
Here is an alternative. I haven't tested it on a big dataset:
library(data.table) #devel version v1.9.5
dat[setDT(melt(as.matrix(dat)))[,uniqueN(value)==.N , Var1]$V1,]
# col1 col2 col3
#2 B A C
#4 D C A
Or use anyDuplicated
# anyDuplicated() returns 0 for a row with no repeated values, so ! keeps exactly those rows
dat[!apply(dat, 1, anyDuplicated), ]
# col1 col2 col3
#2 B A C
#4 D C A
I have a dataframe containing 5 columns
COL1 | COL2 | COL 3 | COL 4 | COL 5
I need to aggregate on COL1 and apply 4 different functions to columns COL2 to COL5:
a1<-aggregate( COL2 ~ COL1, data = dataframe, sum)
a2<-aggregate( COL3 ~ COL1, data = dataframe, length)
a3<-aggregate( COL4 ~ COL1, data = dataframe, max)
a4<-aggregate( COL5 ~ COL1, data = dataframe, min)
finalDF<- Reduce(function(x, y) merge(x, y, all=TRUE), list(a1,a2,a3,a4))
1) I have 24 cores on the machine.
How can I execute the above 4 lines of code (a1, a2, a3, a4) in parallel?
I want to use 4 cores simultaneously and then use Reduce to compute finalDF.
2) Can I use a different function on each column in one aggregate call?
(I can apply one function to multiple columns, and multiple functions to one column, but I was unable to apply a different function to each column:
[COL2-sum, COL3-length, COL4-max, COL5-min])
This is an example of how you might do it with dplyr, as suggested by @Roland:
set.seed(2)
df <- data.frame(COL1 = sample(LETTERS, 1e6, replace=T),
COL2 = rnorm(1e6),
COL3 = runif(1e6, 100, 1000),
COL4 = rnorm(1e6, 25, 100),
COL5 = runif(1e6, -100, 10))
#> head(df)
# COL1 COL2 COL3 COL4 COL5
#1 E 1.0579823 586.2360 -3.157057 -14.462318
#2 S 0.1238110 872.3868 129.579090 9.525772
#3 O 0.4902512 498.0537 93.063487 1.910506
#4 E 1.7215843 200.7077 126.716256 -5.865204
#5 Y 0.6515853 275.3369 12.554218 -26.301225
#6 Y 0.7959678 134.4977 54.789415 -33.145334
require(dplyr)
df <- df %>%
  group_by(COL1) %>%
summarize(a1 = sum(COL2),
a2 = length(COL3),
a3 = max(COL4),
a4 = min(COL5)) #add as many calculations as you like
On my machine this took 0.064 seconds.
#> head(df)
#Source: local data frame [6 x 5]
#
# COL1 a1 a2 a3 a4
#1 A -0.9068368 38378 403.4208 -99.99943
#2 B 6.0557452 38551 419.0970 -99.99449
#3 C 108.5680251 38673 491.8061 -99.99382
#4 D -34.1217133 38469 481.0626 -99.99697
#5 E -68.2998926 38168 452.8280 -99.99602
#6 F -185.9059338 38159 417.2271 -99.99995
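As for the parallel part of the question (point 1), one possible sketch uses parallel::mclapply to run the four aggregate() calls concurrently and then merges the pieces, as you proposed. This assumes a Unix-alike system (mclapply relies on forking, which is not available on Windows) and that dataframe is your original data:
library(parallel)
# one entry per aggregation: the formula and the function to apply
specs <- list(list(f = COL2 ~ COL1, fun = sum),
              list(f = COL3 ~ COL1, fun = length),
              list(f = COL4 ~ COL1, fun = max),
              list(f = COL5 ~ COL1, fun = min))
parts <- mclapply(specs,
                  function(s) aggregate(s$f, data = dataframe, FUN = s$fun),
                  mc.cores = 4)
finalDF <- Reduce(function(x, y) merge(x, y, all = TRUE), parts)
That said, the single grouped summarize shown above is usually much faster than four separate aggregate() calls, even when they are run in parallel.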