un-intersect values in R - r

I have two data sets of at least 420,500 observations each, e.g.
# Example data. Literal values are used (instead of unseeded rnorm()) so that
# the frames reproduce the printed tables exactly: rows 1-2 of the two frames
# agree on col2 only, rows 3-4 agree on col1 only, and row 5 agrees on both.
dataset1 <- data.frame(
  col1 = c("microsoft", "apple", "vmware", "delta", "microsoft"),
  col2 = paste0(c("a", "b", "c", 4, "asd"), ".exe"),
  col3 = c(2, 1, 3, 4, 5)
)
dataset2 <- data.frame(
  col1 = c("apple", "cisco", "vmware", "delta", "microsoft"),
  col2 = paste0(c("a", "b", "d", 5, "asd"), ".exe"),
  col3 = c(3, 4, 1, 5, 2)
)
> dataset1
col1 col2 col3
1 microsoft a.exe 2
2 apple b.exe 1
3 vmware c.exe 3
4 delta 4.exe 4
5 microsoft asd.exe 5
> dataset2
col1 col2 col3
1 apple a.exe 3
2 cisco b.exe 4
3 vmware d.exe 1
4 delta 5.exe 5
5 microsoft asd.exe 2
I would like to print all the observations in dataset1 that have no match in dataset2 when comparing on both col1 and col2 together. In this case that is everything except the last observation: observations 1 & 2 match on col2 but not col1, and observations 3 & 4 match on col1 but not col2, i.e.:
col1 col2 col3
1: apple b.exe 1
2: delta 4.exe 4
3: microsoft a.exe 2
4: vmware c.exe 3

You could use anti_join from dplyr
library(dplyr)
anti_join(df1, df2, by = c('col1', 'col2'))
# col1 col2 col3
#1 delta 4.exe -0.5836272
#2 vmware c.exe 0.4196231
#3 apple b.exe 0.5365853
#4 microsoft a.exe -0.5458808
data
set.seed(24)
df1 <- data.frame(col1 = c('microsoft', 'apple', 'vmware', 'delta',
'microsoft'), col2= c('a.exe', 'b.exe', 'c.exe', '4.exe', 'asd.exe'),
col3=rnorm(5), stringsAsFactors=FALSE)
set.seed(22)
df2 <- data.frame(col1 = c( 'apple', 'cisco', 'proactive', 'dtex',
'microsoft'), col2= c('a.exe', 'b.exe', 'c.exe', '4.exe', 'asd.exe'),
col3=rnorm(5), stringsAsFactors=FALSE)

data.table solution inspired by this:
library(data.table) #1.9.5+
setDT(dataset1,key=c("col1","col2"))
setDT(dataset2,key=key(dataset1))
dataset1[!dataset2]
col1 col2 col3
1: apple b.exe 1
2: delta 4.exe 4
3: microsoft a.exe 2
4: vmware c.exe 3
You could also try without keying:
library(data.table) #1.9.5+
setDT(dataset1); setDT(dataset2)
dataset1[!dataset2,on=c("col1","col2")]

Related

R find unduplicated rows based on other data's columns [duplicate]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 7 years ago.
Improve this question
I have a data table in R, called A, which has three columns: Col1, Col2, and Col3. Another table, called B, has the same three columns. I want to remove all the rows in table A for which the pair (Col1, Col2) is present in table B. I have tried, but I am not sure how to do this, and I have been stuck on it for the last few days.
Thanks,
library(data.table)
A = data.table(Col1 = 1:4, Col2 = 4:1, Col3 = letters[1:4])
# Col1 Col2 Col3
#1: 1 4 a
#2: 2 3 b
#3: 3 2 c
#4: 4 1 d
B = data.table(Col1 = c(1,3,5), Col2 = c(4,2,1))
# Col1 Col2
#1: 1 4
#2: 3 2
#3: 5 1
A[!B, on = c("Col1", "Col2")]
# Col1 Col2 Col3
#1: 2 3 b
#2: 4 1 d
We can use anti_join
library(dplyr)
anti_join(A, B, by = c('Col1', 'Col2'))
Here's a go, using interaction:
A <- data.frame(Col1=1:3, Col2=2:4, Col3=10:12)
B <- data.frame(Col1=1:2, Col2=2:3, Col3=10:11)
A
# Col1 Col2 Col3
#1 1 2 10
#2 2 3 11
#3 3 4 12
B
# Col1 Col2 Col3
#1 1 2 10
#2 2 3 11
byv <- c("Col1","Col2")
A[!(interaction(A[byv]) %in% interaction(B[byv])),]
# Col1 Col2 Col3
#3 3 4 12
Or create a unique id for each row, and then exclude those that merged:
# Tag each row of A with an id, then collect the ids whose key columns (byv)
# also occur in B. Rows are dropped with a logical mask rather than a negative
# index: A[-integer(0), ] selects zero rows, so the negative-index form would
# wrongly return an empty result whenever no row of A matches B.
matched_ids <- merge(
  cbind(A[byv], id = seq_len(nrow(A))),
  B[byv],
  by = byv
)$id
A[!(seq_len(nrow(A)) %in% matched_ids), ]

Extract the subset of a dataframe whose values are absent from two other dataframes

I have three dataframes: df1, df2 and df3. I would like to identify the value(s) in col1 of df2 that are present in neither col1 of df1 nor col1 of df3.
df1 <- data.frame(col1=c('A','C','E'),col2=c(4,8,2))
df1
df2 <- data.frame(col1=c('A','B','C','E','G','I'),col2=c(4,8,2,6,1,9))
df2
df3 <- data.frame(col1=LETTERS[3:26],col2=sample(3:26))
df3
# Expected output
#2 B 8
What I have done?
table(df2$col1 %in% df1$col1)
# FALSE TRUE
# 3 3
df2[df2$col1 %in% df1$col1,]
# col1 col2
#1 A 4
#3 C 2
#4 E 6
df2[!df2$col1 %in% df1$col1,]
# col1 col2
#2 B 8
#5 G 1
#6 I 9
table(df2$col1 %in% df3$col1)
#FALSE TRUE
# 2 4
df2[df2$col1 %in% df3$col1,]
# col1 col2
#3 C 2
#4 E 6
#5 G 1
#6 I 9
df2[!df2$col1 %in% df3$col1,]
# col1 col2
#1 A 4
#2 B 8
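A wrong approach (the inner logical vector has only 3 elements, so it is recycled across the 6 rows of df2 and selects the wrong rows):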
df2[!df2$col1[!df2$col1 %in% df1$col1] %in% df3$col1,]
# col1 col2
#1 A 4
#4 E 6
How can I avoid repeating the indexing expression?
Is there a better approach than the one below?
df2[!df2$col1 %in% df1$col1,][!df2$col1[!df2$col1 %in% df1$col1] %in% df3$col1,]
# col1 col2
#2 B 8
The correct approach is to combine both membership tests and negate the result:
df2[!(df2$col1 %in% df1$col1 | df2$col1 %in% df3$col1),]
# col1 col2
#2 B 8
We can use anti_join
library(dplyr)
bind_rows(df1, df3) %>%
anti_join(df2, ., by = "col1")
# col1 col2
#1 B 8

remove duplicate rows in R based on values in all columns

I have the following dataset
col1 col2 col3
a b 1
a b 2
a b 3
unique(dataset) returns
col1 col2 col3
a b 1
dataset[!duplicated(1:3),] returns
col1 col2 col3
a b 1
a b 2
a b 3
But the same thing fails to work in the following case:
dataset2
col1 col2 col3
a b 1
a b 1
unique(dataset2) returns
col1 col2 col3
a b 1
dataset2[!duplicated(1:3),] returns
col1 col2 col3
a b 1
a b 1
NA NA NA
Use !duplicated:
dataset[!duplicated(dataset[c("col1", "col2", "col3")]),]

data.table in R: Replace a column value with a value from same column after matching two other columns values

I am not able to find a solution for the requirement below.
If a data.table (as below) has rows with matching values in Col1 and Col3, replace the Col2 value (old) with New-Val.
Col1 Col2 Col3
1 old a
1 old a
1 New-Val a
After manipulating data table should look as below:
Col1 Col2 Col3
1 New-Val a
1 New-Val a
1 New-Val a
Update:
I wrote New-Val only to illustrate the requirement. However, I cannot match on this value because it varies across different Col1 and Col3 values. For example:
Col1 Col2 Col3
1 blank a
1 blank a
1 New1 a
2 blank b
2 new2 b
2 new2 b
The table has a very large number of such entries. Ideally, I want to group rows by matching Col1 and Col3 and, wherever Col2 is blank (the placeholder is always blank), replace it with that group's non-blank value, whatever the Col1 and Col3 values are.
This should be manipulated to:
Col1 Col2 Col3
1 New1 a
1 New1 a
1 New1 a
2 new2 b
2 new2 b
2 new2 b
We can replace the "blank" values in "Col2" with NA and use na.locf to replace the NA with "New" values grouped by "Col1" and "Col3".
library(zoo)
dt[Col2=="blank", Col2 := NA]
dt[, Col2 := na.locf(Col2, fromLast=TRUE) ,.(Col1, Col3)]
dt
# Col1 Col2 Col3
#1: 1 New1 a
#2: 1 New1 a
#3: 1 New1 a
#4: 2 new2 b
#5: 2 new2 b
#6: 2 new2 b
Or we can do without using any additional package
dt[, Col2 := Col2[Col2!='blank'][1L] , .(Col1, Col3)]
Another option is to use a binary join combined with by = .EACHI - this will work for factors too
dt[dt[Col2 != "blank"], Col2 := i.Col2, on = c("Col1", "Col3"), by = .EACHI]
dt
# Col1 Col2 Col3
# 1: 1 New1 a
# 2: 1 New1 a
# 3: 1 New1 a
# 4: 2 new2 b
# 5: 2 new2 b
# 6: 2 new2 b

Removing one table from another in R [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 7 years ago.
Improve this question
I have a data table in R, called A, which has three columns: Col1, Col2, and Col3. Another table, called B, has the same three columns. I want to remove all the rows in table A for which the pair (Col1, Col2) is present in table B. I have tried, but I am not sure how to do this, and I have been stuck on it for the last few days.
Thanks,
library(data.table)
A = data.table(Col1 = 1:4, Col2 = 4:1, Col3 = letters[1:4])
# Col1 Col2 Col3
#1: 1 4 a
#2: 2 3 b
#3: 3 2 c
#4: 4 1 d
B = data.table(Col1 = c(1,3,5), Col2 = c(4,2,1))
# Col1 Col2
#1: 1 4
#2: 3 2
#3: 5 1
A[!B, on = c("Col1", "Col2")]
# Col1 Col2 Col3
#1: 2 3 b
#2: 4 1 d
We can use anti_join
library(dplyr)
anti_join(A, B, by = c('Col1', 'Col2'))
Here's a go, using interaction:
A <- data.frame(Col1=1:3, Col2=2:4, Col3=10:12)
B <- data.frame(Col1=1:2, Col2=2:3, Col3=10:11)
A
# Col1 Col2 Col3
#1 1 2 10
#2 2 3 11
#3 3 4 12
B
# Col1 Col2 Col3
#1 1 2 10
#2 2 3 11
byv <- c("Col1","Col2")
A[!(interaction(A[byv]) %in% interaction(B[byv])),]
# Col1 Col2 Col3
#3 3 4 12
Or create a unique id for each row, and then exclude those that merged:
# Tag each row of A with an id, then collect the ids whose key columns (byv)
# also occur in B. Rows are dropped with a logical mask rather than a negative
# index: A[-integer(0), ] selects zero rows, so the negative-index form would
# wrongly return an empty result whenever no row of A matches B.
matched_ids <- merge(
  cbind(A[byv], id = seq_len(nrow(A))),
  B[byv],
  by = byv
)$id
A[!(seq_len(nrow(A)) %in% matched_ids), ]

Resources