Find and remove duplicated observations across two columns in R

Find and remove duplicated observations across two columns in R - r

I have an example data set like this:
df1 <- data.frame(c1=c('a','b','c','d','e','f','g', 'h'),
c2=c('l','m','a','g','e','q','a','d'))
and I just want a data frame in which the values that occur in both c1 and c2 have been removed. I already know how to grab the unique elements from c1 and c2, but what do I do after that to end up with something like the following:
data.frame(c1=c(b,c,f,h),c2=c(l,m,q,NA))

An option is to get the intersecting elements with Reduce, remove those elements from each column with %in% and !, and then pad NA at the end
v1 <- Reduce(intersect, df1)
lst1 <- lapply(df1, function(x) x[!x %in% v1])
data.frame(lapply(lst1, `length<-`, max(lengths(lst1))))
# c1 c2
#1 b l
#2 c m
#3 f q
#4 h <NA>
data
df1 <- data.frame(c1=c('a','b','c','d','e','f','g', 'h'),
c2=c('l','m','a','g','e','q','a','d'))

One-liner in base:
sapply(list(df1$c1[!df1$c1%in%df1$c2],
df1$c2[!df1$c2%in%df1$c1]), '[', 1:length(setdiff(df1$c1, df1$c2)))
# [,1] [,2]
# [1,] "b" "l"
# [2,] "c" "m"
# [3,] "f" "q"
# [4,] "h" NA

Related

Display identical columns in R dataframe

Suppose I have the following dataframe :
df <- data.frame(A=c(1,2,3),B=c("a","b","c"),C=c(2,1,3),D=c(1,2,3),E=c("a","b","c"),F=c(1,2,3))
> df
A B C D E F
1 1 a 2 1 a 1
2 2 b 1 2 b 2
3 3 c 3 3 c 3
I want to filter out the columns that are identical. I know that I can do it with
DuplCols <- df[duplicated(as.list(df))]
UniqueCols <- df[ ! duplicated(as.list(df))]
In the real world my dataframe has more than 500 columns, I do not know how many identical columns of the same kind I have, and I do not know the names of the columns. However, each column name is unique (as in df). My desired result is (optimally) a dataframe where each row stores the column names of the identical columns of one kind. The number of columns in the DesiredResult dataframe is the maximal number of identical columns of one kind in the original dataframe, and if there are fewer identical columns of another kind, NA should be stored:
> DesiredResult
X1 X2 X3
1 A D F
2 B E NA
3 C NA NA
(With "identical column of the same kind" I mean the following: in df the columns A, D, F are identical columns of the same kind and B, E are identical columns of the same kind.)

You can use unique and then test with %in% where it matches to extract the colname.
tt <- lapply(unique(as.list(df)), function(x) {colnames(df)[as.list(df) %in% list(x)]})
tt
#[[1]]
#[1] "A" "D" "F"
#
#[[2]]
#[1] "B" "E"
#
#[[3]]
#[1] "C"
t(sapply(tt, "length<-", max(lengths(tt)))) #As data.frame
# [,1] [,2] [,3]
#[1,] "A" "D" "F"
#[2,] "B" "E" NA
#[3,] "C" NA NA

splitting vector every two indices

Given vector of N elements:
LETTERS[1:10]
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J"
How can one get a data.table/frame (df) as follows?
>df
one two
A B
C D
E F
G H
I J
EDIT
Generalizing I would like to know given a vector to split as follows:
[A B C],[D E],[F G H I J]
and obtaining:
V1 V2 V3 V4 V5
A B C NA NA
D E NA NA NA
F G H I J

One option is the matrix way
as.data.frame(matrix(LETTERS[1:10], ncol=2,byrow=TRUE,
dimnames = list(NULL, c('one', 'two'))), stringsAsFactors=FALSE)
# one two
#1 A B
#2 C D
#3 E F
#4 G H
#5 I J
If we need to create an index, we can use gl to split the vector and rbind
do.call(rbind, split(v1, as.integer(gl(length(v1), 2, length(v1)))))
where
v1 <- LETTERS[1:10]
Update
Based on the update in OP's post
lst <- split(v1, rep(1:3, c(3, 2, 5)))
do.call(rbind, lapply(lst, `length<-`, max(lengths(lst))))
# [,1] [,2] [,3] [,4] [,5]
#1 "A" "B" "C" NA NA
#2 "D" "E" NA NA NA
#3 "F" "G" "H" "I" "J"
Or otherwise
library(stringi)
stri_list2matrix(lst, byrow = TRUE)
Update2
If we are using a 'splitVec'
lst <- split(v1, cumsum(seq_along(v1) %in% splitVec))
and then proceed as above

Function on all column combinations of n*m matrix yielding a m*m result

Say I have a function which takes two vectors and returns a single integer, for instance the number of elements in one that is also in the other vector. Like:
f <- function(v1,v2)sum(v1 %in% v2)
How can I apply that function to all pairwise combinations of the m columns in an n*m matrix?
set.seed(1)
m <- replicate(3, sample(letters[1:10], size = 5))
dimnames(m) <- list(NULL, paste0('c', 1:ncol(m)))
Now,
> m
[,1] [,2] [,3]
[1,] "c" "i" "c"
[2,] "d" "j" "b"
[3,] "e" "f" "f"
[4,] "g" "e" "j"
[5,] "b" "a" "e"
And take the function on the first two columns:
> f(m[,1], m[,2])
[1] 1 #'e' is shared.
How can I do that on all column combinations? The result could be an m*m matrix (where the results are symmetric around the diagonal) or, alternatively, it could be a long-format data frame with columns for v1, v2 and the function's result (e.g. the first row would be c1, c2 and 1).
I tried to investigate the functions outer and expand.grid but could not find a solution.

sapply(1:3, function(i) sapply(1:3, function(j) f(m[,i], m[,j])))
# [,1] [,2] [,3]
#[1,] 5 1 3
#[2,] 1 5 3
#[3,] 3 3 5
Or the following output might be friendlier
sapply(data.frame(m), function(x1) sapply(data.frame(m), function(x2) f(x1, x2)))
# c1 c2 c3
#c1 5 1 3
#c2 1 5 3
#c3 3 3 5

Using expand.grid to get all combinations, then looping through the pairs to get the number of intersecting items.
myComb <- expand.grid(colnames(m), colnames(m))
myComb$N <- apply(myComb, 1, function(i){
length(intersect(m[, i[1]], m[, i[2]]))
# or use your own function
# f(m[, i[1]], m[, i[2]])
})
myComb
# Var1 Var2 N
# 1 c1 c1 5
# 2 c2 c1 1
# 3 c3 c1 3
# 4 c1 c2 1
# 5 c2 c2 5
# 6 c3 c2 3
# 7 c1 c3 3
# 8 c2 c3 3
# 9 c3 c3 5

We can do this with outer
f1 <- function(x, y) length(intersect(m[,x], m[,y]))
res <- outer(colnames(m), colnames(m), FUN = Vectorize(f1))
dimnames(res) <- list(colnames(m), colnames(m))
res
# c1 c2 c3
#c1 5 1 3
#c2 1 5 3
#c3 3 3 5

A double loop also works. Only thing is I converted m to be a dataframe M:
f <- function(v1,v2)sum(v1 %in% v2)
set.seed(1) #Leads to different m values than you posted
m <- replicate(3, sample(letters[1:10], size = 5))
dimnames(m) <- list(NULL, paste0('c', 1:ncol(m)))
#Convert m to dataframe M
M <- as.data.frame(m)
#Initialize dataframe of answers
df <- data.frame(matrix(ncol=3, nrow=ncol(M)))
#Loop and get answers
row <- 1
for(i in 1:(ncol(M)-1)){
for(j in 1:(ncol(M)-i)){
df[row, 1] <- names(M)[i]
df[row, 2] <- names(M)[i+j]
df[row, 3] <- f(M[,i], M[,i+j])
row <- row+1
}
}
df
X1 X2 X3
1 c1 c2 1
2 c1 c3 3
3 c2 c3 3

Sort dataframe with multiple columns for multiple years

I have a data.frame with multiple columns and first column being Year. I want to sort my data frame in descending order for each year. I have fifteen years of data and then over 3000 columns.
I illustrate as follows:
Year A B C D
2000 2 3 4 NA
2001 3 4 NA 1
Desired output, my data frame has NAs as well but I can not remove those.
Year C B A
2000 4 3 2
Year B A D
2001 4 3 1
And this version as well
Year
2000 C B A
2001 B A D
I have scripted this code
Asc <-order(df[-1], decreasing=True)
But I'm unable to obtain my desired output. I have referred to "R sort row data in ascending order", but it is still different from what I'm looking for.
Would appreciate your help in this regard.

We can use apply with MARGIN=1. We loop through the rows of the dataset (excluding the first column) with apply, get the index of non-NA elements ('i1'), order the non-NA values descendingly ('i2'), and use that to rearrange the column names of the dataset.
m1 <- t(apply(df1[-1], 1, function(x) {
i1 <- !is.na(x)
i2 <- order(-x[i1])
names(df1)[-1][i1][i2]}))
m1
# [,1] [,2] [,3]
#[1,] "C" "B" "A"
#[2,] "B" "A" "D"
If we need the values and also the names, a list approach would be more suitable as it won't create any problems in the class
lst <- apply(df1[-1], 1, function(x){
i1 <- !is.na(x)
list(sort(x[i1],decreasing=TRUE))})
lst
#[[1]]
#[[1]][[1]]
#C B A
#4 3 2
#[[2]]
#[[2]][[1]]
#B A D
#4 3 1
We can extract the names or the elements from the 'lst'
do.call(rbind, do.call(`c`,rapply(lst, names,
how='list')))
# [,1] [,2] [,3]
#[1,] "C" "B" "A"
#[2,] "B" "A" "D"
Or
t(sapply(do.call(c, lst), names))
and the values as
t(simplify2array(do.call(c, lst)))

Column Split without repeat

I have a dataframe with one column that I would like to split into several columns, but the number of splits is dynamic throughout the rows.
Var1
====
A/B
A/B/C
C/B
A/C/D/E
I have tried using colsplit(df$Var1,split="/",names=c("Var1","Var2","Var3","Var4")), but rows with less than 4 variables will repeat.
From Hansi, the desired output would be:
Var1 Var2 Var3 Var4
[1,] "A" "B" NA NA
[2,] "A" "B" "C" NA
[3,] "C" "B" NA NA
[4,] "A" "C" "D" "E"

> read.table(text=as.character(df$Var1), sep="/", fill=TRUE)
V1 V2 V3 V4
1 A B
2 A B C
3 C B
4 A C D E
Leading zeros in digit-only fields can be preserved with colClasses="character"
a <- data.frame(Var1=c("01/B","04/B/C","0098/B","8708/C/D/E"))
read.table(text=as.character(a$Var1), sep="/", fill=TRUE, colClasses="character")
V1 V2 V3 V4
1 01 B
2 04 B C
3 0098 B
4 8708 C D E

If I understood your objective correctly, here is one possible solution. I'm sure there is a better way of doing it, but this was the first that came to mind:
a <- data.frame(Var1=c("A/B","A/B/C","C/B","A/C/D/E"))
splitNames <- c("Var1","Var2","Var3","Var4")
# R> a
# Var1
# 1 A/B
# 2 A/B/C
# 3 C/B
# 4 A/C/D/E
b <- t(apply(a,1,function(x){
temp <- unlist(strsplit(x,"/"));
return(c(temp,rep(NA,max(0,length(splitNames)-length(temp)))))
}))
colnames(b) <- splitNames
# R> b
# Var1 Var2 Var3 Var4
# [1,] "A" "B" NA NA
# [2,] "A" "B" "C" NA
# [3,] "C" "B" NA NA
# [4,] "A" "C" "D" "E"

I do not know of a function that solves your problem, but you can achieve it easily with standard R commands:
# Here are your data
df <- data.frame(Var1=c("A/B", "A/B/C", "C/B", "A/C/D/E"), stringsAsFactors=FALSE)
# Split
rows <- strsplit(df$Var1, split="/")
# Maximum amount of columns
columnCount <- max(sapply(rows, length))
# Fill with NA
rows <- lapply(rows, `length<-`, columnCount)
# Coerce to data.frame
out <- as.data.frame(rows)
# Transpose
out <- t(out)
As it relies on strsplit, you may need to do some type conversion. See type.convert.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Find and remove duplicated observations across two columns in R - r

One-liner in base: sapply(list(df1$c1[!df1$c1%in%df1$c2], df1$c2[!df1$c2%in%df1$c1]), '[', 1:length(setdiff(df1$c1, df1$c2))) # [,1] [,2] # [1,] "b" "l" # [2,] "c" "m" # [3,] "f" "q" # [4,] "h" NA

Related

Display identical columns in R dataframe

splitting vector every two indices

Function on all column combinations of n*m matrix yielding a m*m result

Sort dataframe with multiple columns for multiple years

Column Split without repeat

Categories

Resources

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Find and remove duplicated observations across two columns in R - r

One-liner in base: sapply(list(df1$c1[!df1$c1%in%df1$c2], df1$c2[!df1$c2%in%df1$c1]), '[', 1:length(setdiff(df1$c1, df1$c2))) # [,1] [,2] # [1,] "b" "l" # [2,] "c" "m" # [3,] "f" "q" # [4,] "h" NA

Related

Display identical columns in R dataframe

splitting vector every two indices

Function on all column combinations of n*m matrix yielding a m*m result

Sort dataframe with multiple columns for multiple years

Column Split without repeat

Categories

Resources

Function on all column combinations of n*m matrix yielding a m*m result