Combining vectors of unequal length and non-unique values - r

I would like to do the following:
combine into a data frame, two vectors that
have different length
contain sequences found also in the other vector
contain sequences not found in the other vector
sequences that are not found in other vector are never longer than 3 elements
always have same first element
The data frame should show the equal sequences in the two vectors aligned, with NA in the column if a vector lacks a sequence present in the other vector.
For example:
vector 1 vector 2 vector 1 vector 2
1 1 a a
2 2 g g
3 3 b b
4 1 or h a
1 2 a g
2 3 g b
5 4 c h
5 c
should be combined into data frame
1 1 a a
2 2 g g
3 3 b b
4 NA h NA
1 1 or a a
2 2 g g
NA 3 NA b
NA 4 NA h
5 5 c c
What I did, is to search for merge, combine, cbind, plyr examples but was not able to find solutions. I am afraid I will need to start write a function with nested for loops to solve this problem.

Note - this was proposed as an answer to the first version of the OP. The question has been modified since then but the problem is still not well-defined in my opinion.
Here is a solution that works with your integer example and would also work with numeric vectors. I am also assuming that:
both vectors contain the same number of sequences
a new sequence starts where value[i+1] <= value[i]
If your vectors are non-numeric or if one of my assumptions does not fit your problem, you'll have to clarify.
v1 <- c(1,2,3,4,1,2,5)
v2 <- c(1,2,3,1,2,3,4,5)
v1.sequences <- split(v1, cumsum(c(TRUE, diff(v1) <= 0)))
v2.sequences <- split(v2, cumsum(c(TRUE, diff(v2) <= 0)))
align.fun <- function(s1, s2) { #aligns two sequences
s12 <- sort(unique(c(s1, s2)))
cbind(ifelse(s12 %in% s1, s12, NA),
ifelse(s12 %in% s2, s12, NA))
}
do.call(rbind, mapply(align.fun, v1.sequences, v2.sequences))
# [,1] [,2]
# [1,] 1 1
# [2,] 2 2
# [3,] 3 3
# [4,] 4 NA
# [5,] 1 1
# [6,] 2 2
# [7,] NA 3
# [8,] NA 4
# [9,] 5 5

I maintain that your problem might be solved in terms of the shortest common supersequence. It assumes that your two vectors each represent one sequence. Please give the code below a try.
If it still does not solve your problem, you'll have to explain exactly what you mean by "my vector contains not one but many sequences": define what you mean by a sequence and tell us how sequences can be identified by scanning through your two vectors.
Part I: given two sequences, find the longest common subsequence
LongestCommonSubsequence <- function(X, Y) {
m <- length(X)
n <- length(Y)
C <- matrix(0, 1 + m, 1 + n)
for (i in seq_len(m)) {
for (j in seq_len(n)) {
if (X[i] == Y[j]) {
C[i + 1, j + 1] = C[i, j] + 1
} else {
C[i + 1, j + 1] = max(C[i + 1, j], C[i, j + 1])
}
}
}
backtrack <- function(C, X, Y, i, j) {
if (i == 1 | j == 1) {
return(data.frame(I = c(), J = c(), LCS = c()))
} else if (X[i - 1] == Y[j - 1]) {
return(rbind(backtrack(C, X, Y, i - 1, j - 1),
data.frame(LCS = X[i - 1], I = i - 1, J = j - 1)))
} else if (C[i, j - 1] > C[i - 1, j]) {
return(backtrack(C, X, Y, i, j - 1))
} else {
return(backtrack(C, X, Y, i - 1, j))
}
}
return(backtrack(C, X, Y, m + 1, n + 1))
}
Part II: given two sequences, find the shortest common supersequence
ShortestCommonSupersequence <- function(X, Y) {
LCS <- LongestCommonSubsequence(X, Y)[c("I", "J")]
X.df <- data.frame(X = X, I = seq_along(X), stringsAsFactors = FALSE)
Y.df <- data.frame(Y = Y, J = seq_along(Y), stringsAsFactors = FALSE)
ALL <- merge(LCS, X.df, by = "I", all = TRUE)
ALL <- merge(ALL, Y.df, by = "J", all = TRUE)
ALL <- ALL[order(pmax(ifelse(is.na(ALL$I), 0, ALL$I),
ifelse(is.na(ALL$J), 0, ALL$J))), ]
ALL$SCS <- ifelse(is.na(ALL$X), ALL$Y, ALL$X)
ALL
}
Your Example:
ShortestCommonSupersequence(X = c("a","g","b","h","a","g","c"),
Y = c("a","g","b","a","g","b","h","c"))
# J I X Y SCS
# 1 1 1 a a a
# 2 2 2 g g g
# 3 3 3 b b b
# 9 NA 4 h <NA> h
# 4 4 5 a a a
# 5 5 6 g g g
# 6 6 NA <NA> b b
# 7 7 NA <NA> h h
# 8 8 7 c c c
(where the two updated vectors are in columns X and Y.)

Related

Producing equal results for column entries which are the same

I have the piece of code below. What i want is alter the code
such that when the column entries for mat are the same, i get the same result in their respective positions in summation without it performing the fit operation?
So instead of getting
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 5 16 4
I want
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 3 16 3
set.seed(123)
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
newmat=rbind(mat,summation)
newmat
You can find out columns that are duplicates and replace the corresponding summation value with the first value of summation so that you get the same value.
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- ave(summation, match(vals, unique(vals)), FUN = function(x) x[1])
newmat=rbind(mat,summation)
newmat
To pass only unique columns to fit function we can do :
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- apply(mat[, !duplicated(vals)], 2, fit)
summation <- summation[match(vals, unique(vals))]
newmat=rbind(mat,summation)
newmat

Select two sets of column based on row indices

The working example is the best way to show what I'm looking for.
Given input df
df <- data.frame( l = letters[1:10], n = 1:10)
l n
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
8 h 8
9 i 9
10 j 10
I want to select rows from column l based on a vector of starting indices and a length. For example
start <- c(2, 4)
len <- 2
I want to get the output
b c
d e
I tried
df[(start):(start+len),1]
[1] b c d
Levels: a b c d e f g h i j
Warning messages:
1: In (start):(start + len) :
numerical expression has 2 elements: only the first used
2: In (start):(start + len) :
numerical expression has 2 elements: only the first used
Apply doesn't work either.
apply(start, 1, function(x, d) {d[x:(x+2),1]}, d = df)
Error in apply(start, 1, function(x, d) { :
dim(X) must have a positive length
We can use lapply on 'start' to get the sequence with length.out specified as 'len'. Then either extract the 'l' as a vector
df$l[unlist(lapply(start, function(x) seq(x, length.out =len)))]
or as a list of vectors
lapply(start, function(x) as.character(df$l)[seq(x, length.out = len)])
Here are two options to get the exact output you have specified, but first make sure your df$l is not a factor.
df <- data.frame(l = letters[1:10], n = 1:10, stringsAsFactors = FALSE)
start <- c(2, 4)
len <- 2
for (s in start) {cat(df[s:(s+len-1), 1]); cat("\n")}
# b c
# d e
cat(sapply(start, function(x) {paste(df[x:(x+len-1), 1], collapse = " ")}), sep = "\n")
# b c
# d e

Match values between two matrices based on max number of matches needed

I want to get only the rows from matrix A that have only any 5 or any 4 numbers existing in matrix B. Is there an R function I can use to solve it?
Μatrix A
1 2 3 4 5
2 3 5 6 7
3 5 7 8 1
2 7 5 4 3
matrix B:
1 2 4 5 6
2 4 1 3 7
3 5 7 9 8
5 8 9 2 6
if I ask for 5 numbers from B to match rows in A I will get no matching row.
if I ask for 4 numbers from B to match rows in A I will get :
B1 - A1
B2 - A1, A4
B3 - A3
I don't know of anything built into R that will work, but this custom function may help you get what you're after.
find.matches <- function(A, B, num.matches){
# Set rownames for the matrix
rownames(A) = paste0(deparse(substitute(A)), 1:nrow(A))
rownames(B) = paste0(deparse(substitute(B)), 1:nrow(B))
# Create matrix indicating matching items
out <- t(apply(cbind(seq_len(nrow(B)),B), 1,
function(y) {
cur.b = y[-1]
res <- apply(cbind(seq_len(nrow(A)),A), 1,
function(z) {
cur.a = z[-1]
ifelse(sum(table(cur.a[cur.a %in% cur.b])) == num.matches, rownames(A)[z[1]], NA)})}))
# Create list of matching items
out <- apply(out, 1, function(x) paste(x[!is.na(x)]))
# Remove non matches from list
out <- out[lapply(out,length) > 0]
if(length(out) > 0){
# Convert list to a vector
out <- paste0(names(out), " - ", lapply(c(out), paste, collapse = ", "))
# Print the vector
cat(out, sep = "\n")
} else{
print("No Matches Found")
}
}
# Create matrices to compare
A <- matrix(c(1,2,3,2,2,3,5,7,3,5,7,5,4,6,8,4,5,7,1,3), nrow = 4)
B <- matrix(c(1,2,3,5,2,4,5,8,4,1,7,9,5,3,9,2,6,7,8,6), nrow = 4)
# Compare matrices
find.matches(A, B, 4)

R increment column value based on character match in another column

I have an array of data that can be modelled roughly as follows:
x=data.frame(c(2,2,2),c(3,4,6),c(3,4,6), c("x/-","x/x","-/x"))
names(x)=c("A","B","C","D")
I wish to change the values of B to (C + 1) if only the first character in D is -.
I have tried using the following and iterating over the rows:
if(substring(x$D, 1,1) == "-")
{
x$B <- x$C + 1
}
However this method does not seem to work. Is there a way to do this using sapply?
Thanks,
Matt
You can use ifelse and within
within(x, B <- ifelse(substr(D, 1, 1) == "-", C + 1, B))
# A B C D
# 1 2 3 3 x/-
# 2 2 4 4 x/x
# 3 2 7 6 -/x
Or instead of substr, you could use grepl
within(x, B <- ifelse(grepl("^[-]", D), C + 1, B))
# A B C D
# 1 2 3 3 x/-
# 2 2 4 4 x/x
# 3 2 7 6 -/x
data.table solution.
require(data.table)
x <- data.table(c(2,2,2), c(3,4,6), c(3,4,6), c("x/-","x/x","-/x"))
setnames(x, c("A","B","C","D"))
x[grepl("^[-]", D), B := C + 1]

cbind: is there a way to have missing values set to NA?

Please forgive me if I missed an answer to such a simple question.
I want to use cbind() to bind two columns. One of them is a single entry shorter in length.
Can I have R supply an NA for the missing value?
The documentation discusses a deparse.level argument but this doesn't seem to be my solution.
Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?
Try this:
x <- c(1:5)
y <- c(4:1)
length(y) = length(x)
cbind(x,y)
x y
[1,] 1 4
[2,] 2 3
[3,] 3 2
[4,] 4 1
[5,] 5 NA
or this:
x <- c(4:1)
y <- c(1:5)
length(x) = length(y)
cbind(x,y)
x y
[1,] 4 1
[2,] 3 2
[3,] 2 3
[4,] 1 4
[5,] NA 5
I think this will do something similar to what DWin suggested and work regardless of which vector is shorter:
x <- c(4:1)
y <- c(1:5)
lengths <- max(c(length(x), length(y)))
length(x) <- lengths
length(y) <- lengths
cbind(x,y)
The code above can also be condensed to:
x <- c(4:1)
y <- c(1:5)
length(x) <- length(y) <- max(c(length(x), length(y)))
cbind(x,y)
EDIT
Here is what I came up with to address the question:
"Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?"
inserted into the original post by Matt O'Brien.
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
cbind(x,y)
# x y
# [1,] NA 1
# [2,] 4 2
# [3,] 3 3
# [4,] 2 4
# [5,] 1 5
Here is a function:
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
return(cbind(x,y))
}
my.cbind(x,y,first)
my.cbind(c(1:5),c(4:1),1)
my.cbind(c(1:5),c(4:1),0)
my.cbind(c(1:4),c(5:1),1)
my.cbind(c(1:4),c(5:1),0)
my.cbind(c(1:5),c(5:1),1)
my.cbind(c(1:5),c(5:1),0)
This version allows you to cbind two vectors of different mode:
x <- c(4:1)
y <- letters[1:5]
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
x <- as.data.frame(x)
y <- as.data.frame(y)
return(data.frame(x,y))
}
my.cbind(x,y,first)
# x y
# 1 NA a
# 2 4 b
# 3 3 c
# 4 2 d
# 5 1 e
my.cbind(c(1:5),letters[1:4],1)
my.cbind(c(1:5),letters[1:4],0)
my.cbind(c(1:4),letters[1:5],1)
my.cbind(c(1:4),letters[1:5],0)
my.cbind(c(1:5),letters[1:5],1)
my.cbind(c(1:5),letters[1:5],0)
A while back I had put together a function called Cbind that was meant to do this sort of thing. In its current form, it should be able to handle vectors, data.frames, and matrices as the input.
For now, the function is here: https://gist.github.com/mrdwab/6789277
Here is how one would use the function:
x <- 1:5
y <- letters[1:4]
z <- matrix(1:4, ncol = 2, dimnames = list(NULL, c("a", "b")))
Cbind(x, y, z)
# x y z_a z_b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA
Cbind(x, y, z, first = FALSE)
# x y z_a z_b
# 1 1 <NA> NA NA
# 2 2 a NA NA
# 3 3 b NA NA
# 4 4 c 1 3
# 5 5 d 2 4
The two three functions required are padNA, dotnames, and Cbind, which are defined as follows:
padNA <- function (mydata, rowsneeded, first = TRUE) {
## Pads vectors, data.frames, or matrices with NA
temp1 = colnames(mydata)
rowsneeded = rowsneeded - nrow(mydata)
temp2 = setNames(
data.frame(matrix(rep(NA, length(temp1) * rowsneeded),
ncol = length(temp1))), temp1)
if (isTRUE(first)) rbind(mydata, temp2)
else rbind(temp2, mydata)
}
dotnames <- function(...) {
## Gets the names of the objects passed through ...
vnames <- as.list(substitute(list(...)))[-1L]
vnames <- unlist(lapply(vnames,deparse), FALSE, FALSE)
vnames
}
Cbind <- function(..., first = TRUE) {
## cbinds vectors, data.frames, and matrices together
Names <- dotnames(...)
datalist <- setNames(list(...), Names)
nrows <- max(sapply(datalist, function(x)
ifelse(is.null(dim(x)), length(x), nrow(x))))
datalist <- lapply(seq_along(datalist), function(x) {
z <- datalist[[x]]
if (is.null(dim(z))) {
z <- setNames(data.frame(z), Names[x])
} else {
if (is.null(colnames(z))) {
colnames(z) <- paste(Names[x], sequence(ncol(z)), sep = "_")
} else {
colnames(z) <- paste(Names[x], colnames(z), sep = "_")
}
}
padNA(z, rowsneeded = nrows, first = first)
})
do.call(cbind, datalist)
}
Part of the reason I stopped working on the function was that the gdata package already has a function called cbindX that handles cbinding data.frames and matrices with different numbers of rows. It will not work directly on vectors, so you need to convert them to data.frames first.
library(gdata)
cbindX(data.frame(x), data.frame(y), z)
# x y a b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA

Resources