cbind: is there a way to have missing values set to NA? - r

Please forgive me if I missed an answer to such a simple question.
I want to use cbind() to bind two columns. One of them is a single entry shorter in length.
Can I have R supply an NA for the missing value?
The documentation discusses a deparse.level argument but this doesn't seem to be my solution.
Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?

Try this:
x <- c(1:5)
y <- c(4:1)
length(y) = length(x)
cbind(x,y)
x y
[1,] 1 4
[2,] 2 3
[3,] 3 2
[4,] 4 1
[5,] 5 NA
or this:
x <- c(4:1)
y <- c(1:5)
length(x) = length(y)
cbind(x,y)
x y
[1,] 4 1
[2,] 3 2
[3,] 2 3
[4,] 1 4
[5,] NA 5
I think this will do something similar to what DWin suggested and work regardless of which vector is shorter:
x <- c(4:1)
y <- c(1:5)
lengths <- max(c(length(x), length(y)))
length(x) <- lengths
length(y) <- lengths
cbind(x,y)
The code above can also be condensed to:
x <- c(4:1)
y <- c(1:5)
length(x) <- length(y) <- max(c(length(x), length(y)))
cbind(x,y)
EDIT
Here is what I came up with to address the question:
"Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?"
inserted into the original post by Matt O'Brien.
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
cbind(x,y)
# x y
# [1,] NA 1
# [2,] 4 2
# [3,] 3 3
# [4,] 2 4
# [5,] 1 5
Here is a function:
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
return(cbind(x,y))
}
my.cbind(x,y,first)
my.cbind(c(1:5),c(4:1),1)
my.cbind(c(1:5),c(4:1),0)
my.cbind(c(1:4),c(5:1),1)
my.cbind(c(1:4),c(5:1),0)
my.cbind(c(1:5),c(5:1),1)
my.cbind(c(1:5),c(5:1),0)
This version allows you to cbind two vectors of different mode:
x <- c(4:1)
y <- letters[1:5]
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
x <- as.data.frame(x)
y <- as.data.frame(y)
return(data.frame(x,y))
}
my.cbind(x,y,first)
# x y
# 1 NA a
# 2 4 b
# 3 3 c
# 4 2 d
# 5 1 e
my.cbind(c(1:5),letters[1:4],1)
my.cbind(c(1:5),letters[1:4],0)
my.cbind(c(1:4),letters[1:5],1)
my.cbind(c(1:4),letters[1:5],0)
my.cbind(c(1:5),letters[1:5],1)
my.cbind(c(1:5),letters[1:5],0)

A while back I had put together a function called Cbind that was meant to do this sort of thing. In its current form, it should be able to handle vectors, data.frames, and matrices as the input.
For now, the function is here: https://gist.github.com/mrdwab/6789277
Here is how one would use the function:
x <- 1:5
y <- letters[1:4]
z <- matrix(1:4, ncol = 2, dimnames = list(NULL, c("a", "b")))
Cbind(x, y, z)
# x y z_a z_b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA
Cbind(x, y, z, first = FALSE)
# x y z_a z_b
# 1 1 <NA> NA NA
# 2 2 a NA NA
# 3 3 b NA NA
# 4 4 c 1 3
# 5 5 d 2 4
The two three functions required are padNA, dotnames, and Cbind, which are defined as follows:
padNA <- function (mydata, rowsneeded, first = TRUE) {
## Pads vectors, data.frames, or matrices with NA
temp1 = colnames(mydata)
rowsneeded = rowsneeded - nrow(mydata)
temp2 = setNames(
data.frame(matrix(rep(NA, length(temp1) * rowsneeded),
ncol = length(temp1))), temp1)
if (isTRUE(first)) rbind(mydata, temp2)
else rbind(temp2, mydata)
}
dotnames <- function(...) {
## Gets the names of the objects passed through ...
vnames <- as.list(substitute(list(...)))[-1L]
vnames <- unlist(lapply(vnames,deparse), FALSE, FALSE)
vnames
}
Cbind <- function(..., first = TRUE) {
## cbinds vectors, data.frames, and matrices together
Names <- dotnames(...)
datalist <- setNames(list(...), Names)
nrows <- max(sapply(datalist, function(x)
ifelse(is.null(dim(x)), length(x), nrow(x))))
datalist <- lapply(seq_along(datalist), function(x) {
z <- datalist[[x]]
if (is.null(dim(z))) {
z <- setNames(data.frame(z), Names[x])
} else {
if (is.null(colnames(z))) {
colnames(z) <- paste(Names[x], sequence(ncol(z)), sep = "_")
} else {
colnames(z) <- paste(Names[x], colnames(z), sep = "_")
}
}
padNA(z, rowsneeded = nrows, first = first)
})
do.call(cbind, datalist)
}
Part of the reason I stopped working on the function was that the gdata package already has a function called cbindX that handles cbinding data.frames and matrices with different numbers of rows. It will not work directly on vectors, so you need to convert them to data.frames first.
library(gdata)
cbindX(data.frame(x), data.frame(y), z)
# x y a b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA

Related

data.table avoid recycling

I'm constructing a data.table from two (or more) input vectors with different lengths:
x <- c(1,2,3,4)
y <- c(8,9)
dt <- data.table(x = x, y = y)
And need the shorter vector(s) to be filled with NA rather than recycling their values, resulting in a data.table like this:
x y
1: 1 8
2: 2 9
3: 3 NA
4: 4 NA
Is there a way to achieve this without explicitly filling the shorter vector(s) with NA before passing them to the data.table() constructor?
Thanks!
One can use out of range indices:
library("data.table")
x <- c(1,2,3,4)
y <- c(8,9)
n <- max(length(x), length(y))
dt <- data.table(x = x[1:n], y = y[1:n])
# > dt
# x y
# 1: 1 8
# 2: 2 9
# 3: 3 NA
# 4: 4 NA
Or you can extend y by doing (as #Roland recommended in the comment):
length(y) <- length(x) <- max(length(x), length(y))
dt <- data.table(x, y)
An option is cbind.fill from rowr
library(rowr)
setNames(cbind.fill(x, y, fill = NA), c("x", "y"))
Or place the vectors in a list and then pad NA at the end based on the maximum length of the list elements
library(data.table)
lst <- list(x = x, y = y)
as.data.table(lapply(lst, `length<-`, max(lengths(lst))))
# x y
#1: 1 8
#2: 2 9
#3: 3 NA
#4: 4 NA
The "out of range indices" answer provided by jogo can be extended cleanly to in-place assignment using .N:
x <- c(1,2,3,4)
y <- c(8,9)
n <- max(length(x), length(y))
dt <- data.table(x = x[1:n], y = y[1:n])
z <- c(6,7)
dt[, z := z[1:.N]]
# x y z
# 1: 1 8 6
# 2: 2 9 7
# 3: 3 NA NA
# 4: 4 NA NA

Why R resets dataframe column after function call?

I made simple R script and run:
f <- data.frame(x = c(1,1))
f$y <- NA
test <- function(n) {
f$y[1] <- 1
print(f)
}
test(0)
print(f)
the result is
> test(0)
x y
1 1 1
2 1 NA
> print(f)
x y
1 1 NA
2 1 NA
>
I would expect in the final output (1, 1) (1, NA) but for some reason f$y[1] had been reset to NA.
When I inline the function, the result are ok.
Why function call has this side effect on dataframe column?
Is it avoidable?
How about this:
f <- data.frame(x = c(1,1))
f$y <- NA
test <- function(f) {
f$y[1] <- 1
return(f)
}
f <- test(f)
print(f)
x y
1 1 1
2 1 NA

R - Collapse into vector same member of a list

I have a list with same structure for every member as the following
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
and I need to obtain 3 vectors VAL, ARR and DF, each with the concatenated elements of the corresponding member. such as
# VAL: 0,1,5
# ARR: 1,2,3,4,5,1,3,2,4,9,4,2,1,5,8
# DF: 1,5,3,8,2,6,1,9,4,2,1,7
Looking at similar situations, I have the feeling I need to use a combination of do.call and cbind or lapply but I have no clue. any suggestions?
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
sapply(names(config[[1]]), function(x)
unname(unlist(sapply(config, `[`, x))), USE.NAMES = TRUE)
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Or you can use this clist function
Unfortunately there were no other answers.
(l <- Reduce(clist, config))
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# X1 X2 X1 X2 X1 X2
# 1 1 3 2 1 4 1
# 2 5 8 6 9 2 7
It merges data frames and matrices, so you need to unlist to get the vector you want
l$DF <- unname(unlist(l$DF))
l
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Function
clist <- function (x, y) {
islist <- function(x) inherits(x, 'list')
'%||%' <- function(a, b) if (!is.null(a)) a else b
get_fun <- function(x, y)
switch(class(x %||% y),
matrix = cbind,
data.frame = function(x, y)
do.call('cbind.data.frame', Filter(Negate(is.null), list(x, y))),
factor = function(...) unlist(list(...)), c)
stopifnot(islist(x), islist(y))
nn <- names(rapply(c(x, y), names, how = 'list'))
if (is.null(nn) || any(!nzchar(nn)))
stop('All non-NULL list elements should have unique names', domain = NA)
nn <- unique(c(names(x), names(y)))
z <- setNames(vector('list', length(nn)), nn)
for (ii in nn)
z[[ii]] <- if (islist(x[[ii]]) && islist(y[[ii]]))
Recall(x[[ii]], y[[ii]]) else
(get_fun(x[[ii]], y[[ii]]))(x[[ii]], y[[ii]])
z
}
Another approach, with slightly less code.
un_config <- unlist(config)
un_configNAM <- names(un_config)
vecNAM <- c("VAL", "ARR", "DF")
for(n in vecNAM){
assign(n, un_config[grepl(n, un_configNAM)])
}
This will return 3 vectors as the OP requested. However, generally it is more advantageous to store results in a list as rawr suggests. You of course can adopt the above code so that results are stored within a list.
l <- rep(list(NA), length(vecNAM))
i = 1
for(n in vecNAM){
l[[i]] <- un_config[grepl(n, un_configNAM)]
i = i +1
}

for loop matrix construction

I am trying to construct a matrix in a for loop. I am declaring an empty matrix called matrix1 in my code and I am trying to fill it row-wise with the simulated x, y and z variables:
simulation <- function(ss){
nsim <- 100
matrix1 <- matrix(0, ncol=3, nrow=nsim)
colnames(matrix1) <- c("x", "y", "z")
for(i in 1:nsim) {
set.seed(i)
x <- relevel(as.factor(sample(1:4,ss, replace=TRUE)), ref="4")
y <- relevel(as.factor(sample(1:3,ss, replace=TRUE)), ref="3")
z <- relevel(as.factor(sample(1:2,ss, replace=TRUE)), ref="2")
matrix1[i, ] <- cbind(x, y, z)
}
return(matrix1)
}
Now, when I run this, I am getting an error:
Error in matrix1[i, ] <- cbind(x, y, z) :
number of items to replace is not a multiple of replacement length
.
I don't see why that is happening since matrix1 has 3 columns and I am filling it recursively with the 3 variables x, y and z.
Too fill it row by row, do like this (this is not efficient):
x = data.frame(name = character(), y = numeric())
for(i in 1:10){
x = rbind(x, data.frame(name = letters[i], y = runif(1)))
}
# name y
1 a 0.09931082
2 b 0.85088120
3 c 0.39535348
4 d 0.08633770
5 e 0.08329996
6 f 0.46080032
7 g 0.39309986
8 h 0.01993358
9 i 0.96079532
10 j 0.19371701
If you have a lot of rows, this is very inefficient. In this case, you could try to pre-allocate the rows:
n = 10
x = data.frame(name = rep(NA_character_, n), y = rep(NA_real_, n))
name y
1 <NA> NA
2 <NA> NA
3 <NA> NA
4 <NA> NA
5 <NA> NA
6 <NA> NA
7 <NA> NA
8 <NA> NA
9 <NA> NA
10 <NA> NA
Then loop over the rows and columns to set the values.
for (i in 1:n){
for (j in 1:2){
# x[i, j] = ...
}
}

Combining vectors of unequal length and non-unique values

I would like to do the following:
combine into a data frame, two vectors that
have different length
contain sequences found also in the other vector
contain sequences not found in the other vector
sequences that are not found in other vector are never longer than 3 elements
always have same first element
The data frame should show the equal sequences in the two vectors aligned, with NA in the column if a vector lacks a sequence present in the other vector.
For example:
vector 1 vector 2 vector 1 vector 2
1 1 a a
2 2 g g
3 3 b b
4 1 or h a
1 2 a g
2 3 g b
5 4 c h
5 c
should be combined into data frame
1 1 a a
2 2 g g
3 3 b b
4 NA h NA
1 1 or a a
2 2 g g
NA 3 NA b
NA 4 NA h
5 5 c c
What I did, is to search for merge, combine, cbind, plyr examples but was not able to find solutions. I am afraid I will need to start write a function with nested for loops to solve this problem.
Note - this was proposed as an answer to the first version of the OP. The question has been modified since then but the problem is still not well-defined in my opinion.
Here is a solution that works with your integer example and would also work with numeric vectors. I am also assuming that:
both vectors contain the same number of sequences
a new sequence starts where value[i+1] <= value[i]
If your vectors are non-numeric or if one of my assumptions does not fit your problem, you'll have to clarify.
v1 <- c(1,2,3,4,1,2,5)
v2 <- c(1,2,3,1,2,3,4,5)
v1.sequences <- split(v1, cumsum(c(TRUE, diff(v1) <= 0)))
v2.sequences <- split(v2, cumsum(c(TRUE, diff(v2) <= 0)))
align.fun <- function(s1, s2) { #aligns two sequences
s12 <- sort(unique(c(s1, s2)))
cbind(ifelse(s12 %in% s1, s12, NA),
ifelse(s12 %in% s2, s12, NA))
}
do.call(rbind, mapply(align.fun, v1.sequences, v2.sequences))
# [,1] [,2]
# [1,] 1 1
# [2,] 2 2
# [3,] 3 3
# [4,] 4 NA
# [5,] 1 1
# [6,] 2 2
# [7,] NA 3
# [8,] NA 4
# [9,] 5 5
I maintain that your problem might be solved in terms of the shortest common supersequence. It assumes that your two vectors each represent one sequence. Please give the code below a try.
If it still does not solve your problem, you'll have to explain exactly what you mean by "my vector contains not one but many sequences": define what you mean by a sequence and tell us how sequences can be identified by scanning through your two vectors.
Part I: given two sequences, find the longest common subsequence
LongestCommonSubsequence <- function(X, Y) {
m <- length(X)
n <- length(Y)
C <- matrix(0, 1 + m, 1 + n)
for (i in seq_len(m)) {
for (j in seq_len(n)) {
if (X[i] == Y[j]) {
C[i + 1, j + 1] = C[i, j] + 1
} else {
C[i + 1, j + 1] = max(C[i + 1, j], C[i, j + 1])
}
}
}
backtrack <- function(C, X, Y, i, j) {
if (i == 1 | j == 1) {
return(data.frame(I = c(), J = c(), LCS = c()))
} else if (X[i - 1] == Y[j - 1]) {
return(rbind(backtrack(C, X, Y, i - 1, j - 1),
data.frame(LCS = X[i - 1], I = i - 1, J = j - 1)))
} else if (C[i, j - 1] > C[i - 1, j]) {
return(backtrack(C, X, Y, i, j - 1))
} else {
return(backtrack(C, X, Y, i - 1, j))
}
}
return(backtrack(C, X, Y, m + 1, n + 1))
}
Part II: given two sequences, find the shortest common supersequence
ShortestCommonSupersequence <- function(X, Y) {
LCS <- LongestCommonSubsequence(X, Y)[c("I", "J")]
X.df <- data.frame(X = X, I = seq_along(X), stringsAsFactors = FALSE)
Y.df <- data.frame(Y = Y, J = seq_along(Y), stringsAsFactors = FALSE)
ALL <- merge(LCS, X.df, by = "I", all = TRUE)
ALL <- merge(ALL, Y.df, by = "J", all = TRUE)
ALL <- ALL[order(pmax(ifelse(is.na(ALL$I), 0, ALL$I),
ifelse(is.na(ALL$J), 0, ALL$J))), ]
ALL$SCS <- ifelse(is.na(ALL$X), ALL$Y, ALL$X)
ALL
}
Your Example:
ShortestCommonSupersequence(X = c("a","g","b","h","a","g","c"),
Y = c("a","g","b","a","g","b","h","c"))
# J I X Y SCS
# 1 1 1 a a a
# 2 2 2 g g g
# 3 3 3 b b b
# 9 NA 4 h <NA> h
# 4 4 5 a a a
# 5 5 6 g g g
# 6 6 NA <NA> b b
# 7 7 NA <NA> h h
# 8 8 7 c c c
(where the two updated vectors are in columns X and Y.)

Resources