Interpolating a spline within dplyr - r

I am trying interpolate splines for the following example data:
trt depth root carbon
A 2 1 14
A 4 2 18
A 6 3 18
A 8 3 17
A 10 1 12
B 2 3 16
B 4 4 18
B 6 4 17
B 8 2 15
B 10 1 12
in the following way:
new_df<-df%>%
group_by(trt)%>%
summarise_each(funs(splinefun(., x=depth, method="natural")))
I get an Error: not a vector, but I don't see why not. Am I not expressing the function in the right way?

Do you want a dataset that contains the values interpolated? If so, I've expanded the dataset to contain the desired x locations before the splines are calculated.
The resolution of those points are determined in the second line of the expand.grid() function. Just make sure the original depth points are a subset of the expanded depth points (eg, don't use something uneven like by=.732).
library(magrittr)
ds <- readr::read_csv("trt,depth,root,carbon\nA,2,1,14\nA,4,2,18\nA,6,3,18\nA,8,3,17\nA,10,1,12\nB,2,3,16\nB,4,4,18\nB,6,4,17\nB,8,2,15\nB,10,1,12")
ds_depths_possible <- expand.grid(
depth = seq(from=min(ds$depth), max(ds$depth), by=.5), #Decide resolution here.
trt = c("A", "B"),
stringsAsFactors = FALSE
)
ds_intpolated <- ds %>%
dplyr::right_join(ds_depths_possible, by=c("trt", "depth")) %>% #Incorporate locations to interpolate
dplyr::group_by(trt) %>%
dplyr::mutate(
root_interpolated = spline(x=depth, y=root , xout=depth)$y,
carbon_interpolated = spline(x=depth, y=carbon, xout=depth)$y
) %>%
dplyr::ungroup()
ds_intpolated
Output:
Source: local data frame [34 x 6]
trt depth root carbon root_interpolated carbon_interpolated
(chr) (dbl) (int) (int) (dbl) (dbl)
1 A 2.0 1 14 1.000000 14.00000
2 A 2.5 NA NA 1.195312 15.57031
3 A 3.0 NA NA 1.437500 16.72917
4 A 3.5 NA NA 1.710938 17.52344
5 A 4.0 2 18 2.000000 18.00000
6 A 4.5 NA NA 2.289062 18.21094
7 A 5.0 NA NA 2.562500 18.22917
8 A 5.5 NA NA 2.804688 18.13281
9 A 6.0 3 18 3.000000 18.00000
10 A 6.5 NA NA 3.132812 17.88281
.. ... ... ... ... ... ...
In the graphs above, the little points & lines are interpolated. The big fat points are observed.
library(ggplot2)
ggplot(ds_intpolated, aes(x=depth, y=root_interpolated, color=trt)) +
geom_line() +
geom_point(shape=1) +
geom_point(aes(y=root), size=5, alpha=.3, na.rm=T) +
theme_bw()
ggplot(ds_intpolated, aes(x=depth, y=carbon_interpolated, color=trt)) +
geom_line() +
geom_point(shape=1) +
geom_point(aes(y=carbon), size=5, alpha=.3, na.rm=T) +
theme_bw()
If you want an additional example, here's some recent code and slides. We needed a rolling median for some missing points, and linear stats::approx() for some others. Another option is also stats::loess(), but it's arguments aren't as similar as approx() and spline().

I gave up trying to get dplyr::summarise_each (and also tried dplyr::summarise, since your choice of functions didn't seem to match you desire for multiple column input to return only two functions.) I'm not sure it's possible in dply. Here's what might be called the canonical method of approaching this:
lapply( split(df, df$trt), function(d) splinefun(x=d$depth, y=d$carbon) )
#-------------
$A
function (x, deriv = 0L)
{
deriv <- as.integer(deriv)
if (deriv < 0L || deriv > 3L)
stop("'deriv' must be between 0 and 3")
if (deriv > 0L) {
z0 <- double(z$n)
z[c("y", "b", "c")] <- switch(deriv, list(y = z$b, b = 2 *
z$c, c = 3 * z$d), list(y = 2 * z$c, b = 6 * z$d,
c = z0), list(y = 6 * z$d, b = z0, c = z0))
z[["d"]] <- z0
}
res <- .splinefun(x, z)
if (deriv > 0 && z$method == 2 && any(ind <- x <= z$x[1L]))
res[ind] <- ifelse(deriv == 1, z$y[1L], 0)
res
}
<bytecode: 0x7fe56e4853f8>
<environment: 0x7fe56efd3d80>
$B
function (x, deriv = 0L)
{
deriv <- as.integer(deriv)
if (deriv < 0L || deriv > 3L)
stop("'deriv' must be between 0 and 3")
if (deriv > 0L) {
z0 <- double(z$n)
z[c("y", "b", "c")] <- switch(deriv, list(y = z$b, b = 2 *
z$c, c = 3 * z$d), list(y = 2 * z$c, b = 6 * z$d,
c = z0), list(y = 6 * z$d, b = z0, c = z0))
z[["d"]] <- z0
}
res <- .splinefun(x, z)
if (deriv > 0 && z$method == 2 && any(ind <- x <= z$x[1L]))
res[ind] <- ifelse(deriv == 1, z$y[1L], 0)
res
}
<bytecode: 0x7fe56e4853f8>
<environment: 0x7fe56efc4db8>

Related

R function for returning multiplication result of a function

appreciate your guidance as im new to R programme. basically i've created a function to check whether the value is even or odd.
i wish to create a new result column whereby 'even' results in the original value * 2, and 'odd' results in the original value - 5.
not sure where i've gone wrong with the second part of the code but i am trying to figure out where can i include my 'check' column in the second function to specify it should be checked for even or odd.
i only learnt about ifelse(check(df$check) but it doesnt seem to work in my instance.
much appreciated!
## print 'odd' or 'even' results in df
check = function(df,col){
df['check'] =
ifelse(df[,col] %% 2 ==0, 'even', 'odd')
return(df)
}
# multiplication and subtraction for odd_even results
checkresult = function(df,col){
df['res'] =
ifelse(check(df) == 'even', df[,col] * 2, df[,col]-5)
return(df)
}
checkresult(df)
The simplest way to do it is by not implementing a new function, just use ifelse() as it was intended:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
df$res <- ifelse(df$x %% 2 == 0, df$x * 2, df$x - 5)
df |> head()
#> x res
#> 1 10 20
#> 2 7 2
#> 3 6 12
#> 4 3 -2
#> 5 9 4
#> 6 10 20
If you need to implement a function that returns a dataframe:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
my_func <- function(dframe, column){
vec <- dframe[,column]
dframe$res <- ifelse(vec %% 2 == 0, vec * 2, vec - 5)
return(dframe)
}
my_func(df, "x") |> head()
#> x res
#> 1 10 20
#> 2 7 2
#> 3 6 12
#> 4 3 -2
#> 5 9 4
#> 6 10 20
Edit 1
If you want the parity of the number, one extra line is required:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
my_func <- function(dframe, column){
vec <- dframe[,column]
dframe$parity <- ifelse(vec %% 2 == 0, "EVEN", "ODD")
dframe$res <- ifelse(vec %% 2 == 0, vec * 2, vec - 5)
return(dframe)
}
my_func(df, "x") |> head()
#> x parity res
#> 1 10 EVEN 20
#> 2 7 ODD 2
#> 3 6 EVEN 12
#> 4 3 ODD -2
#> 5 9 ODD 4
#> 6 10 EVEN 20

Counting split rules in decision trees in R

I'm trying to count each unique split rule from a data frame of decision trees in R. For example, if I have a data frame containing 4 trees like the one shown below:
df <- data.frame(
var = c('x10', NA, NA,
'x10', NA, 'x7', NA, NA,
'x5', 'x2', NA, NA, 'x9', NA, NA,
'x5', NA, NA),
num = c(1,1,1,
2,2,2,2,2,
1,1,1,1,1,1,1,
2,2,2),
iter = c(rep(1, 8), rep(2, 10))
)
> df
var num iter
1 x10 1 1
2 <NA> 1 1
3 <NA> 1 1
4 x10 2 1
5 <NA> 2 1
6 x7 2 1
7 <NA> 2 1
8 <NA> 2 1
9 x5 1 2
10 x2 1 2
11 <NA> 1 2
12 <NA> 1 2
13 x9 1 2
14 <NA> 1 2
15 <NA> 1 2
16 x5 2 2
17 <NA> 2 2
18 <NA> 2 2
The var column contains the variable name used in the splitting rule and is ordered by depth first. So, for example, the 4 trees created from that data would look like this:
I'm trying to find a way to return the count of each pair of variables used in a split rule, but grouped by iter. For example, if we look at the 2nd tree (i.e.,num == 2, iter == 1) we can see that x7 splits on x10. so, the pair x10:x7 appears 1 time when iter == 1.
My desired output would look something like this:
allSplits count iter
1 x10:x7 1 1
2 x5:x2 1 2
3 x5:x9 1 2
Any suggestions as to how I could do this?
There is probably a package that knows how to operate on this kind of data frame, but maybe these two hand-crafted recursive functions can get you started.
mkTree <- function(x, pos = 1L) {
var <- x[pos]
if (is.na(var)) {
list(NA_character_, NULL, NULL, 1L)
} else {
node <- vector("list", 4L)
node[[1L]] <- var
node[[2L]] <- l <- Recall(x, pos + 1L)
node[[3L]] <- r <- Recall(x, pos + 1L + l[[4L]])
node[[4L]] <- 1L + l[[4L]] + r[[4L]]
node
}
}
tabTree <- function(tree, sep = ":") {
x <- rep.int(NA_character_, tree[[4L]])
pos <- 1L
recurse <- function(subtree) {
var1 <- subtree[[1L]]
if (!is.na(var1)) {
for (i in 2:3) {
var2 <- subtree[[c(i, 1L)]]
if (!is.na(var2)) {
x[pos] <<- paste0(var1, sep, var2)
pos <<- pos + 1L
Recall(subtree[[i]])
}
}
}
}
recurse(tree)
x <- x[!is.na(x)]
if (length(x)) {
x <- factor(x)
setNames(tabulate(x), levels(x))
} else {
integer(0L)
}
}
mkTree transforms into recursive lists the segments of var in your data frame that specify a tree. Nodes in these recursive structures have the form:
list(variable_name, left_node, right_node, subtree_size)
tabTree takes the mkTree result and returns a named integer vector tabulating the splits. So you could do
f <- function(x) tabTree(mkTree(x))
L <- tapply(df[["var"]], df[c("num", "iter")], f, simplify = FALSE)
to get a list matrix storing the tabulated splits for each [num, iter] pair (i.e., for each tree).
L
## iter
## num 1 2
## 1 integer,0 integer,2
## 2 1 integer,0
L[2L, 1L]
## [[1]]
## x10:x7
## 1
L[1L, 2L]
## [[1]]
## x5:x2 x5:x9
## 1 1
And you could sum over num to get tabulated splits for each level of iter.
g <- function(l) {
x <- unlist(unname(l))
tapply(x, names(x), sum)
}
apply(L, 2L, g)
## $`1`
## x10:x7
## 1
## $`2`
## x5:x2 x5:x9
## 1 1

Step/ramp function in R

I'm trying to come up with a generic R function to produce figure (b) in the image, where x is the x-axis and g is the y-axis. I'm trying to come up with a function f with the prototype f(x, start_x, dx, init_g, end_g) where x is a vector of ints representing timesteps (e.g. 1:100), start_x represents the step to start the ramp, dx represents the distance between start_x and end of ramp. init_g is the starting value of the function on the vertical axis, and end_g is the value at the end of the ramp. The function will return the ramp which I can plot to get that figure.
If dx=0, we get a step function like in figure (a). Also, the ramp can slope down depending on whether end_g is more or less than init_g.
I don't care about figures (c) or (d) in the image. I just can't figure out what to do to get the part between start_x and start_x + dx. Thanks for the help.
It only takes four x and four g values to define a plot like this.
plot_fun <- function(x, start_x, dx, init_g, end_g) {
x <- c(x[1], start_x, start_x + dx, tail(x, 1))
g <- c(init_g, init_g, end_g, end_g)
plot(x, g, type = "l")
}
plot_fun(x = 1:100, start_x = 20, dx = 30, init_g = 2, end_g = 5)
plot_fun(x = 1:100, start_x = 20, dx = 0, init_g = 2, end_g = 5)
If you just want to generate a vector g of the same length as x that is needed to produce the plot, here is how that can work:
make_g <- function(x, start_x, dx, init_g, end_g) {
require(dplyr)
require(tidyr)
x_g <- data.frame(x = c(x[1], start_x + dx),
g = c(init_g, end_g))
x_g <- data.frame(x) %>%
left_join(x_g) %>%
fill(g, .direction = "down")
return(x_g$g)
}
make_g(x = 1:100, start_x = 20, dx = 30, init_g = 2, end_g = 5)
Joining, by = "x"
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 5 5 5 5 5
[55] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
I figured out something that seems to work, not sure exactly why. There's probably a more elegant way to do it.
f <- function(x, x0, dx, g0, g1) {
s <- (x0 - dx) * (g1-g0)/dx
ifelse(x < x0, g0,
ifelse(x < (x0+dx), (x-dx) * (g1 - g0)/dx + (g0 - s),
g1))
}
Here's a function that returns data that you can use to plot.
f <- function(x, start_x, dx, init_g, end_g, type = c("ramp", "step")) {
type <- match.arg(type)
y <- numeric(length(x))
if (!length(x)) return(y)
end_x <- start_x + dx
y[x < start_x] <- init_g
y[x > end_x] <- end_g
mid <- (x >= start_x & x <= end_x)
y[mid] <-
if (type == "ramp") {
approx(c(start_x, end_x), c(init_g, end_g), xout = x[mid])$y
} else c(rep(init_g, length(mid) - 1), end_g)
return(data.frame(x = x, y = y))
}
Demonstration:
plot(y ~ x, data = f(1:100, 50, 10, 10, 20, "ramp"), type = "b")
plot(y ~ x, data = f(1:100, 50, 10, 10, 20, "step"), type = "b")

In R, how do I make an iterative calculation without using a loop?

Here is a simple example of one type of iterative calc:
vals <- data.frame( "x"=c( 14, 15, 12, 10, 17 ), "ema"=0 )
vals$ema[1] <- vals$x[1]
K <- 0.90
for( jj in 2:nrow( vals ) )
vals$ema[jj] <- K * vals$ema[jj-1] + (1-K) * vals$x[jj]
vals
x ema
1 14 14.0000
2 15 14.1000
3 12 13.8900
4 10 13.5010
5 17 13.8509
The more involved examples use if...else to determine the next value:
for( jj in 2:nrow( vals ) )
if( K * vals$ema[jj-1] + (1-K) * vals$x[jj] < 5.0 )
vals$ema[jj] <- 5.0
else if( K * vals$ema[jj-1] + (1-K) * vals$x[jj] > 15.0 )
vals$ema[jj] <- 15.0
else
vals$ema[jj] <- K * vals$ema[jj-1] + (1-K) * vals$x[jj]
I am not sure if it would be more involved or not, but the decision can be based on the previous value as well:
K1 <- 0.999
K2 <- 0.95
K3 <- 0.90
for( jj in 2:now( vals ) )
if( vals$ema[jj-1] < 0.0 )
vals$ema[jj] <- K1 * vals$ema[jj-1] + (1-K1) * vals$x[jj]
else if( vals$ema[jj-1] > 100.0 )
vals$ema[jj] <- K3 * vals$ema[jj-1] + (1-K3) * vals$x[jj]
else
vals$ema[jj] <- K2 * vals$ema[jj-1] + (1-K2) * vals$x[jj]
This answer by WaltS to a similar question I had about recursive calculations provides two potential solutions. Adapting one of them to your question:
vals$ema.Reduce <- Reduce(function(myema, x) K * myema + (1-K) * x,
x = tail(vals$x, -1), init = 14, accumulate = TRUE)
vals
# x ema ema.Reduce
#1 14 14.0000 14.0000
#2 15 14.1000 14.1000
#3 12 13.8900 13.8900
#4 10 13.5010 13.5010
#5 17 13.8509 13.8509
Explanation of the function:
Reduce() is calculating ema for the current jj row, and myema is the previous value (jj-1) starting with init. The x vector required by Reduce consists of vals$x for the rows you want to calculate: row 2 to the last row = x = tail(vals$x, -1). The accumulate = TRUE option returns the vector instead of the final value. (Note the x term in Reduce is a generic term and not the same as vals$x in the example data. For calculations that do not require the additional term vals$x, a vector of 0's would work (as in the linked answer)).
Adding if/else conditions to Reduce (note: init is changed in these examples to illustrate the conditional statements):
Reduce(function(myema, x) {
if(myema < 5) {
5
} else if(myema > 15) {
15
} else {
K * myema + (1-K) * x
}
}, x = tail(vals$x, -1), init = 16, accumulate = TRUE)
#[1] 16.000 15.000 14.700 14.230 14.507
Reduce(function(myema, x) {
if(myema < 0) {
K1 * myema + (1-K1) * x
} else if(myema > 100) {
K3 * myema + (1-K3) * x
} else {
K2 * myema + (1-K2) * x
}
}, x = tail(vals$x, -1), init = 110, accumulate = TRUE)
#[1] 110.00000 100.50000 91.65000 87.56750 84.03912
K3*110 + (1-K3)*vals$x[2] #100.5
K3*100.5 + (1-K3)*vals$x[3] #91.65
K2*91.65 + (1-K2)*vals$x[4] #87.5675
K2*87.5675 + (1-K2)*vals$x[5] #84.03912
Seems this succeeds:
vals$ema2 <- c(vals$ema[1], K*vals$ema[1:4] +(1-K)*vals$x[2:5] )
> vals
x ema ema2
1 14 14.0000 14.0000
2 15 14.1000 14.1000
3 12 13.8900 13.8900
4 10 13.5010 13.5010
5 17 13.8509 13.8509
Sometimes it is best to work with the time series and data munging libraries. In this case, lag.zoo from the zoo library handles lagged values for you.
library(dplyr)
library(zoo)
vals <- data.frame( "x"=c( 14, 15, 12, 10, 17 ) )
K <- 0.90
vals %>% mutate(ema = (1-K)*vals$x + K*(lag(vals$x,1)))
For this particular problem, the weights for each value is some function of k and i (as in the ith value). We can write a function for the weights, and vectorize it:
weights <- function(i, k) {
q <- 1-k
qs <- '^'(q, 1:i)
rev(qs) * c(1, rep(k, (i-1)))
}
v_weights <- Vectorize(weights)
An example:
> v_weights(1:3, .1)
[[1]]
[1] 0.9
[[2]]
[1] 0.81 0.09
[[3]]
[1] 0.729 0.081 0.090
where these are the weights of the "preceding" x values. We proceed with some matrix algebra. I write a function to make the weights (above) into a matrix:
weight_matrix <- function(j, k) {
w <- v_weights(1:j, k=k)
Ws <- matrix(0, j+1, j+1)
Ws[row(Ws)+col(Ws)<(j+2)] <- unlist(rev(w))
Ws <- t(Ws)
Ws[row(Ws)+col(Ws)==(j+2)] <- k
Ws[(j+1),1] <- 1
Ws
}
Example:
> weight_matrix(3, .1)
[,1] [,2] [,3] [,4]
[1,] 0.729 0.081 0.09 0.1
[2,] 0.810 0.090 0.10 0.0
[3,] 0.900 0.100 0.00 0.0
[4,] 1.000 0.000 0.00 0.0
Then multiply this with the vector of xs. Function: ema <- function(x, k) rev(weight_matrix(length(x)-1, k) %*% x[1:(length(x))]).
To get the dataframe above (I "flipped" the k so it's 0.1 instead of 0.9):
> x <- c(14, 15, 12, 10, 17)
> k <- .1
> vals <- data.frame("x"=x, "ema"=ema(x, k))
> vals
x ema
1 14 14.0000
2 15 14.1000
3 12 13.8900
4 10 13.5010
5 17 13.8509
#shayaa's answer is 99% correct. dplyr implements lag just fine, and apart from a typo in that answer (one value of x should be ema), extraneous calls to column names, and a missing default value (otherwise it puts NA in the first row) it works perfectly well.
library(dplyr)
vals %>% mutate(ema = K*lag(ema, 1, default=ema[1]) + (1-K)*x)
#> x ema
#> 1 14 14.0000
#> 2 15 14.1000
#> 3 12 13.8900
#> 4 10 13.5010
#> 5 17 13.8509

Combining vectors of unequal length and non-unique values

I would like to do the following:
combine into a data frame, two vectors that
have different length
contain sequences found also in the other vector
contain sequences not found in the other vector
sequences that are not found in other vector are never longer than 3 elements
always have same first element
The data frame should show the equal sequences in the two vectors aligned, with NA in the column if a vector lacks a sequence present in the other vector.
For example:
vector 1 vector 2 vector 1 vector 2
1 1 a a
2 2 g g
3 3 b b
4 1 or h a
1 2 a g
2 3 g b
5 4 c h
5 c
should be combined into data frame
1 1 a a
2 2 g g
3 3 b b
4 NA h NA
1 1 or a a
2 2 g g
NA 3 NA b
NA 4 NA h
5 5 c c
What I did, is to search for merge, combine, cbind, plyr examples but was not able to find solutions. I am afraid I will need to start write a function with nested for loops to solve this problem.
Note - this was proposed as an answer to the first version of the OP. The question has been modified since then but the problem is still not well-defined in my opinion.
Here is a solution that works with your integer example and would also work with numeric vectors. I am also assuming that:
both vectors contain the same number of sequences
a new sequence starts where value[i+1] <= value[i]
If your vectors are non-numeric or if one of my assumptions does not fit your problem, you'll have to clarify.
v1 <- c(1,2,3,4,1,2,5)
v2 <- c(1,2,3,1,2,3,4,5)
v1.sequences <- split(v1, cumsum(c(TRUE, diff(v1) <= 0)))
v2.sequences <- split(v2, cumsum(c(TRUE, diff(v2) <= 0)))
align.fun <- function(s1, s2) { #aligns two sequences
s12 <- sort(unique(c(s1, s2)))
cbind(ifelse(s12 %in% s1, s12, NA),
ifelse(s12 %in% s2, s12, NA))
}
do.call(rbind, mapply(align.fun, v1.sequences, v2.sequences))
# [,1] [,2]
# [1,] 1 1
# [2,] 2 2
# [3,] 3 3
# [4,] 4 NA
# [5,] 1 1
# [6,] 2 2
# [7,] NA 3
# [8,] NA 4
# [9,] 5 5
I maintain that your problem might be solved in terms of the shortest common supersequence. It assumes that your two vectors each represent one sequence. Please give the code below a try.
If it still does not solve your problem, you'll have to explain exactly what you mean by "my vector contains not one but many sequences": define what you mean by a sequence and tell us how sequences can be identified by scanning through your two vectors.
Part I: given two sequences, find the longest common subsequence
LongestCommonSubsequence <- function(X, Y) {
m <- length(X)
n <- length(Y)
C <- matrix(0, 1 + m, 1 + n)
for (i in seq_len(m)) {
for (j in seq_len(n)) {
if (X[i] == Y[j]) {
C[i + 1, j + 1] = C[i, j] + 1
} else {
C[i + 1, j + 1] = max(C[i + 1, j], C[i, j + 1])
}
}
}
backtrack <- function(C, X, Y, i, j) {
if (i == 1 | j == 1) {
return(data.frame(I = c(), J = c(), LCS = c()))
} else if (X[i - 1] == Y[j - 1]) {
return(rbind(backtrack(C, X, Y, i - 1, j - 1),
data.frame(LCS = X[i - 1], I = i - 1, J = j - 1)))
} else if (C[i, j - 1] > C[i - 1, j]) {
return(backtrack(C, X, Y, i, j - 1))
} else {
return(backtrack(C, X, Y, i - 1, j))
}
}
return(backtrack(C, X, Y, m + 1, n + 1))
}
Part II: given two sequences, find the shortest common supersequence
ShortestCommonSupersequence <- function(X, Y) {
LCS <- LongestCommonSubsequence(X, Y)[c("I", "J")]
X.df <- data.frame(X = X, I = seq_along(X), stringsAsFactors = FALSE)
Y.df <- data.frame(Y = Y, J = seq_along(Y), stringsAsFactors = FALSE)
ALL <- merge(LCS, X.df, by = "I", all = TRUE)
ALL <- merge(ALL, Y.df, by = "J", all = TRUE)
ALL <- ALL[order(pmax(ifelse(is.na(ALL$I), 0, ALL$I),
ifelse(is.na(ALL$J), 0, ALL$J))), ]
ALL$SCS <- ifelse(is.na(ALL$X), ALL$Y, ALL$X)
ALL
}
Your Example:
ShortestCommonSupersequence(X = c("a","g","b","h","a","g","c"),
Y = c("a","g","b","a","g","b","h","c"))
# J I X Y SCS
# 1 1 1 a a a
# 2 2 2 g g g
# 3 3 3 b b b
# 9 NA 4 h <NA> h
# 4 4 5 a a a
# 5 5 6 g g g
# 6 6 NA <NA> b b
# 7 7 NA <NA> h h
# 8 8 7 c c c
(where the two updated vectors are in columns X and Y.)

Resources