I have the piece of code below. What i want is alter the code
such that when the column entries for mat are the same, i get the same result in their respective positions in summation without it performing the fit operation?
So instead of getting
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 5 16 4
I want
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 3 16 3
set.seed(123)
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
newmat=rbind(mat,summation)
newmat
You can find out columns that are duplicates and replace the corresponding summation value with the first value of summation so that you get the same value.
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- ave(summation, match(vals, unique(vals)), FUN = function(x) x[1])
newmat=rbind(mat,summation)
newmat
To pass only unique columns to fit function we can do :
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- apply(mat[, !duplicated(vals)], 2, fit)
summation <- summation[match(vals, unique(vals))]
newmat=rbind(mat,summation)
newmat
Related
I am trying to create a new variable for each observation using the following formula:
Index = ∑(BAj / DISTANCEij)
where:
j = focal observation; i= other observation
Basically, I'm taking the focal individual (i) and finding the euclidean distance between it and another point and dividing the other points BA by that distance. Do that for all the other points and then sum them all and repeat all of this for each point.
Here is some sample data:
ID <- 1:4
BA <- c(3, 5, 6, 9)
x <- c(0, 2, 3, 7)
y <- c(1, 3, 4, 9)
df <- data.frame(ID, BA, x, y)
print(df)
ID BA x y
1 1 3 0 1
2 2 5 2 3
3 3 6 3 4
4 4 9 7 9
Currently, I've extracted out vectors and created a formula to calculate part of the formula shown here:
vec1 <- df[1, ]
vec2 <- df[2, ]
dist <- function(vec1, vec2) vec1$BA/sqrt((vec2$x - vec1$x)^2 +
(vec2$y - vec1$y)^2)
My question is how do I repeat this with the x and y values for vec2 changing for each new other point with vec1 remaining the same and then sum them all together?
We may loop over the row sequence, extract the data and apply the dist function
library(dplyr)
library(purrr)
df %>%
mutate(dist_out = map_dbl(row_number(), ~ {
othr <- cur_data()[-.x,]
cur <- cur_data()[.x, ]
sum(dist(cur, othr))
}))
-output
ID BA x y dist_out
1 1 3 0 1 2.049983
2 2 5 2 3 5.943485
3 3 6 3 4 6.593897
4 4 9 7 9 3.404545
Here are two base R ways.
1. for loop
ID <- 1:4
BA <- c(3, 5, 6, 9)
x <- c(0, 2, 3, 7)
y <- c(1, 3, 4, 9)
df <- data.frame(ID, BA, x, y)
n <- nrow(df)
d <- dist(df[c("x", "y")], upper = TRUE)
d <- as.matrix(d)
Index <- numeric(n)
for(j in seq_len(n)) {
d_j <- d[-j, j, drop = TRUE]
Index[j] <- sum(df$BA[j]/d_j)
}
Index
#> [1] 2.049983 5.943485 6.593897 3.404545
Created on 2022-08-18 by the reprex package (v2.0.1)
2. sapply loop
Index <- sapply(seq_len(n), \(j) sum(df$BA[j]/d[-j, j, drop = TRUE]))
Index
#> [1] 2.049983 5.943485 6.593897 3.404545
Created on 2022-08-18 by the reprex package (v2.0.1)
I want to get only the rows from matrix A that have only any 5 or any 4 numbers existing in matrix B. Is there an R function I can use to solve it?
Μatrix A
1 2 3 4 5
2 3 5 6 7
3 5 7 8 1
2 7 5 4 3
matrix B:
1 2 4 5 6
2 4 1 3 7
3 5 7 9 8
5 8 9 2 6
if I ask for 5 numbers from B to match rows in A I will get no matching row.
if I ask for 4 numbers from B to match rows in A I will get :
B1 - A1
B2 - A1, A4
B3 - A3
I don't know of anything built into R that will work, but this custom function may help you get what you're after.
find.matches <- function(A, B, num.matches){
# Set rownames for the matrix
rownames(A) = paste0(deparse(substitute(A)), 1:nrow(A))
rownames(B) = paste0(deparse(substitute(B)), 1:nrow(B))
# Create matrix indicating matching items
out <- t(apply(cbind(seq_len(nrow(B)),B), 1,
function(y) {
cur.b = y[-1]
res <- apply(cbind(seq_len(nrow(A)),A), 1,
function(z) {
cur.a = z[-1]
ifelse(sum(table(cur.a[cur.a %in% cur.b])) == num.matches, rownames(A)[z[1]], NA)})}))
# Create list of matching items
out <- apply(out, 1, function(x) paste(x[!is.na(x)]))
# Remove non matches from list
out <- out[lapply(out,length) > 0]
if(length(out) > 0){
# Convert list to a vector
out <- paste0(names(out), " - ", lapply(c(out), paste, collapse = ", "))
# Print the vector
cat(out, sep = "\n")
} else{
print("No Matches Found")
}
}
# Create matrices to compare
A <- matrix(c(1,2,3,2,2,3,5,7,3,5,7,5,4,6,8,4,5,7,1,3), nrow = 4)
B <- matrix(c(1,2,3,5,2,4,5,8,4,1,7,9,5,3,9,2,6,7,8,6), nrow = 4)
# Compare matrices
find.matches(A, B, 4)
I have a function runBootstrap whose output result is a vector of variable length (depending on # of values for cat, which itself is a product of test). Apologies that this isn't "minimal".
require(dplyr)
test <- function(combo) {
if(combo[1] == 4) {
cat <- 4
} else if((combo[1] == 3 & combo[2] == 2) | (combo[1] == 2 & combo[2] == 2)) {
cat <- 3
} else if((combo[1] == 2 & combo[2] == 1) | (combo[1] == 1 & combo[2] == 2)) {
cat <- 2
} else {
cat <- 1
}
}
arg1.freqs <- c(0.5, 0.2, 0.1, 0.1)
arg2.freqs <- c(0.8, 0.2)
runBootstrap <- function(arg1.freqs, arg2.freqs) {
sim.df <- data.frame(x1 = 1:10000, y1 = NA)
sim.df$x1 <- sample(1:4, 10000, replace = TRUE,
prob = arg1.freqs)
sim.df$y1 <- sample(1:2, 10000, replace = TRUE,
prob = arg2.freqs)
sim.df$cat <- NA
for(i in 1:nrow(sim.df)) {
combo <- c(sim.df[i, 1], sim.df[i, 2])
sim.df$cat[i] <- test(combo)
}
sim.df <- sim.df %>%
select(cat) %>%
group_by(cat) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
sim.df <- as.data.frame(sim.df)
result <- c(sim.df[1, 3], sim.df[2, 3])
}
In this current version there are only two values for cat so result is a vector of length 2; in a future version I will adjust code so that length(result) will equal # values of cat.
When using the function in a for loop, I would like to use the vector values to create new columns in an already existing data.frame df1. The code I've tried thus far is as follows:
df1$result <- NA
for (i in 1:nrow(df1)) {
df1$result[i] <- runBootstrap(arg1.freqs, arg2.freqs)
}
This clearly doesn't work unless the result vector is length = 1. But I don't know the length of the vector until the function runs (although once it runs it will be same length each iteration).
What I would like to achieve is the following:
Example 1: if length(result) == 2
df1.col x1 x2
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
6 6 6 6
Example 2: if length(result) == 3
df1.col x1 x2 x3
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
Thanks for any advice or direction.
edited for clarification
UPDATE - edited with solution
I got it to work as I wanted by creating a blank list, populating, then using rbind as follows:
appendResults <- function(df1, arg1, arg2) {
my.list <- vector("list", nrow(df1))
for (i in 1:nrow(df1)) {
arg1.freqs <- as.numeric(arg1[i, 3:6])
arg2.freqs <- as.numeric(arg2[i, 3:4])
my.list[[i]] <- runBootstrap(arg1.freqs, arg2.freqs)
}
result.df <- do.call(rbind, my.list)
df2 <- do.call(cbind, list(df1, result.df))
}
Check this one, not sure what the result looks like, but this creates empty columns, equal to the length of results, with NAs:
# fake data frame
df1 <- data.frame(x = c(1,2,3), y = c("a", "b", "c"))
# say result has length 3
res <- c(5,6,7)
# make columns with names x1, ..., x + length of res
# and assign NA values to those column
df1[ , paste("x", 1:length(res), sep = "")] <- NA
Please forgive me if I missed an answer to such a simple question.
I want to use cbind() to bind two columns. One of them is a single entry shorter in length.
Can I have R supply an NA for the missing value?
The documentation discusses a deparse.level argument but this doesn't seem to be my solution.
Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?
Try this:
x <- c(1:5)
y <- c(4:1)
length(y) = length(x)
cbind(x,y)
x y
[1,] 1 4
[2,] 2 3
[3,] 3 2
[4,] 4 1
[5,] 5 NA
or this:
x <- c(4:1)
y <- c(1:5)
length(x) = length(y)
cbind(x,y)
x y
[1,] 4 1
[2,] 3 2
[3,] 2 3
[4,] 1 4
[5,] NA 5
I think this will do something similar to what DWin suggested and work regardless of which vector is shorter:
x <- c(4:1)
y <- c(1:5)
lengths <- max(c(length(x), length(y)))
length(x) <- lengths
length(y) <- lengths
cbind(x,y)
The code above can also be condensed to:
x <- c(4:1)
y <- c(1:5)
length(x) <- length(y) <- max(c(length(x), length(y)))
cbind(x,y)
EDIT
Here is what I came up with to address the question:
"Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?"
inserted into the original post by Matt O'Brien.
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
cbind(x,y)
# x y
# [1,] NA 1
# [2,] 4 2
# [3,] 3 3
# [4,] 2 4
# [5,] 1 5
Here is a function:
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
return(cbind(x,y))
}
my.cbind(x,y,first)
my.cbind(c(1:5),c(4:1),1)
my.cbind(c(1:5),c(4:1),0)
my.cbind(c(1:4),c(5:1),1)
my.cbind(c(1:4),c(5:1),0)
my.cbind(c(1:5),c(5:1),1)
my.cbind(c(1:5),c(5:1),0)
This version allows you to cbind two vectors of different mode:
x <- c(4:1)
y <- letters[1:5]
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
x <- as.data.frame(x)
y <- as.data.frame(y)
return(data.frame(x,y))
}
my.cbind(x,y,first)
# x y
# 1 NA a
# 2 4 b
# 3 3 c
# 4 2 d
# 5 1 e
my.cbind(c(1:5),letters[1:4],1)
my.cbind(c(1:5),letters[1:4],0)
my.cbind(c(1:4),letters[1:5],1)
my.cbind(c(1:4),letters[1:5],0)
my.cbind(c(1:5),letters[1:5],1)
my.cbind(c(1:5),letters[1:5],0)
A while back I had put together a function called Cbind that was meant to do this sort of thing. In its current form, it should be able to handle vectors, data.frames, and matrices as the input.
For now, the function is here: https://gist.github.com/mrdwab/6789277
Here is how one would use the function:
x <- 1:5
y <- letters[1:4]
z <- matrix(1:4, ncol = 2, dimnames = list(NULL, c("a", "b")))
Cbind(x, y, z)
# x y z_a z_b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA
Cbind(x, y, z, first = FALSE)
# x y z_a z_b
# 1 1 <NA> NA NA
# 2 2 a NA NA
# 3 3 b NA NA
# 4 4 c 1 3
# 5 5 d 2 4
The two three functions required are padNA, dotnames, and Cbind, which are defined as follows:
padNA <- function (mydata, rowsneeded, first = TRUE) {
## Pads vectors, data.frames, or matrices with NA
temp1 = colnames(mydata)
rowsneeded = rowsneeded - nrow(mydata)
temp2 = setNames(
data.frame(matrix(rep(NA, length(temp1) * rowsneeded),
ncol = length(temp1))), temp1)
if (isTRUE(first)) rbind(mydata, temp2)
else rbind(temp2, mydata)
}
dotnames <- function(...) {
## Gets the names of the objects passed through ...
vnames <- as.list(substitute(list(...)))[-1L]
vnames <- unlist(lapply(vnames,deparse), FALSE, FALSE)
vnames
}
Cbind <- function(..., first = TRUE) {
## cbinds vectors, data.frames, and matrices together
Names <- dotnames(...)
datalist <- setNames(list(...), Names)
nrows <- max(sapply(datalist, function(x)
ifelse(is.null(dim(x)), length(x), nrow(x))))
datalist <- lapply(seq_along(datalist), function(x) {
z <- datalist[[x]]
if (is.null(dim(z))) {
z <- setNames(data.frame(z), Names[x])
} else {
if (is.null(colnames(z))) {
colnames(z) <- paste(Names[x], sequence(ncol(z)), sep = "_")
} else {
colnames(z) <- paste(Names[x], colnames(z), sep = "_")
}
}
padNA(z, rowsneeded = nrows, first = first)
})
do.call(cbind, datalist)
}
Part of the reason I stopped working on the function was that the gdata package already has a function called cbindX that handles cbinding data.frames and matrices with different numbers of rows. It will not work directly on vectors, so you need to convert them to data.frames first.
library(gdata)
cbindX(data.frame(x), data.frame(y), z)
# x y a b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA
I would like to do the following:
combine into a data frame, two vectors that
have different length
contain sequences found also in the other vector
contain sequences not found in the other vector
sequences that are not found in other vector are never longer than 3 elements
always have same first element
The data frame should show the equal sequences in the two vectors aligned, with NA in the column if a vector lacks a sequence present in the other vector.
For example:
vector 1 vector 2 vector 1 vector 2
1 1 a a
2 2 g g
3 3 b b
4 1 or h a
1 2 a g
2 3 g b
5 4 c h
5 c
should be combined into data frame
1 1 a a
2 2 g g
3 3 b b
4 NA h NA
1 1 or a a
2 2 g g
NA 3 NA b
NA 4 NA h
5 5 c c
What I did, is to search for merge, combine, cbind, plyr examples but was not able to find solutions. I am afraid I will need to start write a function with nested for loops to solve this problem.
Note - this was proposed as an answer to the first version of the OP. The question has been modified since then but the problem is still not well-defined in my opinion.
Here is a solution that works with your integer example and would also work with numeric vectors. I am also assuming that:
both vectors contain the same number of sequences
a new sequence starts where value[i+1] <= value[i]
If your vectors are non-numeric or if one of my assumptions does not fit your problem, you'll have to clarify.
v1 <- c(1,2,3,4,1,2,5)
v2 <- c(1,2,3,1,2,3,4,5)
v1.sequences <- split(v1, cumsum(c(TRUE, diff(v1) <= 0)))
v2.sequences <- split(v2, cumsum(c(TRUE, diff(v2) <= 0)))
align.fun <- function(s1, s2) { #aligns two sequences
s12 <- sort(unique(c(s1, s2)))
cbind(ifelse(s12 %in% s1, s12, NA),
ifelse(s12 %in% s2, s12, NA))
}
do.call(rbind, mapply(align.fun, v1.sequences, v2.sequences))
# [,1] [,2]
# [1,] 1 1
# [2,] 2 2
# [3,] 3 3
# [4,] 4 NA
# [5,] 1 1
# [6,] 2 2
# [7,] NA 3
# [8,] NA 4
# [9,] 5 5
I maintain that your problem might be solved in terms of the shortest common supersequence. It assumes that your two vectors each represent one sequence. Please give the code below a try.
If it still does not solve your problem, you'll have to explain exactly what you mean by "my vector contains not one but many sequences": define what you mean by a sequence and tell us how sequences can be identified by scanning through your two vectors.
Part I: given two sequences, find the longest common subsequence
LongestCommonSubsequence <- function(X, Y) {
m <- length(X)
n <- length(Y)
C <- matrix(0, 1 + m, 1 + n)
for (i in seq_len(m)) {
for (j in seq_len(n)) {
if (X[i] == Y[j]) {
C[i + 1, j + 1] = C[i, j] + 1
} else {
C[i + 1, j + 1] = max(C[i + 1, j], C[i, j + 1])
}
}
}
backtrack <- function(C, X, Y, i, j) {
if (i == 1 | j == 1) {
return(data.frame(I = c(), J = c(), LCS = c()))
} else if (X[i - 1] == Y[j - 1]) {
return(rbind(backtrack(C, X, Y, i - 1, j - 1),
data.frame(LCS = X[i - 1], I = i - 1, J = j - 1)))
} else if (C[i, j - 1] > C[i - 1, j]) {
return(backtrack(C, X, Y, i, j - 1))
} else {
return(backtrack(C, X, Y, i - 1, j))
}
}
return(backtrack(C, X, Y, m + 1, n + 1))
}
Part II: given two sequences, find the shortest common supersequence
ShortestCommonSupersequence <- function(X, Y) {
LCS <- LongestCommonSubsequence(X, Y)[c("I", "J")]
X.df <- data.frame(X = X, I = seq_along(X), stringsAsFactors = FALSE)
Y.df <- data.frame(Y = Y, J = seq_along(Y), stringsAsFactors = FALSE)
ALL <- merge(LCS, X.df, by = "I", all = TRUE)
ALL <- merge(ALL, Y.df, by = "J", all = TRUE)
ALL <- ALL[order(pmax(ifelse(is.na(ALL$I), 0, ALL$I),
ifelse(is.na(ALL$J), 0, ALL$J))), ]
ALL$SCS <- ifelse(is.na(ALL$X), ALL$Y, ALL$X)
ALL
}
Your Example:
ShortestCommonSupersequence(X = c("a","g","b","h","a","g","c"),
Y = c("a","g","b","a","g","b","h","c"))
# J I X Y SCS
# 1 1 1 a a a
# 2 2 2 g g g
# 3 3 3 b b b
# 9 NA 4 h <NA> h
# 4 4 5 a a a
# 5 5 6 g g g
# 6 6 NA <NA> b b
# 7 7 NA <NA> h h
# 8 8 7 c c c
(where the two updated vectors are in columns X and Y.)