Writing a summation formula using variables from multiple observations - r

I am trying to create a new variable for each observation using the following formula:
Index = ∑(BAj / DISTANCEij)
where:
j = focal observation; i= other observation
Basically, I'm taking the focal individual (i) and finding the euclidean distance between it and another point and dividing the other points BA by that distance. Do that for all the other points and then sum them all and repeat all of this for each point.
Here is some sample data:
ID <- 1:4
BA <- c(3, 5, 6, 9)
x <- c(0, 2, 3, 7)
y <- c(1, 3, 4, 9)
df <- data.frame(ID, BA, x, y)
print(df)
ID BA x y
1 1 3 0 1
2 2 5 2 3
3 3 6 3 4
4 4 9 7 9
Currently, I've extracted out vectors and created a formula to calculate part of the formula shown here:
vec1 <- df[1, ]
vec2 <- df[2, ]
dist <- function(vec1, vec2) vec1$BA/sqrt((vec2$x - vec1$x)^2 +
(vec2$y - vec1$y)^2)
My question is how do I repeat this with the x and y values for vec2 changing for each new other point with vec1 remaining the same and then sum them all together?

We may loop over the row sequence, extract the data and apply the dist function
library(dplyr)
library(purrr)
df %>%
mutate(dist_out = map_dbl(row_number(), ~ {
othr <- cur_data()[-.x,]
cur <- cur_data()[.x, ]
sum(dist(cur, othr))
}))
-output
ID BA x y dist_out
1 1 3 0 1 2.049983
2 2 5 2 3 5.943485
3 3 6 3 4 6.593897
4 4 9 7 9 3.404545

Here are two base R ways.
1. for loop
ID <- 1:4
BA <- c(3, 5, 6, 9)
x <- c(0, 2, 3, 7)
y <- c(1, 3, 4, 9)
df <- data.frame(ID, BA, x, y)
n <- nrow(df)
d <- dist(df[c("x", "y")], upper = TRUE)
d <- as.matrix(d)
Index <- numeric(n)
for(j in seq_len(n)) {
d_j <- d[-j, j, drop = TRUE]
Index[j] <- sum(df$BA[j]/d_j)
}
Index
#> [1] 2.049983 5.943485 6.593897 3.404545
Created on 2022-08-18 by the reprex package (v2.0.1)
2. sapply loop
Index <- sapply(seq_len(n), \(j) sum(df$BA[j]/d[-j, j, drop = TRUE]))
Index
#> [1] 2.049983 5.943485 6.593897 3.404545
Created on 2022-08-18 by the reprex package (v2.0.1)

Related

Producing equal results for column entries which are the same

I have the piece of code below. What i want is alter the code
such that when the column entries for mat are the same, i get the same result in their respective positions in summation without it performing the fit operation?
So instead of getting
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 5 16 4
I want
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 3 16 3
set.seed(123)
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
newmat=rbind(mat,summation)
newmat
You can find out columns that are duplicates and replace the corresponding summation value with the first value of summation so that you get the same value.
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- ave(summation, match(vals, unique(vals)), FUN = function(x) x[1])
newmat=rbind(mat,summation)
newmat
To pass only unique columns to fit function we can do :
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- apply(mat[, !duplicated(vals)], 2, fit)
summation <- summation[match(vals, unique(vals))]
newmat=rbind(mat,summation)
newmat

How do I use output of indexes to subset my dataframe?

I want to match vector 1 to vector 2 to see if items in vector 1 and found in vector 2. Then I want to create 2 new vectors - a subset of vector 1 of the rows of values contained both vectors, and a subset of vector 1 for the values not found in both vectors. match() function followed by which(is.na()) works great for small data sets, but I have a data set with 1000 elements.
Data1 <- c(1, 2, 3, 4, 5)
Data2 <- c(1, 3, 5, 6, 7)
#Match vector1 to vector2
A <- match(Data1, Data2)
[1] 1 NA 2 NA 3
#to obtain positions of non matching elements
x <- which(is.na(A), arr.ind = TRUE)
[1] 2 4
Data1[c(2,4)]
#to obtain positions of matching elements
y < which(A >= 1)
[1] 1 3 5
Data1[c(1,3,5)]
Try this so you do not have to deal with the NAs from match():
Data1 <- c(1, 2, 3, 4, 5)
Data2 <- c(1, 3, 5, 6, 7)
# Values of Data1 in Data2
A <- Data1[Data1 %in% Data2]
A
# output:
# > A
# [1] 1 3 5
# create not in function
'%ni%' <- Negate('%in%')
# Values of Data1 not in Data2
B <- Data1[Data1 %ni% Data2]
B
# output:
# > B
# [1] 2 4

Groupby bins and aggregate in R

I have data like (a,b,c)
a b c
1 2 1
2 3 1
9 2 2
1 6 2
where 'a' range is divided into n (say 3) equal parts and aggregate function calculates b values (say max) and grouped by at 'c' also.
So the output looks like
a_bin b_m(c=1) b_m(c=2)
1-3 3 6
4-6 NaN NaN
7-9 NaN 2
Which is MxN where M=number of a bins, N=unique c samples or all range
How do I approach this? Can any R package help me through?
A combination of aggregate, cut and reshape seems to work
df <- data.frame(a = c(1,2,9,1),
b = c(2,3,2,6),
c = c(1,1,2,2))
breaks <- c(0, 3, 6, 9)
# Aggregate data
ag <- aggregate(df$b, FUN=max,
by=list(a=cut(df$a, breaks, include.lowest=T), c=df$c))
# Reshape data
res <- reshape(ag, idvar="a", timevar="c", direction="wide")
There would be easier ways.
If your dataset is dat
res <- sapply(split(dat[, -3], dat$c), function(x) {
a_bin <- with(x, cut(a, breaks = c(1, 3, 6, 9), include.lowest = T, labels = c("1-3",
"4-6", "7-9")))
c(by(x$b, a_bin, FUN = max))
})
res1 <- setNames(data.frame(row.names(res), res),
c("a_bin", "b_m(c=1)", "b_m(c=2)"))
row.names(res1) <- 1:nrow(res1)
res1
a_bin b_m(c=1) b_m(c=2)
1 1-3 3 6
2 4-6 NA NA
3 7-9 NA 2
I would use a combination of data.table and reshape2 which are both fully optimized for speed (not using for loops from apply family).
The output won't return the unused bins.
v <- c(1, 4, 7, 10) # creating bins
temp$int <- findInterval(temp$a, v)
library(data.table)
temp <- setDT(temp)[, list(b_m = max(b)), by = c("c", "int")]
library(reshape2)
temp <- dcast.data.table(temp, int ~ c, value.var = "b_m")
## colnames(temp) <- c("a_bin", "b_m(c=1)", "b_m(c=2)") # Optional for prettier table
## temp$a_bin<- c("1-3", "7-9") # Optional for prettier table
## a_bin b_m(c=1) b_m(c=2)
## 1 1-3 3 6
## 2 7-9 NA 2

Data frame subset with specified sum of elements

Having a data frame like this:
df <- data.frame(a=c(31, 18, 0, 1, 20, 2),
b=c(1, 0, 0, 3, 1, 1),
c=c(12, 0, 9, 8, 10, 3))
> df
a b c
1 31 1 12
2 18 0 0
3 0 0 9
4 1 3 8
5 20 1 10
6 2 1 3
How can I do a random subset so the sum of rows and columns is equal to a value, i.e , 100?
As I understand your question, you're trying to sample a subset of the rows and columns of your matrix so that they sum to a target value.
You can use integer optimization to accomplish this. You'll have a binary decision variable for each row, column, and cell, and constraints to force the cell values to be equal to the product of the row and column values. I'll use the lpSolve package to do this, because it has a convenient mechanism to get multiple optimal solutions. We can then use the sample function to select between them:
library(lpSolve)
get.subset <- function(dat, target) {
nr <- nrow(dat)
nc <- ncol(dat)
nvar <- nr + nc + nr*nc
# Cells upper bounded by row and column variable values (r and c) and lower bounded by r+c-1
mat <- as.matrix(do.call(rbind, apply(expand.grid(seq(nr), seq(nc)), 1, function(x) {
r <- x[1]
c <- x[2]
pos <- nr + nc + (r-1)*nc + c
ltc <- rep(0, nvar)
ltc[nr + c] <- 1
ltc[pos] <- -1
ltr <- rep(0, nvar)
ltr[r] <- 1
ltr[pos] <- -1
gtrc <- rep(0, nvar)
gtrc[nr + c] <- 1
gtrc[r] <- 1
gtrc[pos] <- -1
return(as.data.frame(rbind(ltc, ltr, gtrc)))
})))
dir <- rep(c(">=", ">=", "<="), nr*nc)
rhs <- rep(c(0, 0, 1), nr*nc)
# Sum of selected cells equals target
mat <- rbind(mat, c(rep(0, nr+nc), as.vector(t(dat))))
dir <- c(dir, "=")
rhs <- c(rhs, target)
res <- lp(objective.in=rep(0, nvar), # Feasibility problem
const.mat=mat,
const.dir=dir,
const.rhs=rhs,
all.bin=TRUE,
num.bin.solns=100 # Number of feasible solutions to get
)
if (res$status != 0) {
return(list(rows=NA, cols=NA, subset=NA, num.sol=0))
}
sol.num <- sample(res$num.bin.solns, 1)
vals <- res$solution[seq((sol.num-1)*nvar+1, sol.num*nvar)]
rows <- which(vals[seq(nr)] >= 0.999)
cols <- which(vals[seq(nr+1, nr+nc)] >= 0.999)
return(list(rows=rows, cols=cols, subset=dat[rows,cols], num.sol=res$num.bin.solns))
}
The function returns the number of subset with that sum and returns the randomly selected subset:
set.seed(144)
get.subset(df, 1)
# $rows
# [1] 1
# $cols
# [1] 2
# $subset
# [1] 1
# $num.sol
# [1] 14
get.subset(df, 100)
# $rows
# [1] 1 2 4 5
# $cols
# [1] 1 3
# $subset
# a c
# 1 31 12
# 2 18 0
# 4 1 8
# 5 20 10
# $num.sol
# [1] 2
get.subset(df, 10000)
# $rows
# [1] NA
# $cols
# [1] NA
# $subset
# [1] NA
# $num.sol
# [1] 0

Including squared predictors in model matrix

I have the following code
x <- c(1, 2, 3)
y <- c(2, 3, 4)
z <- c(3, 4, 5)
df <- data.frame(x, y, z)
model.matrix(x ~ .^4, df)
This gives me a model matrix with predictors $y, z$, and $y:z$. However, I also want y^2 and z^2, and want to use a solution that uses "$.$", since I have lots of other predictors beyond $y$ and $z$. What's the best way to approach this?
Try this:
> x <- c(1, 2, 3)
> y <- c(2, 3, 4)
> z <- c(3, 4, 5)
> df <- data.frame(x, y, z)
>
> #Assuming that your 1st column is the response variable, then I excluded it to have
> #just the independent variables as a new data.frame called df.2
> df.2=df[,-1]
> model.matrix(x ~ .^4+I(df.2^2), df)
(Intercept) y z I(df.2^2)y I(df.2^2)z y:z
1 1 2 3 4 9 6
2 1 3 4 9 16 12
3 1 4 5 16 25 20
attr(,"assign")
[1] 0 1 2 3 3 4

Resources