Assign multiple results from function when grouping - r

I have this code which does exactly what I want but I have to call my function three times for every group which seems very inefficient.
library(data.table)
myRegr = function(x, y) {
regr = lm.fit(cbind(1, x), y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(y)
return (c(k = k, m = m, r2 = r2))
}
dt = data.table(a = c(0, 0, 0, 1, 1, 1),
x = c(12, 21, 15, 34, 32, 31),
y = c(3, 1, 6, 4, 2, 8))
result = dt[,list(minX = min(x),
minY = min(y),
k = myRegr(x, y)["k"],
m = myRegr(x, y)["m"],
r2 = myRegr(x, y)["r2"]
),
by = list(a)]
print(result)
Outputs:
a minX minY k m r2
0 12 1 -0.3095238 8.285714 0.3176692
1 31 2 -1.0000000 37.000000 0.2500000
Any idea how I can rewrite this to only call the function once?
UPDATE:
My example didn't cover the complete problem as I have a fourth column which I'm selecting on, here is a better example:
library(data.table)
myRegr = function(x, y) {
regr = lm.fit(cbind(1, x), y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(y)
return (c(k = k, m = m, r2 = r2))
}
df = data.frame(a = c(0, 0, 0, 1, 1, 1),
x = c(12, 21, 15, 34, 32, 31),
y = c(3, 1, 6, 4, 2, 8),
time = as.POSIXct(c("2019-01-01 08:12:00", "2019-01-01 08:13:00", "2019-01-01 08:14:00", "2019-01-01 08:12:00", "2019-01-01 08:13:00", "2019-01-01 08:14:00")))
dt = data.table(df)
result = dt[, list(firstX = x[time == min(time)],
firstY = y[time == min(time)],
k = myRegr(x, y)["k"],
m = myRegr(x, y)["m"],
r2 = myRegr(x, y)["r2"]
),
by = a]
print(result)
Outputs:
a firstX firstY k m r2
0 12 3 -0.3095238 8.285714 0.3176692
1 34 4 -1.0000000 37.000000 0.2500000
Tried wrapping it all in a function but it actually slowed things down:
library(data.table)
myRegrList = function(group) {
firstX = group[,x[time == min(time)]]
firstY = group[,y[time == min(time)]]
regr = lm.fit(cbind(1, group$x), group$y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(group$y)
return (list(firstX = firstX, firstY = firstY, k = k, m = m, r2 = r2))
}
result = dt[, myRegrList(.SD), by = a]
print(result)

If you make your function return a list you only need to call
dt[, myRegr(x, y), by = a]
# a minX minY k m r2
#1: 0 12 1 -0.3095238 8.285714 0.3176692
#2: 1 31 2 -1.0000000 37.000000 0.2500000
With
myRegr = function(x, y) {
regr = lm.fit(cbind(1, x), y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(y)
return (list(# minX = min(x),
# minY = min(y),
k = k,
m = m,
r2 = r2))
}
update
You might subset for x and y values and then join with the result of your function
result <- dt[dt[, .I[which.min(time)], by = a]$V1, .(a, x, y)]
result <- result[dt[, myRegr(x, y), by = a], on = .(a)]
result
# a x y k m r2
#1: 0 12 3 -0.3095238 8.285714 0.3176692
#2: 1 34 4 -1.0000000 37.000000 0.2500000

You can modify your function to return a vector and dcast final result:
library(data.table)
myRegr = function(x, y) {
regr <- lm.fit(cbind(1, x), y)
c(
regr$coef[[1]],
regr$coef[[2]],
1 - var(regr$residuals) / var(y)
)
}
result <- df[, .(minX = min(x), minY = min(y), myRegr(x, y), c("m", "k", "r2")), a]
dcast(result, a + minX + minY ~ V4, value.var = "V3")
This solution is not perfect as I have to create V4 (add c("m", "k", "r2") vector). There should be a better way to do this (perhaps even not to use dcast). Maybe more experienced data.table users could advice on this?
Data:
df <- data.table(
a = c(0, 0, 0, 1, 1, 1),
x = c(12, 21, 15, 34, 32, 31),
y = c(3, 1, 6, 4, 2, 8)
)

Related

Change Error Message to An Instruction for Users

When I run this R code I get Error in order(res2$seed): argument 1 is not a vector as an error message in the function call at first instance but when I change the range of i to be something different like in function call at second instance, I get the expected data frame format that I want.
The Function
abc <- function(a, z, n, ar11, p, d, q, sd = sd, j1, arr1, n_cores){
future::plan(future::multisession)
n_cores <- parallel::detectCores()
cl <- parallel::makeCluster(n_cores)
doParallel::registerDoParallel(cores = n_cores)
message('processing...')
`%dopar%` <- foreach::`%dopar%`
i <- a:z
res <- foreach::foreach(i = a:z, .packages = c('foreach', 'forecast')) %dopar% {
set.seed(i)
mod <- stats::arima.sim(n = n, model = list(ar = c(ar11), order = c(p, d, q)), sd = sd)
best.mod <- forecast::auto.arima(mod, ic = "aicc")
(cf <- best.mod$coef)
if (length(cf) == 0) {
rep(NA, 2)
} else if (all(grepl(c("ar1|intercept"), names(cf))) &
substr(cf["ar1"], 1, j1) %in% arr1) {
c(cf, seed = i)
} else {
rep(NA, 2)
}
}
message(' done!\n')
res1 = res[!sapply(res, anyNA)]
parallel::stopCluster(cl)
options(max.print = .Machine$integer.max)
res2 <- tibble::tibble(Reduce(function(...) merge(..., all = T), lapply(res1, function(x) as.data.frame(t(x)))))
res2[order(res2$seed), ]
res2 <- Reduce(function(...) merge(..., all = T), lapply(res1, function(x) as.data.frame(t(x))))
res2[order(res2$seed), ]
}
Call Function at First Instance
abc(a = 280000, z = 281000, n = 10, p = 1, d = 0, q = 0, ar11 = 0.8, sd = 1, j1 = 4, arr1 = "0.80")
#Error in order(res2$seed) : argument 1 is not a vector
Call Function at Second Instance
abc(a = 289800, z = 289989, n = 10, p = 1, d = 0, q = 0, ar11 = 0.8, sd = 1, j1 = 4, arr1 = "0.80")
#ar1 seed
#1 0.8000000 289805
#2 0.8000368 289989
I want to change Error in order(res2$seed): argument 1 is not a vector when need be to instruction for this R function useers to Try another range of seeds
You can either look before you leap by testing if the seed column exists:
abc <- function(a, z, n, ar11, p, d, q, sd = sd, j1, arr1, n_cores){
# ...code as in OP...
res2 <- tibble::tibble(Reduce(function(...) merge(..., all = T), lapply(res1, function(x) as.data.frame(t(x)))))
if (!("seed" %in% colnames(res2))) {
warning("Try another range of seeds", call. = FALSE)
} else {
res2[order(res2$seed), ]
}
}
abc(a = 280000, z = 281000, n = 10, p = 1, d = 0, q = 0, ar11 = 0.8, sd = 1, j1 = 4, arr1 = "0.80")
# processing...
# done!
#
# Warning message:
# Try another range of seeds
Or ask for forgiveness instead of permission using tryCatch() and suppressWarnings() for a slightly more generic approach:
abc <- function(a, z, n, ar11, p, d, q, sd = sd, j1, arr1, n_cores){
# ...code as in OP...
res2 <- tibble::tibble(Reduce(function(...) merge(..., all = T), lapply(res1, function(x) as.data.frame(t(x)))))
tryCatch(
suppressWarnings(res2[order(res2$seed), ]),
error = \(err) {
if (grepl("argument 1 is not a vector", err$message)) {
warning("Try another range of seeds", call. = FALSE)
} else {
stop(err)
}
}
)
}
abc(a = 280000, z = 281000, n = 10, p = 1, d = 0, q = 0, ar11 = 0.8, sd = 1, j1 = 4, arr1 = "0.80")
# processing...
# done!
#
# Warning message:
# Try another range of seeds
That said, it’s better in my opinion to throw an error than a warning when a function doesn’t return the expected output. Especially if other code will depend on the result of this function. You can throw an error with your desired message by replacing warning() with stop().

Solutions to a system of inequalities in R

Suppose I have the following system of inequalities:
-2x + y <= -3
1.25x + y <= 2.5
y >= -3
I want to find multiple tuples of (x, y) that satisfy the above inequalities.
library(Rglpk)
obj <- numeric(2)
mat <- matrix(c(-2, 1, 1.25, 1, 0, 1), nrow = 3)
dir <- c("<=", "<=", ">=")
rhs <- c(-3, 2.5, -3)
Rglpk_solve_LP(obj = obj, mat = mat, dir = dir, rhs = rhs)
Using the above code only seems to return 1 possible solution tuple (1.5, 0). Is possible to return other solution tuples?
Edit: Based on the comments, I would be interested to learn if there are any functions that could help me find the corner points.
Actually to understand the possible answers for the given question we can try to solve the system of inequalities graphically.
There was a nice answer concerning plotting of inequations in R at stackowerflow. Using the given aproach we can plot the following graph:
library(ggplot2)
fun1 <- function(x) 2*x - 3 # this is the same as -2x + y <= -3
fun2 <- function(x) -1.25*x + 2.5 # 1.25x + y <= 2.5
fun3 <- function(x) -3 # y >= -3
x1 = seq(-1,5, by = 1/16)
mydf = data.frame(x1, y1=fun1(x1), y2=fun2(x1),y3= fun3(x1))
mydf <- transform(mydf, z = pmax(y3,pmin(y1,y2)))
ggplot(mydf, aes(x = x1)) +
geom_line(aes(y = y1), colour = 'blue') +
geom_line(aes(y = y2), colour = 'green') +
geom_line(aes(y = y3), colour = 'red') +
geom_ribbon(aes(ymin=y3,ymax = z), fill = 'gray60')
All the possible (infinite by number) tuples lie inside the gray triangle.
The vertexes can be found using the following code.
obj <- numeric(2)
mat <- matrix(c(-2, 1.25, 1, 1), nrow = 2)
rhs <- matrix(c(-3, 2.5), nrow = 2)
aPoint <- solve(mat, rhs)
mat <- matrix(c(-2, 0, 1, 1), nrow = 2)
rhs <- matrix(c(-3, -3), nrow = 2)
bPoint <- solve(mat, rhs)
mat <- matrix(c(1.25, 0, 1, 1), nrow = 2)
rhs <- matrix(c(2.5, -3), nrow = 2)
cPoint <- solve(mat, rhs)
Note the order of arguments of matrices.
And you get the coordinates:
> aPoint
[,1]
[1,] 1.6923077
[2,] 0.3846154
> bPoint
[,1]
[1,] 0
[2,] -3
> cPoint
[,1]
[1,] 4.4
[2,] -3.0
All the codes below are with base R only (no need library(Rglpk))
1. Corner Points
If you want to get all the corner points, here is one option
A <- matrix(c(-2, 1.25, 0, 1, 1, -1), nrow = 3)
b <- c(-3, 2.5, 3)
# we use `det` to check if the coefficient matrix is singular. If so, we return `Inf`.
xh <-
combn(nrow(A), 2, function(k) {
if (det(A[k, ]) == 0) {
rep(NA, length(k))
} else {
solve(A[k, ], b[k])
}
})
# We filter out the points that satisfy the constraint
corner_points <- t(xh[, colSums(A %*% xh <= b, na.rm = TRUE) == length(b)])
such that
> corner_points
[,1] [,2]
[1,] 1.692308 0.3846154
[2,] 0.000000 -3.0000000
[3,] 4.400000 -3.0000000
2. Possible Tuples
If you want to have multiple tuples, e.g., n=10, we can use Monte Carlo simulation (based on the obtained corner_points in the previous step) to select the tuples under the constraints:
xrange <- range(corner_points[, 1])
yrange <- range(corner_points[, 2])
n <- 10
res <- list()
while (length(res) < n) {
px <- runif(1, xrange[1], xrange[2])
py <- runif(1, yrange[1], yrange[2])
if (all(A %*% c(px, py) <= b)) {
res[length(res) + 1] <- list(c(px, py))
}
}
and you will see n possible tuples in a list like below
> res
[[1]]
[1] 3.643167 -2.425809
[[2]]
[1] 2.039007 -2.174171
[[3]]
[1] 0.4990635 -2.3363637
[[4]]
[1] 0.6168402 -2.6736421
[[5]]
[1] 3.687389 -2.661733
[[6]]
[1] 3.852258 -2.704395
[[7]]
[1] 1.7571062 0.1067597
[[8]]
[1] 3.668024 -2.771307
[[9]]
[1] 2.108187 -1.365349
[[10]]
[1] 2.106528 -2.134310
First of all, the matrix representing the three equations needs a small correction, because R fills matrices column by column :
-2x + y <= -3
1.25x + y <= 2.5
y >= -3
mat <- matrix(c(-2, 1.25, 0, 1, 1, 1), nrow = 3
# and not : mat <- matrix(c(-2, 1, 1.25, 1, 0, 1), nrow = 3)
To get different tuples, you could modify the objective function :
obj <- numeric(2) results in an objective function 0 * x + 0 * y which is always equal to 0 and can't be maximized : the first valid x,y will be selected.
Optimization on x is achieved by using obj <- c(1,0), resulting in maximization / minimization of 1 * x + 0 * y.
Optimization on y is achieved by using obj <- c(0,1).
#setting the bounds is necessary, otherwise optimization occurs only for x>=0 and y>=0
bounds <- list(lower = list(ind = c(1L, 2L), val = c(-Inf, -Inf)),
upper = list(ind = c(1L, 2L), val = c(Inf, Inf)))
# finding maximum x: obj = c(1,0), max = T
Rglpk_solve_LP(obj = c(10,0), mat = mat, dir = dir, rhs = rhs,bound=bounds, max = T)$solution
# [1] 4.4 -3.0
# finding minimum x: obj = c(1,0), max = F
Rglpk_solve_LP(obj = c(10,0), mat = mat, dir = dir, rhs = rhs,bound=bounds, max = F)$solution
#[1] 0 -3
# finding maximum y: obj = c(0,1), max = T
Rglpk_solve_LP(obj = c(0,1), mat = mat, dir = dir, rhs = rhs,bound=bounds, max = T)$solution
#[1] 1.6923077 0.3846154

Need to fix a problem with seq function in R

I have to create a function with 2 arguments (a,b) that will return the first n multiples of 3 that are less than or equal to a. Here is what i am doing:
f <- function(a, b){
v = seq(from = 0, to= a, by = 3, length.out = b)
return(v)
}
It says that the seq() has too many arguments, and I understand why. If i remove the 'from', there would be some cases where the vector wouldnt started with zero. How could i fix the problem?
THank you
seq supports either by= or length.out=, not both. You can have the same affect with head(seq(...)):
seq(from = 0, to = 20, by = 3, length.out = 4)
# Error in seq.default(from = 0, to = 20, by = 3, length.out = 4) :
# too many arguments
seq(from = 0, to = 20, by = 3)
# [1] 0 3 6 9 12 15 18
head(seq(from = 0, to = 20, by = 3), n = 4)
# [1] 0 3 6 9
which for your function should be:
f <- function(a, b){
head(seq(from = 0, to = a, by = 3), n = b)
}
f(20, 4)
# [1] 0 3 6 9
How about the following?
f <- function(a, b) {
if (3 * b > a) return(seq(0L, a, by = 3L))
return(seq(0, by = 3L, length.out = b))
}

loop with certain values is not working

I just need help for the first loop! I would like to run the loop for each certain value of m (see first line in code) but its running only for 1:10? The outcome shoud be stored in the last rows msediff1 to msediff100! Also i need the graphics for each value of m!Thanks in advance!
m = c(1,2,3,4,5,6,7,8,9,10,25,50,100)
for (m in 1:length(unique(m))){
n <- 150
x1 <- rnorm(n = n, mean = 10, sd = 4)
R <- 100 # Number of reps
results.true <- matrix(NA , ncol = 2, nrow = R)
colnames(results.true) <- c("beta0.hat", "beta1.hat")
results.diff <- matrix(NA, ncol = 2, nrow = R)
colnames(results.diff) <- c("beta0.hat", "betadiff.hat")
sigma <- 1.2
beta <- c(1.2)
X <- cbind(x1)
if (m==1){d0 <- .7071; d <- c(-.7071)}
if (m==2){d0 = .8090; d = c(-.5,-.309)}
if (m==3){d0 = .8582; d = c(-.3832,-.2809,-.1942) }
if (m==4){d0 = .8873; d = c(-.3090,-.2464,-.1901,-.1409)}
if (m==5){d0 <- .9064; d <- c(-.2600,-.2167,-.1774,-.1420,-.1103)}
if (m==6){d0 = .92; d = c(-.2238,-.1925,-.1635,-.1369,-.1126,-.0906)}
if (m==7){d0 = .9302; d = c(-.1965,-.1728,-.1506,-.1299,-.1107,-.093,-.0768)}
if (m==8){d0 = .9380; d = c(-.1751,-.1565,-.1389,-.1224,-.1069,-.0925,-.0791,-.0666)}
if (m==9){d0 = .9443; d = c(-.1578,-.1429,-.1287,-.1152,-.1025,-.0905,-.0792,-.0687,-.0538)}
if (m==10){d0 <- .9494;
d <- c(-.1437, -.1314, -.1197, -.1085, -.0978, -.0877, -.0782, -.0691, -.0606, -.0527)}
if (m==25){d0 <- 0.97873;
d <- c(-0.06128, -0.05915, -0.05705, -0.05500, -0.05298, -0.05100, -0.04906, -0.04715, -0.04528, -0.04345, -0.04166, -0.03990, -0.03818, -0.03650, -0.03486, -0.03325, -0.03168, -0.03015, -0.02865, -0.02719,
-0.02577, -0.02438, -0.02303, -0.02171, -0.02043) }
if (m==50) {d0 <- 0.98918;
d <- c(-0.03132, -0.03077, -0.03023, -0.02969, -0.02916, -0.02863, -0.02811, -0.02759, -0.02708, -0.02657, -0.02606, -0.02556, -0.02507, -0.02458, -0.02409, -0.02361, -0.02314, -0.02266, -0.02220, -0.02174, -0.02128, -0.02083, -0.02038, -0.01994, -0.01950, -0.01907, -0.01864, -0.01822, -0.01780, -0.01739,-0.01698,-0.01658,-0.01618,-0.01578,-0.01539,-0.01501,-0.01463,-0.01425,-0.01388,-0.01352,
-0.01316,-0.01280,-0.01245,-0.01210,-0.01176,-0.01142,-0.01108,-0.01075,-0.01043,-0.01011) }
if (m==100) { d0 <- 0.99454083;
d <- c(-0.01583636,-0.01569757,-0.01555936,-0.01542178,-0.01528478,-0.01514841,-0.01501262,-0.01487745,-0.01474289,-0.01460892,
-0.01447556,-0.01434282,-0.01421067,-0.01407914,-0.01394819,-0.01381786,-0.01368816,-0.01355903,-0.01343053,-0.01330264,
-0.01317535,-0.01304868,-0.01292260,-0.01279714,-0.01267228,-0.01254803,-0.01242439,-0.01230136,-0.01217894,-0.01205713,
-0.01193592,-0.01181533,-0.01169534,-0.01157596,-0.01145719,-0.01133903,-0.01122148,-0.01110453,-0.01098819,-0.01087247,
-0.01075735,-0.01064283,-0.01052892,-0.01041563,-0.01030293,-0.01019085,-0.01007937,-0.00996850,-0.00985823,-0.00974857,
-0.00963952,-0.00953107,-0.00942322,-0.00931598,-0.00920935,-0.00910332,-0.00899789,-0.00889306,-0.00878884,-0.00868522,
-0.00858220,-0.00847978,-0.00837797,-0.00827675,-0.00817614,-0.00807612,-0.00797670,-0.00787788,-0.00777966,-0.00768203,
-0.00758500,-0.00748857,-0.00739273,-0.00729749,-0.00720284,-0.00710878,-0.00701532,-0.00692245,-0.00683017,-0.00673848,
-0.00664738,-0.00655687,-0.00646694,-0.00637761,-0.00628886,-0.00620070,-0.00611312,-0.00602612,-0.00593971,-0.00585389,
-0.00576864,-0.00568397,-0.00559989,-0.00551638,-0.00543345,-0.00535110,-0.00526933,-0.00518813,-0.00510750,-0.00502745) }
for(r in 1:R){
u <- rnorm(n = n, mean = 0, sd = sigma)
y <- X%*%beta + u
yy = d0* y[(m+1):n]; Xd <- d0* x1[(m+1):n];
for (i in 1:m) { yy <- yy + d[i]* y[(m+1-i):(n-i) ]
Xd = Xd + d[i]* x1[(m+1-i):(n-i)] }
reg.true <- lm(y ~ x1)
reg.diff <- lm(yy ~ Xd)
results.true[r, ] <- coef(reg.true)
results.diff[r, ] <- coef(reg.diff)
}
results.true
results.diff
beta
apply(results.true, MARGIN = 2, FUN = mean)
apply(results.diff, MARGIN = 2, FUN = mean)
co <- 2
dens.true <- density(results.true[, co])
dens.diff <- density(results.diff[, co])
win.graph()
plot(dens.true,
xlim = range(c(results.true[, co], results.diff[, co])),
ylim = range(c(dens.true$y, dens.diff$yy)),
main = "beta estimation true vs. diff", lwd = 2,)
lines(density(results.diff[, co]), col = "red", lwd = 2)
abline(v = beta, col = "blue", lwd = 2)
legend(x=1.24,y=12,c("outcome true","outcome diff"),lty=c(1,1),col =c("black","red") )
legend(x=1.12,y=12,c("m=",m))
#Mean Squared Error
mse=mean(reg.true$residuals^2)
if (m==1) {msediff1=mean(reg.diff$residuals^2)}
if (m==2) {msediff2=mean(reg.diff$residuals^2)}
if (m==3) {msediff3=mean(reg.diff$residuals^2)}
if (m==4) {msediff4=mean(reg.diff$residuals^2)}
if (m==5) {msediff5=mean(reg.diff$residuals^2)}
if (m==6) {msediff6=mean(reg.diff$residuals^2)}
if (m==7) {msediff7=mean(reg.diff$residuals^2)}
if (m==8) {msediff8=mean(reg.diff$residuals^2)}
if (m==9) {msediff9=mean(reg.diff$residuals^2)}
if (m==10) {msediff10=mean(reg.diff$residuals^2)}
if (m==25) {msediff25=mean(reg.diff$residuals^2)}
if (m==50) {msediff50=mean(reg.diff$residuals^2)}
if (m==100) {msediff100=mean(reg.diff$residuals^2)}
}
I can see an error in the code.
m = c(1,2,3,4,5,6,7,8,9,10,25,50,100)
for (m in 1:length(unique(m))){
As soon as the loop starts, m is changed. It's not what's in the first line anymore...
Try, for (ind in 1:length(unique(m))){ if that's not the intention.

Subset list, keep names

I have list out like this:
u <- list(a = list(b = 1, c = 2),
x = list(k = list(ka = 1, kb = 3),
l = list(la = 1, la = 4)))
v <- list(a = list(b = 1, c = 2),
x = list(m = list(ma = 5, mb = 8),
n = list(na = 5, nb = 8)))
w <- list(a = list(b = 1, c = 2),
x = list(o = list(oa = 4, ob = 1),
p = list(pa = 8, pb = 0)))
out <- list(u, v, w)
I would like to create another list where there are elements k, l, m, n, o, p and names of the list elements are preserved. I found a solution, but looks sub-optimal:
x <- lapply(out, function(y) y[['x']])
o <- list()
for (a in x) {
o <- c(o, a)
}
> str(o, max.level = 1)
List of 6
$ k:List of 2
$ l:List of 2
$ m:List of 2
$ n:List of 2
$ o:List of 2
$ p:List of 2
Is there a better way?
The loop could be replaced with unlist:
res <- unlist( lapply(out,"[[","x"), recursive=FALSE)
identical(res,o)
# [1] TRUE
My lapply is the same as in the OP; it's just a shortcut.
As #akrun suggested, you could more closely mirror the OP's loop with
do.call("c", lapply(out, '[[', 'x'))

Resources