I’m trying to optimize a function using one of the algorithms that require a gradient. Basically I’m trying to learn how to optimize a function using a gradient in Julia. I’m fairly confident that my gradient is specified correctly. I know this because the similarly defined Matlab function for the gradient gives me the same values as in Julia for some test values of the arguments. Also, the Matlab version using fminunc with the gradient seems to optimize the function fine.
However when I run the Julia script, I seem to get the following error:
julia> include("ex2b.jl")
ERROR: `g!` has no method matching g!(::Array{Float64,1}, ::Array{Float64,1})
while loading ...\ex2b.jl, in ex
pression starting on line 64
I'm running Julia 0.3.2 on a windows 7 32bit machine. Here is the code (basically a translation of some Matlab to Julia):
using Optim
function mapFeature(X1, X2)
degrees = 5
out = ones(size(X1)[1])
for i in range(1, degrees+1)
for j in range(0, i+1)
term = reshape( (X1.^(i-j) .* X2.^(j)), size(X1.^(i-j))[1], 1)
out = hcat(out, term)
end
end
return out
end
function sigmoid(z)
return 1 ./ (1 + exp(-z))
end
function costFunc_logistic(theta, X, y, lam)
m = length(y)
regularization = sum(theta[2:end].^2) * lam / (2 * m)
return sum( (-y .* log(sigmoid(X * theta)) - (1 - y) .* log(1 - sigmoid(X * theta))) ) ./ m + regularization
end
function costFunc_logistic_gradient!(theta, X, y, lam, m)
grad= X' * ( sigmoid(X * theta) .- y ) ./ m
grad[2:end] = grad[2:end] + theta[2:end] .* lam / m
return grad
end
data = readcsv("ex2data2.txt")
X = mapFeature(data[:,1], data[:,2])
m, n = size(data)
y = data[:, end]
theta = zeros(size(X)[2])
lam = 1.0
f(theta::Array) = costFunc_logistic(theta, X, y, lam)
g!(theta::Array) = costFunc_logistic_gradient!(theta, X, y, lam, m)
optimize(f, g!, theta, method = :l_bfgs)
And here is some of the data:
0.051267,0.69956,1
-0.092742,0.68494,1
-0.21371,0.69225,1
-0.375,0.50219,1
-0.51325,0.46564,1
-0.52477,0.2098,1
-0.39804,0.034357,1
-0.30588,-0.19225,1
0.016705,-0.40424,1
0.13191,-0.51389,1
0.38537,-0.56506,1
0.52938,-0.5212,1
0.63882,-0.24342,1
0.73675,-0.18494,1
0.54666,0.48757,1
0.322,0.5826,1
0.16647,0.53874,1
-0.046659,0.81652,1
-0.17339,0.69956,1
-0.47869,0.63377,1
-0.60541,0.59722,1
-0.62846,0.33406,1
-0.59389,0.005117,1
-0.42108,-0.27266,1
-0.11578,-0.39693,1
0.20104,-0.60161,1
0.46601,-0.53582,1
0.67339,-0.53582,1
-0.13882,0.54605,1
-0.29435,0.77997,1
-0.26555,0.96272,1
-0.16187,0.8019,1
-0.17339,0.64839,1
-0.28283,0.47295,1
-0.36348,0.31213,1
-0.30012,0.027047,1
-0.23675,-0.21418,1
-0.06394,-0.18494,1
0.062788,-0.16301,1
0.22984,-0.41155,1
0.2932,-0.2288,1
0.48329,-0.18494,1
0.64459,-0.14108,1
0.46025,0.012427,1
0.6273,0.15863,1
0.57546,0.26827,1
0.72523,0.44371,1
0.22408,0.52412,1
0.44297,0.67032,1
0.322,0.69225,1
0.13767,0.57529,1
-0.0063364,0.39985,1
-0.092742,0.55336,1
-0.20795,0.35599,1
-0.20795,0.17325,1
-0.43836,0.21711,1
-0.21947,-0.016813,1
-0.13882,-0.27266,1
0.18376,0.93348,0
0.22408,0.77997,0
Let me know if you guys need additional details. Btw, this relates to a coursera machine learning course if curious.
The gradient should not be a function to compute the gradient,
but a function to store it
(hence the exclamation mark in the function name, and the second argument in the error message).
The following seems to work.
function g!(theta::Array, storage::Array)
storage[:] = costFunc_logistic_gradient!(theta, X, y, lam, m)
end
optimize(f, g!, theta, method = :l_bfgs)
The same using closures and currying (version for those who got used to a function that returns the cost and gradient):
function cost_gradient(θ, X, y, λ)
m = length(y);
return (θ::Array) -> begin
h = sigmoid(X * θ); #(m,n+1)*(n+1,1) -> (m,1)
J = (1 / m) * sum(-y .* log(h) .- (1 - y) .* log(1 - h)) + λ / (2 * m) * sum(θ[2:end] .^ 2);
end, (θ::Array, storage::Array) -> begin
h = sigmoid(X * θ); #(m,n+1)*(n+1,1) -> (m,1)
storage[:] = (1 / m) * (X' * (h .- y)) + (λ / m) * [0; θ[2:end]];
end
end
Then, somewhere in the code:
initialθ = zeros(n,1);
f, g! = cost_gradient(initialθ, X, y, λ);
res = optimize(f, g!, initialθ, method = :cg, iterations = your_iterations);
θ = res.minimum;
Related
I have written a surrogate function called GEKPLS and I am trying to make the code work with an optimizer to find the optimal theta parameter values.
As a first step, I'm trying to make it work with Zygote and have the following code:
function min_rlfv(theta)
g = GEKPLS(X, y, grads, n_comp, delta_x, xlimits, extra_points, theta)
return -g.reduced_likelihood_function_value
end
Zygote.gradient(min_rlfv, [0.01, 0.1])
Running the above code results in the following error message:
ERROR: Need an adjoint for constructor LinearAlgebra.QRCompactWYQ{Float64, Matrix{Float64}}. Gradient is of type LinearAlgebra.Transpose{Float64, Matrix{Float64}}
The stacktrace leads to the following function. The highlighted line in my code editor is Q, G = qr(Ft) from the following code block:
function _reduced_likelihood_function(theta, kernel_type, d, nt, ij, y_norma, noise = 0.0)
reduced_likelihood_function_value = -Inf
nugget = 1000000.0 * eps() #a jitter for numerical stability;
if kernel_type == "squar_exp"
r = squar_exp(theta, d)
end
R = (I + zeros(nt, nt)) .* (1.0 + nugget + noise)
for k in 1:size(ij)[1]
R[ij[k, 1], ij[k, 2]] = r[k]
R[ij[k, 2], ij[k, 1]] = r[k]
end
C = cholesky(R).L
F = ones(nt, 1)
Ft = C \ F
Q, G = qr(Ft)
Q = Array(Q)
Yt = C \ y_norma
beta = G \ [(transpose(Q) ⋅ Yt)]
rho = Yt .- (Ft .* beta)
gamma = transpose(C) \ rho
sigma2 = sum((rho) .^ 2, dims = 1) / nt
detR = prod(diag(C) .^ (2.0 / nt))
reduced_likelihood_function_value = -nt * log10(sum(sigma2)) - nt * log10(detR)
return beta, gamma, reduced_likelihood_function_value
end
Any pointers on how this can be fixed?
I would like to minimize a distance function ||dz - z|| under the constraint that g(z) = 0.
I wanted to use Lagrange Multipliers to solve this problem. Then I used NLsolve.jl to solve the non-linear equation that I end up with.
using NLsolve
using ForwardDiff
function ProjLagrange(dz, g::Function)
λ_init = ones(size(g(dz...),1))
initial_x = vcat(dz, λ_init)
function gradL!(F, x)
len_dz = length(dz)
z = x[1:len_dz]
λ = x[len_dz+1:end]
F = Array{Float64}(undef, length(x))
my_distance(z) = norm(dz - z)
∇f = z -> ForwardDiff.gradient(my_distance, z)
F[1:len_dz] = ∇f(z) .- dot(λ, g(z...))
if length(λ) == 1
F[end] = g(z...)
else
F[len_dz+1:end] = g(z)
end
end
nlsolve(gradL!, initial_x)
end
g_test(x1, x2, x3) = x1^2 + x2 - x2 + 5
z = [1000,1,1]
ProjLagrange(z, g_test)
But I always end up with Zero: [NaN, NaN, NaN, NaN] and Convergence: false.
Just so you know I have already solved the equation by using Optim.jl and minimizing the following function: Proj(z) = b * sum(abs.(g(z))) + a * norm(dz - z).
But I would really like to know if this is possible with NLsolve. Any help is greatly appreciated!
Starting almost from scratch and wikipedia's Lagrange multiplier page because it was good for me, the code below seemed to work. I added an λ₀s argument to the ProjLagrange function so that it can accept a vector of initial multiplier λ values (I saw you initialized them at 1.0 but I thought this was more generic). (Note this has not been optimized for performance!)
using NLsolve, ForwardDiff, LinearAlgebra
function ProjLagrange(x₀, λ₀s, gs, n_it)
# distance function from x₀ and its gradients
f(x) = norm(x - x₀)
∇f(x) = ForwardDiff.gradient(f, x)
# gradients of the constraints
∇gs = [x -> ForwardDiff.gradient(g, x) for g in gs]
# Form the auxiliary function and its gradients
ℒ(x,λs) = f(x) - sum(λ * g(x) for (λ,g) in zip(λs,gs))
∂ℒ∂x(x,λs) = ∇f(x) - sum(λ * ∇g(x) for (λ,∇g) in zip(λs,∇gs))
∂ℒ∂λ(x,λs) = [g(x) for g in gs]
# as a function of a single argument
nx = length(x₀)
ℒ(v) = ℒ(v[1:nx], v[nx+1:end])
∇ℒ(v) = vcat(∂ℒ∂x(v[1:nx], v[nx+1:end]), ∂ℒ∂λ(v[1:nx], v[nx+1:end]))
# and solve
v₀ = vcat(x₀, λ₀s)
nlsolve(∇ℒ, v₀, iterations=n_it)
end
# test
gs_test = [x -> x[1]^2 + x[2] - x[3] + 5]
λ₀s_test = [1.0]
x₀_test = [1000.0, 1.0, 1.0]
n_it = 100
res = ProjLagrange(x₀_test, λ₀s_test, gs_test, n_it)
gives me
julia> res = ProjLagrange(x₀_test, λ₀s_test, gs_test, n_it)
Results of Nonlinear Solver Algorithm
* Algorithm: Trust-region with dogleg and autoscaling
* Starting Point: [1000.0, 1.0, 1.0, 1.0]
* Zero: [9.800027199717013, -49.52026655749088, 51.520266557490885, -0.050887973682118504]
* Inf-norm of residuals: 0.000000
* Iterations: 10
* Convergence: true
* |x - x'| < 0.0e+00: false
* |f(x)| < 1.0e-08: true
* Function Calls (f): 11
* Jacobian Calls (df/dx): 11
I altered your code as below (see my comments in there) and got the following output. It doesn't throw NaNs anymore, reduces the objective and converges. Does this differ from your Optim.jl results?
Results of Nonlinear Solver Algorithm
* Algorithm: Trust-region with dogleg and autoscaling
* Starting Point: [1000.0, 1.0, 1.0, 1.0]
* Zero: [9.80003, -49.5203, 51.5203, -0.050888]
* Inf-norm of residuals: 0.000000
* Iterations: 10
* Convergence: true
* |x - x'| < 0.0e+00: false
* |f(x)| < 1.0e-08: true
* Function Calls (f): 11
* Jacobian Calls (df/dx): 11
using NLsolve
using ForwardDiff
using LinearAlgebra: norm, dot
using Plots
function ProjLagrange(dz, g::Function, n_it)
λ_init = ones(size(g(dz),1))
initial_x = vcat(dz, λ_init)
# These definitions can go outside as well
len_dz = length(dz)
my_distance = z -> norm(dz - z)
∇f = z -> ForwardDiff.gradient(my_distance, z)
# In fact, this is probably the most vital difference w.r.t. your proposal.
# We need the gradient of the constraints.
∇g = z -> ForwardDiff.gradient(g, z)
function gradL!(F, x)
z = x[1:len_dz]
λ = x[len_dz+1:end]
# `F` is memory allocated by NLsolve to store the residual of the
# respective call of `gradL!` and hence doesn't need to be allocated
# anew every time (or at all).
F[1:len_dz] = ∇f(z) .- λ .* ∇g(z)
F[len_dz+1:end] .= g(z)
end
return nlsolve(gradL!, initial_x, iterations=n_it, store_trace=true)
end
# Presumable here is something wrong: x2 - x2 is not very likely, also made it
# callable directly with an array argument
g_test = x -> x[1]^2 + x[2] - x[3] + 5
z = [1000,1,1]
n_it = 10000
res = ProjLagrange(z, g_test, n_it)
# Ugly reformatting here
trace = hcat([[state.iteration; state.fnorm; state.stepnorm] for state in res.trace.states]...)
plot(trace[1,:], trace[2,:], label="f(x) inf-norm", xlabel="steps")
Evolution of inf-norm of f(x) over iteration steps
[Edit: Adapted solution to incorporate correct gradient computation for g()]
I am trying to practice fitting with the Lsq-Fit-function in Julia.
The derivative of a Cauchy-distribution with parameters \gamma and x_0.
Following this manual I tried
f(x, x_0, γ) = -2*(x - x_0)*(π * γ^3 * (1 + ((x - x_0)/γ)^2)^2)^(-1)
x_0 = 3350
γ = 50
xarr = range(3000, length = 5000, stop = 4000)
yarr = [f(x, x_0, γ) for x in xarr]
using LsqFit
# p ≡ [x_0, γ]
model(x, p) = -2*(x - p[1])*(π * (p[2])^3 * (1 + ((x - p[1])/p[2])^2)^2)^(-1)
p0 = [3349, 49]
curve_fit(model, xarr, yarr, p0)
param = fit.param
... and it does not work, giving a MethodError: no method matching -(::StepRangeLen[...], leaving me confused.
Can please somebody tell me what I am doing wrong?
There are a few issues with what you've written:
the model function is meant to be called with its first argument (x) being the full vector of independent variables, not just one value. This is where the error you mention comes from:
julia> model(x, p) = -2*(x - p[1])*(π * (p[2])^3 * (1 + ((x - p[1])/p[2])^2)^2)^(-1);
julia> p0 = [3349, 49];
julia> model(xarr, p0);
ERROR: MethodError: no method matching -(::StepRangeLen{Float64,Base.TwicePrecision{Float64},Base.TwicePrecision{Float64}}, ::Float64)
One way to fix this is to use the dot notation to broadcast all operators so that they work elementwise:
julia> model(x, p) = -2*(x .- p[1]) ./ (π * (p[2])^3 * (1 .+ ((x .- p[1])/p[2]).^2).^2);
julia> model(xarr, p0); # => No error
but if this is too tedious you can let the #. macro do the work for you:
# just put #. in front of the expression to transform every
# occurrence of a-b into a.-b (and likewise for all operators)
# which means to compute the operation elementwise
julia> model(x, p) = #. -2*(x - p[1])*(π * (p[2])^3 * (1 + ((x - p[1])/p[2])^2)^2)^(-1);
julia> model(xarr, p0); # => No error
Another issue is that the parameters you're looking for are meant to be floating-point values. But your initial guess p0 is initialized with integers, which confuses curve_fit. There are two ways of fixing this. Either put floating-point values in p0:
julia> p0 = [3349.0, 49.0]
2-element Array{Float64,1}:
3349.0
49.0
or use a typed array initializer to specify explicitly the element type:
julia> p0 = Float64[3349, 49]
2-element Array{Float64,1}:
3349.0
49.0
This is not really an error, but I would find it more intuitive to compute a/b instead of a*b^(-1). Also, yarr can be computed with a simple broadcast using dot notation instead of a comprehension.
Wrapping all this together:
f(x, x_0, γ) = -2*(x - x_0)*(π * γ^3 * (1 + ((x - x_0)/γ)^2)^2)^(-1)
(x_0, γ) = (3350, 50)
xarr = range(3000, length = 5000, stop = 4000);
# use dot-notation to "broadcast" f and map it
# elementwise to elements of xarr
yarr = f.(xarr, x_0, γ);
using LsqFit
model(x, p) = #. -2*(x - p[1]) / (π * (p[2])^3 * (1 + ((x - p[1])/p[2])^2)^2)
p0 = Float64[3300, 10]
fit = curve_fit(model, xarr, yarr, p0)
yields:
julia> fit.param
2-element Array{Float64,1}:
3349.999986535933
49.99999203625603
Say I have prices of a stock and I want to find the slope of the regression line in rolling manner with a given window size. How can I get it done in Julia? I want it to be really fast hence don't want to use a for loop.
You should not, in general, be worried about for loops in Julia, as they do not have the overhead of R or Python for loops. Thus, you only need to worry about asymptotic complexity and not the potentially large constant factor introduced by interpreter overhead.
Nevertheless, this operation can be done much more (asymptotically) efficiently with convolutions than with the naïve O(n²) slice-and-regress approach. The DSP.jl package provides convolution functionality. The following is an example with no intercept (it computes the rolling betas); support for an intercept should be possible by modifying the formulas.
using DSP
# Create some example x (signal) and y (stock prices)
# such that strength of signal goes up over time
const x = randn(100)
const y = (1:100) .* x .+ 100 .* randn(100)
# Create the rolling window
const window = Window.rect(20)
# Compute linear least squares estimate (X^T X)^-1 X^T Y
const xᵗx = conv(x .* x, window)[length(window):end-length(window)+1]
const xᵗy = conv(x .* y, window)[length(window):end-length(window)+1]
const lls = xᵗy ./ xᵗx # desired beta
# Check result against naïve for loop
const βref = [dot(x[i:i+19], y[i:i+19]) / dot(x[i:i+19], x[i:i+19]) for i = 1:81]
#assert isapprox(βref, lls)
Edit to add: To support an intercept, i.e. X = [x 1], so X^T X = [dot(x, x) sum(x); sum(x) w] where w is the window size, the formula for inverse of a 2D matrix can be used to get (X^T X)^-1 = [w -sum(x); -sum(x) dot(x, x)]/(w * dot(x, x) - sum(x)^2). Thus, [β, α] = [w dot(x, y) - sum(x) * sum(y), dot(x, x) * sum(y) - sum(x) * dot(x, y)] / (w * dot(x, x) - sum(x)^2). This can be translated to the following convolution code:
# Compute linear least squares estimate with intercept
const w = length(window)
const xᵗx = conv(x .* x, window)[w:end-w+1]
const xᵗy = conv(x .* y, window)[w:end-w+1]
const 𝟙ᵗx = conv(x, window)[w:end-w+1]
const 𝟙ᵗy = conv(y, window)[w:end-w+1]
const denom = w .* xᵗx - 𝟙ᵗx .^ 2
const α = (xᵗx .* 𝟙ᵗy .- 𝟙ᵗx .* xᵗy) ./ denom
const β = (w .* xᵗy .- 𝟙ᵗx .* 𝟙ᵗy) ./ denom
# Check vs. naive solution
const ref = vcat([([x[i:i+19] ones(20)] \ y[i:i+19])' for i = 1:81]...)
#assert isapprox([β α], ref)
Note that, for weighted least squares with a different window shape, some minor modifications will be needed to disentangle length(window) and sum(window) which are used interchangeably in the code above.
Since I dont need a x variable, I created a numeric series. Using RollingFunctions Package I was able to get rolling regressions through below function.
using RollingFunctions
function rolling_regression(price,windowsize)
sum_x = sum(collect(1:windowsize))
sum_x_squared = sum(collect(1:windowsize).^2)
sum_xy = rolling(sum,price,windowsize,collect(1:windowsize))
sum_y = rolling(sum,price,windowsize)
b = ((windowsize*sum_xy) - (sum_x*sum_y))/(windowsize*sum_x_squared - sum_x^2)
c = [repeat([missing],windowsize-1);b]
end
I've tried to reproduce the model from a PYMC3 and Stan comparison. But it seems to run slowly and when I look at #code_warntype there are some things -- K and N I think -- which the compiler seemingly calls Any.
I've tried adding types -- though I can't add types to turing_model's arguments and things are complicated within turing_model because it's using autodiff variables and not the usuals. I put all the code into the function do_it to avoid globals, because they say that globals can slow things down. (It actually seems slower, though.)
Any suggestions as to what's causing the problem? The turing_model code is what's iterating, so that should make the most difference.
using Turing, StatsPlots, Random
sigmoid(x) = 1.0 / (1.0 + exp(-x))
function scale(w0::Float64, w1::Array{Float64,1})
scale = √(w0^2 + sum(w1 .^ 2))
return w0 / scale, w1 ./ scale
end
function do_it(iterations::Int64)::Chains
K = 10 # predictor dimension
N = 1000 # number of data samples
X = rand(N, K) # predictors (1000, 10)
w1 = rand(K) # weights (10,)
w0 = -median(X * w1) # 50% of elements for each class (number)
w0, w1 = scale(w0, w1) # unit length (euclidean)
w_true = [w0, w1...]
y = (w0 .+ (X * w1)) .> 0.0 # labels
y = [Float64(x) for x in y]
σ = 5.0
σm = [x == y ? σ : 0.0 for x in 1:K, y in 1:K]
#model turing_model(X, y, σ, σm) = begin
w0_pred ~ Normal(0.0, σ)
w1_pred ~ MvNormal(σm)
p = sigmoid.(w0_pred .+ (X * w1_pred))
#inbounds for n in 1:length(y)
y[n] ~ Bernoulli(p[n])
end
end
#time chain = sample(turing_model(X, y, σ, σm), NUTS(iterations, 200, 0.65));
# ϵ = 0.5
# τ = 10
# #time chain = sample(turing_model(X, y, σ), HMC(iterations, ϵ, τ));
return (w_true=w_true, chains=chain::Chains)
end
chain = do_it(1000)