Incorporating Optim.jl optimizers #481

Open · wants to merge 11 commits into base: master
1 change: 1 addition & 0 deletions REQUIRE
@@ -13,3 +13,4 @@ MacroTools 0.3.6
AutoHashEquals 0.1.0
MLDatasets 0.3.0
SpecialFunctions 0.7.0
Optim 0.17.0
4 changes: 2 additions & 2 deletions deps/build.jl
@@ -1,8 +1,8 @@
using PyCall
using Conda

const cur_version = "1.10.0"
const cur_py_version = "1.10.0"
const cur_version = "1.12.0"
const cur_py_version = "1.12.0"


############################
3 changes: 3 additions & 0 deletions deps/default_imports.txt
@@ -166,3 +166,6 @@ Rank
Conv2DBackpropInput
Svd
Cross
FFT
ComplexAbs
MatrixSolve
1 change: 1 addition & 0 deletions src/TensorFlow.jl
@@ -128,6 +128,7 @@ tf_versioninfo


using Distributed
using Optim

const pyproc = Ref(0)

131 changes: 130 additions & 1 deletion src/train.jl
@@ -7,6 +7,7 @@ apply_gradients,
GradientDescentOptimizer,
MomentumOptimizer,
AdamOptimizer,
optim_minimize,
Saver,
save,
restore,
@@ -25,8 +26,10 @@ using Compat
using JLD2
using FileIO
using ProtoBuf
using Optim
import Printf


import ..TensorFlow: Graph, Operation, get_def_graph, extend_graph, gradients, variable_scope, ConstantInitializer, node_name, get_variable, get_shape, get_collection, Session, placeholder, Tensor, Variable, cast, group, @not_implemented, AbstractQueue, tensorflow, add_to_collection, get_proto, get_def, @op

import TensorFlow
@@ -183,6 +186,132 @@ function apply_gradients(optimizer::AdamOptimizer, grads_and_vars; global_step=n
return group(ops...)
end

mutable struct OptimOptimizer
indices::Array{Array{Int64}}   # shape of each trainable variable
segments::Array{Array{Int64}}  # [start, stop] slice of each variable in the flat parameter vector
sess::Session
vars::Array{Tuple{Any,Any},1}  # (gradient, variable) pairs
dtype::Type
feed_dict::Dict
end

function OptimOptimizer(dtype::Type, loss::Tensor, sess::Session, feed_dict::Dict=Dict())
var_list = get_def_graph().collections[:TrainableVariables]
vars = zip(gradients(loss, var_list), var_list) |> collect  # (gradient, variable) pairs
filter!(x->x[1]!==nothing, vars)  # drop variables with no gradient w.r.t. the loss

indices = Array{Int64}[]
segments = Array{Int64}[]
idx = 1
for i = 1:length(vars)
W = run(sess, vars[i][2], feed_dict)
push!(indices, [ i for i in size(W)])
push!(segments, [idx; idx+length(W)-1])
idx += length(W)
end
OptimOptimizer(indices, segments, sess, vars, dtype, feed_dict)
end

function update_values(opt::OptimOptimizer, x)
for i = 1:length(opt.indices)
x0 = reshape(x[opt.segments[i][1]:opt.segments[i][2]], opt.indices[i]...)
run(opt.sess, tf.assign(opt.vars[i][2], x0))
end
end

function compute_grads(opt::OptimOptimizer)
grads = zeros(opt.dtype, opt.segments[end][2])
for i = 1:length(opt.indices)
grads[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][1], opt.feed_dict)
end
return grads
end

function compute_init(opt::OptimOptimizer)
x0 = zeros(opt.dtype, opt.segments[end][2])
for i = 1:length(opt.indices)
x0[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][2], opt.feed_dict)
end
return x0
end

"""
optim_minimize(sess::Session, loss::Tensor;
dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing)

`optim_minimize` calls first-order optimization solvers from the Optim.jl package (https://github.com/JuliaNLSolvers/Optim.jl).
`sess`: the current session
`loss`: the loss tensor to minimize
`dtype`: the computation value type (default `Float64`)
`feed_dict`: a dictionary for placeholders
`method`: four methods are supported: `LBFGS` (default), `BFGS`, `AGD` (AcceleratedGradientDescent), `CG` (ConjugateGradient)
`options`: an `Optim.Options` instance; see the Optim.jl documentation for details

Example
=======
```
function mycallback(handle)
res = run(sess, Loss, Dict(X=>x, Y_obs=>y))
println("iter \$(handle.iteration): \$(res)")
return false # so it does not stop
end

options = Optim.Options(show_trace = false, iterations=1000, callback = mycallback, allow_f_increases=true)
optim_minimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method="AGD")
```

Note
=======

Note that this optimizer is not built as part of the graph. Rather, it constructs a loss function and a gradient function that
call `run(sess, ...)` on every iteration. This approach has two drawbacks: (1) stochastic gradient descent is not easy to
implement; (2) there is some per-iteration overhead. In return, it allows calling the solvers from Optim.jl directly and leveraging their
robustness and fine-grained parameter control options.
"""
function optim_minimize(sess::Session, loss::Tensor;
dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options::Union{Nothing, Optim.Options}=nothing)
opt = OptimOptimizer(dtype, loss, sess, feed_dict)
function f(x)
update_values(opt, x)
res = run(sess, loss, feed_dict)
return res
end

function g!(G, x)
update_values(opt, x)
G[:] = compute_grads(opt)
end

x0 = compute_init(opt)

optimizer = nothing
if method=="LBFGS"
optimizer = LBFGS()
elseif method=="BFGS"
optimizer = BFGS()
elseif method=="AGD"
optimizer = AcceleratedGradientDescent()
elseif method=="CG"
optimizer = ConjugateGradient()
else
@error """
Available optimizers:
* LBFGS
* BFGS
* AGD (AcceleratedGradientDescent)
* CG (ConjugateGradient)
"""

end
if options===nothing
return optimize(f, g!, x0, optimizer)
else
return optimize(f, g!, x0, optimizer, options)
end
end



mutable struct Saver
var_list
max_to_keep
@@ -417,4 +546,4 @@ function SummaryWriter(args...; kwargs...)
TensorFlow.summary.FileWriter(args...; kwargs...)
end

end
end
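
For reviewers: a condensed, standalone sketch of the flattening scheme used by `OptimOptimizer` above. Each trainable variable gets a contiguous `[start, stop]` slice (`segments`) of one flat parameter vector, and its shape is recorded in `indices` so the slice can be reshaped back when the variables are updated. The names and shapes below are illustrative only and are not part of the diff.

```
# Illustrative only: lay out two variables with shapes (2, 3) and (3,)
# in a single flat parameter vector, the way OptimOptimizer does.
function pack_layout(shapes)
    indices  = Array{Int64}[]   # recorded shapes, e.g. [2, 3] and [3]
    segments = Array{Int64}[]   # [start, stop] slice of each variable
    idx = 1
    for s in shapes
        n = prod(s)
        push!(indices, collect(s))
        push!(segments, [idx, idx + n - 1])
        idx += n
    end
    return indices, segments
end

indices, segments = pack_layout([(2, 3), (3,)])
# segments == [[1, 6], [7, 9]]: a flat vector of length 9 holds both variables.

# Unpacking mirrors update_values: reshape each slice back to its recorded shape.
x = collect(1.0:9.0)
W = reshape(x[segments[1][1]:segments[1][2]], indices[1]...)  # 2×3 matrix
b = reshape(x[segments[2][1]:segments[2][2]], indices[2]...)  # length-3 vector
```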
53 changes: 53 additions & 0 deletions test/train.jl
@@ -1,5 +1,6 @@
using TensorFlow
using Test
using Optim

@testset "save and resore" begin
try
@@ -78,3 +79,55 @@ end
end
end
end


@testset "optimizers" begin
using Distributions
# Generate some synthetic data
x = randn(100, 50)
w = randn(50, 10)
y_prob = exp.(x*w)
y_prob ./= sum(y_prob,dims=2)

function draw(probs)
y = zeros(size(probs))
for i in 1:size(probs, 1)
idx = rand(Categorical(probs[i, :]))
y[i, idx] = 1
end
return y
end

y = draw(y_prob)

# Build the model
sess = Session(Graph())

X = placeholder(Float64, shape=[-1, 50])
Y_obs = placeholder(Float64, shape=[-1, 10])

variable_scope("logisitic_model"; initializer=Normal(0, .001)) do
global W = get_variable("W", [50, 10], Float64)
global B = get_variable("B", [10], Float64)
end

Y=nn.softmax(X*W + B)
Loss = -reduce_sum(log(Y).*Y_obs)

function mycallback(handle)
res = run(sess, Loss, Dict(X=>x, Y_obs=>y))
println("iter $(handle.iteration): $(res)")
if isnan(res) || isinf(res)
return true
else
return false # so it does not stop
end
end

for m in ["AGD", "CG", "BFGS", "LBFGS"]
run(sess, global_variables_initializer())
options = Optim.Options(show_trace = false, iterations=10, callback = mycallback, allow_f_increases=true)
train.optim_minimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method=m)
end

end
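
As a usage note: `optim_minimize` returns the result object from Optim.jl's `optimize`, so the standard accessors apply. A minimal call with the defaults (LBFGS, no `Optim.Options`), assuming `sess`, `Loss`, `X`, `Y_obs`, `x`, and `y` are set up as in the test above:

```
result = train.optim_minimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y))
Optim.minimizer(result)  # flattened optimal parameter vector (W and B packed together)
Optim.minimum(result)    # final value of the loss
```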