From 542452ca63afb4eb6425e74caf30d1ecc0984fb8 Mon Sep 17 00:00:00 2001 From: kailaix Date: Thu, 13 Dec 2018 14:11:34 -0800 Subject: [PATCH 01/10] Update build.jl --- deps/build.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/build.jl b/deps/build.jl index 1a104aed..caabe43d 100644 --- a/deps/build.jl +++ b/deps/build.jl @@ -1,8 +1,8 @@ using PyCall using Conda -const cur_version = "1.10.0" -const cur_py_version = "1.10.0" +const cur_version = "1.12.0" +const cur_py_version = "1.12.0" ############################ From 90f94c2c6a703d78bd40f61bc9fbe9abac20b95e Mon Sep 17 00:00:00 2001 From: kailaix Date: Thu, 13 Dec 2018 23:57:24 -0800 Subject: [PATCH 02/10] Update default_imports.txt --- deps/default_imports.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deps/default_imports.txt b/deps/default_imports.txt index 9368e056..839cf659 100644 --- a/deps/default_imports.txt +++ b/deps/default_imports.txt @@ -166,3 +166,6 @@ Rank Conv2DBackpropInput Svd Cross +FFT +ComplexAbs +MatrixSolve From cd5c9d94aa1cf90bc1744169485a7749e0fc3fe9 Mon Sep 17 00:00:00 2001 From: kailaix Date: Sun, 16 Dec 2018 22:05:54 -0800 Subject: [PATCH 03/10] transpose test passed --- src/ops/transformations.jl | 6 +- src/train.jl | 121 ++++++++++++++++++++++++++++++++++++- test/transformations.jl | 8 +++ 3 files changed, 131 insertions(+), 4 deletions(-) diff --git a/src/ops/transformations.jl b/src/ops/transformations.jl index 2d17705c..9b9369fa 100644 --- a/src/ops/transformations.jl +++ b/src/ops/transformations.jl @@ -380,16 +380,16 @@ Returns: local result with_op_name(name, "Transpose") do if perm === nothing - r = range(constant(0), LinearAlgebra.rank(n)-1) + r = range(constant(1), LinearAlgebra.rank(n)) perm = reverse(r, [true]) end - result = Ops.transpose(n, perm) + result = Ops.transpose(n, perm .- 1) end result end @op function Base.permutedims(n::AbstractTensor, perm; name=nothing) - transpose(n, perm .- 1; name=name) + transpose(n, perm; name=name) end @define_unary Base.adjoint transpose diff --git a/src/train.jl b/src/train.jl index 0514ce99..b3df2f55 100644 --- a/src/train.jl +++ b/src/train.jl @@ -7,6 +7,8 @@ apply_gradients, GradientDescentOptimizer, MomentumOptimizer, AdamOptimizer, +NadamOptimizer, +AMSGradOptimizer, Saver, save, restore, @@ -183,6 +185,123 @@ function apply_gradients(optimizer::AdamOptimizer, grads_and_vars; global_step=n return group(ops...) end +mutable struct NadamOptimizer <: Optimizer + η::Float64 + β1::Float64 + β2::Float64 + ϵ::Float64 + name::String +end + +NadamOptimizer(learning_rate; β1=.9, β2=.999, ϵ=1e-8, name="nadam") = NadamOptimizer(learning_rate, β1, β2, ϵ, name) + +function NadamOptimizer(; η=.001, kwargs...) + NadamOptimizer(η; kwargs...) 
+end + +function Base.show(io::IO, optim::NadamOptimizer) + print(io, "NadamOptimizer(η=$(optim.η), β1=$(optim.β1), β2=$(optim.β2), ϵ=$(optim.ϵ))") +end + +function apply_gradients(optimizer::NadamOptimizer, grads_and_vars; global_step=nothing, name="nadam") + ops = Tensor[] + @advance_step + for (grad, var) in grads_and_vars + local m, v, T + variable_scope(name) do + variable_scope(node_name(var)[1]) do + m = get_variable("m", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) + v = get_variable("v", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) + T = get_variable("t", [], Float32, initializer=ConstantInitializer(1.0), trainable=false) + end + end + β1 = eltype(var)(optimizer.β1) + β2 = eltype(var)(optimizer.β2) + ϵ = eltype(var)(optimizer.ϵ) + η = eltype(var)(optimizer.η) + t = convert(Tensor{eltype(var)}, T) + push!(ops, tf.assign(T, T+1)) + lr = η*sqrt(1-β2^t)/(1-β1^t) + if isa(grad, tf.IndexedSlices) + m_slice = tf.gather(m, grad.indices) + v_slice = tf.gather(v, grad.indices) + m_new = β1 .* m_slice + (1-β1) .* grad.values + v_new = (1-β2) .* (grad.values .^ 2) + push!(ops, tf.scatter_sub(var.var_node, grad.indices, lr/(sqrt(v_new)+ϵ) .* (β1 .* m_new + (1-β1) .* grad.values))) + push!(ops, tf.scatter_update(m.var_node, grad.indices, m_new)) + push!(ops, tf.scatter_update(v.var_node, grad.indices, v_new)) + else + m_new = β1 .* m + (1-β1).*grad + v_new = β2 .* v + (1-β2).*(grad.*grad) + push!(ops, tf.assign_sub(var, lr/(sqrt(v_new)+ϵ) .* (β1 .* m_new + (1-β1) .* grad.values))) + push!(ops, tf.assign(m, m_new)) + push!(ops, tf.assign(v, v_new)) + end + end + return group(ops...) +end + +mutable struct AMSGradOptimizer <: Optimizer + η::Float64 + β1::Float64 + β2::Float64 + ϵ::Float64 + name::String +end + +AMSGradOptimizer(learning_rate; β1=.9, β2=.999, ϵ=1e-8, name="AMSGrad") = AMSGradOptimizer(learning_rate, β1, β2, ϵ, name) + +function AMSGradOptimizer(; η=.001, kwargs...) + AMSGradOptimizer(η; kwargs...) 
+end + +function Base.show(io::IO, optim::AMSGradOptimizer) + print(io, "AMSGradOptimizer(η=$(optim.η), β1=$(optim.β1), β2=$(optim.β2), ϵ=$(optim.ϵ))") +end + +function apply_gradients(optimizer::AMSGradOptimizer, grads_and_vars; global_step=nothing, name="AMSGrad") + ops = Tensor[] + @advance_step + for (grad, var) in grads_and_vars + local m, v, T + variable_scope(name) do + variable_scope(node_name(var)[1]) do + m = get_variable("m", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) + v = get_variable("v", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) + v_hat = get_variable("v_hat", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) + T = get_variable("t", [], Float32, initializer=ConstantInitializer(1.0), trainable=false) + end + end + β1 = eltype(var)(optimizer.β1) + β2 = eltype(var)(optimizer.β2) + ϵ = eltype(var)(optimizer.ϵ) + η = eltype(var)(optimizer.η) + t = convert(Tensor{eltype(var)}, T) + push!(ops, tf.assign(T, T+1)) + if isa(grad, tf.IndexedSlices) + m_slice = tf.gather(m, grad.indices) + v_slice = tf.gather(v, grad.indices) + m_new = β1 .* m_slice + (1-β1) .* grad.values + v_new = β2 .* v_slice + (1-β2) .* (grad.values .^ 2) + v_hat = max(v_hat, v_new) + push!(ops, tf.scatter_sub(var.var_node, grad.indices, η/(sqrt(v_hat)+ϵ) .* m_new)) + push!(ops, tf.scatter_update(m.var_node, grad.indices, m_new)) + push!(ops, tf.scatter_update(v.var_node, grad.indices, v_new)) + push!(ops, tf.scatter_update(v_hat.var_node, grad.indices, v_hat)) + else + m_new = β1 .* m + (1-β1).*grad + v_new = β2 .* v + (1-β2).*(grad.*grad) + v_hat = max(v_hat, v_new) + push!(ops, tf.assign_sub(var, η/(sqrt(v_hat)+ϵ) .* m_new)) + push!(ops, tf.assign(m, m_new)) + push!(ops, tf.assign(v, v_new)) + push!(ops, tf.assign(v_hat, v_hat)) + end + end + return group(ops...) +end + + mutable struct Saver var_list max_to_keep @@ -417,4 +536,4 @@ function SummaryWriter(args...; kwargs...) TensorFlow.summary.FileWriter(args...; kwargs...) 
end -end +end \ No newline at end of file diff --git a/test/transformations.jl b/test/transformations.jl index 1d9658dc..4c0e9980 100644 --- a/test/transformations.jl +++ b/test/transformations.jl @@ -33,6 +33,14 @@ end @testset "Permute Dims" begin @test ones(Float32, 4,3) == run(sess, transpose(ones(Tensor, (3, 4)))) @test ones(Float32, 4,3,2) == run(sess, permutedims(ones(Tensor, (4, 2, 3)), [1, 3, 2])) + + A = rand(Float32, 5, 5, 5) + B = permutedims(A, [3,2,1]) + c = TensorFlow.constant(A) + d = transpose(c, [3,2,1]) + result = run(sess, d) + @test maximum(abs.(result-B))≈0.0 + end From fbcc0da874414118c10d489e5b148ef92ae98e9f Mon Sep 17 00:00:00 2001 From: kailaix Date: Mon, 17 Dec 2018 00:41:02 -0800 Subject: [PATCH 04/10] update version --- REQUIRE | 1 + src/train.jl | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++ test/train.jl | 72 ++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+) diff --git a/REQUIRE b/REQUIRE index 080e1af8..79a4e614 100644 --- a/REQUIRE +++ b/REQUIRE @@ -13,3 +13,4 @@ MacroTools 0.3.6 AutoHashEquals 0.1.0 MLDatasets 0.3.0 SpecialFunctions 0.7.0 +Optim 0.17.0 diff --git a/src/train.jl b/src/train.jl index b3df2f55..a246c071 100644 --- a/src/train.jl +++ b/src/train.jl @@ -27,8 +27,10 @@ using Compat using JLD2 using FileIO using ProtoBuf +using Optim import Printf + import ..TensorFlow: Graph, Operation, get_def_graph, extend_graph, gradients, variable_scope, ConstantInitializer, node_name, get_variable, get_shape, get_collection, Session, placeholder, Tensor, Variable, cast, group, @not_implemented, AbstractQueue, tensorflow, add_to_collection, get_proto, get_def, @op import TensorFlow @@ -302,6 +304,133 @@ function apply_gradients(optimizer::AMSGradOptimizer, grads_and_vars; global_ste end +abstract type OptimOptimizer end + +mutable struct LBFGSOptimizer <: OptimOptimizer + indices::Array{Array{Int64}} + segments::Array{Array{Int64}} + sess::Session + vars::Array{Tuple{Any,Any},1} + dtype::Type + feed_dict::Dict +end + +function LBFGSOptimizer(dtype::Type, sess::Session, feed_dict::Dict=Dict()) + var_list = get_def_graph().collections[:TrainableVariables] + vars = zip(gradients(Loss, var_list), var_list) |> collect + filter!(x->x[1]!==nothing, vars) + + indices = Array{Int64}[] + segments = Array{Int64}[] + idx = 1 + for i = 1:length(vars) + W = run(sess, vars[i][2], feed_dict) + push!(indices, [ i for i in size(W)]) + push!(segments, [idx; idx+length(W)-1]) + idx += length(W) + end + LBFGSOptimizer(indices, segments, sess, vars, dtype, feed_dict) +end + +function update_values(opt::LBFGSOptimizer, x) + for i = 1:length(opt.indices) + x0 = reshape(x[opt.segments[i][1]:opt.segments[i][2]], opt.indices[i]...) + run(opt.sess, assign(opt.vars[i][2], x0)) + end +end + +function compute_grads(opt::LBFGSOptimizer) + grads = zeros(opt.dtype, opt.segments[end][2]) + for i = 1:length(opt.indices) + grads[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][1], opt.feed_dict) + end + return grads +end + +function compute_init(opt::LBFGSOptimizer) + x0 = zeros(opt.dtype, opt.segments[end][2]) + for i = 1:length(opt.indices) + x0[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][2], opt.feed_dict) + end + return x0 +end + +""" +OptimMinimize(sess::Session, loss::AbstractTensor; +dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing) + +`OptimMinimize` calls first order optimization solvers from Optim.jl package (https://github.com/JuliaNLSolvers/Optim.jl). 
+`sess`: current session +`loss`: the loss function to minimize +`dtype`: the computation value type (default Float64) +`feed_dict`: a dictionary for placeholders +`method`: four methods are supported: `LBFGS`(default), `BFGS`, `AGD`(AcceleratedGradientDescent), `CG` +`options`: An Optim.Options instance. See `Optim.jl` documents for details + +Example +======= +``` +function mycallback(handle) + res = run(sess, Loss, Dict(X=>x, Y_obs=>y)) + println("iter \$(handle.iteration): \$(res)") + return false # so it do not stop +end + +options = Optim.Options(show_trace = false, iterations=1000, callback = mycallback, allow_f_increases=true) +OptimMinimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method="AGD") +``` + +Note +======= + +Note that this optimizer is not built as part of the graph. Rather, it contructs a function and a gradient function using +`run(sess, ...)` for every iteration. There is drawback for this approach: (1) stochastic gradient descent is not easy to +implement; (2) there is some overhead. However, it would be nice to call the solvers from Optim.jl directly and leverage the +robustness and ffine granite parameter control options. +""" +function OptimMinimize(sess::Session, loss::Tensor; + dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing) + opt = LBFGSOptimizer(dtype, sess, feed_dict) + function f(x) + update_values(opt, x) + res = run(sess, loss, feed_dict) + return res + end + + function g!(G, x) + update_values(opt, x) + G[:] = compute_grads(opt) + end + + x0 = compute_init(opt) + + optimizer = nothing + if method=="LBFGS" + optimizer = LBFGS() + elseif method=="BFGS" + optimizer = BFGS() + elseif method=="AGD" + optimizer = AcceleratedGradientDescent() + elseif method=="CG" + optimizer = ConjugateGradient() + else + @error """ +Available Optimier: +* LBFGS +* BFGS +* AGD +""" + + end + if options===nothing + return optimize(f, g!, x0, optimizer) + else + return optimize(f, g!, x0, optimizer, options) + end +end + + + mutable struct Saver var_list max_to_keep diff --git a/test/train.jl b/test/train.jl index 7a52bf50..f266e382 100644 --- a/test/train.jl +++ b/test/train.jl @@ -78,3 +78,75 @@ end end end end + + +@test "optimizers" begin + using Distributions + # Generate some synthetic data + x = randn(100, 50) + w = randn(50, 10) + y_prob = exp.(x*w) + y_prob ./= sum(y_prob,dims=2) + + function draw(probs) + y = zeros(size(probs)) + for i in 1:size(probs, 1) + idx = rand(Categorical(probs[i, :])) + y[i, idx] = 1 + end + return y + end + + y = draw(y_prob) + + # Build the model + sess = Session(Graph()) + + X = placeholder(Float64, shape=[-1, 50]) + Y_obs = placeholder(Float64, shape=[-1, 10]) + + variable_scope("logisitic_model"; initializer=Normal(0, .001)) do + global W = get_variable("W", [50, 10], Float64) + global B = get_variable("B", [10], Float64) + end + + Y=nn.softmax(X*W + B) + + + + + Loss = -reduce_sum(log(Y).*Y_obs) + + ### NadamOptimizer + optimizer = train.NadamOptimizer() + minimize_op = train.minimize(optimizer, Loss) + # Run training + run(sess, global_variables_initializer()) + for epoch in 1:100 + cur_loss, _ = run(sess, [Loss, minimize_op], Dict(X=>x, Y_obs=>y)) + println(@sprintf("[NadamOptimizer]Current loss is %.2f.", cur_loss)) + end + + ### AMSGradOptimizer + optimizer = train.AMSGradOptimizer() + minimize_op = train.minimize(optimizer, Loss) + # Run training + run(sess, global_variables_initializer()) + for epoch in 1:100 + cur_loss, _ = run(sess, [Loss, minimize_op], Dict(X=>x, 
Y_obs=>y)) + println(@sprintf("[AMSGradOptimizer]Current loss is %.2f.", cur_loss)) + end + + function mycallback(handle) + res = run(sess, Loss, Dict(X=>x, Y_obs=>y)) + println("[$m]iter \$(handle.iteration): \$(res)") + return false # so it do not stop + end + + for m in ["AGD", "CG", "BFGS", "LBFGS"] + run(sess, global_variables_initializer()) + options = Optim.Options(show_trace = false, iterations=1000, callback = mycallback, allow_f_increases=true) + OptimMinimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method=m) + end + +end \ No newline at end of file From ef320c4da2c6a3eb543f73594f09425966b63478 Mon Sep 17 00:00:00 2001 From: kailaix Date: Mon, 17 Dec 2018 01:14:49 -0800 Subject: [PATCH 05/10] training test passed --- src/TensorFlow.jl | 1 + src/train.jl | 119 ++-------------------------------------------- test/train.jl | 39 ++++----------- 3 files changed, 15 insertions(+), 144 deletions(-) diff --git a/src/TensorFlow.jl b/src/TensorFlow.jl index 69b76c45..228c93ac 100644 --- a/src/TensorFlow.jl +++ b/src/TensorFlow.jl @@ -128,6 +128,7 @@ tf_versioninfo using Distributed +using Optim const pyproc = Ref(0) diff --git a/src/train.jl b/src/train.jl index a246c071..957218d2 100644 --- a/src/train.jl +++ b/src/train.jl @@ -7,8 +7,6 @@ apply_gradients, GradientDescentOptimizer, MomentumOptimizer, AdamOptimizer, -NadamOptimizer, -AMSGradOptimizer, Saver, save, restore, @@ -195,115 +193,6 @@ mutable struct NadamOptimizer <: Optimizer name::String end -NadamOptimizer(learning_rate; β1=.9, β2=.999, ϵ=1e-8, name="nadam") = NadamOptimizer(learning_rate, β1, β2, ϵ, name) - -function NadamOptimizer(; η=.001, kwargs...) - NadamOptimizer(η; kwargs...) -end - -function Base.show(io::IO, optim::NadamOptimizer) - print(io, "NadamOptimizer(η=$(optim.η), β1=$(optim.β1), β2=$(optim.β2), ϵ=$(optim.ϵ))") -end - -function apply_gradients(optimizer::NadamOptimizer, grads_and_vars; global_step=nothing, name="nadam") - ops = Tensor[] - @advance_step - for (grad, var) in grads_and_vars - local m, v, T - variable_scope(name) do - variable_scope(node_name(var)[1]) do - m = get_variable("m", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) - v = get_variable("v", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) - T = get_variable("t", [], Float32, initializer=ConstantInitializer(1.0), trainable=false) - end - end - β1 = eltype(var)(optimizer.β1) - β2 = eltype(var)(optimizer.β2) - ϵ = eltype(var)(optimizer.ϵ) - η = eltype(var)(optimizer.η) - t = convert(Tensor{eltype(var)}, T) - push!(ops, tf.assign(T, T+1)) - lr = η*sqrt(1-β2^t)/(1-β1^t) - if isa(grad, tf.IndexedSlices) - m_slice = tf.gather(m, grad.indices) - v_slice = tf.gather(v, grad.indices) - m_new = β1 .* m_slice + (1-β1) .* grad.values - v_new = (1-β2) .* (grad.values .^ 2) - push!(ops, tf.scatter_sub(var.var_node, grad.indices, lr/(sqrt(v_new)+ϵ) .* (β1 .* m_new + (1-β1) .* grad.values))) - push!(ops, tf.scatter_update(m.var_node, grad.indices, m_new)) - push!(ops, tf.scatter_update(v.var_node, grad.indices, v_new)) - else - m_new = β1 .* m + (1-β1).*grad - v_new = β2 .* v + (1-β2).*(grad.*grad) - push!(ops, tf.assign_sub(var, lr/(sqrt(v_new)+ϵ) .* (β1 .* m_new + (1-β1) .* grad.values))) - push!(ops, tf.assign(m, m_new)) - push!(ops, tf.assign(v, v_new)) - end - end - return group(ops...) 
-end - -mutable struct AMSGradOptimizer <: Optimizer - η::Float64 - β1::Float64 - β2::Float64 - ϵ::Float64 - name::String -end - -AMSGradOptimizer(learning_rate; β1=.9, β2=.999, ϵ=1e-8, name="AMSGrad") = AMSGradOptimizer(learning_rate, β1, β2, ϵ, name) - -function AMSGradOptimizer(; η=.001, kwargs...) - AMSGradOptimizer(η; kwargs...) -end - -function Base.show(io::IO, optim::AMSGradOptimizer) - print(io, "AMSGradOptimizer(η=$(optim.η), β1=$(optim.β1), β2=$(optim.β2), ϵ=$(optim.ϵ))") -end - -function apply_gradients(optimizer::AMSGradOptimizer, grads_and_vars; global_step=nothing, name="AMSGrad") - ops = Tensor[] - @advance_step - for (grad, var) in grads_and_vars - local m, v, T - variable_scope(name) do - variable_scope(node_name(var)[1]) do - m = get_variable("m", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) - v = get_variable("v", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) - v_hat = get_variable("v_hat", get_shape(var), eltype(var), initializer=ConstantInitializer(0.0), trainable=false) - T = get_variable("t", [], Float32, initializer=ConstantInitializer(1.0), trainable=false) - end - end - β1 = eltype(var)(optimizer.β1) - β2 = eltype(var)(optimizer.β2) - ϵ = eltype(var)(optimizer.ϵ) - η = eltype(var)(optimizer.η) - t = convert(Tensor{eltype(var)}, T) - push!(ops, tf.assign(T, T+1)) - if isa(grad, tf.IndexedSlices) - m_slice = tf.gather(m, grad.indices) - v_slice = tf.gather(v, grad.indices) - m_new = β1 .* m_slice + (1-β1) .* grad.values - v_new = β2 .* v_slice + (1-β2) .* (grad.values .^ 2) - v_hat = max(v_hat, v_new) - push!(ops, tf.scatter_sub(var.var_node, grad.indices, η/(sqrt(v_hat)+ϵ) .* m_new)) - push!(ops, tf.scatter_update(m.var_node, grad.indices, m_new)) - push!(ops, tf.scatter_update(v.var_node, grad.indices, v_new)) - push!(ops, tf.scatter_update(v_hat.var_node, grad.indices, v_hat)) - else - m_new = β1 .* m + (1-β1).*grad - v_new = β2 .* v + (1-β2).*(grad.*grad) - v_hat = max(v_hat, v_new) - push!(ops, tf.assign_sub(var, η/(sqrt(v_hat)+ϵ) .* m_new)) - push!(ops, tf.assign(m, m_new)) - push!(ops, tf.assign(v, v_new)) - push!(ops, tf.assign(v_hat, v_hat)) - end - end - return group(ops...) -end - - abstract type OptimOptimizer end mutable struct LBFGSOptimizer <: OptimOptimizer @@ -315,9 +204,9 @@ mutable struct LBFGSOptimizer <: OptimOptimizer feed_dict::Dict end -function LBFGSOptimizer(dtype::Type, sess::Session, feed_dict::Dict=Dict()) +function LBFGSOptimizer(dtype::Type, loss::Tensor, sess::Session, feed_dict::Dict=Dict()) var_list = get_def_graph().collections[:TrainableVariables] - vars = zip(gradients(Loss, var_list), var_list) |> collect + vars = zip(gradients(loss, var_list), var_list) |> collect filter!(x->x[1]!==nothing, vars) indices = Array{Int64}[] @@ -335,7 +224,7 @@ end function update_values(opt::LBFGSOptimizer, x) for i = 1:length(opt.indices) x0 = reshape(x[opt.segments[i][1]:opt.segments[i][2]], opt.indices[i]...) - run(opt.sess, assign(opt.vars[i][2], x0)) + run(opt.sess, tf.assign(opt.vars[i][2], x0)) end end @@ -390,7 +279,7 @@ robustness and ffine granite parameter control options. 
""" function OptimMinimize(sess::Session, loss::Tensor; dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing) - opt = LBFGSOptimizer(dtype, sess, feed_dict) + opt = LBFGSOptimizer(dtype, loss, sess, feed_dict) function f(x) update_values(opt, x) res = run(sess, loss, feed_dict) diff --git a/test/train.jl b/test/train.jl index f266e382..c7ff3a2c 100644 --- a/test/train.jl +++ b/test/train.jl @@ -1,5 +1,6 @@ using TensorFlow using Test +using Optim @testset "save and resore" begin try @@ -80,7 +81,7 @@ end end -@test "optimizers" begin +@testset "optimizers" begin using Distributions # Generate some synthetic data x = randn(100, 50) @@ -111,42 +112,22 @@ end end Y=nn.softmax(X*W + B) - - - - Loss = -reduce_sum(log(Y).*Y_obs) - - ### NadamOptimizer - optimizer = train.NadamOptimizer() - minimize_op = train.minimize(optimizer, Loss) - # Run training - run(sess, global_variables_initializer()) - for epoch in 1:100 - cur_loss, _ = run(sess, [Loss, minimize_op], Dict(X=>x, Y_obs=>y)) - println(@sprintf("[NadamOptimizer]Current loss is %.2f.", cur_loss)) - end - - ### AMSGradOptimizer - optimizer = train.AMSGradOptimizer() - minimize_op = train.minimize(optimizer, Loss) - # Run training - run(sess, global_variables_initializer()) - for epoch in 1:100 - cur_loss, _ = run(sess, [Loss, minimize_op], Dict(X=>x, Y_obs=>y)) - println(@sprintf("[AMSGradOptimizer]Current loss is %.2f.", cur_loss)) - end function mycallback(handle) res = run(sess, Loss, Dict(X=>x, Y_obs=>y)) - println("[$m]iter \$(handle.iteration): \$(res)") - return false # so it do not stop + println("iter $(handle.iteration): $(res)") + if isnan(res) || isinf(res) + return true + else + return false # so it do not stop + end end for m in ["AGD", "CG", "BFGS", "LBFGS"] run(sess, global_variables_initializer()) - options = Optim.Options(show_trace = false, iterations=1000, callback = mycallback, allow_f_increases=true) - OptimMinimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method=m) + options = Optim.Options(show_trace = false, iterations=50, callback = mycallback, allow_f_increases=true) + train.OptimMinimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method=m) end end \ No newline at end of file From 3f6c324c950940f5ba657d68b939bbc678a6875f Mon Sep 17 00:00:00 2001 From: kailaix Date: Mon, 17 Dec 2018 01:21:40 -0800 Subject: [PATCH 06/10] optim minimize --- src/train.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/train.jl b/src/train.jl index 957218d2..0f2d8ba7 100644 --- a/src/train.jl +++ b/src/train.jl @@ -7,6 +7,7 @@ apply_gradients, GradientDescentOptimizer, MomentumOptimizer, AdamOptimizer, +OptimMinimize, Saver, save, restore, From 75809e5ed59d9b0d7e5c537bd7c22bbd4048f408 Mon Sep 17 00:00:00 2001 From: kailaix Date: Mon, 17 Dec 2018 01:32:30 -0800 Subject: [PATCH 07/10] fix --- src/train.jl | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/train.jl b/src/train.jl index 0f2d8ba7..cabd695a 100644 --- a/src/train.jl +++ b/src/train.jl @@ -186,17 +186,7 @@ function apply_gradients(optimizer::AdamOptimizer, grads_and_vars; global_step=n return group(ops...) 
end -mutable struct NadamOptimizer <: Optimizer - η::Float64 - β1::Float64 - β2::Float64 - ϵ::Float64 - name::String -end - -abstract type OptimOptimizer end - -mutable struct LBFGSOptimizer <: OptimOptimizer +mutable struct OptimOptimizer indices::Array{Array{Int64}} segments::Array{Array{Int64}} sess::Session @@ -205,7 +195,7 @@ mutable struct LBFGSOptimizer <: OptimOptimizer feed_dict::Dict end -function LBFGSOptimizer(dtype::Type, loss::Tensor, sess::Session, feed_dict::Dict=Dict()) +function OptimOptimizer(dtype::Type, loss::Tensor, sess::Session, feed_dict::Dict=Dict()) var_list = get_def_graph().collections[:TrainableVariables] vars = zip(gradients(loss, var_list), var_list) |> collect filter!(x->x[1]!==nothing, vars) @@ -219,17 +209,17 @@ function LBFGSOptimizer(dtype::Type, loss::Tensor, sess::Session, feed_dict::Dic push!(segments, [idx; idx+length(W)-1]) idx += length(W) end - LBFGSOptimizer(indices, segments, sess, vars, dtype, feed_dict) + OptimOptimizer(indices, segments, sess, vars, dtype, feed_dict) end -function update_values(opt::LBFGSOptimizer, x) +function update_values(opt::OptimOptimizer, x) for i = 1:length(opt.indices) x0 = reshape(x[opt.segments[i][1]:opt.segments[i][2]], opt.indices[i]...) run(opt.sess, tf.assign(opt.vars[i][2], x0)) end end -function compute_grads(opt::LBFGSOptimizer) +function compute_grads(opt::OptimOptimizer) grads = zeros(opt.dtype, opt.segments[end][2]) for i = 1:length(opt.indices) grads[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][1], opt.feed_dict) @@ -237,7 +227,7 @@ function compute_grads(opt::LBFGSOptimizer) return grads end -function compute_init(opt::LBFGSOptimizer) +function compute_init(opt::OptimOptimizer) x0 = zeros(opt.dtype, opt.segments[end][2]) for i = 1:length(opt.indices) x0[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][2], opt.feed_dict) @@ -280,7 +270,7 @@ robustness and ffine granite parameter control options. """ function OptimMinimize(sess::Session, loss::Tensor; dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing) - opt = LBFGSOptimizer(dtype, loss, sess, feed_dict) + opt = OptimOptimizer(dtype, loss, sess, feed_dict) function f(x) update_values(opt, x) res = run(sess, loss, feed_dict) From 1fcf5bb90538f31066ba8a305595d7423da8ed97 Mon Sep 17 00:00:00 2001 From: kailaix Date: Mon, 17 Dec 2018 11:45:42 -0800 Subject: [PATCH 08/10] conv1d test passed --- src/ops/nn.jl | 19 +++++++++++++++++++ src/train.jl | 15 ++++++++------- test/nn.jl | 20 ++++++++++++++++++++ test/train.jl | 4 ++-- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/ops/nn.jl b/src/ops/nn.jl index dcd47a3c..cb808a50 100644 --- a/src/ops/nn.jl +++ b/src/ops/nn.jl @@ -36,6 +36,25 @@ import .rnn_cell: zero_state, output_size, state_size conv2d(input, filter; padding=padding, strides=strides, kwargs...) end +@tf.op function conv1d(input, filter_, strides_::Int64, padding::String; data_format="NHWC", kwargs...) + spatial_start_dim = 0 + if data_format=="NHWC" + strides_ = [1,1,strides_,1] + spatial_start_dim = 2 + elseif data_format == "NCHW" || data_format == "NCW" + data_format = "NCHW" + spatial_start_dim = 3 + strides_ = [1,1,1,strides_] + else + @error "data_format must be NHWC or NCHW or NCW" + end + input = Ops.expand_dims(input, spatial_start_dim) + filter_ = Ops.expand_dims(filter_, 1) + result = Ops.conv2d(input, filter_; strides = strides_, padding = padding, data_format=data_format, kwargs...) 
+ result = Ops.squeeze(result, squeeze_dims=[spatial_start_dim-1]) + return result +end + # Same for max pool @tf.op function max_pool(input, ksize, strides, padding; kwargs...) max_pool(input; ksize=ksize, strides=strides, padding=padding, kwargs...) diff --git a/src/train.jl b/src/train.jl index cabd695a..d05ae6c2 100644 --- a/src/train.jl +++ b/src/train.jl @@ -7,7 +7,7 @@ apply_gradients, GradientDescentOptimizer, MomentumOptimizer, AdamOptimizer, -OptimMinimize, +optim_minimize, Saver, save, restore, @@ -236,10 +236,10 @@ function compute_init(opt::OptimOptimizer) end """ -OptimMinimize(sess::Session, loss::AbstractTensor; +optim_minimize(sess::Session, loss::AbstractTensor; dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing) -`OptimMinimize` calls first order optimization solvers from Optim.jl package (https://github.com/JuliaNLSolvers/Optim.jl). +`optim_minimize` calls first order optimization solvers from Optim.jl package (https://github.com/JuliaNLSolvers/Optim.jl). `sess`: current session `loss`: the loss function to minimize `dtype`: the computation value type (default Float64) @@ -257,7 +257,7 @@ function mycallback(handle) end options = Optim.Options(show_trace = false, iterations=1000, callback = mycallback, allow_f_increases=true) -OptimMinimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method="AGD") +optim_minimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method="AGD") ``` Note @@ -268,8 +268,8 @@ Note that this optimizer is not built as part of the graph. Rather, it contructs implement; (2) there is some overhead. However, it would be nice to call the solvers from Optim.jl directly and leverage the robustness and ffine granite parameter control options. 
""" -function OptimMinimize(sess::Session, loss::Tensor; - dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing) +function optim_minimize(sess::Session, loss::Tensor; + dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options::Union{Nothing, Optim.Options}=nothing) opt = OptimOptimizer(dtype, loss, sess, feed_dict) function f(x) update_values(opt, x) @@ -298,7 +298,8 @@ function OptimMinimize(sess::Session, loss::Tensor; Available Optimier: * LBFGS * BFGS -* AGD +* AGD (AcceleratedGradientDescent) +* GC (ConjugateGradient) """ end diff --git a/test/nn.jl b/test/nn.jl index 6ef29e52..2bf1e64c 100644 --- a/test/nn.jl +++ b/test/nn.jl @@ -4,6 +4,26 @@ using StatsFuns using Random import LinearAlgebra +@testset "conv1d" begin + let + sess = Session(Graph()) + F = zeros(Float32, 2, 3, 4) # batch_size = 2, dimension = 3, channle = 4 + for i = 1:2 + for j = 1:3 + for k = 1:4 + F[i,j,k] = Float32(i+j+k-3) + end + end + end + input = constant(F) + filter_ = constant(ones(Float32, 3, 4, 1)) # width = 3, input channel = 4 output channel = 1 + output = nn.conv1d(input, filter_, 2, "VALID") + output_val = run(sess, output) + ref_val = reshape(Float32[30.0;42.0], 2, 1, 1) + @test ref_val ≈ output_val + end +end + @testset "conv2d_transpose" begin let sess = Session(Graph()) diff --git a/test/train.jl b/test/train.jl index c7ff3a2c..ea5a6a26 100644 --- a/test/train.jl +++ b/test/train.jl @@ -126,8 +126,8 @@ end for m in ["AGD", "CG", "BFGS", "LBFGS"] run(sess, global_variables_initializer()) - options = Optim.Options(show_trace = false, iterations=50, callback = mycallback, allow_f_increases=true) - train.OptimMinimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method=m) + options = Optim.Options(show_trace = false, iterations=10, callback = mycallback, allow_f_increases=true) + train.optim_minimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method=m) end end \ No newline at end of file From e9f2374ccf4de281783f93a5ddcf3aea7badc4c9 Mon Sep 17 00:00:00 2001 From: kailaix Date: Mon, 17 Dec 2018 14:09:15 -0800 Subject: [PATCH 09/10] update --- src/ops/nn.jl | 19 ------------------- src/ops/transformations.jl | 6 +++--- test/nn.jl | 20 -------------------- test/transformations.jl | 8 -------- 4 files changed, 3 insertions(+), 50 deletions(-) diff --git a/src/ops/nn.jl b/src/ops/nn.jl index cb808a50..dcd47a3c 100644 --- a/src/ops/nn.jl +++ b/src/ops/nn.jl @@ -36,25 +36,6 @@ import .rnn_cell: zero_state, output_size, state_size conv2d(input, filter; padding=padding, strides=strides, kwargs...) end -@tf.op function conv1d(input, filter_, strides_::Int64, padding::String; data_format="NHWC", kwargs...) - spatial_start_dim = 0 - if data_format=="NHWC" - strides_ = [1,1,strides_,1] - spatial_start_dim = 2 - elseif data_format == "NCHW" || data_format == "NCW" - data_format = "NCHW" - spatial_start_dim = 3 - strides_ = [1,1,1,strides_] - else - @error "data_format must be NHWC or NCHW or NCW" - end - input = Ops.expand_dims(input, spatial_start_dim) - filter_ = Ops.expand_dims(filter_, 1) - result = Ops.conv2d(input, filter_; strides = strides_, padding = padding, data_format=data_format, kwargs...) - result = Ops.squeeze(result, squeeze_dims=[spatial_start_dim-1]) - return result -end - # Same for max pool @tf.op function max_pool(input, ksize, strides, padding; kwargs...) max_pool(input; ksize=ksize, strides=strides, padding=padding, kwargs...) 
diff --git a/src/ops/transformations.jl b/src/ops/transformations.jl index 9b9369fa..2d17705c 100644 --- a/src/ops/transformations.jl +++ b/src/ops/transformations.jl @@ -380,16 +380,16 @@ Returns: local result with_op_name(name, "Transpose") do if perm === nothing - r = range(constant(1), LinearAlgebra.rank(n)) + r = range(constant(0), LinearAlgebra.rank(n)-1) perm = reverse(r, [true]) end - result = Ops.transpose(n, perm .- 1) + result = Ops.transpose(n, perm) end result end @op function Base.permutedims(n::AbstractTensor, perm; name=nothing) - transpose(n, perm; name=name) + transpose(n, perm .- 1; name=name) end @define_unary Base.adjoint transpose diff --git a/test/nn.jl b/test/nn.jl index 2bf1e64c..6ef29e52 100644 --- a/test/nn.jl +++ b/test/nn.jl @@ -4,26 +4,6 @@ using StatsFuns using Random import LinearAlgebra -@testset "conv1d" begin - let - sess = Session(Graph()) - F = zeros(Float32, 2, 3, 4) # batch_size = 2, dimension = 3, channle = 4 - for i = 1:2 - for j = 1:3 - for k = 1:4 - F[i,j,k] = Float32(i+j+k-3) - end - end - end - input = constant(F) - filter_ = constant(ones(Float32, 3, 4, 1)) # width = 3, input channel = 4 output channel = 1 - output = nn.conv1d(input, filter_, 2, "VALID") - output_val = run(sess, output) - ref_val = reshape(Float32[30.0;42.0], 2, 1, 1) - @test ref_val ≈ output_val - end -end - @testset "conv2d_transpose" begin let sess = Session(Graph()) diff --git a/test/transformations.jl b/test/transformations.jl index 4c0e9980..1d9658dc 100644 --- a/test/transformations.jl +++ b/test/transformations.jl @@ -33,14 +33,6 @@ end @testset "Permute Dims" begin @test ones(Float32, 4,3) == run(sess, transpose(ones(Tensor, (3, 4)))) @test ones(Float32, 4,3,2) == run(sess, permutedims(ones(Tensor, (4, 2, 3)), [1, 3, 2])) - - A = rand(Float32, 5, 5, 5) - B = permutedims(A, [3,2,1]) - c = TensorFlow.constant(A) - d = transpose(c, [3,2,1]) - result = run(sess, d) - @test maximum(abs.(result-B))≈0.0 - end From 1e3ae4eeee5d9d0bc36b6b45ed281e94a2d39688 Mon Sep 17 00:00:00 2001 From: kailaix Date: Thu, 13 Dec 2018 14:11:34 -0800 Subject: [PATCH 10/10] optim --- REQUIRE | 1 + deps/build.jl | 4 +- deps/default_imports.txt | 3 + src/TensorFlow.jl | 1 + src/train.jl | 131 ++++++++++++++++++++++++++++++++++++++- test/train.jl | 53 ++++++++++++++++ 6 files changed, 190 insertions(+), 3 deletions(-) diff --git a/REQUIRE b/REQUIRE index 080e1af8..79a4e614 100644 --- a/REQUIRE +++ b/REQUIRE @@ -13,3 +13,4 @@ MacroTools 0.3.6 AutoHashEquals 0.1.0 MLDatasets 0.3.0 SpecialFunctions 0.7.0 +Optim 0.17.0 diff --git a/deps/build.jl b/deps/build.jl index 1a104aed..caabe43d 100644 --- a/deps/build.jl +++ b/deps/build.jl @@ -1,8 +1,8 @@ using PyCall using Conda -const cur_version = "1.10.0" -const cur_py_version = "1.10.0" +const cur_version = "1.12.0" +const cur_py_version = "1.12.0" ############################ diff --git a/deps/default_imports.txt b/deps/default_imports.txt index 9368e056..839cf659 100644 --- a/deps/default_imports.txt +++ b/deps/default_imports.txt @@ -166,3 +166,6 @@ Rank Conv2DBackpropInput Svd Cross +FFT +ComplexAbs +MatrixSolve diff --git a/src/TensorFlow.jl b/src/TensorFlow.jl index 69b76c45..228c93ac 100644 --- a/src/TensorFlow.jl +++ b/src/TensorFlow.jl @@ -128,6 +128,7 @@ tf_versioninfo using Distributed +using Optim const pyproc = Ref(0) diff --git a/src/train.jl b/src/train.jl index 0514ce99..d05ae6c2 100644 --- a/src/train.jl +++ b/src/train.jl @@ -7,6 +7,7 @@ apply_gradients, GradientDescentOptimizer, MomentumOptimizer, AdamOptimizer, +optim_minimize, Saver, 
 save,
 restore,
@@ -25,8 +26,10 @@ using Compat
 using JLD2
 using FileIO
 using ProtoBuf
+using Optim
 import Printf
+
 import ..TensorFlow: Graph, Operation, get_def_graph, extend_graph, gradients, variable_scope, ConstantInitializer, node_name, get_variable, get_shape, get_collection, Session, placeholder, Tensor, Variable, cast, group, @not_implemented, AbstractQueue, tensorflow, add_to_collection, get_proto, get_def, @op
 import TensorFlow
@@ -183,6 +186,132 @@ function apply_gradients(optimizer::AdamOptimizer, grads_and_vars; global_step=n
     return group(ops...)
 end
 
+mutable struct OptimOptimizer
+    indices::Array{Array{Int64}}
+    segments::Array{Array{Int64}}
+    sess::Session
+    vars::Array{Tuple{Any,Any},1}
+    dtype::Type
+    feed_dict::Dict
+end
+
+function OptimOptimizer(dtype::Type, loss::Tensor, sess::Session, feed_dict::Dict=Dict())
+    var_list = get_def_graph().collections[:TrainableVariables]
+    vars = zip(gradients(loss, var_list), var_list) |> collect
+    filter!(x->x[1]!==nothing, vars)
+
+    indices = Array{Int64}[]
+    segments = Array{Int64}[]
+    idx = 1
+    for i = 1:length(vars)
+        W = run(sess, vars[i][2], feed_dict)
+        push!(indices, [ i for i in size(W)])
+        push!(segments, [idx; idx+length(W)-1])
+        idx += length(W)
+    end
+    OptimOptimizer(indices, segments, sess, vars, dtype, feed_dict)
+end
+
+function update_values(opt::OptimOptimizer, x)
+    for i = 1:length(opt.indices)
+        x0 = reshape(x[opt.segments[i][1]:opt.segments[i][2]], opt.indices[i]...)
+        run(opt.sess, tf.assign(opt.vars[i][2], x0))
+    end
+end
+
+function compute_grads(opt::OptimOptimizer)
+    grads = zeros(opt.dtype, opt.segments[end][2])
+    for i = 1:length(opt.indices)
+        grads[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][1], opt.feed_dict)
+    end
+    return grads
+end
+
+function compute_init(opt::OptimOptimizer)
+    x0 = zeros(opt.dtype, opt.segments[end][2])
+    for i = 1:length(opt.indices)
+        x0[opt.segments[i][1]:opt.segments[i][2]] = run(opt.sess, opt.vars[i][2], opt.feed_dict)
+    end
+    return x0
+end
+
+"""
+optim_minimize(sess::Session, loss::AbstractTensor;
+dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options=nothing)
+
+`optim_minimize` calls first-order optimization solvers from the Optim.jl package (https://github.com/JuliaNLSolvers/Optim.jl).
+`sess`: the current session
+`loss`: the loss function to minimize
+`dtype`: the computation value type (default `Float64`)
+`feed_dict`: a dictionary for placeholders
+`method`: four methods are supported: `LBFGS` (default), `BFGS`, `AGD` (AcceleratedGradientDescent), `CG` (ConjugateGradient)
+`options`: an `Optim.Options` instance; see the Optim.jl documentation for details
+
+Example
+=======
+```
+function mycallback(handle)
+    res = run(sess, Loss, Dict(X=>x, Y_obs=>y))
+    println("iter \$(handle.iteration): \$(res)")
+    return false # so it does not stop
+end
+
+options = Optim.Options(show_trace = false, iterations=1000, callback = mycallback, allow_f_increases=true)
+optim_minimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method="AGD")
+```
+
+Note
+=======
+
+This optimizer is not built as part of the graph. Rather, it constructs an objective function and a gradient function that
+call `run(sess, ...)` on every iteration. This approach has drawbacks: (1) stochastic gradient descent is not easy to
+implement; (2) there is some overhead. In return, it calls the solvers from Optim.jl directly and leverages their
+robustness and fine-grained parameter control options.
+""" +function optim_minimize(sess::Session, loss::Tensor; + dtype::Type = Float64, feed_dict::Dict = Dict(), method::String = "LBFGS", options::Union{Nothing, Optim.Options}=nothing) + opt = OptimOptimizer(dtype, loss, sess, feed_dict) + function f(x) + update_values(opt, x) + res = run(sess, loss, feed_dict) + return res + end + + function g!(G, x) + update_values(opt, x) + G[:] = compute_grads(opt) + end + + x0 = compute_init(opt) + + optimizer = nothing + if method=="LBFGS" + optimizer = LBFGS() + elseif method=="BFGS" + optimizer = BFGS() + elseif method=="AGD" + optimizer = AcceleratedGradientDescent() + elseif method=="CG" + optimizer = ConjugateGradient() + else + @error """ +Available Optimier: +* LBFGS +* BFGS +* AGD (AcceleratedGradientDescent) +* GC (ConjugateGradient) +""" + + end + if options===nothing + return optimize(f, g!, x0, optimizer) + else + return optimize(f, g!, x0, optimizer, options) + end +end + + + mutable struct Saver var_list max_to_keep @@ -417,4 +546,4 @@ function SummaryWriter(args...; kwargs...) TensorFlow.summary.FileWriter(args...; kwargs...) end -end +end \ No newline at end of file diff --git a/test/train.jl b/test/train.jl index 7a52bf50..ea5a6a26 100644 --- a/test/train.jl +++ b/test/train.jl @@ -1,5 +1,6 @@ using TensorFlow using Test +using Optim @testset "save and resore" begin try @@ -78,3 +79,55 @@ end end end end + + +@testset "optimizers" begin + using Distributions + # Generate some synthetic data + x = randn(100, 50) + w = randn(50, 10) + y_prob = exp.(x*w) + y_prob ./= sum(y_prob,dims=2) + + function draw(probs) + y = zeros(size(probs)) + for i in 1:size(probs, 1) + idx = rand(Categorical(probs[i, :])) + y[i, idx] = 1 + end + return y + end + + y = draw(y_prob) + + # Build the model + sess = Session(Graph()) + + X = placeholder(Float64, shape=[-1, 50]) + Y_obs = placeholder(Float64, shape=[-1, 10]) + + variable_scope("logisitic_model"; initializer=Normal(0, .001)) do + global W = get_variable("W", [50, 10], Float64) + global B = get_variable("B", [10], Float64) + end + + Y=nn.softmax(X*W + B) + Loss = -reduce_sum(log(Y).*Y_obs) + + function mycallback(handle) + res = run(sess, Loss, Dict(X=>x, Y_obs=>y)) + println("iter $(handle.iteration): $(res)") + if isnan(res) || isinf(res) + return true + else + return false # so it do not stop + end + end + + for m in ["AGD", "CG", "BFGS", "LBFGS"] + run(sess, global_variables_initializer()) + options = Optim.Options(show_trace = false, iterations=10, callback = mycallback, allow_f_increases=true) + train.optim_minimize(sess, Loss, feed_dict = Dict(X=>x, Y_obs=>y), options=options, method=m) + end + +end \ No newline at end of file