
Commit 8dfe442

sliced prox allows operators
1 parent 693ccfc commit 8dfe442

7 files changed: +131 -97 lines changed

Diff for: docs/src/tutorial.md

+39-20
@@ -12,13 +12,13 @@ where $f$ is a smooth function while $g$ is possibly nonsmooth.
 
 ## Unconstraint optimization
 
-The LASSO problem is popular example of this class of problems:
+The *least absolute shrinkage and selection operator* (LASSO) belongs to this class of problems:
 
 ```math
-\underset{ \mathbf{x} }{\text{minimize}} \ \tfrac{1}{2} \| \mathbf{A} \mathbf{x} - \mathbf{y} \|^2+ \| \mathbf{x} \|_1.
+\underset{ \mathbf{x} }{\text{minimize}} \ \tfrac{1}{2} \| \mathbf{A} \mathbf{x} - \mathbf{y} \|^2+ \lambda \| \mathbf{x} \|_1.
 ```
 
-Here the squared norm $\tfrac{1}{2} \| \mathbf{A} \mathbf{x} - \mathbf{y} \|^2$ is a _smooth_ function while the $l_1$-norm is a _nonsmooth_ function.
+Here the squared norm $\tfrac{1}{2} \| \mathbf{A} \mathbf{x} - \mathbf{y} \|^2$ is a *smooth* function while the $l_1$-norm is a *nonsmooth* function.
 
 This can be solved using `StructuredOptimization.jl` using only few lines of code:

@@ -64,13 +64,13 @@ for a nonempty set $\mathcal{S}$ the constraint of
 can be converted into an indicator function
 
 ```math
-g(\mathbf{x}) = \begin{cases}
+g(\mathbf{x}) = \delta_{\mathcal{S}} (\mathbf{x}) = \begin{cases}
 0 & \text{if} \ \mathbf{x} \in \mathcal{S},\\
 +\infty & \text{otherwise},
 \end{cases}
 ```
 
-to obtain the standard form. Constraints are treated as _nonsmooth functions_.
+to obtain the standard form. Constraints are treated as *nonsmooth functions*.
 
 This conversion is automatically performed by `StructuredOptimization.jl`.

@@ -133,35 +133,54 @@ julia> @minimize ls(X1*X2-Y) st X1 >= 0., X2 >= 0.
 
 ## Limitations
 
-**TODO simplify this**
+Currently `StructuredOptimization.jl` supports only *Proximal Gradient (aka Forward Backward) algorithms*, which require specific properties of the nonsmooth functions and constraints to be applicable.
 
-Currently `StructuredOptimization.jl` supports only Proximal Gradient (aka Forward Backward) algorithms, which require certain properties of the nonsmooth functions and costraint.
-
-In the general case a nonsmooth function of $M$ variables composed by $G$ terms can be written as:
+If we express the nonsmooth function $g$ as the composition of
+a function $\tilde{g}$ with a linear operator $A$:
 ```math
-g(\mathbf{x}_1,\dots,\mathbf{x}_M) =
-\sum_{i = 0}^G g_i \left(\sum_{j = 1}^{M}
-A_{i,j} \mathbf{x}_j \right).
+g(\mathbf{x}) =
+\tilde{g}(A \mathbf{x})
 ```
-where the functions $g_i$ are nonsmooth functions (or indicator functions resulting from constraints) and $A_{i,j}$ linear operators.
-
-The problem can be solved when $g$ satisfies the following conditions:
+then the problem can be solved when $g$ satisfies the following properties:
 
-1. for all $i\in \{1,\ldots,G \}$ and $j\in\{1,\ldots,M \}$, mapping $A_{i,j}$ satisfies $A_{i,j}^* A_{i,j} = \mu_{i,j} I$, where $\mu_{i,j} \geq 0$, $A^*$ is the adjoint of $A$ and $\mathcal{I}$ is the identity operator.
+1. the mapping $A$ must be a *tight frame*, namely it must satisfy $A A^* = \mu Id$, where $\mu \geq 0$, $A^*$ is the adjoint of $A$ and $Id$ is the identity operator.
 
-2. for all $j \in \{1,\dots,M \}$, the cardinality of $\{i | A_{i,j} \neq 0 \} = 1$.
+2. if $A$ is not a tight frame, then it must be possible to write $g$ as a *separable* sum $g(\mathbf{x}) = \sum_j h_j (B_j \mathbf{x}_j)$, with $\mathbf{x}_j$ being non-overlapping slices of $\mathbf{x}$ and $B_j$ being tight frames.
 
 Let us analyze these rules with a series of examples.
 
-The previous example was satisfing the rules:
+The LASSO example above satisfies the first rule:
 ```julia
-@minimize ls(X1*X2-Y) st X1 >= 0., X2 >= 0.
+julia> @minimize ls( A*x - y ) + λ*norm(x, 1)
 ```
-Here there are two constraints each one containing only one variable and
+since the non-smooth function $\lambda \| \cdot \|_1$ is not composed with any operator (or, equivalently, is composed with $Id$, which is a tight frame).
 
+Also the following problem would be accepted:
+```julia
+julia> @minimize ls( A*x - y ) + λ*norm(dct(x), 1)
+```
+since the discrete cosine transform (DCT) is orthogonal and is therefore a tight frame.
 
+On the other hand, the following problem
+```julia
+julia> @minimize ls( A*x - y ) + λ*norm(x, 1) st x >= 1.0
+```
+cannot be solved through proximal gradient algorithms, since the second rule would be violated.
+Here the constraint would be converted into an indicator function and the nonsmooth function $g$ can be written as the sum
+```math
+g(\mathbf{x}) = \lambda \| \mathbf{x} \|_1 + \delta_{\mathcal{S}} (\mathbf{x})
+```
+which is not separable.
 
+By contrast, this problem would be accepted:
+```julia
+julia> @minimize ls( A*x - y ) + λ*norm(x[1:div(n,2)], 1) st x[div(n,2)+1:n] >= 1.0
+```
+as now the optimization variable $\mathbf{x}$ is partitioned into non-overlapping groups.
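The tutorial changes above summarize what this commit enables: a nonsmooth term may now act on a slice of a variable while also being composed with a tight-frame operator such as the DCT. Below is a minimal, hypothetical sketch of such a problem, written only with the API already shown in the tutorial; the problem sizes and the weight `1e-1` are made up for illustration.

```julia
using StructuredOptimization

n = 10
A = randn(5, n)
y = randn(5)
x = Variable(n)

# l1-penalize the DCT coefficients of the first half of x (a slice composed with a tight frame),
# and bound the second half of x (a slice converted internally into an indicator function):
@minimize ls( A*x - y ) + 1e-1*norm(dct(x[1:div(n,2)]), 1) st x[div(n,2)+1:n] >= 1.0
```

Before this commit, multiple nonsmooth terms on the same variable were accepted only when their operators were plain `GetIndex` slices; composing a slice with an operator such as `dct` is what the source changes below allow.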

Diff for: src/solvers/terms_extract.jl

+21-11
@@ -70,14 +70,23 @@ end
 
 # extract function and merge operator
 function extract_merge_functions(t::Term)
-    if is_eye(operator(t))
+    if is_sliced(t)
+        if typeof(operator(t)) <: Compose
+            op = operator(t).A[2]
+        else
+            op = Eye(size(operator(t),1)...)
+        end
+    else
+        op = operator(t)
+    end
+    if is_eye(op)
         f = displacement(t) == 0 ? t.f : PrecomposeDiagonal(t.f, 1.0, displacement(t))
-    elseif is_diagonal(operator(t))
-        f = PrecomposeDiagonal(t.f, diag(operator(t)), displacement(t))
-    elseif is_AAc_diagonal(operator(t))
-        f = Precompose(t.f, operator(t), diag_AAc(operator(t)), displacement(t))
+    elseif is_diagonal(op)
+        f = PrecomposeDiagonal(t.f, diag(op), displacement(t))
+    elseif is_AAc_diagonal(op)
+        f = Precompose(t.f, op, diag_AAc(op), displacement(t))
     end
-    f = t.lambda == 1. ? f : Postcompose(f, t.lambda) #for now I keep this
+    f = t.lambda == 1. ? f : Postcompose(f, t.lambda) #for now I keep this
     #TODO change this
     return f
 end
@@ -95,13 +104,14 @@ function extract_proximable(xAll::NTuple{N,Variable}, t::NTuple{M,Term}) where {
         fx = IndFree()
     elseif length(tx) == 1 #only one term per variable
         fx = extract_proximable(x,tx[1])
-    else #multiple terms per variable
-        #currently this happens only with GetIndex
-
+    else
+        #multiple terms per variable
+        #currently this happens only with GetIndex
         fxi,idxs = (),()
         for ti in tx
-            fxi = (fxi..., extract_functions(ti))
-            idxs = (idxs...,operator(ti).idx )
+            fxi = (fxi..., extract_merge_functions(ti))
+            idx = typeof(operator(ti)) <: Compose ? operator(ti).A[1].idx : operator(ti).idx
+            idxs = (idxs..., idx )
         end
         fx = SlicedSeparableSum(fxi,idxs)
     end
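In the multi-term branch above, the per-term functions are merged into a `SlicedSeparableSum` from `ProximalOperators.jl`, with each term's index range recovered either directly from its `GetIndex` operator or, after this commit, from the first element of a `Compose`. As a rough, hypothetical sketch of what the assembled object amounts to for a cost like `10*norm(x[1:2]-b,1) + norm(dct(x[3:5]),2)` (ignoring the translation by `b` and the DCT precomposition, which the real code handles via `PrecomposeDiagonal`/`Precompose`):

```julia
using ProximalOperators

# Two proximable pieces acting on disjoint slices of a length-5 vector:
f1 = NormL1(10.0)                                   # weighted l1 norm for x[1:2]
f2 = NormL2()                                       # l2 norm for x[3:5]
g  = SlicedSeparableSum((f1, f2), ((1:2,), (3:5,)))

x = randn(5)
y, gy = prox(g, x, 1.0)                             # the prox is evaluated slice by slice
```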

Diff for: src/solvers/terms_properties.jl

+1-7
@@ -14,18 +14,12 @@ function is_proximable(terms::Tuple)
     for v in vars
         tv = [t for t in terms if v in variables(t)]
         if length(tv) != 1
-            #TODO make this more general
-            if all( (<:).(typeof.(operator.(tv)), GetIndex) )
+            if all( is_sliced.(tv) ) && all( is_proximable.(tv) )
                 return true
             else
                 return false
             end
         end
     end
-    # NOTE: I see why GetIndex requires a special case. However, it is a more
-    # general case than just GetIndex, and I would postpone its implementation,
-    # unless we have a very concrete and important example where this is
-    # strictly required...
-    # I agree... but we have this in the Audio Declipping demo!
     return true
 end
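With this change, several nonsmooth terms on the same variable are considered proximable as long as every term is sliced (and itself proximable), rather than requiring each operator to be exactly a `GetIndex`. A small sketch using the internal predicate the tests below rely on; the expected values follow from the new rule, assuming disjoint slices:

```julia
using StructuredOptimization

x = Variable(randn(5))

# accepted: both terms act on non-overlapping slices, one of them composed with the DCT
cf_ok = norm(x[1:2], 1) + norm(dct(x[3:5]), 2)
StructuredOptimization.is_proximable(cf_ok)   # expected: true

# rejected: both terms act on the whole variable, so no sliced separable sum exists
cf_bad = norm(x, 1) + norm(x, 2)
StructuredOptimization.is_proximable(cf_bad)  # expected: false
```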

Diff for: src/syntax/terms/term.jl

+3-1
@@ -66,7 +66,9 @@ is_f = [:is_linear,
         :is_orthogonal,
         :is_invertible,
         :is_full_row_rank,
-        :is_full_column_rank]
+        :is_full_column_rank,
+        :is_sliced
+        ]
 
 for f in is_f
     @eval begin
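The hunk above only shows the head of the `@eval` loop; its body is cut off by the diff. As a purely hypothetical, self-contained illustration of the metaprogramming pattern involved (not the package's actual code), adding a symbol such as `:is_sliced` to the list makes the loop generate one forwarding method per listed predicate:

```julia
# Standalone sketch of the pattern: predicates listed by name are expanded
# into methods that delegate to a wrapped object.
is_linear(A::Matrix) = true
is_sliced(A::Matrix) = false

struct Wrapper
    A::Matrix{Float64}
end

for f in [:is_linear, :is_sliced]
    @eval $f(w::Wrapper) = $f(w.A)   # one delegating method per listed predicate
end

w = Wrapper(randn(2, 2))
is_linear(w)   # true
is_sliced(w)   # false
```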

Diff for: test/runtests.jl

+1
@@ -22,6 +22,7 @@ end
 
 @testset "Problem construction" begin
     include("test_problem.jl")
+    include("test_build_minimize.jl")
 end
 
 @testset "Integration tests" begin

Diff for: test/test_build_minimize.jl

+50
@@ -0,0 +1,50 @@
+@printf("\n Testing solver build \n")
+
+x = Variable(10)
+A = randn(5, 10)
+y = Variable(7)
+B = randn(5, 7)
+b = randn(5)
+
+prob = problem(ls(A*x + b), norm(x, 2) <= 1.0)
+built_slv = build(prob, StructuredOptimization.PG())
+solve!(built_slv)
+
+~x .= 0.
+prob = problem(ls(A*x - B*y + b) + norm(y, 1), norm(x, 2) <= 1.0)
+built_slv = build(prob, FPG())
+solve!(built_slv)
+
+@printf("\n Testing @minimize \n")
+~x .= 0.
+~y .= 0.
+slv, = @minimize ls(A*x - B*y + b) st norm(x, 2) <= 1e4, norm(y, 1) <= 1.0 with PG()
+~x .= 0.
+slv, = @minimize ls(A*x - b) st norm(x, 1) <= 1.0 with PG()
+~x .= 0.
+slv, = @minimize ls(A*x - b) st norm(x, 1) <= 1.0
+~x .= 0.
+slv, = @minimize ls(A*x - b) + norm(x, 1) with PG()
+~x .= 0.
+slv, = @minimize ls(A*x - b) + norm(x, 1)
+~x .= 0.
+slv, = @minimize ls(A*x - b)
+
+#TODO many many more tests
+x = Variable(5)
+A = randn(10, 5)
+b = randn(10)
+
+@printf("\n Testing @minimize nonlinear \n")
+slv, = @minimize ls(sigmoid(A*x,10) - b)+norm(x,1) with PG()
+xpg = copy(~x)
+~x .= 0.
+slv, = @minimize ls(sigmoid(A*x,10) - b)+norm(x,1) with ZeroFPR()
+xz = copy(~x)
+~x .= 0.
+slv, = @minimize ls(sigmoid(A*x,10) - b)+norm(x,1) with PANOC()
+xp = copy(~x)
+~x .= 0.
+
+@test norm(xz-xpg) <1e-4
+@test norm(xp-xpg) <1e-4

Diff for: test/test_problem.jl

+16-58
@@ -188,13 +188,22 @@ f = StructuredOptimization.extract_proximable(xAll,cf)
 #@test norm(f(~x) - 10*norm(fft(~x)-b,1)) < 1e-12
 
 # single variable, multiple terms with GetIndex
-x = Variable(randn(5))
-b = randn(2)
-cf = 10*norm(x[1:2]-b,1)+norm(x[3:5],2)
-xAll = StructuredOptimization.extract_variables(cf)
-@test StructuredOptimization.is_proximable(cf) == true
-f = StructuredOptimization.extract_proximable(xAll,cf)
-@test norm(f(~x) - sum([10*norm((~x)[1:2]-b,1);norm((~x)[3:5],2)])) < 1e-12
+x = Variable(randn(5))
+b = randn(2)
+cf = 10*norm(x[1:2]-b,1)+norm(x[3:5],2)
+xAll = StructuredOptimization.extract_variables(cf)
+@test StructuredOptimization.is_proximable(cf) == true
+f = StructuredOptimization.extract_proximable(xAll,cf)
+@test norm(f(~x) - sum([10*norm((~x)[1:2]-b,1);norm((~x)[3:5],2)])) < 1e-12
+
+# single variable, multiple terms with GetIndex composed with dct
+x = Variable(randn(5))
+b = randn(2)
+cf = 10*norm(x[1:2]-b,1)+norm(dct(x[3:5]),2)
+xAll = StructuredOptimization.extract_variables(cf)
+@test StructuredOptimization.is_proximable(cf) == true
+f = StructuredOptimization.extract_proximable(xAll,cf)
+@test norm(f(~x) - sum([10*norm((~x)[1:2]-b,1);norm(dct((~x)[3:5]),2)])) < 1e-12
 
 # multiple variables, multiple terms
 x1 = Variable(randn(5))
@@ -246,54 +255,3 @@ xAll = (x1,x2)
 f = StructuredOptimization.extract_proximable(xAll,cf)
 @test norm(f.fs[1](~x1)-norm((~x1)[1:2]+b1[1:2],2)-norm((~x1)[3:5]+b1[3:5],1) ) < 1e-12
 @test norm(f.fs[2](~x2)-10*norm(~x2-b2,1) ) < 1e-12
-
-@printf("\n Testing solver build \n")
-
-x = Variable(10)
-A = randn(5, 10)
-y = Variable(7)
-B = randn(5, 7)
-b = randn(5)
-
-prob = problem(ls(A*x + b), norm(x, 2) <= 1.0)
-built_slv = build(prob, StructuredOptimization.PG())
-solve!(built_slv)
-
-~x .= 0.
-prob = problem(ls(A*x - B*y + b) + norm(y, 1), norm(x, 2) <= 1.0)
-built_slv = build(prob, FPG())
-solve!(built_slv)
-
-@printf("\n Testing @minimize \n")
-~x .= 0.
-~y .= 0.
-slv, = @minimize ls(A*x - B*y + b) st norm(x, 2) <= 1e4, norm(y, 1) <= 1.0 with PG()
-~x .= 0.
-slv, = @minimize ls(A*x - b) st norm(x, 1) <= 1.0 with PG()
-~x .= 0.
-slv, = @minimize ls(A*x - b) st norm(x, 1) <= 1.0
-~x .= 0.
-slv, = @minimize ls(A*x - b) + norm(x, 1) with PG()
-~x .= 0.
-slv, = @minimize ls(A*x - b) + norm(x, 1)
-~x .= 0.
-slv, = @minimize ls(A*x - b)
-
-#TODO many many more tests
-x = Variable(5)
-A = randn(10, 5)
-b = randn(10)
-
-@printf("\n Testing @minimize nonlinear \n")
-slv, = @minimize ls(sigmoid(A*x,10) - b)+norm(x,1) with PG()
-xpg = copy(~x)
-~x .= 0.
-slv, = @minimize ls(sigmoid(A*x,10) - b)+norm(x,1) with ZeroFPR()
-xz = copy(~x)
-~x .= 0.
-slv, = @minimize ls(sigmoid(A*x,10) - b)+norm(x,1) with PANOC()
-xp = copy(~x)
-~x .= 0.
-
-@test norm(xz-xpg) <1e-4
-@test norm(xp-xpg) <1e-4
