Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add example with sparse_initializer #1038

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
291 changes: 291 additions & 0 deletions docs/tutorial/plot_sparse_tfidf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
# SPDX-License-Identifier: Apache-2.0

"""
TfIdf, SVC and sparse matrices
==============================

.. index:: sparse

This example is useful to anyone who wants to convert a pipeline
combining a TfidfVectorizer and an SVC when the features are sparse.

The pipeline
++++++++++++
"""
import os
import pickle
import numpy as np
import scipy
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from onnxruntime import InferenceSession
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.common.data_types import StringTensorType
from skl2onnx.common._topology import Scope, Operator
from skl2onnx.common._container import ModelComponentContainer
from skl2onnx.common.data_types import (
DoubleTensorType,
FloatTensorType,
guess_proto_type,
)


# Tiny corpus: four documents, two classes. The array is kept as a column
# of shape (4, 1); scikit-learn is always fed the flattened view below.
X_train = np.array(
    [
        "This is the first document",
        "This document is the second document.",
        "And this is the third one",
        "Is this the first document?",
    ]
).reshape((4, 1))
y_train = np.array([0, 1, 0, 1])

# Reference pipeline: TF-IDF features fed directly to an RBF SVC.
model_pipeline = Pipeline(
    [
        (
            "vectorizer",
            TfidfVectorizer(
                lowercase=True,
                use_idf=True,
                ngram_range=(1, 3),
                max_features=30000,
            ),
        ),
        (
            "classifier",
            SVC(
                class_weight="balanced",
                kernel="rbf",
                gamma="scale",
                probability=True,
            ),
        ),
    ]
)
model_pipeline.fit(X_train.ravel(), y_train)

# Check the storage of the vectorizer output with the public helper
# rather than a class living in a private scipy module.
out0 = model_pipeline.steps[0][-1].transform(X_train.ravel())
is_sparse = scipy.sparse.issparse(out0)
print(f"Output type for TfIdfVectorizier is {'sparse' if is_sparse else 'dense'}.")

# Same check for the support vectors of the classifier. The result is kept
# in its own variable so that the message reports the SVC, not the vectorizer.
svc_coef = model_pipeline.steps[1][-1].support_vectors_
is_svc_sparse = scipy.sparse.issparse(svc_coef)
print(f"Supports for SVC is {'sparse' if is_svc_sparse else 'dense'}.")
# Fraction of entries of the support vectors that are exactly zero.
sparsity = 1 - (svc_coef != 0).sum() / np.prod(svc_coef.shape)
print(f"sparsity={sparsity} and shape={svc_coef.shape}")


######################################
# Size Comparison
# +++++++++++++++

# Dump the fitted scikit-learn pipeline with pickle.
pkl_name = "model.pkl"
with open(pkl_name, "wb") as pkl_stream:
    pickle.dump(model_pipeline, pkl_stream)

# Convert the same pipeline to ONNX; the graph takes one column of strings.
onx_name = "model.onnx"
onx = to_onnx(
    model_pipeline,
    target_opset=18,
    options={SVC: {"zipmap": False}},
    initial_types=[("input", StringTensorType([None, 1]))],
)
onx_bytes = onx.SerializeToString()
with open(onx_name, "wb") as onx_stream:
    onx_stream.write(onx_bytes)

# Compare the size of both files on disk.
print(f"pickle size={os.stat(pkl_name).st_size}")
print(f"onnx size={os.stat(onx_name).st_size}")

#######################################
# On such a small model, the sizes do not show that SVC stores a sparse matrix
# while the ONNX SVMClassifier stores a dense one. If the matrix is 90% sparse,
# this part becomes about 10 times bigger once converted into ONNX.
#
# Tweak
# +++++
#
# The idea is to move the large matrix of coefficients out of SVC by
# reducing the number of dimensions.
# We could apply a PCA but it does not support sparse features.
# TruncatedSVD does, but the matrix it produces to reduce the dimension
# is dense. SparsePCA does not support sparse features either.
# Let's try something custom: a TruncatedSVD whose smallest coefficients
# are then set to zero.


class SparseTruncatedSVD(TruncatedSVD):
    """TruncatedSVD whose projection matrix is sparsified after fitting.

    Once the regular decomposition is fitted, the fraction ``sparsity`` of
    the coefficients of ``components_`` with the smallest absolute values
    is set to zero and ``components_`` is stored as a
    ``scipy.sparse.coo_matrix``.

    :param sparsity: fraction of the coefficients set to zero; the number
        of zeroed coefficients is ``int(sparsity * components_.size)``.

    Every other parameter is forwarded unchanged to ``TruncatedSVD``.
    """

    def __init__(
        self,
        n_components=2,
        *,
        algorithm="randomized",
        n_iter=5,
        n_oversamples=10,
        power_iteration_normalizer="auto",
        random_state=None,
        tol=0.0,
        sparsity=0.9,
    ):
        super().__init__(
            n_components,
            algorithm=algorithm,
            n_iter=n_iter,
            n_oversamples=n_oversamples,
            power_iteration_normalizer=power_iteration_normalizer,
            random_state=random_state,
            tol=tol,
        )
        self.sparsity = sparsity

    def fit_transform(self, X, y=None):
        """Fit the decomposition, sparsify ``components_``, project ``X``.

        :param X: training data, forwarded to ``TruncatedSVD.fit_transform``
        :param y: ignored by this class, forwarded to the parent
        :return: ``self.transform(X)`` computed with the sparsified matrix
        """
        # The parent call is only used to fit ``components_``; its output is
        # discarded because the projection must use the sparsified matrix.
        super().fit_transform(X, y)

        # The matrix. We could choose the coefficients to set to zero
        # by minimizing `(X @ M.T - X @ M0.T) ** 2`
        # where M is the original matrix and M0 the new one.
        # In a first approach, we just sort the coefficients by absolute value.
        shape = self.components_.shape
        coefficients = self.components_.ravel()
        last_index = int(self.sparsity * coefficients.size)
        # A stable sort breaks ties by position, smallest index first.
        smallest = np.argsort(np.abs(coefficients), kind="stable")[:last_index]
        coefficients[smallest] = 0
        self.components_ = scipy.sparse.coo_matrix(coefficients.reshape(shape))
        return self.transform(X)


# Same vectorizer and classifier as the reference pipeline, with the
# sparsified SVD inserted between them (10 components, 60% of zeros).
sparse_pipeline = Pipeline(
    [
        (
            "vectorizer",
            TfidfVectorizer(
                lowercase=True,
                use_idf=True,
                ngram_range=(1, 3),
                max_features=30000,
            ),
        ),
        ("sparse", SparseTruncatedSVD(10, sparsity=0.6)),
        (
            "classifier",
            SVC(
                class_weight="balanced",
                kernel="rbf",
                gamma="scale",
                probability=True,
            ),
        ),
    ]
)
sparse_pipeline.fit(X_train.ravel(), y_train)

# Count the samples on which both pipelines disagree (not the ones on
# which they agree).
expected = model_pipeline.predict(X_train.ravel())
got = sparse_pipeline.predict(X_train.ravel())
print(f"Number of different predicted labels: {(expected != got).sum()}")

# Mean of the absolute differences over every sample and every class,
# which is what the message announces.
expected = model_pipeline.predict_proba(X_train.ravel())
got = sparse_pipeline.predict_proba(X_train.ravel())
diff = np.abs(expected - got)
print(f"Average absolute difference for the probabilities: {diff.mean()}")

######################################
# Conversion to ONNX
# ++++++++++++++++++
#
# The new transformer cannot be converted because sklearn-onnx does not have any
# registered converter for it. We must implement it.
# We use the converter for TruncatedSVD as a base and a sparse matrix multiplication
# implemented in onnxruntime (see `OperatorKernels.md
# <https://github.com/microsoft/onnxruntime/blob/main/docs/OperatorKernels.md>`_).


def calculate_sparse_sklearn_truncated_svd_output_shapes(operator):
    """Set the type and shape of the single output of the operator.

    The output is a ``[N, K]`` tensor where ``N`` is the first dimension of
    the first input and ``K`` the ``n_components`` of the wrapped model.
    The element type is double when the first input is a
    ``DoubleTensorType`` and float in every other case.

    :param operator: operator whose ``outputs[0].type`` is updated in place
    """
    first_input = operator.inputs[0]
    if first_input.type.__class__ == DoubleTensorType:
        tensor_cls = DoubleTensorType
    else:
        # Any non-double input type falls back to float.
        tensor_cls = FloatTensorType
    n_rows = first_input.get_first_dimension()
    n_cols = operator.raw_operator.n_components
    operator.outputs[0].type = tensor_cls([n_rows, n_cols])


def convert_sparse_truncated_svd(
    scope: Scope, operator: Operator, container: ModelComponentContainer
):
    """Add to *container* the nodes computing the sparse projection.

    With ``M`` the fitted ``components_`` and ``X`` the operator input, the
    emitted graph is ``Transpose(SparseToDenseMatMul(M, Transpose(X)))``:
    the matrix is the first operand, so the input is transposed before the
    product and the result is transposed back.

    :param scope: used to create unique result names
    :param operator: operator wrapping the fitted model, its input and output
    :param container: receives the initializer and the three nodes
    """
    fitted = operator.raw_operator
    first_input = operator.inputs[0]

    # Only a double input keeps its own element type; anything else is
    # treated as float.
    if isinstance(first_input.type, DoubleTensorType):
        reference_type = first_input.type
    else:
        reference_type = FloatTensorType()
    proto_dtype = guess_proto_type(reference_type)

    # The projection matrix is registered as is, without any transposition.
    weights = fitted.components_
    weights_name = scope.get_unique_variable_name("transform_matrix")
    container.add_initializer(weights_name, proto_dtype, weights.shape, weights)

    # X -> X.T
    flipped_inputs = scope.get_unique_variable_name("transposed_inputs")
    container.add_node(
        "Transpose", first_input.full_name, flipped_inputs, perm=[1, 0]
    )

    # M @ X.T, with an operator from the com.microsoft domain.
    flipped_outputs = scope.get_unique_variable_name("transposed_outputs")
    container.add_node(
        "SparseToDenseMatMul",
        [weights_name, flipped_inputs],
        flipped_outputs,
        op_domain="com.microsoft",
        op_version=1,
    )

    # (M @ X.T).T is the final output.
    container.add_node(
        "Transpose", flipped_outputs, operator.outputs[0].full_name, perm=[1, 0]
    )


# Register the shape calculator and the converter of the new transformer.
update_registered_converter(
    SparseTruncatedSVD,
    "SparseTruncatedSVD",
    calculate_sparse_sklearn_truncated_svd_output_shapes,
    convert_sparse_truncated_svd,
)

# Convert the sparse pipeline, display the model and save it.
sparse_onx_name = "model_sparse.onnx"
sparse_onx = to_onnx(
    sparse_pipeline,
    target_opset=18,
    options={SVC: {"zipmap": False}},
    initial_types=[("input", StringTensorType([None, 1]))],
)
print(sparse_onx)
sparse_bytes = sparse_onx.SerializeToString()
with open(sparse_onx_name, "wb") as sparse_stream:
    sparse_stream.write(sparse_bytes)

# The first two sizes are printed again next to the new one for comparison.
print(f"pickle size={os.stat(pkl_name).st_size}")
print(f"onnx size={os.stat(onx_name).st_size}")
print(f"sparse onnx size={os.stat(sparse_onx_name).st_size}")

############################################
# Let's check it is working with onnxruntime.

# Load the saved sparse model on CPU and run it on the training column.
sess = InferenceSession(sparse_onx_name, providers=["CPUExecutionProvider"])
feeds = {"input": X_train}
got = sess.run(None, feeds)
print(got)


######################################
# Conclusion
# ++++++++++
#
# This option decreases the size of the onnx model by using one
# sparse matrix in the converted pipeline. It may bring an accuracy loss.
7 changes: 5 additions & 2 deletions skl2onnx/algebra/onnx_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from onnx.helper import make_graph, make_model
from onnx.numpy_helper import from_array
from scipy.sparse import coo_matrix
from ..proto import TensorProto
from ..proto import SparseTensorProto, TensorProto
from ..common.data_types import _guess_type_proto_str, _guess_type_proto_str_inv
from ..common._topology import (
Variable,
Expand Down Expand Up @@ -1101,7 +1101,10 @@ def to_onnx(
model_name,
container.inputs,
container.outputs,
container.initializers,
[i for i in container.initializers if isinstance(i, TensorProto)],
sparse_initializer=[
i for i in container.initializers if isinstance(i, SparseTensorProto)
],
)
onnx_model = make_model(graph)

Expand Down
16 changes: 6 additions & 10 deletions skl2onnx/common/_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ def add_initializer(self, name, onnx_type, shape, content):
"Sparse matrices require SparseTensorProto. Update onnx."
)
values_tensor = make_tensor(
name + "_v",
name,
data_type=onnx_type,
dims=(len(content.data),),
vals=content.data,
Expand Down Expand Up @@ -547,14 +547,7 @@ def add_initializer(self, name, onnx_type, shape, content):
cached_name = self.initializers_strings.get(content, None)
if cached_name is None:
self.initializers_strings[content] = name
self.add_node(
"Constant",
[],
[name],
sparse_value=sparse_tensor,
op_version=self.target_opset,
name=name + "_op",
)
self.initializers.append(sparse_tensor)
return sparse_tensor

self.add_node(
Expand Down Expand Up @@ -872,8 +865,10 @@ def ensure_topological_order(self):
name = inp.name
order[name] = 0
for inp in self.initializers:
name = inp.name
name = inp.name if hasattr(inp, "name") else inp.values.name
order[name] = 0
print("#", type(inp), name)
print("---", order)

n_iter = 0
missing_ops = []
Expand All @@ -891,6 +886,7 @@ def ensure_topological_order(self):
else:
maxi = None
missing_names.add(name)
print("***", name, order, node.input)
break
if maxi is None:
missing_ops.append(node)
Expand Down
18 changes: 16 additions & 2 deletions skl2onnx/common/_onnx_optimisation_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,14 @@ def _rename_graph_output(graph, old_name, new_name):
outputs.append(value_info)
nodes = list(graph.node)
nodes.append(_make_node("Identity", [old_name], [new_name]))
new_graph = make_graph(nodes, graph.name, graph.input, outputs, graph.initializer)
new_graph = make_graph(
nodes,
graph.name,
graph.input,
outputs,
graph.initializer,
sparse_initializer=graph.sparse_initializer,
)
new_graph.value_info.extend(graph.value_info)
return new_graph

Expand All @@ -207,7 +214,14 @@ def _rename_graph_input(graph, old_name, new_name):
inputs.append(value_info)
nodes = list(graph.node)
nodes.append(_make_node("Identity", [new_name], [old_name]))
new_graph = make_graph(nodes, graph.name, inputs, graph.output, graph.initializer)
new_graph = make_graph(
nodes,
graph.name,
inputs,
graph.output,
graph.initializer,
sparse_initializer=graph.sparse_initializer,
)
new_graph.value_info.extend(graph.value_info)
return new_graph

Expand Down
Loading
Loading