diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..291d057
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+/target
+**/*.rs.bk
+Cargo.lock
+
+# IDEs
+.idea/
+tags
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..3008f0f
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "linfa"
+version = "0.1.0"
+authors = ["LukeMathWalker"]
+edition = "2018"
+
+[dependencies]
+
+[dev-dependencies]
+ndarray = "0.12.1"
+ndarray-rand = "0.9.0"
+rand = "*"
+derive_more = "0.13.0"
diff --git a/examples/running_mean/main.rs b/examples/running_mean/main.rs
new file mode 100644
index 0000000..1edc81e
--- /dev/null
+++ b/examples/running_mean/main.rs
@@ -0,0 +1,59 @@
+extern crate linfa;
+extern crate ndarray;
+extern crate ndarray_rand;
+extern crate rand;
+#[macro_use]
+extern crate derive_more;
+
+use crate::standard_scaler::{Config, OnlineOptimizer, ScalingError, StandardScaler};
+use linfa::{Fit, IncrementalFit, Transformer};
+use ndarray::{stack, Array1, ArrayBase, Axis, Data, Ix1};
+use ndarray_rand::RandomExt;
+use rand::distributions::Uniform;
+
+mod standard_scaler;
+
+fn generate_batch(n_samples: usize) -> (Array1<f64>, Array1<f64>) {
+    let distribution = Uniform::new(0., 10.);
+    let x = Array1::random(n_samples, distribution);
+    let y = Array1::random(n_samples, distribution);
+    (x, y)
+}
+
+fn check<S>(scaler: &StandardScaler, x: &ArrayBase<S, Ix1>) -> Result<(), ScalingError>
+where
+    S: Data<Elem = f64>,
+{
+    let old_batch_mean = x.mean_axis(Axis(0)).into_scalar();
+    let new_batch_mean = scaler.transform(&x)?.mean_axis(Axis(0)).into_scalar();
+    let old_batch_std = x.std_axis(Axis(0), 1.).into_scalar();
+    let new_batch_std = scaler.transform(&x)?.std_axis(Axis(0), 1.).into_scalar();
+    println!(
+        "The mean.\nBefore scaling: {:?}\nAfter scaling: {:?}\n",
+        old_batch_mean, new_batch_mean
+    );
+    println!(
+        "The std deviation.\nBefore scaling: {:?}\nAfter scaling: {:?}\n",
+        old_batch_std, new_batch_std
+    );
+    Ok(())
+}
+
+/// Run it with: cargo run --example running_mean
+fn main() -> Result<(), ScalingError> {
+    let n_samples = 20;
+    let (x, y) = generate_batch(n_samples);
+
+    let mut optimizer = OnlineOptimizer::default();
+    let standard_scaler = optimizer.fit(&x, &y, Config::default())?;
+
+    check(&standard_scaler, &x)?;
+
+    let (x2, y2) = generate_batch(n_samples);
+    let standard_scaler = optimizer.incremental_fit(&x2, &y2, standard_scaler)?;

+    let whole_x = stack(Axis(0), &[x.view(), x2.view()]).expect("Failed to stack arrays");
+    check(&standard_scaler, &whole_x)?;
+
+    Ok(())
+}
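The `check` helper above is generic over the array's storage type `S`, which is what lets `main` call it both on freshly generated owned arrays and on the stacked `whole_x`. A minimal sketch of that pattern in isolation (`total` and the `main` driver are illustrative names, not part of the diff):

use ndarray::{array, ArrayBase, Data, Ix1};

// Accepts any one-dimensional f64 array: owned storage and borrowed views alike.
fn total<S: Data<Elem = f64>>(x: &ArrayBase<S, Ix1>) -> f64 {
    x.sum()
}

fn main() {
    let owned = array![1., 2., 3.];
    assert_eq!(total(&owned), 6.); // owned array
    assert_eq!(total(&owned.view()), 6.); // view over the same data
}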
diff --git a/examples/running_mean/standard_scaler/config.rs b/examples/running_mean/standard_scaler/config.rs
new file mode 100644
index 0000000..8cb33ba
--- /dev/null
+++ b/examples/running_mean/standard_scaler/config.rs
@@ -0,0 +1,24 @@
+use crate::standard_scaler::{Input, Output, StandardScaler};
+use linfa::Blueprint;
+use ndarray::Data;
+
+pub struct Config {
+    // Delta degrees of freedom.
+    // With ddof = 1, you get the sample standard deviation
+    // With ddof = 0, you get the population standard deviation
+    pub ddof: f64,
+}
+
+/// Defaults to computing the sample standard deviation.
+impl Default for Config {
+    fn default() -> Self {
+        Self { ddof: 1. }
+    }
+}
+
+impl<S> Blueprint<Input<S>, Output> for Config
+where
+    S: Data<Elem = f64>,
+{
+    type Transformer = StandardScaler;
+}
diff --git a/examples/running_mean/standard_scaler/error.rs b/examples/running_mean/standard_scaler/error.rs
new file mode 100644
index 0000000..cb828ae
--- /dev/null
+++ b/examples/running_mean/standard_scaler/error.rs
@@ -0,0 +1,7 @@
+use std::error::Error;
+
+/// Fast-and-dirty error struct
+#[derive(Debug, Eq, PartialEq, From, Display)]
+pub struct ScalingError {}
+
+impl Error for ScalingError {}
diff --git a/examples/running_mean/standard_scaler/mod.rs b/examples/running_mean/standard_scaler/mod.rs
new file mode 100644
index 0000000..da91ba9
--- /dev/null
+++ b/examples/running_mean/standard_scaler/mod.rs
@@ -0,0 +1,15 @@
+use ndarray::{Array1, ArrayBase, Ix1};
+
+/// Short-hand notations
+type Input<S> = ArrayBase<S, Ix1>;
+type Output = Array1<f64>;
+
+mod config;
+mod error;
+mod optimizer;
+mod transformer;
+
+pub use config::Config;
+pub use error::ScalingError;
+pub use optimizer::OnlineOptimizer;
+pub use transformer::StandardScaler;
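Nothing ties a caller to `Config::default()`: since `Config` is a plain struct, any `ddof` can be handed to `fit`. A hedged sketch of fitting with the population standard deviation instead (assumed to sit next to `main` in the example crate; `fit_population_scaler` is an invented helper, not part of the diff):

use crate::standard_scaler::{Config, OnlineOptimizer, ScalingError, StandardScaler};
use linfa::Fit;
use ndarray::Array1;

fn fit_population_scaler() -> Result<StandardScaler, ScalingError> {
    // ddof = 0: divide by n instead of n - 1 (population standard deviation).
    let config = Config { ddof: 0. };
    let x = Array1::from(vec![1., 2., 3., 4.]);
    let y = Array1::from(vec![0., 0., 0., 0.]); // targets are ignored by this optimizer
    let mut optimizer = OnlineOptimizer::default();
    optimizer.fit(&x, &y, config)
}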
diff --git a/examples/running_mean/standard_scaler/optimizer.rs b/examples/running_mean/standard_scaler/optimizer.rs
new file mode 100644
index 0000000..2227939
--- /dev/null
+++ b/examples/running_mean/standard_scaler/optimizer.rs
@@ -0,0 +1,93 @@
+use crate::standard_scaler::{Config, Input, Output, ScalingError, StandardScaler};
+use linfa::{Fit, IncrementalFit};
+use ndarray::{Axis, Data};
+
+/// It keeps track of the number of samples seen so far, to allow for
+/// incremental computation of mean and standard deviation.
+pub struct OnlineOptimizer {
+    pub n_samples: u64,
+}
+
+/// Initialize n_samples to 0.
+impl Default for OnlineOptimizer {
+    fn default() -> Self {
+        Self { n_samples: 0 }
+    }
+}
+
+impl<S> Fit<Config, Input<S>, Output> for OnlineOptimizer
+where
+    S: Data<Elem = f64>,
+{
+    type Error = ScalingError;
+
+    fn fit(
+        &mut self,
+        inputs: &Input<S>,
+        _targets: &Output,
+        blueprint: Config,
+    ) -> Result<StandardScaler, Self::Error> {
+        if inputs.len() == 0 {
+            return Err(ScalingError {});
+        }
+        // Compute relevant quantities
+        let mean = inputs.mean_axis(Axis(0)).into_scalar();
+        let standard_deviation = inputs.std_axis(Axis(0), blueprint.ddof).into_scalar();
+        // Initialize n_samples using the array length
+        self.n_samples = inputs.len() as u64;
+        // Return new, tuned scaler
+        Ok(StandardScaler {
+            ddof: blueprint.ddof,
+            mean,
+            standard_deviation,
+        })
+    }
+}
+
+impl<S> IncrementalFit<StandardScaler, Input<S>, Output> for OnlineOptimizer
+where
+    S: Data<Elem = f64>,
+{
+    type Error = ScalingError;
+
+    fn incremental_fit(
+        &mut self,
+        inputs: &Input<S>,
+        _targets: &Output,
+        transformer: StandardScaler,
+    ) -> Result<StandardScaler, Self::Error> {
+        if inputs.len() == 0 {
+            // Nothing to be done
+            return Ok(transformer);
+        }
+
+        let ddof = transformer.ddof;
+
+        // Compute relevant quantities for the new batch
+        let batch_n_samples = inputs.len();
+        let batch_mean = inputs.mean_axis(Axis(0)).into_scalar();
+        let batch_std = inputs.std_axis(Axis(0), ddof).into_scalar();
+
+        // Update
+        let mean_delta = batch_mean - transformer.mean;
+        let new_n_samples = self.n_samples + (batch_n_samples as u64);
+        let new_mean =
+            transformer.mean + mean_delta * (batch_n_samples as f64) / (new_n_samples as f64);
+        let new_std = ((transformer.standard_deviation.powi(2) * (self.n_samples as f64 - ddof)
+            + batch_std.powi(2) * (batch_n_samples as f64 - ddof)
+            + mean_delta.powi(2) * (self.n_samples as f64) * (batch_n_samples as f64)
+                / (new_n_samples as f64))
+            / (new_n_samples as f64 - ddof))
+            .sqrt();
+
+        // Update n_samples
+        self.n_samples = new_n_samples;
+
+        // Return tuned scaler
+        Ok(StandardScaler {
+            ddof,
+            mean: new_mean,
+            standard_deviation: new_std,
+        })
+    }
+}
diff --git a/examples/running_mean/standard_scaler/transformer.rs b/examples/running_mean/standard_scaler/transformer.rs
new file mode 100644
index 0000000..dcae233
--- /dev/null
+++ b/examples/running_mean/standard_scaler/transformer.rs
@@ -0,0 +1,29 @@
+use crate::standard_scaler::{Input, Output, ScalingError};
+use linfa::Transformer;
+use ndarray::Data;
+
+/// Given an input, it rescales it to have zero mean and unit variance.
+///
+/// We use 64-bit floats for simplicity.
+pub struct StandardScaler {
+    // Delta degrees of freedom.
+    // With ddof = 1, you get the sample standard deviation
+    // With ddof = 0, you get the population standard deviation
+    pub ddof: f64,
+    pub mean: f64,
+    pub standard_deviation: f64,
+}
+
+impl<S> Transformer<Input<S>, Output> for StandardScaler
+where
+    S: Data<Elem = f64>,
+{
+    type Error = ScalingError;
+
+    fn transform(&self, inputs: &Input<S>) -> Result<Output, Self::Error>
+    where
+        S: Data<Elem = f64>,
+    {
+        Ok((inputs - self.mean) / self.standard_deviation)
+    }
+}
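The update inside `incremental_fit` is the usual pooled mean/variance merge (a ddof-adjusted variant of Chan et al.'s parallel algorithm). A self-contained arithmetic check of that formula, outside the trait machinery (`merge` is an illustrative free function, not part of the diff):

// Merge (n, mean, std) of two batches; mirrors the formula used in incremental_fit.
fn merge(n1: f64, m1: f64, s1: f64, n2: f64, m2: f64, s2: f64, ddof: f64) -> (f64, f64, f64) {
    let n = n1 + n2;
    let delta = m2 - m1;
    let mean = m1 + delta * n2 / n;
    let variance =
        (s1.powi(2) * (n1 - ddof) + s2.powi(2) * (n2 - ddof) + delta.powi(2) * n1 * n2 / n)
            / (n - ddof);
    (n, mean, variance.sqrt())
}

fn main() {
    // Batch 1 = [1, 2, 3]: n = 3, mean = 2, sample std = 1
    // Batch 2 = [5, 7]:    n = 2, mean = 6, sample std = sqrt(2)
    let (n, mean, std) = merge(3., 2., 1., 2., 6., 2f64.sqrt(), 1.);
    // Direct computation over [1, 2, 3, 5, 7]: mean = 3.6, sample std = sqrt(5.8)
    assert!((mean - 3.6).abs() < 1e-12);
    assert!((std - 5.8f64.sqrt()).abs() < 1e-12);
    println!("merged: n = {}, mean = {}, std = {}", n, mean, std);
}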
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..c9435fc
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,133 @@
+use std::error;
+use std::iter;
+
+/// The basic `Transformer` trait.
+///
+/// It is training-agnostic: a transformer takes an input and returns an output.
+///
+/// There might be multiple ways to discover the best settings for every
+/// particular algorithm (e.g. training a logistic regressor using
+/// a pseudo-inverse matrix vs using gradient descent).
+/// It doesn't matter: the end result, the transformer, is a set of parameters.
+/// The way those parameters originated is an orthogonal concept.
+///
+/// In the same way, it has no notion of loss or "correct" predictions.
+/// Those concepts are embedded elsewhere.
+///
+/// It's generic over input and output types:
+/// - you can transform a fully in-memory dataset;
+/// - you can transform a stream of data;
+/// - you can return a class;
+/// - you can return a probability distribution.
+///
+/// The mechanism for selecting the desired output, when not self-evident from the downstream
+/// usage, should be the same as the one used by `collect::<T>()`.
+pub trait Transformer<I, O> {
+    type Error: error::Error;
+
+    fn transform(&self, inputs: &I) -> Result<O, Self::Error>;
+}
+
+/// One step closer to the peak.
+///
+/// `Fit` is generic over a type `B` implementing the `Blueprint` trait: `B::Transformer` is used to
+/// constrain what type of inputs and targets are acceptable.
+///
+/// `fit` takes an instance of `B` as one of its inputs, `blueprint`: it's consumed with move
+/// semantics and a new transformer is returned.
+///
+/// Different types implementing `Fit` can work on the same `Blueprint` type!
+///
+/// It's a transition in the transformer state machine: from `Blueprint` to `Transformer`.
+///
+/// It's generic over input and output types:
+/// - you can fit on a fully in-memory dataset;
+/// - you can fit on a stream of data;
+/// - you can use integer-encoded class membership as a target;
+/// - you can use a one-hot-encoded class membership as a target.
+pub trait Fit<B, I, O>
+where
+    B: Blueprint<I, O>,
+{
+    type Error: error::Error;
+
+    fn fit(&mut self, inputs: &I, targets: &O, blueprint: B)
+        -> Result<B::Transformer, Self::Error>;
+}
+
+/// We are not done with that `Transformer` yet.
+///
+/// `IncrementalFit` is generic over a type `T` implementing the `Transformer` trait: `T` is used to
+/// constrain what type of inputs and targets are acceptable.
+///
+/// `incremental_fit` takes an instance of `T` as one of its inputs, `transformer`: it's consumed with move
+/// semantics and a new transformer is returned.
+///
+/// It's a transition in the transformer state machine: from `Transformer` to `Transformer`.
+///
+/// It's generic over input and output types:
+/// - you can fit on a fully in-memory dataset;
+/// - you can fit on a stream of data;
+/// - you can use integer-encoded class membership as a target;
+/// - you can use a one-hot-encoded class membership as a target.
+pub trait IncrementalFit<T, I, O>
+where
+    T: Transformer<I, O>,
+{
+    type Error: error::Error;
+
+    fn incremental_fit(
+        &mut self,
+        inputs: &I,
+        targets: &O,
+        transformer: T,
+    ) -> Result<T, Self::Error>;
+}
+
+/// Where `Transformer`s are forged.
+///
+/// `Blueprint` is a marker trait: it identifies what types can be used as starting points for
+/// building `Transformer`s. It's the initial stage of the transformer state machine.
+///
+/// Every `Blueprint` is associated with a single `Transformer` type (is it wise to do so?).
+///
+/// For the same transformer type `T`, nothing prevents a user from providing more than one `Blueprint`:
+/// multiple initialization strategies can sometimes be used to build the same transformer type.
+///
+/// Each of these strategies can take different (hyper)parameters, even though they return an
+/// instance of the same transformer type in the end.
+pub trait Blueprint<I, O> {
+    type Transformer: Transformer<I, O>;
+}
+
+/// Where you need to go meta (hyperparameters!).
+///
+/// `BlueprintGenerator`s can be used to explore different combinations of hyperparameters
+/// when you are working with a certain `Transformer` type.
+///
+/// `BlueprintGenerator::generate` returns, if successful, an `IntoIterator` type
+/// yielding instances of blueprints.
+pub trait BlueprintGenerator<B, I, O>
+where
+    B: Blueprint<I, O>,
+{
+    type Error: error::Error;
+    type Output: IntoIterator<Item = B>;
+
+    fn generate(&self) -> Result<Self::Output, Self::Error>;
+}
+
+/// Any `Blueprint` can be used as a `BlueprintGenerator`, as long as it's clonable:
+/// it returns an iterator with a single element, a clone of itself.
+impl<B, I, O> BlueprintGenerator<B, I, O> for B
+where
+    B: Blueprint<I, O> + Clone,
+{
+    // Random error, didn't have time to get a proper one
+    type Error = std::io::Error;
+    type Output = iter::Once<B>;
+
+    fn generate(&self) -> Result<Self::Output, Self::Error> {
+        Ok(iter::once(self.clone()))
+    }
+}
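A closing illustration of the generator idea: beyond the blanket impl (a lone `Blueprint` acting as a one-element generator), a type can yield a whole grid of blueprints. The sketch below is invented for illustration, not part of the diff, and assumes it lives inside the example's `standard_scaler` module, since `Input` and `Output` are private aliases:

use crate::standard_scaler::{Config, Input, Output};
use linfa::BlueprintGenerator;
use ndarray::Data;

// Hypothetical: a grid of scaler configurations to explore, one ddof at a time.
pub struct DdofGrid {
    pub ddofs: Vec<f64>,
}

impl<S> BlueprintGenerator<Config, Input<S>, Output> for DdofGrid
where
    S: Data<Elem = f64>,
{
    // Placeholder error type, mirroring the blanket impl above.
    type Error = std::io::Error;
    type Output = Vec<Config>;

    fn generate(&self) -> Result<Self::Output, Self::Error> {
        Ok(self.ddofs.iter().map(|&ddof| Config { ddof }).collect())
    }
}

Combined with `Fit`, a model-selection loop would then just iterate over `generate()?` and call `optimizer.fit` once per blueprint.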