[WIP] Tidy-up and improve ergonomics with new interface and dataset #45

Closed
wants to merge 37 commits

Changes from 1 commit
f885f1d
Introduce traits and dataset
bytesnake Sep 16, 2020
3c117bf
Merge remote-tracking branch 'upstream/master' into traits
bytesnake Oct 12, 2020
271d1cc
Rename `fit_update` to `fit_with`
bytesnake Oct 12, 2020
aae00c0
Add `get_label` to dataset struct
bytesnake Oct 12, 2020
f29efb2
Every dataset can also act as data alone
bytesnake Oct 12, 2020
b4605b0
Rename data to records to disambiguate from ndarray
bytesnake Oct 12, 2020
4c3f033
Port `linfa-kernel` to new syntax
bytesnake Oct 12, 2020
e4c3df2
Start porting SVM to new architecture
bytesnake Oct 12, 2020
d7e2528
Introduce phantom type to SVM
bytesnake Oct 13, 2020
35173f4
Fit with different targets
bytesnake Oct 14, 2020
071c94e
Add Fit to SVRegression
bytesnake Oct 15, 2020
6f3231f
Implement ConfusionMatrix for dataset struct
bytesnake Oct 15, 2020
e0cbff5
Port BinaryClassification to dataset struct
bytesnake Oct 15, 2020
92794ee
First working example for SVM
bytesnake Oct 15, 2020
a0541e6
Run autoformatting
bytesnake Oct 15, 2020
b9a5218
Move to new API for tests in `linfa-svm`
bytesnake Oct 16, 2020
5f689e8
Move wine quality example to linfa dataset
bytesnake Oct 17, 2020
549bf8d
Add support vector regression tests
bytesnake Oct 17, 2020
c5b40f6
Implement transformer for hierarchical clustering
bytesnake Oct 18, 2020
e10ddd5
Add option to choose ndarray backend for `linfa-ica`
bytesnake Oct 18, 2020
0167887
Move `linfa-ica` to new traits
bytesnake Oct 18, 2020
0e48991
Implement new traits for KMeans
bytesnake Oct 18, 2020
7f06b36
Implement transformer for DBSCAN
bytesnake Oct 18, 2020
46f5e2e
Move PCA and diffusion maps to new traits
bytesnake Oct 18, 2020
0e3ba8f
Add prelude to linfa
bytesnake Oct 19, 2020
a060c85
Fix tests of classification metrics
bytesnake Oct 21, 2020
dd504fb
Add text how to contribute
bytesnake Oct 21, 2020
eb7d5d3
Remove associated type from `Labels`
bytesnake Oct 25, 2020
ae940bf
Remove labels from Dataset
bytesnake Oct 25, 2020
e710d51
Run fmt and remove serde dependency in reduction
bytesnake Oct 25, 2020
430faf4
Make serde optional in clustering
bytesnake Oct 25, 2020
6e5c608
Add section on datasets to contribute document
bytesnake Oct 25, 2020
93ac0aa
Add section on serde feature
bytesnake Oct 25, 2020
a9e1161
Add one-vs-all function
bytesnake Oct 25, 2020
7b0591d
Add error type
bytesnake Oct 25, 2020
58113a1
Add error type for parameters and ndarray
bytesnake Oct 25, 2020
fdf2411
Add section on builder patterns
bytesnake Nov 2, 2020
Implement ConfusionMatrix for dataset struct
bytesnake committed Oct 15, 2020
commit 6f3231ffee9f408b7c9fdcfd99500a0214c4c800
2 changes: 1 addition & 1 deletion linfa-svm/Cargo.toml
@@ -20,7 +20,7 @@ netlib = ["ndarray-linalg/netlib"]

 [dependencies]
 ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
-ndarray-linalg = {version = "0.12" }
+ndarray-linalg = { version = "0.12" }
 ndarray-rand = "0.11"
 rand_isaac = "0.2.0"
 num-traits = "0.1.32"
18 changes: 13 additions & 5 deletions linfa-svm/src/classification.rs
@@ -245,9 +245,12 @@ impl<'a, F: Float, T> Predict<Dataset<Array2<F>, T>, Dataset<Array2<F>, Vec<Pr>>
 }
 #[cfg(test)]
 mod tests {
-    use super::{fit_c, fit_nu, fit_one_class, SolverParams};
-    use linfa_kernel::Kernel;
+    use super::Svm;
+    use linfa::dataset::Dataset;
+    use linfa::traits::{Fit, Transformer, Predict};
+    use linfa::metrics::IntoConfusionMatrix;
+    use linfa_kernel::{Kernel, KernelMethod};

     use ndarray::{Array, Array2, Axis};
     use ndarray_rand::rand_distr::Uniform;
     use ndarray_rand::RandomExt;
@@ -282,9 +285,14 @@ mod tests {
         )
         .unwrap();
         let targets = (0..20).map(|x| x < 10).collect::<Vec<_>>();
+        let dataset = Dataset::new(entries, targets);

-        let kernel = Kernel::linear(&entries);
-
+        let dataset = Kernel::params()
+            .method(KernelMethod::Linear)
+            .transform(&dataset);
+    }
+}
+/*
         let params = SolverParams {
             eps: 1e-3,
             shrinking: false,
@@ -428,4 +436,4 @@ mod tests {
         // at least 95% should be correctly rejected
         assert!((rejected as f32) / (total as f32) > 0.95);
     }
-}
+}*/
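The reworked test above replaces the direct `Kernel::linear(&entries)` call with a dataset-first flow: records and targets are bundled, then the kernel acts as a `Transformer` over the whole dataset. A minimal self-contained sketch of that flow, using only the API visible in this commit (the zeroed records stand in for the uniformly drawn samples of the real test):

```rust
use linfa::dataset::Dataset;
use linfa::traits::Transformer;
use linfa_kernel::{Kernel, KernelMethod};
use ndarray::Array2;

fn main() {
    // 20 samples with 2 features each; the real test draws these uniformly
    let entries: Array2<f64> = Array2::zeros((20, 2));
    // first half of the samples is labelled `true`, second half `false`
    let targets = (0..20).map(|x| x < 10).collect::<Vec<_>>();

    // records and targets now travel together
    let dataset = Dataset::new(entries, targets);

    // the kernel is a transformer applied to the whole dataset
    let _kernelized = Kernel::params()
        .method(KernelMethod::Linear)
        .transform(&dataset);
}
```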
8 changes: 4 additions & 4 deletions linfa-svm/src/lib.rs
@@ -222,10 +222,10 @@ impl<'a, A: Float, T> fmt::Display for Svm<'a, A, T> {
 fn test() {
     use linfa::traits::Transformer;

-    let dataset: Dataset<Array2<f32>, Vec<bool>> = Dataset {
-        records: Array2::zeros((10, 10)),
-        targets: vec![true, false]
-    };
+    let dataset = Dataset::from((
+        Array2::zeros((10, 10)),
+        vec![true; 10]
+    ));

     let dataset = Kernel::params()
         .transform(&dataset);
3 changes: 2 additions & 1 deletion linfa-svm/src/regression.rs
@@ -137,6 +137,7 @@ impl<'a, F: Float> Fit<'a, Kernel<'a, F>, &Vec<F>> for SvmParams<F, F> {
 }
 }
 }
+/*
 #[cfg(test)]
 pub mod tests {
     use super::{fit_epsilon, fit_nu, SolverParams};
@@ -198,4 +199,4 @@ pub mod tests {

         assert!(predicted.mean_squared_error(&target) < 1e-2);
     }
-}
+}*/
36 changes: 28 additions & 8 deletions src/dataset/impl_dataset.rs
@@ -1,5 +1,4 @@
 use ndarray::Array2;
-use std::collections::HashSet;
 use super::{Float, Label, Dataset, iter::Iter, Records, Targets, Labels};

 impl<F: Float, L: Label> Dataset<Array2<F>, Vec<L>> {
@@ -9,33 +8,50 @@ impl<F: Float, L: Label> Dataset<Array2<F>, Vec<L>> {
 }

 impl<R: Records, S: Targets> Dataset<R, S> {
+    pub fn new(records: R, targets: S) -> Dataset<R, S> {
+        Dataset {
+            records,
+            targets,
+            labels: Vec::new(),
+            weights: Vec::new()
+        }
+    }
+
     pub fn targets(&self) -> &[S::Elem] {
         self.targets.as_slice()
     }

+    pub fn weights(&self) -> &[f32] {
+        &self.weights
+    }
+
     pub fn with_records<T: Records>(self, records: T) -> Dataset<T, S> {
         Dataset {
             records,
-            targets: self.targets
+            targets: self.targets,
+            labels: self.labels,
+            weights: Vec::new()
         }
     }

     pub fn map_targets<T, G: FnMut(&S::Elem) -> T>(self, fnc: G) -> Dataset<R, Vec<T>> {
-        let Dataset { records, targets } = self;
+        let Dataset { records, targets, labels, weights } = self;

         let new_targets = targets.as_slice().into_iter()
             .map(fnc)
             .collect::<Vec<T>>();

         Dataset {
             records,
-            targets: new_targets
+            targets: new_targets,
+            labels: Vec::new(),
+            weights
         }
     }
 }

-impl<R: Records, S: Labels> Dataset<R, S> {
-    pub fn labels(&self) -> HashSet<&S::Elem> {
+impl<R: Records, S: Targets + Labels> Dataset<R, S> {
+    pub fn labels(&self) -> Vec<<S as Labels>::Elem> {
         self.targets.labels()
     }
 }
@@ -44,7 +60,9 @@ impl<F: Float> From<Array2<F>> for Dataset<Array2<F>, ()> {
     fn from(records: Array2<F>) -> Self {
         Dataset {
             records,
-            targets: ()
+            targets: (),
+            labels: Vec::new(),
+            weights: Vec::new()
         }
     }
 }
@@ -53,7 +71,9 @@ impl<F: Float, T: Targets> From<(Array2<F>, T)> for Dataset<Array2<F>, T> {
     fn from(rec_tar: (Array2<F>, T)) -> Self {
         Dataset {
             records: rec_tar.0,
-            targets: rec_tar.1
+            targets: rec_tar.1,
+            labels: Vec::new(),
+            weights: Vec::new()
         }
     }
 }
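In sum, `new` seeds empty `labels` and `weights`, `weights()` exposes the weight slice, and `map_targets` rebuilds the struct around the transformed targets. A short sketch of how these constructors and accessors compose, assuming only what this diff introduces:

```rust
use linfa::dataset::Dataset;
use ndarray::Array2;

fn main() {
    let records: Array2<f64> = Array2::zeros((4, 2));
    let targets = vec![0usize, 0, 1, 1];

    // labels and weights start out empty
    let dataset = Dataset::new(records, targets);

    // transform the targets without touching the records
    let dataset = dataset.map_targets(|t| *t == 1);

    // unweighted datasets report an empty weight slice
    assert!(dataset.weights().is_empty());
}
```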
6 changes: 3 additions & 3 deletions src/dataset/impl_records.rs
@@ -1,6 +1,6 @@
 use ndarray::{ArrayBase, Axis, Dimension, Data};

-use super::{Records, Float, Dataset};
+use super::{Records, Float, Dataset, Targets};

 impl<F: Float, S: Data<Elem = F>, I: Dimension> Records for ArrayBase<S, I> {
     type Elem = F;
@@ -17,15 +17,15 @@ impl<F: Float, S: Data<Elem = F>, I: Dimension> Records for &ArrayBase<S, I> {
         self.len_of(Axis(0))
     }
 }
-impl<F: Float, D: Records<Elem = F>, T> Records for Dataset<D, T> {
+impl<F: Float, D: Records<Elem = F>, T: Targets> Records for Dataset<D, T> {
     type Elem = F;

     fn observations(&self) -> usize {
         self.records.observations()
     }
 }

-impl<F: Float, D: Records<Elem = F>, T> Records for &Dataset<D, T> {
+impl<F: Float, D: Records<Elem = F>, T: Targets> Records for &Dataset<D, T> {
     type Elem = F;

     fn observations(&self) -> usize {
51 changes: 16 additions & 35 deletions src/dataset/impl_targets.rs
@@ -1,7 +1,5 @@
 use super::{Targets, Label, Dataset, Records, Labels};
 use ndarray::{Dimension, ArrayBase, Data};
-use std::collections::HashSet;
-
 impl<L> Targets for Vec<L> {
     type Elem = L;
@@ -13,8 +11,8 @@ impl<L> Targets for Vec<L> {
 impl<L: Label> Labels for Vec<L> {
     type Elem = L;

-    fn labels(&self) -> HashSet<&L> {
-        self.iter().collect()
+    fn labels(&self) -> Vec<L> {
+        self.iter().cloned().collect()
     }
 }

@@ -29,8 +27,8 @@ impl<L> Targets for &Vec<L> {
 impl<L: Label> Labels for &Vec<L> {
     type Elem = L;

-    fn labels(&self) -> HashSet<&L> {
-        self.iter().collect()
+    fn labels(&self) -> Vec<L> {
+        self.iter().cloned().collect()
     }
 }

@@ -47,8 +45,8 @@ impl<L> Targets for &[L] {
 impl<L: Label> Labels for &[L] {
     type Elem = L;

-    fn labels(&self) -> HashSet<&L> {
-        self.iter().collect()
+    fn labels(&self) -> Vec<L> {
+        self.iter().cloned().collect()
     }
 }

@@ -63,43 +61,26 @@ impl<L, S: Data<Elem = L>, I: Dimension> Targets for ArrayBase<S, I> {
 impl<L: Label, S: Data<Elem = L>, I: Dimension> Labels for ArrayBase<S, I> {
     type Elem = L;

-    fn labels(&self) -> HashSet<&L> {
-        self.iter().collect()
-    }
-}
-
-pub struct TargetsWithLabels<L: Label, T: Targets<Elem = L>> {
-    targets: T,
-    labels: HashSet<L>
-}
-
-impl<L: Label, T: Targets<Elem = L>> Targets for TargetsWithLabels<L, T> {
-    type Elem = L;
-
-    fn as_slice(&self) -> &[Self::Elem] {
-        self.targets.as_slice()
+    fn labels(&self) -> Vec<L> {
+        self.iter().cloned().collect()
     }
 }

-impl<L: Label, T: Targets<Elem = L>> Labels for TargetsWithLabels<L, T> {
-    type Elem = L;
+impl Targets for () {
+    type Elem = ();

-    fn labels(&self) -> HashSet<&L> {
-        self.labels.iter().collect()
+    fn as_slice(&self) -> &[()] {
+        &[()]
     }
 }


 impl<R: Records, L: Label, T: Targets<Elem=L>> Dataset<R, T> {
-    pub fn with_labels(self, labels: Vec<L>) -> Dataset<R, TargetsWithLabels<L, T>> {
-        let targets = TargetsWithLabels {
-            targets: self.targets,
-            labels: labels.into_iter().collect()
-        };
-
+    pub fn with_labels(self, labels: Vec<L>) -> Dataset<R, T> {
         Dataset {
             records: self.records,
-            targets
+            targets: self.targets,
+            weights: self.weights,
+            labels
         }
     }
 }
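With `TargetsWithLabels` gone, `with_labels` no longer changes the dataset's target type; the label subset is simply stored alongside the targets. A sketch of the resulting call, under the same assumptions as above:

```rust
use linfa::dataset::Dataset;
use ndarray::Array2;

fn main() {
    let records: Array2<f64> = Array2::zeros((3, 2));

    // restrict later evaluation to the labels 0 and 1; the type stays
    // Dataset<Array2<f64>, Vec<usize>> instead of wrapping the targets
    let _dataset = Dataset::new(records, vec![0usize, 1, 2])
        .with_labels(vec![0, 1]);
}
```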
14 changes: 9 additions & 5 deletions src/dataset/mod.rs
@@ -14,7 +14,7 @@ pub trait Float: NdFloat + FromPrimitive + Default + Sum {}
 impl Float for f32 {}
 impl Float for f64 {}

-pub trait Label: PartialEq + Eq + Hash {}
+pub trait Label: PartialEq + Eq + Hash + Clone {}

 impl Label for bool {}
 impl Label for usize {}
@@ -23,12 +23,16 @@ impl Label for String {}
 /// Probability types
 pub type Pr = f32;

-pub struct Dataset<R, S>
+pub struct Dataset<R, T>
 where
-    R: Records
+    R: Records,
+    T: Targets
 {
     pub records: R,
-    pub targets: S,
+    pub targets: T,
+
+    labels: Vec<T::Elem>,
+    weights: Vec<f32>
 }

 pub trait Records: Sized {
@@ -46,5 +50,5 @@ pub trait Targets {
 pub trait Labels {
     type Elem: Label;

-    fn labels<'a>(&'a self) -> HashSet<&'a Self::Elem>;
+    fn labels(&self) -> Vec<Self::Elem>;
 }
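Note that `Label` picks up the `Clone` bound because `labels()` now hands out owned values instead of a `HashSet` of references. Implementing `Labels` for a custom target container then looks as follows; a hypothetical sketch (the `Sentiment` newtype is invented for illustration):

```rust
use linfa::dataset::Labels;

// hypothetical newtype holding string targets
struct Sentiment(Vec<String>);

impl Labels for Sentiment {
    type Elem = String;

    // owned labels, made possible by the new `Clone` bound on `Label`
    fn labels(&self) -> Vec<String> {
        self.0.iter().cloned().collect()
    }
}
```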
2 changes: 1 addition & 1 deletion src/lib.rs
@@ -41,7 +41,7 @@ pub use dataset::{Float, Label};

 pub mod metrics {
     pub use crate::metrics_classification::{
-        BinaryClassification, ConfusionMatrix, IntoConfusionMatrix, Modify,
+        BinaryClassification, ConfusionMatrix, ToConfusionMatrix,
         ReceiverOperatingCharacteristic,
     };
     pub use crate::metrics_regression::Regression;
129 changes: 34 additions & 95 deletions src/metrics_classification.rs
 use ndarray::prelude::*;
 use ndarray::Data;
 use ndarray::IntoNdProducer;

-use crate::Float;
+use crate::dataset::{Dataset, Records, Targets, Labels, Label};

 /// Return tuple of class index for each element of prediction and ground_truth
-fn map_prediction_to_idx<A: Eq + Hash, C: Data<Elem = A>, D: Data<Elem = A>>(
-    prediction: &ArrayBase<C, Ix1>,
-    ground_truth: &ArrayBase<D, Ix1>,
-    classes: &[A],
+fn map_prediction_to_idx<L: Label>(
+    prediction: &[L],
+    ground_truth: &[L],
+    classes: &[L],
 ) -> Vec<Option<(usize, usize)>> {
     // create a map from class label to index
     let set = classes
@@ -32,66 +35,6 @@ fn map_prediction_to_idx<A: Eq + Hash, C: Data<Elem = A>, D: Data<Elem = A>>(
         .collect::<Vec<Option<_>>>()
 }

-/// A modified prediction
-///
-/// It can happen that only a subset of classes are of interest or the samples have different
-/// weights in the resulting evaluations. For this a `ModifiedPrediction` struct offers the
-/// possibility to modify a prediction before evaluation.
-pub struct ModifiedPrediction<A, D: Data<Elem = A>> {
-    prediction: ArrayBase<D, Ix1>,
-    weights: Vec<f32>,
-    classes: Vec<A>,
-}
-
-/// Modify prediction weights or classes
-pub trait Modify<A: PartialOrd + Eq + Hash, D: Data<Elem = A>> {
-    /// Add weights to prediction, each weight-entry correspond to a single prediction. The
-    /// prediction influence is scaled according to the weight.
-    fn with_weights(self, weights: &[f32]) -> ModifiedPrediction<A, D>;
-    /// Select certain classes. This can be used to select a subset of classes or re-order classes.
-    fn with_classes(self, classes: &[A]) -> ModifiedPrediction<A, D>;
-}
-
-/// Modify a prediction stored in `ndarray`
-impl<A: PartialOrd + Eq + Hash + Clone, D: Data<Elem = A>> Modify<A, D> for ArrayBase<D, Ix1> {
-    fn with_weights(self, weights: &[f32]) -> ModifiedPrediction<A, D> {
-        ModifiedPrediction {
-            prediction: self,
-            weights: weights.to_vec(),
-            classes: Vec::new(),
-        }
-    }
-
-    fn with_classes(self, classes: &[A]) -> ModifiedPrediction<A, D> {
-        ModifiedPrediction {
-            prediction: self,
-            weights: Vec::new(),
-            classes: classes.to_vec(),
-        }
-    }
-}
-
-/// Modify a already modified prediction
-impl<A: PartialOrd + Eq + Hash + Clone, D: Data<Elem = A>> Modify<A, D>
-    for ModifiedPrediction<A, D>
-{
-    fn with_weights(self, weights: &[f32]) -> ModifiedPrediction<A, D> {
-        ModifiedPrediction {
-            prediction: self.prediction,
-            weights: weights.to_vec(),
-            classes: self.classes,
-        }
-    }
-
-    fn with_classes(self, classes: &[A]) -> ModifiedPrediction<A, D> {
-        ModifiedPrediction {
-            prediction: self.prediction,
-            weights: self.weights,
-            classes: classes.to_vec(),
-        }
-    }
-}
-
 /// Confusion matrix for multi-label evaluation
 ///
 /// A confusion matrix shows predictions in a matrix, where rows correspond to target and columns
@@ -316,52 +259,48 @@ impl<A: fmt::Display> fmt::Debug for ConfusionMatrix<A> {
 /// Classification for multi-label evaluation
 ///
 /// Contains a routine to calculate the confusion matrix, all other scores are derived form it.
-pub trait IntoConfusionMatrix<A> {
-    fn into_confusion_matrix<'a, T>(self, ground_truth: T) -> ConfusionMatrix<A>
-    where
-        A: 'a,
-        T: IntoNdProducer<Item = &'a A, Dim = Ix1, Output = ArrayView1<'a, A>>;
+pub trait ToConfusionMatrix<A, T> {
+    fn confusion_matrix(self, ground_truth: T) -> ConfusionMatrix<A>;
 }

-impl<A: Clone + Ord + Hash, D: Data<Elem = A>> IntoConfusionMatrix<A> for ModifiedPrediction<A, D> {
-    fn into_confusion_matrix<'a, T>(self, ground_truth: T) -> ConfusionMatrix<A>
-    where
-        A: 'a,
-        T: IntoNdProducer<Item = &'a A, Dim = Ix1, Output = ArrayView1<'a, A>>,
-    {
-        let ground_truth = ground_truth.into_producer();
-
-        // if we don't have any classes, create a set of predicted labels
-        let classes = if self.classes.is_empty() {
-            let mut classes = ground_truth
-                .iter()
-                .chain(self.prediction.iter())
-                .cloned()
-                .collect::<Vec<_>>();
-            // create a set
-            classes.sort();
-            classes.dedup();
-            classes
-        } else {
-            self.classes
-        };
+impl<R: Records, L: Label, T: Targets<Elem = L> + Labels<Elem = L>> ToConfusionMatrix<L, Dataset<R, T>> for Dataset<R, T> {
+    fn confusion_matrix(self, ground_truth: Dataset<R, T>) -> ConfusionMatrix<L> {
+        let classes: Vec<L> = ground_truth.labels();
+        let indices = map_prediction_to_idx(&self.targets.as_slice(), &ground_truth.targets.as_slice(), &classes);

-        // find indices to labels
-        let indices = map_prediction_to_idx(&self.prediction, &ground_truth, &classes);
         // count each index tuple in the confusion matrix
         let mut confusion_matrix = Array2::zeros((classes.len(), classes.len()));
         for (i1, i2) in indices.into_iter().filter_map(|x| x) {
-            confusion_matrix[(i1, i2)] += *self.weights.get(i1).unwrap_or(&1.0);
+            confusion_matrix[(i1, i2)] += *ground_truth.weights().get(i1).unwrap_or(&1.0);
         }

         ConfusionMatrix {
             matrix: confusion_matrix,
             members: Array1::from(classes),
         }
     }
 }
+
+impl<R: Records, L: Label, T: Targets<Elem=L>+Labels<Elem=L>> ToConfusionMatrix<L, Dataset<R, T>> for Vec<L> {
+    fn confusion_matrix(self, ground_truth: Dataset<R, T>) -> ConfusionMatrix<L> {
+        let classes: Vec<L> = ground_truth.labels();
+        let indices = map_prediction_to_idx(&self, &ground_truth.targets.as_slice(), &classes);
+
+        // count each index tuple in the confusion matrix
+        let mut confusion_matrix = Array2::zeros((classes.len(), classes.len()));
+        for (i1, i2) in indices.into_iter().filter_map(|x| x) {
+            confusion_matrix[(i1, i2)] += *ground_truth.weights().get(i1).unwrap_or(&1.0);
+        }
+
+        ConfusionMatrix {
+            matrix: confusion_matrix,
+            members: Array1::from(classes),
+        }
+
+    }
+}

+/*
 impl<A: Clone + Ord + Hash, D: Data<Elem = A>> IntoConfusionMatrix<A> for ArrayBase<D, Ix1> {
     fn into_confusion_matrix<'a, T>(self, ground_truth: T) -> ConfusionMatrix<A>
     where
@@ -392,7 +331,7 @@ impl<A: Clone + Ord + Hash> IntoConfusionMatrix<A> for Vec<A> {
         tmp.into_confusion_matrix(ground_truth)
     }
 }
-}
+}*/

 /*
  * TODO: specialization requires unstable Rust
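Under the new trait, the prediction side selects the implementation: a `Dataset` can be compared against a `Dataset`, or a plain `Vec` of predictions against a ground-truth `Dataset`. A sketch of the second form, assuming the re-export from `linfa::metrics` shown above:

```rust
use linfa::dataset::Dataset;
use linfa::metrics::ToConfusionMatrix;
use ndarray::Array2;

fn main() {
    // the ground truth carries the targets and the (optional) weights
    let records: Array2<f64> = Array2::zeros((4, 2));
    let ground_truth = Dataset::new(records, vec![true, true, false, false]);

    // predictions stay a plain vector
    let prediction = vec![true, false, false, false];
    let cm = prediction.confusion_matrix(ground_truth);

    println!("{:?}", cm);
}
```

One caveat visible in the diff: the classes come from `ground_truth.labels()`, and the new `labels()` implementations collect every element without deduplicating, so at this WIP stage the matrix may contain duplicate class entries.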
6 changes: 3 additions & 3 deletions src/traits.rs
@@ -1,7 +1,7 @@
 //! Provide traits for different classes of algorithms
 //!
-use crate::dataset::{Records, Dataset};
+use crate::dataset::{Records, Dataset, Targets};

 /// Transformation algorithms
 ///
@@ -20,7 +20,7 @@ pub trait Transformer<R: Records, T> {
 /// A fittable algorithm takes a dataset and creates a concept of some kind about it. For example
 /// in *KMeans* this would be the mean values for each class, or in *SVM* the separating
 /// hyperplane. It returns a model, which can be used to predict targets for new data.
-pub trait Fit<'a, R: Records, T> {
+pub trait Fit<'a, R: Records, T: Targets> {
     type Object: 'a;

     fn fit(&self, dataset: &'a Dataset<R, T>) -> Self::Object;
@@ -31,7 +31,7 @@ pub trait Fit<'a, R: Records, T> {
 /// An incremental algorithm takes a former model and dataset and returns a new model with updated
 /// parameters. If the former model is `None`, then the function acts like `Fit::fit` and
 /// initializes the model first.
-pub trait IncrementalFit<R: Records, T> {
+pub trait IncrementalFit<R: Records, T: Targets> {
     type Object: Predict<R, T>;

     fn fit_with<I: Into<Option<Self::Object>>>(&self, model: I, dataset: Dataset<R, T>) -> Self::Object;
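For downstream crates, the practical effect of the new `T: Targets` bound is that every `Fit` implementation states its target expectations up front. A minimal sketch of implementing the trait for a hypothetical parameter type (`MeanParams` and `MeanModel` are invented for illustration, not part of this PR):

```rust
use linfa::dataset::{Dataset, Float, Targets};
use linfa::traits::Fit;
use ndarray::{Array1, Array2, Axis};

// hypothetical "model": the per-feature mean of the records
struct MeanModel<F: Float> {
    means: Array1<F>,
}

struct MeanParams;

impl<'a, F: Float, T: Targets> Fit<'a, Array2<F>, T> for MeanParams {
    type Object = MeanModel<F>;

    fn fit(&self, dataset: &'a Dataset<Array2<F>, T>) -> Self::Object {
        MeanModel {
            // column-wise mean over all observations
            means: dataset.records.mean_axis(Axis(0)).unwrap(),
        }
    }
}
```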