diff --git a/README.rst b/README.rst
index 78eeddf..7f0cbdb 100644
--- a/README.rst
+++ b/README.rst
@@ -5,16 +5,16 @@ Collection of benchmarks comparing various python-based machine
 learning packages.
 
 This is meant to work with the development version of the libraries
-scikits.learn, mlpy, pybrain, pymvpa, mdp and shogun. It might be hard
-to get all packages working on the same machine, but benchmarks are
-designed so that if something fail it will just print the exception
-and go to the next one.
+scikits.learn, mlpy, pybrain, pymvpa, mdp, shogun, milk and orange. It
+might be hard to get all packages working on the same machine, but
+benchmarks are designed so that if something fails, it will just print
+the exception and go to the next one.
 
 To execute a benchmark, just type from the prompt::
 
     $ python benchmarks/bench_$name.py
 
-and you will se as output the mean and std deviation for the timing of
+and you will see as output the mean and standard deviation for the timing of
 running the benchmark 10 times with its extreme values removed.
 
 Results
@@ -43,6 +43,7 @@ References
   - Shogun: http://www.shogun-toolbox.org/
   - PyBrain : http://pybrain.org/
   - Milk : http://luispedro.org/software/milk
+  - Orange : http://orange.biolab.si/
 
 
 Misc
diff --git a/benchmarks/bench_elasticnet.py b/benchmarks/bench_elasticnet.py
index 5e40b05..9317275 100644
--- a/benchmarks/bench_elasticnet.py
+++ b/benchmarks/bench_elasticnet.py
@@ -69,17 +69,17 @@ def bench_pymvpa(X, y, T, valid):
 
     print 'Loading data ...'
     data = misc.load_data(dataset)
+
     print 'Done, %s samples with %s features loaded into ' \
-          'memory' % data[0].shape
+      'memory\n' % data[0].shape
+
+    score, res = misc.bench(bench_skl, data)
+    misc.print_result("elasticnet", dataset, "scikits.learn", score, res) 
 
-    score, res_skl = misc.bench(bench_skl, data)
-    print 'scikits.learn: mean %s, std %s' % (res_skl.mean(), res_skl.std())
-    print 'MSE ', score
+    score, res = misc.bench(bench_mlpy, data)
+    misc.print_result("elasticnet", dataset, "MLPy", score, res)     
 
-    score, res_mlpy = misc.bench(bench_mlpy, data)
-    print 'MLPy: mean %s, std %s' % (res_mlpy.mean(), res_mlpy.std())
-    print 'MSE ', score
+    score, res = misc.bench(bench_pymvpa, data)
+    misc.print_result("elasticnet", dataset, "PyMVPA", score, res)      
 
-    score, res_pymvpa = misc.bench(bench_pymvpa, data)
-    print 'PyMVPA: mean %s, std %s' % (res_pymvpa.mean(), res_pymvpa.std())
-    print 'MSE ', score
+    misc.save_results()
diff --git a/benchmarks/bench_kmeans.py b/benchmarks/bench_kmeans.py
index 3792505..9be0ffb 100644
--- a/benchmarks/bench_kmeans.py
+++ b/benchmarks/bench_kmeans.py
@@ -119,34 +119,24 @@ def bench_milk(X, y, T, valid):
     data = misc.load_data(dataset)
 
     print 'Done, %s samples with %s features loaded into ' \
-      'memory' % data[0].shape
-
-    score, res_shogun = misc.bench(bench_shogun, data)
-    print 'Shogun: mean %.2f, std %.2f' % (
-        np.mean(res_shogun), np.std(res_shogun))
-    print 'Score: %2f\n' % score
-
-    score, res_mdp = misc.bench(bench_mdp, data)
-    print 'MDP: mean %.2f, std %.2f' % (
-        np.mean(res_mdp), np.std(res_mdp))
-    print 'Score: %2f\n' % score
-
-    score, res_skl = misc.bench(bench_skl, data)
-    print 'scikits.learn: mean %.2f, std %.2f' % (
-        np.mean(res_skl), np.std(res_skl))
-    print 'Score: %2f\n' % score
-
-    score, res_mlpy = misc.bench(bench_mlpy, data)
-    print 'MLPy: mean %.2f, std %.2f' % (
-        np.mean(res_mlpy), np.std(res_mlpy))
-    print 'Score: %2f\n' % score
-
-    score, res_pybrain = misc.bench(bench_pybrain, data)
-    print 'Pybrain: mean %.2f, std %.2f' % (
-        np.mean(res_pybrain), np.std(res_pybrain))
-    print 'Score: %2f\n' % score
-
-    score, res_milk = misc.bench(bench_milk, data)
-    print 'milk: mean %.2f, std %.2f' % (
-        np.mean(res_milk), np.std(res_milk))
-    print 'Score: %2f\n' % score
+      'memory\n' % data[0].shape
+
+    score, res = misc.bench(bench_shogun, data)
+    misc.print_result("kmeans", dataset, "Shogun", score, res)
+
+    score, res = misc.bench(bench_mdp, data)
+    misc.print_result("kmeans", dataset, "MDP", score, res)
+
+    score, res = misc.bench(bench_skl, data)
+    misc.print_result("kmeans", dataset, "scikits.learn", score, res)
+
+    score, res = misc.bench(bench_mlpy, data)
+    misc.print_result("kmeans", dataset, "MLPy", score, res)
+
+    score, res = misc.bench(bench_pybrain, data)
+    misc.print_result("kmeans", dataset, "Pybrain", score, res)
+
+    score, res = misc.bench(bench_milk, data)
+    misc.print_result("kmeans", dataset, "milk", score, res)
+
+    misc.save_results()
diff --git a/benchmarks/bench_knn.py b/benchmarks/bench_knn.py
index 2fdc42c..21bd817 100644
--- a/benchmarks/bench_knn.py
+++ b/benchmarks/bench_knn.py
@@ -107,28 +107,24 @@ def bench_milk(X, y, T, valid):
     data = misc.load_data(dataset)
 
     print 'Done, %s samples with %s features loaded into ' \
-      'memory' % data[0].shape
+      'memory\n' % data[0].shape
 
-    score, res_shogun = misc.bench(bench_shogun, data)
-    print 'Shogun: mean %.2f, std %.2f\n' % (res_shogun.mean(), res_shogun.std())
-    print 'Score: %.2f' % score
+    score, res = misc.bench(bench_shogun, data)
+    misc.print_result("knn", dataset, "Shogun", score, res)
 
-    score, res_mdp = misc.bench(bench_mdp, data)
-    print 'MDP: mean %.2f, std %.2f\n' % (res_mdp.mean(), res_mdp.std())
-    print 'Score: %.2f' % score
+    score, res = misc.bench(bench_mdp, data)
+    misc.print_result("knn", dataset, "MDP", score, res)
 
-    score, res_skl = misc.bench(bench_skl, data)
-    print 'scikits.learn: mean %.2f, std %.2f\n' % (res_skl.mean(), res_skl.std())
-    print 'Score: %.2f' % score
+    score, res = misc.bench(bench_skl, data)
+    misc.print_result("knn", dataset, "scikits.learn", score, res)
 
-    score, res_mlpy = misc.bench(bench_mlpy, data)
-    print 'MLPy: mean %.2f, std %.2f\n' % (res_mlpy.mean(), res_mlpy.std())
-    print 'Score: %.2f' % score
+    score, res = misc.bench(bench_mlpy, data)
+    misc.print_result("knn", dataset, "MLPy", score, res)
 
-    score, res_milk = misc.bench(bench_milk, data)
-    print 'milk: mean %.2f, std %.2f\n' % (res_milk.mean(), res_milk.std())
-    print 'Score: %.2f' % score
+    score, res = misc.bench(bench_pymvpa, data)
+    misc.print_result("knn", dataset, "PyMVPA", score, res)
 
-    score, res_pymvpa = misc.bench(bench_pymvpa, data)
-    print 'PyMVPA: mean %.2f, std %.2f\n' % (res_pymvpa.mean(), res_pymvpa.std())
-    print 'Score: %.2f' % score
+    score, res = misc.bench(bench_milk, data)
+    misc.print_result("knn", dataset, "milk", score, res)
+
+    misc.save_results()
diff --git a/benchmarks/bench_lassolars.py b/benchmarks/bench_lassolars.py
index b234d69..61e1174 100644
--- a/benchmarks/bench_lassolars.py
+++ b/benchmarks/bench_lassolars.py
@@ -66,19 +66,15 @@ def bench_pymvpa(X, y, T, valid):
     data = misc.load_data(dataset)
 
     print 'Done, %s samples with %s features loaded into ' \
-      'memory' % data[0].shape
-
-    score, res_skl = misc.bench(bench_skl, data)
-    print 'scikits.learn: mean %.2f, std %.2f' % (
-        np.mean(res_skl), np.std(res_skl))
-    print 'MSE: %s\n' % score
-
-    score, res_mlpy = misc.bench(bench_mlpy, data)
-    print 'MLPy: mean %.2f, std %.2f' % (
-        np.mean(res_mlpy), np.std(res_mlpy))
-    print 'MSE: %s\n' % score
-
-    score, res_pymvpa = misc.bench(bench_pymvpa, data)
-    print 'PyMVPA: mean %.2f, std %.2f' % (
-        np.mean(res_pymvpa), np.std(res_pymvpa))
-    print 'MSE: %s\n' % score
+      'memory\n' % data[0].shape
+
+    score, res = misc.bench(bench_skl, data)
+    misc.print_result("lassolars", dataset, "scikits.learn", score, res)
+
+    score, res = misc.bench(bench_mlpy, data)
+    misc.print_result("lassolars", dataset, "MLPy", score, res)
+
+    score, res = misc.bench(bench_pymvpa, data)
+    misc.print_result("lassolars", dataset, "PyMVPA", score, res)
+
+    misc.save_results()
diff --git a/benchmarks/bench_logistic.py b/benchmarks/bench_logistic.py
index 9d00878..33067d7 100644
--- a/benchmarks/bench_logistic.py
+++ b/benchmarks/bench_logistic.py
@@ -36,8 +36,9 @@ def bench_skl(X, y, T, valid):
     data = misc.load_data(dataset)
 
     print 'Done, %s samples with %s features loaded into ' \
-      'memory' % data[0].shape
+      'memory\n' % data[0].shape
 
-    res_skl = misc.bench(bench_skl, data)
-    print 'MLPy: mean %.2f, std %.2f\n' % (
-        np.mean(res_skl), np.std(res_skl))
+    score, res = misc.bench(bench_skl, data)
+    misc.print_result("logistic", dataset, "scikits.learn", score, res)
+
+    misc.save_results()
diff --git a/benchmarks/bench_pca.py b/benchmarks/bench_pca.py
index 6a241ef..df9f2dc 100644
--- a/benchmarks/bench_pca.py
+++ b/benchmarks/bench_pca.py
@@ -107,28 +107,19 @@ def bench_milk(X, y, T, valid):
     print 'Done, %s samples with %s features loaded into ' \
       'memory' % data[0].shape
 
-    score, res_mdp = misc.bench(bench_mdp, data)
-    print 'MDP: mean %s, std %s' % (
-        np.mean(res_mdp), np.std(res_mdp))
-    print 'Explained variance: %s\n'% score
-
-    score, res_skl = misc.bench(bench_skl, data)
-    print 'scikits.learn: mean %.2f, std %.2f' % (
-        np.mean(res_skl), np.std(res_skl))
-    print 'Explained variance: %s\n'% score
-
-    score, res_pybrain = misc.bench(bench_pybrain, data)
-    print 'Pybrain: mean %s, std %s' % (
-        np.mean(res_pybrain), np.std(res_pybrain))
-    print 'Explained variance: %s\n'% score
-
-    score, res_milk = misc.bench(bench_milk, data)
-    print 'milk: mean %s, std %s' % (
-        np.mean(res_milk), np.std(res_milk))
-    print 'Explained variance: %s\n'% score
-
-    score, res_pymvpa = misc.bench(bench_pymvpa, data)
-    print 'PyMVPA: mean %s, std %s' % (
-        np.mean(res_pymvpa), np.std(res_pymvpa))
-    print 'Explained variance: %s\n'% score
+    score, res = misc.bench(bench_mdp, data)
+    misc.print_result("pca", dataset, "MDP", score, res)
 
+    score, res = misc.bench(bench_skl, data)
+    misc.print_result("pca", dataset, "scikits.learn", score, res)
+
+    score, res = misc.bench(bench_pymvpa, data)
+    misc.print_result("pca", dataset, "PyMVPA", score, res)
+
+    score, res = misc.bench(bench_pybrain, data)
+    misc.print_result("pca", dataset, "Pybrain", score, res)
+
+    score, res = misc.bench(bench_milk, data)
+    misc.print_result("pca", dataset, "milk", score, res)
+
+    misc.save_results()
diff --git a/benchmarks/bench_svm.py b/benchmarks/bench_svm.py
index f8b1cfc..a2091fd 100644
--- a/benchmarks/bench_svm.py
+++ b/benchmarks/bench_svm.py
@@ -172,44 +172,30 @@ def bench_orange(X, y, T, valid):
     sigma = np.median(pdist(data[0]))
 
     print 'Done, %s samples with %s features loaded into ' \
-      'memory' % data[0].shape
-
-    score, res_shogun = misc.bench(bench_shogun, data)
-    print 'Shogun: mean %.2f, std %.2f' % (
-        np.mean(res_shogun), np.std(res_shogun))
-    print 'Score: %.2f\n' % score
-
-    score, res_mdp = misc.bench(bench_mdp, data)
-    print 'MDP: mean %.2f, std %.2f' % (
-        np.mean(res_mdp), np.std(res_mdp))
-    print 'Score: %.2f\n' % score
-
-    score, res_skl = misc.bench(bench_skl, data)
-    print 'scikits.learn: mean %.2f, std %.2f' % (
-        np.mean(res_skl), np.std(res_skl))
-    print 'Score: %.2f\n' % score
-
-    score, res_mlpy = misc.bench(bench_mlpy, data)
-    print 'MLPy: mean %.2f, std %.2f' % (
-        np.mean(res_mlpy), np.std(res_mlpy))
-    print 'Score: %.2f\n' % score
-
-    score, res_pymvpa = misc.bench(bench_pymvpa, data)
-    print 'PyMVPA: mean %.2f, std %.2f' % (
-        np.mean(res_pymvpa), np.std(res_pymvpa))
-    print 'Score: %.2f\n' % score
-
-    score, res_pybrain = misc.bench(bench_pybrain, data)
-    print 'Pybrain: mean %.2f, std %.2f' % (
-        np.mean(res_pybrain), np.std(res_pybrain))
-    print 'Score: %.2f\n' % score
-
-    score, res_milk = misc.bench(bench_milk, data)
-    print 'milk: mean %.2f, std %.2f' % (
-        np.mean(res_milk), np.std(res_milk))
-    print 'Score: %.2f\n' % score
-
-    score, res_orange = misc.bench(bench_orange, data)
-    print 'Orange: mean %.2f, std %.2f' % (
-        np.mean(res_orange), np.std(res_orange))
-    print 'Score: %.2f\n' % score
+      'memory\n' % data[0].shape
+
+    score, res = misc.bench(bench_shogun, data)
+    misc.print_result("svm", dataset, "Shogun", score, res)
+
+    score, res = misc.bench(bench_mdp, data)
+    misc.print_result("svm", dataset, "MDP", score, res)
+
+    score, res = misc.bench(bench_skl, data)
+    misc.print_result("svm", dataset, "scikits.learn", score, res)
+
+    score, res = misc.bench(bench_mlpy, data)
+    misc.print_result("svm", dataset, "MLPy", score, res)
+
+    score, res = misc.bench(bench_pymvpa, data)
+    misc.print_result("svm", dataset, "PyMVPA", score, res)
+
+    score, res = misc.bench(bench_pybrain, data)
+    misc.print_result("svm", dataset, "Pybrain", score, res)
+
+    score, res = misc.bench(bench_milk, data)
+    misc.print_result("svm", dataset, "milk", score, res)
+
+    score, res = misc.bench(bench_orange, data)
+    misc.print_result("svm", dataset, "Orange", score, res)
+
+    misc.save_results()
diff --git a/benchmarks/misc.py b/benchmarks/misc.py
index 311dca1..6464b8d 100644
--- a/benchmarks/misc.py
+++ b/benchmarks/misc.py
@@ -2,6 +2,7 @@
 import numpy as np
 import os
 
+
 def load_data(dataset):
 
     f = open(os.path.dirname(__file__) + '/data/%s_train.data' % dataset)
@@ -33,6 +34,7 @@ def load_data(dataset):
 def dtime_to_seconds(dtime):
     return dtime.seconds + (dtime.microseconds * 1e-6)
 
+
 def bench(func, data, n=10):
     """
     Benchmark a given function. The function is executed n times and
@@ -44,7 +46,8 @@ def bench(func, data, n=10):
     ----------
     func: function to benchmark
 
-    data: tuple (X, y, T, valid) containing training (X, y) and validation (T, valid) data.
+    data: tuple (X, y, T, valid) containing training (X, y)
+    and validation (T, valid) data.
 
     Returns
     -------
@@ -65,7 +68,46 @@ def bench(func, data, n=10):
         time = []
     return score, np.array(time)
 
+task_string = ""
+dataset_string = ""
+packages = []
+scores = []
+means = []
+stds = []
+
+
+def print_result(task, dataset, package, score, timing_results):
+    global task_string
+    global dataset_string
+    global packages
+    global scores
+    global means
+    global stds
+
+    print '%s on dataset %s' % (task, dataset)
+    mean = np.mean(timing_results)
+    std = np.std(timing_results)
+    print '%s: mean %.2f, std %.2f' % (package, mean, std)
+    print 'Score: %.2f\n' % score
+
+    task_string = task
+    dataset_string = dataset
+    packages.append(package)
+    scores.append(score)
+    means.append(mean)
+    stds.append(std)
+
+
+def save_results():
+    global task_string
+    global dataset_string
+    with open('%s_%s.results' % (task_string, dataset_string), 'w') as f:
+        import pickle
+        pickle.dump([task_string, dataset_string, packages,
+                     scores, means, stds], f)
+
+
 USAGE = """usage: python %s dataset
 
 where dataset is one of {madelon, arcene}
-"""
\ No newline at end of file
+"""
diff --git a/benchmarks/plot.py b/benchmarks/plot.py
new file mode 100644
index 0000000..b83065d
--- /dev/null
+++ b/benchmarks/plot.py
@@ -0,0 +1,198 @@
+
+import numpy as np
+
+
+def plot_results_for_task(task, datasets, packages, scores, means, stds):
+    """Plot the results for this task, grouping by package
+
+    task : string
+        The name of the task
+    datasets : list of strings, shape = [n_datasets]
+        The names of the datasets
+    packages : list of strings, shape = [n_packages]
+    scores : array-like, shape = [n_datasets, n_packages]
+        The scores of the tests
+    means : array-like, shape = [n_datasets, n_packages]
+        The means of the timings
+    stds : array-like, shape = [n_datasets, n_packages]
+        The standard deviations of the timings
+    """
+
+    import matplotlib.pyplot as plt
+    import itertools
+    c = itertools.cycle('bgcmykbgrcmyk')
+
+    n_datasets = len(datasets)
+    n_packages = len(packages)
+
+    scores = np.atleast_2d(scores)
+    m, n = scores.shape
+    assert m == n_datasets, ValueError("scores must be shape %d,%d" %
+                                       (n_datasets, n_packages))
+
+    means = np.atleast_2d(means)
+    m, n = means.shape
+    assert m == n_datasets, ValueError("means must be shape %d,%d" %
+                                       (n_datasets, n_packages))
+
+    stds = np.atleast_2d(stds)
+    m, n = stds.shape
+    assert m == n_datasets, ValueError("stds must be shape %d,%d" %
+                                       (n_datasets, n_packages))
+
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+
+    ind = np.arange(n_packages)  # the x locations for the groups
+    width = 0.35       # the width of the bars
+
+    for i in range(n_datasets):
+        rect = ax.bar(ind + i * width, means[i, :], width,
+                      color=c.next(), yerr=stds[i, :],
+                      ecolor='k', label=datasets[i])
+
+    ax.set_title('Time needed to perform train + predict (smaller is better)')
+    ax.set_ylabel('Seconds')
+    ax.set_xticks(ind + width)
+    ax.set_xticklabels(tuple(packages))
+    ax.legend()
+
+    plt.show()
+    plt.savefig("bench_%s.png" % (task))
+ 
+ 
+def hcat(left, right, spaces=""):
+    res = []
+    for l, r in zip(left,right):
+        res.append(l + spaces + r)
+    return res
+
+
+def frame(top, bottom):
+    # calculate the max length of all the strings
+    max_len = len(top)
+    for b in bottom:
+        if len(b) > max_len:
+            max_len = len(b)
+    
+    f = ["="*max_len]
+    spaces = " "*(max_len - len(top))
+    f.append(spaces + top)
+    f.append("="*max_len)
+    for b in bottom:
+        spaces = " "*(max_len - len(b))
+        f.append(spaces + b)
+    f.append("="*max_len) 
+    
+    return f
+
+
+def rst_table(task, datasets, packages, values, use_min=True):
+    """Return the results as the lines of an RST table like this one:
+    
+    ============    =======     ======     ======     =======     ========    =============      ========
+         Dataset     PyMVPA     Shogun        MDP     Pybrain         MLPy     scikit-learn          Milk
+    ============    =======     ======     ======     =======     ========    =============      ========
+         Madelon      11.52       5.63      40.48        17.5         9.47         **5.20**          5.76
+         Arcene        1.30       0.39       4.87          --         1.61             0.38      **0.33**
+    ============    =======     ======     ======     =======     ========    =============      ========
+    """
+    import math
+    
+    a = "Dataset"
+    b = datasets
+    output = frame(a, b)
+    
+    value_strings = []
+    for v in values:
+        # turn the values into strings
+        vs_temp = []
+        m = np.inf
+        for v2 in v:
+            x = float(v2)
+            if not math.isnan(x) and x < m:
+                m = x 
+        for v2 in v:
+            x = float(v2)
+            if math.isnan(x) or math.isinf(x):
+                vs_temp.append("--")
+            elif use_min and v2 == m:
+                vs_temp.append("**%.02f**" % v2)
+            else:
+                vs_temp.append("%.02f" % v2)
+        value_strings.append(vs_temp)
+    
+    # transpose the value strings list of lists so that we can work on
+    # columns
+    values = [list(v) for v in zip(*value_strings)]
+    
+    for a, b in zip(packages, values):
+        o = frame(a, b)
+        output = hcat(output, o, " "*4)
+
+    return output
+
+
+def prepare_results(task):
+
+    import glob
+    result_files = glob.glob("%s*.results" % (task))
+
+    datasets = []
+    packages = []
+
+    scores = []
+    means = []
+    stds = []
+
+    for i, result_file in enumerate(result_files):
+
+        with open(result_file, 'r') as f:
+            import pickle
+            result = pickle.load(f)
+
+        datasets.append(result[1])
+
+        if packages == []:
+            packages = result[2]
+
+        scores.append(result[3])
+        means.append(result[4])
+        stds.append(result[5])
+
+    plot_results_for_task(task, datasets, packages,
+                          scores, means, stds)
+    rst = rst_table(task, datasets, packages, means, use_min=True)
+    print "Timing for ", task
+    for l in rst:
+        print l
+    
+    print
+    rst = rst_table(task, datasets, packages, scores, use_min=False)
+    print "Scores for ", task
+    for l in rst:
+        print l    
+    print
+    
+
+USAGE = """usage: python plot.py package
+
+where package is one of {elasticnet, kmeans, ...}
+"""
+
+if __name__ == "__main__":
+    import sys
+
+    # don't bother me with warnings
+    import warnings
+    warnings.simplefilter('ignore')
+    np.seterr(all='ignore')
+
+    #print __doc__ + '\n'
+    if not len(sys.argv) == 2:
+        print USAGE
+        sys.exit(-1)
+    else:
+        task = sys.argv[1]
+
+    prepare_results(task)