Skip to content

Commit a766883

Browse files
committed Jul 28, 2022
Add back generic impl for Taxonomy
We need it in the end
1 parent 03b90d1 commit a766883

13 files changed

+647
-253
lines changed
 

‎Cargo.toml

+5
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,11 @@ bench = false
2828
[dev-dependencies]
2929
rand = "0.8"
3030
tempfile = "3.3"
31+
criterion = "0.3.5"
32+
33+
[[bench]]
34+
name = "taxonomy"
35+
harness = false
3136

3237
[package.metadata.maturin]
3338
classifier = ["Intended Audience :: Science/Research", "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Topic :: Scientific/Engineering :: Bio-Informatics"]

‎benches/taxonomy.rs

+36
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,36 @@
1+
use std::io::Cursor;
2+
3+
use criterion::{criterion_group, criterion_main, Criterion};
4+
5+
use taxonomy::{json::load, Taxonomy};
6+
7+
fn str_taxonomy(c: &mut Criterion) {
8+
let build_json = include_str!("../tests/data/ncbi_subset_tax.json");
9+
let taxonomy = load(Cursor::new(build_json), None).expect("Error loading json");
10+
11+
c.bench_function("lineage str", move |b| {
12+
b.iter(|| {
13+
taxonomy
14+
.lca(
15+
65574u32.to_string().as_str(),
16+
160352u32.to_string().as_str(),
17+
)
18+
.unwrap()
19+
.parse::<u32>()
20+
.unwrap()
21+
.to_string()
22+
});
23+
});
24+
}
25+
26+
fn u32_taxonomy(c: &mut Criterion) {
27+
let build_json = include_str!("../tests/data/ncbi_subset_tax.json");
28+
let taxonomy = load(Cursor::new(build_json), None).expect("Error loading json");
29+
30+
c.bench_function("lineage u32", move |b| {
31+
b.iter(|| taxonomy.lca(1577, 828));
32+
});
33+
}
34+
35+
criterion_group!(benches, str_taxonomy, u32_taxonomy);
36+
criterion_main!(benches);

‎src/base.rs

+80 -136
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
use serde::{Deserialize, Serialize};
2+
use std::borrow::Cow;
23
use std::collections::{HashMap, HashSet};
4+
use std::fmt::Debug;
35

46
use serde_json::Value;
57

@@ -19,7 +21,7 @@ pub struct GeneralTaxonomy {
1921
// Only used by the JSON format
2022
pub data: Vec<HashMap<String, Value>>,
2123

22-
// these are lookup tables that dramatically speed up some operations
24+
// Lookup tables that can dramatically speed up some operations
2325
pub(crate) tax_id_lookup: HashMap<String, InternalIndex>,
2426
pub(crate) children_lookup: Vec<Vec<InternalIndex>>,
2527
}
@@ -191,14 +193,6 @@ impl GeneralTaxonomy {
191193
}
192194
}
193195

194-
/// Retrieves the data associated with a specific tax id
195-
///
196-
/// Only contains data when the data is loaded from JSON
197-
pub fn data(&self, tax_id: &str) -> TaxonomyResult<&HashMap<String, Value>> {
198-
let idx = self.to_internal_index(tax_id)?;
199-
Ok(&self.data[idx])
200-
}
201-
202196
/// Add a new node to the taxonomy.
203197
pub fn add(&mut self, parent_id: &str, tax_id: &str) -> TaxonomyResult<()> {
204198
let parent_idx = self.to_internal_index(parent_id)?;
@@ -266,105 +260,11 @@ impl GeneralTaxonomy {
266260
self.index();
267261
Ok(())
268262
}
269-
270-
/// Return a tree with these tax_ids and their children removed.
271-
pub fn prune_away(&self, tax_ids: &[&str]) -> TaxonomyResult<Self> {
272-
let mut new_ids = Vec::new();
273-
let mut parent_ids = Vec::new();
274-
let mut dists = Vec::new();
275-
let mut names = Vec::new();
276-
let mut ranks = Vec::new();
277-
let mut data = Vec::new();
278-
279-
let tax_set: HashSet<_> = tax_ids.iter().cloned().collect();
280-
281-
let mut dropping: u8 = 0;
282-
let mut cur_lineage = Vec::new();
283-
for (node, pre) in self.traverse(self.root())? {
284-
if tax_set.contains(&node) {
285-
if pre {
286-
dropping += 1;
287-
} else {
288-
dropping -= 1;
289-
}
290-
}
291-
if dropping == 0 {
292-
if pre {
293-
new_ids.push(node.to_string());
294-
parent_ids.push(cur_lineage.last().map(|x| x - 1).unwrap_or(0));
295-
dists.push(self.parent(node)?.map(|x| x.1).unwrap_or(0.));
296-
names.push(self.name(node)?.to_string());
297-
ranks.push(self.rank(node)?);
298-
data.push(self.data(node)?.clone());
299-
300-
cur_lineage.push(new_ids.len());
301-
} else {
302-
cur_lineage.pop();
303-
}
304-
}
305-
}
306-
GeneralTaxonomy::from_arrays(
307-
new_ids,
308-
parent_ids,
309-
Some(names),
310-
Some(ranks),
311-
Some(dists),
312-
Some(data),
313-
)
314-
}
315-
316-
/// Return a tree containing only the given tax_ids and their parents.
317-
pub fn prune_to(&self, tax_ids: &[&str], include_children: bool) -> TaxonomyResult<Self> {
318-
let mut good_ids: HashSet<_> = tax_ids.iter().cloned().collect();
319-
for (node, pre) in self.traverse(self.root())? {
320-
if let Some((parent_node, _)) = self.parent(node)? {
321-
if pre && include_children && good_ids.contains(&parent_node) {
322-
// insert child nodes on the traverse down (add node if parent is in)
323-
good_ids.insert(node);
324-
} else if !pre && good_ids.contains(&node) {
325-
// insert parent nodes of stuff we've seen on the traverse back
326-
// (note this will add some duplicates of the "children" nodes
327-
// but since this is a set that still works)
328-
good_ids.insert(parent_node);
329-
}
330-
}
331-
}
332-
333-
let mut new_ids = Vec::new();
334-
let mut parent_ids = Vec::new();
335-
let mut dists = Vec::new();
336-
let mut names = Vec::new();
337-
let mut ranks = Vec::new();
338-
let mut data = Vec::new();
339-
340-
let mut cur_lineage = Vec::new();
341-
for (node, pre) in self.traverse(self.root())? {
342-
if pre {
343-
if good_ids.contains(&node) {
344-
new_ids.push(node.to_string());
345-
parent_ids.push(cur_lineage.last().map(|x| x - 1).unwrap_or(0));
346-
dists.push(self.parent(node)?.map(|x| x.1).unwrap_or(0.));
347-
names.push(self.name(node)?.to_string());
348-
ranks.push(self.rank(node)?);
349-
data.push(self.data(node)?.clone());
350-
}
351-
cur_lineage.push(new_ids.len());
352-
} else {
353-
cur_lineage.pop();
354-
}
355-
}
356-
GeneralTaxonomy::from_arrays(
357-
new_ids,
358-
parent_ids,
359-
Some(names),
360-
Some(ranks),
361-
Some(dists),
362-
Some(data),
363-
)
364-
}
365263
}
366264

367-
impl<'t> Taxonomy<'t> for GeneralTaxonomy {
265+
/// This is the implementation for &str taxonomy access for a more
266+
/// end-user understandable (but slightly slower) workflow.
267+
impl<'t> Taxonomy<'t, &'t str> for GeneralTaxonomy {
368268
fn root(&'t self) -> &'t str {
369269
&self.tax_ids[0]
370270
}
@@ -394,19 +294,83 @@ impl<'t> Taxonomy<'t> for GeneralTaxonomy {
394294
Ok(&self.names[idx])
395295
}
396296

297+
fn data(&'t self, tax_id: &str) -> TaxonomyResult<Cow<'t, HashMap<String, Value>>> {
298+
let idx = self.to_internal_index(tax_id)?;
299+
Ok(Cow::Borrowed(&self.data[idx]))
300+
}
301+
397302
fn rank(&'t self, tax_id: &str) -> TaxonomyResult<TaxRank> {
398303
let idx = self.to_internal_index(tax_id)?;
399304
Ok(self.ranks[idx])
400305
}
401306

402-
fn get_internal_tax_id(&'t self, node: &str) -> TaxonomyResult<&'t str> {
403-
match self.tax_ids.iter().find(|t| t.as_str() == node) {
404-
Some(t) => Ok(t),
405-
None => Err(Error::new(ErrorKind::NoSuchTaxId(node.to_owned()))),
307+
fn len(&'t self) -> usize
308+
where
309+
Self: Sized,
310+
{
311+
self.tax_ids.len()
312+
}
313+
}
314+
315+
/// This is the implementation for "internal" tax ID lookup; these IDs are
316+
/// arbitrary (they're positions of the tax nodes in the internal array) and
317+
/// not linked at all to the "external" (e.g. NCBI) IDs. Using these IDs
318+
/// directly can lead to a decent speed up without having to build indices.
319+
/// This is about 3-8x faster than the &str impl, depending on the usage.
320+
impl<'t> Taxonomy<'t, InternalIndex> for GeneralTaxonomy {
321+
fn root(&'t self) -> InternalIndex {
322+
0
323+
}
324+
325+
fn children(&'t self, tax_id: InternalIndex) -> TaxonomyResult<Vec<InternalIndex>> {
326+
if let Some(children) = self.children_lookup.get(tax_id) {
327+
Ok(children.to_vec())
328+
} else {
329+
Err(Error::new(ErrorKind::NoSuchInternalIndex(tax_id)))
330+
}
331+
}
332+
333+
fn parent(&'t self, idx: InternalIndex) -> TaxonomyResult<Option<(InternalIndex, f32)>> {
334+
if idx == 0 {
335+
return Ok(None);
336+
}
337+
if idx >= self.parent_ids.len() {
338+
return Err(Error::new(ErrorKind::NoSuchInternalIndex(idx)));
339+
}
340+
Ok(Some((
341+
self.parent_ids[idx as usize],
342+
self.parent_distances[idx as usize],
343+
)))
344+
}
345+
346+
fn name(&'t self, idx: InternalIndex) -> TaxonomyResult<&str> {
347+
if let Some(name) = self.names.get(idx) {
348+
Ok(name)
349+
} else {
350+
Err(Error::new(ErrorKind::NoSuchInternalIndex(idx)))
406351
}
407352
}
408353

409-
fn len(&'t self) -> usize {
354+
fn data(&'t self, idx: InternalIndex) -> TaxonomyResult<Cow<'t, HashMap<String, Value>>> {
355+
if let Some(data) = self.data.get(idx) {
356+
Ok(Cow::Borrowed(data))
357+
} else {
358+
Err(Error::new(ErrorKind::NoSuchInternalIndex(idx)))
359+
}
360+
}
361+
362+
fn rank(&'t self, idx: InternalIndex) -> TaxonomyResult<TaxRank> {
363+
if let Some(rank) = self.ranks.get(idx) {
364+
Ok(*rank)
365+
} else {
366+
Err(Error::new(ErrorKind::NoSuchInternalIndex(idx)))
367+
}
368+
}
369+
370+
fn len(&'t self) -> usize
371+
where
372+
Self: Sized,
373+
{
410374
self.tax_ids.len()
411375
}
412376
}
@@ -438,7 +402,7 @@ mod tests {
438402
#[test]
439403
fn implements_taxonomy_correctly() {
440404
let tax = create_test_taxonomy();
441-
assert_eq!(tax.len(), 4);
405+
assert_eq!(Taxonomy::<&str>::len(&tax), 4);
442406
assert_eq!(tax.children("1").unwrap(), vec!["2", "1000"]);
443407
assert_eq!(tax.name("562").unwrap(), "Escherichia coli");
444408
assert_eq!(tax.rank("562").unwrap(), TaxRank::Species);
@@ -455,45 +419,25 @@ mod tests {
455419
#[test]
456420
fn can_add_node() {
457421
let mut tax = create_test_taxonomy();
458-
let tax_size = tax.len();
422+
let tax_size = Taxonomy::<&str>::len(&tax);
459423
tax.add("2", "200").unwrap();
460-
assert_eq!(tax.len(), tax_size + 1);
424+
assert_eq!(Taxonomy::<&str>::len(&tax), tax_size + 1);
461425
assert_eq!(tax.parent("200").unwrap(), Some(("2", 1.0)));
462426
assert_eq!(tax.lineage("200").unwrap(), vec!["200", "2", "1"]);
463427
}
464428

465429
#[test]
466430
fn can_remove_node() {
467431
let mut tax = create_test_taxonomy();
468-
let tax_size = tax.len();
432+
let tax_size = Taxonomy::<&str>::len(&tax);
469433
tax.remove("2").unwrap();
470-
assert_eq!(tax.len(), tax_size - 1);
434+
assert_eq!(Taxonomy::<&str>::len(&tax), tax_size - 1);
471435
assert_eq!(tax.parent("562").unwrap(), Some(("1", 2.0)));
472436
assert_eq!(tax.lineage("562").unwrap(), vec!["562", "1"]);
473437
// can't remove root
474438
assert!(tax.remove("1").is_err());
475439
}
476440

477-
#[test]
478-
fn can_prune_away() {
479-
let tax = create_test_taxonomy();
480-
assert_eq!(tax.len(), 4);
481-
let pruned_tax = tax.prune_away(&["2"]).unwrap();
482-
assert_eq!(pruned_tax.len(), 2);
483-
assert_eq!(pruned_tax.children("1").unwrap(), vec!["1000"]);
484-
assert!(pruned_tax.data("1000").unwrap().get("readcount").is_some());
485-
}
486-
487-
#[test]
488-
fn can_prune_to() {
489-
let tax = create_test_taxonomy();
490-
assert_eq!(tax.len(), 4);
491-
let pruned_tax = tax.prune_to(&["2"], true).unwrap();
492-
assert_eq!(pruned_tax.len(), 3);
493-
assert_eq!(pruned_tax.children("1").unwrap(), vec!["2"]);
494-
assert!(pruned_tax.data("1").unwrap().get("readcount").is_some());
495-
}
496-
497441
#[test]
498442
fn errors_on_taxonomy_with_cycle() {
499443
let example = r#"{

0 commit comments

Comments (0)
Please sign in to comment.