1
1
use serde:: { Deserialize , Serialize } ;
2
+ use std:: borrow:: Cow ;
2
3
use std:: collections:: { HashMap , HashSet } ;
4
+ use std:: fmt:: Debug ;
3
5
4
6
use serde_json:: Value ;
5
7
@@ -19,7 +21,7 @@ pub struct GeneralTaxonomy {
19
21
// Only used by the JSON format
20
22
pub data : Vec < HashMap < String , Value > > ,
21
23
22
- // these are lookup tables that dramatically speed up some operations
24
+ // Lookup tables that can dramatically speed up some operations
23
25
pub ( crate ) tax_id_lookup : HashMap < String , InternalIndex > ,
24
26
pub ( crate ) children_lookup : Vec < Vec < InternalIndex > > ,
25
27
}
@@ -191,14 +193,6 @@ impl GeneralTaxonomy {
191
193
}
192
194
}
193
195
194
- /// Retrieves the data map associated with a specific tax id.
195
- ///
196
- /// Only contains data when the taxonomy was loaded from JSON.
197
/// Any error returned by `to_internal_index` is propagated to the caller.
// NOTE(review): `self.data[idx]` indexes directly and panics if `data` has no
// entry at `idx` -- confirm `data` always holds one entry per node.
- pub fn data ( & self , tax_id : & str ) -> TaxonomyResult < & HashMap < String , Value > > {
198
- let idx = self . to_internal_index ( tax_id) ?;
199
- Ok ( & self . data [ idx] )
200
- }
201
-
202
196
/// Add a new node to the taxonomy.
203
197
pub fn add ( & mut self , parent_id : & str , tax_id : & str ) -> TaxonomyResult < ( ) > {
204
198
let parent_idx = self . to_internal_index ( parent_id) ?;
@@ -266,105 +260,11 @@ impl GeneralTaxonomy {
266
260
self . index ( ) ;
267
261
Ok ( ( ) )
268
262
}
269
-
270
- /// Return a new tree with these tax_ids and everything below them removed.
271
/// Walks the tree from the root with `traverse`, copies id, parent position,
/// parent distance, name, rank and data of every node that is kept, and builds
/// the result with `GeneralTaxonomy::from_arrays`. `self` is not modified.
/// Errors from `traverse`, `parent`, `name`, `rank`, `data` and `from_arrays`
/// are propagated.
- pub fn prune_away ( & self , tax_ids : & [ & str ] ) -> TaxonomyResult < Self > {
272
- let mut new_ids = Vec :: new ( ) ;
273
- let mut parent_ids = Vec :: new ( ) ;
274
- let mut dists = Vec :: new ( ) ;
275
- let mut names = Vec :: new ( ) ;
276
- let mut ranks = Vec :: new ( ) ;
277
- let mut data = Vec :: new ( ) ;
278
-
279
// Set form of the ids to drop, for cheap membership tests inside the walk.
- let tax_set: HashSet < _ > = tax_ids. iter ( ) . cloned ( ) . collect ( ) ;
280
-
281
// Counter that is non-zero while the walk is inside a subtree being dropped.
// NOTE(review): as a u8 this overflows if more than 255 listed ids are nested.
- let mut dropping: u8 = 0 ;
282
// Positions (1-based, i.e. `new_ids.len()` after the push) of the kept ancestors.
- let mut cur_lineage = Vec :: new ( ) ;
283
// `traverse` appears to yield each node twice: `pre == true` on the way down
// and `pre == false` on the way back up -- TODO confirm against `traverse`.
- for ( node, pre) in self . traverse ( self . root ( ) ) ? {
284
- if tax_set. contains ( & node) {
285
- if pre {
286
- dropping += 1 ;
287
- } else {
288
- dropping -= 1 ;
289
- }
290
- }
291
- if dropping == 0 {
292
- if pre {
293
- new_ids. push ( node. to_string ( ) ) ;
294
// Stored positions are 1-based, so subtract 1 to get the parent's index in
// the new arrays; with no kept ancestor the parent index defaults to 0.
- parent_ids. push ( cur_lineage. last ( ) . map ( |x| x - 1 ) . unwrap_or ( 0 ) ) ;
295
// Distance to the original parent, or 0.0 when the node has no parent.
- dists. push ( self . parent ( node) ?. map ( |x| x. 1 ) . unwrap_or ( 0. ) ) ;
296
- names. push ( self . name ( node) ?. to_string ( ) ) ;
297
- ranks. push ( self . rank ( node) ?) ;
298
- data. push ( self . data ( node) ?. clone ( ) ) ;
299
-
300
- cur_lineage. push ( new_ids. len ( ) ) ;
301
- } else {
302
// NOTE(review): this branch also runs on the way back up from a listed node
// (its counter has just returned to 0) although nothing was pushed for that
// node -- check that `cur_lineage` stays balanced for deeper trees.
- cur_lineage. pop ( ) ;
303
- }
304
- }
305
- }
306
- GeneralTaxonomy :: from_arrays (
307
- new_ids,
308
- parent_ids,
309
- Some ( names) ,
310
- Some ( ranks) ,
311
- Some ( dists) ,
312
- Some ( data) ,
313
- )
314
- }
315
-
316
- /// Return a new tree containing only the given tax_ids and their ancestors.
317
/// When `include_children` is true, nodes below a kept node are kept as well.
/// Works in two walks from the root: the first grows the set of ids to keep,
/// the second copies the kept nodes and builds the result with
/// `GeneralTaxonomy::from_arrays`. `self` is not modified; errors from the
/// lookups and from `from_arrays` are propagated.
- pub fn prune_to ( & self , tax_ids : & [ & str ] , include_children : bool ) -> TaxonomyResult < Self > {
318
// Starts as the requested ids and is extended during the first walk.
- let mut good_ids: HashSet < _ > = tax_ids. iter ( ) . cloned ( ) . collect ( ) ;
319
- for ( node, pre) in self . traverse ( self . root ( ) ) ? {
320
// Nodes without a parent (`parent` returns `None`) are skipped here.
- if let Some ( ( parent_node, _) ) = self . parent ( node) ? {
321
- if pre && include_children && good_ids. contains ( & parent_node) {
322
- // insert child nodes on the traverse down (add node if parent is in)
323
- good_ids. insert ( node) ;
324
- } else if !pre && good_ids. contains ( & node) {
325
- // insert parent nodes of stuff we've seen on the traverse back
326
- // (note this will add some duplicates of the "children" nodes
327
- // but since this is a set that still works)
328
- good_ids. insert ( parent_node) ;
329
- }
330
- }
331
- }
332
-
333
- let mut new_ids = Vec :: new ( ) ;
334
- let mut parent_ids = Vec :: new ( ) ;
335
- let mut dists = Vec :: new ( ) ;
336
- let mut names = Vec :: new ( ) ;
337
- let mut ranks = Vec :: new ( ) ;
338
- let mut data = Vec :: new ( ) ;
339
-
340
// One entry per node currently being visited: `new_ids.len()` at the time of
// the visit, i.e. the 1-based position of the most recently kept node.
- let mut cur_lineage = Vec :: new ( ) ;
341
- for ( node, pre) in self . traverse ( self . root ( ) ) ? {
342
- if pre {
343
- if good_ids. contains ( & node) {
344
- new_ids. push ( node. to_string ( ) ) ;
345
// Convert the 1-based position on top of the stack into the parent's index;
// an empty stack yields parent index 0.
- parent_ids. push ( cur_lineage. last ( ) . map ( |x| x - 1 ) . unwrap_or ( 0 ) ) ;
346
// Distance to the original parent, or 0.0 when the node has no parent.
- dists. push ( self . parent ( node) ?. map ( |x| x. 1 ) . unwrap_or ( 0. ) ) ;
347
- names. push ( self . name ( node) ?. to_string ( ) ) ;
348
- ranks. push ( self . rank ( node) ?) ;
349
- data. push ( self . data ( node) ?. clone ( ) ) ;
350
- }
351
// Pushed for every node, kept or not, so the pop on the way back up always
// has a matching entry.
- cur_lineage. push ( new_ids. len ( ) ) ;
352
- } else {
353
- cur_lineage. pop ( ) ;
354
- }
355
- }
356
- GeneralTaxonomy :: from_arrays (
357
- new_ids,
358
- parent_ids,
359
- Some ( names) ,
360
- Some ( ranks) ,
361
- Some ( dists) ,
362
- Some ( data) ,
363
- )
364
- }
365
263
}
366
264
367
- impl < ' t > Taxonomy < ' t > for GeneralTaxonomy {
265
+ /// `Taxonomy` implementation keyed by `&str` tax IDs, for a workflow that is
266
+ /// easier for end users to understand (but slightly slower).
267
+ impl < ' t > Taxonomy < ' t , & ' t str > for GeneralTaxonomy {
368
268
/// Returns the tax ID stored at position 0 of `tax_ids` as the root.
// NOTE(review): direct indexing panics on an empty `tax_ids` -- confirm a
// taxonomy can never be constructed without nodes.
fn root ( & ' t self ) -> & ' t str {
369
269
& self . tax_ids [ 0 ]
370
270
}
@@ -394,19 +294,83 @@ impl<'t> Taxonomy<'t> for GeneralTaxonomy {
394
294
Ok ( & self . names [ idx] )
395
295
}
396
296
297
/// Returns the data map stored for `tax_id`, borrowed from `self.data`
/// (`Cow::Borrowed`, so nothing is cloned). Any error returned by
/// `to_internal_index` is propagated.
// NOTE(review): `self.data[idx]` indexes directly and panics if `data` has no
// entry at `idx` -- confirm `data` always holds one entry per node.
+ fn data ( & ' t self , tax_id : & str ) -> TaxonomyResult < Cow < ' t , HashMap < String , Value > > > {
298
+ let idx = self . to_internal_index ( tax_id) ?;
299
+ Ok ( Cow :: Borrowed ( & self . data [ idx] ) )
300
+ }
301
+
397
302
/// Returns the rank stored for `tax_id`, by value. Any error returned by
/// `to_internal_index` is propagated.
// NOTE(review): `self.ranks[idx]` indexes directly -- presumes `ranks` has an
// entry for every index `to_internal_index` can return; confirm.
fn rank ( & ' t self , tax_id : & str ) -> TaxonomyResult < TaxRank > {
398
303
let idx = self . to_internal_index ( tax_id) ?;
399
304
Ok ( self . ranks [ idx] )
400
305
}
401
306
402
- fn get_internal_tax_id ( & ' t self , node : & str ) -> TaxonomyResult < & ' t str > {
403
- match self . tax_ids . iter ( ) . find ( |t| t. as_str ( ) == node) {
404
- Some ( t) => Ok ( t) ,
405
- None => Err ( Error :: new ( ErrorKind :: NoSuchTaxId ( node. to_owned ( ) ) ) ) ,
307
/// Returns the number of entries in `tax_ids`.
+ fn len ( & ' t self ) -> usize
308
+ where
309
+ Self : Sized ,
310
+ {
311
+ self . tax_ids . len ( )
312
+ }
313
+ }
314
+
315
+ /// This is the implementation for "internal" tax ID lookup; these IDs are
316
+ /// arbitrary (they're positions of the tax nodes in the internal arrays) and
317
+ /// not linked at all to the "external" (e.g. NCBI) IDs. Using these IDs
318
+ /// directly can give a decent speedup without having to build indices.
319
+ /// This is about 3-8x faster than the &str impl, depending on the usage.
320
+ impl < ' t > Taxonomy < ' t , InternalIndex > for GeneralTaxonomy {
321
/// Returns `0`, the internal index used for the root.
+ fn root ( & ' t self ) -> InternalIndex {
322
+ 0
323
+ }
324
+
325
/// Returns a copy of the child indices recorded in `children_lookup` for
/// `tax_id`, or a `NoSuchInternalIndex` error when the lookup has no entry
/// at that position.
+ fn children ( & ' t self , tax_id : InternalIndex ) -> TaxonomyResult < Vec < InternalIndex > > {
326
+ if let Some ( children) = self . children_lookup . get ( tax_id) {
327
+ Ok ( children. to_vec ( ) )
328
+ } else {
329
+ Err ( Error :: new ( ErrorKind :: NoSuchInternalIndex ( tax_id) ) )
330
+ }
331
+ }
332
+
333
/// Returns the parent index of `idx` together with the value stored for it in
/// `parent_distances`. Index 0 (the root) yields `Ok(None)`; an index at or
/// beyond `parent_ids.len()` yields a `NoSuchInternalIndex` error.
+ fn parent ( & ' t self , idx : InternalIndex ) -> TaxonomyResult < Option < ( InternalIndex , f32 ) > > {
334
+ if idx == 0 {
335
+ return Ok ( None ) ;
336
+ }
337
// Only `parent_ids` is bounds-checked; `parent_distances` is indexed with the
// same value below.
// NOTE(review): presumes both vectors have the same length -- confirm.
+ if idx >= self . parent_ids . len ( ) {
338
+ return Err ( Error :: new ( ErrorKind :: NoSuchInternalIndex ( idx) ) ) ;
339
+ }
340
// NOTE(review): the `as usize` casts look redundant -- `idx` is passed uncast
// to `.get(..)` elsewhere in this impl; confirm InternalIndex is usize.
+ Ok ( Some ( (
341
+ self . parent_ids [ idx as usize ] ,
342
+ self . parent_distances [ idx as usize ] ,
343
+ ) ) )
344
+ }
345
+
346
/// Returns the name stored at `idx` in `names`, or a `NoSuchInternalIndex`
/// error when there is no entry at that position.
+ fn name ( & ' t self , idx : InternalIndex ) -> TaxonomyResult < & str > {
347
+ if let Some ( name) = self . names . get ( idx) {
348
+ Ok ( name)
349
+ } else {
350
+ Err ( Error :: new ( ErrorKind :: NoSuchInternalIndex ( idx) ) )
406
351
}
407
352
}
408
353
409
- fn len ( & ' t self ) -> usize {
354
/// Returns the data map stored at `idx` in `data`, borrowed (`Cow::Borrowed`),
/// or a `NoSuchInternalIndex` error when there is no entry at that position.
+ fn data ( & ' t self , idx : InternalIndex ) -> TaxonomyResult < Cow < ' t , HashMap < String , Value > > > {
355
+ if let Some ( data) = self . data . get ( idx) {
356
+ Ok ( Cow :: Borrowed ( data) )
357
+ } else {
358
+ Err ( Error :: new ( ErrorKind :: NoSuchInternalIndex ( idx) ) )
359
+ }
360
+ }
361
+
362
/// Returns a copy of the rank stored at `idx` in `ranks`, or a
/// `NoSuchInternalIndex` error when there is no entry at that position.
+ fn rank ( & ' t self , idx : InternalIndex ) -> TaxonomyResult < TaxRank > {
363
+ if let Some ( rank) = self . ranks . get ( idx) {
364
+ Ok ( * rank)
365
+ } else {
366
+ Err ( Error :: new ( ErrorKind :: NoSuchInternalIndex ( idx) ) )
367
+ }
368
+ }
369
+
370
/// Returns the number of entries in `tax_ids`.
+ fn len ( & ' t self ) -> usize
371
+ where
372
+ Self : Sized ,
373
+ {
410
374
self . tax_ids . len ( )
411
375
}
412
376
}
@@ -438,7 +402,7 @@ mod tests {
438
402
#[ test]
439
403
fn implements_taxonomy_correctly ( ) {
440
404
let tax = create_test_taxonomy ( ) ;
441
- assert_eq ! ( tax . len( ) , 4 ) ;
405
+ assert_eq ! ( Taxonomy :: < & str > :: len( & tax ) , 4 ) ;
442
406
assert_eq ! ( tax. children( "1" ) . unwrap( ) , vec![ "2" , "1000" ] ) ;
443
407
assert_eq ! ( tax. name( "562" ) . unwrap( ) , "Escherichia coli" ) ;
444
408
assert_eq ! ( tax. rank( "562" ) . unwrap( ) , TaxRank :: Species ) ;
@@ -455,45 +419,25 @@ mod tests {
455
419
// Adds node "200" under "2" and checks that the taxonomy grows by one and that
// the new node reports "2" as its parent (distance 1.0) and the lineage 200 -> 2 -> 1.
#[ test]
456
420
fn can_add_node ( ) {
457
421
let mut tax = create_test_taxonomy ( ) ;
458
- let tax_size = tax . len ( ) ;
422
// `len` is called through `Taxonomy::<&str>` explicitly because both the
// `&str` and the `InternalIndex` impls of the trait provide it.
+ let tax_size = Taxonomy :: < & str > :: len ( & tax ) ;
459
423
tax. add ( "2" , "200" ) . unwrap ( ) ;
460
- assert_eq ! ( tax . len( ) , tax_size + 1 ) ;
424
+ assert_eq ! ( Taxonomy :: < & str > :: len( & tax ) , tax_size + 1 ) ;
461
425
assert_eq ! ( tax. parent( "200" ) . unwrap( ) , Some ( ( "2" , 1.0 ) ) ) ;
462
426
assert_eq ! ( tax. lineage( "200" ) . unwrap( ) , vec![ "200" , "2" , "1" ] ) ;
463
427
}
464
428
465
429
// Removes node "2" and checks that the taxonomy shrinks by one, that "562" then
// reports "1" as its parent, and that removing the root "1" is rejected.
#[ test]
466
430
fn can_remove_node ( ) {
467
431
let mut tax = create_test_taxonomy ( ) ;
468
- let tax_size = tax . len ( ) ;
432
// `len` is called through `Taxonomy::<&str>` explicitly because both the
// `&str` and the `InternalIndex` impls of the trait provide it.
+ let tax_size = Taxonomy :: < & str > :: len ( & tax ) ;
469
433
tax. remove ( "2" ) . unwrap ( ) ;
470
- assert_eq ! ( tax . len( ) , tax_size - 1 ) ;
434
+ assert_eq ! ( Taxonomy :: < & str > :: len( & tax ) , tax_size - 1 ) ;
471
435
// The expected distance to the new parent is 2.0 -- presumably the removed
// node's distance is added to the child's; confirm against the fixture.
assert_eq ! ( tax. parent( "562" ) . unwrap( ) , Some ( ( "1" , 2.0 ) ) ) ;
472
436
assert_eq ! ( tax. lineage( "562" ) . unwrap( ) , vec![ "562" , "1" ] ) ;
473
437
// can't remove root
474
438
assert ! ( tax. remove( "1" ) . is_err( ) ) ;
475
439
}
476
440
477
// Prunes "2" away from the four-node fixture and expects two nodes to remain,
// with "1000" as the only child of "1" and its "readcount" data still present.
- #[ test]
478
- fn can_prune_away ( ) {
479
- let tax = create_test_taxonomy ( ) ;
480
- assert_eq ! ( tax. len( ) , 4 ) ;
481
- let pruned_tax = tax. prune_away ( & [ "2" ] ) . unwrap ( ) ;
482
- assert_eq ! ( pruned_tax. len( ) , 2 ) ;
483
- assert_eq ! ( pruned_tax. children( "1" ) . unwrap( ) , vec![ "1000" ] ) ;
484
// Data must be carried over into the pruned taxonomy.
- assert ! ( pruned_tax. data( "1000" ) . unwrap( ) . get( "readcount" ) . is_some( ) ) ;
485
- }
486
-
487
// Prunes the four-node fixture down to "2" with `include_children` set and
// expects three nodes to remain, with "2" as the only child of "1" and the
// "readcount" data of "1" still present.
- #[ test]
488
- fn can_prune_to ( ) {
489
- let tax = create_test_taxonomy ( ) ;
490
- assert_eq ! ( tax. len( ) , 4 ) ;
491
- let pruned_tax = tax. prune_to ( & [ "2" ] , true ) . unwrap ( ) ;
492
- assert_eq ! ( pruned_tax. len( ) , 3 ) ;
493
- assert_eq ! ( pruned_tax. children( "1" ) . unwrap( ) , vec![ "2" ] ) ;
494
// Data must be carried over into the pruned taxonomy.
- assert ! ( pruned_tax. data( "1" ) . unwrap( ) . get( "readcount" ) . is_some( ) ) ;
495
- }
496
-
497
441
#[ test]
498
442
fn errors_on_taxonomy_with_cycle ( ) {
499
443
let example = r#"{
0 commit comments