
Commit 0a6ecc5

Committed Jun 9, 2020
New API for parsing + Python bindings
Parent: 5ef0e7b

28 files changed: +2243 −1593 lines
 

.github/workflows/tests.yml

+45 −1

@@ -109,4 +109,48 @@ jobs:
       run: |
         cargo +nightly install cargo-fuzz
         cargo +nightly fuzz run parse_fasta -- -max_total_time=180
-        cargo +nightly fuzz run parse_fastq -- -max_total_time=180
+        cargo +nightly fuzz run parse_fastq -- -max_total_time=180
+
+  python-bindings:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@master
+
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: nightly
+          override: true
+
+      - name: version info
+        run: rustc --version; cargo --version;
+
+      - name: Run all tests
+        run: cargo test --features=python_test
+
+      - name: Install python headers
+        run: sudo apt-get update && sudo apt-get install python3-dev python3-pip python3-venv
+
+      - name: Install maturin
+        run: |
+          python3 -m venv venv
+          . venv/bin/activate
+          pip3 install maturin
+          pip3 show maturin
+
+
+      - name: add library to venv
+        run: |
+          . venv/bin/activate
+          maturin develop --cargo-extra-args="--features=python"
+
+      - name: Run the python tests
+        run: |
+          . venv/bin/activate
+          python test_python.py
+
+      - name: compile taxonomy with python bindings
+        run: |
+          . venv/bin/activate
+          maturin build --cargo-extra-args="--features=python" --release --strip --manylinux=off
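The `python-bindings` job above doubles as a recipe for building the bindings locally. A minimal sketch of the same steps outside CI, assuming a Unix shell with Python 3 and a nightly Rust toolchain already installed (the virtualenv name `venv` and the `test_python.py` script are the ones the workflow uses):

```bash
# create and activate a virtualenv, then install maturin into it (as the workflow does)
python3 -m venv venv
. venv/bin/activate
pip3 install maturin

# build the extension module in-place with the python feature enabled
maturin develop --cargo-extra-args="--features=python"

# run the Python test suite against the freshly built module
python test_python.py
```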

.gitignore

+1

@@ -3,3 +3,4 @@ Cargo.lock
 venv/
 .DS_Store
 .idea/
+test.py

CHANGELOG.md

+12

@@ -6,6 +6,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.4.0] - TODO
+
+### Changed
+- Added `parse_fastx_file`, which replaces `parse_sequence_reader`, offers iterator-style usage, and is faster
+  than 0.3.
+- `SequenceRecord` now exposes more information about the input file, such as its line ending, which allows
+  writing a file identical to the input one.
+
+### Removed
+- Gzip files piped in through `stdin` are no longer supported, since we now require `Seek`.
+
+
 ## [0.3.0] - 2019-09-12
 ### Added
 - Improved error reporting (i.e., a parse failure now gives the record it failed on).
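The iterator-style API mentioned in the 0.4.0 entry is demonstrated in full in the updated README later in this commit. As a minimal sketch of the shape of the new loop, using only calls that appear in that README example (`parse_fastx_file`, `next`, `num_bases`) and assuming a FASTA file at `tests/data/28S.fasta` as the README does:

```rust
use needletail::{parse_fastx_file, FastxReader};

fn main() {
    // One entry point for FASTA and FASTQ; replaces the callback-based parse_sequence_reader.
    let mut reader = parse_fastx_file("tests/data/28S.fasta").expect("valid path/file");
    let mut n_bases = 0;
    // Records are pulled one at a time instead of being handed to a closure.
    while let Some(record) = reader.next() {
        let seqrec = record.expect("invalid record");
        n_bases += seqrec.num_bases();
    }
    println!("There are {} bases in your file.", n_bases);
}
```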

Cargo.toml

+13 −2

@@ -1,6 +1,6 @@
 [package]
 name = "needletail"
-version = "0.3.2"
+version = "0.4.0"
 authors = ["Roderick Bovee <roderick@onecodex.com>"]
 description = "FASTX parsing and k-mer methods"
 keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"]
@@ -11,16 +11,24 @@ readme = "./README.md"
 edition = "2018"
 include = ["src/**/*", "LICENSE", "README.md", "CHANGELOG.md"]
 
+[lib]
+crate-type = ["cdylib", "rlib"]
+bench = false
+
 [features]
 default = ["compression"]
 compression = ["bzip2", "flate2", "xz2"]
+python = ["pyo3/extension-module"]
+python_test = ["pyo3"]
 
 [dependencies]
 flate2 = { version="1.0.6", optional=true }
 bzip2 = { version="0.3.3", optional=true }
 xz2 = { version="0.1.6", optional=true }
+pyo3 = { version = "0.10", optional = true }
 memchr = "2.2.1"
-safemem = "0.3.2"
+bytecount = { version = "0.6", features = ["runtime-dispatch-simd"] }
+buf_redux = { version = "0.8", default_features = false }
 
 [dev-dependencies]
 criterion = "0.3"
@@ -34,6 +42,9 @@ toml = "0.5"
 serde = "1.0"
 serde_derive = "1.0"
 
+[profile.release]
+lto = true
+
 [[bench]]
 name = "benchmark"
 harness = false
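The two new Cargo features serve different build paths: `python_test` enables plain `pyo3`, which lets the Rust test suite link against an installed Python, while `python` turns on pyo3's `extension-module` mode for producing an importable module. The corresponding invocations, taken verbatim from the CI workflow above:

```shell
# Rust-side tests that exercise the bindings (pyo3 without extension-module)
cargo test --features=python_test

# release wheel build with the extension-module feature, via maturin
maturin build --cargo-extra-args="--features=python" --release --strip --manylinux=off
```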

README.md

+53 −46

@@ -8,61 +8,53 @@ Needletail is a MIT-licensed, minimal-copying FASTA/FASTQ parser and _k_-mer pro
 The goal is to write a fast *and* well-tested set of functions that more specialized bioinformatics programs can use.
 Needletail's goal is to be as fast as the [readfq](https://github.com/lh3/readfq) C library at parsing FASTX files and much (i.e. 25 times) faster than equivalent Python implementations at _k_-mer counting.
 
-# Example
+## Example
 
 ```rust
-extern crate needletail;
-use needletail::{parse_sequence_path, Sequence};
-use std::env;
-
-fn main() {
-    let filename: String = env::args().nth(1).unwrap();
-
-    let mut n_bases = 0;
-    let mut n_valid_kmers = 0;
-    parse_sequence_path(
-        filename,
-        |_| {},
-        |seq| {
-            // seq.id is the name of the record
-            // seq.seq is the base sequence
-            // seq.qual is an optional quality score
-
-            // keep track of the total number of bases
-            n_bases += seq.seq.len();
-
-            // normalize to make sure all the bases are consistantly capitalized
-            let norm_seq = seq.normalize(false);
-            // we make a reverse complemented copy of the sequence first for
-            // `canonical_kmers` to draw the complemented sequences from.
-            let rc = norm_seq.reverse_complement();
-            // now we keep track of the number of AAAAs (or TTTTs via
-            // canonicalization) in the file; note we also get the postion (i.0;
-            // in the event there were `N`-containing kmers that were skipped)
-            // and whether the sequence was complemented (i.2) in addition to
-            // the canonical kmer (i.1)
-            for (_, kmer, _) in norm_seq.canonical_kmers(4, &rc) {
-                if kmer == b"AAAA" {
-                    n_valid_kmers += 1;
-                }
-            }
-        },
-    )
-    .expect("parsing failed");
-    println!("There are {} bases in your file.", n_bases);
-    println!("There are {} AAAAs in your file.", n_valid_kmers);
-}
+extern crate needletail;
+use needletail::{parse_fastx_file, Sequence, FastxReader};
+
+fn main() {
+    let filename = "tests/data/28S.fasta";
+
+    let mut n_bases = 0;
+    let mut n_valid_kmers = 0;
+    let mut reader = parse_fastx_file(&filename).expect("valid path/file");
+    while let Some(record) = reader.next() {
+        let seqrec = record.expect("invalid record");
+        // keep track of the total number of bases
+        n_bases += seqrec.num_bases();
+        // normalize to make sure all the bases are consistently capitalized and
+        // that we remove the newlines since this is FASTA
+        let norm_seq = seqrec.normalize(false);
+        // we make a reverse complemented copy of the sequence first for
+        // `canonical_kmers` to draw the complemented sequences from.
+        let rc = norm_seq.reverse_complement();
+        // now we keep track of the number of AAAAs (or TTTTs via
+        // canonicalization) in the file; note we also get the position (i.0;
+        // in the event there were `N`-containing kmers that were skipped)
+        // and whether the sequence was complemented (i.2) in addition to
+        // the canonical kmer (i.1)
+        for (_, kmer, _) in norm_seq.canonical_kmers(4, &rc) {
+            if kmer == b"AAAA" {
+                n_valid_kmers += 1;
+            }
+        }
+    }
+    println!("There are {} bases in your file.", n_bases);
+    println!("There are {} AAAAs in your file.", n_valid_kmers);
+}
 ```
 
-# Installation
+## Installation
 
 Needletail requires `rust` and `cargo` to be installed.
 Please use either your local package manager (`homebrew`, `apt-get`, `pacman`, etc) or install these via [rustup](https://www.rustup.rs/).
 
 Once you have Rust set up, you can include needletail in your `Cargo.toml` file like:
 ```shell
 [dependencies]
-needletail = "^0.3.1"
+needletail = "0.4"
 ```
 
 To install needletail itself for development:
@@ -71,10 +63,25 @@ git clone https://github.com/onecodex/needletail
 cargo test # to run tests
 ```
 
-# Getting Help
+### Python
+To work on the Python library on a Mac OS X/Unix system (requires Python 3):
+```bash
+# you need the nightly version of Rust installed
+curl https://sh.rustup.rs -sSf | sh
+rustup default nightly
+
+# finally, install the library in the local virtualenv
+maturin develop --cargo-extra-args="--features=python"
+```
+
+## Getting Help
 
 Questions are best directed as GitHub issues. We plan to add more documentation soon, but in the meantime "doc" comments are included in the source.
 
-# Contributing
+## Contributing
 
 Please do! We're happy to discuss possible additions and/or accept pull requests.
+
+## Acknowledgements
+Starting from 0.4, the parsing algorithm is taken from [seq_io](https://github.com/markschl/seq_io). While it has been slightly modified, it mostly
+comes from that library. Links to the original files are available in `src/parser/fast{a,q}.rs`.

benches/benchmark.rs

+37 −66

@@ -3,8 +3,8 @@ extern crate criterion
 extern crate needletail;
 
 use criterion::Criterion;
-use needletail::parse_sequence_reader;
-use needletail::sequence::Sequence;
+use needletail::parser::FastxReader;
+use needletail::Sequence;
 use std::fs::File;
 use std::io::{Cursor, Read};
 
@@ -22,48 +22,47 @@ fn bench_kmer_speed(c: &mut Criterion) {
     group.sample_size(10);
 
     group.bench_function("Kmer", |b| {
+        use needletail::parser::FastaReader;
         b.iter(|| {
             let mut n_total = 0;
             let mut n_canonical = 0;
             let fasta_data = Cursor::new(data.clone());
-            parse_sequence_reader(
-                fasta_data,
-                |_| {},
-                |rec| {
-                    let seq = rec.seq.normalize(true);
-                    let rc = seq.reverse_complement();
-                    for (_, _kmer, was_rc) in seq.canonical_kmers(ksize, &rc) {
-                        if !was_rc {
-                            n_canonical += 1;
-                        }
-                        n_total += 1;
+            let mut reader = FastaReader::new(fasta_data);
+
+            while let Some(record) = reader.next() {
+                let rec = record.unwrap();
+                let seq = rec.normalize(true);
+                let rc = seq.reverse_complement();
+                for (_, _kmer, was_rc) in seq.canonical_kmers(ksize, &rc) {
+                    if !was_rc {
+                        n_canonical += 1;
                     }
-                },
-            )
-            .unwrap();
+                    n_total += 1;
+                }
+            }
             assert_eq!(718_007, n_total);
             assert_eq!(350_983, n_canonical);
         });
     });
 
     group.bench_function("Bitkmer", |bench| {
+        use needletail::parser::FastaReader;
        bench.iter(|| {
            let mut n_total = 0;
            let mut n_canonical = 0;
            let fasta_data = Cursor::new(data.clone());
-            parse_sequence_reader(
-                fasta_data,
-                |_| {},
-                |seq| {
-                    for (_, _kmer, was_rc) in seq.bit_kmers(ksize, true) {
-                        if !was_rc {
-                            n_canonical += 1;
-                        }
-                        n_total += 1;
+            let mut reader = FastaReader::new(fasta_data);
+            while let Some(record) = reader.next() {
+                let rec = record.unwrap();
+                let seq = rec.strip_returns();
+                for (_, _kmer, was_rc) in seq.bit_kmers(ksize, true) {
+                    if !was_rc {
+                        n_canonical += 1;
                     }
-                },
-            )
-            .unwrap();
+                    n_total += 1;
+                }
+            }
+
            assert_eq!(718_007, n_total);
            assert_eq!(350_983, n_canonical);
        });
@@ -111,28 +110,14 @@ fn bench_fastq_file(c: &mut Criterion) {
     });
 
     group.bench_function("Needletail", |bench| {
+        use needletail::parser::FastqReader;
        bench.iter(|| {
            let fastq_data = Cursor::new(data.clone());
            let mut n_bases = 0;
-            parse_sequence_reader(
-                fastq_data,
-                |_| {},
-                |seq| {
-                    n_bases += seq.seq.len();
-                },
-            )
-            .unwrap();
-            assert_eq!(250_000, n_bases);
-        });
-    });
-
-    group.bench_function("Needletail (No Buffer)", |bench| {
-        use needletail::formats::{FastqParser, RecParser};
-        bench.iter(|| {
-            let mut reader = FastqParser::from_buffer(&data, true);
-            let mut n_bases = 0;
-            for seq in reader.by_ref() {
-                n_bases += seq.unwrap().seq.len();
+            let mut reader = FastqReader::new(fastq_data);
+            while let Some(record) = reader.next() {
+                let rec = record.unwrap();
+                n_bases += rec.seq().len();
            }
            assert_eq!(250_000, n_bases);
        });
@@ -178,28 +163,14 @@ fn bench_fasta_file(c: &mut Criterion) {
     });
 
     group.bench_function("Needletail", |bench| {
+        use needletail::parser::FastaReader;
        bench.iter(|| {
            let fasta_data = Cursor::new(data.clone());
+            let mut reader = FastaReader::new(fasta_data);
            let mut n_bases = 0;
-            parse_sequence_reader(
-                fasta_data,
-                |_| {},
-                |seq| {
-                    n_bases += seq.seq.len();
-                },
-            )
-            .unwrap();
-            assert_eq!(738_580, n_bases);
-        });
-    });
-
-    group.bench_function("Needletail (No Buffer)", |bench| {
-        use needletail::formats::{FastaParser, RecParser};
-        bench.iter(|| {
-            let mut reader = FastaParser::from_buffer(&data, true);
-            let mut n_bases = 0;
-            for rec in reader.by_ref() {
-                n_bases += rec.unwrap().seq.strip_returns().len();
+            while let Some(result) = reader.next() {
+                let record = result.unwrap();
+                n_bases += record.num_bases();
            }
            assert_eq!(738_580, n_bases);
        });