Skip to content

Commit 3ba8f8b

Browse files
author
Roderick Bovee
committed
Get rust kmer/fastx code working
1 parent 2aa8b95 commit 3ba8f8b

24 files changed

+12923
-742
lines changed

Cargo.toml

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[package]
2+
name = "needletail"
3+
version = "0.1.0"
4+
authors = ["Roderick Bovee <[email protected]>"]
5+
6+
[features]
7+
default = ["gz"]
8+
gz = ["flate2"]
9+
10+
[dependencies]
11+
flate2 = { version="0.2", optional=true }
12+
memchr = "0.1.11"
13+
14+
[dev-dependencies]
15+
bencher = "0.1.1"
16+
17+
[[bench]]
18+
name = "benchmark"
19+
harness = false

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2016 Roderick Bovee
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
[![Circle CI](https://circleci.com/gh/onecodex/needletail.svg?style=shield&circle-token=65c2b7d87452dba5e8e3e967133311af478632a4)](https://circleci.com/gh/onecodex/needletail)
2+
3+
# Needletail
4+
5+
Needletail is a MIT-licensed, minimal-copying FASTA/FASTQ parser and k-mer processing library.
6+
7+
The goal is to write a fast *and* well-tested set of functions that more-specialized bioinformatics programs can use.
8+
Needletail's goal is to be as fast as the `readfq` C library at parsing FASTX files and much (i.e. 25 times) faster than equivalent Python implementations at k-mer counting.
9+
10+
For example, a simple Needletail script can count all the bases in a 2.1 gigabyte FASTQ file in 2.75 seconds while a comparable parser with `readfq` takes 3.52 seconds (see `bench` folder; measured with `%timeit -r 3 -n 3` on an Early 2015 MacBook Pro).
11+
12+
# Example
13+
14+
```rust
15+
extern crate needletail;
16+
ues std::env;
17+
use needletail::{fastx};
18+
19+
fn main() {
20+
let filename: String = env::args().nth(1).unwrap();
21+
22+
let mut n_bases = 0;
23+
fastx::fastx_file(&filename[..], |seq| {
24+
// seq.0 is the name of the record
25+
// seq.1 is the base sequence
26+
n_bases += seq.1.len();
27+
// seq.2 is an optional quality score
28+
});
29+
println!("There are {} bases in your file.", n_bases);
30+
}
31+
```
32+
33+
# Installation
34+
35+
Needletail requires `rust` and `cargo` to be installed.
36+
Please use either your local package manager (`homebrew`, `apt-get`, `pacman`, etc) or install these via [rustup](https://www.rustup.rs/).
37+
38+
To install needletail itself for development:
39+
```shell
40+
git clone https://github.com/bovee/needletail
41+
cargo test # to run tests
42+
```
43+
44+
# Getting Help
45+
46+
Questions are best directed as GitHub issues.
47+
48+
Hopefully I'll compile the documentation and put it up as a webpage soon too.
49+
50+
# Contributing
51+
52+
Please do! We're happy to discuss/mentor possible additions and/or accept pull requests.

bench/benchmark.c

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// using https://github.com/lh3/readfq for parsing
2+
// derived in part from http://stackoverflow.com/questions/19390245/how-to-parse-a-fasta-file-using-kseq-h
3+
#include <unistd.h>
4+
#include <stdio.h>
5+
#include <stdlib.h>
6+
#include "./readfq/kseq.h"
7+
KSEQ_INIT(FILE*, read)
8+
9+
int main(int argc, char* argv[]) {
10+
FILE* fp = fopen(argv[1], "r");
11+
if (fp == 0) {
12+
perror("fopen");
13+
exit(1);
14+
}
15+
kseq_t *seq = kseq_init(fileno(fp));
16+
17+
unsigned long slen = 0;
18+
while (kseq_read(seq) >= 0) {
19+
slen += seq->seq.l;
20+
}
21+
22+
printf("%lu\n", slen);
23+
24+
kseq_destroy(seq);
25+
fclose(fp);
26+
return (0);
27+
}

bench/benchmark.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/usr/bin/env python
2+
"""
3+
Unscientific benchmarking of this versus the --release rust
4+
implementation below using the %timeit Ipython magic (times in sec)
5+
6+
n_kmers, py_runtime, rust_runtime
7+
6594204, 14.4, 0.578
8+
9+
Both give identical counts on the files tested (and printing kmers out
10+
and diff'ing the results gives no difference)
11+
"""
12+
from __future__ import print_function
13+
import sys
14+
15+
from Bio.SeqIO import parse
16+
from Bio.Seq import reverse_complement
17+
18+
19+
def slid_win(seq, size=4, overlapping=True):
20+
"""Returns a sliding window along self.seq."""
21+
itr = iter(seq)
22+
if overlapping:
23+
buf = ''
24+
for _ in range(size):
25+
buf += next(itr)
26+
for l in itr:
27+
yield buf
28+
buf = buf[1:] + l
29+
yield buf
30+
else:
31+
buf = ''
32+
for l in itr:
33+
buf += l
34+
if len(buf) == size:
35+
yield buf
36+
buf = ''
37+
38+
filename = sys.argv[1]
39+
40+
n_total = 0
41+
n_canonical = 0
42+
43+
for s in parse(filename, 'fastq'):
44+
uppercase_seq = str(s.upper().seq)
45+
for kmer in slid_win(uppercase_seq, 4):
46+
canonical = min(kmer, reverse_complement(kmer))
47+
if canonical == 'CAGC':
48+
n_canonical += 1
49+
n_total += 1
50+
# print(canonical)
51+
52+
print(n_total, n_canonical)

bench/benchmark.rs

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
extern crate needletail;
2+
3+
use std::env;
4+
use needletail::{fastx, kmer};
5+
6+
7+
fn count_bases(filename: String) {
8+
let mut n_bases = 0;
9+
fastx::fastx_file(&filename[..], |seq| {
10+
n_bases += seq.1.len();
11+
}).unwrap();
12+
println!("{:?}", n_bases);
13+
}
14+
15+
16+
fn main() {
17+
let filename: String = env::args().nth(1).unwrap();
18+
19+
count_bases(filename);
20+
}

benches/benchmark.rs

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
extern crate needletail;
4+
5+
use needletail::{fastx, kmer, bitkmer};
6+
use bencher::Bencher;
7+
8+
fn check_kmer_speed(bench: &mut Bencher) {
9+
let filename = String::from("tests/data/28S.fasta");
10+
let ksize = 31;
11+
12+
bench.iter(|| {
13+
let mut n_total = 0;
14+
let mut n_canonical = 0;
15+
fastx::fastx_file(&filename[..], |seq| {
16+
// let normalized_seq = kmer::normalize(Cow::Borrowed(&seq.1), true);
17+
for k in seq.1.windows(ksize) {
18+
let l = kmer::canonical(k).into_owned();
19+
if l == k {
20+
n_canonical += 1;
21+
}
22+
n_total += 1;
23+
}
24+
});
25+
assert_eq!(213703, n_total);
26+
assert_eq!(108521, n_canonical);
27+
})
28+
}
29+
30+
fn check_bitkmer_speed(bench: &mut Bencher) {
31+
let filename = String::from("tests/data/28S.fasta");
32+
let ksize = 31;
33+
34+
bench.iter(|| {
35+
let mut n_total = 0;
36+
let mut n_canonical = 0;
37+
fastx::fastx_file(&filename[..], |seq| {
38+
for k in bitkmer::BitKmerIter::new(&seq.1, ksize) {
39+
let l = bitkmer::canonical(k);
40+
if l == k {
41+
n_canonical += 1;
42+
}
43+
n_total += 1;
44+
}
45+
});
46+
assert_eq!(213703, n_total);
47+
assert_eq!(108521, n_canonical);
48+
})
49+
}
50+
51+
52+
benchmark_group!(benches, check_kmer_speed, check_bitkmer_speed);
53+
benchmark_main!(benches);

circle.yml

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
checkout:
2+
post:
3+
- git config --global --unset url.ssh://[email protected]:.insteadof
4+
5+
dependencies:
6+
override:
7+
- curl -sS https://static.rust-lang.org/rustup.sh > rustup.sh && chmod +x ./rustup.sh && ./rustup.sh --yes
8+
9+
cache_directories:
10+
- "~/.cargo"
11+
12+
test:
13+
override:
14+
- cargo build -v --color never
15+
- cargo test -v --color never

python/TODO.txt

-10
This file was deleted.

0 commit comments

Comments
 (0)