Skip to content

Commit

Permalink
Merge pull request #111 from broadinstitute/dp-fasta-handling
Browse files Browse the repository at this point in the history
make samtools happy with spaces in fasta headers
  • Loading branch information
dpark01 committed Sep 19, 2024
2 parents b15de41 + 390b07d commit 979583f
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 10 deletions.
16 changes: 16 additions & 0 deletions test/unit/test_tools_samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@ def test_fasta_index(self):
samtools.faidx(inRef)
self.assertEqualContents(outFai, expected_fai)

def test_messy_fasta_index(self):
    """Index the 'messy-headers.fasta' fixture and check the names in the .fai.

    The fixture is copied to a temporary path before indexing so the
    test input directory is left untouched. Every sequence name in the
    resulting index must be longer than 50 characters, and there must
    be exactly 8 distinct names.
    """
    messy_fasta = os.path.join(
        util.file.get_test_input_path(), 'TestToolPicard', 'messy-headers.fasta')
    samtools = tools.samtools.SamtoolsTool()
    with util.file.tempfname('.fasta') as inRef:
        shutil.copyfile(messy_fasta, inRef)
        samtools.faidx(inRef, overwrite=True)
        with open(inRef + '.fai', 'rt') as fai:
            # the first tab-delimited column of each index row is the name
            names = [row.strip().split('\t')[0] for row in fai]
    for name in names:
        # old versions of code cut this off at "Influenza"
        self.assertGreater(len(name), 50)
    # require that all sequence names are unique
    self.assertEqual(len(set(names)), 8)

def test_isEmpty(self):
samtools = tools.samtools.SamtoolsTool()
self.assertTrue(samtools.isEmpty(os.path.join(util.file.get_test_input_path(), 'empty.bam')))
Expand Down
5 changes: 4 additions & 1 deletion tools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,10 @@ def faidx(self, inFasta, overwrite=False):
else:
return
#pysam.faidx(inFasta)
self.execute('faidx', [inFasta])
with util.file.fastas_with_sanitized_ids(inFasta, use_tmp=True) as sanitized_fastas:
sanitized_fasta = sanitized_fastas[0]
self.execute('faidx', [sanitized_fasta])
shutil.copyfile(sanitized_fasta + '.fai', outfname)

def depth(self, inBam, outFile, options=None):
""" Write a TSV file with coverage depth by position """
Expand Down
16 changes: 7 additions & 9 deletions util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,17 +647,15 @@ def sanitize_id_for_sam_rname(string_in):
return string_value

def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
    """Write a copy of a FASTA file with every header line sanitized.

    The input is processed line by line. For each header line (one
    starting with ">"), the entire text after the ">" -- not just the
    first whitespace-delimited token -- is passed through
    sanitize_id_for_sam_rname() and written back with a leading ">".
    All other lines are written unchanged apart from having surrounding
    whitespace stripped and a single "\n" appended.

    Args:
        fasta_in: path of the FASTA file to read.
        out_filepath: path of the FASTA file to write (overwritten if
            it already exists).

    Returns:
        out_filepath, for the caller's convenience.
    """
    # Open the input first so that a missing/unreadable input does not
    # leave behind an empty output file.
    with open(fasta_in, "rt") as inf, open(out_filepath, "wt") as outf:
        for line in inf:
            line = line.strip()
            if line.startswith(">"):
                line = ">" + sanitize_id_for_sam_rname(line[1:])
            outf.write(line + "\n")
    return out_filepath

@contextlib.contextmanager
Expand Down

0 comments on commit 979583f

Please sign in to comment.