Skip to content

Commit 4347ac2

Browse files
authored
Merge pull request #23 from ZKai0801/master
update for gen_fusion_file
2 parents 7537b48 + 0cbafd8 commit 4347ac2

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed

scripts/make_fusion_genes.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
Generate a user-defined fusion file
3+
4+
Input file should be a text file containing
5+
1. gene name and
6+
2. transcript name (optional)
7+
8+
For example:
9+
gene1 transcript1
10+
gene2
11+
gene3 transcript3
12+
"""
13+
__author__ = "Kai"
14+
__date__ = "20/05/2020"
15+
16+
import argparse
17+
18+
19+
def read_genelist(inputf):
20+
with open(inputf, "r") as fh:
21+
for line in fh:
22+
yield line.rstrip("\n").split()
23+
24+
25+
def make_fusion_gene(gene, fw, refflat):
26+
# no transcript specified --> use the longest transcript
27+
if len(gene) == 1:
28+
transcripts = {}
29+
with open(refflat, "r") as fh:
30+
for line in fh:
31+
if gene[0] not in line:
32+
continue
33+
_, transcript, chrom, _, start, end, _, _, _, exonstart, exonend = line.rstrip("\n").split("\t")
34+
transcripts[transcript] = (chrom, start, end, exonstart, exonend)
35+
transcript = get_longest_transcript(transcripts.keys(), refflat)
36+
chrom, start, end, exonstart, exonend = transcripts[transcript]
37+
38+
# use user-specified transcript
39+
elif len(gene) == 2:
40+
with open(refflat, "r") as fh:
41+
for line in fh:
42+
if gene[1] not in line:
43+
continue
44+
_, transcript, chrom, _, start, end, _, _, _, exonstart, exonend = line.rstrip("\n").split("\t")
45+
break
46+
47+
# write to a file
48+
header = f">{gene[0]}_{transcript},{chrom}:{start}-{end}\n"
49+
fw.write(header)
50+
exons = list(zip(exonstart.split(","), exonend.split(",")))[:-1]
51+
for index, each_exon in enumerate(exons, start=1):
52+
fw.write(f'{index},{each_exon[0]},{each_exon[1]}\n')
53+
fw.write("\n")
54+
55+
56+
def get_longest_transcript(transcripts, refflat):
57+
longest_length = 0
58+
longest_transcript = ""
59+
with open(refflat, "r") as fh:
60+
for line in fh:
61+
line = line.strip().split()
62+
if line[1] in transcripts:
63+
length = int(line[5]) - int(line[4])
64+
if length > longest_length:
65+
longest_length = length
66+
longest_transcript = line[1]
67+
return longest_transcript
68+
69+
70+
if __name__ == "__main__":
71+
parser = argparse.ArgumentParser(description=__doc__)
72+
parser.add_argument("input", help="Input filename")
73+
parser.add_argument("-r", "--refflat", required=True, help="Path to the refFlat.txt file, need to be downloaded from UCSC in advance")
74+
parser.add_argument("-o", "--output", required=True, help="The output filename")
75+
args = parser.parse_args()
76+
77+
with open(args.output, "w") as fw:
78+
for gene in read_genelist(args.input):
79+
make_fusion_gene(gene, fw, args.refflat)

0 commit comments

Comments
 (0)