1
+ """
2
+ Generate a user-defined fusion file
3
+
4
+ Input file should be a text file containing
5
+ 1. gene name and
6
+ 2. transcript name (optional)
7
+
8
+ For example:
9
+ gene1 transcript1
10
+ gene2
11
+ gene3 transcript3
12
+ """
13
+ __author__ = "Kai"
14
+ __date__ = "20/05/2020"
15
+
16
+ import argparse
17
+
18
+
19
+ def read_genelist (inputf ):
20
+ with open (inputf , "r" ) as fh :
21
+ for line in fh :
22
+ yield line .rstrip ("\n " ).split ()
23
+
24
+
25
+ def make_fusion_gene (gene , fw , refflat ):
26
+ # no transcript specified --> use the longest transcript
27
+ if len (gene ) == 1 :
28
+ transcripts = {}
29
+ with open (refflat , "r" ) as fh :
30
+ for line in fh :
31
+ if gene [0 ] not in line :
32
+ continue
33
+ _ , transcript , chrom , _ , start , end , _ , _ , _ , exonstart , exonend = line .rstrip ("\n " ).split ("\t " )
34
+ transcripts [transcript ] = (chrom , start , end , exonstart , exonend )
35
+ transcript = get_longest_transcript (transcripts .keys (), refflat )
36
+ chrom , start , end , exonstart , exonend = transcripts [transcript ]
37
+
38
+ # use user-specified transcript
39
+ elif len (gene ) == 2 :
40
+ with open (refflat , "r" ) as fh :
41
+ for line in fh :
42
+ if gene [1 ] not in line :
43
+ continue
44
+ _ , transcript , chrom , _ , start , end , _ , _ , _ , exonstart , exonend = line .rstrip ("\n " ).split ("\t " )
45
+ break
46
+
47
+ # write to a file
48
+ header = f">{ gene [0 ]} _{ transcript } ,{ chrom } :{ start } -{ end } \n "
49
+ fw .write (header )
50
+ exons = list (zip (exonstart .split ("," ), exonend .split ("," )))[:- 1 ]
51
+ for index , each_exon in enumerate (exons , start = 1 ):
52
+ fw .write (f'{ index } ,{ each_exon [0 ]} ,{ each_exon [1 ]} \n ' )
53
+ fw .write ("\n " )
54
+
55
+
56
+ def get_longest_transcript (transcripts , refflat ):
57
+ longest_length = 0
58
+ longest_transcript = ""
59
+ with open (refflat , "r" ) as fh :
60
+ for line in fh :
61
+ line = line .strip ().split ()
62
+ if line [1 ] in transcripts :
63
+ length = int (line [5 ]) - int (line [4 ])
64
+ if length > longest_length :
65
+ longest_length = length
66
+ longest_transcript = line [1 ]
67
+ return longest_transcript
68
+
69
+
70
+ if __name__ == "__main__" :
71
+ parser = argparse .ArgumentParser (description = __doc__ )
72
+ parser .add_argument ("input" , help = "Input filename" )
73
+ parser .add_argument ("-r" , "--refflat" , required = True , help = "Path to the refFlat.txt file, need to be downloaded from UCSC in advance" )
74
+ parser .add_argument ("-o" , "--output" , required = True , help = "The output filename" )
75
+ args = parser .parse_args ()
76
+
77
+ with open (args .output , "w" ) as fw :
78
+ for gene in read_genelist (args .input ):
79
+ make_fusion_gene (gene , fw , args .refflat )
0 commit comments