-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCombinedASEbySample.py
62 lines (54 loc) · 2.18 KB
/
CombinedASEbySample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
""" Calculate a polarized log10 pvalue based on a binomial
Assume both replicates are comparable, and can just add together the reference
and alternate counts.
"""
import pandas as pd
import numpy as np
from sys import stdout
from scipy.stats import binom_test
from glob import glob
from argparse import ArgumentParser
from collections import Counter
import re
def parse_args():
"Parse command line arguments"
parser = ArgumentParser()
parser.add_argument('--outfile', '-o', default=stdout)
parser.add_argument('files', nargs='+')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = parse_args()
out_data = {}
alt_finder = re.compile('[ {]1: ([0-9]*)')
ref_finder = re.compile('-1: ([0-9]*)')
for tissue in ['fbfemale', 'fbmale', 'oefemale', 'oemale']:
files_in_tissue = [f for f in args.files if tissue in f]
ref_counts = Counter()
alt_counts = Counter()
all_counts = Counter()
tissue_ref_counts = 0
tissue_alt_counts = 0
for fname in files_in_tissue:
header = next(open(fname))
tissue_ref_counts += int(ref_finder.findall(header)[0])
tissue_alt_counts += int(alt_finder.findall(header)[0])
tab = pd.read_table(fname, index_col=0, skip_blank_lines=True,
comment='#')
for ix in tab.index:
ref_counts[ix] += tab.loc[ix, 'ref_counts']
alt_counts[ix] += tab.loc[ix, 'alt_counts']
all_counts[ix] += tab.loc[ix, 'ref_counts'] + tab.loc[ix,
'alt_counts']
out_data[tissue] = {
ix: (( np.log10(binom_test(
[ref_counts[ix], alt_counts[ix]],
p=(tissue_ref_counts/(tissue_ref_counts +
tissue_alt_counts))
))
* (-1 if alt_counts[ix] > ref_counts[ix] else 1))
if all_counts[ix] > 10 else 0)
for ix in tab.index}
out_data[tissue]['__base__p__'] = (tissue_ref_counts /
(tissue_ref_counts + tissue_alt_counts))
pd.DataFrame(out_data).sort_index().to_csv(args.outfile, sep='\t')