# pyCistopic_kidney_raw.py
# Runs pycisTopic's compute_qc_stats on the kidney_10x fragments file.
import pycisTopic
import os
import pybiomart as pbm
from pycisTopic.qc import *
# Working directories for this run. tmp_dir is also the parent of the
# 'ray_spill' directory that is passed to compute_qc_stats further down.
projdir = "./cisTopic"
tmp_dir = "./cisTopic/tmp/"

# exist_ok=True creates the directory (and any missing parents) only when
# needed, avoiding the race between a separate os.path.exists() check and
# the subsequent os.makedirs() call.
os.makedirs(tmp_dir, exist_ok=True)

# Maps a sample label to the path of its fragments file.
# NOTE(review): the label 'kideny' looks like a typo of 'kidney'; it is kept
# unchanged because results are presumably keyed by this label -- confirm
# that nothing downstream depends on it before renaming.
fragments_dict = {'kideny': '/home/annac/datasets/kidney_10x/fragments.tsv.gz'}
# Build the TSS annotation table from Ensembl BioMart. Exactly one `dataset`
# line should be active; the alternatives for other genomes are kept below
# for reference.
# For mouse (mm39)
#dataset = pbm.Dataset(name='mmusculus_gene_ensembl', host='http://www.ensembl.org')
# For mouse (mm10)
#dataset = pbm.Dataset(name='mmusculus_gene_ensembl', host='http://nov2020.archive.ensembl.org/')
# For human (hg38)
dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
# For human (hg19)
#dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://grch37.ensembl.org/')
# For fly (dm6)
# dataset = pbm.Dataset(name='dmelanogaster_gene_ensembl', host='http://www.ensembl.org')
annot = dataset.query(
    attributes=[
        'chromosome_name',
        'transcription_start_site',
        'strand',
        'external_gene_name',
        'transcript_biotype',
    ]
)

chrom_col = 'Chromosome/scaffold name'

# Force string dtype before using the .str accessor: if the column comes back
# with mixed types (presumably possible for numeric chromosome names -- TODO
# confirm), .str.contains would yield NaN for non-string entries and the
# boolean negation below would fail.
annot[chrom_col] = annot[chrom_col].astype(str)

# Drop rows whose chromosome/scaffold name contains CHR, GL, JH or MT.
# (Named `is_excluded` rather than `filter` to avoid shadowing the builtin.)
is_excluded = annot[chrom_col].str.contains('CHR|GL|JH|MT')
# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the unfiltered query result.
annot = annot[~is_excluded].copy()

# Add the 'chr' prefix by plain concatenation. The previous
# str.replace(r'(\b\S)', r'chr\1') depended on regex mode, which is no longer
# the default in pandas >= 2.0 (the pattern would be treated literally and no
# prefix added). Note: the old regex, when active, also inserted 'chr' after
# every word boundary inside a name (e.g. after a '.'); this version only
# prefixes the start of the name.
annot[chrom_col] = 'chr' + annot[chrom_col]

annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
# Keep only transcripts annotated as protein coding.
annot = annot[annot.Transcript_type == 'protein_coding']
# Region (peak) file supplied to the QC run. A per-sample lookup was used at
# some point and is kept here for reference:
# path_to_regions = [path_to_regions[key] for key in fragments_dict.keys()]
path_to_regions = '/home/annac/datasets/kidney_10x/H_Kidney_Cancer_Chromium_Nuc_Iso_vs_SaltyEZ_vs_ComplexTissueDP_atac_peaks.bed'

# Run the pycisTopic QC computation on the fragments in `fragments_dict`,
# using the TSS annotation in `annot`. The call returns two objects, bound
# here to `metadata_bc` and `profile_data_dict`.
metadata_bc, profile_data_dict = compute_qc_stats(
    # Inputs
    fragments_dict=fragments_dict,
    tss_annotation=annot,
    path_to_regions=path_to_regions,
    label_list=None,
    # Statistics requested from the QC routine
    stats=[
        'barcode_rank_plot',
        'duplicate_rate',
        'insert_size_distribution',
        'profile_tss',
        'frip',
    ],
    # Barcode / fragment selection settings
    valid_bc=None,
    n_frag=2000,
    n_bc=None,
    remove_duplicates=True,
    # TSS-related window settings
    tss_flank_window=2000,
    tss_window=50,
    tss_minimum_signal_window=100,
    tss_rolling_window=10,
    # Execution settings; the spill directory lives under tmp_dir
    # (tmp_dir already ends with a path separator).
    n_cpu=1,
    _temp_dir=f"{tmp_dir}ray_spill",
    use_polars=True,
)