-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscATAC_seq simulation.py
105 lines (89 loc) · 3.85 KB
/
scATAC_seq simulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import nbinom
def simulate_single_cell_atac_seq(num_cells, num_regions, library_size, mean_read_count, dispersion):
# Generate the library size for each cell (drawn from a normal distribution)
cell_library_sizes = np.random.normal(loc=library_size, scale=library_size/2, size=num_cells).astype(int)
# Simulate read counts for each cell and region
simulated_data = np.zeros((num_cells, num_regions), dtype=int)
for cell_idx in range(num_cells):
# Simulate read counts for each region in the cell
for region_idx in range(num_regions):
# NEGATIVE BINOMIAL
read_count = nbinom.rvs(n=dispersion, p=dispersion / (dispersion + mean_read_count))
# POISSON
# read_count = poisson.rvs(mean_read_count)
# BERNOULLI
# read_count = bernoulli.rvs(mean_read_count / library_size)
# BINOMIAL
# read_count = binom.rvs(library_size, mean_read_count / library_size)
# ZERO-INFLATED NB
'''
zero_prob = 0.2 # Probability of a zero count
zero_count = np.random.choice([0, 1], p=[1 - zero_prob, zero_prob])
if zero_count == 0:
read_count = nbinom.rvs(n=dispersion, p=dispersion / (dispersion + mean_read_count))
else:
read_count = 0
'''
# Ensure the read count does not exceed the library size
read_count = min(read_count, cell_library_sizes[cell_idx])
# Assign the read count to the corresponding cell and region
simulated_data[cell_idx, region_idx] = read_count
# Subtract the assigned read count from the library size of the cell
cell_library_sizes[cell_idx] -= read_count
return simulated_data
# Parameters
num_cells = 100
num_regions = 5000
library_size = 10000
mean_read_count = 5
dispersion = 1
# Simulate data
simulated_data = simulate_single_cell_atac_seq(num_cells, num_regions, library_size, mean_read_count, dispersion)
# Generate plots to illustrate the distributions
plt.figure(figsize=(15, 6))
# Plot the Negative Binomial distribution
plt.subplot(1, 5, 1)
nbinom_data = nbinom.rvs(n=dispersion, p=dispersion / (dispersion + mean_read_count), size=num_regions)
plt.hist(nbinom_data, bins=30, density=True, color='blue', alpha=0.6)
plt.title('Negative Binomial')
plt.xlabel('Read Count')
plt.ylabel('Density')
# Plot the Poisson distribution
plt.subplot(1, 5, 2)
poisson_data = poisson.rvs(mean_read_count, size=num_regions)
plt.hist(poisson_data, bins=30, density=True, color='green', alpha=0.6)
plt.title('Poisson')
plt.xlabel('Read Count')
plt.ylabel('Density')
# Plot the Zero-Inflated Negative Binomial distribution
plt.subplot(1, 5, 3)
zero_prob = 0.2
zinf_data = []
for _ in range(num_regions):
zero_count = np.random.choice([0, 1], p=[1 - zero_prob, zero_prob])
if zero_count == 0:
zinf_data.append(nbinom.rvs(n=dispersion, p=dispersion / (dispersion + mean_read_count)))
else:
zinf_data.append(0)
plt.hist(zinf_data, bins=30, density=True, color='red', alpha=0.6)
plt.title('Zero-Inflated Negative Binomial')
plt.xlabel('Read Count')
plt.ylabel('Density')
# Plot the Bernoulli distribution
plt.subplot(1, 5, 4)
bernoulli_data = bernoulli.rvs(mean_read_count / library_size, size=num_regions)
plt.hist(bernoulli_data, bins=30, density=True, color='green', alpha=0.6)
plt.title('Bernoulli')
plt.xlabel('Read Count')
plt.ylabel('Density')
# Plot the Binomial distribution
plt.subplot(1, 5, 5)
binom_data = binom.rvs(library_size, mean_read_count / library_size, size=num_regions)
plt.hist(binom_data, bins=30, density=True, color='purple', alpha=0.6)
plt.title('Binomial')
plt.xlabel('Read Count')
plt.ylabel('Density')
plt.tight_layout()
plt.show()