-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparseinput_nested.js
165 lines (156 loc) · 5.39 KB
/
parseinput_nested.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/** Parse sieve analysis input files.
* Expected input files:
* FASTA file with vaccine ID and AA sequence
* FASTA file with breakthrough sequences and IDs
* CSV with seqID:treatment (vaccine/placebo) treatment
* CSV with sequence mismatches (relative to vaccine) for each seqID */
/** 2D-array (of chars) representing AAs at each position in the sequence
* for the vaccine and each sequence ID */
var sequences_raw;
/** Object holding a 2D-array of sequences for both the vaccine and placebo groups */
var sequences;
/** Object (dictionary) of sequence IDs with AA sequence (char array),
* vac/plac, and mismatch (boolean array) */
var seqID_lookup;
/** Object with vaccine ID and AA sequence */
var vaccine;
/** Array with conservation and hxb2 info for each position */
var envmap;
/* Lookup table with index for each hxb2 position*/
var hxb2map = {};
/** Number of people in the vaccine group */
var numvac = 0;
/** Number of people in the placebo group */
var numplac = 0;
/** Array of p-values */
var pvalues =[];
/** Array of Entropy Values */
var entropies = {full:[],vaccine:[],placebo:[]};
// Load every input file, build the shared data structures, compute the
// per-position entropies and finally start the visualization.
// The main requests are chained because each parser relies on state created
// by the previous one (vaccine -> treatment lookup -> mismatches -> sequences -> map).
d3.text("data/env.aa.92TH023.fasta", function(vacdata) {
    dovacparsing(vacdata);

    // The p-values request runs in parallel with the main chain. The
    // visualization is started only after BOTH have finished, so pvalues can
    // never be observed half-loaded (previously nothing waited for it).
    var pendingLoads = 2;
    var startWhenLoaded = function() {
        pendingLoads -= 1;
        if (pendingLoads === 0) { generateVis(); }
    };

    d3.csv("data/pvalues.csv")
        .row(function(d) { pvalues.push(+d.pvalue); })
        .get(function(error, rows) {
            // A missing p-values file is reported but does not block the
            // visualization; pvalues then simply stays empty.
            if (error) { console.error("Failed to load data/pvalues.csv", error); }
            startWhenLoaded();
        });

    d3.csv("data/rv144_trt_lookup.csv", function(trt_lookup_data) {
        createdictionary(trt_lookup_data);
        d3.csv("data/rv144.env.mismatch.distance.csv", function(mmdata) {
            addmmtodict(mmdata);
            d3.text("data/rv144.env.aa.fasta", function(seqdata) {
                doseqparsing(seqdata);
                d3.csv("data/env.map.csv", function(mapdata) {
                    envmap = mapdata;
                    envmap.forEach(function(d, i) {
                        hxb2map[d.hxb2Pos] = i;
                    });

                    // Switch the matrices from row-per-sequence to row-per-position.
                    sequences_raw = transpose(sequences_raw);
                    sequences.vaccine = transpose(sequences.vaccine);
                    sequences.placebo = transpose(sequences.placebo);

                    // NOTE(review): the load box is appended and removed inside this
                    // same synchronous callback, so the browser presumably never gets
                    // a chance to paint it; and if #overview is an SVG, a <text>
                    // nested inside a <rect> is not rendered. Confirm and rework if a
                    // visible loading indicator is wanted.
                    var loadscreen = d3.select("#overview").append("rect")
                        .attr("class","loadbox")
                        .attr("height",300)
                        .attr("width",900)
                        .attr("fill","black");
                    loadscreen.append("text")
                        .attr("class","loading")
                        .attr("x",450)
                        .attr("y",150)
                        .style("text-anchor","middle")
                        .style("font-size", "30px")
                        .text("Loading");

                    // One entropy value per position, kept as a two-decimal string.
                    var i;
                    for (i = 0; i < sequences_raw.length; i++) {
                        entropies.full.push(jointentropy([i],sequences_raw,numvac+numplac).toFixed(2));
                    }
                    for (i = 0; i < sequences.vaccine.length; i++) {
                        entropies.vaccine.push(jointentropy([i],sequences.vaccine,numvac).toFixed(2));
                    }
                    for (i = 0; i < sequences.placebo.length; i++) {
                        entropies.placebo.push(jointentropy([i],sequences.placebo,numplac).toFixed(2));
                    }

                    d3.select(".loadbox").remove();
                    d3.select(".loading").remove();
                    startWhenLoaded();
                });
            });
        });
    });
});
/** Parse the vaccine FASTA text: a ">ID" header line followed by one line
* holding the AA sequence.
* Resets the shared state: `vaccine` gets `ID` and `sequence` (char array),
* `sequences_raw` becomes a matrix whose only row is the vaccine sequence,
* and `sequences` gets empty `vaccine` and `placebo` arrays.
* @param {string} vacdata raw contents of the vaccine FASTA file */
function dovacparsing(vacdata){
    var lines = vacdata.split('\n');
    vaccine = {};
    sequences = {};
    // Drop the leading ">" and trim, so a CRLF file does not leave a "\r"
    // glued to the ID (doseqparsing trims its IDs the same way).
    vaccine.ID = lines[0].slice(1).trim();
    // Strip trailing control characters (char code < 32, e.g. "\r").
    // A missing sequence line yields an empty sequence instead of a TypeError.
    var vacseq = (lines[1] || "").replace(/[\x00-\x1f]+$/, "").split("");
    vaccine.sequence = vacseq;
    // The same array instance is shared by vaccine.sequence and the matrix row.
    sequences_raw = [vacseq];
    sequences.vaccine = [];
    sequences.placebo = [];
}
/** Build seqID_lookup from the treatment table: one entry per sampleID,
* each holding a boolean `vaccine` flag (false when the treatment of the
* first row for that ID starts with "P", case-insensitively; true otherwise).
*/
function createdictionary(trt_lookup_data) {
    var bySampleID = function(row) {
        return row.sampleID;
    };
    var toTreatmentFlag = function(rows) {
        var isPlacebo = rows[0].treatment.toUpperCase().startsWith("P");
        return { "vaccine": !isPlacebo };
    };
    seqID_lookup = d3.nest()
        .key(bySampleID)
        .rollup(toTreatmentFlag)
        .map(trt_lookup_data);
}
/** Attach the mismatch values of every CSV row to its seqID_lookup entry.
* The last value of each row is taken as the sequence ID; all preceding
* values are converted to integers and stored as `mismatch`.
*/
function addmmtodict(mmdata) {
    mmdata.forEach(function(row) {
        // all column values of this row, in property order
        var cells = d3.values(row);
        // the trailing value is the sequence ID; the rest are mismatch flags
        var seqID = cells.pop();
        var flags = stringArrToIntArr(cells);
        seqID_lookup[seqID].mismatch = flags;
    });
}
/** Parse the breakthrough-sequence FASTA text.
* For every ">ID" record the AA sequence (char array) is stored on the
* matching seqID_lookup entry, appended as a row to sequences_raw and to
* sequences.vaccine or sequences.placebo (per the entry's `vaccine` flag),
* and numvac / numplac is incremented.
* A record's sequence may span several lines; all lines up to the next
* header are concatenated. Trailing control characters (char code < 32,
* e.g. "\r") are stripped from each line and blank lines are ignored.
* A header without any sequence data yields an empty sequence.
* @param {string} seqdata raw contents of the FASTA file
* @throws {Error} if a sequence ID has no entry in seqID_lookup */
function doseqparsing(seqdata) {
    var lines = seqdata.split('\n');
    var i = 0;
    while (i < lines.length) {
        // Anything before the first header is not part of a record.
        if (!lines[i].startsWith(">")) {
            i++;
            continue;
        }
        var seqID = lines[i].slice(1).trim();
        i++;
        // Collect every sequence line that belongs to this record.
        var residues = "";
        while (i < lines.length && !lines[i].startsWith(">")) {
            var chunk = lines[i].replace(/[\x00-\x1f]+$/, "");
            if (chunk.trim().length > 0) { residues += chunk; }
            i++;
        }
        // Validate before touching shared state so a bad ID cannot leave the
        // matrices and counters half-updated.
        var entry = seqID_lookup[seqID];
        if (!entry) {
            throw new Error("doseqparsing: sequence ID \"" + seqID + "\" has no entry in seqID_lookup");
        }
        var seq = residues.split("");
        entry.sequence = seq;
        sequences_raw.push(seq);
        if (entry.vaccine) {
            sequences.vaccine.push(seq);
            numvac++;
        } else {
            sequences.placebo.push(seq);
            numplac++;
        }
    }
}
/** Convert an array of strings to an array of integers.
* Returns a new array; the input is left untouched. Values are parsed as
* base-10 integers, and entries that cannot be parsed become NaN.
* @param {string[]} array values to convert
* @returns {number[]} a new array with the parsed integers */
function stringArrToIntArr(array){
    var result = [];
    for (var i = 0; i < array.length; i++){
        // explicit radix so the string is always read as decimal
        result.push(parseInt(array[i], 10));
    }
    return result;
}
/** Transpose a 2D array (rows become columns).
* The column count is taken from the first row. An empty matrix yields an
* empty array instead of throwing.
* @param {Array[]} array matrix to transpose
* @returns {Array[]} a new matrix with result[c][r] === array[r][c] */
function transpose(array) {
    if (array.length === 0) { return []; }
    return array[0].map(function (_, c) {
        return array.map(function (r) { return r[c]; });
    });
}