Skip to content

Commit 632e3e1

Browse files
author
cjakobson
committed
add diploid mapping code
1 parent a7fdc03 commit 632e3e1

File tree

4 files changed

+1129
-0
lines changed

4 files changed

+1129
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
function [ph2] = fineMappingLod_multiSite_anova(pos1,pos2,queryPos,x,b_fwselection,residual)
2+
yr = residual;
3+
yr = yr + b_fwselection(queryPos)*x(:,queryPos);
4+
5+
groupNames = {'1/1','1/0','0/1','0/0'};
6+
if queryPos<(12054+53+1) %homozygous
7+
groupAssignments{1} = x(:,pos1)==1 & x(:,pos2) == 1;
8+
groupAssignments{2} = x(:,pos1)==1 & x(:,pos2) == -1;
9+
groupAssignments{3} = x(:,pos1)==-1 & x(:,pos2) == 1;
10+
groupAssignments{4} = x(:,pos1)==-1 & x(:,pos2) == -1;
11+
elseif queryPos>(12054+53) %hets
12+
groupAssignments{1} = x(:,pos1)==1 & x(:,pos2) == 1;
13+
groupAssignments{2} = x(:,pos1)==1 & x(:,pos2) == 0;
14+
groupAssignments{3} = x(:,pos1)==0 & x(:,pos2) == 1;
15+
groupAssignments{4} = x(:,pos1)==0 & x(:,pos2) == 0;
16+
end
17+
18+
sum(groupAssignments{1});
19+
20+
group = cell(length(yr),1);
21+
for i = 1:length(yr)
22+
for j = 1:4
23+
if groupAssignments{j}(i) == 1
24+
group{i} = groupNames{j};
25+
end
26+
end
27+
end
28+
29+
toKeep= find(~cellfun(@isempty,group));
30+
31+
yr = yr(toKeep);
32+
group = group(toKeep);
33+
for i = 1:4
34+
groupAssignments{i} = groupAssignments{i}(toKeep);
35+
end
36+
37+
%%%%%%%%%%%%%%%
38+
39+
%%% Do not run the ANOVA analysis for positions with no crossover genotypes.
40+
%%% Requires both types of crossovers.
41+
if sum(groupAssignments{1}) > 0 && sum(groupAssignments{4}) > 0
42+
if sum(groupAssignments{2}) > 0 && sum(groupAssignments{3}) > 0
43+
44+
% Use anova on the 4 genotype groups at position 1 and position 2
45+
[p_anova,tbl_anova,stats_anova] = anova1(yr,group,'off');
46+
47+
%%% Perform Anova comparison of equality for all pairs of genotypes.
48+
[pairwise_p,mean_group,handle,gnames] = multcompare(stats_anova,'Display','off');
49+
50+
index1 = find(strcmp(groupNames{1},gnames));
51+
index2 = find(strcmp(groupNames{2},gnames));
52+
index3 = find(strcmp(groupNames{3},gnames));
53+
index4 = find(strcmp(groupNames{4},gnames));
54+
55+
ph1_index1 = find(pairwise_p(:,1) == min(index1,index2) & pairwise_p(:,2) == max(index1,index2));
56+
ph1_index2 = find(pairwise_p(:,1) == min(index3,index4) & pairwise_p(:,2) == max(index3,index4));
57+
ph2_index1 = find(pairwise_p(:,1) == min(index1,index3) & pairwise_p(:,2) == max(index3,index1));
58+
ph2_index2 = find(pairwise_p(:,1) == min(index2,index4) & pairwise_p(:,2) == max(index4,index2));
59+
60+
%%% ph1 stands for p-value H1, the likelihood that hypothesis 1 is
61+
%%% true. H1 is that pos1 is the causal variant (i.e. YJM/YJM ==
62+
%%% YJM/RM and RM/RM == RM/YJM)
63+
ph1 = pairwise_p(ph1_index1,6)*pairwise_p(ph1_index2,6);
64+
65+
%%% ph2 stands for p-value H2, the likelihood that hypothesis 2 is
66+
%%% true. H1 is that pos2 is the causal variant. This is the alternate hypothesis. (i.e. YJM/YJM ==
67+
%%% RM/YJM and RM/RM == YJM/RM)
68+
ph2 = pairwise_p(ph2_index1,6)*pairwise_p(ph2_index2,6);
69+
else
70+
p_anova = -1;
71+
pairwise_p = -1;
72+
ph1 = -1;
73+
ph2 = -1;
74+
end
75+
else
76+
p_anova = -1;
77+
pairwise_p = -1;
78+
ph1 = -1;
79+
ph2 = -1;
80+
end

diploidMapping/linearMixedModel.m

+272
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
%perform regression and save output
2+
3+
function [] = linearMixedModel(traitIdx,doHets,doFineMapping)
4+
5+
6+
if ischar(traitIdx)
7+
traitIdx=str2num(traitIdx);
8+
end
9+
10+
traitIdx
11+
12+
mkdir('linearOnly');
13+
14+
15+
load('phasedVLCgenotype.mat')
16+
genotypes=phasedVLCgenotype;
17+
clear phasedVLCgenotype
18+
19+
load('traitMerged.mat')
20+
load('filenameMerged.mat')
21+
22+
phenotypes=trait{traitIdx};
23+
phenotypes(isnan(phenotypes))=0;
24+
25+
[nStrains nCols]=size(genotypes);
26+
nLoci=nCols/4;
27+
28+
%zero out missing growth measurements and missing genotypes
29+
vNoSpot=phenotypes==min(phenotypes);
30+
vNoGenotype=sum(genotypes==0,2)==nCols;
31+
32+
phenotypes(vNoSpot)=0;
33+
phenotypes(vNoGenotype)=0;
34+
genotypes(vNoSpot,:)=zeros(sum(vNoSpot),nCols);
35+
36+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
37+
38+
%assume these are already Z-scored
39+
40+
41+
nPlates=length(phenotypes)/384;
42+
nStrains=length(phenotypes);
43+
44+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
45+
%MODEL A
46+
%ignore hets; homoRM gets 1; homoYJM gets -1
47+
%final: only nLoci columns
48+
[~,temp]=size(genotypes);
49+
nLoci=temp/4;
50+
51+
modelGenotypes=zeros(nStrains,nLoci);
52+
for i=1:nStrains
53+
vGenotype=genotypes(i,1:nLoci)-genotypes(i,(nLoci+1):(2*nLoci));
54+
modelGenotypes(i,:)=vGenotype;
55+
end
56+
57+
clear genotypes;
58+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
59+
60+
pseudogenotypes=zeros(nStrains,nPlates+1);
61+
62+
%plates
63+
for i=1:nPlates
64+
65+
vPlate=zeros(nStrains,1);
66+
vPlate(((i-1)*384+1):(384*i),1)=ones(384,1);
67+
pseudogenotypes(:,i)=vPlate;
68+
69+
end
70+
71+
%edges
72+
%top
73+
vEdge=zeros(384,1);
74+
vEdge(1:24)=1;
75+
%bottom
76+
vEdge(361:384)=1;
77+
%sides
78+
for i=2:15
79+
vEdge(24*(i-1)+1)=1;
80+
vEdge(24*i)=1;
81+
end
82+
83+
for i=1:nPlates
84+
pseudogenotypes(((i-1)*384+1):(384*i),nPlates+1)=vEdge;
85+
end
86+
87+
88+
modelGenotypes=[pseudogenotypes modelGenotypes];
89+
90+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
91+
92+
tic
93+
[b_fwselection,se,pval,inmodel,stats,nextstep,history] = stepwisefit(modelGenotypes,phenotypes,'penter',10^-3,'display','off');
94+
dev_fwselection = 1-stats.SSresid/stats.SStotal;
95+
dof_fwselection = stats.df0;
96+
bPos = find(inmodel);
97+
dof = length(bPos);
98+
pValues = -log10(stats.PVAL(bPos));
99+
[pValues,sortIndex] = sort(pValues,'descend');
100+
bPos = bPos(sortIndex);
101+
toc %this fit takes about 25min on sherlock
102+
103+
%now add in het terms and refit
104+
105+
if doHets
106+
107+
load('phasedVLCgenotype.mat')
108+
genotypes=phasedVLCgenotype;
109+
clear phasedVLCgenotype
110+
111+
genotypes(vNoSpot,:)=zeros(sum(vNoSpot),nCols);
112+
113+
114+
nStrains=length(phenotypes);
115+
116+
%MODEL C
117+
%homoRM gets 1; homoYJM gets -1
118+
%hets all get 1
119+
%final: 2*nLoci columns
120+
[~,temp]=size(genotypes);
121+
nLoci=temp/4;
122+
123+
modelGenotypes=zeros(nStrains,2*nLoci);
124+
for i=1:nStrains
125+
vGenotype=[genotypes(i,1:nLoci)-genotypes(i,(nLoci+1):(2*nLoci)) genotypes(i,(2*nLoci+1):(3*nLoci))+genotypes(i,(3*nLoci+1):(4*nLoci))];
126+
modelGenotypes(i,:)=vGenotype;
127+
end
128+
129+
clear genotypes;
130+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
131+
132+
pseudogenotypes=zeros(nStrains,nPlates+1);
133+
134+
%plates
135+
for i=1:nPlates
136+
137+
vPlate=zeros(nStrains,1);
138+
vPlate(((i-1)*384+1):(384*i),1)=ones(384,1);
139+
pseudogenotypes(:,i)=vPlate;
140+
141+
end
142+
143+
%edges
144+
%top
145+
vEdge=zeros(384,1);
146+
vEdge(1:24)=1;
147+
%bottom
148+
vEdge(361:384)=1;
149+
%sides
150+
for i=2:15
151+
vEdge(24*(i-1)+1)=1;
152+
vEdge(24*i)=1;
153+
end
154+
155+
for i=1:nPlates
156+
pseudogenotypes(((i-1)*384+1):(384*i),nPlates+1)=vEdge;
157+
end
158+
159+
modelGenotypes=[pseudogenotypes modelGenotypes];
160+
161+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
162+
163+
164+
165+
tic
166+
inmodel=[inmodel,zeros(1,nLoci)]; %second genotype mat has nLoci more columns
167+
[b_fwselection,se,pval,inmodel,stats,nextstep,history] = stepwisefit(modelGenotypes,phenotypes,'penter',10^-3,'inmodel',logical(inmodel),'display','off');
168+
dev_fwselection = 1-stats.SSresid/stats.SStotal;
169+
dof_fwselection = stats.df0;
170+
bPos = find(inmodel);
171+
dof = length(bPos);
172+
pValues = -log10(stats.PVAL(bPos));
173+
[pValues,sortIndex] = sort(pValues,'descend');
174+
bPos = bPos(sortIndex);
175+
toc %this fit takes about 5 min
176+
end
177+
%now do fine mapping
178+
179+
%map all variants (discard those w poor pVals later)
180+
tic
181+
182+
if doFineMapping
183+
%neglect geometric factors (plates, edges) now and merge back later
184+
posToMap=bPos(find((bPos>(nPlates+1)).*(pValues>5)'));
185+
186+
%remove those too close to the end (can't fine map)
187+
posToMap=posToMap(posToMap<(12054*2-10+(nPlates+1)));
188+
posToMap=posToMap(posToMap>(10+(nPlates+1)));
189+
190+
%calculate residuals for fine mapping
191+
[~,~,r] = regress(phenotypes,[ones(length(phenotypes),1),modelGenotypes(:,bPos)]);
192+
193+
ph2=cell(length(posToMap),1);
194+
195+
for k=1:length(posToMap)
196+
197+
position1=posToMap(k);
198+
upper=position1+10;
199+
lower=position1-10;
200+
201+
for i = lower:upper
202+
for j = lower:upper
203+
204+
[ph2{k}(i-lower+1,j-lower+1)] = ...
205+
fineMappingLod_multiSite_anova(i,j,position1,modelGenotypes,...
206+
b_fwselection,r);
207+
208+
end
209+
end
210+
211+
end
212+
213+
toc
214+
%interpret fine mapping
215+
216+
candidates=cell(length(posToMap),1);
217+
218+
for i=1:length(ph2)
219+
220+
[~,candidates{i}]=qtnScore(ph2{i});
221+
222+
end
223+
224+
for i=1:length(candidates)
225+
vResolved(i)=length(candidates{i})==1;
226+
end
227+
228+
fracResolved=sum(vResolved)/length(vResolved);
229+
230+
231+
%adjust positions according to fine mapping as appropriate
232+
233+
for i=1:length(candidates)
234+
if length(candidates{i})==1
235+
posToMap(i)=posToMap(i)+candidates{i}(1)-11;
236+
end
237+
end
238+
end
239+
240+
241+
242+
%%% Calculate percentage of variance explained by each predictor in
243+
%%% the model
244+
sumR = zeros(length(bPos),1);
245+
varianceExplained = zeros(length(bPos),1);
246+
for i = 1:length(bPos)
247+
newResidual = stats.yr + b_fwselection(bPos(i))*modelGenotypes(:,bPos(i));
248+
sumR(i) = sum(newResidual.^2) - stats.SSresid;
249+
end
250+
for i = 1:length(bPos)
251+
varianceExplained(i) = sumR(i)/sum(sumR)*dev_fwselection;
252+
end
253+
254+
255+
256+
257+
% Remove variables that aren't needed that would clog up HD space for when
258+
% we save
259+
clear genotypes;
260+
clear stats; clear se; clear pval; clear domB;
261+
clear inmodel; clear inmodel2; clear inmodel3; clear domSubset; clear newResidual;
262+
clear history; clear phasedVLCgenotype; clear modelGenotypes;
263+
clear secondOrderGenotype;
264+
265+
% Save all the variables
266+
save(['linearOnly/' filename{traitIdx} '.mat']);
267+
268+
269+
end
270+
271+
272+

diploidMapping/qtnScore.m

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
function [qtnVector,candidates] = qtnScore(ph2Mat)
2+
3+
[rows cols]=size(ph2Mat);
4+
5+
qtnVector=0;
6+
7+
%first calculate qtnScore (min of ph2 for each candidate position)
8+
for i=1:rows
9+
10+
toAnalyze=ph2Mat(i,:);
11+
toAnalyze(toAnalyze==-1)=NaN;
12+
logP=real(-log10(toAnalyze));
13+
maxP(i)=min(logP);
14+
15+
end
16+
17+
qtnVector=maxP;
18+
19+
%now determine true causal variant(s) from QTN scores
20+
21+
%first determine maximum QTN score variant
22+
[maxQTNscore,maxIdx]=max(qtnVector);
23+
24+
%now check for other candidate variants
25+
if maxQTNscore>-log10(0.2)
26+
candidates=find(qtnVector>maxQTNscore*0.7);
27+
else
28+
candidates=[];
29+
end
30+
31+
end

0 commit comments

Comments
 (0)