add diploid mapping code

cjakobson · cjakobson · commit 632e3e1d1353 · 2018-12-20T08:49:08.000-08:00
diff --git a/diploidMapping/fineMappingLod_multiSite_anova.m b/diploidMapping/fineMappingLod_multiSite_anova.m
@@ -0,0 +1,80 @@
+function [ph2] = fineMappingLod_multiSite_anova(pos1,pos2,queryPos,x,b_fwselection,residual,nHomozygousCols)
+%FINEMAPPINGLOD_MULTISITE_ANOVA Two-site ANOVA test used for fine mapping.
+%   ph2 = fineMappingLod_multiSite_anova(pos1,pos2,queryPos,x,b,residual)
+%   adds the fitted effect of column queryPos back onto the residual, splits
+%   the samples into the four joint genotype classes at columns pos1 and
+%   pos2 of x, and compares the class means with anova1 and multcompare.
+%
+%   Inputs
+%     pos1, pos2       column indices into x of the two sites being compared
+%     queryPos         column index of the locus whose effect is restored
+%     x                samples-by-predictors genotype/design matrix
+%     b_fwselection    coefficients, indexed like the columns of x
+%     residual         per-sample residuals of the fitted model
+%     nHomozygousCols  optional; last column index that uses the homozygous
+%                      1/-1 coding. Defaults to 12054+53, the value that
+%                      used to be hard-coded here.
+%
+%   Output
+%     ph2   "p-value H2": the product of the multcompare p-values (column 6)
+%           for 1/1 vs 0/1 and for 1/0 vs 0/0. H2 is the hypothesis that pos2
+%           is the causal variant (i.e. YJM/YJM == RM/YJM and RM/RM ==
+%           YJM/RM). Returns -1 when any of the four classes is empty.
+
+if nargin < 7 || isempty(nHomozygousCols)
+    nHomozygousCols = 12054+53;
+end
+
+% Put the query locus' fitted contribution back into the residual.
+yr = residual + b_fwselection(queryPos)*x(:,queryPos);
+
+% Second allele code: -1 in the homozygous columns, 0 in the het columns.
+if queryPos < (nHomozygousCols+1)   %homozygous
+    altCode = -1;
+else                                %hets
+    altCode = 0;
+end
+
+groupNames = {'1/1','1/0','0/1','0/0'};
+site1 = x(:,pos1);
+site2 = x(:,pos2);
+groupAssignments = {site1==1 & site2==1, ...
+                    site1==1 & site2==altCode, ...
+                    site1==altCode & site2==1, ...
+                    site1==altCode & site2==altCode};
+
+%%% Do not run the ANOVA analysis for positions with no crossover genotypes.
+%%%  Requires both parental classes and both types of crossovers.
+if any(cellfun(@sum,groupAssignments) == 0)
+    ph2 = -1;
+    return
+end
+
+% Label each sample with its class; 0 marks samples in none of the classes
+% (e.g. entries equal to 0 while the 1/-1 coding is in use).
+classIdx = zeros(length(yr),1);
+for g = 1:4
+    classIdx(groupAssignments{g}) = g;
+end
+toKeep = classIdx > 0;
+yr = yr(toKeep);
+group = reshape(groupNames(classIdx(toKeep)),[],1);
+
+% Use anova on the 4 genotype groups at position 1 and position 2
+[~,~,stats_anova] = anova1(yr,group,'off');
+
+%%% Perform Anova comparison of equality for all pairs of genotypes.
+[pairwise_p,~,~,gnames] = multcompare(stats_anova,'Display','off');
+
+% gnames is in anova1's order, so find each class by its label.
+idx = cellfun(@(name) find(strcmp(name,gnames)),groupNames);
+
+ph2 = pairPValue(pairwise_p,idx(1),idx(3))*pairPValue(pairwise_p,idx(2),idx(4));
+end
+
+function p = pairPValue(pairwise_p,a,b)
+% p-value (column 6) of the multcompare row that compares groups a and b.
+% Rows are matched with the smaller index in column 1, the larger in column 2.
+isPair = pairwise_p(:,1) == min(a,b) & pairwise_p(:,2) == max(a,b);
+p = pairwise_p(isPair,6);
+end
diff --git a/diploidMapping/linearMixedModel.m b/diploidMapping/linearMixedModel.m
@@ -0,0 +1,272 @@
+%perform regression and save output
+
+function [] = linearMixedModel(traitIdx,doHets,doFineMapping)
+%LINEARMIXEDMODEL Stepwise regression of one trait on genotype; saves results.
+%   traitIdx       index into the loaded 'trait' and 'filename' cell arrays
+%   doHets         if true, refit with a second block of heterozygous terms
+%   doFineMapping  if true, run two-site ANOVA fine mapping around each locus
+%   Arguments supplied as text are converted to numbers first; left as
+%   text, a flag of '0' would count as true in the if-tests below.
+if ischar(traitIdx), traitIdx=str2num(traitIdx); end
+if ischar(doHets), doHets=str2num(doHets); end
+if ischar(doFineMapping), doFineMapping=str2num(doFineMapping); end
+traitIdx
+mkdir('linearOnly');
+load('phasedVLCgenotype.mat')
+genotypes=phasedVLCgenotype;
+clear phasedVLCgenotype
+
+load('traitMerged.mat')
+load('filenameMerged.mat')
+
+phenotypes=trait{traitIdx};
+phenotypes(isnan(phenotypes))=0;
+
+[nStrains nCols]=size(genotypes);
+nLoci=nCols/4;
+
+%zero out missing growth measurements and missing genotypes
+vNoSpot=phenotypes==min(phenotypes);
+vNoGenotype=sum(genotypes==0,2)==nCols;
+
+phenotypes(vNoSpot)=0;
+phenotypes(vNoGenotype)=0;
+genotypes(vNoSpot,:)=zeros(sum(vNoSpot),nCols);
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%assume these are already Z-scored
+
+%rows are grouped in consecutive blocks of 384 per plate
+nPlates=length(phenotypes)/384;
+nStrains=length(phenotypes);
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%MODEL A
+%ignore hets; homoRM gets 1; homoYJM gets -1
+%final: only nLoci columns
+[~,temp]=size(genotypes);
+nLoci=temp/4;    
+
+modelGenotypes=zeros(nStrains,nLoci);
+for i=1:nStrains
+    vGenotype=genotypes(i,1:nLoci)-genotypes(i,(nLoci+1):(2*nLoci));
+    modelGenotypes(i,:)=vGenotype;
+end
+
+clear genotypes;
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%nuisance covariates: one indicator column per plate, then one edge column
+pseudogenotypes=zeros(nStrains,nPlates+1);
+
+%plates
+for i=1:nPlates
+    
+    vPlate=zeros(nStrains,1);
+    vPlate(((i-1)*384+1):(384*i),1)=ones(384,1);
+    pseudogenotypes(:,i)=vPlate;
+    
+end
+
+%edges
+%top
+vEdge=zeros(384,1);
+vEdge(1:24)=1;
+%bottom
+vEdge(361:384)=1;
+%sides
+for i=2:15
+    vEdge(24*(i-1)+1)=1;
+    vEdge(24*i)=1;
+end
+
+for i=1:nPlates
+    pseudogenotypes(((i-1)*384+1):(384*i),nPlates+1)=vEdge;
+end
+
+
+modelGenotypes=[pseudogenotypes modelGenotypes];
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%stepwise fit over all columns; terms enter at p<10^-3
+tic
+[b_fwselection,se,pval,inmodel,stats,nextstep,history] = stepwisefit(modelGenotypes,phenotypes,'penter',10^-3,'display','off');
+dev_fwselection = 1-stats.SSresid/stats.SStotal;
+dof_fwselection = stats.df0;
+bPos = find(inmodel);
+dof = length(bPos);
+pValues = -log10(stats.PVAL(bPos));
+[pValues,sortIndex] = sort(pValues,'descend');
+bPos = bPos(sortIndex);
+toc     %this fit takes about 25min on sherlock
+
+%now add in het terms and refit
+
+if doHets
+
+load('phasedVLCgenotype.mat')
+genotypes=phasedVLCgenotype;
+clear phasedVLCgenotype
+
+genotypes(vNoSpot,:)=zeros(sum(vNoSpot),nCols);
+
+
+nStrains=length(phenotypes);
+
+%MODEL C
+%homoRM gets 1; homoYJM gets -1
+%hets all get 1
+%final: 2*nLoci columns
+[~,temp]=size(genotypes);
+nLoci=temp/4;    
+
+modelGenotypes=zeros(nStrains,2*nLoci);
+for i=1:nStrains
+    vGenotype=[genotypes(i,1:nLoci)-genotypes(i,(nLoci+1):(2*nLoci)) genotypes(i,(2*nLoci+1):(3*nLoci))+genotypes(i,(3*nLoci+1):(4*nLoci))];
+    modelGenotypes(i,:)=vGenotype;
+end
+
+clear genotypes;
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+pseudogenotypes=zeros(nStrains,nPlates+1);
+
+%plates
+for i=1:nPlates
+    
+    vPlate=zeros(nStrains,1);
+    vPlate(((i-1)*384+1):(384*i),1)=ones(384,1);
+    pseudogenotypes(:,i)=vPlate;
+    
+end
+
+%edges
+%top
+vEdge=zeros(384,1);
+vEdge(1:24)=1;
+%bottom
+vEdge(361:384)=1;
+%sides
+for i=2:15
+    vEdge(24*(i-1)+1)=1;
+    vEdge(24*i)=1;
+end
+
+for i=1:nPlates
+    pseudogenotypes(((i-1)*384+1):(384*i),nPlates+1)=vEdge;
+end
+
+modelGenotypes=[pseudogenotypes modelGenotypes];
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%seed the refit with the terms already selected; the nLoci het columns
+%start out excluded
+tic
+inmodel=[inmodel,zeros(1,nLoci)];   %second genotype mat has nLoci more columns
+[b_fwselection,se,pval,inmodel,stats,nextstep,history] = stepwisefit(modelGenotypes,phenotypes,'penter',10^-3,'inmodel',logical(inmodel),'display','off');
+dev_fwselection = 1-stats.SSresid/stats.SStotal;
+dof_fwselection = stats.df0;
+bPos = find(inmodel);
+dof = length(bPos);
+pValues = -log10(stats.PVAL(bPos));
+[pValues,sortIndex] = sort(pValues,'descend');
+bPos = bPos(sortIndex);
+toc     %this fit takes about 5 min
+end
+%now do fine mapping
+
+%map all variants (discard those w poor pVals later)
+tic
+
+if doFineMapping
+    %neglect geometric factors (plates, edges) now and merge back later
+    posToMap=bPos(find((bPos>(nPlates+1)).*(pValues>5)'));
+    %remove those too close to either end of the genotype columns (can't fine
+    %map); the upper bound uses the real matrix width so it holds without hets
+    posToMap=posToMap(posToMap<(size(modelGenotypes,2)-10));
+    posToMap=posToMap(posToMap>(10+(nPlates+1)));
+
+    %calculate residuals for fine mapping
+    [~,~,r] = regress(phenotypes,[ones(length(phenotypes),1),modelGenotypes(:,bPos)]);
+    
+    ph2=cell(length(posToMap),1);
+    
+    for k=1:length(posToMap)
+
+        position1=posToMap(k);
+        upper=position1+10;
+        lower=position1-10;
+
+        for i = lower:upper
+            for j = lower:upper
+
+                [ph2{k}(i-lower+1,j-lower+1)] = ...
+                    fineMappingLod_multiSite_anova(i,j,position1,modelGenotypes,...
+                    b_fwselection,r);
+
+            end
+        end
+
+    end
+
+toc
+%interpret fine mapping 
+
+candidates=cell(length(posToMap),1);
+
+for i=1:length(ph2)
+    
+    [~,candidates{i}]=qtnScore(ph2{i});
+    
+end
+vResolved=false(1,length(candidates));   %defined even when nothing was mapped
+for i=1:length(candidates)
+    vResolved(i)=length(candidates{i})==1;
+end
+
+fracResolved=sum(vResolved)/length(vResolved);
+
+
+%adjust positions according to fine mapping as appropriate
+
+for i=1:length(candidates)
+    if length(candidates{i})==1
+        posToMap(i)=posToMap(i)+candidates{i}(1)-11;
+    end
+end
+end
+
+
+
+%%% Calculate percentage of variance explained by each predictor in
+%%% the model
+sumR = zeros(length(bPos),1);
+varianceExplained = zeros(length(bPos),1);
+for i = 1:length(bPos)
+    newResidual = stats.yr + b_fwselection(bPos(i))*modelGenotypes(:,bPos(i));
+    sumR(i) = sum(newResidual.^2) - stats.SSresid;
+end
+for i = 1:length(bPos)
+    varianceExplained(i) = sumR(i)/sum(sumR)*dev_fwselection;
+end
+
+
+
+
+% Remove variables that aren't needed that would clog up HD space for when
+% we save
+clear genotypes;
+clear stats; clear se; clear pval; clear domB;
+clear inmodel; clear inmodel2; clear inmodel3; clear domSubset; clear newResidual; 
+clear history; clear phasedVLCgenotype; clear modelGenotypes;
+clear secondOrderGenotype;
+
+% Save all the variables
+save(['linearOnly/' filename{traitIdx} '.mat']);
+
+
+end
+
+
+
diff --git a/diploidMapping/qtnScore.m b/diploidMapping/qtnScore.m
@@ -0,0 +1,31 @@
+function [qtnVector,candidates] = qtnScore(ph2Mat)
+%QTNSCORE Score each candidate site and pick the likely causal variant(s).
+%   [qtnVector,candidates] = qtnScore(ph2Mat) takes a matrix of p-values in
+%   which entries equal to -1 mark tests that were not run.
+%
+%   qtnVector   1-by-rows vector; element i is the smallest -log10(p) found
+%               in row i of ph2Mat, ignoring -1 entries (NaN if all are -1)
+%   candidates  indices whose score exceeds 70% of the best score, or []
+%               when the best score does not exceed -log10(0.2)
+
+candidates=[];
+
+%an empty matrix has nothing to score (the old loop left its result undefined)
+if isempty(ph2Mat)
+    qtnVector=zeros(1,0);
+    return
+end
+
+%QTN score: the weakest (minimum) -log10(p) across each row
+logP=real(-log10(ph2Mat));
+logP(ph2Mat==-1)=NaN;      %untested pairs must not influence the minimum
+qtnVector=min(logP,[],2)';
+
+%best-scoring variant
+maxQTNscore=max(qtnVector);
+
+%keep every variant within 70% of the best, provided the best is convincing
+if maxQTNscore>-log10(0.2)
+    candidates=find(qtnVector>maxQTNscore*0.7);
+end
+end
diff --git a/diploidMapping/scrapeLinearMapping.m b/diploidMapping/scrapeLinearMapping.m