Skip to content

Commit

Permalink
added threshold option, fixed NTC barcode max, allow for .gz fastqs
Browse files Browse the repository at this point in the history
  • Loading branch information
Sarah K. Harrison committed Jun 14, 2024
1 parent 08c0444 commit c0d0537
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 178 deletions.
64 changes: 21 additions & 43 deletions align.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,33 @@ usage()
{
cat << EOF
usage:
bash /data/apps/src/blastn.sh $1 $2
$1 - input fastfile (format is checked, but blastn only takes fasta)
$2 - reference fasta
$3 - threads
bash /data/apps/src/blastn.sh \$1 \$2 \$3 \$4
\$1 - input fastfile (format is checked, but blastn only takes fasta)
\$2 - reference fasta
\$3 - threads
\$4 - reference threshold (identity and length cutoffs)
example:
input="/data/seqdata/phase2/initial_analyses/NB_BA-R9/fastq/9.fastq"
REF="/data/refdata/phase2/amplicons/blastdb_all_amplicons"
bash /data/apps/src/blastn.sh "$input" "$REF"
refThresh="90"
bash /data/apps/src/blastn.sh "\$input" "\$REF" "\$THREADS" "\$refThresh"
EOF
}
if [[ "$1" == "" ]]; then usage; exit; fi
if [[ "$2" == "" ]]; then usage; exit; fi
if [[ "$2" == "" ]]; then usage; exit; fi
if [[ "$3" == "" ]]; then usage; exit; fi
if [[ "$4" == "" ]]; then usage; exit; fi

input="$1"
REF="$2"
THREADS="$3"
refThresh="$4"

# make other variables and directories
base=$(basename "$input" | sed 's/\.f.*//')
ref_base=$(basename "$REF" | sed 's/\.f.*//')
#indir=$(dirname $(dirname "$input"))
indir=$(dirname "$input")
workdir="$indir/align_blastn/$base-V-$ref_base"

Expand All @@ -50,17 +54,12 @@ fi

mkdir -p "$workdir"





# NOTE: could be >1 output line per sequence
# convert fastq input to fasta 'on the fly'
blast6()
{
bn=$(basename "$1")
# check format

if [[ $(head -c1 "$1") == "@" ]]; then
blastn -num_threads 1 -db "$REF" -query <(sed -n '1~4s/^@/>/p;2~4p' "$1" 2> /dev/null) -outfmt 6 > "$workdir/blast_out/$bn.b6" 2> /dev/null
elif [[ $(head -c1 "$1") == ">" ]]; then
Expand All @@ -75,7 +74,13 @@ export -f blast6
# split fastq into 100,000 seqs per file
mkdir -p "$workdir/split"
mkdir -p "$workdir/blast_out"
split -l 400000 "$input" "$workdir/split/seq-"

if [[ "$input" == *.gz ]]; then
gunzip -c "$input" | split -l 400000 - "$workdir/split/seq-"
else
split -l 400000 "$input" "$workdir/split/seq-"
fi

if [[ ! -f "$workdir/parallel.blast.complete" ]]; then
find "$workdir/split" -name "seq-*" > "$workdir/parallel.blast"
parallel --arg-file "$workdir/parallel.blast" --jobs="$THREADS" blast6
Expand All @@ -85,13 +90,10 @@ fi
# FILTER 1
# filter b6 for best hit per seq (based on identity $3)
cat "$workdir/blast_out/"*.b6 | awk -F'\t' '{tmp[$1]=$3; if(arr[$1]==""){arr[$1]=$3; filtered[$1]=$0}else{if($3>arr[$1]){filtered[$1]=$0}}}END{for(h in filtered){print(filtered[h])}}' > "$workdir/$base.b6"
# FILTER 2
# filter on identity (>=90%) and alignment length (>= 90% length of classified amplicon)
awk -F'\t' '{if(NR==FNR){ref[$1]=length($2)}else{if($3>=90){if($4>=(0.9*ref[$2])){print($0)}}}}' <(sed $'$!N;s/\\\n/\t/' "$REF.fasta" 2> /dev/null | sed 's/^>//') "$workdir/$base.b6" > "$workdir/$base.filter2.b6"




# FILTER 2
# filter on identity and alignment length based on refThresh
awk -F'\t' -v thresh="$refThresh" '{if(NR==FNR){ref[$1]=length($2)}else{if($3>=thresh){if($4>=(thresh/100*ref[$2])){print($0)}}}}' <(sed $'$!N;s/\\\n/\t/' "$REF.fasta" 2> /dev/null | sed 's/^>//') "$workdir/$base.b6" > "$workdir/$base.filter2.b6"

# counts for specific references
cut -f2 "$workdir/$base.filter2.b6" | sort | uniq -c | sed -e 's/^ \+//' -e 's/ /\t/' | awk -F'\t' '{printf("%s\t%s\n",$2,$1)}' > "$workdir/rname_count.tsv"
Expand All @@ -107,39 +109,15 @@ printf "totals\treads_unmapped\t$reads_unmapped\n" >> "$workdir/df_rname_count.t
# mapped/classified as specific 'references'
awk '{printf("%s\t%s\n","references",$0)}' "$workdir/rname_count.tsv" >> "$workdir/df_rname_count.tsv"









# get headers, get counts of mapped reads per header
grep "^>" "$REF.fasta" | sort -V | sed -e 's/^>//' -e 's/ .*//' | while read r1; do
# r1=$(printf "%s" "$ref" | awk -F'|' '{print($1)}')
# r2=$(printf "%s" "$ref" | awk -F'|' '{print($2)}')
ref_count=$(grep -P "^$r1\t" "$workdir/rname_count.tsv" | cut -f2)
if [[ "$ref_count" == "" ]]; then
ref_count="0"
fi
printf "%s\t%s\n" "$r1" "$ref_count"
done > "$workdir/df_all_ref_counts.tsv"






# clean up
rm -rf "$workdir/blast_out"
rm -rf "$workdir/split"









71 changes: 46 additions & 25 deletions frontend/src/components/AnalysisForm.vue
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
CONSEQUENTIAL, SPECIAL OR OTHER DAMAGES ARISING FROM THE USE OF, OR INABILITY TO USE,
THE MATERIAL, INCLUDING, BUT NOT LIMITED TO, ANY DAMAGES FOR LOST PROFITS.
-->
<template>
<template>
<v-card :height="height" class="d-flex flex-column">
<v-card-title>Analysis Form</v-card-title>
<v-card-text>
Expand All @@ -38,7 +38,6 @@
<span>
The input directory must be the full path to the 'fastq_pass' subfolder of an ONT sequencing run,
and have sub-folders named for each demultiplexed (demux) barcode (this is default for demux by MinKNOW).
Currently, sub-folder names are expected to be from 'barcode01' to 'barcode12'.
</span>
</v-tooltip>

Expand Down Expand Up @@ -90,7 +89,7 @@
></v-text-field>
</template>
<span>
Additionally, one of the 12 barcodes is reserved for a NTC (nontemplate control).
Additionally, one or more of the barcodes is reserved for a NTC (nontemplate control).
This is used for the final threshold for calling a positive amplicon (T3).
</span>
</v-tooltip>
Expand Down Expand Up @@ -134,28 +133,46 @@
that the thread count be set no greater than half the total threads available on the system.
</span>
</v-tooltip>

<!-- New refThresh field -->
<v-tooltip bottom>
<template v-slot:activator="{ on, attrs }">
<v-text-field
v-model="refThresh"
v-bind="attrs"
v-on="on"
label="Reference Alignment and Identity Threshold"
type="text"
:rules="[rules.required]"
required
></v-text-field>
</template>
<span>
The reference alignment and identity threshold for considering a hit,
for example '90' means the amplicon read alignment must cover >= 90% of the reference length and be >=90% the identity.
</span>
</v-tooltip>

</v-form>
</v-card-text>
<v-row class="mt-3">
<v-col>
<v-btn
color="success"
:disabled="!valid"
class="ml-6"
@click="startAnalysis"
>
Start
</v-btn>
<v-btn
color="error"
:disabled="!stopvalid"
class="ml-2"
@click="stopAnalysis"
>
Stop
</v-btn>
</v-col>
</v-row>
<v-card-actions class="justify-end">
<v-btn
color="success"
:disabled="!valid"
class="ml-6"
@click="startAnalysis"
>
Start
</v-btn>
<v-btn
color="error"
:disabled="!stopvalid"
class="ml-2"
@click="stopAnalysis"
>
Stop
</v-btn>
</v-card-actions>
</v-card>
</template>

Expand All @@ -170,12 +187,13 @@ export default {
data: () => ({
valid: true,
stopvalid: false,
inputDirectory: '/app/DATA/EXAMPLE_DEMO/fastq_pass/',
inputDirectory: '/app/DATA/fastq_pass/',
refDirectory: '/app/DATA/Amplicon_blastdb/Amplicon_set',
orgFile: '/app/DATA/organisms.sh',
barcodes: 'barcode10,barcode11,barcode12',
analysisInterval: '10',
threads: '1',
refThresh: '90',
rules: {
required: value => !!value && value.trim() !== '' || 'This field is required'
}
Expand All @@ -192,7 +210,8 @@ export default {
orgFile: this.orgFile,
barcodeList: this.barcodes,
analysisInterval: this.analysisInterval,
numThreads: this.threads
numThreads: this.threads,
refThresh: this.refThresh
}

AnalysisService.startAnalysis(data)
Expand Down Expand Up @@ -227,3 +246,5 @@ export default {
max-width: 350px !important
}
</style>


Loading

0 comments on commit c0d0537

Please sign in to comment.