Skip to content

Commit 3de3df6

Browse files
Merge pull request #158 from uclahs-cds/sfitz-combine-gvcfs
Use GVCFs for genotyping - run time/CPU hours substantially reduced (0.52)
2 parents d36fc31 + e929f14 commit 3de3df6

11 files changed

+191
-114
lines changed

CHANGELOG.md

+3
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,11 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
88
---
99

1010
## [Unreleased]
11+
### Added
12+
- Add workflow for genotyping from GVCFs
1113
### Changed
1214
- Standardize description
15+
- Update GATK to 4.5.0.0
1316

1417
---
1518

config/F16.config

+12-2
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ process {
1111
cpus = 1
1212
memory = 1.GB
1313
}
14-
withName: run_HaplotypeCallerVCF_GATK {
14+
withName: run_HaplotypeCallerGVCF_GATK {
1515
cpus = 2
1616
memory = 4.GB
1717
retry_strategy {
@@ -21,7 +21,17 @@ process {
2121
}
2222
}
2323
}
24-
withName: run_HaplotypeCallerGVCF_GATK {
24+
withName: run_CombineGVCFs_GATK {
25+
cpus = 2
26+
memory = 4.GB
27+
retry_strategy {
28+
memory {
29+
strategy = 'exponential'
30+
operand = 2
31+
}
32+
}
33+
}
34+
withName: run_GenotypeGVCFs_GATK {
2535
cpus = 2
2636
memory = 4.GB
2737
retry_strategy {

config/F32.config

+12-2
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ process {
1111
cpus = 1
1212
memory = 1.GB
1313
}
14-
withName: run_HaplotypeCallerVCF_GATK {
14+
withName: run_HaplotypeCallerGVCF_GATK {
1515
cpus = 2
1616
memory = 4.GB
1717
retry_strategy {
@@ -21,7 +21,17 @@ process {
2121
}
2222
}
2323
}
24-
withName: run_HaplotypeCallerGVCF_GATK {
24+
withName: run_CombineGVCFs_GATK {
25+
cpus = 2
26+
memory = 4.GB
27+
retry_strategy {
28+
memory {
29+
strategy = 'exponential'
30+
operand = 2
31+
}
32+
}
33+
}
34+
withName: run_GenotypeGVCFs_GATK {
2535
cpus = 2
2636
memory = 4.GB
2737
retry_strategy {

config/F72.config

+14-4
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ process {
1111
cpus = 1
1212
memory = 1.GB
1313
}
14-
withName: run_HaplotypeCallerVCF_GATK {
14+
withName: run_HaplotypeCallerGVCF_GATK {
1515
cpus = 3
1616
memory = 7.GB
1717
retry_strategy {
@@ -21,9 +21,19 @@ process {
2121
}
2222
}
2323
}
24-
withName: run_HaplotypeCallerGVCF_GATK {
25-
cpus = 3
26-
memory = 7.GB
24+
withName: run_CombineGVCFs_GATK {
25+
cpus = 2
26+
memory = 4.GB
27+
retry_strategy {
28+
memory {
29+
strategy = 'exponential'
30+
operand = 2
31+
}
32+
}
33+
}
34+
withName: run_GenotypeGVCFs_GATK {
35+
cpus = 2
36+
memory = 4.GB
2737
retry_strategy {
2838
memory {
2939
strategy = 'exponential'

config/M64.config

+14-4
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ process {
1111
cpus = 1
1212
memory = 1.GB
1313
}
14-
withName: run_HaplotypeCallerVCF_GATK {
14+
withName: run_HaplotypeCallerGVCF_GATK {
1515
cpus = 3
1616
memory = 7.GB
1717
retry_strategy {
@@ -21,9 +21,19 @@ process {
2121
}
2222
}
2323
}
24-
withName: run_HaplotypeCallerGVCF_GATK {
25-
cpus = 3
26-
memory = 7.GB
24+
withName: run_CombineGVCFs_GATK {
25+
cpus = 2
26+
memory = 4.GB
27+
retry_strategy {
28+
memory {
29+
strategy = 'exponential'
30+
operand = 2
31+
}
32+
}
33+
}
34+
withName: run_GenotypeGVCFs_GATK {
35+
cpus = 2
36+
memory = 4.GB
2737
retry_strategy {
2838
memory {
2939
strategy = 'exponential'

config/default.config

+1-1
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ params {
1616

1717
docker_container_registry = "ghcr.io/uclahs-cds"
1818

19-
gatk_version = "4.2.4.1"
19+
gatk_version = "4.5.0.0"
2020
picard_version = "2.26.10"
2121
pipeval_version = "4.0.0-rc.2"
2222
gatkfilter_version = "v1.0.0"

main.nf

+28-18
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ Current Configuration:
2525
bundle_omni_1000g_2p5_vcf_gz: ${params.bundle_omni_1000g_2p5_vcf_gz}
2626
bundle_phase1_1000g_snps_high_conf_vcf_gz: ${params.bundle_phase1_1000g_snps_high_conf_vcf_gz}
2727
28-
- output:
28+
- output:
2929
output: ${params.output_dir}
3030
output_dir_base: ${params.output_dir_base}
3131
log_output_dir: ${params.log_output_dir}
@@ -58,9 +58,10 @@ include { extract_GenomeIntervals } from './external/pipeline-Nextflow-module/mo
5858
]
5959
)
6060
include {
61-
run_HaplotypeCallerVCF_GATK
6261
run_HaplotypeCallerGVCF_GATK
6362
} from './module/haplotypecaller.nf'
63+
include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf'
64+
include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf'
6465
include {
6566
run_MergeVcfs_Picard as run_MergeVcfs_Picard_VCF
6667
run_MergeVcfs_Picard as run_MergeVcfs_Picard_GVCF
@@ -147,51 +148,60 @@ workflow {
147148
/**
148149
* Haplotype calling
149150
*/
150-
input_ch_collected_files.combine(input_ch_intervals)
151+
152+
input_ch_samples_with_index.combine(input_ch_intervals)
151153
.map{ it ->
152154
[
153-
it[0].bams,
154-
it[0].indices,
155+
it[0].id,
156+
it[0].path,
157+
it[0].index,
155158
it[1].interval_path,
156159
it[1].interval_id
157160
]
158161
}
159-
.set{ input_ch_haplotypecallervcf }
162+
.set{ input_ch_haplotypecallergvcf }
160163

161-
run_HaplotypeCallerVCF_GATK(
164+
run_HaplotypeCallerGVCF_GATK(
162165
params.reference_fasta,
163166
"${params.reference_fasta}.fai",
164167
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
165168
params.bundle_v0_dbsnp138_vcf_gz,
166169
"${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
167-
input_ch_haplotypecallervcf
170+
input_ch_haplotypecallergvcf
168171
)
169172

170-
input_ch_samples_with_index.combine(input_ch_intervals)
173+
run_HaplotypeCallerGVCF_GATK.out.gvcfs
174+
.groupTuple(by: 4) // Group by interval ID
171175
.map{ it ->
172176
[
173-
it[0].id,
174-
it[0].path,
175-
it[0].index,
176-
it[1].interval_path,
177-
it[1].interval_id
177+
it[1].flatten(), // GVCFs
178+
it[2].flatten(), // Indices
179+
it[3][0], // Interval path
180+
it[4] // Interval ID
178181
]
179182
}
180-
.set{ input_ch_haplotypecallergvcf }
183+
.set { input_ch_combine_gvcfs }
181184

182-
run_HaplotypeCallerGVCF_GATK(
185+
run_CombineGVCFs_GATK(
186+
params.reference_fasta,
187+
"${params.reference_fasta}.fai",
188+
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
189+
input_ch_combine_gvcfs
190+
)
191+
192+
run_GenotypeGVCFs_GATK(
183193
params.reference_fasta,
184194
"${params.reference_fasta}.fai",
185195
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
186196
params.bundle_v0_dbsnp138_vcf_gz,
187197
"${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
188-
input_ch_haplotypecallergvcf
198+
run_CombineGVCFs_GATK.out.combined_gvcf
189199
)
190200

191201
/**
192202
* Merge VCFs
193203
*/
194-
run_HaplotypeCallerVCF_GATK.out.vcfs
204+
run_GenotypeGVCFs_GATK.out.vcfs
195205
.reduce( ['vcfs': [], 'indices': []] ){ a, b ->
196206
a.vcfs.add(b[0]);
197207
a.indices.add(b[1]);

metadata.yaml

+1-1
Original file line number | Diff line number | Diff line change
@@ -5,4 +5,4 @@ maintainers: "Boutros Lab Infrastructure <[email protected]
55
languages: ["Nextflow", "Docker"]
66
dependencies: ["Java", "Nextflow", "Docker"]
77
references: "https://uclahs-cds.atlassian.net/wiki/spaces/BOUTROSLAB/pages/3189620/Guide+to+Nextflow"
8-
tools: ["Picard:2.26.10", "GATK:3.7.0", "GATK:4.2.4.1"]
8+
tools: ["Picard:2.26.10", "GATK:3.7.0", "GATK:4.5.0.0"]

module/combine-gvcfs.nf

+48
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
1+
include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
2+
3+
/*
4+
Nextflow module for merging GVCFs for joint genotyping with GATK
5+
*/
6+
process run_CombineGVCFs_GATK { // Runs `gatk CombineGVCFs` over the staged GVCFs and emits one combined GVCF plus its .tbi index
7+
container params.docker_image_gatk
8+
publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", // intermediate outputs; ':' in the process name becomes a directory separator
9+
mode: "copy",
10+
enabled: params.save_intermediate_files, // publishing of the combined GVCF is switched by this parameter
11+
pattern: '*g.vcf.gz*' // trailing '*' presumably lets the .tbi index match as well - confirm against generated filenames
12+
publishDir path: "${params.log_output_dir}/process-log", // task log files (.command.*) are always copied here
13+
pattern: ".command.*",
14+
mode: "copy",
15+
saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" } // <process path>/<last process-name segment>-<interval_id>/log<original file name>
16+
17+
input:
18+
path(reference_fasta)
19+
path(reference_fasta_fai) // staged only; not named in the command below
20+
path(reference_fasta_dict) // staged only; not named in the command below
21+
tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id) // gvcf_indices is staged but never named in the command; interval_path is only passed through to the output tuple
22+
23+
output:
24+
path(".command.*")
25+
tuple path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: combined_gvcf // combined GVCF, its index, and the interval path/ID carried along unchanged
26+
27+
script:
28+
output_filename = generate_standard_filename( // assigned without `def`; NOTE(review): presumably so the output: block above can resolve it - confirm
29+
"GATK-${params.gatk_version}",
30+
params.dataset_id,
31+
params.patient_id,
32+
[
33+
'additional_information': "${interval_id}.g.vcf.gz" // interval ID plus the GVCF extension
34+
]
35+
)
36+
gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ') // one single-quoted --variant argument per staged GVCF, space-separated
37+
"""
38+
set -euo pipefail
39+
40+
gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
41+
CombineGVCFs \
42+
--reference ${reference_fasta} \
43+
${gvcf_input_str} \
44+
--output ${output_filename} \
45+
--create-output-variant-index true \
46+
--verbosity INFO
47+
"""
48+
}

module/genotype-gvcfs.nf

+55
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
2+
3+
/*
4+
Nextflow module for joint genotyping of merged GVCFs with GATK
5+
*/
6+
process run_GenotypeGVCFs_GATK { // Runs `gatk GenotypeGVCFs` on one combined GVCF, restricted to the supplied interval file, and emits a VCF plus its .tbi index
7+
container params.docker_image_gatk
8+
publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", // intermediate outputs; ':' in the process name becomes a directory separator
9+
mode: "copy",
10+
enabled: params.save_intermediate_files, // publishing of the per-interval VCF is switched by this parameter
11+
pattern: '*.vcf*'
12+
13+
publishDir path: "${params.log_output_dir}/process-log", // task log files (.command.*) are always copied here
14+
pattern: ".command.*",
15+
mode: "copy",
16+
saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" } // <process path>/<last process-name segment>-<interval_id>/log<original file name>
17+
18+
input:
19+
path(reference_fasta)
20+
path(reference_fasta_fai) // staged only; not named in the command below
21+
path(reference_fasta_dict) // staged only; not named in the command below
22+
path(dbsnp_bundle)
23+
path(dbsnp_bundle_index) // staged only; not named in the command below
24+
tuple path(combined_gvcf), path(combined_gvcf_index), path(interval_path), val(interval_id) // combined_gvcf_index is staged but never named in the command
25+
26+
output:
27+
path(".command.*")
28+
tuple path(output_filename), path("${output_filename}.tbi"), emit: vcfs // genotyped VCF and its index
29+
30+
script:
31+
output_filename = generate_standard_filename( // assigned without `def`; NOTE(review): presumably so the output: block above can resolve it - confirm
32+
"GATK-${params.gatk_version}",
33+
params.dataset_id,
34+
params.patient_id,
35+
[
36+
'additional_information': "${interval_id}.vcf.gz" // interval ID plus the VCF extension
37+
]
38+
)
39+
interval_str = "--intervals ${interval_path}" // always restrict genotyping to this task's interval file
40+
interval_padding = params.is_targeted ? "--interval-padding 100" : "" // padding argument is added only when params.is_targeted is truthy
41+
"""
42+
set -euo pipefail
43+
44+
gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
45+
GenotypeGVCFs \
46+
--variant ${combined_gvcf} \
47+
--reference ${reference_fasta} \
48+
--verbosity INFO \
49+
--output ${output_filename} \
50+
--dbsnp ${dbsnp_bundle} \
51+
--standard-min-confidence-threshold-for-calling 50 \
52+
${interval_str} \
53+
${interval_padding}
54+
"""
55+
}

0 commit comments

Comments
 (0)