From 5f2cf1c19017a081f0b6b411aa309d1a4e4b7021 Mon Sep 17 00:00:00 2001
From: Yash Patel <86321070+yashpatel6@users.noreply.github.com>
Date: Mon, 11 Jul 2022 17:18:13 -0700
Subject: [PATCH] Param validation (#95)

* Add parameter validation

* Add schema files

* Update changelog

* Fix linting and comment

* Add .gitattributes
---
 .gitattributes                    |   1 +
 CHANGELOG.md                      |   1 +
 config/custom_schema_types.config |  94 +++++++++++++++++++++
 config/methods.config             |  11 +--
 config/schema.yaml                | 132 ++++++++++++++++++++++++++++++
 5 files changed, 234 insertions(+), 5 deletions(-)
 create mode 100644 .gitattributes
 create mode 100644 config/custom_schema_types.config
 create mode 100644 config/schema.yaml

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..cef56aa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.config linguist-language=groovy
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 687c23a..9ec5e80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 ## [Unreleased]
 ### Added
 - IndelRealignment compression parameter
+- Param validation
 ### Changed
 - Parse CSV inputs using modularized `csv_parser`
 - Delete merged but un-deduplicated BAMs earlier for more efficient disk usage
diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config
new file mode 100644
index 0000000..c149c56
--- /dev/null
+++ b/config/custom_schema_types.config
@@ -0,0 +1,94 @@
+/**
+* This custom schema namespace implements a custom type for checking input BAMs for call-gSNP
+*/
+custom_schema_types {
+    allowed_input_types = [
+        'BAM'
+    ]
+    allowed_bam_types = [
+        'normal',
+        'tumour'
+    ]
+
+    /**
+    * Check that every key in `given` is one of `choices`; throw on the first key that is not
+    */
+    check_input_type_keys = { List given, String name, List choices=custom_schema_types.allowed_input_types ->
+        for (elem in given) {
+            if (!(elem in choices)) {
+                throw new Exception("Invalid parameter ${name}: unrecognized type '${elem}'. Valid types: ${choices}.")
+            }
+        }
+    }
+
+    /**
+    * Check that the given value is a Namespace (Map); throw otherwise
+    */
+    check_if_namespace = { val, String name ->
+        if (!(val in Map)) {
+            throw new Exception("${name} should be a Namespace, not ${val.getClass()}.")
+        }
+    }
+
+    /**
+    * Check that the given value is a List or Set; throw otherwise
+    */
+    check_if_list = { val, String name ->
+        if (!(val in List || val in Set)) {
+            throw new Exception("${name} should be a List, not ${val.getClass()}.")
+        }
+    }
+
+    /**
+    * Check that the input is a Namespace with allowed keys, then validate each entry against its element schema
+    */
+    check_input_namespace = { Map options, String name, Map properties ->
+        // Check parameters keys
+        custom_schema_types.check_if_namespace(options[name], name)
+        def given_keys = options[name].keySet() as ArrayList
+        custom_schema_types.check_input_type_keys(given_keys, name)
+
+        options[name].each { entry ->
+            def entry_as_map = [:]
+            entry_as_map[entry.key] = entry.value
+            schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key])
+        }
+    }
+
+    /**
+    * Check that the BAM Namespace is non-empty with allowed sample types, then validate each entry
+    */
+    check_bam_namespace = { Map options, String name, Map properties ->
+        custom_schema_types.check_if_namespace(options[name], name)
+        def given_keys = options[name].keySet() as ArrayList
+        if (given_keys.isEmpty()) {
+            throw new Exception("No inputs provided! Please provide inputs in the CSV or YAML.")
+        }
+        custom_schema_types.check_input_type_keys(given_keys, name, custom_schema_types.allowed_bam_types)
+
+        options[name].each { entry ->
+            def entry_as_map = [:]
+            entry_as_map[entry.key] = entry.value
+            schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key])
+        }
+    }
+
+    /**
+    * Check that the input is a list whose items are Namespaces matching the `elements` schema
+    */
+    check_bam_list = { Map options, String name, Map properties ->
+        custom_schema_types.check_if_list(options[name], name)
+        for (item in options[name]) {
+            custom_schema_types.check_if_namespace(item, name)
+            properties.elements.each { key, val ->
+                schema.validate_parameter(item, key, val)
+            }
+        }
+    }
+
+    types = [
+        'InputNamespace': custom_schema_types.check_input_namespace,
+        'InputBAMNamespace': custom_schema_types.check_bam_namespace,
+        'BAMEntryList': custom_schema_types.check_bam_list
+    ]
+}
diff --git a/config/methods.config b/config/methods.config
index 48670d4..6db4ffc 100644
--- a/config/methods.config
+++ b/config/methods.config
@@ -1,5 +1,6 @@
 import nextflow.util.SysHelper
 includeConfig "${projectDir}/external/nextflow-config/config/csv/csv_parser.config"
+includeConfig "${projectDir}/external/nextflow-config/config/schema/schema.config"
 
 class log_output_dir {
     static def check_permissions(path) {
@@ -245,7 +246,7 @@ methods {
         }
     }
 
-    parse_input = {
+    convert_to_yaml_input = {
         if (params.containsKey('input')) {
             // YAML was used so set modes accordingly
             params.is_NT_paired = params.input.BAM.containsKey('normal') && params.input.BAM.containsKey('tumour')
@@ -257,7 +258,6 @@ methods {
                 }
                 params.single_sample_type = all_input_keys[0]
             }
-            methods.format_input_from_yaml()
         } else if (params.containsKey('input_csv')) {
             // Parse CSV header line and determine modes
             def reader = new BufferedReader(new FileReader(params.input_csv))
@@ -278,8 +278,6 @@ methods {
             methods.set_ids_from_csv(raw_csv_input)
             // Format the CSV input to match input YAML format
             methods.format_csv_input(raw_csv_input)
-            // Call YAML input formatter to generate matching input for pipeline
-            methods.format_input_from_yaml()
         } else {
             throw new Exception("Neither YAML nor CSV inputs found! Please run pipeline with inputs.")
         }
@@ -288,7 +286,10 @@ methods {
     // Set up env, timeline, trace, and report above.
     setup = {
         methods.set_env()
-        methods.parse_input()
+        methods.convert_to_yaml_input()
+        schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
+        schema.validate()
+        methods.format_input_from_yaml()
         methods.set_log_output_dir()
         methods.set_output_dir()
         log_output_dir.check_permissions(params.log_output_dir)
diff --git a/config/schema.yaml b/config/schema.yaml
new file mode 100644
index 0000000..026f687
--- /dev/null
+++ b/config/schema.yaml
@@ -0,0 +1,132 @@
+---
+patient_id:
+    type: 'String'
+    required: true
+    help: 'Patient ID'
+sample_id:
+    type: 'String'
+    required: true
+    help: 'Sample ID'
+output_dir:
+    type: 'Path'
+    mode: 'w'
+    required: true
+    help: 'Absolute path to output directory'
+save_intermediate_files:
+    type: 'Bool'
+    required: true
+    default: false
+    help: 'Whether to save intermediate files'
+is_emit_original_quals:
+    type: 'Bool'
+    required: true
+    default: true
+    help: 'Whether to emit original quality scores after recalibration'
+input_csv:
+    type: 'Path'
+    mode: 'r'
+    required: false
+    help: 'Absolute path to input CSV containing sample information'
+is_DOC_run:
+    type: 'Bool'
+    required: true
+    default: false
+    help: 'Whether to run the DepthOfCoverage process, which is very time-consuming for large BAMs'
+intervals:
+    type: 'String'
+    allow_empty: true
+    required: true
+    help: 'Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS'
+scatter_count:
+    type: 'Integer'
+    required: true
+    default: 50
+    help: 'How many intervals to divide the genome into for parallelization'
+split_intervals_extra_args:
+    type: 'String'
+    allow_empty: true
+    required: false
+    help: 'Extra arguments for interval splitting'
+gatk_ir_compression:
+    type: 'Integer'
+    choices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    default: 0
+    required: false
+reference_fasta:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to reference genome fasta'
+bundle_mills_and_1000g_gold_standard_indels_vcf_gz:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to Mills and 1000g gold standard INDELs VCF'
+bundle_known_indels_vcf_gz:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to known INDELs VCF'
+bundle_v0_dbsnp138_vcf_gz:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to v0 dbSNP 138 VCF'
+bundle_hapmap_3p3_vcf_gz:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to HapMap 3p3 VCF'
+bundle_omni_1000g_2p5_vcf_gz:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to OMNI 1000g 2p5 VCF'
+bundle_phase1_1000g_snps_high_conf_vcf_gz:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to phase 1 1000g high confidence SNPs VCF'
+bundle_contest_hapmap_3p3_vcf_gz:
+    type: 'Path'
+    mode: 'r'
+    required: true
+    help: 'Absolute path to ContEst HapMap 3p3 VCF'
+input:
+    type: 'InputNamespace'
+    required: true
+    help: 'Input samples'
+    elements:
+        BAM:
+            type: 'InputBAMNamespace'
+            required: true
+            help: 'Input BAMs for calling'
+            elements:
+                normal:
+                    type: 'BAMEntryList'
+                    required: false
+                    help: 'Input normal BAMs'
+                    elements:
+                        id:
+                            type: 'String'
+                            required: true
+                            help: 'Identifier for sample'
+                        path:
+                            type: 'Path'
+                            mode: 'r'
+                            required: true
+                            help: 'Absolute path to BAM file'
+                tumour:
+                    type: 'BAMEntryList'
+                    required: false
+                    help: 'Input tumour BAMs'
+                    elements:
+                        id:
+                            type: 'String'
+                            required: true
+                            help: 'Identifier for sample'
+                        path:
+                            type: 'Path'
+                            mode: 'r'
+                            required: true
+                            help: 'Absolute path to BAM file'