Param validation (#95)

* Add parameter validation * Add schema files * Update changelog * Fix linting and comment * Add .gitattributes
uclahs-cds · Jul 12, 2022 · 5f2cf1c · 5f2cf1c
1 parent c14f5fb
commit 5f2cf1c
Show file tree

Hide file tree

Showing 5 changed files with 234 additions and 5 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+*.config linguist-language=groovy
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 ## [Unreleased]
 ### Added
 - IndelRealignment compression parameter
+- Param validation
 ### Changed
 - Parse CSV inputs using modularized `csv_parser`
 - Delete merged but un-deduplicated BAMs earlier for more efficient disk usage

diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config
@@ -0,0 +1,94 @@
+/**
+* This custom schema namespace implements a custom type for checking input BAMs for call-gSNP
+*/
+custom_schema_types {
+    // Keys accepted by check_input_type_keys when no explicit choices are passed
+    allowed_input_types = ['BAM']
+    // Keys accepted inside a BAM namespace (passed as choices by check_bam_namespace)
+    allowed_bam_types = ['normal', 'tumour']
+
+    /**
+    * Check that every given input type key is in the allowed list
+    *
+    * @param given List of keys found under the parameter being validated
+    * @param name Name of the parameter, used in the error message
+    * @param choices Allowed keys; defaults to custom_schema_types.allowed_input_types
+    * @throws Exception on the first key in `given` that is not in `choices`
+    */
+    check_input_type_keys = { List given, String name, List choices=custom_schema_types.allowed_input_types ->
+        for (elem in given) {
+            if (!(elem in choices)) {
+                throw new Exception("Invalid parameter ${name}. Valid types: ${choices}.")
+            }
+        }
+    }
+
+    /**
+    * Ensure the given value is a Namespace (i.e. a Map)
+    *
+    * @param val Value to check
+    * @param name Parameter name used in the error message
+    * @throws Exception if `val` is not a Map
+    */
+    check_if_namespace = { val, String name ->
+        def is_namespace = (val in Map)
+        if (is_namespace) {
+            return
+        }
+        throw new Exception("${name} should be a Namespace, not ${val.getClass()}.")
+    }
+
+    /**
+    * Ensure the given value is list-like (a List or a Set)
+    *
+    * @param val Value to check
+    * @param name Parameter name used in the error message
+    * @throws Exception if `val` is neither a List nor a Set
+    */
+    check_if_list = { val, String name ->
+        if (!(val in List) && !(val in Set)) {
+            throw new Exception("${name} should be a List, not ${val.getClass()}.")
+        }
+    }
+
+    /**
+    * Validate a Namespace parameter whose keys are input types
+    *
+    * Confirms options[name] is a Namespace, checks its keys against the default
+    * allowed input types, then passes each entry to schema.validate_parameter
+    * along with the matching definition from properties.elements.
+    *
+    * @param options Map holding the parameter under key `name`
+    * @param name Key of the parameter to validate
+    * @param properties Schema definition of the parameter; `elements` is indexed by entry key
+    */
+    check_input_namespace = { Map options, String name, Map properties ->
+        def namespace = options[name]
+        custom_schema_types.check_if_namespace(namespace, name)
+        custom_schema_types.check_input_type_keys(new ArrayList(namespace.keySet()), name)
+
+        // Each entry is wrapped in its own single-entry map before being handed to the validator
+        namespace.each { key, value ->
+            schema.validate_parameter([(key): value], key, properties.elements[key])
+        }
+    }
+
+    /**
+    * Validate the BAM Namespace
+    *
+    * Confirms options[name] is a non-empty Namespace whose keys are all in
+    * custom_schema_types.allowed_bam_types, then passes each entry to
+    * schema.validate_parameter along with the matching definition from properties.elements.
+    *
+    * @param options Map holding the parameter under key `name`
+    * @param name Key of the parameter to validate
+    * @param properties Schema definition of the parameter; `elements` is indexed by entry key
+    * @throws Exception if the Namespace has no keys
+    */
+    check_bam_namespace = { Map options, String name, Map properties ->
+        def bam_namespace = options[name]
+        custom_schema_types.check_if_namespace(bam_namespace, name)
+
+        def bam_types = new ArrayList(bam_namespace.keySet())
+        if (bam_types.isEmpty()) {
+            throw new Exception("No inputs provided! Please provide inputs in the CSV or YAML.")
+        }
+        custom_schema_types.check_input_type_keys(bam_types, name, custom_schema_types.allowed_bam_types)
+
+        // Each entry is wrapped in its own single-entry map before being handed to the validator
+        bam_namespace.each { key, value ->
+            schema.validate_parameter([(key): value], key, properties.elements[key])
+        }
+    }
+
+    /**
+    * Validate a list of BAM entries
+    *
+    * Confirms options[name] is list-like and that every item in it is a Namespace,
+    * then validates each item against every element definition in properties.elements.
+    *
+    * @param options Map holding the parameter under key `name`
+    * @param name Key of the parameter to validate
+    * @param properties Schema definition of the parameter; `elements` maps field name to field definition
+    */
+    check_bam_list = { Map options, String name, Map properties ->
+        def bam_entries = options[name]
+        custom_schema_types.check_if_list(bam_entries, name)
+        for (bam_entry in bam_entries) {
+            custom_schema_types.check_if_namespace(bam_entry, name)
+            for (element in properties.elements) {
+                schema.validate_parameter(bam_entry, element.key, element.value)
+            }
+        }
+    }
+
+    // Custom schema type name -> closure in this namespace that checks a parameter of that type
+    types = [
+        InputNamespace: custom_schema_types.check_input_namespace,
+        InputBAMNamespace: custom_schema_types.check_bam_namespace,
+        BAMEntryList: custom_schema_types.check_bam_list
+    ]
+}
diff --git a/config/methods.config b/config/methods.config
@@ -1,5 +1,6 @@
 import nextflow.util.SysHelper
 includeConfig "${projectDir}/external/nextflow-config/config/csv/csv_parser.config"
+includeConfig "${projectDir}/external/nextflow-config/config/schema/schema.config"
 
 class log_output_dir {
   static def check_permissions(path) {
@@ -245,7 +246,7 @@ methods {
     }
   }
 
-  parse_input = {
+  convert_to_yaml_input = {
     if (params.containsKey('input')) {
       // YAML was used so set modes accordingly
       params.is_NT_paired = params.input.BAM.containsKey('normal') && params.input.BAM.containsKey('tumour')
@@ -257,7 +258,6 @@ methods {
         }
         params.single_sample_type = all_input_keys[0]
       }
-      methods.format_input_from_yaml()
     } else if (params.containsKey('input_csv')) {
       // Parse CSV header line and determine modes
       def reader = new BufferedReader(new FileReader(params.input_csv))
@@ -278,8 +278,6 @@ methods {
       methods.set_ids_from_csv(raw_csv_input)
       // Format the CSV input to match input YAML format
       methods.format_csv_input(raw_csv_input)
-      // Call YAML input formatter to generate matching input for pipeline
-      methods.format_input_from_yaml()
     } else {
       throw new Exception("Neither YAML nor CSV inputs found! Please run pipeline with inputs.")
     }
@@ -288,7 +286,10 @@ methods {
   // Set up env, timeline, trace, and report above.
   setup = {
     methods.set_env()
-    methods.parse_input()
+    methods.convert_to_yaml_input()
+    schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
+    schema.validate()
+    methods.format_input_from_yaml()
     methods.set_log_output_dir()
     methods.set_output_dir()
     log_output_dir.check_permissions(params.log_output_dir)

diff --git a/config/schema.yaml b/config/schema.yaml
@@ -0,0 +1,132 @@
+---
+patient_id:
+  type: 'String'
+  required: true
+  help: 'Patient ID'
+sample_id:
+  type: 'String'
+  required: true
+  help: 'Sample ID'
+output_dir:
+  type: 'Path'
+  mode: 'w'
+  required: true
+  help: 'Absolute path to output directory'
+save_intermediate_files:
+  type: 'Bool'
+  required: true
+  default: false
+  help: 'Whether to save intermediate files'
+is_emit_original_quals:
+  type: 'Bool'
+  required: true
+  default: true
+  help: 'Whether to emit original quality scores after recalibration'
+input_csv:
+  type: 'Path'
+  mode: 'r'
+  required: false
+  help: 'Absolute path to input CSV containing sample information'
+is_DOC_run:
+  type: 'Bool'
+  required: true
+  default: false
+  help: 'Whether to run the DepthOfCoverage process, which is very time-consuming for large BAMs'
+intervals:
+  type: 'String'
+  allow_empty: true
+  required: true
+  help: 'Target intervals to process for DNA panel/targeted sequencing samples; leave empty for WGS'
+scatter_count:
+  type: 'Integer'
+  required: true
+  default: 50
+  help: 'How many intervals to divide the genome into for parallelization'
+split_intervals_extra_args:
+  type: 'String'
+  allow_empty: true
+  required: false
+  help: 'Extra arguments for interval splitting'
+gatk_ir_compression:
+  type: 'Integer'
+  choices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+  default: 0
+  required: false
+reference_fasta:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to reference genome fasta'
+bundle_mills_and_1000g_gold_standard_indels_vcf_gz:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to Mills and 1000g gold standard INDELs VCF'
+bundle_known_indels_vcf_gz:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to known INDELs VCF'
+bundle_v0_dbsnp138_vcf_gz:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to v0 dbSNP 138 VCF'
+bundle_hapmap_3p3_vcf_gz:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to HapMap 3p3 VCF'
+bundle_omni_1000g_2p5_vcf_gz:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to OMNI 1000g 2p5 VCF'
+bundle_phase1_1000g_snps_high_conf_vcf_gz:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to phase 1 1000g high confidence SNPs VCF'
+bundle_contest_hapmap_3p3_vcf_gz:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to ContEst HapMap 3p3 VCF'
+input:
+  type: 'InputNamespace'
+  required: true
+  help: 'Input samples'
+  elements:
+    BAM:
+      type: 'InputBAMNamespace'
+      required: true
+      help: 'Input BAMs for calling'
+      elements:
+        normal:
+          type: 'BAMEntryList'
+          required: false
+          help: 'Input normal BAMs'
+          elements:
+            id:
+              type: 'String'
+              required: true
+              help: 'Identifier for sample'
+            path:
+              type: 'Path'
+              mode: 'r'
+              required: true
+              help: 'Absolute path to BAM file'
+        tumour:
+          type: 'BAMEntryList'
+          required: false
+          help: 'Input tumour BAMs'
+          elements:
+            id:
+              type: 'String'
+              required: true
+              help: 'Identifier for sample'
+            path:
+              type: 'Path'
+              mode: 'r'
+              required: true
+              help: 'Absolute path to BAM file'