Skip to content
This repository was archived by the owner on Jan 3, 2023. It is now read-only.

Commit cf01ad4

Browse files
committed
Consolidation of GenomicsDB configurations - simplified class hierarchy
for configuration objects.
GenomicsDBConfigBase - base class holding practically all the information needed for queries and imports.
GenomicsDBImportConfig - subclass of GenomicsDBConfigBase that contains some import-specific information.
VariantQueryConfig - subclass of GenomicsDBConfigBase.
This way, all the information is consolidated into a couple of classes. Most of the changes in this commit are to handle the updated class hierarchy and to avoid duplication of config information as much as possible. TODO: Java-side changes.
1 parent 9140f24 commit cf01ad4

33 files changed

+1046
-1257
lines changed

CMakeLists.txt

+1
Original file line number | Diff line number | Diff line change
@@ -218,6 +218,7 @@ include_directories (
218218
src/main/cpp/include/query_operations
219219
src/main/cpp/include/utils
220220
src/main/cpp/include/vcf
221+
src/main/cpp/include/config
221222
src/test/cpp/include
222223
)
223224

example/src/test_genomicsdb_importer.cc

+1-1
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,7 @@ int main(int argc, char *argv[]) {
178178
rapidjson::Document json_doc;
179179
json_doc.Parse(str.c_str());
180180
if(json_doc.HasParseError())
181-
throw RunConfigException(std::string("Syntax error in JSON file ")+filename);
181+
throw GenomicsDBConfigException(std::string("Syntax error in JSON file ")+filename);
182182
std::string stream_name;
183183
for(auto b=json_doc.MemberBegin(), e=json_doc.MemberEnd();b!=e;++b)
184184
{

src/main/CMakeLists.txt

+3-2
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@ set(GenomicsDB_library_sources
77
cpp/src/genomicsdb/variant_array_schema.cc
88
cpp/src/genomicsdb/variant_field_handler.cc
99
cpp/src/genomicsdb/variant.cc
10-
cpp/src/genomicsdb/variant_query_config.cc
1110
cpp/src/genomicsdb/query_variants.cc
1211
cpp/src/genomicsdb/genomicsdb_columnar_field.cc
1312
cpp/src/genomicsdb/genomicsdb_iterators.cc
@@ -20,7 +19,6 @@ set(GenomicsDB_library_sources
2019
cpp/src/utils/command_line.cc
2120
cpp/src/utils/memory_measure.cc
2221
cpp/src/utils/histogram.cc
23-
cpp/src/utils/json_config.cc
2422
cpp/src/utils/vid_mapper_pb.cc
2523
cpp/src/utils/lut.cc
2624
cpp/src/utils/known_field_info.cc
@@ -30,6 +28,9 @@ set(GenomicsDB_library_sources
3028
cpp/src/vcf/vcf_adapter.cc
3129
cpp/src/vcf/genomicsdb_bcf_generator.cc
3230
cpp/src/vcf/vcf2binary.cc
31+
cpp/src/config/variant_query_config.cc
32+
cpp/src/config/genomicsdb_config_base.cc
33+
cpp/src/config/json_config.cc
3334
)
3435

3536
include_directories(${PROTOBUF_GENERATED_CXX_HDRS_INCLUDE_DIRS})
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
/**
22
* The MIT License (MIT)
3-
* Copyright (c) 2016-2017 Intel Corporation
3+
* Copyright (c) 2018 Intel Corporation
44
*
55
* Permission is hereby granted, free of charge, to any person obtaining a copy of
66
* this software and associated documentation files (the "Software"), to deal in
@@ -20,38 +20,29 @@
2020
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
2121
*/
2222

23-
#ifndef RUN_CONFIG_H
24-
#define RUN_CONFIG_H
23+
#ifndef GENOMICSDB_CONFIG_BASE_H
24+
#define GENOMICSDB_CONFIG_BASE_H
2525

26-
#include "variant_query_config.h"
27-
#include "vcf_adapter.h"
2826
#include "vid_mapper.h"
2927

30-
#include "rapidjson/document.h"
31-
#include "rapidjson/reader.h"
32-
#include "rapidjson/stringbuffer.h"
33-
#include "rapidjson/writer.h"
34-
#include "rapidjson/filewritestream.h"
35-
#include "rapidjson/prettywriter.h"
36-
3728
//Exceptions thrown
38-
class RunConfigException : public std::exception {
29+
class GenomicsDBConfigException : public std::exception {
3930
public:
40-
RunConfigException(const std::string m="") : msg_("RunConfigException : "+m) { ; }
41-
~RunConfigException() { ; }
31+
GenomicsDBConfigException(const std::string m="") : msg_("GenomicsDBConfigException : "+m) { ; }
32+
~GenomicsDBConfigException() { ; }
4233
// ACCESSORS
4334
/** Returns the exception message. */
4435
const char* what() const noexcept { return msg_.c_str(); }
4536
private:
4637
std::string msg_;
4738
};
4839

49-
extern const char* g_json_indent_unit;
40+
class GenomicsDBImportConfig;
5041

51-
class JSONConfigBase
42+
class GenomicsDBConfigBase
5243
{
5344
public:
54-
JSONConfigBase()
45+
GenomicsDBConfigBase()
5546
{
5647
m_single_array_name = false;
5748
m_single_workspace_path = false;
@@ -60,29 +51,58 @@ class JSONConfigBase
6051
m_single_query_row_ranges_vector = false;
6152
m_row_partitions_specified = false;
6253
m_scan_whole_array = false;
54+
m_is_tmp_vcf_header_filename = false;
55+
m_produce_GT_field = false;
56+
m_produce_FILTER_field = false;
57+
m_index_output_VCF = false;
58+
m_sites_only_query = false;
59+
m_produce_GT_with_min_PL_value_for_spanning_deletions = false;
6360
//Lower and upper bounds of callset row idx to import in this invocation
6461
m_lb_callset_row_idx = 0;
6562
m_ub_callset_row_idx = INT64_MAX-1;
6663
m_segment_size = 10u*1024u*1024u; //10MiB default
67-
clear();
6864
}
69-
void clear();
70-
static void extract_contig_interval_from_object(const rapidjson::Value& curr_json_object,
71-
const VidMapper* id_mapper, ColumnRange& result);
72-
static bool extract_interval_from_PB_struct_or_return_false(const rapidjson::Value& curr_json_object,
73-
const VidMapper* id_mapper,
74-
ColumnRange& result);
75-
void read_from_file(const std::string& filename, const VidMapper* id_mapper=0, const int rank=0);
65+
~GenomicsDBConfigBase()
66+
{
67+
if(m_is_tmp_vcf_header_filename)
68+
unlink(m_vcf_header_filename.c_str());
69+
m_is_tmp_vcf_header_filename = false;
70+
}
7671
const std::string& get_workspace(const int rank) const;
7772
const std::string& get_array_name(const int rank) const;
7873
ColumnRange get_column_partition(const int rank, const unsigned idx=0u) const;
7974
RowRange get_row_partition(const int rank, const unsigned idx=0u) const;
8075
const std::vector<ColumnRange> get_sorted_column_partitions() const { return m_sorted_column_partitions; }
81-
void read_and_initialize_vid_and_callset_mapping_if_available(VidMapper* id_mapper, const int rank);
8276
const std::vector<ColumnRange>& get_query_column_ranges(const int rank) const;
8377
const std::vector<RowRange>& get_query_row_ranges(const int rank) const;
8478
inline size_t get_segment_size() const { return m_segment_size; }
8579
void set_segment_size(const size_t v) { m_segment_size = v; }
80+
inline unsigned get_determine_sites_with_max_alleles() const { return m_determine_sites_with_max_alleles; }
81+
inline unsigned get_max_diploid_alt_alleles_that_can_be_genotyped() const { return m_max_diploid_alt_alleles_that_can_be_genotyped; }
82+
inline size_t get_combined_vcf_records_buffer_size_limit() const { return m_combined_vcf_records_buffer_size_limit; }
83+
void set_vcf_header_filename(const std::string& vcf_header_filename);
84+
const std::string& get_vcf_header_filename() const { return m_vcf_header_filename; }
85+
void set_vcf_output_format(const std::string& output_format);
86+
const std::string& get_vcf_output_format() const { return m_vcf_output_format; }
87+
const std::string& get_vcf_output_filename() const { return m_vcf_output_filename; }
88+
const std::string& get_reference_genome() const { return m_reference_genome; }
89+
const bool produce_GT_field() const { return m_produce_GT_field; }
90+
const bool produce_FILTER_field() const { return m_produce_FILTER_field; }
91+
const bool sites_only_query() const { return m_sites_only_query; }
92+
const bool index_output_VCF() const { return m_index_output_VCF; }
93+
const bool produce_GT_with_min_PL_value_for_spanning_deletions() const
94+
{ return m_produce_GT_with_min_PL_value_for_spanning_deletions; }
95+
const VidMapper& get_vid_mapper() const { return m_vid_mapper; }
96+
//Utility functions
97+
static ColumnRange verify_contig_position_and_get_tiledb_column_interval(const ContigInfo& contig_info,
98+
const int64_t begin, const int64_t end);
99+
const std::string& get_callset_mapping_file() const { return m_callset_mapping_file; }
100+
const std::string& get_vid_mapping_file() const { return m_vid_mapping_file; }
101+
//Sometimes information is present in the loader - copy over
102+
void update_from_loader(const GenomicsDBImportConfig& loader_config, const int rank);
103+
void subset_query_column_ranges_based_on_partition(const GenomicsDBImportConfig& loader_config, const int rank);
104+
inline RowRange get_row_bounds() const { return RowRange(m_lb_callset_row_idx, m_ub_callset_row_idx); }
105+
inline uint64_t get_num_rows_within_bounds() const { return m_ub_callset_row_idx - m_lb_callset_row_idx + 1ull; }
86106
protected:
87107
bool m_single_workspace_path;
88108
bool m_single_array_name;
@@ -91,7 +111,18 @@ class JSONConfigBase
91111
bool m_single_query_row_ranges_vector;
92112
bool m_row_partitions_specified;
93113
bool m_scan_whole_array;
94-
rapidjson::Document m_json;
114+
//Useful if template is not in POSIX fs - create copy and then parse
115+
bool m_is_tmp_vcf_header_filename;
116+
//GATK CombineGVCF does not produce GT field by default - option to produce GT
117+
bool m_produce_GT_field;
118+
//GATK CombineGVCF does not produce FILTER field by default - option to produce FILTER
119+
bool m_produce_FILTER_field;
120+
//index output VCF file
121+
bool m_index_output_VCF;
122+
//sites-only query - doesn't produce any of the FORMAT fields
123+
bool m_sites_only_query;
124+
//when producing GT, use the min PL value GT for spanning deletions
125+
bool m_produce_GT_with_min_PL_value_for_spanning_deletions;
95126
std::vector<std::string> m_workspaces;
96127
std::vector<std::string> m_array_names;
97128
std::vector<std::vector<ColumnRange>> m_column_ranges;
@@ -102,39 +133,36 @@ class JSONConfigBase
102133
//Lower and upper bounds of callset row idx to import in this invocation
103134
int64_t m_lb_callset_row_idx;
104135
int64_t m_ub_callset_row_idx;
105-
//Vid mapping file
106-
std::string m_vid_mapping_file;
107-
//callset mapping file - if defined in upper level config file
108-
std::string m_callset_mapping_file;
109136
//TileDB segment size
110137
size_t m_segment_size;
111-
};
112-
113-
class JSONLoaderConfig;
114-
115-
class JSONBasicQueryConfig : public JSONConfigBase
116-
{
138+
//VCF output parameters
139+
std::string m_vcf_header_filename;
140+
std::string m_reference_genome;
141+
std::string m_vcf_output_filename;
142+
std::string m_vcf_output_format;
143+
//Count max #alt alleles , don't create combined gVCF
144+
unsigned m_determine_sites_with_max_alleles;
145+
//Max diploid alleles for which fields whose length is equal to the number of genotypes can be produced (such as PL)
146+
unsigned m_max_diploid_alt_alleles_that_can_be_genotyped;
147+
//Buffer size for combined vcf records
148+
size_t m_combined_vcf_records_buffer_size_limit;
149+
//VidMapper
150+
VidMapper m_vid_mapper;
151+
//Might be empty strings if using Protobuf
152+
std::string m_vid_mapping_file;
153+
std::string m_callset_mapping_file;
117154
public:
118-
JSONBasicQueryConfig() : JSONConfigBase() { }
119-
void read_from_file(const std::string& filename, VariantQueryConfig& query_config, VidMapper* id_mapper=0, int rank=0, JSONLoaderConfig* loader_config=0);
120-
void update_from_loader(JSONLoaderConfig* loader_config, const int rank);
121-
void subset_query_column_ranges_based_on_partition(const JSONLoaderConfig* loader_config, const int rank);
155+
//Static convenience member
156+
static std::unordered_map<std::string, bool> m_vcf_output_format_to_is_bcf_flag;
122157
};
123158

124-
#define JSON_LOADER_PARTITION_INFO_BEGIN_FIELD_NAME "begin"
125-
#define JSON_LOADER_PARTITION_INFO_END_FIELD_NAME "end"
126-
127-
class JSONLoaderConfig : public JSONConfigBase
159+
class GenomicsDBImportConfig : public GenomicsDBConfigBase
128160
{
129161
public:
130-
JSONLoaderConfig(bool vid_mapper_file_required = true);
131-
void read_from_file(const std::string& filename, VidMapper* id_mapper=0, int rank=0);
162+
GenomicsDBImportConfig();
163+
void read_from_file(const std::string& filename, int rank=0);
132164
inline bool is_partitioned_by_row() const { return m_row_based_partitioning; }
133165
inline bool is_partitioned_by_column() const { return !m_row_based_partitioning; }
134-
inline ColumnRange get_column_partition(int idx) const
135-
{
136-
return m_row_based_partitioning ? ColumnRange(0, INT64_MAX) : JSONConfigBase::get_column_partition(idx);
137-
}
138166
inline int64_t get_max_num_rows_in_array() const { return m_max_num_rows_in_array; }
139167
inline bool offload_vcf_output_processing() const { return m_offload_vcf_output_processing; }
140168
inline bool ignore_cells_not_in_partition() const { return m_ignore_cells_not_in_partition; }
@@ -144,12 +172,6 @@ class JSONLoaderConfig : public JSONConfigBase
144172
inline size_t get_segment_size() const { return m_segment_size; }
145173
inline size_t get_num_cells_per_tile() const { return m_num_cells_per_tile; }
146174
inline int64_t get_tiledb_compression_level() const { return m_tiledb_compression_level; }
147-
inline const std::string& get_vid_mapping_filename() const { return m_vid_mapping_file; }
148-
inline const std::string& get_callset_mapping_filename() const { return m_callset_mapping_file; }
149-
inline RowRange get_row_bounds() const { return RowRange(m_lb_callset_row_idx, m_ub_callset_row_idx); }
150-
inline void set_vid_mapper_file_required(bool val) {
151-
m_vid_mapper_file_required = val;
152-
}
153175
inline bool fail_if_updating() const { return m_fail_if_updating; }
154176
inline bool consolidate_tiledb_array_after_load() const { return m_consolidate_tiledb_array_after_load; }
155177
inline bool discard_missing_GTs() const { return m_discard_missing_GTs; }
@@ -186,8 +208,6 @@ class JSONLoaderConfig : public JSONConfigBase
186208
size_t m_num_cells_per_tile;
187209
//TileDB compression level
188210
int m_tiledb_compression_level;
189-
//flag to say whether vid_mapping_file is required or optional
190-
bool m_vid_mapper_file_required;
191211
//flag that causes the loader to fail if this is an update (rather than a fresh load)
192212
bool m_fail_if_updating;
193213
//consolidate TileDB array after load - merges fragments
@@ -197,59 +217,8 @@ class JSONLoaderConfig : public JSONConfigBase
197217
//The array will NOT contain mandatory VCF fields (ref, alt, qual, filter)
198218
//if this flag is enabled
199219
bool m_no_mandatory_VCF_fields;
200-
};
201-
202-
#ifdef HTSDIR
203-
204-
class JSONVCFAdapterConfig : public JSONConfigBase
205-
{
206-
public:
207-
JSONVCFAdapterConfig() : JSONConfigBase()
208-
{
209-
m_vcf_header_filename = "";
210-
m_determine_sites_with_max_alleles = 0;
211-
m_combined_vcf_records_buffer_size_limit = DEFAULT_COMBINED_VCF_RECORDS_BUFFER_SIZE;
212-
}
213-
~JSONVCFAdapterConfig()
214-
{
215-
if (is_tmp_vcf_header_filename) {
216-
unlink(m_vcf_header_filename.c_str());
217-
}
218-
}
219-
void read_from_file(const std::string& filename,
220-
VCFAdapter& vcf_adapter,
221-
VidMapper* id_mapper,
222-
std::string output_format="", int rank=0,
223-
const size_t combined_vcf_records_buffer_size_limit=0u);
224-
inline unsigned get_determine_sites_with_max_alleles() const { return m_determine_sites_with_max_alleles; }
225-
inline unsigned get_max_diploid_alt_alleles_that_can_be_genotyped() const { return m_max_diploid_alt_alleles_that_can_be_genotyped; }
226-
inline size_t get_combined_vcf_records_buffer_size_limit() const { return m_combined_vcf_records_buffer_size_limit; }
227220
protected:
228-
std::string m_vcf_header_filename;
229-
std::string m_reference_genome;
230-
std::string m_vcf_output_filename;
231-
//Count max #alt alleles , don't create combined gVCF
232-
unsigned m_determine_sites_with_max_alleles;
233-
//Max diploid alleles for which fields whose length is equal to the number of genotypes can be produced (such as PL)
234-
unsigned m_max_diploid_alt_alleles_that_can_be_genotyped;
235-
//Buffer size for combined vcf records
236-
size_t m_combined_vcf_records_buffer_size_limit;
237-
private:
238-
bool is_tmp_vcf_header_filename = false;
239-
};
240-
241-
class JSONVCFAdapterQueryConfig : public JSONVCFAdapterConfig, public JSONBasicQueryConfig
242-
{
243-
public:
244-
JSONVCFAdapterQueryConfig() : JSONVCFAdapterConfig(), JSONBasicQueryConfig() { ; }
245-
void read_from_file(const std::string& filename, VariantQueryConfig& query_config,
246-
VCFAdapter& vcf_adapter, VidMapper* id_mapper,
247-
std::string output_format="", int rank=0,
248-
const size_t combined_vcf_records_buffer_size_limit=0u);
221+
void fix_callset_row_idx_bounds(const int rank);
249222
};
250223

251-
252-
253-
#endif
254-
255224
#endif
+56
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,56 @@
1+
/**
2+
* The MIT License (MIT)
3+
* Copyright (c) 2016-2018 Intel Corporation
4+
*
5+
* Permission is hereby granted, free of charge, to any person obtaining a copy of
6+
* this software and associated documentation files (the "Software"), to deal in
7+
* the Software without restriction, including without limitation the rights to
8+
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9+
* the Software, and to permit persons to whom the Software is furnished to do so,
10+
* subject to the following conditions:
11+
*
12+
* The above copyright notice and this permission notice shall be included in all
13+
* copies or substantial portions of the Software.
14+
*
15+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17+
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18+
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19+
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20+
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21+
*/
22+
23+
#ifndef JSON_CONFIG_H
24+
#define JSON_CONFIG_H
25+
26+
#include "genomicsdb_config_base.h"
27+
28+
#include "rapidjson/document.h"
29+
#include "rapidjson/reader.h"
30+
#include "rapidjson/stringbuffer.h"
31+
#include "rapidjson/writer.h"
32+
#include "rapidjson/filewritestream.h"
33+
#include "rapidjson/prettywriter.h"
34+
35+
class JSONConfigBase : public GenomicsDBConfigBase
36+
{
37+
public:
38+
JSONConfigBase()
39+
: GenomicsDBConfigBase()
40+
{}
41+
JSONConfigBase(const GenomicsDBConfigBase& x)
42+
: GenomicsDBConfigBase(x)
43+
{}
44+
static void extract_contig_interval_from_object(const rapidjson::Value& curr_json_object,
45+
const VidMapper* id_mapper, ColumnRange& result);
46+
static bool extract_interval_from_PB_struct_or_return_false(const rapidjson::Value& curr_json_object,
47+
const VidMapper* id_mapper,
48+
ColumnRange& result);
49+
void read_from_file(const std::string& filename, const int rank=0);
50+
void read_and_initialize_vid_and_callset_mapping_if_available(const int rank);
51+
const rapidjson::Document& get_rapidjson_doc() const { return m_json; }
52+
protected:
53+
rapidjson::Document m_json;
54+
};
55+
56+
#endif

0 commit comments

Comments
 (0)