1
1
/* *
2
2
* The MIT License (MIT)
3
- * Copyright (c) 2016-2017 Intel Corporation
3
+ * Copyright (c) 2018 Intel Corporation
4
4
*
5
5
* Permission is hereby granted, free of charge, to any person obtaining a copy of
6
6
* this software and associated documentation files (the "Software"), to deal in
20
20
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
21
*/
22
22
23
- #ifndef RUN_CONFIG_H
24
- #define RUN_CONFIG_H
23
+ #ifndef GENOMICSDB_CONFIG_BASE_H
24
+ #define GENOMICSDB_CONFIG_BASE_H
25
25
26
- #include " variant_query_config.h"
27
- #include " vcf_adapter.h"
28
26
#include " vid_mapper.h"
29
27
30
- #include " rapidjson/document.h"
31
- #include " rapidjson/reader.h"
32
- #include " rapidjson/stringbuffer.h"
33
- #include " rapidjson/writer.h"
34
- #include " rapidjson/filewritestream.h"
35
- #include " rapidjson/prettywriter.h"
36
-
37
28
// Exceptions thrown
// Diff view: the lines marked - show the old name (RunConfigException) and the
// lines marked + show the new name (GenomicsDBConfigException). Apart from the
// rename and the matching message prefix, the class body is the same on both sides.
// The class derives from std::exception and stores one std::string message.
38
- class RunConfigException : public std ::exception {
29
+ class GenomicsDBConfigException : public std ::exception {
39
30
public:
// Constructor: builds msg_ by prepending a prefix naming the exception class to
// the caller-supplied text m. A default argument is supplied, so the exception
// can be constructed without a message.
// NOTE(review): m is taken by value, so each construction copies the string;
// a const reference would avoid the copy.
// Destructor: empty body, nothing beyond the std::string member is released.
40
- RunConfigException (const std::string m=" " ) : msg_(" RunConfigException : " +m) { ; }
41
- ~RunConfigException () { ; }
31
+ GenomicsDBConfigException (const std::string m=" " ) : msg_(" GenomicsDBConfigException : " +m) { ; }
32
+ ~GenomicsDBConfigException () { ; }
42
33
// ACCESSORS
43
34
/*
 * Returns the exception message (the prefixed text built by the constructor).
 * The pointer comes from msg_.c_str(), so it is only usable while this
 * exception object is alive.
 * NOTE(review): the signature matches std::exception::what() but is not
 * marked override.
 */
44
35
const char * what () const noexcept { return msg_.c_str (); }
45
36
private:
// Full message text returned by what().
46
37
std::string msg_;
47
38
};
48
39
49
- extern const char * g_json_indent_unit ;
40
+ class GenomicsDBImportConfig ;
50
41
51
- class JSONConfigBase
42
+ class GenomicsDBConfigBase
52
43
{
53
44
public:
54
- JSONConfigBase ()
45
+ GenomicsDBConfigBase ()
55
46
{
56
47
m_single_array_name = false ;
57
48
m_single_workspace_path = false ;
@@ -60,29 +51,58 @@ class JSONConfigBase
60
51
m_single_query_row_ranges_vector = false ;
61
52
m_row_partitions_specified = false ;
62
53
m_scan_whole_array = false ;
54
+ m_is_tmp_vcf_header_filename = false ;
55
+ m_produce_GT_field = false ;
56
+ m_produce_FILTER_field = false ;
57
+ m_index_output_VCF = false ;
58
+ m_sites_only_query = false ;
59
+ m_produce_GT_with_min_PL_value_for_spanning_deletions = false ;
63
60
// Lower and upper bounds of callset row idx to import in this invocation
64
61
m_lb_callset_row_idx = 0 ;
65
62
m_ub_callset_row_idx = INT64_MAX-1 ;
66
63
m_segment_size = 10u *1024u *1024u ; // 10MiB default
67
- clear ();
68
64
}
69
- void clear ();
70
- static void extract_contig_interval_from_object (const rapidjson::Value& curr_json_object,
71
- const VidMapper* id_mapper, ColumnRange& result);
72
- static bool extract_interval_from_PB_struct_or_return_false (const rapidjson::Value& curr_json_object,
73
- const VidMapper* id_mapper,
74
- ColumnRange& result);
75
- void read_from_file (const std::string& filename, const VidMapper* id_mapper=0 , const int rank=0 );
65
// Destructor (added in this change). If m_is_tmp_vcf_header_filename is set, the
// file named by m_vcf_header_filename is removed with unlink(); the flag is then
// cleared unconditionally (only the unlink call is guarded by the if).
// The return value of unlink() is ignored, so a failed removal is silent.
// NOTE(review): where the flag becomes true is not visible in this view -
// presumably set_vcf_header_filename(); confirm in the implementation file.
// NOTE(review): no copy/move operations are declared in the visible part of the
// class; if objects are copied, each copy would try to unlink the same path - verify.
+ ~GenomicsDBConfigBase ()
66
+ {
67
+ if (m_is_tmp_vcf_header_filename)
68
+ unlink (m_vcf_header_filename.c_str ());
69
+ m_is_tmp_vcf_header_filename = false ;
70
+ }
76
71
const std::string& get_workspace (const int rank) const ;
77
72
const std::string& get_array_name (const int rank) const ;
78
73
ColumnRange get_column_partition (const int rank, const unsigned idx=0u ) const ;
79
74
RowRange get_row_partition (const int rank, const unsigned idx=0u ) const ;
80
75
const std::vector<ColumnRange> get_sorted_column_partitions () const { return m_sorted_column_partitions; }
81
- void read_and_initialize_vid_and_callset_mapping_if_available (VidMapper* id_mapper, const int rank);
82
76
const std::vector<ColumnRange>& get_query_column_ranges (const int rank) const ;
83
77
const std::vector<RowRange>& get_query_row_ranges (const int rank) const ;
84
78
inline size_t get_segment_size () const { return m_segment_size; }
85
79
void set_segment_size (const size_t v) { m_segment_size = v; }
80
+ inline unsigned get_determine_sites_with_max_alleles () const { return m_determine_sites_with_max_alleles; }
81
+ inline unsigned get_max_diploid_alt_alleles_that_can_be_genotyped () const { return m_max_diploid_alt_alleles_that_can_be_genotyped; }
82
+ inline size_t get_combined_vcf_records_buffer_size_limit () const { return m_combined_vcf_records_buffer_size_limit; }
83
+ void set_vcf_header_filename (const std::string& vcf_header_filename);
84
+ const std::string& get_vcf_header_filename () const { return m_vcf_header_filename; }
85
+ void set_vcf_output_format (const std::string& output_format);
86
+ const std::string& get_vcf_output_format () const { return m_vcf_output_format; }
87
+ const std::string& get_vcf_output_filename () const { return m_vcf_output_filename; }
88
+ const std::string& get_reference_genome () const { return m_reference_genome; }
89
// Boolean option getters (added in this change). Each one returns the matching
// m_* flag by value; the constructor initializes all five flags to false.
// NOTE(review): the leading const on a by-value bool return type has no effect
// for callers and can trigger -Wignored-qualifiers; plain bool would be equivalent.
+ const bool produce_GT_field () const { return m_produce_GT_field; }
90
+ const bool produce_FILTER_field () const { return m_produce_FILTER_field; }
91
+ const bool sites_only_query () const { return m_sites_only_query; }
92
+ const bool index_output_VCF () const { return m_index_output_VCF; }
93
+ const bool produce_GT_with_min_PL_value_for_spanning_deletions () const
94
+ { return m_produce_GT_with_min_PL_value_for_spanning_deletions; }
95
+ const VidMapper& get_vid_mapper () const { return m_vid_mapper; }
96
+ // Utility functions
97
+ static ColumnRange verify_contig_position_and_get_tiledb_column_interval (const ContigInfo& contig_info,
98
+ const int64_t begin, const int64_t end);
99
+ const std::string& get_callset_mapping_file () const { return m_callset_mapping_file; }
100
+ const std::string& get_vid_mapping_file () const { return m_vid_mapping_file; }
101
+ // Sometimes information is present in the loader - copy over
102
+ void update_from_loader (const GenomicsDBImportConfig& loader_config, const int rank);
103
+ void subset_query_column_ranges_based_on_partition (const GenomicsDBImportConfig& loader_config, const int rank);
104
// Returns the callset row index bounds as RowRange(lower, upper), built from
// m_lb_callset_row_idx and m_ub_callset_row_idx.
+ inline RowRange get_row_bounds () const { return RowRange (m_lb_callset_row_idx, m_ub_callset_row_idx); }
105
// Number of rows covered by the bounds, computed as upper - lower + 1 (both
// ends counted). The subtraction is done on the int64_t members and the result
// is converted to unsigned by the addition of 1ull. With the constructor
// defaults (lower 0, upper INT64_MAX-1) the result is INT64_MAX.
// NOTE(review): if upper < lower the difference is negative before the
// conversion and the result wraps to a very large value - confirm callers
// guarantee lower <= upper.
+ inline uint64_t get_num_rows_within_bounds () const { return m_ub_callset_row_idx - m_lb_callset_row_idx + 1ull ; }
86
106
protected:
87
107
bool m_single_workspace_path;
88
108
bool m_single_array_name;
@@ -91,7 +111,18 @@ class JSONConfigBase
91
111
bool m_single_query_row_ranges_vector;
92
112
bool m_row_partitions_specified;
93
113
bool m_scan_whole_array;
94
- rapidjson::Document m_json;
114
+ // Useful if template is not in POSIX fs - create copy and then parse
115
+ bool m_is_tmp_vcf_header_filename;
116
+ // GATK CombineGVCF does not produce GT field by default - option to produce GT
117
+ bool m_produce_GT_field;
118
+ // GATK CombineGVCF does not produce FILTER field by default - option to produce FILTER
119
+ bool m_produce_FILTER_field;
120
+ // index output VCF file
121
+ bool m_index_output_VCF;
122
+ // sites-only query - doesn't produce any of the FORMAT fields
123
+ bool m_sites_only_query;
124
+ // when producing GT, use the min PL value GT for spanning deletions
125
+ bool m_produce_GT_with_min_PL_value_for_spanning_deletions;
95
126
std::vector<std::string> m_workspaces;
96
127
std::vector<std::string> m_array_names;
97
128
std::vector<std::vector<ColumnRange>> m_column_ranges;
@@ -102,39 +133,36 @@ class JSONConfigBase
102
133
// Lower and upper bounds of callset row idx to import in this invocation
103
134
int64_t m_lb_callset_row_idx;
104
135
int64_t m_ub_callset_row_idx;
105
- // Vid mapping file
106
- std::string m_vid_mapping_file;
107
- // callset mapping file - if defined in upper level config file
108
- std::string m_callset_mapping_file;
109
136
// TileDB segment size
110
137
size_t m_segment_size;
111
- };
112
-
113
- class JSONLoaderConfig ;
114
-
115
- class JSONBasicQueryConfig : public JSONConfigBase
116
- {
138
+ // VCF output parameters
139
+ std::string m_vcf_header_filename;
140
+ std::string m_reference_genome;
141
+ std::string m_vcf_output_filename;
142
+ std::string m_vcf_output_format;
143
+ // Count max #alt alleles , don't create combined gVCF
144
+ unsigned m_determine_sites_with_max_alleles;
145
+ // Max diploid alleles for which fields whose length is equal to the number of genotypes can be produced (such as PL)
146
+ unsigned m_max_diploid_alt_alleles_that_can_be_genotyped;
147
+ // Buffer size for combined vcf records
148
+ size_t m_combined_vcf_records_buffer_size_limit;
149
+ // VidMapper
150
+ VidMapper m_vid_mapper;
151
+ // Might be empty strings if using Protobuf
152
+ std::string m_vid_mapping_file;
153
+ std::string m_callset_mapping_file;
117
154
public:
118
- JSONBasicQueryConfig () : JSONConfigBase() { }
119
- void read_from_file (const std::string& filename, VariantQueryConfig& query_config, VidMapper* id_mapper=0 , int rank=0 , JSONLoaderConfig* loader_config=0 );
120
- void update_from_loader (JSONLoaderConfig* loader_config, const int rank);
121
- void subset_query_column_ranges_based_on_partition (const JSONLoaderConfig* loader_config, const int rank);
155
+ // Static convenience member
156
+ static std::unordered_map<std::string, bool > m_vcf_output_format_to_is_bcf_flag;
122
157
};
123
158
124
- #define JSON_LOADER_PARTITION_INFO_BEGIN_FIELD_NAME " begin"
125
- #define JSON_LOADER_PARTITION_INFO_END_FIELD_NAME " end"
126
-
127
- class JSONLoaderConfig : public JSONConfigBase
159
+ class GenomicsDBImportConfig : public GenomicsDBConfigBase
128
160
{
129
161
public:
130
- JSONLoaderConfig ( bool vid_mapper_file_required = true );
131
- void read_from_file (const std::string& filename, VidMapper* id_mapper= 0 , int rank=0 );
162
+ GenomicsDBImportConfig ( );
163
+ void read_from_file (const std::string& filename, int rank=0 );
132
164
inline bool is_partitioned_by_row () const { return m_row_based_partitioning; }
133
165
inline bool is_partitioned_by_column () const { return !m_row_based_partitioning; }
134
- inline ColumnRange get_column_partition (int idx) const
135
- {
136
- return m_row_based_partitioning ? ColumnRange (0 , INT64_MAX) : JSONConfigBase::get_column_partition (idx);
137
- }
138
166
inline int64_t get_max_num_rows_in_array () const { return m_max_num_rows_in_array; }
139
167
inline bool offload_vcf_output_processing () const { return m_offload_vcf_output_processing; }
140
168
inline bool ignore_cells_not_in_partition () const { return m_ignore_cells_not_in_partition; }
@@ -144,12 +172,6 @@ class JSONLoaderConfig : public JSONConfigBase
144
172
inline size_t get_segment_size () const { return m_segment_size; }
145
173
inline size_t get_num_cells_per_tile () const { return m_num_cells_per_tile; }
146
174
inline int64_t get_tiledb_compression_level () const { return m_tiledb_compression_level; }
147
- inline const std::string& get_vid_mapping_filename () const { return m_vid_mapping_file; }
148
- inline const std::string& get_callset_mapping_filename () const { return m_callset_mapping_file; }
149
- inline RowRange get_row_bounds () const { return RowRange (m_lb_callset_row_idx, m_ub_callset_row_idx); }
150
- inline void set_vid_mapper_file_required (bool val) {
151
- m_vid_mapper_file_required = val;
152
- }
153
175
inline bool fail_if_updating () const { return m_fail_if_updating; }
154
176
inline bool consolidate_tiledb_array_after_load () const { return m_consolidate_tiledb_array_after_load; }
155
177
inline bool discard_missing_GTs () const { return m_discard_missing_GTs; }
@@ -186,8 +208,6 @@ class JSONLoaderConfig : public JSONConfigBase
186
208
size_t m_num_cells_per_tile;
187
209
// TileDB compression level
188
210
int m_tiledb_compression_level;
189
- // flag to say whether vid_mapping_file is required or optional
190
- bool m_vid_mapper_file_required;
191
211
// flag that causes the loader to fail if this is an update (rather than a fresh load)
192
212
bool m_fail_if_updating;
193
213
// consolidate TileDB array after load - merges fragments
@@ -197,59 +217,8 @@ class JSONLoaderConfig : public JSONConfigBase
197
217
// The array will NOT contain mandatory VCF fields (ref, alt, qual, filter)
198
218
// if this flag is enabled
199
219
bool m_no_mandatory_VCF_fields;
200
- };
201
-
202
- #ifdef HTSDIR
203
-
204
- class JSONVCFAdapterConfig : public JSONConfigBase
205
- {
206
- public:
207
- JSONVCFAdapterConfig () : JSONConfigBase()
208
- {
209
- m_vcf_header_filename = " " ;
210
- m_determine_sites_with_max_alleles = 0 ;
211
- m_combined_vcf_records_buffer_size_limit = DEFAULT_COMBINED_VCF_RECORDS_BUFFER_SIZE;
212
- }
213
- ~JSONVCFAdapterConfig ()
214
- {
215
- if (is_tmp_vcf_header_filename) {
216
- unlink (m_vcf_header_filename.c_str ());
217
- }
218
- }
219
- void read_from_file (const std::string& filename,
220
- VCFAdapter& vcf_adapter,
221
- VidMapper* id_mapper,
222
- std::string output_format=" " , int rank=0 ,
223
- const size_t combined_vcf_records_buffer_size_limit=0u );
224
- inline unsigned get_determine_sites_with_max_alleles () const { return m_determine_sites_with_max_alleles; }
225
- inline unsigned get_max_diploid_alt_alleles_that_can_be_genotyped () const { return m_max_diploid_alt_alleles_that_can_be_genotyped; }
226
- inline size_t get_combined_vcf_records_buffer_size_limit () const { return m_combined_vcf_records_buffer_size_limit; }
227
220
protected:
228
- std::string m_vcf_header_filename;
229
- std::string m_reference_genome;
230
- std::string m_vcf_output_filename;
231
- // Count max #alt alleles , don't create combined gVCF
232
- unsigned m_determine_sites_with_max_alleles;
233
- // Max diploid alleles for which fields whose length is equal to the number of genotypes can be produced (such as PL)
234
- unsigned m_max_diploid_alt_alleles_that_can_be_genotyped;
235
- // Buffer size for combined vcf records
236
- size_t m_combined_vcf_records_buffer_size_limit;
237
- private:
238
- bool is_tmp_vcf_header_filename = false ;
239
- };
240
-
241
- class JSONVCFAdapterQueryConfig : public JSONVCFAdapterConfig , public JSONBasicQueryConfig
242
- {
243
- public:
244
- JSONVCFAdapterQueryConfig () : JSONVCFAdapterConfig(), JSONBasicQueryConfig() { ; }
245
- void read_from_file (const std::string& filename, VariantQueryConfig& query_config,
246
- VCFAdapter& vcf_adapter, VidMapper* id_mapper,
247
- std::string output_format=" " , int rank=0 ,
248
- const size_t combined_vcf_records_buffer_size_limit=0u );
221
+ void fix_callset_row_idx_bounds (const int rank);
249
222
};
250
223
251
-
252
-
253
- #endif
254
-
255
224
#endif
0 commit comments