Skip to content
This repository was archived by the owner on Jan 3, 2023. It is now read-only.

Commit 34508e9

Browse files
committed
* Further simplification of the config structures.
* Avoid long parameter lists in function calls by passing config object(s).
* Java API handled - configs are still passed as temp JSON files, i.e., Protobuf objects are serialized to JSON and then read by the C++ modules. However, there is no more mixing of JSON and Protobuf structures in the JNI functions. Eventually, I would like to add a set of Protobuf JNI functions, but this is low priority.
* Modifications to the Protobuf structures to allow editing of vid…
1 parent cf01ad4 commit 34508e9

23 files changed

+704
-783
lines changed

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ set(LIBDBI_DIR "" CACHE PATH "Path to libdbi install directory")
3737
set(MAVEN_QUIET False CACHE BOOL "Do not print mvn messages")
3838
set(GENOMICSDB_SPARK_PROFILE "" CACHE STRING "Profile to choose Apache Spark version") # used in Maven builds
3939
set(IPPROOT "" CACHE PATH "Path to IPP libraries - used when optimized zlib is used")
40-
set(GENERATE_PROTOBUF_FILES_IN_BUILD_DIR False CACHE BOOL "Generate all protobuf files in build directory")
40+
set(GENERATE_PROTOBUF_FILES_IN_BUILD_DIR True CACHE BOOL "Generate all protobuf files in build directory")
4141
if (NOT CMAKE_BUILD_TYPE)
4242
set(CMAKE_BUILD_TYPE Release)
4343
endif()

example/java/TestBufferStreamGenomicsDBImporter.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ public static void main(final String[] args)
147147
Boolean.parseBoolean(args[argsLoaderFileIdx + 6]);
148148
//<loader.json> first arg
149149
String loaderJSONFile = args[argsLoaderFileIdx];
150-
GenomicsDBImporter loader = new GenomicsDBImporter(loaderJSONFile, rank, lbRowIdx, ubRowIdx);
150+
GenomicsDBImporter loader = new GenomicsDBImporter(loaderJSONFile, rank);
151151
//<stream_name_to_file.json> - useful for the driver only
152152
//JSON file that contains "stream_name": "vcf_file_path" entries
153153
FileReader mappingReader = new FileReader(args[argsLoaderFileIdx+1]);

example/java/TestGenomicsDB.java

+18-33
Original file line numberDiff line numberDiff line change
@@ -157,15 +157,13 @@ public enum ArgsIdxEnum
157157
ARGS_IDX_DO_LOAD(1001),
158158
ARGS_IDX_REFERENCE_GENOME(1002),
159159
ARGS_IDX_TEMPLATE_VCF_HEADER(1003),
160-
ARGS_IDX_LB_ROW_IDX(1004),
161-
ARGS_IDX_UB_ROW_IDX(1005),
162-
ARGS_IDX_CHROMOSOME(1006),
163-
ARGS_IDX_BEGIN(1007),
164-
ARGS_IDX_END(1008),
165-
ARGS_IDX_COUNT_ONLY(1009),
166-
ARGS_IDX_PASS_AS_VCF(1010),
167-
ARGS_IDX_PASS_THROUGH_QUERY_JSON(1011),
168-
ARGS_IDX_AFTER_LAST_ARG_IDX(1012);
160+
ARGS_IDX_CHROMOSOME(1004),
161+
ARGS_IDX_BEGIN(1005),
162+
ARGS_IDX_END(1006),
163+
ARGS_IDX_COUNT_ONLY(1007),
164+
ARGS_IDX_PASS_AS_VCF(1008),
165+
ARGS_IDX_PASS_THROUGH_QUERY_JSON(1009),
166+
ARGS_IDX_AFTER_LAST_ARG_IDX(1010);
169167

170168
private final int mArgsIdx;
171169
ArgsIdxEnum(final int idx)
@@ -182,27 +180,22 @@ int idx()
182180
public static void main(final String[] args) throws IOException
183181
{
184182
int firstEnumIdx = ArgsIdxEnum.ARGS_IDX_DO_QUERY.idx();
185-
LongOpt[] longopts = new LongOpt[16];
183+
LongOpt[] longopts = new LongOpt[14];
186184
longopts[0] = new LongOpt("query", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_DO_QUERY.idx());
187185
longopts[1] = new LongOpt("load", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_DO_LOAD.idx());
188186
//Specify rank (or partition idx) of this process
189187
longopts[2] = new LongOpt("rank", LongOpt.REQUIRED_ARGUMENT, null, 'r');
190-
longopts[5] = new LongOpt("workspace", LongOpt.REQUIRED_ARGUMENT, null, 'w');
191-
longopts[6] = new LongOpt("array", LongOpt.REQUIRED_ARGUMENT, null, 'A');
192188
longopts[3] = new LongOpt("reference_genome", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_REFERENCE_GENOME.idx());
193189
longopts[4] = new LongOpt("template_vcf_header", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_TEMPLATE_VCF_HEADER.idx());
194-
//Specify smallest row idx from which to start loading - useful for
195-
//incremental loading into existing array
196-
longopts[7] = new LongOpt("lb_row_idx", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_LB_ROW_IDX.idx());
197-
//Specify largest row idx up to which loading should be performed - for completeness
198-
longopts[8] = new LongOpt("ub_row_idx", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_UB_ROW_IDX.idx());
199-
longopts[9] = new LongOpt("chromosome", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_CHROMOSOME.idx());
200-
longopts[10] = new LongOpt("begin", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_BEGIN.idx());
201-
longopts[11] = new LongOpt("end", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_END.idx());
202-
longopts[12] = new LongOpt("loader_json_file", LongOpt.REQUIRED_ARGUMENT, null, 'l');
203-
longopts[13] = new LongOpt("count_only", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_COUNT_ONLY.idx());
204-
longopts[14] = new LongOpt("pass_as_vcf", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_PASS_AS_VCF.idx());
205-
longopts[15] = new LongOpt("pass_through_query_json", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_PASS_THROUGH_QUERY_JSON.idx());
190+
longopts[5] = new LongOpt("workspace", LongOpt.REQUIRED_ARGUMENT, null, 'w');
191+
longopts[6] = new LongOpt("array", LongOpt.REQUIRED_ARGUMENT, null, 'A');
192+
longopts[7] = new LongOpt("chromosome", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_CHROMOSOME.idx());
193+
longopts[8] = new LongOpt("begin", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_BEGIN.idx());
194+
longopts[9] = new LongOpt("end", LongOpt.REQUIRED_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_END.idx());
195+
longopts[10] = new LongOpt("loader_json_file", LongOpt.REQUIRED_ARGUMENT, null, 'l');
196+
longopts[11] = new LongOpt("count_only", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_COUNT_ONLY.idx());
197+
longopts[12] = new LongOpt("pass_as_vcf", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_PASS_AS_VCF.idx());
198+
longopts[13] = new LongOpt("pass_through_query_json", LongOpt.NO_ARGUMENT, null, ArgsIdxEnum.ARGS_IDX_PASS_THROUGH_QUERY_JSON.idx());
206199
if(args.length < 2)
207200
{
208201
System.err.println("Usage:\n\tFor querying: --query <loader.json> [<query.json> |"
@@ -220,8 +213,6 @@ public static void main(final String[] args) throws IOException
220213
String array = "";
221214
String referenceGenome = "";
222215
String templateVCFHeader = "";
223-
long lbRowIdx = 0;
224-
long ubRowIdx = Long.MAX_VALUE-1;
225216
String chromosome = "";
226217
int chrBegin = 1;
227218
int chrEnd = Integer.MAX_VALUE-1;
@@ -270,12 +261,6 @@ public static void main(final String[] args) throws IOException
270261
case ARGS_IDX_TEMPLATE_VCF_HEADER:
271262
templateVCFHeader = g.getOptarg();
272263
break;
273-
case ARGS_IDX_LB_ROW_IDX:
274-
lbRowIdx = Long.parseLong(g.getOptarg());
275-
break;
276-
case ARGS_IDX_UB_ROW_IDX:
277-
ubRowIdx = Long.parseLong(g.getOptarg());
278-
break;
279264
case ARGS_IDX_CHROMOSOME:
280265
chromosome = g.getOptarg();
281266
break;
@@ -338,7 +323,7 @@ public static void main(final String[] args) throws IOException
338323
loaderJSONFile = args[g.getOptind()];
339324
//<loader.json>
340325
GenomicsDBImporter loader = new GenomicsDBImporter(loaderJSONFile);
341-
loader.write(rank, lbRowIdx, ubRowIdx);
326+
loader.write(rank);
342327
}
343328
}
344329
}

src/main/cpp/include/config/genomicsdb_config_base.h

+1
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ class GenomicsDBConfigBase
7979
void set_segment_size(const size_t v) { m_segment_size = v; }
8080
inline unsigned get_determine_sites_with_max_alleles() const { return m_determine_sites_with_max_alleles; }
8181
inline unsigned get_max_diploid_alt_alleles_that_can_be_genotyped() const { return m_max_diploid_alt_alleles_that_can_be_genotyped; }
82+
void set_combined_vcf_records_buffer_size_limit(const size_t val) { m_combined_vcf_records_buffer_size_limit = val; }
8283
inline size_t get_combined_vcf_records_buffer_size_limit() const { return m_combined_vcf_records_buffer_size_limit; }
8384
void set_vcf_header_filename(const std::string& vcf_header_filename);
8485
const std::string& get_vcf_header_filename() const { return m_vcf_header_filename; }

src/main/cpp/include/config/json_config.h

+2
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,6 @@ class JSONConfigBase : public GenomicsDBConfigBase
5353
rapidjson::Document m_json;
5454
};
5555

56+
rapidjson::Document parse_json_file(const std::string& s);
57+
5658
#endif

src/main/cpp/include/utils/vid_mapper.h

+16-68
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,16 @@ class VidMapper
803803
const int num_partition_callset_mapping_files,
804804
const int rank) const;
805805

806+
//Buffer streams for import
807+
void set_buffer_stream_info(
808+
const std::vector<BufferStreamInfo>& buffer_stream_info_vec);
809+
//Read callsets information from json_doc structure
810+
void read_callsets_info(const rapidjson::Value& json_doc, const int rank=0);
811+
void parse_callsets_json(
812+
const std::string& filename,
813+
const bool is_file);
814+
void parse_callsets_json(
815+
const rapidjson::Value& callsets_container);
806816

807817
protected:
808818
void add_mandatory_fields();
@@ -876,78 +886,16 @@ class FileBasedVidMapper : public VidMapper
876886
{
877887
}
878888

879-
FileBasedVidMapper(
880-
const std::string& filename,
881-
const std::string& callset_mapping_file="",
882-
const bool is_callset_mapping_required=true) : VidMapper() {
883-
std::vector<BufferStreamInfo> empty_vec;
884-
common_constructor_initialization(
885-
filename,
886-
empty_vec,
887-
callset_mapping_file,
888-
"",
889-
is_callset_mapping_required);
890-
}
891-
892-
FileBasedVidMapper(
893-
const rapidjson::Document& json_doc,
894-
const std::string& callset_mapping_file="",
895-
const bool is_callset_mapping_required=true) : VidMapper() {
896-
std::vector<BufferStreamInfo> empty_vec;
897-
common_constructor_initialization(
898-
json_doc,
899-
empty_vec,
900-
callset_mapping_file,
901-
"",
902-
is_callset_mapping_required);
903-
}
904-
905-
FileBasedVidMapper(
906-
const rapidjson::Document& json_doc,
907-
const std::vector<BufferStreamInfo>& buffer_stream_info_vec,
908-
const std::string& callset_mapping_file="",
909-
const std::string& buffer_stream_callset_mapping_json_string="",
910-
const bool is_callset_mapping_required=true) : VidMapper() {
911-
common_constructor_initialization(
912-
json_doc,
913-
buffer_stream_info_vec,
914-
callset_mapping_file,
915-
buffer_stream_callset_mapping_json_string,
916-
is_callset_mapping_required);
917-
}
918-
919-
FileBasedVidMapper(
920-
const std::string& filename,
921-
const std::vector<BufferStreamInfo>& buffer_stream_info_vec,
922-
const std::string& callset_mapping_file="",
923-
const std::string& buffer_stream_callset_mapping_json_string="",
924-
const bool is_callset_mapping_required=true) : VidMapper() {
925-
common_constructor_initialization(
926-
filename,
927-
buffer_stream_info_vec,
928-
callset_mapping_file,
929-
buffer_stream_callset_mapping_json_string,
930-
is_callset_mapping_required);
889+
FileBasedVidMapper(const std::string& filename);
890+
FileBasedVidMapper(const rapidjson::Value& json_doc)
891+
: VidMapper()
892+
{
893+
common_constructor_initialization(json_doc);
931894
}
932895
private:
933896
void common_constructor_initialization(
934-
const std::string& filename,
935-
const std::vector<BufferStreamInfo>& buffer_stream_info_vec,
936-
const std::string& callset_mapping_file="",
937-
const std::string& buffer_stream_callset_mapping_json_string="",
938-
const bool is_callset_mapping_required=true);
939-
940-
void common_constructor_initialization(
941-
const rapidjson::Document& json_doc,
942-
const std::vector<BufferStreamInfo>& buffer_stream_info_vec,
943-
const std::string& callset_mapping_file="",
944-
const std::string& buffer_stream_callset_mapping_json_string="",
945-
const bool is_callset_mapping_required=true);
897+
const rapidjson::Value& json_doc);
946898
private:
947-
void parse_callsets_json(
948-
const std::string& filename,
949-
const std::vector<BufferStreamInfo>& buffer_stream_info_vec,
950-
const bool is_file);
951899
void parse_length_descriptor(const char* field_name,
952900
const rapidjson::Value& length_json_value,
953901
FieldLengthDescriptor& length_descriptor, const size_t length_dim_idx);

src/main/cpp/include/vcf/vcf_adapter.h

+14-3
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class VCFAdapter
9595
VCFAdapter(bool open_output=true);
9696
virtual ~VCFAdapter();
9797
void clear();
98-
void initialize(const GenomicsDBConfigBase& config_base);
98+
virtual void initialize(const GenomicsDBConfigBase& config_base);
9999
//Allocates header
100100
bcf_hdr_t* initialize_default_header();
101101
bcf_hdr_t* get_vcf_header() { return m_template_vcf_hdr; }
@@ -156,7 +156,7 @@ class BufferedVCFAdapter : public VCFAdapter, public CircularBufferController
156156
class VCFSerializedBufferAdapter: public VCFAdapter
157157
{
158158
public:
159-
VCFSerializedBufferAdapter(bool keep_idx_fields_in_bcf_header=true)
159+
VCFSerializedBufferAdapter(bool keep_idx_fields_in_bcf_header=true, bool do_output=false)
160160
: VCFAdapter(false)
161161
{
162162
m_keep_idx_fields_in_bcf_header = keep_idx_fields_in_bcf_header;
@@ -166,17 +166,23 @@ class VCFSerializedBufferAdapter: public VCFAdapter
166166
m_hts_string.m = 4096u;
167167
m_hts_string.s = (char*)malloc(m_hts_string.m);
168168
assert(m_hts_string.s);
169+
m_write_fptr = 0;
170+
m_do_output = do_output;
169171
}
170172
~VCFSerializedBufferAdapter()
171173
{
172174
if(m_hts_string.s && m_hts_string.m > 0)
173175
free(m_hts_string.s);
174176
m_hts_string.s = 0;
175177
m_hts_string.m = 0;
178+
if(m_write_fptr && m_write_fptr != stdout && m_write_fptr != stderr)
179+
fclose(m_write_fptr);
180+
m_write_fptr = 0;
176181
}
177182
//Delete copy and move constructors
178183
VCFSerializedBufferAdapter(const VCFSerializedBufferAdapter& other) = delete;
179184
VCFSerializedBufferAdapter(VCFSerializedBufferAdapter&& other) = delete;
185+
void initialize(const GenomicsDBConfigBase& config_base);
180186
void set_buffer(RWBuffer& buffer) { m_rw_buffer = &buffer; }
181187
void print_header();
182188
void handoff_output_bcf_line(bcf1_t*& line, const size_t bcf_record_size);
@@ -188,12 +194,17 @@ class VCFSerializedBufferAdapter: public VCFAdapter
188194
}
189195
void do_output()
190196
{
191-
//FIXME: used only for debugging - write out to stdout
197+
assert(m_write_fptr);
198+
assert(m_rw_buffer);
199+
auto write_size = fwrite(&(m_rw_buffer->m_buffer[0]), 1u, m_rw_buffer->m_num_valid_bytes, m_write_fptr);
200+
assert(write_size == m_rw_buffer->m_num_valid_bytes);
192201
}
193202
private:
194203
bool m_keep_idx_fields_in_bcf_header;
195204
RWBuffer* m_rw_buffer;
196205
kstring_t m_hts_string;
206+
bool m_do_output;
207+
FILE* m_write_fptr;
197208
};
198209

199210
#endif //ifdef HTSDIR

0 commit comments

Comments
 (0)