diff --git a/c++/build-support/README.md b/c++/build-support/README.md index 0ffad788e8..80966104bb 100644 --- a/c++/build-support/README.md +++ b/c++/build-support/README.md @@ -11,7 +11,7 @@ To use `run_clang_format.py` you could act like below: ```shell mkdir build cd build -cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 +cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1 make check-format # Do checks only make format # This would apply suggested changes, take care! ``` @@ -23,7 +23,7 @@ To use `run_clang_tidy.py` you could act like below: ```shell mkdir build cd build -cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 +cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1 make -j`nproc` # Important make check-clang-tidy # Do checks only make fix-clang-tidy # This would apply suggested changes, take care! diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index b015b64910..e9f420f113 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -498,9 +498,11 @@ namespace orc { /** * Get the statistics about a stripe. * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about - * @return the statistics about that stripe + * @param includeRowIndex whether the row index of the stripe is included + * @return the statistics about that stripe and row group index statistics */ - virtual std::unique_ptr getStripeStatistics(uint64_t stripeIndex) const = 0; + virtual std::unique_ptr getStripeStatistics( + uint64_t stripeIndex, bool includeRowIndex = true) const = 0; /** * Get the length of the data stripes in the file. diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index c93c62f6c4..17bf835203 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -751,27 +751,35 @@ namespace orc { return *(contents_->schema.get()); } - std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { + std::unique_ptr ReaderImpl::getStripeStatistics(uint64_t stripeIndex, + bool includeRowIndex) const { if (!isMetadataLoaded_) { readMetadata(); } if (contents_->metadata == nullptr) { throw std::logic_error("No stripe statistics in file"); } - size_t num_cols = static_cast( - contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); - std::vector> indexStats(num_cols); proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast(stripeIndex)); proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get()); - getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); - const Timezone& writerTZ = currentStripeFooter.has_writer_timezone() ? getTimezoneByName(currentStripeFooter.writer_timezone()) : getLocalTimezone(); StatContext statContext(hasCorrectStatistics(), &writerTZ); - return std::make_unique( + + if (!includeRowIndex) { + return std::make_unique( + contents_->metadata->stripe_stats(static_cast(stripeIndex)), statContext); + } + + size_t num_cols = static_cast( + contents_->metadata->stripe_stats(static_cast(stripeIndex)).col_stats_size()); + std::vector> indexStats(num_cols); + + getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); + + return std::make_unique( contents_->metadata->stripe_stats(static_cast(stripeIndex)), indexStats, statContext); } diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 39ca739675..3d81d26920 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -330,7 +330,8 @@ namespace orc { const std::string& getStreamName() const override; - std::unique_ptr getStripeStatistics(uint64_t stripeIndex) const override; + std::unique_ptr getStripeStatistics( + uint64_t stripeIndex, bool includeRowIndex = true) const override; std::unique_ptr createRowReader() const override; diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index 76fd736b27..c1a23cad16 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -81,11 +81,20 @@ namespace orc { // PASS } - StripeStatisticsImpl::StripeStatisticsImpl( + StripeStatisticsImpl::StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext) { + columnStats_ = std::make_unique(stripeStats, statContext); + } + + StripeStatisticsWithRowGroupIndexImpl::~StripeStatisticsWithRowGroupIndexImpl() { + // PASS + } + + StripeStatisticsWithRowGroupIndexImpl::StripeStatisticsWithRowGroupIndexImpl( const proto::StripeStatistics& stripeStats, std::vector >& indexStats, - const StatContext& statContext) { - columnStats_ = std::make_unique(stripeStats, statContext); + const StatContext& statContext) + : StripeStatisticsImpl(stripeStats, statContext) { rowIndexStats_.resize(indexStats.size()); for (size_t i = 0; i < rowIndexStats_.size(); i++) { for (size_t j = 0; j < indexStats[i].size(); j++) { diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index 6f212c15cc..b7ed5d1e56 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -1713,7 +1713,6 @@ namespace orc { class StripeStatisticsImpl : public StripeStatistics { private: std::unique_ptr columnStats_; - std::vector > > rowIndexStats_; // DELIBERATELY NOT IMPLEMENTED StripeStatisticsImpl(const StripeStatisticsImpl&); @@ -1721,7 +1720,6 @@ namespace orc { public: StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, - std::vector >& indexStats, const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { @@ -1732,13 +1730,38 @@ namespace orc { return columnStats_->getNumberOfColumns(); } + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t, uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + + virtual ~StripeStatisticsImpl() override; + + virtual uint32_t getNumberOfRowIndexStats(uint32_t) const override { + throw NotImplementedYet("set includeRowIndex true to get row index stats"); + } + }; + + class StripeStatisticsWithRowGroupIndexImpl : public StripeStatisticsImpl { + private: + std::vector > > rowIndexStats_; + + // DELIBERATELY NOT IMPLEMENTED + StripeStatisticsWithRowGroupIndexImpl(const StripeStatisticsWithRowGroupIndexImpl&); + StripeStatisticsWithRowGroupIndexImpl& operator=(const StripeStatisticsWithRowGroupIndexImpl&); + + public: + StripeStatisticsWithRowGroupIndexImpl( + const proto::StripeStatistics& stripeStats, + std::vector >& indexStats, + const StatContext& statContext); + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, uint32_t rowIndex) const override { // check id indices are valid return rowIndexStats_[columnId][rowIndex].get(); } - virtual ~StripeStatisticsImpl() override; + virtual ~StripeStatisticsWithRowGroupIndexImpl() override; uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { return static_cast(rowIndexStats_[columnId].size()); diff --git a/c++/test/TestStripeIndexStatistics.cc b/c++/test/TestStripeIndexStatistics.cc index 85fdb80e4f..a529792c17 100644 --- a/c++/test/TestStripeIndexStatistics.cc +++ b/c++/test/TestStripeIndexStatistics.cc @@ -83,6 +83,31 @@ namespace orc { "length: " "8000\n", stringColStats->toString()); + + std::unique_ptr stripeLevelStats = reader->getStripeStatistics(0, false); + const orc::IntegerColumnStatistics* stripeLevelIntColStats; + stripeLevelIntColStats = reinterpret_cast( + stripeLevelStats->getColumnStatistics(1)); + EXPECT_EQ( + "Data type: Integer\nValues: 6000\nHas null: yes\nMinimum: 1\nMaximum: 6000\nSum: " + "18003000\n", + stripeLevelIntColStats->toString()); + + const orc::StringColumnStatistics* stripeLevelStringColStats; + stripeLevelStringColStats = reinterpret_cast( + stripeLevelStats->getColumnStatistics(2)); + EXPECT_EQ( + "Data type: String\nValues: 6000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: " + "23892\n", + stripeLevelStringColStats->toString()); + + intColStats = + reinterpret_cast(stripeStats->getColumnStatistics(1)); + stringColStats = + reinterpret_cast(stripeStats->getColumnStatistics(2)); + + EXPECT_EQ(intColStats->toString(), stripeLevelIntColStats->toString()); + EXPECT_EQ(stringColStats->toString(), stripeLevelStringColStats->toString()); } } // namespace orc diff --git a/c++/test/TestTimestampStatistics.cc b/c++/test/TestTimestampStatistics.cc index d20a049557..e005fa6cf6 100644 --- a/c++/test/TestTimestampStatistics.cc +++ b/c++/test/TestTimestampStatistics.cc @@ -68,6 +68,19 @@ namespace orc { "00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 " "00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", stripeColStats->toString()); + + std::unique_ptr stripeStatsWithOutRowIndex = + reader->getStripeStatistics(0, false); + const orc::TimestampColumnStatistics* stripeColStatsOnly = + reinterpret_cast( + stripeStatsWithOutRowIndex->getColumnStatistics(0)); + + EXPECT_TRUE(stripeColStatsOnly->hasMinimum()); + EXPECT_TRUE(stripeColStatsOnly->hasMaximum()); + EXPECT_EQ(stripeColStats->toString(), stripeColStatsOnly->toString()); + EXPECT_EQ(stripeStats->getNumberOfColumns(), stripeStatsWithOutRowIndex->getNumberOfColumns()); + EXPECT_THROW(stripeStatsWithOutRowIndex->getRowIndexStatistics(1, 1), NotImplementedYet); + EXPECT_THROW(stripeStatsWithOutRowIndex->getNumberOfRowIndexStats(1), NotImplementedYet); } TEST(TestTimestampStatistics, testTimezoneUTC) {