Skip to content

Commit f9c8067

Browse files
wushap authored and dongjoon-hyun committed
ORC-1858: [C++] Add support to get StripeStatistics without row index
### What changes were proposed in this pull request? Add a C++ API in the reader to get stripe-level statistics without reading the row group index. ### Why are the changes needed? To address #2137. ### How was this patch tested? Unit tests pass. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #2144 from wushap/main. Authored-by: lan <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 85159c5 commit f9c8067

8 files changed

+99
-18
lines changed

c++/build-support/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ To use `run_clang_format.py` you could act like below:
1111
```shell
1212
mkdir build
1313
cd build
14-
cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1
14+
cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1
1515
make check-format # Do checks only
1616
make format # This would apply suggested changes, take care!
1717
```
@@ -23,7 +23,7 @@ To use `run_clang_tidy.py` you could act like below:
2323
```shell
2424
mkdir build
2525
cd build
26-
cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1
26+
cmake .. -DBUILD_JAVA=OFF -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DORC_ENABLE_CLANG_TOOLS=1
2727
make -j`nproc` # Important
2828
make check-clang-tidy # Do checks only
2929
make fix-clang-tidy # This would apply suggested changes, take care!

c++/include/orc/Reader.hh

+4-2
Original file line numberDiff line numberDiff line change
@@ -498,9 +498,11 @@ namespace orc {
498498
/**
499499
* Get the statistics about a stripe.
500500
* @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
501-
* @return the statistics about that stripe
501+
* @param includeRowIndex whether to also read the stripe's row group index and include its statistics (defaults to true)
502+
* @return the statistics about that stripe; row group index statistics are included only when includeRowIndex is true
502503
*/
503-
virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;
504+
virtual std::unique_ptr<StripeStatistics> getStripeStatistics(
505+
uint64_t stripeIndex, bool includeRowIndex = true) const = 0;
504506

505507
/**
506508
* Get the length of the data stripes in the file.

c++/src/Reader.cc

+15-7
Original file line numberDiff line numberDiff line change
@@ -751,27 +751,35 @@ namespace orc {
751751
return *(contents_->schema.get());
752752
}
753753

754-
std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
754+
std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex,
755+
bool includeRowIndex) const {
755756
if (!isMetadataLoaded_) {
756757
readMetadata();
757758
}
758759
if (contents_->metadata == nullptr) {
759760
throw std::logic_error("No stripe statistics in file");
760761
}
761-
size_t num_cols = static_cast<size_t>(
762-
contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size());
763-
std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols);
764762

765763
proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast<int>(stripeIndex));
766764
proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get());
767765

768-
getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
769-
770766
const Timezone& writerTZ = currentStripeFooter.has_writer_timezone()
771767
? getTimezoneByName(currentStripeFooter.writer_timezone())
772768
: getLocalTimezone();
773769
StatContext statContext(hasCorrectStatistics(), &writerTZ);
774-
return std::make_unique<StripeStatisticsImpl>(
770+
771+
if (!includeRowIndex) {
772+
return std::make_unique<StripeStatisticsImpl>(
773+
contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)), statContext);
774+
}
775+
776+
size_t num_cols = static_cast<size_t>(
777+
contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size());
778+
std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols);
779+
780+
getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
781+
782+
return std::make_unique<StripeStatisticsWithRowGroupIndexImpl>(
775783
contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext);
776784
}
777785

c++/src/Reader.hh

+2-1
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,8 @@ namespace orc {
330330

331331
const std::string& getStreamName() const override;
332332

333-
std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const override;
333+
std::unique_ptr<StripeStatistics> getStripeStatistics(
334+
uint64_t stripeIndex, bool includeRowIndex = true) const override;
334335

335336
std::unique_ptr<RowReader> createRowReader() const override;
336337

c++/src/Statistics.cc

+12-3
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,20 @@ namespace orc {
8181
// PASS
8282
}
8383

84-
StripeStatisticsImpl::StripeStatisticsImpl(
84+
StripeStatisticsImpl::StripeStatisticsImpl(const proto::StripeStatistics& stripeStats,
85+
const StatContext& statContext) {
86+
columnStats_ = std::make_unique<StatisticsImpl>(stripeStats, statContext);
87+
}
88+
89+
StripeStatisticsWithRowGroupIndexImpl::~StripeStatisticsWithRowGroupIndexImpl() {
90+
// PASS
91+
}
92+
93+
StripeStatisticsWithRowGroupIndexImpl::StripeStatisticsWithRowGroupIndexImpl(
8594
const proto::StripeStatistics& stripeStats,
8695
std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
87-
const StatContext& statContext) {
88-
columnStats_ = std::make_unique<StatisticsImpl>(stripeStats, statContext);
96+
const StatContext& statContext)
97+
: StripeStatisticsImpl(stripeStats, statContext) {
8998
rowIndexStats_.resize(indexStats.size());
9099
for (size_t i = 0; i < rowIndexStats_.size(); i++) {
91100
for (size_t j = 0; j < indexStats[i].size(); j++) {

c++/src/Statistics.hh

+26-3
Original file line numberDiff line numberDiff line change
@@ -1713,15 +1713,13 @@ namespace orc {
17131713
class StripeStatisticsImpl : public StripeStatistics {
17141714
private:
17151715
std::unique_ptr<StatisticsImpl> columnStats_;
1716-
std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats_;
17171716

17181717
// DELIBERATELY NOT IMPLEMENTED
17191718
StripeStatisticsImpl(const StripeStatisticsImpl&);
17201719
StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
17211720

17221721
public:
17231722
StripeStatisticsImpl(const proto::StripeStatistics& stripeStats,
1724-
std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
17251723
const StatContext& statContext);
17261724

17271725
virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
@@ -1732,13 +1730,38 @@ namespace orc {
17321730
return columnStats_->getNumberOfColumns();
17331731
}
17341732

1733+
virtual const ColumnStatistics* getRowIndexStatistics(uint32_t, uint32_t) const override {
1734+
throw NotImplementedYet("set includeRowIndex true to get row index stats");
1735+
}
1736+
1737+
virtual ~StripeStatisticsImpl() override;
1738+
1739+
virtual uint32_t getNumberOfRowIndexStats(uint32_t) const override {
1740+
throw NotImplementedYet("set includeRowIndex true to get row index stats");
1741+
}
1742+
};
1743+
1744+
class StripeStatisticsWithRowGroupIndexImpl : public StripeStatisticsImpl {
1745+
private:
1746+
std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats_;
1747+
1748+
// DELIBERATELY NOT IMPLEMENTED
1749+
StripeStatisticsWithRowGroupIndexImpl(const StripeStatisticsWithRowGroupIndexImpl&);
1750+
StripeStatisticsWithRowGroupIndexImpl& operator=(const StripeStatisticsWithRowGroupIndexImpl&);
1751+
1752+
public:
1753+
StripeStatisticsWithRowGroupIndexImpl(
1754+
const proto::StripeStatistics& stripeStats,
1755+
std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
1756+
const StatContext& statContext);
1757+
17351758
virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
17361759
uint32_t rowIndex) const override {
17371760
// check id indices are valid
17381761
return rowIndexStats_[columnId][rowIndex].get();
17391762
}
17401763

1741-
virtual ~StripeStatisticsImpl() override;
1764+
virtual ~StripeStatisticsWithRowGroupIndexImpl() override;
17421765

17431766
uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
17441767
return static_cast<uint32_t>(rowIndexStats_[columnId].size());

c++/test/TestStripeIndexStatistics.cc

+25
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,31 @@ namespace orc {
8383
"length: "
8484
"8000\n",
8585
stringColStats->toString());
86+
87+
std::unique_ptr<orc::Statistics> stripeLevelStats = reader->getStripeStatistics(0, false);
88+
const orc::IntegerColumnStatistics* stripeLevelIntColStats;
89+
stripeLevelIntColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
90+
stripeLevelStats->getColumnStatistics(1));
91+
EXPECT_EQ(
92+
"Data type: Integer\nValues: 6000\nHas null: yes\nMinimum: 1\nMaximum: 6000\nSum: "
93+
"18003000\n",
94+
stripeLevelIntColStats->toString());
95+
96+
const orc::StringColumnStatistics* stripeLevelStringColStats;
97+
stripeLevelStringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
98+
stripeLevelStats->getColumnStatistics(2));
99+
EXPECT_EQ(
100+
"Data type: String\nValues: 6000\nHas null: yes\nMinimum: 1000\nMaximum: 9a\nTotal length: "
101+
"23892\n",
102+
stripeLevelStringColStats->toString());
103+
104+
intColStats =
105+
reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getColumnStatistics(1));
106+
stringColStats =
107+
reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getColumnStatistics(2));
108+
109+
EXPECT_EQ(intColStats->toString(), stripeLevelIntColStats->toString());
110+
EXPECT_EQ(stringColStats->toString(), stripeLevelStringColStats->toString());
86111
}
87112

88113
} // namespace orc

c++/test/TestTimestampStatistics.cc

+13
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,19 @@ namespace orc {
6868
"00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 2037-01-01 "
6969
"00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n",
7070
stripeColStats->toString());
71+
72+
std::unique_ptr<orc::StripeStatistics> stripeStatsWithOutRowIndex =
73+
reader->getStripeStatistics(0, false);
74+
const orc::TimestampColumnStatistics* stripeColStatsOnly =
75+
reinterpret_cast<const orc::TimestampColumnStatistics*>(
76+
stripeStatsWithOutRowIndex->getColumnStatistics(0));
77+
78+
EXPECT_TRUE(stripeColStatsOnly->hasMinimum());
79+
EXPECT_TRUE(stripeColStatsOnly->hasMaximum());
80+
EXPECT_EQ(stripeColStats->toString(), stripeColStatsOnly->toString());
81+
EXPECT_EQ(stripeStats->getNumberOfColumns(), stripeStatsWithOutRowIndex->getNumberOfColumns());
82+
EXPECT_THROW(stripeStatsWithOutRowIndex->getRowIndexStatistics(1, 1), NotImplementedYet);
83+
EXPECT_THROW(stripeStatsWithOutRowIndex->getNumberOfRowIndexStats(1), NotImplementedYet);
7184
}
7285

7386
TEST(TestTimestampStatistics, testTimezoneUTC) {

0 commit comments

Comments
 (0)