Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CALCITE-6786] ANY/SOME operator yields multiple rows in correlated queries #4147

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -696,16 +696,15 @@ private static RexNode rewriteIn(RexSubQuery e, Set<CorrelationId> variablesSet,
if (variablesSet.isEmpty()) {
builder.aggregate(builder.groupKey(builder.field("cs")),
builder.count(false, "c"));

// sorts input with desc order since we are interested
// only in the case when one of the values is true.
// When true value is absent then we are interested
// only in false value.
builder.sortLimit(0, 1,
ImmutableList.of(builder.desc(builder.field("cs"))));
} else {
builder.distinct();
}
// sorts input with desc order since we are interested
// only in the case when one of the values is true.
// When true value is absent then we are interested
// only in false value.
builder.sortLimit(0, 1,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This includes the order by cs limit 1 clause in the correlated query case. Without it, multiple rows are returned, as described in the issue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this was written by @vvysotskyi about 6 years ago, it would be great if he could comment.

ImmutableList.of(builder.desc(builder.field("cs"))));
}
// clears expressionOperands and fields lists since
// all expressions were used in the filter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3624,18 +3624,20 @@ LogicalProject(EMPNO=[$0])
LogicalCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{1}])
LogicalProject(EMPNO=[$0], ENAME=[$1])
LogicalTableScan(table=[[CATALOG, SALES, EMPNULLABLES]])
LogicalSort(sort0=[$0], dir0=[DESC], fetch=[1])
LogicalAggregate(group=[{0}])
LogicalProject(cs=[IS NOT NULL($0)])
LogicalFilter(condition=[OR(=(1, $0), IS NULL($0))])
LogicalProject(DEPTNO=[$0])
LogicalFilter(condition=[AND(=($cor0.ENAME, CAST($1):VARCHAR(20)), >($0, 10))])
LogicalTableScan(table=[[CATALOG, SALES, DEPTNULLABLES]])
LogicalSort(sort0=[$0], dir0=[DESC], fetch=[1])
LogicalAggregate(group=[{0}])
LogicalProject(cs=[IS NOT NULL($0)])
LogicalFilter(condition=[OR(=(1, $0), IS NULL($0))])
LogicalFilter(condition=[OR(=(2, $0), IS NULL($0))])
LogicalProject(DEPTNO=[$0])
LogicalFilter(condition=[AND(=($cor0.ENAME, CAST($1):VARCHAR(20)), >($0, 10))])
LogicalFilter(condition=[AND(=($cor0.ENAME, CAST($1):VARCHAR(20)), <($0, 20))])
LogicalTableScan(table=[[CATALOG, SALES, DEPTNULLABLES]])
LogicalAggregate(group=[{0}])
LogicalProject(cs=[IS NOT NULL($0)])
LogicalFilter(condition=[OR(=(2, $0), IS NULL($0))])
LogicalProject(DEPTNO=[$0])
LogicalFilter(condition=[AND(=($cor0.ENAME, CAST($1):VARCHAR(20)), <($0, 20))])
LogicalTableScan(table=[[CATALOG, SALES, DEPTNULLABLES]])
]]>
</Resource>
</TestCase>
Expand Down
35 changes: 35 additions & 0 deletions core/src/test/resources/sql/some.iq
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,41 @@ where sal > some (4000, 2000);

!ok

# CALCITE-6786: ANY/SOME operator yields multiple rows in correlated queries

WITH tb as (select array(SELECT * FROM (VALUES (TRUE), (NULL)) as x(a)) as a)
SELECT TRUE IN (SELECT b FROM UNNEST(a) AS x1(b)) AS test FROM tb;
+------+
| TEST |
+------+
| true |
+------+
(1 row)

!ok

WITH tb as (select array(SELECT * FROM (VALUES (FALSE), (NULL)) as x(a)) as a)
SELECT TRUE IN (SELECT b FROM UNNEST(a) AS x1(b)) AS test FROM tb;
+------+
| TEST |
+------+
| |
+------+
(1 row)

!ok

WITH tb as (select array(SELECT * FROM (VALUES (FALSE), (TRUE)) as x(a)) as a)
SELECT TRUE IN (SELECT b FROM UNNEST(a) AS x1(b)) AS test FROM tb;
+------+
| TEST |
+------+
| true |
+------+
(1 row)

!ok

select sal, sal > some (4000, 2000, null) from "scott".emp;
+---------+--------+
| SAL | EXPR$1 |
Expand Down
73 changes: 43 additions & 30 deletions core/src/test/resources/sql/sub-query.iq
Original file line number Diff line number Diff line change
Expand Up @@ -1871,13 +1871,15 @@ select sal from "scott".emp e
(0 rows)

!ok
EnumerableCalc(expr#0..4=[{inputs}], expr#5=[RAND()], expr#6=[CAST($t5):INTEGER NOT NULL], expr#7=[2], expr#8=[MOD($t6, $t7)], expr#9=[3], expr#10=[=($t8, $t9)], expr#11=[OR($t10, $t4)], SAL=[$t1], $condition=[$t11])
EnumerableMergeJoin(condition=[=($2, $3)], joinType=[left])
EnumerableSort(sort0=[$2], dir0=[ASC])
EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[false], DEPTNO=[$t0], $f1=[$t3])
EnumerableTableScan(table=[[scott, DEPT]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[RAND()], expr#4=[CAST($t3):INTEGER NOT NULL], expr#5=[2], expr#6=[MOD($t4, $t5)], expr#7=[3], expr#8=[=($t6, $t7)], expr#9=[OR($t8, $t2)], SAL=[$t0], $condition=[$t9])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all these now contain correlated subqueries.
I wonder whether these tests expect uncorrelated results. This would indicate that either the decorrelator fails on the new plans, or perhaps it's run too early.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my understanding, the plan change is basically that the EnumerableMergeJoin was replaced with a EnumerableCorrelate. Aren't they similar in this case?

Naturally, the sort + limit is also introduced.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the plan after this PR is correct.
For decorrelate failures, [CALCITE-6652] reported this issue. I'm trying to submit a PR next month, and after that, the plan should be better.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe that correlated queries in general are much less efficient than uncorrelated ones.
Moreover, the Calcite decorrelator can only decorrelate a limited number of patterns.
That's why a decorrelated plan is preferred to a plan that has correlated subqueries.
However, correctness is more important than performance, so we should take this change if it fixes a correctness bug.
The question was whether we could have both correctness and performance, but that can be part of a separate issue.

EnumerableCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{1}])
EnumerableCalc(expr#0..7=[{inputs}], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableLimit(fetch=[1])
EnumerableSort(sort0=[$0], dir0=[DESC])
EnumerableAggregate(group=[{0}])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[false], expr#4=[$cor0], expr#5=[$t4.DEPTNO], expr#6=[=($t0, $t5)], cs=[$t3], $condition=[$t6])
EnumerableTableScan(table=[[scott, DEPT]])
!plan

# Test filter null IN nullable correlated
Expand Down Expand Up @@ -1958,13 +1960,15 @@ select sal from "scott".emp e
(0 rows)

!ok
EnumerableCalc(expr#0..4=[{inputs}], expr#5=[NOT($t4)], expr#6=[IS NOT NULL($t4)], expr#7=[OR($t5, $t6)], expr#8=[IS NOT TRUE($t7)], SAL=[$t1], $condition=[$t8])
EnumerableMergeJoin(condition=[=($2, $3)], joinType=[left])
EnumerableSort(sort0=[$2], dir0=[ASC])
EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[false], DEPTNO=[$t0], $f1=[$t3])
EnumerableTableScan(table=[[scott, DEPT]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[NOT($t2)], expr#4=[IS NOT NULL($t2)], expr#5=[OR($t3, $t4)], expr#6=[IS NOT TRUE($t5)], SAL=[$t0], $condition=[$t6])
EnumerableCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{1}])
EnumerableCalc(expr#0..7=[{inputs}], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableLimit(fetch=[1])
EnumerableSort(sort0=[$0], dir0=[DESC])
EnumerableAggregate(group=[{0}])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[false], expr#4=[$cor0], expr#5=[$t4.DEPTNO], expr#6=[=($t5, $t0)], cs=[$t3], $condition=[$t6])
EnumerableTableScan(table=[[scott, DEPT]])
!plan

# Test filter null NOT IN literal correlated
Expand Down Expand Up @@ -2027,13 +2031,14 @@ select sal from "scott".emp e
(11 rows)

!ok
EnumerableCalc(expr#0..4=[{inputs}], expr#5=[NOT($t4)], expr#6=[IS NOT NULL($t4)], expr#7=[OR($t5, $t6)], expr#8=[IS NOT TRUE($t7)], SAL=[$t1], $condition=[$t8])
EnumerableMergeJoin(condition=[=($2, $3)], joinType=[left])
EnumerableSort(sort0=[$2], dir0=[ASC])
EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[true], expr#4=[10], expr#5=[CAST($t0):INTEGER NOT NULL], expr#6=[=($t4, $t5)], DEPTNO1=[$t0], $f1=[$t3], $condition=[$t6])
EnumerableTableScan(table=[[scott, DEPT]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[NOT($t2)], expr#4=[IS NOT NULL($t2)], expr#5=[OR($t3, $t4)], expr#6=[IS NOT TRUE($t5)], SAL=[$t0], $condition=[$t6])
EnumerableCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{1}])
EnumerableCalc(expr#0..7=[{inputs}], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableLimit(fetch=[1])
EnumerableSort(sort0=[$0], dir0=[DESC])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[true], expr#4=[$cor0], expr#5=[$t4.DEPTNO], expr#6=[=($t5, $t0)], expr#7=[10], expr#8=[CAST($t0):INTEGER NOT NULL], expr#9=[=($t7, $t8)], expr#10=[AND($t6, $t9)], cs=[$t3], $condition=[$t10])
EnumerableTableScan(table=[[scott, DEPT]])
!plan

# Test filter literal NOT IN nullable correlated
Expand All @@ -2057,13 +2062,14 @@ select sal from "scott".emp e
(11 rows)

!ok
EnumerableCalc(expr#0..4=[{inputs}], expr#5=[NOT($t3)], expr#6=[IS NOT NULL($t3)], expr#7=[OR($t5, $t6)], expr#8=[IS NOT TRUE($t7)], SAL=[$t1], $condition=[$t8])
EnumerableMergeJoin(condition=[=($2, $4)], joinType=[left])
EnumerableSort(sort0=[$2], dir0=[ASC])
EnumerableCalc(expr#0..7=[{inputs}], EMPNO=[$t0], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[true], expr#4=[CAST($t0):INTEGER], expr#5=[10], expr#6=[=($t4, $t5)], cs=[$t3], DEPTNO=[$t0], $condition=[$t6])
EnumerableTableScan(table=[[scott, DEPT]])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[NOT($t2)], expr#4=[IS NOT NULL($t2)], expr#5=[OR($t3, $t4)], expr#6=[IS NOT TRUE($t5)], SAL=[$t0], $condition=[$t6])
EnumerableCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{1}])
EnumerableCalc(expr#0..7=[{inputs}], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableLimit(fetch=[1])
EnumerableSort(sort0=[$0], dir0=[DESC])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[IS NOT NULL($t0)], expr#4=[$cor0], expr#5=[$t4.DEPTNO], expr#6=[=($t5, $t0)], expr#7=[CAST($t0):INTEGER], expr#8=[10], expr#9=[=($t7, $t8)], expr#10=[AND($t6, $t9)], cs=[$t3], $condition=[$t10])
EnumerableTableScan(table=[[scott, DEPT]])
!plan

# Test filter null IN required is unknown correlated
Expand All @@ -2090,8 +2096,15 @@ select sal from "scott".emp e
(14 rows)

!ok
EnumerableCalc(expr#0..7=[{inputs}], SAL=[$t5])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableCalc(expr#0..2=[{inputs}], SAL=[$t0])
EnumerableCorrelate(correlation=[$cor0], joinType=[left], requiredColumns=[{1}])
EnumerableCalc(expr#0..7=[{inputs}], SAL=[$t5], DEPTNO=[$t7])
EnumerableTableScan(table=[[scott, EMP]])
EnumerableLimit(fetch=[1])
EnumerableSort(sort0=[$0], dir0=[DESC])
EnumerableAggregate(group=[{0}])
EnumerableCalc(expr#0..2=[{inputs}], expr#3=[true], expr#4=[$cor0], expr#5=[$t4.DEPTNO], expr#6=[=($t5, $t0)], cs=[$t3], $condition=[$t6])
EnumerableTableScan(table=[[scott, DEPT]])
!plan


Expand Down
Loading