Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve JSON_MATCH performance. #15049

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
*/
package org.apache.pinot.queries;

import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.io.FileUtils;
import org.apache.pinot.common.response.broker.BrokerResponseNative;
import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader;
Expand All @@ -34,11 +35,14 @@
import org.apache.pinot.segment.spi.ImmutableSegment;
import org.apache.pinot.segment.spi.IndexSegment;
import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.JsonIndexConfig;
import org.apache.pinot.spi.config.table.TableConfig;
import org.apache.pinot.spi.config.table.TableType;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.spi.data.readers.GenericRow;
import org.apache.pinot.spi.utils.JsonUtils;
import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
Expand All @@ -58,11 +62,11 @@ public class JsonMatchQueriesTest extends BaseQueriesTest {

private static final String ID_COLUMN = "id";
private static final String JSON_COLUMN = "json";
private static final Schema SCHEMA = new Schema.SchemaBuilder().addSingleValueDimension(ID_COLUMN, DataType.INT)
.addSingleValueDimension(JSON_COLUMN, DataType.JSON).build();
private static final TableConfig TABLE_CONFIG =
new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME).setJsonIndexColumns(List.of(JSON_COLUMN))
.build();

private static final Schema SCHEMA = new Schema.SchemaBuilder()
.addSingleValueDimension(ID_COLUMN, DataType.INT)
.addSingleValueDimension(JSON_COLUMN, DataType.JSON)
.build();

private IndexSegment _indexSegment;
private List<IndexSegment> _indexSegments;
Expand Down Expand Up @@ -110,7 +114,27 @@ public void setUp()
// Top-level object with multiple nested-array values
records.add(createRecord(13, "{\"key\": [1, [\"foo\", [true]]], \"key2\": [2, [\"bar\", false]]}"));

SegmentGeneratorConfig segmentGeneratorConfig = new SegmentGeneratorConfig(TABLE_CONFIG, SCHEMA);
// nested arrays used to test not in/not eq predicates
records.add(createRecord(15, "{\"cities\":[ \"New York\" ] }"));
records.add(createRecord(16, "{\"cities\":[ \"Washington\", \"New York\"] }"));
records.add(createRecord(17, "{\"cities\":[ \"New York\", \"Washington\"] }"));
records.add(createRecord(18, "{\"cities\":[ \"Washington\"] }"));
records.add(createRecord(19, "{\"cities\":[ \"San Francisco\"] }"));
records.add(createRecord(20, "{\"cities\":[ \"San Francisco\", \"Miami\", \"Washington\"] }"));
records.add(createRecord(21, "{\"cities\":[] }"));
records.add(createRecord(22, "{\"cities\":[\"\"] }"));
records.add(createRecord(23, "{\"cities\":[ \"Washington\", \"Washington\"] }"));

// regular field used to test not in/not eq predicates
records.add(createRecord(24, "{\"country\": \"USA\"}"));
records.add(createRecord(25, "{\"country\": \"Canada\"}"));
records.add(createRecord(26, "{\"country\": \"Mexico\"}"));
records.add(createRecord(27, "{\"country\":\"\"}"));
records.add(createRecord(28, "{\"country\":null}"));

TableConfig tableConfig = getTableConfig();

SegmentGeneratorConfig segmentGeneratorConfig = new SegmentGeneratorConfig(tableConfig, SCHEMA);
segmentGeneratorConfig.setTableName(RAW_TABLE_NAME);
segmentGeneratorConfig.setSegmentName(SEGMENT_NAME);
segmentGeneratorConfig.setOutDir(INDEX_DIR.getPath());
Expand All @@ -119,13 +143,34 @@ public void setUp()
driver.init(segmentGeneratorConfig, new GenericRowRecordReader(records));
driver.build();

IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig(TABLE_CONFIG, SCHEMA);
IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig(tableConfig, SCHEMA);
ImmutableSegment immutableSegment =
ImmutableSegmentLoader.load(new File(INDEX_DIR, SEGMENT_NAME), indexLoadingConfig);
_indexSegment = immutableSegment;
_indexSegments = Arrays.asList(immutableSegment, immutableSegment);
}

/**
 * Builds the OFFLINE table config used to generate and load the test segment.
 *
 * <p>The JSON column is stored RAW and gets a JSON index whose {@code disableCrossArrayUnnest} flag is taken from
 * {@link #isDisableCrossArrayUnnest()}, so subclasses can run the same tests against a differently configured index.
 *
 * @return table config with a single field config for the JSON column
 */
protected TableConfig getTableConfig() {
  JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
  jsonIndexConfig.setDisableCrossArrayUnnest(isDisableCrossArrayUnnest());

  // NOTE(review): the "json" key looks like the index type name rather than the column name (which happens to be
  // the same string) - confirm before replacing it with JSON_COLUMN.
  ObjectNode indexes = JsonUtils.newObjectNode();
  // ObjectNode.put(String, JsonNode) is deprecated in Jackson; set() is the supported replacement.
  indexes.set("json", jsonIndexConfig.toJsonNode());

  return new TableConfigBuilder(TableType.OFFLINE)
      .setTableName(RAW_TABLE_NAME)
      .addFieldConfig(
          new FieldConfig.Builder(JSON_COLUMN)
              .withEncodingType(FieldConfig.EncodingType.RAW)
              .withIndexes(indexes)
              .build())
      .build();
}

/**
 * Value passed to {@code JsonIndexConfig.setDisableCrossArrayUnnest} when the table config is built.
 * Subclasses may override this to run the same tests with cross-array un-nesting disabled.
 *
 * @return {@code false} in this base test
 */
protected boolean isDisableCrossArrayUnnest() {
return false; // default value
}

private GenericRow createRecord(int id, Object value) {
GenericRow record = new GenericRow();
record.putValue(ID_COLUMN, id);
Expand Down Expand Up @@ -173,9 +218,6 @@ public void testQueries() {
assertEquals(getSelectedIds("'\"$.key[1][*]\"=true'"), Set.of(12));
assertEquals(getSelectedIds("'\"$.key[1][1][0]\"=true'"), Set.of(13));

// Top-level object with multiple nested-array values
assertEquals(getSelectedIds("'\"$.key[*][*][*]\"=true AND \"$.key2[1][0]\"=''bar'''"), Set.of(13));

// Legacy query format
assertEquals(getSelectedIds("'key=1'"), Set.of(9));
assertEquals(getSelectedIds("'key=''foo'''"), Set.of(10));
Expand All @@ -188,11 +230,68 @@ public void testQueries() {
assertEquals(getSelectedIds("'\"key[1][1][0]\"=true'"), Set.of(13));
}

private Set<Integer> getSelectedIds(String jsonMatchExpression) {
/**
 * Filters that combine predicates on two different arrays ("key" and "key2") of the same document (id 13).
 */
@Test
public void testQueriesOnNestedArrays() {
// Top-level object with multiple nested-array values
assertEquals(getSelectedIds("'\"$.key[*][*][*]\"=true AND \"$.key2[1][0]\"=''bar'''"), Set.of(13));
// Searching across more than one nested array works when 'disableCrossArrayUnnest' is false (default)
assertEquals(getSelectedIds("'\"$.key[0]\"=1 AND \"$.key2[0]\"=2'"), Set.of(13));
}

/**
 * Exercises NOT_EQ, NOT IN, REGEXP_LIKE and range predicates inside JSON_MATCH against the "cities" array
 * documents (ids 15-23) and the "country" documents (ids 24-28) added in setUp.
 *
 * <p>Note: id 21 (empty "cities" array) and id 28 (null "country") are not expected in any result set below.
 */
@Test
public void testOtherQueries() {
// NOT_EQ on array: value that no document contains
assertEquals(getSelectedIds("'\"$.cities[0]\" != ''Seattle'' '"), Set.of(15, 16, 17, 18, 19, 20, 22, 23));
assertEquals(getSelectedIds("'\"$.cities[*]\" != ''Seattle'' '"), Set.of(15, 16, 17, 18, 19, 20, 22, 23));

// NOT_EQ on array: value that some documents contain, by fixed index and by wildcard
assertEquals(getSelectedIds("'\"$.cities[0]\" != ''Washington'' '"), Set.of(15, 17, 19, 20, 22));
assertEquals(getSelectedIds("'\"$.cities[1]\" != ''Washington'' '"), Set.of(16, 20));
assertEquals(getSelectedIds("'\"$.cities[*]\" != ''Washington'' '"), Set.of(15, 16, 17, 19, 20, 22));

// NOT_IN on array: values that no document contains
assertEquals(getSelectedIds("'\"$.cities[0]\" NOT IN (''Seattle'') '"), Set.of(15, 16, 17, 18, 19, 20, 22, 23));
assertEquals(getSelectedIds("'\"$.cities[*]\" NOT IN (''Seattle'') '"), Set.of(15, 16, 17, 18, 19, 20, 22, 23));
assertEquals(getSelectedIds("'\"$.cities[0]\" NOT IN (''Seattle'', ''Boston'') '"),
Set.of(15, 16, 17, 18, 19, 20, 22, 23));
assertEquals(getSelectedIds("'\"$.cities[*]\" NOT IN (''Seattle'', ''Boston'') '"),
Set.of(15, 16, 17, 18, 19, 20, 22, 23));

// NOT_IN on array: single value that some documents contain (expected to mirror the NOT_EQ results above)
assertEquals(getSelectedIds("'\"$.cities[0]\" NOT IN (''Washington'') '"), Set.of(15, 17, 19, 20, 22));
assertEquals(getSelectedIds("'\"$.cities[1]\" NOT IN (''Washington'') '"), Set.of(16, 20));
assertEquals(getSelectedIds("'\"$.cities[*]\" NOT IN (''Washington'') '"), Set.of(15, 16, 17, 19, 20, 22));

// NOT_IN on array: multiple values that some documents contain
assertEquals(getSelectedIds("'\"$.cities[0]\" NOT IN (''Washington'', ''New York'') '"), Set.of(19, 20, 22));
assertEquals(getSelectedIds("'\"$.cities[1]\" NOT IN (''Washington'', ''New York'') '"), Set.of(20));
assertEquals(getSelectedIds("'\"$.cities[*]\" NOT IN (''Washington'', ''New York'') '"), Set.of(19, 20, 22));

// NOT_EQ on field
assertEquals(getSelectedIds("'\"$.country\" != ''USA'' '"), Set.of(25, 26, 27));
assertEquals(getSelectedIds("'\"$.country\" != ''Canada'' '"), Set.of(24, 26, 27));
// Comparing against the empty string ('"$.country" != '''') throws an error for a reason not yet investigated,
// so a single-space literal is used instead.
assertEquals(getSelectedIds("'\"$.country\" != '' '' '"), Set.of(24, 25, 26, 27));
assertEquals(getSelectedIds("'\"$.country\" != ''Brazil'' '"), Set.of(24, 25, 26, 27));

// NOT IN on field
assertEquals(getSelectedIds("'\"$.country\" NOT IN (''USA'') '"), Set.of(25, 26, 27));
assertEquals(getSelectedIds("'\"$.country\" NOT IN (''Canada'') '"), Set.of(24, 26, 27));
assertEquals(getSelectedIds("'\"$.country\" NOT IN (''USA'', ''Canada'') '"), Set.of(26, 27));
// '"$.country" NOT IN ('''') also throws an error for a reason not yet investigated; a single-space literal is
// used instead.
assertEquals(getSelectedIds("'\"$.country\" NOT IN ('' '') '"), Set.of(24, 25, 26, 27));
assertEquals(getSelectedIds("'\"$.country\" NOT IN (''Brazil'', ''Panama'') '"), Set.of(24, 25, 26, 27));

// REGEXP_LIKE on field
assertEquals(getSelectedIds("'REGEXP_LIKE(\"$.country\" , ''Brazil|Panama'') '"), Set.of());
assertEquals(getSelectedIds("'REGEXP_LIKE(\"$.country\" , ''USA|Canada'') '"), Set.of(24, 25));
assertEquals(getSelectedIds("'REGEXP_LIKE(\"$.country\" , ''[MC][ea].*'') '"), Set.of(25, 26));
assertEquals(getSelectedIds("'REGEXP_LIKE(\"$.country\" , ''US.*'') '"), Set.of(24));

// Range predicate on field (presumably a lexicographic string comparison: "USA" is excluded, "" is included)
assertEquals(getSelectedIds("'\"$.country\" < ''Romania'' '"), Set.of(25, 26, 27));
}

protected Set<Integer> getSelectedIds(String jsonMatchExpression) {
String query = String.format("SELECT id FROM testTable WHERE JSON_MATCH(json, %s) LIMIT 100", jsonMatchExpression);
BrokerResponseNative brokerResponse = getBrokerResponse(query);
List<Object[]> rows = brokerResponse.getResultTable().getRows();
Set<Integer> selectedIds = new HashSet<>();
Set<Integer> selectedIds = new TreeSet<>();
for (Object[] row : rows) {
selectedIds.add((Integer) row[0]);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.queries;

import java.util.Set;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;


/**
 * Same as {@link JsonMatchQueriesTest}, but with the JSON index built with {@code disableCrossArrayUnnest = true}.
 */
public class JsonMatchQueriesWithDisableUnnestTest extends JsonMatchQueriesTest {

  @Override
  protected boolean isDisableCrossArrayUnnest() {
    return true;
  }

  /**
   * Replaces the base-class expectations: with cross-array un-nesting disabled, filters that combine predicates on
   * two different arrays of the same document are expected to match nothing.
   */
  @Test
  @Override
  public void testQueriesOnNestedArrays() {
    // Top-level object with multiple nested-array values.
    // Searching across more than one nested array only works when 'disableCrossArrayUnnest' is false (default);
    // it is true here, so both queries are expected to return no rows.
    assertEquals(getSelectedIds("'\"$.key[*][*][*]\"=true AND \"$.key2[1][0]\"=''bar'''"), Set.of());
    assertEquals(getSelectedIds("'\"$.key[0]\"=1 AND \"$.key2[0]\"=2'"), Set.of());
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -121,15 +121,15 @@ public static void main(String[] args)
+ " from MyTable \n"
+ " limit 100000\n"
+ ") \n"
+ "where regexp_like_const('.*a.*', RAW_STRING_COL )";
+ "where regexp_like('.*a.*', RAW_STRING_COL )";

public static final String REGEXP_LIKE_VAR_QUERY = "select * from \n"
+ "(\n"
+ " select RAW_STRING_COL\n"
+ " from MyTable \n"
+ " limit 100000\n"
+ ") \n"
+ "where regexp_like('.*a.*', RAW_STRING_COL )";
+ "where regexp_like_var('.*a.*', RAW_STRING_COL )";

private Distribution.DataSupplier _supplier;

Expand Down Expand Up @@ -199,7 +199,7 @@ public JsonNode query()

private void buildSegment(String segmentName)
throws Exception {
LazyDataGenerator rows = BenchmarkQueries.createTestData(_numRows, _supplier);
LazyDataGenerator rows = BenchmarkQueriesSSQE.createTestData(_numRows, _supplier);
SegmentGeneratorConfig config = new SegmentGeneratorConfig(TABLE_CONFIG, SCHEMA);
config.setOutDir(_segmentDir.getPath());
config.setTableName(TABLE_NAME);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@State(Scope.Benchmark)
public class BenchmarkQueries extends BaseQueriesTest {
public class BenchmarkQueriesSSQE extends BaseQueriesTest {

public static void main(String[] args)
throws Exception {
ChainedOptionsBuilder opt = new OptionsBuilder().include(BenchmarkQueries.class.getSimpleName());
ChainedOptionsBuilder opt = new OptionsBuilder().include(BenchmarkQueriesSSQE.class.getSimpleName());
new Runner(opt.build()).run();
}

Expand All @@ -88,6 +88,7 @@ public static void main(String[] args)
private static final String NO_INDEX_STRING_COL = "NO_INDEX_STRING_COL";
private static final String LOW_CARDINALITY_STRING_COL = "LOW_CARDINALITY_STRING_COL";
private static final String TIMESTAMP_COL = "TSTMP_COL";
private static final String JSON_COL = "JSON_COL";
private static final List<FieldConfig> FIELD_CONFIGS = new ArrayList<>();

private static final TableConfig TABLE_CONFIG = new TableConfigBuilder(TableType.OFFLINE)
Expand All @@ -97,6 +98,7 @@ public static void main(String[] args)
.setNoDictionaryColumns(List.of(RAW_INT_COL_NAME, RAW_STRING_COL_NAME, TIMESTAMP_COL))
.setSortedColumn(SORTED_COL_NAME)
.setRangeIndexColumns(List.of(INT_COL_NAME, LOW_CARDINALITY_STRING_COL))
.setJsonIndexColumns(List.of(JSON_COL))
.setStarTreeIndexConfigs(
Collections.singletonList(
new StarTreeIndexConfig(List.of(SORTED_COL_NAME, INT_COL_NAME), null,
Expand All @@ -114,6 +116,7 @@ public static void main(String[] args)
.addSingleValueDimension(NO_INDEX_STRING_COL, FieldSpec.DataType.STRING)
.addSingleValueDimension(LOW_CARDINALITY_STRING_COL, FieldSpec.DataType.STRING)
.addSingleValueDimension(TIMESTAMP_COL, FieldSpec.DataType.TIMESTAMP)
.addSingleValueDimension(JSON_COL, FieldSpec.DataType.JSON)
.build();

public static final String FILTERED_QUERY = "SELECT SUM(INT_COL) FILTER(WHERE INT_COL > 123 AND INT_COL < 599999),"
Expand Down Expand Up @@ -200,6 +203,19 @@ public static void main(String[] args)
+ " group by 1 "
+ " limit 1000000\n";

// Benchmark query: a grouped aggregation filtered by a single JSON_MATCH on JSON_COL that combines OR/AND with
// '=', '!=' and IS NOT NULL predicates on a top-level key ("type") and on indexed array elements ("changes[N]").
public static final String JSON_MATCH_QUERY =
"SELECT\n"
+ " COUNT(*) AS count,\n"
+ " SUM(INT_COL) AS size,\n"
+ " LOW_CARDINALITY_STRING_COL as type\n"
+ "FROM MyTable\n"
+ "WHERE JSON_MATCH(\n"
+ " JSON_COL,\t\n"
+ " '(\"$.type\" = ''type0'' OR (\"$.type\" = ''type1'' AND (\"$.changes[0].author.name\" != ''author10''"
+ " OR \"$.changes[1].author.name\" IS NOT NULL)))'\n"
+ " )\n"
+ "GROUP BY LOW_CARDINALITY_STRING_COL";

@Param({"1", "2", "10", "50"})
private int _numSegments;
@Param("1500000")
Expand All @@ -212,7 +228,7 @@ public static void main(String[] args)
RAW_COLUMN_SUMMARY_STATS, COUNT_OVER_BITMAP_INDEX_IN, COUNT_OVER_BITMAP_INDEXES,
COUNT_OVER_BITMAP_AND_SORTED_INDEXES, COUNT_OVER_BITMAP_INDEX_EQUALS, STARTREE_SUM_QUERY, STARTREE_FILTER_QUERY,
FILTERING_BITMAP_SCAN_QUERY, FILTERING_SCAN_QUERY, FILTERING_ON_TIMESTAMP_WORKAROUND_QUERY,
FILTERING_ON_TIMESTAMP_QUERY, REGEXP_REPLACE_QUERY
FILTERING_ON_TIMESTAMP_QUERY, REGEXP_REPLACE_QUERY, JSON_MATCH_QUERY
})
String _query;
private IndexSegment _indexSegment;
Expand Down Expand Up @@ -253,6 +269,7 @@ static LazyDataGenerator createTestData(int numRows, Distribution.DataSupplier s
private final String[] _lowCardinalityValues =
IntStream.range(0, 10).mapToObj(i -> "value" + i).toArray(String[]::new);
private Distribution.DataSupplier _supplier = supplier;
private String[] _jsons = generateJsons();

@Override
public int size() {
Expand All @@ -270,6 +287,7 @@ public GenericRow next(GenericRow row, int i) {
row.putValue(NO_INDEX_STRING_COL, row.getValue(RAW_STRING_COL_NAME));
row.putValue(LOW_CARDINALITY_STRING_COL, _lowCardinalityValues[i % _lowCardinalityValues.length]);
row.putValue(TIMESTAMP_COL, i * 1200 * 1000L);
row.putValue(JSON_COL, _jsons[i % _jsons.length]);

return null;
}
Expand All @@ -279,6 +297,25 @@ public void rewind() {
_strings.clear();
_supplier.reset();
}

/**
 * Builds a fixed pool of 1000 JSON documents.
 *
 * <p>Each document has a "type" key (50 distinct values) and a "changes" array holding one author entry; documents
 * at even positions get a second author entry.
 *
 * @return the generated documents, indexed by position
 */
private String[] generateJsons() {
  final int poolSize = 1000;
  String[] pool = new String[poolSize];

  for (int idx = 0; idx < poolSize; idx++) {
    String doc = "{ \"type\": \"type" + (idx % 50) + "\""
        + ", \"changes\": [ "
        + "{ \"author\": { \"name\": \"author" + (idx % 1000) + "\" } }";

    // Only every other document carries a second element in the "changes" array.
    boolean hasSecondChange = idx % 2 == 0;
    if (hasSecondChange) {
      doc += ", { \"author\": { \"name\": \"author" + (idx % 100) + "\" } }";
    }

    pool[idx] = doc + " ] }";
  }

  return pool;
}
};
}

Expand Down
Loading
Loading