diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index cd39edc32279..761fafeba09a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -52,6 +52,9 @@ API Changes * GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir) +* GITHUB#14236: CombinedFieldQuery moved from lucene-sandbox to lucene-core. + (Adrien Grand) + New Features --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/CombinedFieldQuery.java b/lucene/core/src/java/org/apache/lucene/search/CombinedFieldQuery.java new file mode 100644 index 000000000000..42659e3b37e9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/CombinedFieldQuery.java @@ -0,0 +1,462 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.TreeMap; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermStates; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarities.DFRSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOSupplier; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.SmallFloat; + +/** + * A {@link Query} that treats multiple fields as a single stream and scores terms as if they had + * been indexed in a single field whose values would be the union of the values of the provided + * fields. + * + *

<p>The query works as follows: + * + * <ol> + *   <li>Given a list of fields and weights, it pretends there is a synthetic combined field where + *       all terms have been indexed. It computes new term and collection statistics for this + *       combined field (see the sketch after this list). + *   <li>It uses a disjunction iterator and {@link IndexSearcher#getSimilarity} to score documents. + * </ol> + *
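As an illustrative sketch of step 1 (helper and variable names hypothetical, condensed from the CombinedFieldWeight constructor below), the pseudo term statistics of the synthetic field are derived from the per-field statistics like so:

    // Document frequency takes the maximum across fields, while total term
    // frequency is the weighted sum: a field's content counts weight-many
    // times in the synthetic combined field.
    long docFreq = 0;
    long totalTermFreq = 0;
    for (FieldAndWeight field : fieldAndWeights.values()) {
      TermStatistics stats = perFieldStats.get(field.field()); // assumed precomputed
      if (stats != null) {
        docFreq = Math.max(docFreq, stats.docFreq());
        totalTermFreq += (long) (field.weight() * stats.totalTermFreq());
      }
    }
    TermStatistics pseudoStats =
        new TermStatistics(new BytesRef("pseudo_term"), docFreq, Math.max(1, totalTermFreq));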

<p>In order for a similarity to be compatible, {@link Similarity#computeNorm} must be additive: + * the norm of the combined field is the sum of norms for each individual field. The norms must also + * be encoded using {@link SmallFloat#intToByte4}. These requirements hold for all similarities that + * don't customize {@link Similarity#computeNorm}, which includes {@link BM25Similarity} and {@link + * DFRSimilarity}. Per-field similarities are not supported. + * + *
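A minimal sketch of what this additivity means in practice (variable names hypothetical; the logic mirrors MultiFieldNormValues further down):

    // Decode each field's norm with the shared byte4 encoding, sum the decoded
    // lengths weighted by field weight, and re-encode the total so the
    // similarity sees a single norm for the synthetic combined field.
    float combinedLength = 0;
    for (int i = 0; i < fieldNorms.length; i++) {
      combinedLength += weights[i] * SmallFloat.byte4ToInt((byte) fieldNorms[i]);
    }
    long combinedNorm = SmallFloat.intToByte4(Math.round(combinedLength));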

<p>The query also requires that either all fields or no fields have norms enabled. Having only + * some fields with norms enabled can result in errors. + * + *
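For example, mirroring testNormsDisabled in the new test file below, mixing a norms-free StringField ("b") with a norms-bearing TextField ("c") fails at search time:

    // Searching this query throws IllegalArgumentException with a message
    // containing "requires norms to be consistent across fields".
    CombinedFieldQuery invalid =
        new CombinedFieldQuery.Builder("value").addField("b", 1.0f).addField("c", 1.0f).build();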

<p>This query assumes that all fields share the same analyzer. Scores may not make much sense if + * the fields use different analyzers. + * + *

The scoring is based on BM25F's simple formula described in: + * http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the + * same approach but allows other similarities besides {@link + * org.apache.lucene.search.similarities.BM25Similarity}. + * + * @lucene.experimental + */ +public final class CombinedFieldQuery extends Query implements Accountable { + private static final long BASE_RAM_BYTES = + RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); + + /** A builder for {@link CombinedFieldQuery}. */ + public static class Builder { + private final Map fieldAndWeights = new HashMap<>(); + private final BytesRef term; + + /** Create a builder for the given term {@link String}. */ + public Builder(String term) { + this.term = new BytesRef(term); + } + + /** Create a builder for the given term bytes. */ + public Builder(BytesRef term) { + this.term = BytesRef.deepCopyOf(term); + } + + /** + * Adds a field to this builder. + * + * @param field The field name. + */ + public Builder addField(String field) { + return addField(field, 1f); + } + + /** + * Adds a field to this builder. + * + * @param field The field name. + * @param weight The weight associated to this field. + */ + public Builder addField(String field, float weight) { + if (weight < 1) { + throw new IllegalArgumentException("weight must be greater or equal to 1"); + } + fieldAndWeights.put(field, new FieldAndWeight(field, weight)); + return this; + } + + /** Builds the {@link CombinedFieldQuery}. */ + public CombinedFieldQuery build() { + if (fieldAndWeights.size() > IndexSearcher.getMaxClauseCount()) { + throw new IndexSearcher.TooManyClauses(); + } + return new CombinedFieldQuery(new TreeMap<>(fieldAndWeights), term); + } + } + + record FieldAndWeight(String field, float weight) {} + + // sorted map for fields. 
+ private final TreeMap fieldAndWeights; + // term bytes + private final BytesRef term; + // array of terms per field, sorted by field + private final Term[] fieldTerms; + + private final long ramBytesUsed; + + private CombinedFieldQuery(TreeMap fieldAndWeights, BytesRef term) { + this.fieldAndWeights = fieldAndWeights; + this.term = Objects.requireNonNull(term); + if (fieldAndWeights.size() > IndexSearcher.getMaxClauseCount()) { + throw new IndexSearcher.TooManyClauses(); + } + this.fieldTerms = new Term[fieldAndWeights.size()]; + int pos = 0; + for (String field : fieldAndWeights.keySet()) { + fieldTerms[pos++] = new Term(field, term); + } + + this.ramBytesUsed = + BASE_RAM_BYTES + + RamUsageEstimator.sizeOfObject(fieldAndWeights) + + RamUsageEstimator.sizeOfObject(fieldTerms) + + RamUsageEstimator.sizeOfObject(term); + } + + @Override + public String toString(String field) { + StringBuilder builder = new StringBuilder("CombinedFieldQuery(("); + int pos = 0; + for (FieldAndWeight fieldWeight : fieldAndWeights.values()) { + if (pos++ != 0) { + builder.append(" "); + } + builder.append(fieldWeight.field); + if (fieldWeight.weight != 1f) { + builder.append("^"); + builder.append(fieldWeight.weight); + } + } + builder.append(")("); + builder.append(Term.toString(term)); + builder.append("))"); + return builder.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (sameClassAs(o) == false) return false; + CombinedFieldQuery that = (CombinedFieldQuery) o; + return Objects.equals(fieldAndWeights, that.fieldAndWeights) && term.equals(that.term); + } + + @Override + public int hashCode() { + int result = classHash(); + result = 31 * result + Objects.hash(fieldAndWeights); + result = 31 * result + term.hashCode(); + return result; + } + + @Override + public long ramBytesUsed() { + return ramBytesUsed; + } + + @Override + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (fieldAndWeights.isEmpty()) { + return new BooleanQuery.Builder().build(); + } + return this; + } + + @Override + public void visit(QueryVisitor visitor) { + Term[] selectedTerms = + Arrays.stream(fieldTerms).filter(t -> visitor.acceptField(t.field())).toArray(Term[]::new); + if (selectedTerms.length > 0) { + QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.SHOULD, this); + v.consumeTerms(this, selectedTerms); + } + } + + private BooleanQuery rewriteToBoolean() { + // rewrite to a simple disjunction if the score is not needed. + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + for (Term term : fieldTerms) { + bq.add(new TermQuery(term), BooleanClause.Occur.SHOULD); + } + return bq.build(); + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) + throws IOException { + validateConsistentNorms(searcher.getIndexReader()); + if (scoreMode.needsScores()) { + return new CombinedFieldWeight(this, searcher, scoreMode, boost); + } else { + // rewrite to a simple disjunction if the score is not needed. 
+ Query bq = rewriteToBoolean(); + return searcher.rewrite(bq).createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost); + } + } + + private void validateConsistentNorms(IndexReader reader) { + boolean allFieldsHaveNorms = true; + boolean noFieldsHaveNorms = true; + + for (LeafReaderContext context : reader.leaves()) { + FieldInfos fieldInfos = context.reader().getFieldInfos(); + for (String field : fieldAndWeights.keySet()) { + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + if (fieldInfo != null) { + allFieldsHaveNorms &= fieldInfo.hasNorms(); + noFieldsHaveNorms &= fieldInfo.omitsNorms(); + } + } + } + + if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) { + throw new IllegalArgumentException( + getClass().getSimpleName() + + " requires norms to be consistent across fields: some fields cannot " + + " have norms enabled, while others have norms disabled"); + } + } + + class CombinedFieldWeight extends Weight { + private final IndexSearcher searcher; + private final TermStates[] termStates; + private final Similarity.SimScorer simWeight; + + CombinedFieldWeight(Query query, IndexSearcher searcher, ScoreMode scoreMode, float boost) + throws IOException { + super(query); + assert scoreMode.needsScores(); + this.searcher = searcher; + long docFreq = 0; + long totalTermFreq = 0; + termStates = new TermStates[fieldTerms.length]; + for (int i = 0; i < termStates.length; i++) { + FieldAndWeight field = fieldAndWeights.get(fieldTerms[i].field()); + TermStates ts = TermStates.build(searcher, fieldTerms[i], true); + termStates[i] = ts; + if (ts.docFreq() > 0) { + TermStatistics termStats = + searcher.termStatistics(fieldTerms[i], ts.docFreq(), ts.totalTermFreq()); + docFreq = Math.max(termStats.docFreq(), docFreq); + totalTermFreq += (double) field.weight * termStats.totalTermFreq(); + } + } + if (docFreq > 0) { + CollectionStatistics pseudoCollectionStats = mergeCollectionStatistics(searcher); + TermStatistics pseudoTermStatistics = + new TermStatistics(new BytesRef("pseudo_term"), docFreq, Math.max(1, totalTermFreq)); + this.simWeight = + searcher.getSimilarity().scorer(boost, pseudoCollectionStats, pseudoTermStatistics); + } else { + this.simWeight = null; + } + } + + private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher) + throws IOException { + long maxDoc = 0; + long docCount = 0; + long sumTotalTermFreq = 0; + long sumDocFreq = 0; + for (FieldAndWeight fieldWeight : fieldAndWeights.values()) { + CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field); + if (collectionStats != null) { + maxDoc = Math.max(collectionStats.maxDoc(), maxDoc); + docCount = Math.max(collectionStats.docCount(), docCount); + sumDocFreq = Math.max(collectionStats.sumDocFreq(), sumDocFreq); + sumTotalTermFreq += (double) fieldWeight.weight * collectionStats.sumTotalTermFreq(); + } + } + + return new CollectionStatistics( + "pseudo_field", maxDoc, docCount, sumTotalTermFreq, sumDocFreq); + } + + @Override + public Matches matches(LeafReaderContext context, int doc) throws IOException { + Weight weight = + searcher.rewrite(rewriteToBoolean()).createWeight(searcher, ScoreMode.COMPLETE, 1f); + return weight.matches(context, doc); + } + + @Override + public Explanation explain(LeafReaderContext context, int doc) throws IOException { + Scorer scorer = scorer(context); + if (scorer != null) { + int newDoc = scorer.iterator().advance(doc); + if (newDoc == doc) { + assert scorer instanceof CombinedFieldScorer; + float freq = ((CombinedFieldScorer) 
scorer).freq(); + MultiNormsLeafSimScorer docScorer = + new MultiNormsLeafSimScorer( + simWeight, context.reader(), fieldAndWeights.values(), true); + Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq); + Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); + return Explanation.match( + scoreExplanation.getValue(), + "weight(" + getQuery() + " in " + doc + "), result of:", + scoreExplanation); + } + } + return Explanation.noMatch("no matching term"); + } + + @Override + public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + List iterators = new ArrayList<>(); + List fields = new ArrayList<>(); + long cost = 0; + for (int i = 0; i < fieldTerms.length; i++) { + IOSupplier supplier = termStates[i].get(context); + TermState state = supplier == null ? null : supplier.get(); + if (state != null) { + TermsEnum termsEnum = context.reader().terms(fieldTerms[i].field()).iterator(); + termsEnum.seekExact(fieldTerms[i].bytes(), state); + PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); + iterators.add(postingsEnum); + fields.add(fieldAndWeights.get(fieldTerms[i].field())); + cost += postingsEnum.cost(); + } + } + + if (iterators.isEmpty()) { + return null; + } + + MultiNormsLeafSimScorer scoringSimScorer = + new MultiNormsLeafSimScorer(simWeight, context.reader(), fieldAndWeights.values(), true); + + final long finalCost = cost; + return new ScorerSupplier() { + + @Override + public Scorer get(long leadCost) throws IOException { + // we use termscorers + disjunction as an impl detail + List wrappers = new ArrayList<>(iterators.size()); + for (int i = 0; i < iterators.size(); i++) { + float weight = fields.get(i).weight; + wrappers.add( + new WeightedDisiWrapper(new TermScorer(iterators.get(i), simWeight, null), weight)); + } + // Even though it is called approximation, it is accurate since none of + // the sub iterators are two-phase iterators. 
+ DisjunctionDISIApproximation iterator = + new DisjunctionDISIApproximation(wrappers, leadCost); + return new CombinedFieldScorer(iterator, scoringSimScorer); + } + + @Override + public long cost() { + return finalCost; + } + }; + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return false; + } + } + + private static class WeightedDisiWrapper extends DisiWrapper { + final PostingsEnum postingsEnum; + final float weight; + + WeightedDisiWrapper(Scorer scorer, float weight) { + super(scorer, false); + this.weight = weight; + this.postingsEnum = (PostingsEnum) scorer.iterator(); + } + + float freq() throws IOException { + return weight * postingsEnum.freq(); + } + } + + private static class CombinedFieldScorer extends Scorer { + private final DisjunctionDISIApproximation iterator; + private final MultiNormsLeafSimScorer simScorer; + private final float maxScore; + + CombinedFieldScorer(DisjunctionDISIApproximation iterator, MultiNormsLeafSimScorer simScorer) { + this.iterator = iterator; + this.simScorer = simScorer; + this.maxScore = simScorer.getSimScorer().score(Float.POSITIVE_INFINITY, 1L); + } + + @Override + public int docID() { + return iterator.docID(); + } + + float freq() throws IOException { + DisiWrapper w = iterator.topList(); + float freq = ((WeightedDisiWrapper) w).freq(); + for (w = w.next; w != null; w = w.next) { + freq += ((WeightedDisiWrapper) w).freq(); + if (freq < 0) { // overflow + return Integer.MAX_VALUE; + } + } + return freq; + } + + @Override + public float score() throws IOException { + return simScorer.score(iterator.docID(), freq()); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; + } + + @Override + public float getMaxScore(int upTo) throws IOException { + return maxScore; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java b/lucene/core/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java new file mode 100644 index 000000000000..c6720b362d9c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.search.CombinedFieldQuery.FieldAndWeight; +import org.apache.lucene.search.similarities.Similarity.SimScorer; +import org.apache.lucene.util.SmallFloat; + +/** + * Scorer that sums document's norms from multiple fields. + * + *

For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also + * requires that either all fields or no fields have norms enabled. Having only some fields with + * norms enabled can result in errors or undefined behavior. + */ +final class MultiNormsLeafSimScorer { + /** Cache of decoded norms. */ + private static final float[] LENGTH_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); + } + } + + private final SimScorer scorer; + private final NumericDocValues norms; + + /** Sole constructor: Score documents of {@code reader} with {@code scorer}. */ + MultiNormsLeafSimScorer( + SimScorer scorer, + LeafReader reader, + Collection normFields, + boolean needsScores) + throws IOException { + this.scorer = Objects.requireNonNull(scorer); + if (needsScores) { + final List normsList = new ArrayList<>(); + final List weightList = new ArrayList<>(); + final Set duplicateCheckingSet = new HashSet<>(); + for (FieldAndWeight field : normFields) { + assert duplicateCheckingSet.add(field.field()) + : "There is a duplicated field [" + + field.field() + + "] used to construct MultiNormsLeafSimScorer"; + + NumericDocValues norms = reader.getNormValues(field.field()); + if (norms != null) { + normsList.add(norms); + weightList.add(field.weight()); + } + } + + if (normsList.isEmpty()) { + norms = null; + } else { + final NumericDocValues[] normsArr = normsList.toArray(new NumericDocValues[0]); + final float[] weightArr = new float[normsList.size()]; + for (int i = 0; i < weightList.size(); i++) { + weightArr[i] = weightList.get(i); + } + norms = new MultiFieldNormValues(normsArr, weightArr); + } + } else { + norms = null; + } + } + + SimScorer getSimScorer() { + return scorer; + } + + private long getNormValue(int doc) throws IOException { + if (norms != null) { + boolean found = norms.advanceExact(doc); + assert found; + return norms.longValue(); + } else { + return 1L; // default norm + } + } + + /** + * Score the provided document assuming the given term document frequency. This method must be + * called on non-decreasing sequences of doc ids. + * + * @see SimScorer#score(float, long) + */ + public float score(int doc, float freq) throws IOException { + return scorer.score(freq, getNormValue(doc)); + } + + /** + * Explain the score for the provided document assuming the given term document frequency. This + * method must be called on non-decreasing sequences of doc ids. 
+ * + * @see SimScorer#explain(Explanation, long) + */ + public Explanation explain(int doc, Explanation freqExpl) throws IOException { + return scorer.explain(freqExpl, getNormValue(doc)); + } + + private static class MultiFieldNormValues extends NumericDocValues { + private final NumericDocValues[] normsArr; + private final float[] weightArr; + private long current; + private int docID = -1; + + MultiFieldNormValues(NumericDocValues[] normsArr, float[] weightArr) { + this.normsArr = normsArr; + this.weightArr = weightArr; + } + + @Override + public long longValue() { + return current; + } + + @Override + public boolean advanceExact(int target) throws IOException { + float normValue = 0; + boolean found = false; + for (int i = 0; i < normsArr.length; i++) { + if (normsArr[i].advanceExact(target)) { + normValue += + weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())]; + found = true; + } + } + current = SmallFloat.intToByte4(Math.round(normValue)); + return found; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/package-info.java b/lucene/core/src/java/org/apache/lucene/search/package-info.java index 98b60d94222c..664e8e616938 100644 --- a/lucene/core/src/java/org/apache/lucene/search/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/search/package-info.java @@ -278,6 +278,30 @@ *

<p>See the {@link org.apache.lucene.search.similarities} package documentation for information on * the built-in available scoring models and extending or changing Similarity. * + *

<h2>Scoring multiple fields</h2>

+ * + *

<p>In the real world, documents often have multiple fields with different degrees of relevance. A + * robust way of scoring across multiple fields is called BM25F, which is implemented via {@link + * org.apache.lucene.search.CombinedFieldQuery}. It scores documents with multiple fields as if + * their content had been indexed in a single combined field. It supports configuring per-field + * boosts where the value of the boost is interpreted as the number of times that the content of the + * field exists in the virtual combined field. + * + *
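For instance, with a boost of 10 on "title" and 1 on "body", a term occurring once in the title and three times in the body is scored as if it occurred 10 * 1 + 1 * 3 = 13 times in the single combined field.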

<p>Here is an example that constructs a query on "apache OR lucene" on fields "title" with a + * boost of 10, and "body" with a boost of 1: + * + *
+ * <pre class="prettyprint">
+ * BooleanQuery.Builder builder = new BooleanQuery.Builder();
+ * for (String term : new String[] { "apache", "lucene" }) {
+ *   Query query = new CombinedFieldQuery.Builder(term)
+ *         .addField("title", 10f)
+ *         .addField("body", 1f)
+ *         .build();
+ *   builder.add(query, Occur.SHOULD);
+ * }
+ * Query query = builder.build();
+ * </pre>
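Note that CombinedFieldQuery combines fields for a single term at a time; the enclosing BooleanQuery with Occur.SHOULD clauses is what provides the "apache OR lucene" disjunction across terms.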
+ * *

<h2>Integrating field values into the score</h2>

* *

While similarities help score a document relatively to a query, it is also common for diff --git a/lucene/core/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java new file mode 100644 index 000000000000..ed0c051d0bad --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java @@ -0,0 +1,563 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; + +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarities.BooleanSimilarity; +import org.apache.lucene.search.similarities.ClassicSimilarity; +import org.apache.lucene.search.similarities.LMDirichletSimilarity; +import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.search.CheckHits; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestCombinedFieldQuery extends LuceneTestCase { + public void testInvalid() { + CombinedFieldQuery.Builder builder = new CombinedFieldQuery.Builder("foo"); + IllegalArgumentException exc = + expectThrows(IllegalArgumentException.class, () -> builder.addField("foo", 0.5f)); + assertEquals(exc.getMessage(), "weight must be greater or equal to 1"); + } + + public void testRewrite() throws IOException { + CombinedFieldQuery.Builder builder = new CombinedFieldQuery.Builder("foo"); + IndexReader reader = new MultiReader(); + IndexSearcher searcher = new IndexSearcher(reader); + Query actual = searcher.rewrite(builder.build()); + assertEquals(new MatchNoDocsQuery(), actual); + builder.addField("field", 1f); + Query query = builder.build(); + actual = searcher.rewrite(builder.build()); + assertEquals(query, actual); + } + + public void testEqualsAndHashCode() { + CombinedFieldQuery query1 = + new 
CombinedFieldQuery.Builder("value").addField("field1").addField("field2").build(); + + CombinedFieldQuery query2 = + new CombinedFieldQuery.Builder("value").addField("field1").addField("field2", 1.3f).build(); + assertNotEquals(query1, query2); + assertNotEquals(query1.hashCode(), query2.hashCode()); + + CombinedFieldQuery query3 = + new CombinedFieldQuery.Builder("value").addField("field3").addField("field4").build(); + assertNotEquals(query1, query3); + assertNotEquals(query1.hashCode(), query2.hashCode()); + + CombinedFieldQuery duplicateQuery1 = + new CombinedFieldQuery.Builder("value").addField("field1").addField("field2").build(); + assertEquals(query1, duplicateQuery1); + assertEquals(query1.hashCode(), duplicateQuery1.hashCode()); + } + + public void testToString() { + CombinedFieldQuery.Builder builder = new CombinedFieldQuery.Builder("bar"); + assertEquals("CombinedFieldQuery(()(bar))", builder.build().toString()); + builder.addField("foo", 1f); + assertEquals("CombinedFieldQuery((foo)(bar))", builder.build().toString()); + builder.addField("title", 3f); + assertEquals("CombinedFieldQuery((foo title^3.0)(bar))", builder.build().toString()); + } + + public void testSameScore() throws IOException { + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + Document doc = new Document(); + doc.add(new StringField("f", "a", Store.NO)); + w.addDocument(doc); + + doc = new Document(); + doc.add(new StringField("g", "a", Store.NO)); + for (int i = 0; i < 10; ++i) { + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder("a").addField("f", 1f).addField("g", 1f).build(); + TopScoreDocCollectorManager collectorManager = + new TopScoreDocCollectorManager( + Math.min(reader.numDocs(), Integer.MAX_VALUE), Integer.MAX_VALUE); + TopDocs topDocs = searcher.search(query, collectorManager); + assertEquals(new TotalHits(11, TotalHits.Relation.EQUAL_TO), topDocs.totalHits); + // All docs must have the same score + for (int i = 0; i < topDocs.scoreDocs.length; ++i) { + assertEquals(topDocs.scoreDocs[0].score, topDocs.scoreDocs[i].score, 0.0f); + } + + reader.close(); + w.close(); + dir.close(); + } + + public void testScoringWithMultipleFieldTermsMatch() throws IOException { + int numMatchDoc = randomIntBetween(100, 500); + int numHits = randomIntBetween(1, 100); + int boost1 = Math.max(1, random().nextInt(5)); + int boost2 = Math.max(1, random().nextInt(5)); + + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + // adding potentially matching doc + for (int i = 0; i < numMatchDoc; i++) { + Document doc = new Document(); + + int freqA = random().nextInt(20) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + + freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo" + j, Store.NO)); + } + } + + freqA = random().nextInt(20) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "zoo", Store.NO)); + } + + int freqB = random().nextInt(20) + 1; + 
for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo", Store.NO)); + } + + freqB = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo" + j, Store.NO)); + } + } + + int freqC = random().nextInt(20) + 1; + for (int j = 0; j < freqC; j++) { + doc.add(new TextField("c", "bla" + j, Store.NO)); + } + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); + + CombinedFieldQuery query = + new CombinedFieldQuery.Builder("foo") + .addField("a", (float) boost1) + .addField("b", (float) boost2) + .build(); + + CollectorManager completeManager = + new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE); + + searcher.search(query, completeManager); + + reader.close(); + w.close(); + dir.close(); + } + + public void testNormsDisabled() throws IOException { + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + Document doc = new Document(); + doc.add(new StringField("a", "value", Store.NO)); + doc.add(new StringField("b", "value", Store.NO)); + doc.add(new TextField("c", "value", Store.NO)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new StringField("a", "value", Store.NO)); + doc.add(new TextField("c", "value", Store.NO)); + w.addDocument(doc); + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + + Similarity searchSimilarity = randomCompatibleSimilarity(); + searcher.setSimilarity(searchSimilarity); + TopScoreDocCollectorManager collectorManager = new TopScoreDocCollectorManager(10, 10); + + CombinedFieldQuery query = + new CombinedFieldQuery.Builder("value").addField("a", 1.0f).addField("b", 1.0f).build(); + TopDocs topDocs = searcher.search(query, collectorManager); + assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits); + + CombinedFieldQuery invalidQuery = + new CombinedFieldQuery.Builder("value").addField("b", 1.0f).addField("c", 1.0f).build(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, () -> searcher.search(invalidQuery, collectorManager)); + assertTrue(e.getMessage().contains("requires norms to be consistent across fields")); + + reader.close(); + w.close(); + dir.close(); + } + + public void testCopyField() throws IOException { + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + int numMatch = atLeast(10); + int boost1 = Math.max(1, random().nextInt(5)); + int boost2 = Math.max(1, random().nextInt(5)); + for (int i = 0; i < numMatch; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { + doc.add(new TextField("a", "baz", Store.NO)); + doc.add(new TextField("b", "baz", Store.NO)); + for (int k = 0; k < boost1 + boost2; k++) { + doc.add(new TextField("ab", "baz", Store.NO)); + } + w.addDocument(doc); + doc.clear(); + } + int freqA = random().nextInt(5) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + int freqB = random().nextInt(5) + 1; + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "foo", Store.NO)); + } + int freqAB = freqA * 
boost1 + freqB * boost2; + for (int j = 0; j < freqAB; j++) { + doc.add(new TextField("ab", "foo", Store.NO)); + } + w.addDocument(doc); + } + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + + searcher.setSimilarity(similarity); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder("foo") + .addField("a", (float) boost1) + .addField("b", (float) boost2) + .build(); + + checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo"))); + + reader.close(); + w.close(); + dir.close(); + } + + public void testCopyFieldWithSingleField() throws IOException { + Directory dir = new MMapDirectory(createTempDir()); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + int boost = Math.max(1, random().nextInt(5)); + int numMatch = atLeast(10); + for (int i = 0; i < numMatch; i++) { + Document doc = new Document(); + int freqA = random().nextInt(5) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + + int freqB = freqA * boost; + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "foo", Store.NO)); + } + + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder("foo").addField("a", (float) boost).build(); + + checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("b", "foo"))); + + reader.close(); + w.close(); + dir.close(); + } + + public void testCopyFieldWithMissingFields() throws IOException { + Directory dir = new MMapDirectory(createTempDir()); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + int boost1 = Math.max(1, random().nextInt(5)); + int boost2 = Math.max(1, random().nextInt(5)); + int numMatch = atLeast(10); + for (int i = 0; i < numMatch; i++) { + Document doc = new Document(); + int freqA = random().nextInt(5) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + + // Choose frequencies such that sometimes we don't add field B + int freqB = random().nextInt(3); + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "foo", Store.NO)); + } + + int freqAB = freqA * boost1 + freqB * boost2; + for (int j = 0; j < freqAB; j++) { + doc.add(new TextField("ab", "foo", Store.NO)); + } + + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder("foo") + .addField("a", (float) boost1) + .addField("b", (float) boost2) + .build(); + + checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo"))); + + reader.close(); + w.close(); + dir.close(); + } + + private static Similarity randomCompatibleSimilarity() { + return RandomPicks.randomFrom( + random(), + Arrays.asList( + new BM25Similarity(), + new BooleanSimilarity(), + new ClassicSimilarity(), + new LMDirichletSimilarity(), + new LMJelinekMercerSimilarity(0.1f))); + } + + private void checkExpectedHits( + IndexSearcher searcher, int numHits, Query firstQuery, Query secondQuery) throws IOException { + 
TopScoreDocCollectorManager collectorManager = + new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE); + + TopDocs firstTopDocs = searcher.search(firstQuery, collectorManager); + assertEquals(numHits, firstTopDocs.totalHits.value()); + + collectorManager = new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE); + TopDocs secondTopDocs = searcher.search(secondQuery, collectorManager); + CheckHits.checkEqual(firstQuery, secondTopDocs.scoreDocs, firstTopDocs.scoreDocs); + } + + public void testDocWithNegativeNorms() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(new NegativeNormSimilarity()); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + String queryString = "foo"; + + Document doc = new Document(); + // both fields must contain tokens that match the query string "foo" + doc.add(new TextField("f", "foo", Store.NO)); + doc.add(new TextField("g", "foo baz", Store.NO)); + w.addDocument(doc); + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(new BM25Similarity()); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder(queryString).addField("f").addField("g").build(); + TopDocs topDocs = searcher.search(query, 10); + CheckHits.checkDocIds("queried docs do not match", new int[] {0}, topDocs.scoreDocs); + + reader.close(); + w.close(); + dir.close(); + } + + public void testMultipleDocsNegativeNorms() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(new NegativeNormSimilarity()); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + String queryString = "foo"; + + Document doc0 = new Document(); + doc0.add(new TextField("f", "foo", Store.NO)); + doc0.add(new TextField("g", "foo baz", Store.NO)); + w.addDocument(doc0); + + Document doc1 = new Document(); + // add another match on the query string to the second doc + doc1.add(new TextField("f", "foo is foo", Store.NO)); + doc1.add(new TextField("g", "foo baz", Store.NO)); + w.addDocument(doc1); + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(new BM25Similarity()); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder(queryString).addField("f").addField("g").build(); + TopDocs topDocs = searcher.search(query, 10); + // Return doc1 ahead of doc0 since its tf is higher + CheckHits.checkDocIds("queried docs do not match", new int[] {1, 0}, topDocs.scoreDocs); + + reader.close(); + w.close(); + dir.close(); + } + + private static final class NegativeNormSimilarity extends Similarity { + @Override + public long computeNorm(FieldInvertState state) { + return -128; + } + + @Override + public SimScorer scorer( + float boost, CollectionStatistics collectionStats, TermStatistics... 
termStats) { + return new BM25Similarity().scorer(boost, collectionStats, termStats); + } + } + + public void testOverrideCollectionStatistics() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + Similarity similarity = randomCompatibleSimilarity(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + int numMatch = atLeast(10); + for (int i = 0; i < numMatch; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { + doc.add(new TextField("a", "baz", Store.NO)); + doc.add(new TextField("b", "baz", Store.NO)); + for (int k = 0; k < 2; k++) { + doc.add(new TextField("ab", "baz", Store.NO)); + } + w.addDocument(doc); + doc.clear(); + } + int freqA = random().nextInt(5) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + int freqB = random().nextInt(5) + 1; + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "foo", Store.NO)); + } + int freqAB = freqA + freqB; + for (int j = 0; j < freqAB; j++) { + doc.add(new TextField("ab", "foo", Store.NO)); + } + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + + int extraMaxDoc = randomIntBetween(0, 10); + int extraDocCount = randomIntBetween(0, extraMaxDoc); + int extraSumDocFreq = extraDocCount + randomIntBetween(0, 10); + + int extraSumTotalTermFreqA = extraSumDocFreq + randomIntBetween(0, 10); + int extraSumTotalTermFreqB = extraSumDocFreq + randomIntBetween(0, 10); + int extraSumTotalTermFreqAB = extraSumTotalTermFreqA + extraSumTotalTermFreqB; + + IndexSearcher searcher = + new IndexSearcher(reader) { + @Override + public CollectionStatistics collectionStatistics(String field) throws IOException { + CollectionStatistics shardStatistics = super.collectionStatistics(field); + int extraSumTotalTermFreq; + if (field.equals("a")) { + extraSumTotalTermFreq = extraSumTotalTermFreqA; + } else if (field.equals("b")) { + extraSumTotalTermFreq = extraSumTotalTermFreqB; + } else if (field.equals("ab")) { + extraSumTotalTermFreq = extraSumTotalTermFreqAB; + } else { + throw new AssertionError("should never be called"); + } + return new CollectionStatistics( + field, + shardStatistics.maxDoc() + extraMaxDoc, + shardStatistics.docCount() + extraDocCount, + shardStatistics.sumTotalTermFreq() + extraSumTotalTermFreq, + shardStatistics.sumDocFreq() + extraSumDocFreq); + } + }; + searcher.setSimilarity(similarity); + CombinedFieldQuery query = + new CombinedFieldQuery.Builder("foo").addField("a").addField("b").build(); + + checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo"))); + + reader.close(); + w.close(); + dir.close(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 1ca3f790f43f..debd10ea83d9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -92,7 +92,9 @@ * org.apache.lucene.search.similarities.BM25Similarity}. * * @lucene.experimental + * @deprecated Use {@link org.apache.lucene.search.CombinedFieldQuery} instead. 
*/ +@Deprecated public final class CombinedFieldQuery extends Query implements Accountable { private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); @@ -203,7 +205,7 @@ public String toString(String field) { if (pos++ != 0) { builder.append(" "); } - builder.append(term.utf8ToString()); + builder.append(Term.toString(term)); } builder.append("))"); return builder.toString();
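For callers migrating off the sandbox class, the change amounts to a package rename; a minimal sketch (field names and boosts hypothetical):

    // Before (deprecated by this change):
    //   import org.apache.lucene.sandbox.search.CombinedFieldQuery;
    // After:
    import org.apache.lucene.search.CombinedFieldQuery;

    Query query =
        new CombinedFieldQuery.Builder("lucene")
            .addField("title", 10f)
            .addField("body")
            .build();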