Enhance DictionaryCompoundWordTokenFilter #14278

Open · wants to merge 5 commits into main (changes from 4 commits shown)
@@ -28,6 +28,8 @@
*/
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {

private boolean consumeChars = false;

/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
@@ -50,15 +52,21 @@ public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet diction
* @param minSubwordSize only subwords longer than this get to the output stream
* @param maxSubwordSize only subwords shorter than this get to the output stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @param consumeChars if true, the characters of a matched subword are consumed and no longer
*     available for further overlapping matches. E.g. once "schwein" has been extracted, the
*     overlapping subword "wein" is not extracted anymore.
*/
public DictionaryCompoundWordTokenFilter(
TokenStream input,
CharArraySet dictionary,
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
-    boolean onlyLongestMatch) {
+    boolean onlyLongestMatch,
+    boolean consumeChars) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
this.consumeChars = consumeChars;

if (dictionary == null) {
throw new IllegalArgumentException("dictionary must not be null");
}
@@ -87,6 +95,11 @@ protected void decompose() {
}
}
}

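// consumeChars: skip past the characters of the longest match so that overlapping
// subwords (e.g. "wein" inside "schwein") are no longer considered; the -1 offsets
// the loop's own increment of i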
if (longestMatchToken != null && consumeChars) {
i += longestMatchToken.txt.length() - 1;
}

if (this.onlyLongestMatch && longestMatchToken != null) {
tokens.add(longestMatchToken);
}
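For reviewers who want to try the change locally, a minimal sketch of the new flag's effect (plain Lucene analysis APIs; the dictionary contents and tokenizer wiring below are illustrative, not part of this diff):

CharArraySet dict = new CharArraySet(Arrays.asList("wein", "schwein", "fleisch"), true);
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("schweinefleisch"));
TokenStream ts =
    new DictionaryCompoundWordTokenFilter(
        tokenizer,
        dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        true,  // onlyLongestMatch
        true); // consumeChars
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(term); // schweinefleisch, schwein, fleisch ("wein" is consumed away)
}
ts.end();
ts.close();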
@@ -51,6 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
private final int minSubwordSize;
private final int maxSubwordSize;
private final boolean onlyLongestMatch;
private final boolean consumeChars;

/** Creates a new DictionaryCompoundWordTokenFilterFactory */
public DictionaryCompoundWordTokenFilterFactory(Map<String, String> args) {
@@ -62,6 +63,7 @@ public DictionaryCompoundWordTokenFilterFactory(Map<String, String> args) {
maxSubwordSize =
getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
consumeChars = getBoolean(args, "consumeChars", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -84,6 +86,12 @@ public TokenStream create(TokenStream input) {
return input;
}
return new DictionaryCompoundWordTokenFilter(
-    input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    input,
+    dictionary,
+    minWordSize,
+    minSubwordSize,
+    maxSubwordSize,
+    onlyLongestMatch,
+    consumeChars);
}
}
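Once the factory change lands, the flag can be exercised end to end via CustomAnalyzer, e.g. (a sketch; the "dictionaryCompoundWord" SPI name and the dictionary resource name are assumptions for illustration):

Analyzer analyzer =
    CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter(
            "dictionaryCompoundWord",
            "dictionary", "compound-dict.txt", // hypothetical classpath resource
            "onlyLongestMatch", "true",
            "consumeChars", "true")
        .build();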
@@ -251,7 +251,8 @@ public void testDumbCompoundWordsSELongestMatch() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
-    true);
+    true,
+    false);

assertTokenStreamContents(
tf,
@@ -275,6 +276,7 @@ public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);

assertTokenStreamContents(
@@ -297,6 +299,7 @@ public void testWordComponentWithLessThanMinimumLength() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);

// since "d" is shorter than the minimum subword size, it should not be added to the token
@@ -323,6 +326,7 @@ public void testReset() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);

CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
@@ -351,6 +355,7 @@ public void testRetainMockAttribute() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
stream.reset();
@@ -682,4 +687,41 @@ protected TokenStreamComponents createComponents(String fieldName) {
checkOneTerm(b, "", "");
b.close();
}

public void testDecompoundingWithConsumingChars() throws Exception {

CharArraySet dict = makeDictionary("wein", "schwein", "fleisch");

Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
String searchTerm = "schweinefleisch";
DictionaryCompoundWordTokenFilter tf =
getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict);

assertTokenStreamContents(tf, new String[] {searchTerm, "schwein", "fleisch"});
}

public void testDecompoundingWithConsumingChars2() throws Exception {
CharArraySet dict = makeDictionary("waffe", "affe", "kampf");

Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
String searchTerm = "nahkampfwaffen";

DictionaryCompoundWordTokenFilter tf =
getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict);

assertTokenStreamContents(tf, new String[] {searchTerm, "kampf", "waffe"});
}

private DictionaryCompoundWordTokenFilter getDictionaryCompoundWordTokenFilter(
Tokenizer tokenizer, String searchTerm, CharArraySet dict) {
tokenizer.setReader(new StringReader(searchTerm));
return new DictionaryCompoundWordTokenFilter(
tokenizer,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
true,
true);
Review comment (Member):
could we add a case with:

longestMatch = false;
consumeChars = true;

If the combination doesn't make sense, let's just throw an IllegalArgumentException in the constructor and have the test expectThrows() that?

}
}
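Following up on the review comment above: if the onlyLongestMatch=false / consumeChars=true combination is rejected, the suggested test could look roughly like this (a sketch, not part of the diff; the method name is hypothetical and it assumes the constructor is changed to throw):

public void testConsumeCharsRequiresOnlyLongestMatch() throws Exception {
  CharArraySet dict = makeDictionary("wein", "schwein", "fleisch");
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("schweinefleisch"));
  // hypothetical guard: consumeChars=true without onlyLongestMatch=true is rejected
  expectThrows(
      IllegalArgumentException.class,
      () ->
          new DictionaryCompoundWordTokenFilter(
              tokenizer,
              dict,
              CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
              CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
              CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
              false, // onlyLongestMatch
              true)); // consumeChars
}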