From 0d660d029f7410177ae68b39904fe47e24ae3d98 Mon Sep 17 00:00:00 2001 From: Renato Haeberli <> Date: Sat, 22 Feb 2025 22:20:56 +0100 Subject: [PATCH 1/5] introducing option to consume characters when words are decompounded, in order to prevent matches on sub-words --- .../DictionaryCompoundWordTokenFilter.java | 15 ++++++- ...tionaryCompoundWordTokenFilterFactory.java | 10 ++++- .../compound/TestCompoundWordTokenFilter.java | 44 ++++++++++++++++++- 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index c6278a80a1f3..919594ca91f6 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -28,6 +28,8 @@ */ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase { + private boolean consumeChars = false; + /** * Creates a new {@link DictionaryCompoundWordTokenFilter} * @@ -50,6 +52,9 @@ public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet diction * @param minSubwordSize only subwords longer than this get to the output stream * @param maxSubwordSize only subwords shorter than this get to the output stream * @param onlyLongestMatch Add only the longest matching subword to the stream + * @param consumeChars Characters are consumes, if a matching word is found and not used for + * further potential matches (e.g. 
if the word "schwein" is extracted, the sub-word "wein" is + * not extracted anymore */ public DictionaryCompoundWordTokenFilter( TokenStream input, @@ -57,8 +62,11 @@ public DictionaryCompoundWordTokenFilter( int minWordSize, int minSubwordSize, int maxSubwordSize, - boolean onlyLongestMatch) { + boolean onlyLongestMatch, + boolean consumeChars) { super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + this.consumeChars = consumeChars; + if (dictionary == null) { throw new IllegalArgumentException("dictionary must not be null"); } @@ -87,6 +95,11 @@ protected void decompose() { } } } + + if (longestMatchToken != null && consumeChars) { + i += longestMatchToken.txt.length() - 1; + } + if (this.onlyLongestMatch && longestMatchToken != null) { tokens.add(longestMatchToken); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java index 69819736d692..c18d26c120cf 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java @@ -51,6 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory private final int minSubwordSize; private final int maxSubwordSize; private final boolean onlyLongestMatch; + private final boolean consumeChars; /** Creates a new DictionaryCompoundWordTokenFilterFactory */ public DictionaryCompoundWordTokenFilterFactory(Map args) { @@ -62,6 +63,7 @@ public DictionaryCompoundWordTokenFilterFactory(Map args) { maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true); + consumeChars = getBoolean(args, 
"consumeChars", false); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -84,6 +86,12 @@ public TokenStream create(TokenStream input) { return input; } return new DictionaryCompoundWordTokenFilter( - input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + input, + dictionary, + minWordSize, + minSubwordSize, + maxSubwordSize, + onlyLongestMatch, + consumeChars); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index 1e5ca1417c6d..0271df8fbb6b 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -251,7 +251,8 @@ public void testDumbCompoundWordsSELongestMatch() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - true); + true, + false); assertTokenStreamContents( tf, @@ -275,6 +276,7 @@ public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false, false); assertTokenStreamContents( @@ -297,6 +299,7 @@ public void testWordComponentWithLessThanMinimumLength() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false, false); // since "d" is shorter than the minimum subword size, it should not be added to the token @@ -323,6 +326,7 @@ public void testReset() throws Exception { 
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false, false); CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class); @@ -351,6 +355,7 @@ public void testRetainMockAttribute() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false, false); MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class); stream.reset(); @@ -682,4 +687,41 @@ protected TokenStreamComponents createComponents(String fieldName) { checkOneTerm(b, "", ""); b.close(); } + + public void testDecompoundingWithConsumingChars() throws Exception { + + CharArraySet dict = makeDictionary("wein", "schwein", "fleisch"); + + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + String searchTerm = "schweinefleisch"; + DictionaryCompoundWordTokenFilter tf = + getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict); + + assertTokenStreamContents(tf, new String[] {searchTerm, "schwein", "fleisch"}); + } + + public void testDecompoundingWithConsumingChars2() throws Exception { + CharArraySet dict = makeDictionary("waffe", "affe", "kampf"); + + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + String searchTerm = "nahkampfwaffen"; + + DictionaryCompoundWordTokenFilter tf = + getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict); + + assertTokenStreamContents(tf, new String[] {searchTerm, "kampf", "waffe"}); + } + + private DictionaryCompoundWordTokenFilter getDictionaryCompoundWordTokenFilter( + Tokenizer tokenizer, String searchTerm, CharArraySet dict) { + tokenizer.setReader(new StringReader(searchTerm)); + return new DictionaryCompoundWordTokenFilter( + tokenizer, + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + 
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + true, + true); + } } From edaea41d5c606d76278fe3600b55fbd731b62d1e Mon Sep 17 00:00:00 2001 From: Renato Haeberli <> Date: Sat, 22 Feb 2025 22:23:56 +0100 Subject: [PATCH 2/5] fixing typo --- .../analysis/compound/DictionaryCompoundWordTokenFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index 919594ca91f6..eaebe94c5f3c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -52,7 +52,7 @@ public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet diction * @param minSubwordSize only subwords longer than this get to the output stream * @param maxSubwordSize only subwords shorter than this get to the output stream * @param onlyLongestMatch Add only the longest matching subword to the stream - * @param consumeChars Characters are consumes, if a matching word is found and not used for + * @param consumeChars Characters are consumed, if a matching word is found and not used for * further potential matches (e.g. 
if the word "schwein" is extracted, the sub-word "wein" is * not extracted anymore */ From 3af19241e7ead1fdd02d752271bea378e658ab58 Mon Sep 17 00:00:00 2001 From: Renato Haeberli <> Date: Sat, 22 Feb 2025 22:30:07 +0100 Subject: [PATCH 3/5] fixing typo --- .../analysis/compound/DictionaryCompoundWordTokenFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index eaebe94c5f3c..ea25775bbf48 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -53,8 +53,8 @@ public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet diction * @param maxSubwordSize only subwords shorter than this get to the output stream * @param onlyLongestMatch Add only the longest matching subword to the stream * @param consumeChars Characters are consumed, if a matching word is found and not used for - * further potential matches (e.g. if the word "schwein" is extracted, the sub-word "wein" is - * not extracted anymore + * further potential matches anymore. E.g. if the word "schwein" is extracted, the sub-word "wein" is + * not extracted anymore. 
*/ public DictionaryCompoundWordTokenFilter( TokenStream input, From e292287b08cab2ab8dc1d2318dd4f2c1f4f7f381 Mon Sep 17 00:00:00 2001 From: Renato Haeberli <> Date: Sat, 22 Feb 2025 22:48:38 +0100 Subject: [PATCH 4/5] fix format of java-doc --- .../analysis/compound/DictionaryCompoundWordTokenFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index ea25775bbf48..e1f2ce273f06 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -53,8 +53,8 @@ public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet diction * @param maxSubwordSize only subwords shorter than this get to the output stream * @param onlyLongestMatch Add only the longest matching subword to the stream * @param consumeChars Characters are consumed, if a matching word is found and not used for - * further potential matches anymore. E.g. if the word "schwein" is extracted, the sub-word "wein" is - * not extracted anymore. + * further potential matches anymore. E.g. if the word "schwein" is extracted, the sub-word + * "wein" is not extracted anymore. 
*/ public DictionaryCompoundWordTokenFilter( TokenStream input, From a4ab9ef440a5f887b05eeeeea28afd5140caf0a8 Mon Sep 17 00:00:00 2001 From: Renato Haeberli <> Date: Mon, 24 Feb 2025 20:26:45 +0100 Subject: [PATCH 5/5] rename consumeChars to reuseChars (inverted meaning, default true), reject reuseChars=false without onlyLongestMatch, update java-doc --- .../DictionaryCompoundWordTokenFilter.java | 19 +++++++----- ...tionaryCompoundWordTokenFilterFactory.java | 6 ++-- .../compound/TestCompoundWordTokenFilter.java | 31 +++++++++++++------ 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index e1f2ce273f06..5a997e375d3b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -28,7 +28,7 @@ */ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase { - private boolean consumeChars = false; + private boolean reuseChars = true; /** * Creates a new {@link DictionaryCompoundWordTokenFilter} @@ -52,9 +52,9 @@ public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet diction * @param minSubwordSize only subwords longer than this get to the output stream * @param maxSubwordSize only subwords shorter than this get to the output stream * @param onlyLongestMatch Add only the longest matching subword to the stream - * @param consumeChars Characters are consumed, if a matching word is found and not used for - * further potential matches anymore. E.g. if the word "schwein" is extracted, the sub-word - * "wein" is not extracted anymore. + * @param reuseChars Characters are reused for multiple matching words, e.g. if a word contains + * 'schwein', the word 'schwein' and 'wein' will be extracted. 
If set to false, only the + * longer word, 'schwein' in this case, will be extracted. */ public DictionaryCompoundWordTokenFilter( TokenStream input, @@ -63,13 +63,18 @@ public DictionaryCompoundWordTokenFilter( int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch, - boolean consumeChars) { + boolean reuseChars) { super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); - this.consumeChars = consumeChars; + this.reuseChars = reuseChars; if (dictionary == null) { throw new IllegalArgumentException("dictionary must not be null"); } + + if (!reuseChars && !onlyLongestMatch) { + throw new IllegalArgumentException( + "reuseChars can only be set to false if onlyLongestMatch is set to true"); + } } @Override @@ -96,7 +101,7 @@ protected void decompose() { } } - if (longestMatchToken != null && consumeChars) { + if (longestMatchToken != null && !reuseChars) { i += longestMatchToken.txt.length() - 1; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java index c18d26c120cf..effe6e7379db 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java @@ -51,7 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory private final int minSubwordSize; private final int maxSubwordSize; private final boolean onlyLongestMatch; - private final boolean consumeChars; + private final boolean reuseChars; /** Creates a new DictionaryCompoundWordTokenFilterFactory */ public DictionaryCompoundWordTokenFilterFactory(Map args) { @@ -63,7 +63,7 @@ public DictionaryCompoundWordTokenFilterFactory(Map args) { maxSubwordSize = getInt(args, 
"maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true); - consumeChars = getBoolean(args, "consumeChars", false); + reuseChars = getBoolean(args, "reuseChars", true); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -92,6 +92,6 @@ public TokenStream create(TokenStream input) { minSubwordSize, maxSubwordSize, onlyLongestMatch, - consumeChars); + reuseChars); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index 0271df8fbb6b..7e5dda0c57a9 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -20,6 +20,7 @@ import java.io.Reader; import java.io.StringReader; import java.util.Arrays; +import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; @@ -252,7 +253,7 @@ public void testDumbCompoundWordsSELongestMatch() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true, - false); + true); assertTokenStreamContents( tf, @@ -277,7 +278,7 @@ public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false, - false); + true); assertTokenStreamContents( tf, @@ -300,7 +301,7 @@ public void testWordComponentWithLessThanMinimumLength() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false, - false); + true); // since "d" is shorter than 
the minimum subword size, it should not be added to the token // stream @@ -327,7 +328,7 @@ public void testReset() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false, - false); + true); CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class); tf.reset(); @@ -356,7 +357,7 @@ public void testRetainMockAttribute() throws Exception { CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false, - false); + true); MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class); stream.reset(); while (stream.incrementToken()) { @@ -695,7 +696,7 @@ public void testDecompoundingWithConsumingChars() throws Exception { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); String searchTerm = "schweinefleisch"; DictionaryCompoundWordTokenFilter tf = - getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict); + getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict, true); assertTokenStreamContents(tf, new String[] {searchTerm, "schwein", "fleisch"}); } @@ -707,13 +708,23 @@ public void testDecompoundingWithConsumingChars2() throws Exception { String searchTerm = "nahkampfwaffen"; DictionaryCompoundWordTokenFilter tf = - getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict); + getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict, true); assertTokenStreamContents(tf, new String[] {searchTerm, "kampf", "waffe"}); } + public void testDecompoundingWithInvalidParameterCombination() { + + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + expectThrows( + IllegalArgumentException.class, + () -> + getDictionaryCompoundWordTokenFilter( + tokenizer, "", new CharArraySet(List.of(), true), false)); + } + private DictionaryCompoundWordTokenFilter getDictionaryCompoundWordTokenFilter( - Tokenizer tokenizer, String searchTerm, CharArraySet dict) { 
+ Tokenizer tokenizer, String searchTerm, CharArraySet dict, boolean onlyLongestMatch) { tokenizer.setReader(new StringReader(searchTerm)); return new DictionaryCompoundWordTokenFilter( tokenizer, @@ -721,7 +732,7 @@ private DictionaryCompoundWordTokenFilter getDictionaryCompoundWordTokenFilter( CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - true, - true); + onlyLongestMatch, + false); } }