Enhance DictionaryCompoundWordTokenFilter #14278

Open · wants to merge 5 commits into main (changes from 4 commits shown)
@@ -28,6 +28,8 @@
*/
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {

private boolean consumeChars = false;

/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
@@ -50,15 +52,21 @@ public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet diction
* @param minSubwordSize only subwords longer than this get to the output stream
* @param maxSubwordSize only subwords shorter than this get to the output stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @param consumeChars if true, the characters of a matched subword are consumed and no longer
*     available for further overlapping matches. E.g. once "schwein" has been extracted, the
*     overlapping subword "wein" is not extracted anymore.
*/
public DictionaryCompoundWordTokenFilter(
TokenStream input,
CharArraySet dictionary,
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
-    boolean onlyLongestMatch) {
+    boolean onlyLongestMatch,
+    boolean consumeChars) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
this.consumeChars = consumeChars;

if (dictionary == null) {
throw new IllegalArgumentException("dictionary must not be null");
}
@@ -87,6 +95,11 @@ protected void decompose() {
}
}
}

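// consumeChars: skip past the characters of the longest match so that overlapping
// subwords (e.g. "wein" inside "schwein") are no longer considered; the -1 offsets
// the loop's own increment of i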
if (longestMatchToken != null && consumeChars) {
i += longestMatchToken.txt.length() - 1;
}

if (this.onlyLongestMatch && longestMatchToken != null) {
tokens.add(longestMatchToken);
}
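For reviewers who want to try the change locally, a minimal sketch of the new flag's effect (plain Lucene analysis APIs; the dictionary contents and tokenizer wiring below are illustrative, not part of this diff):

CharArraySet dict = new CharArraySet(Arrays.asList("wein", "schwein", "fleisch"), true);
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("schweinefleisch"));
TokenStream ts =
    new DictionaryCompoundWordTokenFilter(
        tokenizer,
        dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        true,  // onlyLongestMatch
        true); // consumeChars
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(term); // schweinefleisch, schwein, fleisch ("wein" is consumed away)
}
ts.end();
ts.close();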
@@ -51,6 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
private final int minSubwordSize;
private final int maxSubwordSize;
private final boolean onlyLongestMatch;
private final boolean consumeChars;

/** Creates a new DictionaryCompoundWordTokenFilterFactory */
public DictionaryCompoundWordTokenFilterFactory(Map<String, String> args) {
@@ -62,6 +63,7 @@ public DictionaryCompoundWordTokenFilterFactory(Map<String, String> args) {
maxSubwordSize =
getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
consumeChars = getBoolean(args, "consumeChars", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -84,6 +86,12 @@ public TokenStream create(TokenStream input) {
return input;
}
return new DictionaryCompoundWordTokenFilter(
-    input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    input,
+    dictionary,
+    minWordSize,
+    minSubwordSize,
+    maxSubwordSize,
+    onlyLongestMatch,
+    consumeChars);
}
}
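Once the factory change lands, the flag can be exercised end to end via CustomAnalyzer, e.g. (a sketch; the "dictionaryCompoundWord" SPI name and the dictionary resource name are assumptions for illustration):

Analyzer analyzer =
    CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter(
            "dictionaryCompoundWord",
            "dictionary", "compound-dict.txt", // hypothetical classpath resource
            "onlyLongestMatch", "true",
            "consumeChars", "true")
        .build();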
@@ -251,7 +251,8 @@ public void testDumbCompoundWordsSELongestMatch() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
-    true);
+    true,
+    false);

assertTokenStreamContents(
tf,
@@ -275,6 +276,7 @@ public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);

assertTokenStreamContents(
@@ -297,6 +299,7 @@ public void testWordComponentWithLessThanMinimumLength() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);

// since "d" is shorter than the minimum subword size, it should not be added to the token
@@ -323,6 +326,7 @@ public void testReset() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);

CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
@@ -351,6 +355,7 @@ public void testRetainMockAttribute() throws Exception {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
false);
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
stream.reset();
@@ -682,4 +687,41 @@ protected TokenStreamComponents createComponents(String fieldName) {
checkOneTerm(b, "", "");
b.close();
}

public void testDecompoundingWithConsumingChars() throws Exception {

CharArraySet dict = makeDictionary("wein", "schwein", "fleisch");

Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
String searchTerm = "schweinefleisch";
DictionaryCompoundWordTokenFilter tf =
getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict);

assertTokenStreamContents(tf, new String[] {searchTerm, "schwein", "fleisch"});
}

public void testDecompoundingWithConsumingChars2() throws Exception {
CharArraySet dict = makeDictionary("waffe", "affe", "kampf");

Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
String searchTerm = "nahkampfwaffen";

DictionaryCompoundWordTokenFilter tf =
getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict);

assertTokenStreamContents(tf, new String[] {searchTerm, "kampf", "waffe"});
}

private DictionaryCompoundWordTokenFilter getDictionaryCompoundWordTokenFilter(
Tokenizer tokenizer, String searchTerm, CharArraySet dict) {
tokenizer.setReader(new StringReader(searchTerm));
return new DictionaryCompoundWordTokenFilter(
tokenizer,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
true,
true);
Review comment (Member):
could we add a case with:

longestMatch = false;
consumeChars = true;

If the combination doesn't make sense, let's just throw an IllegalArgumentException in the constructor and have the test expectThrows() that?

}
}
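Following up on the review comment above: if the onlyLongestMatch=false / consumeChars=true combination is rejected, the suggested test could look roughly like this (a sketch, not part of the diff; the method name is hypothetical and it assumes the constructor is changed to throw):

public void testConsumeCharsRequiresOnlyLongestMatch() throws Exception {
  CharArraySet dict = makeDictionary("wein", "schwein", "fleisch");
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("schweinefleisch"));
  // hypothetical guard: consumeChars=true without onlyLongestMatch=true is rejected
  expectThrows(
      IllegalArgumentException.class,
      () ->
          new DictionaryCompoundWordTokenFilter(
              tokenizer,
              dict,
              CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
              CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
              CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
              false, // onlyLongestMatch
              true)); // consumeChars
}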