Skip to content

Commit

Permalink
Fixing parser bugs for numbers 70 and over in French system (e.g. quatre-vingt-dix) (#808)
Browse files Browse the repository at this point in the history

* Fixing French number parser bugs (70+, 80+, 90+s).

* - Building Java resources and adding interpretation for numbers in Belgian/Swiss French;
- Minor fixes in Java for consistency;
- Adding Java platform to dev build.

* Java test fixes.
  • Loading branch information
tellarin authored and guom08 committed Aug 27, 2018
1 parent 484b847 commit 4f17899
Show file tree
Hide file tree
Showing 30 changed files with 302 additions and 123 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ public static class NumbersDefinitions
public const string LangMarker = "Fr";
public const string RoundNumberIntegerRegex = @"(cent|mille|millions|million|milliard|milliards|billion|billions)";
public const string ZeroToNineIntegerRegex = @"(et un|un|une|deux|trois|quatre|cinq|six|sept|huit|neuf)";
public const string TenToNineteenIntegerRegex = @"(dix\Wneuf|dix\Whuit|dix\Wsept|seize|quinze|quatorze|treize|douze|onze|dix)";
public const string TensNumberIntegerRegex = @"(octante|vingt|trente|quarante|cinquante|soixante-dix|soixante|septante|huitante|quatre-vingt-dix|nonante)";
public const string TenToNineteenIntegerRegex = @"(dix[-\s]neuf|dix[-\s]huit|dix[-\s]sept|(-)?seize|(-)?quinze|(-)?quatorze|(-)?treize|(-)?douze|(-)?onze|dix)";
public const string TensNumberIntegerRegex = @"(quatre[-\s]vingt[-\s]dix|quatre[-\s]vingt(s)?|soixante[-\s]dix|vingt|trente|quarante|cinquante|soixante|septante|octante|huitante|nonante)";
public const string DigitsNumberRegex = @"\d|\d{1,3}(\.\d{3})";
public const string NegativeNumberTermsRegex = @"^[.]";
public static readonly string NegativeNumberSignRegex = $@"^({NegativeNumberTermsRegex}\s+).*";
public static readonly string HundredsNumberIntegerRegex = $@"(({ZeroToNineIntegerRegex}(\s+cent))|cent|((\s+cent\s)+{TensNumberIntegerRegex}))";
public static readonly string BelowHundredsRegex = $@"(({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}(\W+{ZeroToNineIntegerRegex})?))|{ZeroToNineIntegerRegex})";
public static readonly string BelowHundredsRegex = $@"({TenToNineteenIntegerRegex}|({TensNumberIntegerRegex}[-\s]+{TenToNineteenIntegerRegex})|({TensNumberIntegerRegex}([-\s]+{ZeroToNineIntegerRegex})?)|{ZeroToNineIntegerRegex})";
public static readonly string BelowThousandsRegex = $@"(({HundredsNumberIntegerRegex}(\s+{BelowHundredsRegex})?|{BelowHundredsRegex}|{TenToNineteenIntegerRegex})|cent\s+{TenToNineteenIntegerRegex})";
public static readonly string SupportThousandsRegex = $@"(({BelowThousandsRegex}|{BelowHundredsRegex})\s+{RoundNumberIntegerRegex}(\s+{RoundNumberIntegerRegex})?)";
public static readonly string SeparaIntRegex = $@"({SupportThousandsRegex}(\s+{SupportThousandsRegex})*(\s+{BelowThousandsRegex})?|{BelowThousandsRegex})";
Expand Down Expand Up @@ -114,15 +114,27 @@ public static class NumbersDefinitions
{ "soixante-dix", 70 },
{ "septante", 70 },
{ "quatre-vingts", 80 },
{ "quatre-vingt", 80 },
{ "quatre vingts", 80 },
{ "quatre vingt", 80 },
{ "quatre-vingt-dix", 90 },
{ "quatre-vingt dix", 90 },
{ "quatre vingt dix", 90 },
{ "quatre-vingts-dix", 90 },
{ "quatre-vingts-onze", 91 },
{ "quatre-vingt-onze", 91 },
{ "quatre-vingts-douze", 92 },
{ "quatre-vingt-douze", 92 },
{ "quatre-vingts-treize", 93 },
{ "quatre-vingt-treize", 93 },
{ "quatre-vingts-quatorze", 94 },
{ "quatre-vingt-quatorze", 94 },
{ "quatre-vingts-quinze", 95 },
{ "quatre-vingt-quinze", 95 },
{ "quatre-vingts-seize", 96 },
{ "quatre-vingt-dix-sept", 97 },
{ "quatre-vingt-dix-neuf", 98 },
{ "quatre-vingt-seize", 96 },
{ "huitante", 80 },
{ "octante", 80 },
{ "nonante", 90 },
{ "cent", 100 },
{ "mille", 1000 },
Expand Down Expand Up @@ -271,6 +283,9 @@ public static class NumbersDefinitions
{ "septante", 70 },
{ "quatre-vingt", 80 },
{ "quatre vingt", 80 },
{ "huitante", 80 },
{ "octante", 80 },
{ "nonante", 90 },
{ "quatre vingt dix", 90 },
{ "quatre-vingt-dix", 90 },
{ "cent", 100 },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ public virtual List<ExtractResult> Extract(string source)

private List<ExtractResult> FilterAmbiguity(List<ExtractResult> ers, string text)
{
var result = new List<ExtractResult>();

if (AmbiguityFiltersDict != null)
{
Expand Down
89 changes: 48 additions & 41 deletions .NET/Microsoft.Recognizers.Text.Number/Parsers/BaseNumberParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ protected ParseResult TextNumberParse(ExtractResult extResult)
Text = extResult.Text,
Type = extResult.Type
};

var handle = extResult.Text.ToLower();

#region Special case for "dozen"
Expand All @@ -305,7 +306,7 @@ protected ParseResult TextNumberParse(ExtractResult extResult)
var intPart = numGroup[0];
var sMatch = TextNumberRegex.Match(intPart);

//Store all match str.
// Store all matched strings.
var matchStrs = new List<string>();

while (sMatch.Success)
Expand All @@ -328,12 +329,14 @@ protected ParseResult TextNumberParse(ExtractResult extResult)
var pointPart = numGroup[1];
sMatch = TextNumberRegex.Match(pointPart);
matchStrs.Clear();

while (sMatch.Success)
{
var matchStr = sMatch.Groups[0].Value.ToLower();
matchStrs.Add(matchStr);
sMatch = sMatch.NextMatch();
}

pointPartRet += GetPointValue(matchStrs);
}

Expand Down Expand Up @@ -361,13 +364,13 @@ protected ParseResult FracLikeNumberParse(ExtractResult extResult)
var numerator = match.Groups["numerator"].Value;
var denominator = match.Groups["denominator"].Value;

var smallValue = char.IsDigit(numerator[0])
? GetDigitalValue(numerator, 1)
: GetIntValue(GetMatches(numerator));
var smallValue = char.IsDigit(numerator[0]) ?
GetDigitalValue(numerator, 1) :
GetIntValue(GetMatches(numerator));

var bigValue = char.IsDigit(denominator[0])
? GetDigitalValue(denominator, 1)
: GetIntValue(GetMatches(denominator));
var bigValue = char.IsDigit(denominator[0]) ?
GetDigitalValue(denominator, 1) :
GetIntValue(GetMatches(denominator));

result.Value = smallValue / bigValue;
}
Expand All @@ -394,10 +397,10 @@ protected ParseResult FracLikeNumberParse(ExtractResult extResult)

var smHundreds = 100;

// previous : hundred
// current : one
if ((previousValue >= smHundreds && previousValue > currentValue)
|| (previousValue < smHundreds && IsComposable(currentValue, previousValue)))
// Previous : hundred
// Current : one
if ((previousValue >= smHundreds && previousValue > currentValue) ||
(previousValue < smHundreds && IsComposable(currentValue, previousValue)))
{
if (previousValue < smHundreds && currentValue >= roundValue)
{
Expand All @@ -409,18 +412,18 @@ protected ParseResult FracLikeNumberParse(ExtractResult extResult)
break;
}

// current is the first word
// Current is the first word
if (splitIndex == 0)
{
// scan, skip the first word
// Scan, skip the first word
splitIndex = 1;
while (splitIndex <= fracWords.Count - 2)
{
// e.g. one hundred thousand
// frac[i+1] % 100 && frac[i] % 100 = 0
if (Config.ResolveCompositeNumber(fracWords[splitIndex]) >= smHundreds
&& !Config.WrittenFractionSeparatorTexts.Contains(fracWords[splitIndex + 1])
&& Config.ResolveCompositeNumber(fracWords[splitIndex + 1]) < smHundreds)
if (Config.ResolveCompositeNumber(fracWords[splitIndex]) >= smHundreds &&
!Config.WrittenFractionSeparatorTexts.Contains(fracWords[splitIndex + 1]) &&
Config.ResolveCompositeNumber(fracWords[splitIndex + 1]) < smHundreds)
{
splitIndex++;
break;
Expand Down Expand Up @@ -457,9 +460,8 @@ protected ParseResult FracLikeNumberParse(ExtractResult extResult)
}
fracWords.RemoveRange(splitIndex, fracWords.Count - splitIndex);

// denomi = denominator
var denomiValue = GetIntValue(fracPart);
// Split mixed number with fraction
var denominator = GetIntValue(fracPart);
double numerValue = 0;
double intValue = 0;

Expand All @@ -479,13 +481,13 @@ protected ParseResult FracLikeNumberParse(ExtractResult extResult)
intValue = GetIntValue(GetMatches(intStr));

// Find mixed number
if (mixedIndex != fracWords.Count && numerValue < denomiValue)
if (mixedIndex != fracWords.Count && numerValue < denominator)
{
result.Value = intValue + numerValue / denomiValue;
result.Value = intValue + numerValue / denominator;
}
else
{
result.Value = (intValue + numerValue) / denomiValue;
result.Value = (intValue + numerValue) / denominator;
}
}

Expand All @@ -503,7 +505,7 @@ private List<string> GetMatches(string input)
var sMatch = TextNumberRegex.Match(input);
var matchStrs = new List<string>();

//Store all match str.
// Store all matched strings.
while (sMatch.Success)
{
var matchStr = sMatch.Groups[0].Value.ToLower();
Expand All @@ -514,8 +516,8 @@ private List<string> GetMatches(string input)
return matchStrs;
}

//Test if big and combine with small.
//e.g. "hundred" can combine with "thirty" but "twenty" can't combine with "thirty".
// Test whether big can be combined with small.
// e.g. "hundred" can combine with "thirty" but "twenty" can't combine with "thirty".
private bool IsComposable(long big, long small)
{
var baseNumber = small > 10 ? 100 : 10;
Expand All @@ -535,13 +537,13 @@ private double GetIntValue(List<string> matchStrs)
double tempValue = 0;
long endFlag = 1;

//Scan from end to start, find the end word
// Scan from end to start, find the end word
for (var i = matchStrs.Count - 1; i >= 0; i--)
{
if (RoundNumberSet.Contains(matchStrs[i]))
{
//if false,then continue
//You will meet hundred first, then thousand.
// If false, then continue
// You will meet hundred first, then thousand.
if (endFlag > Config.RoundNumberMap[matchStrs[i]])
{
continue;
Expand All @@ -566,15 +568,15 @@ private double GetIntValue(List<string> matchStrs)
? Config.CardinalNumberMap[matchStr]
: Config.OrdinalNumberMap[matchStr];

//This is just for ordinal now. Not for fraction ever.
// This is just for ordinal now. Not for fraction ever.
if (isOrdinal)
{
double fracPart = Config.OrdinalNumberMap[matchStr];
if (tempStack.Any())
{
var intPart = tempStack.Pop();

// if intPart >= fracPart, it means it is an ordinal number
// If intPart >= fracPart, it means it is an ordinal number
// it begins with an integer, ends with an ordinal
// e.g. ninety-ninth
if (intPart >= fracPart)
Expand All @@ -583,7 +585,7 @@ private double GetIntValue(List<string> matchStrs)
}
else
{
// another case of the type is ordinal
// Another case where the type is ordinal
// e.g. three hundredth

while (tempStack.Any())
Expand Down Expand Up @@ -655,8 +657,9 @@ private double GetIntValue(List<string> matchStrs)
}
}

//Calculate the part like "thirty-one"
// Calculate the part like "thirty-one"
mulValue = 1;

if (lastIndex != isEnd.Length)
{
partValue = GetIntValue(matchStrs.GetRange(lastIndex, isEnd.Length - lastIndex));
Expand Down Expand Up @@ -705,17 +708,19 @@ protected ParseResult DigitNumberParse(ExtractResult extResult)
Type = extResult.Type
};

//[1] 24
//[2] 12 32/33
//[3] 1,000,000
//[4] 234.567
//[5] 44/55
//[6] 2 hundred
//dot occured.
// [1] 24
// [2] 12 32/33
// [3] 1,000,000
// [4] 234.567
// [5] 44/55
// [6] 2 hundred
// dot occurred.
double power = 1;
var handle = extResult.Text.ToLower();
var match = Config.DigitalNumberRegex.Match(handle);
int startIndex = 0;

var match = Config.DigitalNumberRegex.Match(handle);

while (match.Success)
{
var tmpIndex = -1;
Expand All @@ -733,7 +738,7 @@ protected ParseResult DigitNumberParse(ExtractResult extResult)
match = match.NextMatch();
}

//scale used in the calculate of double
// Scale used in calculating double
result.Value = GetDigitalValue(handle, power);

return result;
Expand Down Expand Up @@ -784,9 +789,10 @@ protected double GetDigitalValue(string digitStr, double power)
isNegative = true;
}
}

calStack.Push(temp);

// is the number is a fraction.
// If the number is a fraction.
double calResult = 0;
if (isFrac)
{
Expand All @@ -799,6 +805,7 @@ protected double GetDigitalValue(string digitStr, double power)
{
calResult += calStack.Pop();
}

calResult *= power;

if (isNegative)
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
/.vs/Microsoft.Recognizers.Text/v14
/TestResults
/packages
/.idea
5 changes: 5 additions & 0 deletions Java/build.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
@ECHO off
REM Build script for the Java platform: prints a banner, then invokes Maven.
REM NOTE(review): presumably run from the Java/ directory next to the project's
REM pom.xml and with `mvn` on PATH -- confirm against the dev build that calls it.

REM Print a blank line, then the banner, so this step stands out in build output.
ECHO.
ECHO # Building Java platform
REM Run Maven through the `package` phase.
mvn package
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public List<String> getAmbiguousUnitList() {
public static Map<String, String> DimensionSuffixList = new ImmutableMap.Builder<String, String>()
.putAll(EnglishNumericWithUnit.InformationSuffixList)
.putAll(AreaExtractorConfiguration.AreaSuffixList)
.putAll(LengthExtractorConfiguration.LenghtSuffixList)
.putAll(LengthExtractorConfiguration.LengthSuffixList)
.putAll(SpeedExtractorConfiguration.SpeedSuffixList)
.putAll(VolumeExtractorConfiguration.VolumeSuffixList)
.putAll(WeightExtractorConfiguration.WeightSuffixList)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public String getExtractType() {

@Override
public Map<String, String> getSuffixList() {
return LenghtSuffixList;
return LengthSuffixList;
}

@Override
Expand All @@ -40,5 +40,5 @@ public List<String> getAmbiguousUnitList() {
return EnglishNumericWithUnit.AmbiguousLengthUnitList;
}

public static Map<String, String> LenghtSuffixList = EnglishNumericWithUnit.LenghtSuffixList;
public static Map<String, String> LengthSuffixList = EnglishNumericWithUnit.LengthSuffixList;
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ public LengthParserConfiguration() {
public LengthParserConfiguration(CultureInfo cultureInfo) {
super(cultureInfo);

this.bindDictionary(LengthExtractorConfiguration.LenghtSuffixList);
this.bindDictionary(LengthExtractorConfiguration.LengthSuffixList);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public List<String> getAmbiguousUnitList() {
public static Map<String, String> DimensionSuffixList = new ImmutableMap.Builder<String, String>()
.putAll(FrenchNumericWithUnit.InformationSuffixList)
.putAll(AreaExtractorConfiguration.AreaSuffixList)
.putAll(LengthExtractorConfiguration.LenghtSuffixList)
.putAll(LengthExtractorConfiguration.LengthSuffixList)
.putAll(SpeedExtractorConfiguration.SpeedSuffixList)
.putAll(VolumeExtractorConfiguration.VolumeSuffixList)
.putAll(WeightExtractorConfiguration.WeightSuffixList)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public String getExtractType() {

@Override
public Map<String, String> getSuffixList() {
return LenghtSuffixList;
return LengthSuffixList;
}

@Override
Expand All @@ -40,5 +40,5 @@ public List<String> getAmbiguousUnitList() {
return FrenchNumericWithUnit.AmbiguousLengthUnitList;
}

public static Map<String, String> LenghtSuffixList = FrenchNumericWithUnit.LengthSuffixList;
public static Map<String, String> LengthSuffixList = FrenchNumericWithUnit.LengthSuffixList;
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ public LengthParserConfiguration() {
public LengthParserConfiguration(CultureInfo cultureInfo) {
super(cultureInfo);

this.bindDictionary(LengthExtractorConfiguration.LenghtSuffixList);
this.bindDictionary(LengthExtractorConfiguration.LengthSuffixList);
}
}
Loading

0 comments on commit 4f17899

Please sign in to comment.