From c6c5584b58520162feac068c93e6d3e340e3ebfc Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Wed, 16 Aug 2023 02:49:15 +0300 Subject: [PATCH 01/20] Add classes representing parser messages and lexical errors. --- src/FarkleNeo/Diagnostics/LexicalError.cs | 47 ++++++++++++++++ src/FarkleNeo/Diagnostics/ParserDiagnostic.cs | 55 +++++++++++++++++++ src/FarkleNeo/Properties/Resources.cs | 16 ++++++ src/FarkleNeo/Properties/Resources.el.resx | 3 + src/FarkleNeo/Properties/Resources.resx | 5 +- 5 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 src/FarkleNeo/Diagnostics/LexicalError.cs create mode 100644 src/FarkleNeo/Diagnostics/ParserDiagnostic.cs diff --git a/src/FarkleNeo/Diagnostics/LexicalError.cs b/src/FarkleNeo/Diagnostics/LexicalError.cs new file mode 100644 index 00000000..24d5a577 --- /dev/null +++ b/src/FarkleNeo/Diagnostics/LexicalError.cs @@ -0,0 +1,47 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +namespace Farkle.Diagnostics; + +/// +/// Contains information about a lexical error. +/// +/// +/// A lexical error occurs when the tokenizer cannot recognize some characters as part of a token. +/// +public sealed class LexicalError : IFormattable +{ + /// + /// The characters of the token that caused the error. + /// + /// + /// This value might be truncated by Farkle if the token is too long or spans multiple lines. + /// + public string? TokenText { get; } + + /// + /// The number of the tokenizer's state machine at the time of the error. + /// + public int TokenizerState { get; } + + /// + /// Creates a . + /// + /// The value of . + /// The value of . + /// Optional, defaults to -1. + public LexicalError(string? tokenText, int tokenizerState = -1) + { + TokenText = tokenText; + TokenizerState = tokenizerState; + } + + private string ToString(IFormatProvider? 
formatProvider) => + Resources.Format(formatProvider, nameof(Resources.Parser_UnrecognizedToken), TokenText); + + /// + public string ToString(string? format, IFormatProvider? formatProvider) => ToString(formatProvider); + + /// + public override string ToString() => ToString(null); +} diff --git a/src/FarkleNeo/Diagnostics/ParserDiagnostic.cs b/src/FarkleNeo/Diagnostics/ParserDiagnostic.cs new file mode 100644 index 00000000..bae9eaa6 --- /dev/null +++ b/src/FarkleNeo/Diagnostics/ParserDiagnostic.cs @@ -0,0 +1,55 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +namespace Farkle.Diagnostics; + +/// +/// Represents a diagnostic message from the parser. +/// +public sealed class ParserDiagnostic : IFormattable +#if NET6_0_OR_GREATER + , ISpanFormattable +#endif +{ + /// + /// The position the message was reported at. + /// + public TextPosition Position { get; } + + /// + /// An that describes the message. + /// + public object Message { get; } + + /// + /// Creates a . + /// + /// The value of . + /// The value of . + /// + /// is . + public ParserDiagnostic(TextPosition position, object message) + { + ArgumentNullExceptionCompat.ThrowIfNull(message); + Position = position; + Message = message; + } + + private string ToString(IFormatProvider? formatProvider) => +#if NET6_0_OR_GREATER + string.Create(formatProvider, $"{Position} {Message}"); +#else + ((FormattableString)$"{Position} {Message}").ToString(formatProvider); +#endif + +#if NET6_0_OR_GREATER + bool ISpanFormattable.TryFormat(Span destination, out int charsWritten, ReadOnlySpan format, IFormatProvider? provider) => + destination.TryWrite(provider, $"{Position} {Message}", out charsWritten); +#endif + + /// + public string ToString(string? format, IFormatProvider? 
formatProvider) => ToString(formatProvider); + + /// + public override string ToString() => ToString(null); +} diff --git a/src/FarkleNeo/Properties/Resources.cs b/src/FarkleNeo/Properties/Resources.cs index 2b0d90ce..bf73cdf6 100644 --- a/src/FarkleNeo/Properties/Resources.cs +++ b/src/FarkleNeo/Properties/Resources.cs @@ -30,6 +30,20 @@ public static string GetResourceString(string resourceKey, CultureInfo? cultureI return ResourceManager.GetString(resourceKey, cultureInfo)!; } + public static string Format(IFormatProvider? formatProvider, string resourceKey, T arg) + { + if (UsingResourceKeys()) + { +#if NET6_0_OR_GREATER + return string.Create(formatProvider, $"{resourceKey}, {arg}"); +#else + return ((FormattableString)$"{resourceKey}, {arg}").ToString(formatProvider); +#endif + } + + return string.Format(formatProvider, ResourceManager.GetString(resourceKey, culture: formatProvider as CultureInfo), arg); + } + public static string Grammar_TooNewFormat => GetResourceString(nameof(Grammar_TooNewFormat)); public static string Grammar_TooOldFormat => GetResourceString(nameof(Grammar_TooOldFormat)); @@ -53,4 +67,6 @@ public static string GetResourceString(string resourceKey, CultureInfo? cultureI public static string ChainedTokenizerBuilder_NoDefaultTokenizer => GetResourceString(nameof(ChainedTokenizerBuilder_NoDefaultTokenizer)); public static string Tokenizer_AlreadySuspended => GetResourceString(nameof(Tokenizer_AlreadySuspended)); + + public static string Parser_UnrecognizedToken => GetResourceString(nameof(Parser_UnrecognizedToken)); } diff --git a/src/FarkleNeo/Properties/Resources.el.resx b/src/FarkleNeo/Properties/Resources.el.resx index 84f2c0a4..a3fedd2d 100644 --- a/src/FarkleNeo/Properties/Resources.el.resx +++ b/src/FarkleNeo/Properties/Resources.el.resx @@ -48,4 +48,7 @@ Ο λεκτικός αναλυτής έχει ήδη παυθεί. 
+ + Άγνωστο σύμβολο '{0}' + \ No newline at end of file diff --git a/src/FarkleNeo/Properties/Resources.resx b/src/FarkleNeo/Properties/Resources.resx index 42ca5993..94d9267a 100644 --- a/src/FarkleNeo/Properties/Resources.resx +++ b/src/FarkleNeo/Properties/Resources.resx @@ -1,4 +1,4 @@ - + text/microsoft-resx @@ -48,4 +48,7 @@ The tokenizer has already been suspended. + + Unrecognized token '{0}' + \ No newline at end of file From a7b98b699f541547744c20260ee4393e33a9069d Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Wed, 16 Aug 2023 17:28:19 +0300 Subject: [PATCH 02/20] Start writing the default tokenizer. --- .../Parser/Implementation/DefaultTokenizer.cs | 110 ++++++++++++++++++ .../Parser/Implementation/ParserCommon.cs | 42 +++++++ 2 files changed, 152 insertions(+) create mode 100644 src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs create mode 100644 src/FarkleNeo/Parser/Implementation/ParserCommon.cs diff --git a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs new file mode 100644 index 00000000..7064c9b1 --- /dev/null +++ b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs @@ -0,0 +1,110 @@ +// Copyright © Theodore Tsirpanis and Contributors. 
+// SPDX-License-Identifier: MIT + +using Farkle.Diagnostics; +using Farkle.Grammars; +using Farkle.Grammars.StateMachines; +using Farkle.Parser.Semantics; +using Farkle.Parser.Tokenizers; +using System.Diagnostics; + +namespace Farkle.Parser.Implementation; + +internal sealed class DefaultTokenizer : Tokenizer +{ + private readonly Grammar _grammar; + private readonly Dfa _dfa; + + public DefaultTokenizer(Grammar grammar, Dfa dfa) + { + Debug.Assert(!dfa.HasConflicts); + _grammar = grammar; + _dfa = dfa; + } + + private (TokenSymbolHandle AcceptSymbol, int CharactersRead, int TokenizerState) TokenizeDfa(ReadOnlySpan chars, bool isFinal, bool ignoreLeadingErrors = false) + { + TokenSymbolHandle acceptSymbol = default; + int acceptSymbolLength = 0; + + int currentState = _dfa.InitialState; + int i; + for (i = 0; i < chars.Length; i++) + { + TChar c = chars[i]; + int nextState = _dfa.NextState(currentState, c); + if (nextState >= 0) + { + ignoreLeadingErrors = false; + currentState = nextState; + if (_dfa.GetAcceptSymbol(currentState) is { HasValue: true } s) + { + acceptSymbol = s; + acceptSymbolLength = i + 1; + } + } + else if (!ignoreLeadingErrors) + { + goto Return; + } + } + + if (!isFinal) + { + acceptSymbol = default; + } + + Return: + if (acceptSymbol.HasValue) + { + return (acceptSymbol, acceptSymbolLength, currentState); + } + return (default, i + 1, currentState); + } + + public override bool TryGetNextToken(ref ParserInputReader input, ITokenSemanticProvider semanticProvider, out TokenizerResult result) + { + ref ParserState state = ref input.State; + while (true) + { + if (input.RemainingCharacters.IsEmpty) + { + result = default; + return false; + } + + var (acceptSymbol, charactersRead, tokenizerState) = + TokenizeDfa(input.RemainingCharacters, input.IsFinalBlock); + ReadOnlySpan lexeme = input.RemainingCharacters[..charactersRead]; + + if (acceptSymbol.HasValue) + { + TokenSymbolAttributes symbolFlags = 
_grammar.GetTokenSymbol(acceptSymbol).Attributes; + if ((symbolFlags & TokenSymbolAttributes.Terminal) != 0) + { + object? semanticValue = semanticProvider.Transform(ref state, acceptSymbol, lexeme); + result = TokenizerResult.CreateSuccess(acceptSymbol, semanticValue, state.CurrentPosition); + input.Consume(charactersRead); + return true; + } + if ((symbolFlags & TokenSymbolAttributes.Noise) != 0) + { + input.Consume(charactersRead); + continue; + } + } + + if (!input.IsFinalBlock && charactersRead == input.RemainingCharacters.Length) + { + input.SuspendTokenizer(this); + result = default; + return false; + } + + string errorText = ParserCommon.GetAbbreviatedLexicalErrorText(lexeme); + result = TokenizerResult.CreateError(new ParserDiagnostic(state.CurrentPosition, + new LexicalError(errorText, tokenizerState))); + return true; + } + } +} diff --git a/src/FarkleNeo/Parser/Implementation/ParserCommon.cs b/src/FarkleNeo/Parser/Implementation/ParserCommon.cs new file mode 100644 index 00000000..5699da2c --- /dev/null +++ b/src/FarkleNeo/Parser/Implementation/ParserCommon.cs @@ -0,0 +1,42 @@ +// Copyright © Theodore Tsirpanis and Contributors. 
+// SPDX-License-Identifier: MIT + +namespace Farkle.Parser.Implementation; + +internal static class ParserCommon +{ + private static string GetAbbreviatedLexicalErrorText(ReadOnlySpan chars) + { + const int MaxLength = 20; + bool isAbbreviated = false; + int eolIndex = chars.IndexOfAny('\n', '\r'); + if (eolIndex >= 0) + { + chars = chars[..eolIndex]; + isAbbreviated = true; + } + if (chars.Length > MaxLength) + { + chars = chars[..MaxLength]; + isAbbreviated = true; + } + if (!isAbbreviated) + { + return chars.ToString(); + } +#if NET6_0_OR_GREATER + return $"{chars}…"; +#else + return $"{chars.ToString()}…"; +#endif + } + + public static unsafe string GetAbbreviatedLexicalErrorText(ReadOnlySpan chars) + { + if (typeof(TChar) == typeof(char)) + { + return GetAbbreviatedLexicalErrorText(*(ReadOnlySpan*)&chars); + } + throw new NotImplementedException(); + } +} From ddfcc37db8d9e5e5203c4f459ea1efde341b39a8 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Wed, 16 Aug 2023 22:31:33 +0300 Subject: [PATCH 03/20] Add an optimized stack type. This is a port of Farkle 6's `StackNeo`. --- src/FarkleNeo/Collections/ValueStack.cs | 189 ++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 src/FarkleNeo/Collections/ValueStack.cs diff --git a/src/FarkleNeo/Collections/ValueStack.cs b/src/FarkleNeo/Collections/ValueStack.cs new file mode 100644 index 00000000..e4da379f --- /dev/null +++ b/src/FarkleNeo/Collections/ValueStack.cs @@ -0,0 +1,189 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using System.Buffers; +using System.Diagnostics; +#if NETCOREAPP || NETSTANDARD2_1_OR_GREATER +using System.Runtime.CompilerServices; +#endif + +namespace Farkle.Collections; + +/// +/// A stack type that can store its items in stack-allocated memory. +/// +[DebuggerDisplay("Count = {Count}")] +internal ref struct ValueStack +{ + private Span _items; + private T[]? 
_pooledArray; + private int _count; + + private const int InitialCapacity = 4; + + private static bool ShouldResetItems => +#if NETCOREAPP || NETSTANDARD2_1_OR_GREATER + RuntimeHelpers.IsReferenceOrContainsReferences(); +#else + // On .NET Standard 2.0 it might return false positives but that's fine. + // We will use this value only for optimizations. + !typeof(T).IsPrimitive; +#endif + + public ValueStack(int initialCapacity) + { + ArgumentOutOfRangeExceptionCompat.ThrowIfNegative(initialCapacity); + _items = _pooledArray = ArrayPool.Shared.Rent(initialCapacity); + _count = 0; + } + + public ValueStack(Span items) + { + _items = items; + _pooledArray = null; + _count = 0; + } + + public ValueStack(State state) + { + _items = _pooledArray = state.Items; + _count = state.Count; + } + + private void Grow() + { + int newCapacity = _items.Length switch + { + 0 => InitialCapacity, + var length => length * 2 + }; + T[] newArray = ArrayPool.Shared.Rent(newCapacity); + _items.CopyTo(newArray); + if (_pooledArray is not null) + { + if (ShouldResetItems) + { + _pooledArray.AsSpan().Clear(); + } + ArrayPool.Shared.Return(_pooledArray); + } + _items = _pooledArray = newArray; + } + + public readonly int Count => _count; + + [DebuggerBrowsable(DebuggerBrowsableState.RootHidden)] + public readonly Span AllItems => _items[.._count]; + + public void Push(T item) + { + if (_count == _items.Length) + { + Grow(); + } + _items[_count++] = item; + } + + public T Pop() + { + if (_count == 0) + { + ThrowHelpers.ThrowInvalidOperationException(); + } + + ref T resultRef = ref _items[_count - 1]; + _count--; + T result = resultRef; + if (ShouldResetItems) + { + resultRef = default!; + } + return result; + } + + public void PopMany(int itemsToPop) + { + if ((uint)itemsToPop > (uint)_count) + { + ThrowHelpers.ThrowArgumentOutOfRangeException(nameof(itemsToPop)); + } + + if (ShouldResetItems) + { + _items[^itemsToPop..].Clear(); + } + _count -= itemsToPop; + } + + public void Clear() + { + if 
(_count == 0) + { + return; + } + if (ShouldResetItems) + { + _items[.._count].Clear(); + } + _count = 0; + } + + public void Dispose() + { + Clear(); + if (_pooledArray is not null) + { + ArrayPool.Shared.Return(_pooledArray); + _pooledArray = null; + } + } + + public readonly T Peek(int indexFromTheEnd = 0) + { + if ((uint)indexFromTheEnd >= (uint)_count) + { + ThrowHelpers.ThrowArgumentOutOfRangeException(nameof(indexFromTheEnd)); + } + return _items[_count - 1 - indexFromTheEnd]; + } + + public readonly Span PeekMany(int itemsToPeek) + { + if ((uint)itemsToPeek > (uint)_count) + { + ThrowHelpers.ThrowArgumentOutOfRangeException(nameof(itemsToPeek)); + } + return _items.Slice(_count - itemsToPeek, itemsToPeek); + } + + public State ExportState() + { + if (_pooledArray is null) + { + _pooledArray = ArrayPool.Shared.Rent(_items.Length); + _items.CopyTo(_pooledArray); + _items = _pooledArray; + } + return new State(_pooledArray, _count); + } + + public readonly struct State : IDisposable + { + public readonly T[] Items; + public readonly int Count; + + internal State(T[] items, int count) + { + Items = items; + Count = count; + } + + public void Dispose() + { + if (Items is not null) + { + ArrayPool.Shared.Return(Items); + } + } + } +} From 95037d7a7be84ae8bc6c07b50eec17f0ba9656e9 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Fri, 18 Aug 2023 00:08:53 +0300 Subject: [PATCH 04/20] Add internal grammar APIs related to groups. 
--- src/FarkleNeo/Grammars/Grammar.cs | 6 ++++++ src/FarkleNeo/Grammars/Group.cs | 22 ++++++++++++---------- src/FarkleNeo/Grammars/TokenSymbol.cs | 19 +++++++++++++++++++ 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/FarkleNeo/Grammars/Grammar.cs b/src/FarkleNeo/Grammars/Grammar.cs index 1b2a01de..144b45f1 100644 --- a/src/FarkleNeo/Grammars/Grammar.cs +++ b/src/FarkleNeo/Grammars/Grammar.cs @@ -217,6 +217,12 @@ public TokenSymbol GetTokenSymbol(TokenSymbolHandle handle) return new(this, handle); } + internal Group GetGroup(uint index) + { + Debug.Assert(index > 0 && index <= GrammarTables.GroupRowCount); + return new(this, index); + } + /// /// Gets the pointed by the given . /// diff --git a/src/FarkleNeo/Grammars/Group.cs b/src/FarkleNeo/Grammars/Group.cs index 38f3abf8..e71bf988 100644 --- a/src/FarkleNeo/Grammars/Group.cs +++ b/src/FarkleNeo/Grammars/Group.cs @@ -18,19 +18,19 @@ public readonly struct Group { private readonly Grammar _grammar; - private readonly uint _tableIndex; + internal uint Index { get; } internal Group(Grammar grammar, uint tableIndex) { _grammar = grammar; - _tableIndex = tableIndex; + Index = tableIndex; } [StackTraceHidden] private void AssertHasValue() { Debug.Assert(_grammar is not null); - if (_tableIndex == 0) + if (Index == 0) { ThrowHelpers.ThrowHandleHasNoValue(); } @@ -44,7 +44,7 @@ public StringHandle Name get { AssertHasValue(); - return _grammar.GrammarTables.GetGroupName(_grammar.GrammarFile, _tableIndex); + return _grammar.GrammarTables.GetGroupName(_grammar.GrammarFile, Index); } } @@ -56,7 +56,7 @@ public TokenSymbolHandle Container get { AssertHasValue(); - return _grammar.GrammarTables.GetGroupContainer(_grammar.GrammarFile, _tableIndex); + return _grammar.GrammarTables.GetGroupContainer(_grammar.GrammarFile, Index); } } @@ -68,7 +68,7 @@ public GroupAttributes Attributes get { AssertHasValue(); - return _grammar.GrammarTables.GetGroupFlags(_grammar.GrammarFile, _tableIndex); + return 
_grammar.GrammarTables.GetGroupFlags(_grammar.GrammarFile, Index); } } @@ -80,7 +80,7 @@ public TokenSymbolHandle Start get { AssertHasValue(); - return _grammar.GrammarTables.GetGroupStart(_grammar.GrammarFile, _tableIndex); + return _grammar.GrammarTables.GetGroupStart(_grammar.GrammarFile, Index); } } @@ -92,14 +92,14 @@ public TokenSymbolHandle End get { AssertHasValue(); - return _grammar.GrammarTables.GetGroupEnd(_grammar.GrammarFile, _tableIndex); + return _grammar.GrammarTables.GetGroupEnd(_grammar.GrammarFile, Index); } } internal (uint Offset, uint NextOffset) GetNestingBounds(ReadOnlySpan grammarFile, in GrammarTables grammarTables) { - uint firstNesting = grammarTables.GetGroupFirstNesting(grammarFile, _tableIndex); - uint firstNestingOfNext = _tableIndex < (uint)grammarTables.GroupNestingRowCount - 1 ? grammarTables.GetGroupFirstNesting(grammarFile, _tableIndex + 1) : (uint)grammarTables.GroupNestingRowCount; + uint firstNesting = grammarTables.GetGroupFirstNesting(grammarFile, Index); + uint firstNestingOfNext = Index < (uint)grammarTables.GroupNestingRowCount - 1 ? grammarTables.GetGroupFirstNesting(grammarFile, Index + 1) : (uint)grammarTables.GroupNestingRowCount; Debug.Assert(firstNesting <= firstNestingOfNext); return (firstNesting, firstNestingOfNext); } @@ -118,6 +118,8 @@ internal bool CanGroupNest(ReadOnlySpan grammarFile, in GrammarTables gram return false; } + internal bool CanGroupNest(uint groupIndex) => CanGroupNest(_grammar.GrammarFile, in _grammar.GrammarTables, groupIndex); + /// /// A collection of the s that can be nested inside this . /// diff --git a/src/FarkleNeo/Grammars/TokenSymbol.cs b/src/FarkleNeo/Grammars/TokenSymbol.cs index 7b6ae5dc..187e2260 100644 --- a/src/FarkleNeo/Grammars/TokenSymbol.cs +++ b/src/FarkleNeo/Grammars/TokenSymbol.cs @@ -1,6 +1,8 @@ // Copyright © Theodore Tsirpanis and Contributors. 
// SPDX-License-Identifier: MIT +using System.Diagnostics; + namespace Farkle.Grammars; /// @@ -56,6 +58,23 @@ public TokenSymbolAttributes Attributes } } + internal uint GetStartedGroup() + { + Debug.Assert((Attributes & TokenSymbolAttributes.GroupStart) != 0); + var grammarFile = _grammar.GrammarFile; + var groupCount = _grammar.GrammarTables.GroupRowCount; + for (int i = 1; i <= groupCount; i++) + { + var groupStart = _grammar.GrammarTables.GetGroupStart(grammarFile, (uint)i); + if (groupStart == Handle) + { + return (uint)i; + } + } + + return 0; + } + /// /// Returns a string describing the the . /// From 9d2a0a9b9ff8fcb8fc89fcd3aa53d22a938cb263 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Sun, 20 Aug 2023 20:56:11 +0300 Subject: [PATCH 05/20] Support tokenizing groups. --- .../UnexpectedEndOfInputInGroupError.cs | 38 +++ .../Parser/Implementation/DefaultTokenizer.cs | 231 +++++++++++++++++- src/FarkleNeo/Properties/Resources.cs | 2 + src/FarkleNeo/Properties/Resources.el.resx | 3 + src/FarkleNeo/Properties/Resources.resx | 3 + 5 files changed, 271 insertions(+), 6 deletions(-) create mode 100644 src/FarkleNeo/Diagnostics/UnexpectedEndOfInputInGroupError.cs diff --git a/src/FarkleNeo/Diagnostics/UnexpectedEndOfInputInGroupError.cs b/src/FarkleNeo/Diagnostics/UnexpectedEndOfInputInGroupError.cs new file mode 100644 index 00000000..b84bff7f --- /dev/null +++ b/src/FarkleNeo/Diagnostics/UnexpectedEndOfInputInGroupError.cs @@ -0,0 +1,38 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +namespace Farkle.Diagnostics; + +/// +/// Contains information about a tokenizer error where input ended inside a group. +/// +public sealed class UnexpectedEndOfInputInGroupError : IFormattable +{ + /// + /// The name of the group that was left open at the time input ended. + /// + /// + /// In case of nested groups, this property contains the name of the innermost group. 
+ /// + public string GroupName { get; } + + /// + /// Creates a . + /// + /// The value of . + /// is . + public UnexpectedEndOfInputInGroupError(string groupName) + { + ArgumentNullExceptionCompat.ThrowIfNull(groupName); + GroupName = groupName; + } + + private string ToString(IFormatProvider? formatProvider) => + Resources.Format(formatProvider, nameof(Resources.Parser_UnrecognizedToken), GroupName); + + /// + public string ToString(string? format, IFormatProvider? formatProvider) => ToString(formatProvider); + + /// + public override string ToString() => ToString(null); +} diff --git a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs index 7064c9b1..4c6d2000 100644 --- a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs +++ b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs @@ -1,6 +1,7 @@ // Copyright © Theodore Tsirpanis and Contributors. // SPDX-License-Identifier: MIT +using Farkle.Collections; using Farkle.Diagnostics; using Farkle.Grammars; using Farkle.Grammars.StateMachines; @@ -10,7 +11,7 @@ namespace Farkle.Parser.Implementation; -internal sealed class DefaultTokenizer : Tokenizer +internal sealed class DefaultTokenizer : Tokenizer, ITokenizerResumptionPoint.GroupState> { private readonly Grammar _grammar; private readonly Dfa _dfa; @@ -20,6 +21,10 @@ public DefaultTokenizer(Grammar grammar, Dfa dfa) Debug.Assert(!dfa.HasConflicts); _grammar = grammar; _dfa = dfa; + // If a grammar does not have any groups, we will suspend only to return + // to the main tokenizer entry point. Without a wrapping, it would be called + // either way regardless of suspending. 
+ CanSkipChainedTokenizerWrapping = grammar.Groups.Count == 0; } private (TokenSymbolHandle AcceptSymbol, int CharactersRead, int TokenizerState) TokenizeDfa(ReadOnlySpan chars, bool isFinal, bool ignoreLeadingErrors = false) @@ -62,6 +67,181 @@ public DefaultTokenizer(Grammar grammar, Dfa dfa) return (default, i + 1, currentState); } + /// + /// Moves forward with tokenizing a group. + /// + /// if a token was found or the tokenizer failed. + /// if more characters are needed. In the latter case + /// callers need to suspend. + public bool TokenizeGroup(ref ParserInputReader input, bool isNoise, ref ValueStack groupStack, ref int groupLength, out ParserDiagnostic? error) + { + // In Farkle 6, we were tracking two positions in CharStream (the predecessor of ParserInputReader). + // The "current position" was the position where RemainingCharacters would start from, and the + // "starting index" was the index of the first character that we must keep in the buffer. When parsing + // simple terminals, these indices would be the same, but when parsing groups, the starting index was + // storing the start of the outermost group, and the current position was moving forward as the + // characters inside the group were being read. + // Farkle 7 simplifies this by tracking only one position, the characters before which can be discarded. + // Therefore we have to do some bookkeeping ourselves to keep the position without consuming it and + // throwing it away, and use a local variable to store the remaining characters. + ReadOnlySpan chars = input.RemainingCharacters[groupLength..]; + while (groupStack.Count != 0) + { + Group currentGroup = _grammar.GetGroup(groupStack.Peek()); + GroupAttributes groupAttributes = currentGroup.Attributes; + // Check if we ran out of input. + if (chars.IsEmpty) + { + // If this is the final block of input, end the group if it can end when input ends. + // Otherwise report an error. 
+ if (input.IsFinalBlock) + { + if ((groupAttributes & GroupAttributes.EndsOnEndOfInput) != 0) + { + groupStack.Pop(); + continue; + } + // Consume all remaining characters to get the position at the end of input. + // If we are in a noise group, they are already consumed and this will do nothing. + input.Consume(input.RemainingCharacters.Length); + error = new(input.State.CurrentPosition, new UnexpectedEndOfInputInGroupError(_grammar.GetString(currentGroup.Name))); + return true; + } + // If this is not the final block, we have to update the group's length and suspend. + groupLength = input.RemainingCharacters.Length - chars.Length; + error = null; + return false; + } + // When inside token groups, we ignore invalid characters at + // the beginning to avoid discarding just one and repeat the loop. + // We limit this optimization to those that keep the end token because + // we cannot accurately determine where the final invalid characters end + // and the group ending starts. It would be easy because group ends are + // literal strings (except on line groups which are character groups) + // but that's an assumption we'd better not be based on. + bool ignoreLeadingErrors = (groupAttributes & (GroupAttributes.AdvanceByCharacter | GroupAttributes.KeepEndToken)) == 0; + var (acceptSymbol, charactersRead, _) = + TokenizeDfa(chars, input.IsFinalBlock, ignoreLeadingErrors); + // The DFA found something. + if (acceptSymbol.HasValue) + { + TokenSymbol s = _grammar.GetTokenSymbol(acceptSymbol); + TokenSymbolAttributes symbolAttributes = s.Attributes; + // A new group begins. + if ((symbolAttributes & TokenSymbolAttributes.GroupStart) != 0) + { + Group newGroup = _grammar.GetGroup(s.GetStartedGroup()); + // The group is allowed to nest into this one. + if (newGroup.CanGroupNest(currentGroup.Index)) + { + ConsumeInput(ref input, ref chars, charactersRead, isNoise); + groupStack.Push(newGroup.Index); + continue; + } + } + // A symbol is found that ends the current group. 
+ else if (acceptSymbol == currentGroup.End) + { + if ((groupAttributes & GroupAttributes.KeepEndToken) != 0) + { + ConsumeInput(ref input, ref chars, charactersRead, isNoise); + } + groupStack.Pop(); + continue; + } + } + // If the DFA found nothing and reached the end, we have to suspend and wait for more input. + if (!input.IsFinalBlock && charactersRead == chars.Length) + { + Debug.Assert(!acceptSymbol.HasValue); + groupLength = input.RemainingCharacters.Length - chars.Length; + error = null; + return false; + } + // The existing group is continuing. + if ((groupAttributes & GroupAttributes.AdvanceByCharacter) == 0) + { + ConsumeInput(ref input, ref chars, charactersRead, isNoise); + } + else + { + ConsumeInput(ref input, ref chars, 1, isNoise); + // TODO: Optimize by quickly searching for the next interesting character like in Farkle 6. + } + continue; + } + + groupLength = input.RemainingCharacters.Length - chars.Length; + error = null; + return true; + + static void ConsumeInput(ref ParserInputReader input, ref ReadOnlySpan chars, int count, bool isNoise) + { + chars = chars[count..]; + // If the outermost group is a noise group, we actually consume the input, to support discarding the characters. + if (isNoise) + { + input.Consume(count); + Debug.Assert(input.RemainingCharacters == chars); + } + } + } + + /// + /// Starts tokenizing a group. + /// + private unsafe bool TokenizeGroup(ref ParserInputReader input, Group group, out int charactersRead, out ParserDiagnostic? error) + { + TokenSymbolHandle groupContainerSymbol = group.Container; + bool isNoise = !_grammar.IsTerminal(groupContainerSymbol); + ValueStack groupStack = new(stackalloc uint[4]); + groupStack.Push(group.Index); + charactersRead = 0; +#pragma warning disable CS9080 // Use of variable in this context may expose referenced variables outside of their declaration scope + // The compiler cannot prove that the stack pointers of groupStack will not leak to + // input, so it raises an error. 
We convert it to a warning with the use of unsafe, + // and suppress the warning. + bool finished = TokenizeGroup(ref input, isNoise, ref groupStack, ref charactersRead, out error); +#pragma warning restore CS9080 // Use of variable in this context may expose referenced variables outside of their declaration scope + if (finished) + { + groupStack.Dispose(); + } + else + { + input.SuspendTokenizer(this, GroupState.Create(ref groupStack, groupContainerSymbol, isNoise, charactersRead)); + } + return finished; + } + + bool ITokenizerResumptionPoint.TryGetNextToken(ref ParserInputReader input, ITokenSemanticProvider semanticProvider, GroupState arg, out TokenizerResult result) + { + ValueStack groupStack = new(arg.GroupStackState); + int charactersRead = arg.CharactersRead; + if (TokenizeGroup(ref input, arg.IsNoise, ref groupStack, ref charactersRead, out ParserDiagnostic? error)) + { + groupStack.Dispose(); + if (error is not null) + { + result = TokenizerResult.CreateError(error); + return true; + } + // The group had been a noise group. + // We return to the regular tokenizer logic. + if (arg.IsNoise) + { + return TryGetNextToken(ref input, semanticProvider, out result); + } + object? 
semanticValue = semanticProvider.Transform(ref input.State, arg.GroupContainerSymbol, input.RemainingCharacters[..charactersRead]); + result = TokenizerResult.CreateSuccess(arg.GroupContainerSymbol, semanticValue, input.State.CurrentPosition); + input.Consume(charactersRead); + return true; + } + input.SuspendTokenizer(this, arg.Update(ref groupStack, charactersRead)); + result = default; + return false; + } + public override bool TryGetNextToken(ref ParserInputReader input, ITokenSemanticProvider semanticProvider, out TokenizerResult result) { ref ParserState state = ref input.State; @@ -79,19 +259,39 @@ public override bool TryGetNextToken(ref ParserInputReader input, ITokenS if (acceptSymbol.HasValue) { - TokenSymbolAttributes symbolFlags = _grammar.GetTokenSymbol(acceptSymbol).Attributes; - if ((symbolFlags & TokenSymbolAttributes.Terminal) != 0) + if (_grammar.IsTerminal(acceptSymbol)) { object? semanticValue = semanticProvider.Transform(ref state, acceptSymbol, lexeme); result = TokenizerResult.CreateSuccess(acceptSymbol, semanticValue, state.CurrentPosition); input.Consume(charactersRead); return true; } - if ((symbolFlags & TokenSymbolAttributes.Noise) != 0) + TokenSymbol tokenSymbol = _grammar.GetTokenSymbol(acceptSymbol); + TokenSymbolAttributes symbolAttributes = tokenSymbol.Attributes; + if ((symbolAttributes & TokenSymbolAttributes.GroupStart) != 0) { - input.Consume(charactersRead); - continue; + Group group = _grammar.GetGroup(tokenSymbol.GetStartedGroup()); + if (!TokenizeGroup(ref input, group, out charactersRead, out ParserDiagnostic? error)) + { + result = default; + return false; + } + if (error is not null) + { + result = TokenizerResult.CreateError(error); + return true; + } + if (_grammar.IsTerminal(group.Container)) + { + object? 
semanticValue = semanticProvider.Transform(ref state, acceptSymbol, input.RemainingCharacters[..charactersRead]); + result = TokenizerResult.CreateSuccess(acceptSymbol, semanticValue, state.CurrentPosition); + input.Consume(charactersRead); + return true; + } + Debug.Assert(charactersRead == 0); } + input.Consume(charactersRead); + continue; } if (!input.IsFinalBlock && charactersRead == input.RemainingCharacters.Length) @@ -107,4 +307,23 @@ public override bool TryGetNextToken(ref ParserInputReader input, ITokenS return true; } } + + /// + /// Contains the state of a suspended group tokenization operation. + /// + private readonly struct GroupState + { + public ValueStack.State GroupStackState { get; init; } + public TokenSymbolHandle GroupContainerSymbol { get; init; } + public bool IsNoise { get; init; } + public int CharactersRead { get; init; } + + private GroupState(GroupState groupState) => this = groupState; + + public static GroupState Create(ref ValueStack groupStack, TokenSymbolHandle groupContainerSymbol, bool isNoise, int charactersRead) + => new() { GroupStackState = groupStack.ExportState(), GroupContainerSymbol = groupContainerSymbol, IsNoise = isNoise, CharactersRead = charactersRead }; + + public GroupState Update(ref ValueStack groupStack, int charactersRead) + => new(this) { GroupStackState = groupStack.ExportState(), CharactersRead = charactersRead }; + } } diff --git a/src/FarkleNeo/Properties/Resources.cs b/src/FarkleNeo/Properties/Resources.cs index bf73cdf6..08c2aaca 100644 --- a/src/FarkleNeo/Properties/Resources.cs +++ b/src/FarkleNeo/Properties/Resources.cs @@ -69,4 +69,6 @@ public static string Format(IFormatProvider? 
formatProvider, string resourceK public static string Tokenizer_AlreadySuspended => GetResourceString(nameof(Tokenizer_AlreadySuspended)); public static string Parser_UnrecognizedToken => GetResourceString(nameof(Parser_UnrecognizedToken)); + + public static string Parser_UnexpectedEndOfInputInGroup => GetResourceString(nameof(Parser_UnexpectedEndOfInputInGroup)); } diff --git a/src/FarkleNeo/Properties/Resources.el.resx b/src/FarkleNeo/Properties/Resources.el.resx index a3fedd2d..d16e96ec 100644 --- a/src/FarkleNeo/Properties/Resources.el.resx +++ b/src/FarkleNeo/Properties/Resources.el.resx @@ -51,4 +51,7 @@ Άγνωστο σύμβολο '{0}' + + Απροσδόκητο τέλος της εισόδου καθώς βρισκόταν μέσα στην ομάδα `{0}` + \ No newline at end of file diff --git a/src/FarkleNeo/Properties/Resources.resx b/src/FarkleNeo/Properties/Resources.resx index 94d9267a..aef38895 100644 --- a/src/FarkleNeo/Properties/Resources.resx +++ b/src/FarkleNeo/Properties/Resources.resx @@ -51,4 +51,7 @@ Unrecognized token '{0}' + + Unexpected end of input while being inside group '{0}' + \ No newline at end of file From 1be2ba836732af7bd6a9ed080349f154b17c3a27 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Fri, 18 Aug 2023 03:17:30 +0300 Subject: [PATCH 06/20] Update documentation. --- src/FarkleNeo/Diagnostics/LexicalError.cs | 2 +- src/FarkleNeo/Grammars/Grammar.cs | 2 +- src/FarkleNeo/Grammars/GroupAttributes.cs | 4 ++++ src/FarkleNeo/Parser/ParserState.cs | 2 ++ src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs | 4 ++-- src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs | 4 +++- 6 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/FarkleNeo/Diagnostics/LexicalError.cs b/src/FarkleNeo/Diagnostics/LexicalError.cs index 24d5a577..2ff5f6b3 100644 --- a/src/FarkleNeo/Diagnostics/LexicalError.cs +++ b/src/FarkleNeo/Diagnostics/LexicalError.cs @@ -20,7 +20,7 @@ public sealed class LexicalError : IFormattable public string? 
TokenText { get; } /// - /// The number of the tokenizer's state machine at the time of the error. + /// The state the tokenizer's state machine was at the time of the error. /// public int TokenizerState { get; } diff --git a/src/FarkleNeo/Grammars/Grammar.cs b/src/FarkleNeo/Grammars/Grammar.cs index 144b45f1..07f06acd 100644 --- a/src/FarkleNeo/Grammars/Grammar.cs +++ b/src/FarkleNeo/Grammars/Grammar.cs @@ -309,7 +309,7 @@ public EntityHandle GetSymbolFromSpecialName(string specialName, bool throwIfNot /// Checks whether the given points to a /// token symbol with the flag set. /// - /// The token symbol handle to check; + /// The token symbol handle to check. public bool IsTerminal(TokenSymbolHandle handle) => GrammarTables.IsTerminal(handle); Grammar IGrammarProvider.GetGrammar() => this; diff --git a/src/FarkleNeo/Grammars/GroupAttributes.cs b/src/FarkleNeo/Grammars/GroupAttributes.cs index 60b5fd0d..ab4f52ee 100644 --- a/src/FarkleNeo/Grammars/GroupAttributes.cs +++ b/src/FarkleNeo/Grammars/GroupAttributes.cs @@ -24,5 +24,9 @@ public enum GroupAttributes : ushort /// /// When the group ends, the parser should keep the token that ended the group in the input stream. /// + /// + /// In GOLD Parser and earlier versions of Farkle, groups with this flag set were called to have an + /// "open ending mode". + /// KeepEndToken = 1 << 2 } diff --git a/src/FarkleNeo/Parser/ParserState.cs b/src/FarkleNeo/Parser/ParserState.cs index 82c82836..300c4ab2 100644 --- a/src/FarkleNeo/Parser/ParserState.cs +++ b/src/FarkleNeo/Parser/ParserState.cs @@ -59,6 +59,8 @@ public struct ParserState /// /// When parsing a file this could be the file's path. /// + /// + /// public string? 
InputName { get; set; } internal void Consume(ReadOnlySpan characters) diff --git a/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs b/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs index 9bf07047..688802e4 100644 --- a/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs +++ b/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs @@ -65,7 +65,7 @@ public override bool TryGetNextToken(ref ParserInputReader input, ITokenS { // We invoke the next tokenizer in the chain. bool foundToken = Components[i].TryGetNextToken(ref input, semanticProvider, out result); - // Because in the main loop when we suspend, we must update NextChainIndex + // Because in the main loop when we suspend we must update NextChainIndex, // we must always check if we have suspended after invoking a tokenizer. // If our tokenizer state is null, we check again in case we have suspended // for the first time. @@ -78,7 +78,7 @@ public override bool TryGetNextToken(ref ParserInputReader input, ITokenS return foundToken; } // After checking for suspension, we stop the loop if we have found a token. - // In this case, the next time the tokenizer will start over. + // In this case, the next time the chain will start all over. if (foundToken) { return true; diff --git a/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs b/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs index b3d7e077..8ecf27aa 100644 --- a/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs +++ b/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs @@ -26,7 +26,9 @@ protected Tokenizer() { } /// it is the only one in the chain. This enables the tokenizer to be directly /// called by the parser, but the consequence is that suspending the tokenizer /// has no effect. It should therefore be used by tokenizers that are known to - /// never suspend. + /// never suspend. An exception to this is when the tokenizer suspends by calling + /// + /// with a resumption point of . 
/// internal bool CanSkipChainedTokenizerWrapping { get; private protected init; } From 4d768c673ac342ee87485d40e4cd26a282a12163 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Mon, 21 Aug 2023 03:10:04 +0300 Subject: [PATCH 07/20] Add the default parser implementation. --- src/FarkleNeo/Diagnostics/SyntaxError.cs | 54 +++++ .../Parser/Implementation/DefaultParser.cs | 40 ++++ .../DefaultParserImplementation.cs | 195 ++++++++++++++++++ .../Parser/Implementation/ParserCommon.cs | 19 ++ .../Parser/Tokenizers/ChainedTokenizer.cs | 9 + src/FarkleNeo/Utilities.cs | 17 ++ 6 files changed, 334 insertions(+) create mode 100644 src/FarkleNeo/Diagnostics/SyntaxError.cs create mode 100644 src/FarkleNeo/Parser/Implementation/DefaultParser.cs create mode 100644 src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs create mode 100644 src/FarkleNeo/Utilities.cs diff --git a/src/FarkleNeo/Diagnostics/SyntaxError.cs b/src/FarkleNeo/Diagnostics/SyntaxError.cs new file mode 100644 index 00000000..42730a42 --- /dev/null +++ b/src/FarkleNeo/Diagnostics/SyntaxError.cs @@ -0,0 +1,54 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using System.Collections.Immutable; + +namespace Farkle.Diagnostics; + +/// +/// Contains information about a syntax error. +/// +/// +/// A syntax error occurs when the parser encounters a token in an unexpected place. +/// +public sealed class SyntaxError +{ + /// + /// The name of the token found by the parser, or if the end of the input was reached. + /// + /// + /// A value of indicates that the parser encountered the end of the input. + /// + public string? ActualTokenName { get; } + + /// + /// The names of the tokens that the parser expected to find. + /// + /// + /// A value of in the array indicates that the parser also expected the end of the input. + /// + public ImmutableArray ExpectedTokenNames { get; } + + /// + /// The state the parser's state machine was at the time of the error. 
+ /// + public int ParserState { get; } + + /// + /// Creates a . + /// + /// The value of . + /// The value of . + /// The value of . + /// Optional, defaults to -1. + public SyntaxError(string? actualTokenName, ImmutableArray expectedTokenNames, int parserState = -1) + { + if (expectedTokenNames.IsDefault) + { + expectedTokenNames = ImmutableArray.Empty; + } + ActualTokenName = actualTokenName; + ExpectedTokenNames = expectedTokenNames; + ParserState = parserState; + } +} diff --git a/src/FarkleNeo/Parser/Implementation/DefaultParser.cs b/src/FarkleNeo/Parser/Implementation/DefaultParser.cs new file mode 100644 index 00000000..6043c1b9 --- /dev/null +++ b/src/FarkleNeo/Parser/Implementation/DefaultParser.cs @@ -0,0 +1,40 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using Farkle.Grammars; +using Farkle.Grammars.StateMachines; +using Farkle.Parser.Semantics; +using Farkle.Parser.Tokenizers; + +namespace Farkle.Parser.Implementation; + +internal sealed class DefaultParser : CharParser +{ + private readonly DefaultParserImplementation _implementation; + + private DefaultParser(DefaultParserImplementation implementation) + { + _implementation = implementation; + IsFailing = implementation.Tokenizer.IsFailing; + } + + public DefaultParser(Grammar grammar, LrStateMachine lrStateMachine, ISemanticProvider semanticProvider, Tokenizer tokenizer) + : this(DefaultParserImplementation.Create(grammar, lrStateMachine, semanticProvider, ChainedTokenizer.Create(tokenizer))) + { + } + + public override void Run(ref ParserInputReader input, ref ParserCompletionState completionState) + { + _implementation.Run(ref input, ref completionState); + } + + private protected override IGrammarProvider GetGrammarProvider() => _implementation.Grammar; + + private protected override Tokenizer GetTokenizer() => _implementation.Tokenizer; + + private protected override CharParser WithSemanticProviderCore(ISemanticProvider semanticProvider) => + new 
DefaultParser(_implementation.WithSemanticProvider(semanticProvider)); + + private protected override CharParser WithTokenizerCore(Tokenizer tokenizer) => + new DefaultParser(_implementation.WithTokenizer(ChainedTokenizer.Create(tokenizer))); +} diff --git a/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs b/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs new file mode 100644 index 00000000..3cf37075 --- /dev/null +++ b/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs @@ -0,0 +1,195 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using Farkle.Collections; +using Farkle.Diagnostics; +using Farkle.Grammars; +using Farkle.Grammars.StateMachines; +using Farkle.Parser.Semantics; +using Farkle.Parser.Tokenizers; +using System.Collections.Immutable; +using System.Diagnostics; + +namespace Farkle.Parser.Implementation; + +internal readonly struct DefaultParserImplementation +{ + public Grammar Grammar { get; } + private readonly LrStateMachine _lrStateMachine; + private readonly object _semanticProvider; + public Tokenizer Tokenizer { get; } + + private const int InitialStackCapacity = 64; + + private ITokenSemanticProvider TokenSemanticProvider => Utilities.UnsafeCast>(_semanticProvider); + private IProductionSemanticProvider ProductionSemanticProvider => Utilities.UnsafeCast(_semanticProvider); + + private DefaultParserImplementation(Grammar grammar, LrStateMachine lrStateMachine, object semanticProvider, Tokenizer tokenizer) + { + Grammar = grammar; + _lrStateMachine = lrStateMachine; + _semanticProvider = semanticProvider; + Tokenizer = tokenizer; + } + + public static DefaultParserImplementation Create(Grammar grammar, LrStateMachine lrStateMachine, ISemanticProvider semanticProvider, Tokenizer tokenizer) + { + Debug.Assert(!lrStateMachine.HasConflicts); + return new(grammar, lrStateMachine, semanticProvider, tokenizer); + } + + public DefaultParserImplementation 
WithTokenizer(Tokenizer tokenizer) => + new(Grammar, _lrStateMachine, _semanticProvider, tokenizer); + + public DefaultParserImplementation WithSemanticProvider(ISemanticProvider semanticProvider) => + new(Grammar, _lrStateMachine, semanticProvider, Tokenizer); + + private void Reduce(ref ParserInputReader input, ref int currentState, ref ValueStack stateStack, + ref ValueStack semanticValueStack, ProductionHandle production) + { + Production p = Grammar.GetProduction(production); + int membersLength = p.Members.Count; + int goFromState = stateStack.Peek(membersLength); + int gotoState = _lrStateMachine.GetGoto(goFromState, p.Head); + object? semanticValue = ProductionSemanticProvider.Fuse(ref input.State, p.Handle, semanticValueStack.PeekMany(membersLength)); + semanticValueStack.PopMany(membersLength); + semanticValueStack.Push(semanticValue); + stateStack.PopMany(membersLength); + stateStack.Push(gotoState); + currentState = gotoState; + } + + private RunResult Run(ref ParserInputReader input, ref int currentState, ref ValueStack stateStack, ref ValueStack semanticValueStack, out object? 
result) + { + bool foundToken = Tokenizer.TryGetNextToken(ref input, TokenSemanticProvider, out TokenizerResult token); + while (true) + { + if (!foundToken) + { + if (!input.IsFinalBlock) + { + result = null; + return RunResult.NeedsMoreInput; + } + LrEndOfFileAction eofAction = _lrStateMachine.GetEndOfFileAction(currentState); + if (eofAction.IsAccept) + { + result = semanticValueStack.Peek(); + return RunResult.Success; + } + if (eofAction.IsReduce) + { + Reduce(ref input, ref currentState, ref stateStack, ref semanticValueStack, eofAction.ReduceProduction); + continue; + } + } + else if (!token.IsSuccess) + { + result = token.Data; + return RunResult.Failure; + } + else + { + LrAction action = _lrStateMachine.GetAction(currentState, token.Symbol); + if (action.IsShift) + { + currentState = action.ShiftState; + stateStack.Push(currentState); + semanticValueStack.Push(token.Data); + foundToken = Tokenizer.TryGetNextToken(ref input, TokenSemanticProvider, out token); + continue; + } + if (action.IsReduce) + { + Reduce(ref input, ref currentState, ref stateStack, ref semanticValueStack, action.ReduceProduction); + continue; + } + } + TextPosition errorPos = foundToken ? token.Position : input.State.CurrentPosition; + string? actualTokenName = foundToken ? 
Grammar.GetString(Grammar.GetTokenSymbol(token.Symbol).Name) : null; + ImmutableArray expectedTokens = ParserCommon.GetExpectedSymbols(Grammar, _lrStateMachine[currentState]); + result = new ParserDiagnostic(errorPos, new SyntaxError(actualTokenName, expectedTokens, currentState)); + return RunResult.Failure; + } + } + + private unsafe void RunOneShot(ref ParserInputReader input, ref ParserCompletionState completionState) + { + ValueStack stateStack = new(stackalloc int[InitialStackCapacity]); + ValueStack semanticValueStack = new(InitialStackCapacity); + int currentState = _lrStateMachine.InitialState; +#pragma warning disable CS9080 // Use of variable in this context may expose referenced variables outside of their declaration scope + RunResult runResult = Run(ref input, ref currentState, ref stateStack, ref semanticValueStack, out object? result); +#pragma warning restore CS9080 // Use of variable in this context may expose referenced variables outside of their declaration scope + switch (runResult) + { + case RunResult.Success: + completionState.SetSuccess((T)result!); + break; + case RunResult.Failure: + Debug.Assert(result is not null); + completionState.SetError(result); + break; + } + } + + public void Run(ref ParserInputReader input, ref ParserCompletionState completionState) + { + if (input.IsFinalBlock && !input.State.TryGetValue(typeof(State), out _)) + { + RunOneShot(ref input, ref completionState); + return; + } + State state = State.GetOrCreate(_lrStateMachine, ref input.State); + var stateStack = new ValueStack(state.StateStack); + var semanticValueStack = new ValueStack(state.SemanticValueStack); + try + { + RunResult result = Run(ref input, ref state.CurrentState, ref stateStack, ref semanticValueStack, out object? 
runResult); + switch (result) + { + case RunResult.Success: + completionState.SetSuccess((T)runResult!); + break; + case RunResult.Failure: + Debug.Assert(runResult is not null); + completionState.SetError(runResult); + break; + } + } + finally + { + state.StateStack = stateStack.ExportState(); + state.SemanticValueStack = semanticValueStack.ExportState(); + } + } + + private enum RunResult + { + Success, + Failure, + NeedsMoreInput + } + + private sealed class State + { + public int CurrentState; + public ValueStack.State StateStack; + public ValueStack.State SemanticValueStack; + + public static State GetOrCreate(LrStateMachine lrStateMachine, ref ParserState parserState) + { + if (!parserState.TryGetValue(typeof(State), out object? state)) + { + state = new State + { + CurrentState = lrStateMachine.InitialState, + StateStack = new ValueStack(InitialStackCapacity).ExportState(), + SemanticValueStack = new ValueStack(InitialStackCapacity).ExportState() + }; + parserState.SetValue(typeof(State), state); + } + return (State)state; + } + } +} diff --git a/src/FarkleNeo/Parser/Implementation/ParserCommon.cs b/src/FarkleNeo/Parser/Implementation/ParserCommon.cs index 5699da2c..51d83df5 100644 --- a/src/FarkleNeo/Parser/Implementation/ParserCommon.cs +++ b/src/FarkleNeo/Parser/Implementation/ParserCommon.cs @@ -1,6 +1,10 @@ // Copyright © Theodore Tsirpanis and Contributors. // SPDX-License-Identifier: MIT +using Farkle.Grammars; +using Farkle.Grammars.StateMachines; +using System.Collections.Immutable; + namespace Farkle.Parser.Implementation; internal static class ParserCommon @@ -39,4 +43,19 @@ public static unsafe string GetAbbreviatedLexicalErrorText(ReadOnlySpan GetExpectedSymbols(Grammar grammar, LrState state) + { + int count = state.Actions.Count + (state.EndOfFileActions.Count > 0 ? 
1 : 0); + var builder = ImmutableArray.CreateBuilder(count); + foreach (var action in state.Actions) + { + builder.Add(grammar.GetString(grammar.GetTokenSymbol(action.Key).Name)); + } + if (state.EndOfFileActions.Count > 0) + { + builder.Add(null); + } + return builder.MoveToImmutable(); + } } diff --git a/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs b/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs index 688802e4..24c70c46 100644 --- a/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs +++ b/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs @@ -18,6 +18,15 @@ private ChainedTokenizer(ImmutableArray> components) Components = components; } + internal static Tokenizer Create(Tokenizer tokenizer) + { + if (tokenizer.CanSkipChainedTokenizerWrapping) + { + return tokenizer; + } + return new ChainedTokenizer(ImmutableArray.Create(tokenizer)); + } + internal static Tokenizer Create(ImmutableArray> components) { Debug.Assert(!components.IsDefaultOrEmpty); diff --git a/src/FarkleNeo/Utilities.cs b/src/FarkleNeo/Utilities.cs new file mode 100644 index 00000000..dc2f6fc9 --- /dev/null +++ b/src/FarkleNeo/Utilities.cs @@ -0,0 +1,17 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace Farkle; + +internal static class Utilities +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static T UnsafeCast(object o) where T : class + { + Debug.Assert(o is T); + return Unsafe.As(o); + } +} From 50479b228484417701a4bd1641dbbe7238bf14ae Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Tue, 22 Aug 2023 02:59:48 +0300 Subject: [PATCH 08/20] Format and localize syntax errors. 
--- src/FarkleNeo/Diagnostics/DelimitedString.cs | 49 ++++++++++++++++++++ src/FarkleNeo/Diagnostics/SyntaxError.cs | 17 ++++++- src/FarkleNeo/Grammars/TokenSymbol.cs | 12 ++--- src/FarkleNeo/Properties/Resources.cs | 33 +++++++++++-- src/FarkleNeo/Properties/Resources.el.resx | 6 +++ src/FarkleNeo/Properties/Resources.resx | 6 +++ 6 files changed, 113 insertions(+), 10 deletions(-) create mode 100644 src/FarkleNeo/Diagnostics/DelimitedString.cs diff --git a/src/FarkleNeo/Diagnostics/DelimitedString.cs b/src/FarkleNeo/Diagnostics/DelimitedString.cs new file mode 100644 index 00000000..24147b67 --- /dev/null +++ b/src/FarkleNeo/Diagnostics/DelimitedString.cs @@ -0,0 +1,49 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using System.Collections.Immutable; +using System.Text; + +namespace Farkle.Diagnostics; + +internal readonly struct DelimitedString +{ + private readonly ImmutableArray _values; + private readonly string _delimiter; + private readonly string _fallback; + private readonly Func _valueTransform; + + public DelimitedString(ImmutableArray values, string delimiter, string fallback, Func? valueTransform = null) + { + _values = values; + _delimiter = delimiter; + _fallback = fallback; + _valueTransform = valueTransform ?? (x => x); + } + + public override string ToString() + { + switch (_values) + { + case []: return string.Empty; + case [null]: return _fallback; + case [var x]: return _valueTransform(x); + } + + StringBuilder sb = new(); + bool first = true; + foreach (string? value in _values) + { + if (first) + { + first = false; + } + else + { + sb.Append(_delimiter); + } + sb.Append(value is null ? 
_fallback : _valueTransform(value)); + } + return sb.ToString(); + } +} diff --git a/src/FarkleNeo/Diagnostics/SyntaxError.cs b/src/FarkleNeo/Diagnostics/SyntaxError.cs index 42730a42..95345488 100644 --- a/src/FarkleNeo/Diagnostics/SyntaxError.cs +++ b/src/FarkleNeo/Diagnostics/SyntaxError.cs @@ -1,6 +1,7 @@ // Copyright © Theodore Tsirpanis and Contributors. // SPDX-License-Identifier: MIT +using Farkle.Grammars; using System.Collections.Immutable; namespace Farkle.Diagnostics; @@ -11,7 +12,7 @@ namespace Farkle.Diagnostics; /// /// A syntax error occurs when the parser encounters a token in an unexpected place. /// -public sealed class SyntaxError +public sealed class SyntaxError : IFormattable { /// /// The name of the token found by the parser, or if the end of the input was reached. @@ -51,4 +52,18 @@ public SyntaxError(string? actualTokenName, ImmutableArray expectedToke ExpectedTokenNames = expectedTokenNames; ParserState = parserState; } + + private string ToString(IFormatProvider? formatProvider) + { + string eofString = Resources.GetEofString(formatProvider); + return Resources.Format(formatProvider, + nameof(Resources.Parser_UnexpectedToken), + ActualTokenName ?? eofString, + new DelimitedString(ExpectedTokenNames, ", ", eofString, TokenSymbol.FormatName)); + } + + string IFormattable.ToString(string format, IFormatProvider formatProvider) => ToString(formatProvider); + + /// + public override string ToString() => ToString(null); } diff --git a/src/FarkleNeo/Grammars/TokenSymbol.cs b/src/FarkleNeo/Grammars/TokenSymbol.cs index 187e2260..52756454 100644 --- a/src/FarkleNeo/Grammars/TokenSymbol.cs +++ b/src/FarkleNeo/Grammars/TokenSymbol.cs @@ -75,13 +75,8 @@ internal uint GetStartedGroup() return 0; } - /// - /// Returns a string describing the the . - /// - public override string ToString() + internal static string FormatName(string name) { - string name = _grammar.GetString(Name); - return ShouldQuote(name) ? 
$"'{name}'" : name; static bool ShouldQuote(string str) @@ -101,4 +96,9 @@ static bool ShouldQuote(string str) return false; } } + + /// + /// Returns a string describing the the . + /// + public override string ToString() => FormatName(_grammar.GetString(Name)); } diff --git a/src/FarkleNeo/Properties/Resources.cs b/src/FarkleNeo/Properties/Resources.cs index 08c2aaca..db4b0703 100644 --- a/src/FarkleNeo/Properties/Resources.cs +++ b/src/FarkleNeo/Properties/Resources.cs @@ -20,14 +20,14 @@ internal static class Resources // The trimming tools are also capable of replacing the value of this method when the application is being trimmed. internal static bool UsingResourceKeys() => s_usingResourceKeys; - public static string GetResourceString(string resourceKey, CultureInfo? cultureInfo = null) + public static string GetResourceString(string resourceKey, IFormatProvider? formatProvider = null) { if (UsingResourceKeys()) { return resourceKey; } - return ResourceManager.GetString(resourceKey, cultureInfo)!; + return ResourceManager.GetString(resourceKey, formatProvider as CultureInfo)!; } public static string Format(IFormatProvider? formatProvider, string resourceKey, T arg) @@ -41,7 +41,30 @@ public static string Format(IFormatProvider? formatProvider, string resourceK #endif } - return string.Format(formatProvider, ResourceManager.GetString(resourceKey, culture: formatProvider as CultureInfo), arg); + return string.Format(formatProvider, ResourceManager.GetString(resourceKey, culture: formatProvider as CultureInfo)!, arg); + } + + public static string Format(IFormatProvider? 
formatProvider, string resourceKey, T1 arg1, T2 arg2) + { + if (UsingResourceKeys()) + { +#if NET6_0_OR_GREATER + return string.Create(formatProvider, $"{resourceKey}, {arg1}, {arg2}"); +#else + return ((FormattableString)$"{resourceKey}, {arg1}, {arg2}").ToString(formatProvider); +#endif + } + + return string.Format(formatProvider, ResourceManager.GetString(resourceKey, culture: formatProvider as CultureInfo)!, arg1, arg2); + } + + public static string GetEofString(IFormatProvider? formatProvider) + { + if (UsingResourceKeys()) + { + return "(EOF)"; + } + return GetResourceString(nameof(Parser_Eof), formatProvider); } public static string Grammar_TooNewFormat => GetResourceString(nameof(Grammar_TooNewFormat)); @@ -71,4 +94,8 @@ public static string Format(IFormatProvider? formatProvider, string resourceK public static string Parser_UnrecognizedToken => GetResourceString(nameof(Parser_UnrecognizedToken)); public static string Parser_UnexpectedEndOfInputInGroup => GetResourceString(nameof(Parser_UnexpectedEndOfInputInGroup)); + + public static string Parser_UnexpectedToken => GetResourceString(nameof(Parser_UnexpectedToken)); + + public static string Parser_Eof => GetResourceString(nameof(Parser_Eof)); } diff --git a/src/FarkleNeo/Properties/Resources.el.resx b/src/FarkleNeo/Properties/Resources.el.resx index d16e96ec..2c0a521c 100644 --- a/src/FarkleNeo/Properties/Resources.el.resx +++ b/src/FarkleNeo/Properties/Resources.el.resx @@ -54,4 +54,10 @@ Απροσδόκητο τέλος της εισόδου καθώς βρισκόταν μέσα στην ομάδα `{0}` + + Βρέθηκε {0}, αναμένονταν {1} + + + (ΤΕΛΟΣ) + \ No newline at end of file diff --git a/src/FarkleNeo/Properties/Resources.resx b/src/FarkleNeo/Properties/Resources.resx index aef38895..b3fea52e 100644 --- a/src/FarkleNeo/Properties/Resources.resx +++ b/src/FarkleNeo/Properties/Resources.resx @@ -54,4 +54,10 @@ Unexpected end of input while being inside group '{0}' + + Found {0}, expected {1} + + + (EOF) + \ No newline at end of file From 
0b34d832f1e8f71a7ba895c6405f70dfb47a69a9 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Fri, 25 Aug 2023 02:58:37 +0300 Subject: [PATCH 09/20] Add APIs to create `CharParser`s. --- designs/7.0/parser-api.md | 8 +++ src/FarkleNeo/CharParser.cs | 70 +++++++++++++++++++ .../Diagnostics/LocalizedDiagnostic.cs | 23 ++++++ src/FarkleNeo/Grammars/Grammar.cs | 27 +++++++ src/FarkleNeo/Parser/FailingCharParser.cs | 40 +++++++++++ src/FarkleNeo/Parser/FailingTokenizer.cs | 25 +++++++ .../Parser/Semantics/SyntaxChecker.cs | 17 +++++ src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs | 38 ++++++++++ src/FarkleNeo/Properties/Resources.cs | 8 +++ src/FarkleNeo/Properties/Resources.el.resx | 12 ++++ src/FarkleNeo/Properties/Resources.resx | 12 ++++ 11 files changed, 280 insertions(+) create mode 100644 src/FarkleNeo/Diagnostics/LocalizedDiagnostic.cs create mode 100644 src/FarkleNeo/Parser/FailingCharParser.cs create mode 100644 src/FarkleNeo/Parser/FailingTokenizer.cs create mode 100644 src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs diff --git a/designs/7.0/parser-api.md b/designs/7.0/parser-api.md index fd059d42..c22ddef7 100644 --- a/designs/7.0/parser-api.md +++ b/designs/7.0/parser-api.md @@ -412,6 +412,14 @@ public abstract class Tokenizer public abstract bool TryGetNextToken(ref ParserInputReader input, ITokenSemanticProvider semanticProvider, out TokenizerResult result); } + +public static class Tokenizer +{ + // Creates a standalone tokenizer from a grammar. Char is the only supported character type. + // If the grammar cannot be used for tokenizing or the character type is unsupported, + // the method will throw. 
+ public static Tokenizer Create(Grammar grammar); +} ``` It resembles the `Tokenizer` class of Farkle 6, with the following differences: diff --git a/src/FarkleNeo/CharParser.cs b/src/FarkleNeo/CharParser.cs index c84dd6ff..857be9f7 100644 --- a/src/FarkleNeo/CharParser.cs +++ b/src/FarkleNeo/CharParser.cs @@ -1,8 +1,10 @@ // Copyright © Theodore Tsirpanis and Contributors. // SPDX-License-Identifier: MIT +using Farkle.Diagnostics; using Farkle.Grammars; using Farkle.Parser; +using Farkle.Parser.Implementation; using Farkle.Parser.Semantics; using Farkle.Parser.Tokenizers; @@ -193,3 +195,71 @@ public CharParser WithTokenizer(ChainedTokenizerBuilder builder) return WithTokenizerCore(builder); } } + +/// +/// Provides factory methods to create s. +/// +public static class CharParser +{ + /// + /// Creates a . + /// + /// The type of objects the parser will produce in case of success. + /// The the parser will use. + /// The the parser will use. + /// or + /// is . + public static CharParser Create(Grammar grammar, ISemanticProvider semanticProvider) + { + ArgumentNullExceptionCompat.ThrowIfNull(grammar); + ArgumentNullExceptionCompat.ThrowIfNull(semanticProvider); + + if (grammar.IsUnparsable(out string? errorKey)) + { + return Fail(errorKey); + } + if (grammar.LrStateMachine is not { HasConflicts: false } lrStateMachine) + { + return Fail(nameof(Resources.Parser_GrammarLrProblem)); + } + + Tokenizer tokenizer = Tokenizer.Create(grammar, throwIfError: false); + return new DefaultParser(grammar, lrStateMachine, semanticProvider, tokenizer); + + CharParser Fail(string resourceKey) => new FailingCharParser(new LocalizedDiagnostic(resourceKey), grammar); + } + + /// + /// Creates a that does not perform any semantic analysis. + /// + /// The type of objects the syntax checker will return in case of success. + /// Must be a reference type and usually it is + /// or . + /// The the syntax checker will use. + /// is . + /// Syntax checkers always return in case of success. 
+ public static CharParser CreateSyntaxChecker(Grammar grammar) where T : class => + Create(grammar, SyntaxChecker.Instance); + + /// + /// Creates a that does not perform any semantic analysis. + /// + /// The the syntax checker will use. + /// Syntax checkers always return in case of success. + public static CharParser CreateSyntaxChecker(Grammar grammar) => + CreateSyntaxChecker(grammar); + + /// + /// Converts a to a syntax checker with a user-defined return type. + /// + /// + public static CharParser ToSyntaxChecker(this CharParser parser) where TNew : class => + parser.WithSemanticProvider(SyntaxChecker.Instance); + + /// + /// Converts a to a syntax checker. + /// + /// + public static CharParser ToSyntaxChecker(this CharParser parser) => + parser.ToSyntaxChecker(); +} diff --git a/src/FarkleNeo/Diagnostics/LocalizedDiagnostic.cs b/src/FarkleNeo/Diagnostics/LocalizedDiagnostic.cs new file mode 100644 index 00000000..6dbc459d --- /dev/null +++ b/src/FarkleNeo/Diagnostics/LocalizedDiagnostic.cs @@ -0,0 +1,23 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +namespace Farkle.Diagnostics; + +/// +/// Represents a diagnostic message with no parameters that can be localized. +/// +internal sealed class LocalizedDiagnostic : IFormattable +{ + private readonly string _resourceKey; + + public LocalizedDiagnostic(string resourceKey) + { + _resourceKey = resourceKey; + } + + string IFormattable.ToString(string? format, IFormatProvider? 
formatProvider) => + Resources.GetResourceString(_resourceKey, formatProvider); + + public override string ToString() => + Resources.GetResourceString(_resourceKey); +} diff --git a/src/FarkleNeo/Grammars/Grammar.cs b/src/FarkleNeo/Grammars/Grammar.cs index 07f06acd..a7a333ea 100644 --- a/src/FarkleNeo/Grammars/Grammar.cs +++ b/src/FarkleNeo/Grammars/Grammar.cs @@ -5,6 +5,7 @@ using Farkle.Grammars.StateMachines; using System.Collections.Immutable; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; namespace Farkle.Grammars; @@ -188,6 +189,15 @@ public static Grammar CreateFromGoldParserGrammar(Stream grammarFile) return Create(data); } + internal Dfa? GetDfa() + { + if (typeof(TChar) == typeof(char)) + { + return DfaOnChar as Dfa; + } + throw new NotSupportedException(); + } + /// /// Returns the string pointed by the given . /// @@ -312,6 +322,23 @@ public EntityHandle GetSymbolFromSpecialName(string specialName, bool throwIfNot /// The token symbol handle to check. public bool IsTerminal(TokenSymbolHandle handle) => GrammarTables.IsTerminal(handle); + internal bool IsUnparsable([NotNullWhen(true)] out string? errorResourceKey) + { + GrammarAttributes flags = GrammarInfo.Attributes; + if ((flags & GrammarAttributes.Unparsable) != 0) + { + errorResourceKey = nameof(Resources.Parser_UnparsableGrammar); + return true; + } + if (HasUnknownData && (flags & GrammarAttributes.Critical) != 0) + { + errorResourceKey = nameof(Resources.Parser_UnparsableGrammar_Critical); + return true; + } + errorResourceKey = null; + return false; + } + Grammar IGrammarProvider.GetGrammar() => this; internal void ValidateContent() diff --git a/src/FarkleNeo/Parser/FailingCharParser.cs b/src/FarkleNeo/Parser/FailingCharParser.cs new file mode 100644 index 00000000..ca84559f --- /dev/null +++ b/src/FarkleNeo/Parser/FailingCharParser.cs @@ -0,0 +1,40 @@ +// Copyright © Theodore Tsirpanis and Contributors. 
+// SPDX-License-Identifier: MIT + +using Farkle.Grammars; +using Farkle.Parser.Semantics; +using Farkle.Parser.Tokenizers; + +namespace Farkle.Parser; + +internal sealed class FailingCharParser : CharParser +{ + private readonly object _error; + private readonly Grammar _grammar; + + public FailingCharParser(object error, Grammar grammar) + { + _error = error; + _grammar = grammar; + IsFailing = true; + } + + public override void Run(ref ParserInputReader inputReader, ref ParserCompletionState completionState) => + completionState.SetError(_error); + + private protected override IGrammarProvider GetGrammarProvider() => _grammar; + + private protected override Tokenizer GetTokenizer() => throw new NotSupportedException(); + + private protected override CharParser WithSemanticProviderCore(ISemanticProvider semanticProvider) => + this as CharParser ?? new FailingCharParser(_error, _grammar); + + private protected override CharParser WithSemanticProviderCore(Func> semanticProviderFactory) => + this as CharParser ?? new FailingCharParser(_error, _grammar); + + private protected override CharParser WithTokenizerCore(Tokenizer tokenizer) => this; + + private protected override CharParser WithTokenizerCore(Func> tokenizerFactory) => this; + + private protected override CharParser WithTokenizerCore(ChainedTokenizerBuilder builder) => this; +} diff --git a/src/FarkleNeo/Parser/FailingTokenizer.cs b/src/FarkleNeo/Parser/FailingTokenizer.cs new file mode 100644 index 00000000..a58b75c8 --- /dev/null +++ b/src/FarkleNeo/Parser/FailingTokenizer.cs @@ -0,0 +1,25 @@ +// Copyright © Theodore Tsirpanis and Contributors. 
+// SPDX-License-Identifier: MIT + +using Farkle.Parser.Semantics; +using Farkle.Parser.Tokenizers; + +namespace Farkle.Parser; + +internal sealed class FailingTokenizer : Tokenizer +{ + private object _message; + + public FailingTokenizer(object message) + { + _message = message; + CanSkipChainedTokenizerWrapping = true; + IsFailing = true; + } + + public override bool TryGetNextToken(ref ParserInputReader input, ITokenSemanticProvider semanticProvider, out TokenizerResult result) + { + result = TokenizerResult.CreateError(_message); + return true; + } +} diff --git a/src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs b/src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs new file mode 100644 index 00000000..ed3744b7 --- /dev/null +++ b/src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs @@ -0,0 +1,17 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using Farkle.Grammars; + +namespace Farkle.Parser.Semantics; + +internal sealed class SyntaxChecker : ISemanticProvider where T : class +{ + private SyntaxChecker() { } + + public static SyntaxChecker Instance { get; } = new(); + + public object? Fuse(ref ParserState parserState, ProductionHandle production, Span members) => null; + + public object? Transform(ref ParserState parserState, TokenSymbolHandle symbol, ReadOnlySpan characters) => null; +} diff --git a/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs b/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs index 8ecf27aa..009c3db0 100644 --- a/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs +++ b/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs @@ -1,6 +1,9 @@ // Copyright © Theodore Tsirpanis and Contributors. 
// SPDX-License-Identifier: MIT +using Farkle.Diagnostics; +using Farkle.Grammars; +using Farkle.Parser.Implementation; using Farkle.Parser.Semantics; namespace Farkle.Parser.Tokenizers; @@ -58,3 +61,38 @@ protected Tokenizer() { } /// public abstract bool TryGetNextToken(ref ParserInputReader input, ITokenSemanticProvider semanticProvider, out TokenizerResult result); } + +/// +/// Provides factory methods to create s. +/// +public static class Tokenizer +{ + /// + /// Creates a . + /// + /// The type of characters the tokenizer accepts. + /// The the tokenizer will use. + /// is . + /// is not . + /// The grammar cannot be used for tokenizing. + public static Tokenizer Create(Grammar grammar) => Create(grammar, throwIfError: true); + + internal static Tokenizer Create(Grammar grammar, bool throwIfError) + { + ArgumentNullExceptionCompat.ThrowIfNull(grammar); + if (grammar.IsUnparsable(out string? errorKey)) + { + return Fail(errorKey); + } + if (grammar.GetDfa() is not { HasConflicts: false } dfa) + { + return Fail(nameof(Resources.Parser_GrammarDfaProblem)); + } + return new DefaultTokenizer(grammar, dfa); + + Tokenizer Fail(string resourceKey) => + throwIfError + ? throw new InvalidOperationException(Resources.GetResourceString(resourceKey)) + : new FailingTokenizer(new LocalizedDiagnostic(resourceKey)); + } +} diff --git a/src/FarkleNeo/Properties/Resources.cs b/src/FarkleNeo/Properties/Resources.cs index db4b0703..50e560e1 100644 --- a/src/FarkleNeo/Properties/Resources.cs +++ b/src/FarkleNeo/Properties/Resources.cs @@ -98,4 +98,12 @@ public static string GetEofString(IFormatProvider? 
formatProvider) public static string Parser_UnexpectedToken => GetResourceString(nameof(Parser_UnexpectedToken)); public static string Parser_Eof => GetResourceString(nameof(Parser_Eof)); + + public static string Parser_UnparsableGrammar => GetResourceString(nameof(Parser_UnparsableGrammar)); + + public static string Parser_UnparsableGrammar_Critical => GetResourceString(nameof(Parser_UnparsableGrammar_Critical)); + + public static string Parser_GrammarLrProblem => GetResourceString(nameof(Parser_GrammarLrProblem)); + + public static string Parser_GrammarDfaProblem => GetResourceString(nameof(Parser_GrammarDfaProblem)); } diff --git a/src/FarkleNeo/Properties/Resources.el.resx b/src/FarkleNeo/Properties/Resources.el.resx index 2c0a521c..9bf099ef 100644 --- a/src/FarkleNeo/Properties/Resources.el.resx +++ b/src/FarkleNeo/Properties/Resources.el.resx @@ -60,4 +60,16 @@ (ΤΕΛΟΣ) + + Η γραμματική δεν μπορεί να χρησιμοποιηθεί για συντακτική ή λεκτική ανάλυση λόγω ενός ακαθόριστου προβλήματος με αυτήν + + + Η γραμματική δεν μπορεί να χρησιμοποιηθεί για συντακτική ή λεκτική ανάλυση επειδή περιέχει δεδομένα που δεν αναγνωρίζονται από αυτήν την έκδοση του Farkle + + + Η γραμματική δεν μπορεί να χρησιμοποιηθεί για συντακτική ανάλυση επειδή ο πίνακας καταστάσεων LR έχει διενέξεις ή δεν υπάρχει + + + Η γραμματική δεν μπορεί να χρησιμοποιηθεί για λεκτική ανάλυση επειδή το αυτόματο DFA της έχει διενέξεις ή δεν υπάρχει + \ No newline at end of file diff --git a/src/FarkleNeo/Properties/Resources.resx b/src/FarkleNeo/Properties/Resources.resx index b3fea52e..60c5e108 100644 --- a/src/FarkleNeo/Properties/Resources.resx +++ b/src/FarkleNeo/Properties/Resources.resx @@ -60,4 +60,16 @@ (EOF) + + The grammar cannot be used for parsing or tokenizing because of an unspecified problem with it + + + The grammar cannot be used for parsing or tokenizing because it contains data not recognized by this version of Farkle + + + The grammar cannot be used for parsing because its LR state table 
has conflicts or does not exist + + + The grammar cannot be used for tokenizing because its DFA has conflicts or does not exist + \ No newline at end of file From d13f5c4398ad4beda755440a0b6dfe61663c6f83 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Fri, 25 Aug 2023 02:59:40 +0300 Subject: [PATCH 10/20] Rename `ParserCommon` to `ParserUtilities`. --- .../Parser/Implementation/DefaultParserImplementation.cs | 2 +- src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs | 2 +- .../Implementation/{ParserCommon.cs => ParserUtilities.cs} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename src/FarkleNeo/Parser/Implementation/{ParserCommon.cs => ParserUtilities.cs} (97%) diff --git a/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs b/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs index 3cf37075..7664dbed 100644 --- a/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs +++ b/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs @@ -107,7 +107,7 @@ private RunResult Run(ref ParserInputReader input, ref int currentState, } TextPosition errorPos = foundToken ? token.Position : input.State.CurrentPosition; string? actualTokenName = foundToken ? 
Grammar.GetString(Grammar.GetTokenSymbol(token.Symbol).Name) : null; - ImmutableArray expectedTokens = ParserCommon.GetExpectedSymbols(Grammar, _lrStateMachine[currentState]); + ImmutableArray expectedTokens = ParserUtilities.GetExpectedSymbols(Grammar, _lrStateMachine[currentState]); result = new ParserDiagnostic(errorPos, new SyntaxError(actualTokenName, expectedTokens, currentState)); return RunResult.Failure; } diff --git a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs index 4c6d2000..44ce73ba 100644 --- a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs +++ b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs @@ -301,7 +301,7 @@ public override bool TryGetNextToken(ref ParserInputReader input, ITokenS return false; } - string errorText = ParserCommon.GetAbbreviatedLexicalErrorText(lexeme); + string errorText = ParserUtilities.GetAbbreviatedLexicalErrorText(lexeme); result = TokenizerResult.CreateError(new ParserDiagnostic(state.CurrentPosition, new LexicalError(errorText, tokenizerState))); return true; diff --git a/src/FarkleNeo/Parser/Implementation/ParserCommon.cs b/src/FarkleNeo/Parser/Implementation/ParserUtilities.cs similarity index 97% rename from src/FarkleNeo/Parser/Implementation/ParserCommon.cs rename to src/FarkleNeo/Parser/Implementation/ParserUtilities.cs index 51d83df5..fc350019 100644 --- a/src/FarkleNeo/Parser/Implementation/ParserCommon.cs +++ b/src/FarkleNeo/Parser/Implementation/ParserUtilities.cs @@ -7,7 +7,7 @@ namespace Farkle.Parser.Implementation; -internal static class ParserCommon +internal static class ParserUtilities { private static string GetAbbreviatedLexicalErrorText(ReadOnlySpan chars) { From bc3a169d5300741c4dda2af284ac035a6ebe536e Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Sun, 27 Aug 2023 18:37:35 +0300 Subject: [PATCH 11/20] Fix bugs and SonarCloud suggestions. 
--- .../Grammars/StateMachines/StateMachineUtilities.cs | 12 ++++++------ src/FarkleNeo/Parser/FailingCharParser.cs | 2 +- src/FarkleNeo/Parser/FailingTokenizer.cs | 2 +- .../Parser/Implementation/DefaultTokenizer.cs | 3 +-- src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/FarkleNeo/Grammars/StateMachines/StateMachineUtilities.cs b/src/FarkleNeo/Grammars/StateMachines/StateMachineUtilities.cs index ec080a04..9010d51e 100644 --- a/src/FarkleNeo/Grammars/StateMachines/StateMachineUtilities.cs +++ b/src/FarkleNeo/Grammars/StateMachines/StateMachineUtilities.cs @@ -62,22 +62,22 @@ public static T CastUInt(uint value) where T : unmanaged public static unsafe int BufferBinarySearch(ReadOnlySpan buffer, int @base, int length, T item) where T : unmanaged, IComparable { - int low = @base, high = @base + length * sizeof(T); + int low = 0, high = length; while (low <= high) { int mid = low + (high - low) / 2; - T midItem = Read(buffer, mid); + T midItem = Read(buffer, @base + mid * sizeof(T)); switch (midItem.CompareTo(item)) { case 0: return mid; - case -1: - low = mid + sizeof(T); + case < 0: + low = mid + 1; break; - case 1: - high = mid - sizeof(T); + case > 0: + high = mid - 1; break; } } diff --git a/src/FarkleNeo/Parser/FailingCharParser.cs b/src/FarkleNeo/Parser/FailingCharParser.cs index ca84559f..65ad60d8 100644 --- a/src/FarkleNeo/Parser/FailingCharParser.cs +++ b/src/FarkleNeo/Parser/FailingCharParser.cs @@ -19,7 +19,7 @@ public FailingCharParser(object error, Grammar grammar) IsFailing = true; } - public override void Run(ref ParserInputReader inputReader, ref ParserCompletionState completionState) => + public override void Run(ref ParserInputReader input, ref ParserCompletionState completionState) => completionState.SetError(_error); private protected override IGrammarProvider GetGrammarProvider() => _grammar; diff --git a/src/FarkleNeo/Parser/FailingTokenizer.cs 
b/src/FarkleNeo/Parser/FailingTokenizer.cs index a58b75c8..c89ecf89 100644 --- a/src/FarkleNeo/Parser/FailingTokenizer.cs +++ b/src/FarkleNeo/Parser/FailingTokenizer.cs @@ -8,7 +8,7 @@ namespace Farkle.Parser; internal sealed class FailingTokenizer : Tokenizer { - private object _message; + private readonly object _message; public FailingTokenizer(object message) { diff --git a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs index 44ce73ba..c1b3a067 100644 --- a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs +++ b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs @@ -64,7 +64,7 @@ public DefaultTokenizer(Grammar grammar, Dfa dfa) { return (acceptSymbol, acceptSymbolLength, currentState); } - return (default, i + 1, currentState); + return (default, i, currentState); } /// @@ -168,7 +168,6 @@ public bool TokenizeGroup(ref ParserInputReader input, bool isNoise, ref ConsumeInput(ref input, ref chars, 1, isNoise); // TODO: Optimize by quickly searching for the next interesting character like in Farkle 6. } - continue; } groupLength = input.RemainingCharacters.Length - chars.Length; diff --git a/src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs b/src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs index ed3744b7..9d83644a 100644 --- a/src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs +++ b/src/FarkleNeo/Parser/Semantics/SyntaxChecker.cs @@ -5,7 +5,7 @@ namespace Farkle.Parser.Semantics; -internal sealed class SyntaxChecker : ISemanticProvider where T : class +internal sealed class SyntaxChecker : ISemanticProvider where T : class? { private SyntaxChecker() { } From 0540aad88354e6ca00f912250a45de433ba86a3a Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Mon, 28 Aug 2023 02:47:58 +0300 Subject: [PATCH 12/20] Do not discard the last token of a non-final block if the DFA cannot proceed from its current state. 
--- src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs index c1b3a067..29b137f1 100644 --- a/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs +++ b/src/FarkleNeo/Parser/Implementation/DefaultTokenizer.cs @@ -54,7 +54,12 @@ public DefaultTokenizer(Grammar grammar, Dfa dfa) } } - if (!isFinal) + // If this is not the final input block and the DFA can move forward, we cannot accept + // a token. To see why, consider a JSON grammar and the tokenizer finding `184` at the + // end of the input block. We cannot accept it, because there could be more digits after it that + // have not been read yet. By contrast, if we had found `true` at the end of the block, we + // can accept it, because there is no way for a longer token to be formed. + if (!(isFinal || _dfa[currentState] is { Edges.Count: 0 } and { DefaultTransition: < 0 })) { acceptSymbol = default; } From 4349edfa0f7b62c25abf33a171ed7f70c3040a97 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Mon, 28 Aug 2023 02:49:14 +0300 Subject: [PATCH 13/20] Wrap the result of `Tokenizer.Create` in a chain. 
--- src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs b/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs index 009c3db0..37dd3840 100644 --- a/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs +++ b/src/FarkleNeo/Parser/Tokenizers/Tokenizer.cs @@ -88,7 +88,7 @@ internal static Tokenizer Create(Grammar grammar, bool throwIfErro { return Fail(nameof(Resources.Parser_GrammarDfaProblem)); } - return new DefaultTokenizer(grammar, dfa); + return ChainedTokenizer.Create(new DefaultTokenizer(grammar, dfa)); Tokenizer Fail(string resourceKey) => throwIfError From 478006859c857bf79336dbdf6e4ca7e2f0d45d6d Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Mon, 28 Aug 2023 02:49:51 +0300 Subject: [PATCH 14/20] Fix a nullability mismatch. --- src/FarkleNeo/Diagnostics/SyntaxError.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FarkleNeo/Diagnostics/SyntaxError.cs b/src/FarkleNeo/Diagnostics/SyntaxError.cs index 95345488..d9df5635 100644 --- a/src/FarkleNeo/Diagnostics/SyntaxError.cs +++ b/src/FarkleNeo/Diagnostics/SyntaxError.cs @@ -62,7 +62,7 @@ private string ToString(IFormatProvider? formatProvider) new DelimitedString(ExpectedTokenNames, ", ", eofString, TokenSymbol.FormatName)); } - string IFormattable.ToString(string format, IFormatProvider formatProvider) => ToString(formatProvider); + string IFormattable.ToString(string? format, IFormatProvider? formatProvider) => ToString(formatProvider); /// public override string ToString() => ToString(null); From 7f0fbd6b7b9b6398f90a4fcb628904e03ce5e425 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Mon, 28 Aug 2023 03:04:31 +0300 Subject: [PATCH 15/20] Add some tests for the tokenizer. 
--- tests/Farkle.Tests.CSharp/TestUtilities.cs | 5 ++ tests/Farkle.Tests.CSharp/TokenizerTests.cs | 92 +++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 tests/Farkle.Tests.CSharp/TokenizerTests.cs diff --git a/tests/Farkle.Tests.CSharp/TestUtilities.cs b/tests/Farkle.Tests.CSharp/TestUtilities.cs index c43f536f..5bc93ac0 100644 --- a/tests/Farkle.Tests.CSharp/TestUtilities.cs +++ b/tests/Farkle.Tests.CSharp/TestUtilities.cs @@ -1,6 +1,8 @@ // Copyright © Theodore Tsirpanis and Contributors. // SPDX-License-Identifier: MIT +using Farkle.Grammars; + namespace Farkle.Tests.CSharp; public static class TestUtilities @@ -10,4 +12,7 @@ public static class TestUtilities public static IEnumerable Farkle7Grammars => Directory.EnumerateFiles(ResourcePath, "*.grammar.dat"); public static string GetResourceFile(string fileName) => Path.Combine(ResourcePath, fileName); + + public static Grammar LoadGrammarFromResource(string fileName) => + Grammar.Create(File.ReadAllBytes(GetResourceFile(fileName))); } diff --git a/tests/Farkle.Tests.CSharp/TokenizerTests.cs b/tests/Farkle.Tests.CSharp/TokenizerTests.cs new file mode 100644 index 00000000..2c2992e8 --- /dev/null +++ b/tests/Farkle.Tests.CSharp/TokenizerTests.cs @@ -0,0 +1,92 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +using Farkle.Grammars; +using Farkle.Parser; +using Farkle.Parser.Semantics; +using Farkle.Parser.Tokenizers; + +namespace Farkle.Tests.CSharp +{ + internal class TokenizerTests + { + /// + /// Tests that some strings produce JSON tokens. 
+ /// + [TestCase("137", "Number")] + [TestCase("\"Hello\"", "String")] + [TestCase(@"""\""\r\n\t \u00B8""", "String")] + [TestCase("true", "true")] + [TestCase("false", "false")] + [TestCase("null", "null")] + public void TestJsonSuccess(string text, string tokenName) + { + var grammar = TestUtilities.LoadGrammarFromResource("JSON.grammar.dat"); + var tokenizer = Tokenizer.Create(grammar); + Assert.Multiple(() => + { +#pragma warning disable CS0618 // Type or member is obsolete + var stateBox = new ParserStateBox(); +#pragma warning restore CS0618 // Type or member is obsolete + ParserInputReader reader = new(stateBox, text.AsSpan()); + + Assert.That(tokenizer.TryGetNextToken(ref reader, SyntaxChecker.Instance, out var token)); + Assert.That(token.IsSuccess); + Assert.That(token.Position, Is.EqualTo(TextPosition.Initial)); + Assert.That(grammar.GetString(grammar.GetTokenSymbol(token.Symbol).Name), Is.EqualTo(tokenName)); + }); + } + + /// + /// Tests that some strings either produce or not produce a JSON token, + /// given that there might be more characters coming after the string. + /// + [TestCase("137", false)] + [TestCase("\"Hello\"", true)] + [TestCase("\"Hello", false)] + [TestCase("true", true)] + [TestCase("false", true)] + [TestCase("null", true)] + [TestCase("nul", false)] + public void TestJsonSuccessNonFinal(string text, bool shouldFindToken) + { + var grammar = TestUtilities.LoadGrammarFromResource("JSON.grammar.dat"); + var tokenizer = Tokenizer.Create(grammar); +#pragma warning disable CS0618 // Type or member is obsolete + var stateBox = new ParserStateBox(); +#pragma warning restore CS0618 // Type or member is obsolete + ParserInputReader reader = new(stateBox, text.AsSpan(), isFinal: false); + Assert.That(tokenizer.TryGetNextToken(ref reader, SyntaxChecker.Instance, out _), Is.EqualTo(shouldFindToken)); + } + + /// + /// Tests that some strings do not produce JSON tokens. 
+ /// + [TestCase(@"""Hello")] + [TestCase(@"""Hello\""")] + [TestCase("tru")] + [TestCase("fals")] + [TestCase("nul")] + public void TestJsonFailure(string text) + { + var grammar = TestUtilities.LoadGrammarFromResource("JSON.grammar.dat"); + var tokenizer = Tokenizer.Create(grammar); + Assert.Multiple(() => + { +#pragma warning disable CS0618 // Type or member is obsolete + var stateBox = new ParserStateBox(); +#pragma warning restore CS0618 // Type or member is obsolete + ParserInputReader reader = new(stateBox, text.AsSpan()); + Assert.That(tokenizer.TryGetNextToken(ref reader, SyntaxChecker.Instance, out var token)); + Assert.That(token.IsSuccess, Is.False); + }); + } + + private sealed class SyntaxChecker : ITokenSemanticProvider + { + public static readonly SyntaxChecker Instance = new(); + + public object? Transform(ref ParserState parserState, TokenSymbolHandle symbol, ReadOnlySpan characters) => null; + } + } +} From dd69b7be8d53b0d0ce6ae5829f4fe49500648b6b Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Tue, 29 Aug 2023 02:23:41 +0300 Subject: [PATCH 16/20] Fix bugs and oversights in the grammar code. --- src/FarkleNeo/Grammars/GoldParser/GoldGrammarConverter.cs | 2 +- src/FarkleNeo/Grammars/Group.cs | 2 +- src/FarkleNeo/Grammars/Nonterminal.cs | 8 ++++---- src/FarkleNeo/Grammars/NonterminalCollection.cs | 1 + src/FarkleNeo/Grammars/Production.cs | 8 ++++---- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/FarkleNeo/Grammars/GoldParser/GoldGrammarConverter.cs b/src/FarkleNeo/Grammars/GoldParser/GoldGrammarConverter.cs index 00ed7310..9c11fe97 100644 --- a/src/FarkleNeo/Grammars/GoldParser/GoldGrammarConverter.cs +++ b/src/FarkleNeo/Grammars/GoldParser/GoldGrammarConverter.cs @@ -115,7 +115,7 @@ void AddSymbolsOfType(SymbolKind goldSymbolKind, TokenSymbolAttributes flags) productionOriginalPositions[productions[i]] = i; } // We could have sorted the original array but let's not; it's supposed to be immutable. 
- GoldGrammar.Production[]? sortedProductions = productions.AsSpan().ToArray(); + GoldGrammar.Production[] sortedProductions = productions.AsSpan().ToArray(); // Because the nonterminals were added in increasing order of appearance, // sorting by their original head index is the same as sorting by their mapped index. // The algorithm does not need to be stable. diff --git a/src/FarkleNeo/Grammars/Group.cs b/src/FarkleNeo/Grammars/Group.cs index e71bf988..3e314f9d 100644 --- a/src/FarkleNeo/Grammars/Group.cs +++ b/src/FarkleNeo/Grammars/Group.cs @@ -99,7 +99,7 @@ public TokenSymbolHandle End internal (uint Offset, uint NextOffset) GetNestingBounds(ReadOnlySpan grammarFile, in GrammarTables grammarTables) { uint firstNesting = grammarTables.GetGroupFirstNesting(grammarFile, Index); - uint firstNestingOfNext = Index < (uint)grammarTables.GroupNestingRowCount - 1 ? grammarTables.GetGroupFirstNesting(grammarFile, Index + 1) : (uint)grammarTables.GroupNestingRowCount; + uint firstNestingOfNext = Index < (uint)grammarTables.GroupRowCount ? grammarTables.GetGroupFirstNesting(grammarFile, Index + 1) : (uint)grammarTables.GroupNestingRowCount + 1; Debug.Assert(firstNesting <= firstNestingOfNext); return (firstNesting, firstNestingOfNext); } diff --git a/src/FarkleNeo/Grammars/Nonterminal.cs b/src/FarkleNeo/Grammars/Nonterminal.cs index 9c2f673b..bcaea797 100644 --- a/src/FarkleNeo/Grammars/Nonterminal.cs +++ b/src/FarkleNeo/Grammars/Nonterminal.cs @@ -41,10 +41,10 @@ private void AssertHasValue() internal (uint Offset, uint NextOffset) GetProductionsBounds(ReadOnlySpan grammarFile, in GrammarTables grammarTables) { uint tableIndex = Handle.TableIndex; - uint firstNesting = grammarTables.GetNonterminalFirstProduction(grammarFile, tableIndex).TableIndex; - uint firstNestingOfNext = tableIndex < (uint)grammarTables.ProductionRowCount - 1 ? 
grammarTables.GetNonterminalFirstProduction(grammarFile, tableIndex + 1).TableIndex : (uint)grammarTables.ProductionRowCount; - Debug.Assert(firstNesting <= firstNestingOfNext); - return (firstNesting, firstNestingOfNext); + uint firstProduction = grammarTables.GetNonterminalFirstProduction(grammarFile, tableIndex).TableIndex; + uint firstProductionOfNext = tableIndex < (uint)grammarTables.NonterminalRowCount ? grammarTables.GetNonterminalFirstProduction(grammarFile, tableIndex + 1).TableIndex : (uint)grammarTables.ProductionRowCount + 1; + Debug.Assert(firstProduction <= firstProductionOfNext); + return (firstProduction, firstProductionOfNext); } /// diff --git a/src/FarkleNeo/Grammars/NonterminalCollection.cs b/src/FarkleNeo/Grammars/NonterminalCollection.cs index a46a51c4..b557422a 100644 --- a/src/FarkleNeo/Grammars/NonterminalCollection.cs +++ b/src/FarkleNeo/Grammars/NonterminalCollection.cs @@ -11,6 +11,7 @@ namespace Farkle.Grammars; /// /// [DebuggerDisplay("Count = {Count}")] +[DebuggerTypeProxy(typeof(FlatCollectionProxy))] public readonly struct NonterminalCollection : IReadOnlyCollection { private readonly Grammar _grammar; diff --git a/src/FarkleNeo/Grammars/Production.cs b/src/FarkleNeo/Grammars/Production.cs index e7e1f9d4..9c6deb9a 100644 --- a/src/FarkleNeo/Grammars/Production.cs +++ b/src/FarkleNeo/Grammars/Production.cs @@ -45,10 +45,10 @@ private void AssertHasValue() internal (uint Offset, uint NextOffset) GetMemberBounds(ReadOnlySpan grammarFile, in GrammarTables grammarTables) { uint tableIndex = Handle.TableIndex; - uint firstNesting = grammarTables.GetProductionFirstMember(grammarFile, tableIndex); - uint firstNestingOfNext = tableIndex < (uint)grammarTables.ProductionRowCount - 1 ? 
grammarTables.GetProductionFirstMember(grammarFile, tableIndex + 1) : (uint)grammarTables.ProductionMemberRowCount; - Debug.Assert(firstNesting <= firstNestingOfNext); - return (firstNesting, firstNestingOfNext); + uint firstMember = grammarTables.GetProductionFirstMember(grammarFile, tableIndex); + uint firstMemberOfNext = tableIndex < (uint)grammarTables.ProductionRowCount ? grammarTables.GetProductionFirstMember(grammarFile, tableIndex + 1) : (uint)grammarTables.ProductionMemberRowCount + 1; + Debug.Assert(firstMember <= firstMemberOfNext); + return (firstMember, firstMemberOfNext); } /// From 55b8450917a3437a12b72671f8f3ef77e9a1c17a Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Tue, 29 Aug 2023 02:39:17 +0300 Subject: [PATCH 17/20] Track in ParserState if suspending the tokenizer is supported. I thought that suspending the tokenizer without being in a chain would have no effect, but it actually has the effect of prohibiting subsequent suspensions. With this change, suspending is a real no-op if the tokenizer is not wrapped in a chain. --- src/FarkleNeo/Parser/ParserState.cs | 11 +++++++++++ src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs | 4 ++++ .../Parser/Tokenizers/TokenizerExtensions.cs | 10 ++++++++++ 3 files changed, 25 insertions(+) diff --git a/src/FarkleNeo/Parser/ParserState.cs b/src/FarkleNeo/Parser/ParserState.cs index 300c4ab2..c106f356 100644 --- a/src/FarkleNeo/Parser/ParserState.cs +++ b/src/FarkleNeo/Parser/ParserState.cs @@ -63,6 +63,17 @@ public struct ParserState /// public string? InputName { get; set; } + /// + /// Whether suspending the tokenizer will have any effect. + /// + /// + /// We must have this property because if the tokenizer is not wrapped in a chain + /// and suspends to itself (which is still supported), there is no chain to "unsuspend" + /// the tokenizer, and when the tokenizer returns and wants to suspend again, it will + /// throw with a message that it is already suspended. 
+ /// + internal bool TokenizerSupportsSuspending { get; set; } + internal void Consume(ReadOnlySpan characters) { _positionTracker.Advance(characters); diff --git a/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs b/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs index 24c70c46..c014c7b5 100644 --- a/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs +++ b/src/FarkleNeo/Parser/Tokenizers/ChainedTokenizer.cs @@ -39,6 +39,10 @@ internal static Tokenizer Create(ImmutableArray> compone public override bool TryGetNextToken(ref ParserInputReader input, ITokenSemanticProvider semanticProvider, out TokenizerResult result) { + // We mark this parser operation as supporting suspending the tokenizer. + // It might cause some issues if the tokenizer changes in the middle of the + // operation but this is not a supported scenario. + input.State.TokenizerSupportsSuspending = true; // Get the state of the chained tokenizer. If we have not suspended before, // it will be null. var tokenizerState = input.GetChainedTokenizerStateOrNull(); diff --git a/src/FarkleNeo/Parser/Tokenizers/TokenizerExtensions.cs b/src/FarkleNeo/Parser/Tokenizers/TokenizerExtensions.cs index 322e2c47..d6bcdb94 100644 --- a/src/FarkleNeo/Parser/Tokenizers/TokenizerExtensions.cs +++ b/src/FarkleNeo/Parser/Tokenizers/TokenizerExtensions.cs @@ -78,6 +78,11 @@ private static void SuspendTokenizerCore(this ref ParserInputReader(this ref ParserInputReader input, Tokenizer tokenizer) { ArgumentNullExceptionCompat.ThrowIfNull(tokenizer); + + if (!input.State.TokenizerSupportsSuspending) + { + return; + } input.SuspendTokenizerCore(tokenizer); } @@ -119,6 +124,11 @@ public static void SuspendTokenizer(this ref ParserInputReader resumptionPoint, TArg arg) { ArgumentNullExceptionCompat.ThrowIfNull(resumptionPoint); + + if (!input.State.TokenizerSupportsSuspending) + { + return; + } input.SuspendTokenizerCore(new TokenizerResumptionPoint(resumptionPoint, arg)); } From 5983a996c13c7cbb509e330671231494d2778c57 
Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Wed, 30 Aug 2023 00:56:37 +0300 Subject: [PATCH 18/20] Add a benchmark comparing parsing JSON in Farkle 6 and 7. --- .../Farkle.Benchmarks.CSharp/JsonBenchmark.cs | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 performance/Farkle.Benchmarks.CSharp/JsonBenchmark.cs diff --git a/performance/Farkle.Benchmarks.CSharp/JsonBenchmark.cs b/performance/Farkle.Benchmarks.CSharp/JsonBenchmark.cs new file mode 100644 index 00000000..29842cdf --- /dev/null +++ b/performance/Farkle.Benchmarks.CSharp/JsonBenchmark.cs @@ -0,0 +1,47 @@ +// Copyright © Theodore Tsirpanis and Contributors. +// SPDX-License-Identifier: MIT + +#nullable disable + +extern alias farkle6; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Configs; +using Farkle.Grammars; +using Farkle6 = farkle6::Farkle; + +namespace Farkle.Benchmarks.CSharp; + +[MemoryDiagnoser, GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)] +public class JsonBenchmark +{ + [Params("small.json", "medium.json", "big.json")] public string FileName { get; set; } + + private byte[] _jsonBytes; + + private string _jsonText; + + private Farkle6.RuntimeFarkle _farkle6Runtime; + + private CharParser _farkle7Parser; + + [GlobalSetup] + public void GlobalSetup() + { + _jsonBytes = File.ReadAllBytes($"resources/{FileName}"); + _jsonText = File.ReadAllText($"resources/{FileName}"); + _farkle6Runtime = Farkle6.RuntimeFarkle.Create(Farkle6.Grammar.EGT.ReadFromFile("resources/JSON.egt"), Farkle6.PostProcessors.SyntaxChecker); + _farkle7Parser = CharParser.CreateSyntaxChecker(Grammar.Create(File.ReadAllBytes("resources/JSON.grammar.dat"))); + } + + [Benchmark(Baseline = true), BenchmarkCategory("MemoryInput")] + public object Farkle6String() => _farkle6Runtime.Parse(_jsonText).ResultValue; + + [Benchmark, BenchmarkCategory("MemoryInput")] + public object Farkle7String() => _farkle7Parser.Parse(_jsonText).Value; + + [Benchmark(Baseline = true), 
BenchmarkCategory("StreamingInput")] + public object Farkle6Stream() => _farkle6Runtime.Parse(new StreamReader(new MemoryStream(_jsonBytes, false))).ResultValue; + + [Benchmark, BenchmarkCategory("StreamingInput")] + public object Farkle7Stream() => _farkle7Parser.Parse(new StreamReader(new MemoryStream(_jsonBytes, false))).Value; +} From 3958c81e10fd07799ead9364a0f28cd247a6d273 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Wed, 30 Aug 2023 00:14:03 +0300 Subject: [PATCH 19/20] Fix a bug in `CharacterBufferManager`. `GetWriteBufferOffset` can resize the buffer, but calls to `_buffer.AsSpan(GetWriteBufferOffset(sizeHint))` always use the old buffer. The code was restructured to make the correct way to do it more apparent. --- .../Parser/CharacterBufferManager.cs | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/FarkleNeo/Parser/CharacterBufferManager.cs b/src/FarkleNeo/Parser/CharacterBufferManager.cs index 58928d6a..a88a9b36 100644 --- a/src/FarkleNeo/Parser/CharacterBufferManager.cs +++ b/src/FarkleNeo/Parser/CharacterBufferManager.cs @@ -45,7 +45,7 @@ private void ExpandBuffer(int minRequiredCharacters) _usedCharacterStart = 0; } - private int GetWriteBufferOffset(int sizeHint) + private void EnsureWriteBufferSize(int sizeHint) { CheckInputNotCompleted(); sizeHint = Math.Max(sizeHint, 0); @@ -58,8 +58,6 @@ private int GetWriteBufferOffset(int sizeHint) { ExpandBuffer(sizeHint); } - - return _usedCharacterEnd; } public bool IsInputCompleted { get; private set; } @@ -84,17 +82,23 @@ public void CompleteInput() IsInputCompleted = true; } - public Span GetSpan(int sizeHint) => - _buffer.AsSpan(GetWriteBufferOffset(sizeHint)); + public Span GetSpan(int sizeHint) + { + EnsureWriteBufferSize(sizeHint); + return _buffer.AsSpan(_usedCharacterEnd); + } - public Memory GetMemory(int sizeHint) => - _buffer.AsMemory(GetWriteBufferOffset(sizeHint)); + public Memory GetMemory(int sizeHint) + { + EnsureWriteBufferSize(sizeHint); 
+ return _buffer.AsMemory(_usedCharacterEnd); + } #if !(NETCOREAPP || NETSTANDARD2_1_OR_GREATER) public ArraySegment GetArraySegment(int sizeHint) { - int offset = GetWriteBufferOffset(sizeHint); - return new ArraySegment(_buffer, offset, _buffer.Length - offset); + EnsureWriteBufferSize(sizeHint); + return new ArraySegment(_buffer, _usedCharacterEnd, _buffer.Length - _usedCharacterEnd); } #endif From 112e674ef165f5bdf4f5f605f9321ee5b636c961 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Wed, 30 Aug 2023 00:36:51 +0300 Subject: [PATCH 20/20] Push the initial state to the LR parser stack. Turns out it is always needed to do GOTOs from the initial state if we reduce a production with as many members as the stack. And with that we don't need to separately track the LR state anymore; it's at the top of the stack. --- .../DefaultParserImplementation.cs | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs b/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs index 7664dbed..0e077d00 100644 --- a/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs +++ b/src/FarkleNeo/Parser/Implementation/DefaultParserImplementation.cs @@ -59,8 +59,9 @@ private void Reduce(ref ParserInputReader input, ref int currentState, re currentState = gotoState; } - private RunResult Run(ref ParserInputReader input, ref int currentState, ref ValueStack stateStack, ref ValueStack semanticValueStack, out object? result) + private RunResult Run(ref ParserInputReader input, ref ValueStack stateStack, ref ValueStack semanticValueStack, out object? 
result) { + int currentState = stateStack.Peek(); bool foundToken = Tokenizer.TryGetNextToken(ref input, TokenSemanticProvider, out TokenizerResult token); while (true) { @@ -117,9 +118,10 @@ private unsafe void RunOneShot(ref ParserInputReader input, ref Parser { ValueStack stateStack = new(stackalloc int[InitialStackCapacity]); ValueStack semanticValueStack = new(InitialStackCapacity); - int currentState = _lrStateMachine.InitialState; + stateStack.Push(_lrStateMachine.InitialState); + semanticValueStack.Push(null); #pragma warning disable CS9080 // Use of variable in this context may expose referenced variables outside of their declaration scope - RunResult runResult = Run(ref input, ref currentState, ref stateStack, ref semanticValueStack, out object? result); + RunResult runResult = Run(ref input, ref stateStack, ref semanticValueStack, out object? result); #pragma warning restore CS9080 // Use of variable in this context may expose referenced variables outside of their declaration scope switch (runResult) { @@ -145,7 +147,7 @@ public void Run(ref ParserInputReader input, ref ParserCompletionState var semanticValueStack = new ValueStack(state.SemanticValueStack); try { - RunResult result = Run(ref input, ref state.CurrentState, ref stateStack, ref semanticValueStack, out object? runResult); + RunResult result = Run(ref input, ref stateStack, ref semanticValueStack, out object? 
runResult); switch (result) { case RunResult.Success: @@ -173,7 +175,6 @@ private enum RunResult private sealed class State { - public int CurrentState; public ValueStack.State StateStack; public ValueStack.State SemanticValueStack; @@ -183,13 +184,19 @@ public static State GetOrCreate(LrStateMachine lrStateMachine, ref ParserState p { state = new State { - CurrentState = lrStateMachine.InitialState, - StateStack = new ValueStack(InitialStackCapacity).ExportState(), - SemanticValueStack = new ValueStack(InitialStackCapacity).ExportState() + StateStack = CreateStack(lrStateMachine.InitialState), + SemanticValueStack = CreateStack(null) }; parserState.SetValue(typeof(State), state); } return (State)state; + + static ValueStack.State CreateStack(T initialValue) + { + var stack = new ValueStack(InitialStackCapacity); + stack.Push(initialValue); + return stack.ExportState(); + } } } }