diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs index 83ad97b22..0014d7b09 100644 --- a/src/CsvHelper/CsvParser.cs +++ b/src/CsvHelper/CsvParser.cs @@ -54,12 +54,12 @@ public class CsvParser : IParser, IDisposable private bool inQuotes; private bool inEscape; private Field[] fields; - private string[] processedFields; + private Memory[] processedFields; private int fieldsPosition; private bool disposed; private int quoteCount; private char[] processFieldBuffer; - private int processFieldBufferSize; + private int processFieldBufferUsedLength; private ParserState state; private int delimiterPosition = 1; private int newLinePosition = 1; @@ -69,6 +69,11 @@ public class CsvParser : IParser, IDisposable private bool isRecordProcessed; private string[] record = []; + /// + /// Gets a span representing the currently unused space in . + /// + private Span AvailableProcessFieldBuffer => processFieldBuffer.AsSpan(processFieldBufferUsedLength); + /// public long CharCount => charCount; @@ -110,6 +115,16 @@ public string[]? Record /// public string RawRecord => new string(buffer, rowStartPosition, bufferPosition - rowStartPosition); + /// + /// Gets a span over the raw record for the current row. + /// + /// + /// The underlying memory of the returned span may change upon subsequent calls to + /// , resulting in undefined behavior. If you want a consistent + /// view of the raw record at this point in time, use instead. + /// + public ReadOnlySpan RawRecordSpan => buffer.AsSpan(rowStartPosition, bufferPosition - rowStartPosition); + /// public int Count => fieldsPosition; @@ -126,27 +141,14 @@ public string[]? Record public IParserConfiguration Configuration => configuration; /// + /// is negative or greater than or equal to . public string this[int index] { get { - if (isProcessingField) - { - var message = - $"You can't access {nameof(IParser)}[int] or {nameof(IParser)}.{nameof(IParser.Record)} inside of the {nameof(BadDataFound)} callback. 
" + - $"Use {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.Field)} and {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.RawRecord)} instead." - ; - - throw new ParserException(Context, message); - } + ReadOnlySpan fieldSpan = GetFieldSpan(index); - isProcessingField = true; - - var field = GetField(index); - - isProcessingField = false; - - return field; + return cacheFields ? fieldCache.GetField(fieldSpan) : fieldSpan.ToString(); } } @@ -193,15 +195,14 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea newLine = configuration.NewLine; newLineFirstChar = configuration.NewLine[0]; mode = configuration.Mode; - processFieldBufferSize = configuration.ProcessFieldBufferSize; quote = configuration.Quote; whiteSpaceChars = configuration.WhiteSpaceChars; trimOptions = configuration.TrimOptions; buffer = new char[bufferSize]; - processFieldBuffer = new char[processFieldBufferSize]; + processFieldBuffer = new char[configuration.ProcessFieldBufferSize]; fields = new Field[128]; - processedFields = new string[128]; + processedFields = new Memory[128]; } /// @@ -212,6 +213,7 @@ public bool Read() fieldStartPosition = rowStartPosition; fieldsPosition = 0; quoteCount = 0; + processFieldBufferUsedLength = 0; row++; rawRow++; var c = '\0'; @@ -247,6 +249,7 @@ public async Task ReadAsync() fieldStartPosition = rowStartPosition; fieldsPosition = 0; quoteCount = 0; + processFieldBufferUsedLength = 0; row++; rawRow++; var c = '\0'; @@ -800,30 +803,58 @@ private async Task FillBufferAsync() return true; } - private string GetField(int index) + /// + /// Gets a span over the field at the specified index in the current row. + /// + /// The index of the field in the current row. + /// A span representing the field at the specified index in the current row. + /// + /// The underlying memory of the returned span may change upon subsequent calls to + /// , resulting in undefined behavior. 
If you want a consistent + /// view of the field at this point in time, use the string indexer instead. + /// + /// index is negative or greater than or equal to Count. + public ReadOnlySpan GetFieldSpan(int index) { - if (index > fieldsPosition) + if ((uint)index >= Count) { - throw new IndexOutOfRangeException(); + throw new IndexOutOfRangeException($"Index was out of range. Must be non-negative and less than {nameof(Count)}"); } ref var field = ref fields[index]; if (field.Length == 0) { - return string.Empty; + return ReadOnlySpan.Empty; } if (field.IsProcessed) { - return processedFields[index]; + return processedFields[index].Span; + } + + if (isProcessingField) + { + // This check is to guard against stack overflow in the case that + // someone tries to access a (bad and unprocessed) field from + // within BadDataFound, and that access calls BadDataFound, + // which then tries to access... (and so on until the process crashes). + + var message = + $"You can't access {nameof(CsvParser)}[int], {nameof(CsvParser)}.{nameof(GetFieldSpan)} or " + + $"{nameof(CsvParser)}.{nameof(Record)} inside of the {nameof(BadDataFound)} callback. " + + $"Use {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.Field)} and {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.RawRecord)} instead."; + + throw new ParserException(Context, message); } + isProcessingField = true; + var start = field.Start + rowStartPosition; var length = field.Length; var quoteCount = field.QuoteCount; - ProcessedField processedField; + Memory processedField; switch (mode) { case CsvMode.RFC4180: @@ -841,14 +872,12 @@ private string GetField(int index) throw new InvalidOperationException($"ParseMode '{mode}' is not handled."); } - var value = cacheFields - ? 
fieldCache.GetField(processedField.Buffer, processedField.Start, processedField.Length) - : new string(processedField.Buffer, processedField.Start, processedField.Length); - - processedFields[index] = value; + processedFields[index] = processedField; field.IsProcessed = true; - return value; + isProcessingField = false; + + return processedField.Span; } /// @@ -858,7 +887,7 @@ private string GetField(int index) /// The length of the field. /// The number of counted quotes. /// The processed field. - protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCount) + protected Memory ProcessRFC4180Field(int start, int length, int quoteCount) { var newStart = start; var newLength = length; @@ -872,8 +901,7 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou { // Not quoted. // No processing needed. - - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); } if (buffer[newStart] != quote || buffer[newStart + newLength - 1] != quote || newLength == 1 && buffer[newStart] == quote) @@ -895,19 +923,10 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou { // The only quotes are the ends of the field. // No more processing is needed. - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); } - if (newLength > processFieldBuffer.Length) - { - // Make sure the field processing buffer is large engough. - while (newLength > processFieldBufferSize) - { - processFieldBufferSize *= 2; - } - - processFieldBuffer = new char[processFieldBufferSize]; - } + EnsureAvailableProcessFieldBuffer(newLength); // Remove escapes. 
var inEscape = false; @@ -927,11 +946,14 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou continue; } - processFieldBuffer[position] = c; + AvailableProcessFieldBuffer[position] = c; position++; } - return new ProcessedField(0, position, processFieldBuffer); + int fieldStartIndex = processFieldBufferUsedLength; + processFieldBufferUsedLength += position; + + return processFieldBuffer.AsMemory(fieldStartIndex, position); } /// @@ -940,12 +962,10 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou /// The start index of the field. /// The length of the field. /// The processed field. - protected ProcessedField ProcessRFC4180BadField(int start, int length) + protected Memory ProcessRFC4180BadField(int start, int length) { // If field is already known to be bad, different rules can be applied. - - var args = new BadDataFoundArgs(new string(buffer, start, length), RawRecord, Context); - badDataFound?.Invoke(args); + badDataFound?.Invoke(new BadDataFoundArgs(new string(buffer, start, length), RawRecord, Context)); var newStart = start; var newLength = length; @@ -958,19 +978,10 @@ protected ProcessedField ProcessRFC4180BadField(int start, int length) if (buffer[newStart] != quote) { // If the field doesn't start with a quote, don't process it. - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); } - if (newLength > processFieldBuffer.Length) - { - // Make sure the field processing buffer is large engough. - while (newLength > processFieldBufferSize) - { - processFieldBufferSize *= 2; - } - - processFieldBuffer = new char[processFieldBufferSize]; - } + EnsureAvailableProcessFieldBuffer(newLength); // Remove escapes until the last quote is found. 
var inEscape = false; @@ -1011,11 +1022,14 @@ protected ProcessedField ProcessRFC4180BadField(int start, int length) continue; } - processFieldBuffer[position] = c; + AvailableProcessFieldBuffer[position] = c; position++; } - return new ProcessedField(0, position, processFieldBuffer); + int fieldStartIndex = processFieldBufferUsedLength; + processFieldBufferUsedLength += position; + + return processFieldBuffer.AsMemory(fieldStartIndex, position); } /// @@ -1024,7 +1038,7 @@ protected ProcessedField ProcessRFC4180BadField(int start, int length) /// The start index of the field. /// The length of the field. /// The processed field. - protected ProcessedField ProcessEscapeField(int start, int length) + protected Memory ProcessEscapeField(int start, int length) { var newStart = start; var newLength = length; @@ -1034,16 +1048,7 @@ protected ProcessedField ProcessEscapeField(int start, int length) ArrayHelper.Trim(buffer, ref newStart, ref newLength, whiteSpaceChars); } - if (newLength > processFieldBuffer.Length) - { - // Make sure the field processing buffer is large engough. - while (newLength > processFieldBufferSize) - { - processFieldBufferSize *= 2; - } - - processFieldBuffer = new char[processFieldBufferSize]; - } + EnsureAvailableProcessFieldBuffer(newLength); // Remove escapes. var inEscape = false; @@ -1062,11 +1067,14 @@ protected ProcessedField ProcessEscapeField(int start, int length) continue; } - processFieldBuffer[position] = c; + AvailableProcessFieldBuffer[position] = c; position++; } - return new ProcessedField(0, position, processFieldBuffer); + int fieldStartIndex = processFieldBufferUsedLength; + processFieldBufferUsedLength += position; + + return processFieldBuffer.AsMemory(fieldStartIndex, position); } /// @@ -1076,7 +1084,7 @@ protected ProcessedField ProcessEscapeField(int start, int length) /// The start index of the field. /// The length of the field. /// The processed field. 
- protected ProcessedField ProcessNoEscapeField(int start, int length) + protected Memory ProcessNoEscapeField(int start, int length) { var newStart = start; var newLength = length; @@ -1086,7 +1094,36 @@ protected ProcessedField ProcessNoEscapeField(int start, int length) ArrayHelper.Trim(buffer, ref newStart, ref newLength, whiteSpaceChars); } - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); + } + + private void EnsureAvailableProcessFieldBuffer(int length) + { + if (AvailableProcessFieldBuffer.Length >= length) + { + return; + } + + int minimumSize = processFieldBufferUsedLength + length; + + int arrayMaxLength = +#if NET + Array.MaxLength; +#else + 0x7FFFFFC7; +#endif + + if ((uint)minimumSize > arrayMaxLength) + { + throw new OutOfMemoryException(); + } + + // Double the existing buffer size (capped at Array.MaxLength). + int newSize = Math.Max(minimumSize, (int)Math.Min(arrayMaxLength, 2 * (uint)processFieldBuffer.Length)); + + Array.Resize(ref processFieldBuffer, newSize); + + Debug.Assert(AvailableProcessFieldBuffer.Length >= length); } /// @@ -1124,42 +1161,6 @@ protected virtual void Dispose(bool disposing) disposed = true; } - /// - /// Processes a raw field based on configuration. - /// This will remove quotes, remove escapes, and trim if configured to. - /// - [DebuggerDisplay("Start = {Start}, Length = {Length}, Buffer.Length = {Buffer.Length}")] - protected readonly struct ProcessedField - { - /// - /// The start of the field in the buffer. - /// - public readonly int Start; - - /// - /// The length of the field in the buffer. - /// - public readonly int Length; - - /// - /// The buffer that contains the field. - /// - public readonly char[] Buffer; - - /// - /// Creates a new instance of ProcessedField. - /// - /// The start of the field in the buffer. - /// The length of the field in the buffer. - /// The buffer that contains the field. 
- public ProcessedField(int start, int length, char[] buffer) - { - Start = start; - Length = length; - Buffer = buffer; - } - } - private enum ReadLineResult { None = 0, diff --git a/src/CsvHelper/FieldCache.cs b/src/CsvHelper/FieldCache.cs index 722a8fe24..a9b4bcad0 100644 --- a/src/CsvHelper/FieldCache.cs +++ b/src/CsvHelper/FieldCache.cs @@ -4,6 +4,7 @@ // https://github.com/JoshClose/CsvHelper using System.Diagnostics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; // https://blog.markvincze.com/back-to-basics-dictionary-part-2-net-implementation/ @@ -29,26 +30,26 @@ public FieldCache(int initialSize = 128, int maxFieldSize = 128) entries = new Entry[size]; } - public string GetField(char[] buffer, int start, int length) + public string GetField(ReadOnlySpan buffer) { - if (length == 0) + if (buffer.IsEmpty) { return string.Empty; } - if (length > maxFieldSize) + if (buffer.Length > maxFieldSize) { - return new string(buffer, start, length); + return buffer.ToString(); } - var hashCode = GetHashCode(buffer, start, length); + var hashCode = GetHashCode(buffer); ref var bucket = ref GetBucket(hashCode); int i = bucket - 1; while ((uint)i < (uint)entries.Length) { ref var entry = ref entries[i]; - if (entry.HashCode == hashCode && entry.Value.AsSpan().SequenceEqual(new Span(buffer, start, length))) + if (entry.HashCode == hashCode && entry.Value.AsSpan().SequenceEqual(buffer)) { return entry.Value; } @@ -65,7 +66,7 @@ public string GetField(char[] buffer, int start, int length) ref var reference = ref entries[count]; reference.HashCode = hashCode; reference.Next = bucket - 1; - reference.Value = new string(buffer, start, length); + reference.Value = buffer.ToString(); bucket = count + 1; count++; @@ -73,17 +74,23 @@ public string GetField(char[] buffer, int start, int length) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private uint GetHashCode(char[] buffer, int start, int length) + private static uint 
GetHashCode(ReadOnlySpan buffer) { unchecked { - uint hash = 17; - for (var i = start; i < start + length; i++) +#if NET6_0_OR_GREATER + HashCode hash = new(); + hash.AddBytes(MemoryMarshal.AsBytes(buffer)); + return (uint)hash.ToHashCode(); +#else + HashCode hash = new(); + foreach (char c in buffer) { - hash = hash * 31 + buffer[i]; + hash.Add(c); } - return hash; + return (uint)hash.ToHashCode(); +#endif } } diff --git a/tests/CsvHelper.Tests/CsvParserTests.cs b/tests/CsvHelper.Tests/CsvParserTests.cs index a4ffc2202..c373df2d9 100644 --- a/tests/CsvHelper.Tests/CsvParserTests.cs +++ b/tests/CsvHelper.Tests/CsvParserTests.cs @@ -1366,5 +1366,18 @@ public void RawRowCountWithSingleLineAndNoLineEndingTest() Assert.Equal(1, parser.RawRow); } } + + [Theory] + [InlineData(-1)] + [InlineData(2)] + public void Parser_IndexOutOfRangeException(int index) + { + using (var reader = new StringReader("1,2\r\n")) + using (var parser = new CsvParser(reader, CultureInfo.InvariantCulture)) + { + Assert.True(parser.Read()); + Assert.Throws(() => parser[index]); + } + } } } diff --git a/tests/CsvHelper.Tests/Parsing/BadDataTests.cs b/tests/CsvHelper.Tests/Parsing/BadDataTests.cs index 248685147..765b9ffaf 100644 --- a/tests/CsvHelper.Tests/Parsing/BadDataTests.cs +++ b/tests/CsvHelper.Tests/Parsing/BadDataTests.cs @@ -149,5 +149,35 @@ public void Read_AccessingParserRecordInBadDataFound_ThrowsParserException() Assert.Throws(() => parser[1]); } + + [Fact] + public void ConsecutiveBadDataTest() + { + var config = new CsvConfiguration(CultureInfo.InvariantCulture) + { + BadDataFound = null, + CacheFields = false, + ProcessFieldBufferSize = 4 + }; + // These 3 fields each use the processFieldBuffer. + // The test is to ensure consistency of the fields during a read, + // i.e. the memory that each field points to is not overwritten + // during the processing of the other fields in the same row. 
+ string csv = "\"\"\"\",\"two\" \"2,\"three\" \"3\r\n"; // """","two" "2,"three" "3 + using (var reader = new StringReader(csv)) + using (var parser = new CsvParser(reader, config)) + { + Assert.True(parser.Read()); + + Assert.Equal(3, parser.Count); + Assert.Equal("\"", parser.GetFieldSpan(0).ToString()); + Assert.Equal("two \"2", parser.GetFieldSpan(1).ToString()); + Assert.Equal("three \"3", parser.GetFieldSpan(2).ToString()); + Assert.Equal("two \"2", parser.GetFieldSpan(1).ToString()); + Assert.Equal("\"", parser.GetFieldSpan(0).ToString()); + + Assert.False(parser.Read()); + } + } } } diff --git a/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs b/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs index cab3236fb..a0923a9c8 100644 --- a/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs +++ b/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs @@ -64,8 +64,6 @@ public void Read_WithFieldCacheDisabled_ReturnsDifferentFieldInstance() [Fact] public void Test1() { - // "542008", "27721116", "98000820" have hash code 3769566006 - var value1 = "542008"; var value2 = "27721116"; var value3 = "98000820"; @@ -73,21 +71,16 @@ public void Test1() var cache = new FieldCache(1); - var field1 = cache.GetField(value1.ToCharArray(), 0, value1.Length); - var field2 = cache.GetField(value2.ToCharArray(), 0, value2.Length); - var field3 = cache.GetField(value3.ToCharArray(), 0, value3.Length); - var field4 = cache.GetField(value4.ToCharArray(), 0, value4.Length); + var field1 = cache.GetField(value1.AsSpan()); + var field2 = cache.GetField(value2.AsSpan()); + var field3 = cache.GetField(value3.AsSpan()); + var field4 = cache.GetField(value4.AsSpan()); Assert.Equal(value1, field1); Assert.Equal(value2, field2); Assert.Equal(value3, field3); Assert.Equal(value4, field4); - Assert.NotSame(value1, field1); - Assert.NotSame(value2, field2); - Assert.NotSame(value3, field3); - Assert.NotSame(value4, field4); - Assert.Same(field1, field4); } }