Skip to content

Commit e097f4b

Browse files
committed
feat: ✨ add CSV repair option and test case
1 parent b21bf89 commit e097f4b

File tree

4 files changed

+45
-27
lines changed

4 files changed

+45
-27
lines changed

docs/src/content/docs/reference/scripts/csv.md

+27-19
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,9 @@ title: CSV
33
description: Learn how to parse and stringify CSV data using the CSV class in scripting.
44
keywords: CSV parsing, CSV stringifying, CSV data, CSV manipulation, CSV utility
55
sidebar:
6-
order: 17
6+
order: 17
77
genaiscript:
8-
files: src/samples/penguins.csv
9-
8+
files: src/samples/penguins.csv
109
---
1110

1211
Parsing and stringifying of Comma Separated Values (CSV) data.
@@ -24,18 +23,18 @@ maps to the following array of objects:
2423

2524
```json
2625
[
27-
{
28-
"name": "A",
29-
"value": 10
30-
},
31-
{
32-
"name": "B",
33-
"value": 2
34-
},
35-
{
36-
"name": "C",
37-
"value": 3
38-
}
26+
{
27+
"name": "A",
28+
"value": 10
29+
},
30+
{
31+
"name": "B",
32+
"value": 2
33+
},
34+
{
35+
"name": "C",
36+
"value": 3
37+
}
3938
]
4039
```
4140

@@ -51,9 +50,9 @@ def("DATA", env.files[0])
5150

5251
```js assistant=false
5352
def("DATA", env.files[0], {
54-
sliceHead: 50, // take first 50
55-
sliceTail: 25, // take last 25
56-
sliceSample: 5 // take 5 at random
53+
sliceHead: 50, // take first 50
54+
sliceTail: 25, // take last 25
55+
sliceSample: 5, // take 5 at random
5756
})
5857
```
5958

@@ -75,7 +74,7 @@ If the CSV file does not have a header row, you can specify the column names as
7574
```js
7675
const rows = CSV.parse(csv, {
7776
delimiter: "|",
78-
headers: ["name", "value"]
77+
headers: ["name", "value"],
7978
})
8079
```
8180

@@ -122,3 +121,12 @@ The [parsers](/genaiscript/reference/scripts/parsers) also provide a parser for
122121
```js
123122
const rows = parsers.CSV(env.files[0])
124123
```
124+
125+
126+
## Repair
127+
128+
Set the `repair: true` option to fix common mistakes LLMs make when generating CSV, such as escaping quotes with a backslash (`\"`) instead of doubling them (`""`).
129+
130+
```js
131+
const rows = CSV.parse(csv, { repair: true })
132+
```

packages/core/src/csv.test.ts

+8
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ describe("CSVParse", () => {
3939
{ name: "Jane", age: "25" },
4040
])
4141
})
42+
test("Parse CSV data with invalid quotes", () => {
43+
const csv = "\"\\\"John\\\"\",30\nJane,25"
44+
const result = CSVParse(csv, { headers: ["name", "age"], repair: true })
45+
assert.deepEqual(result, [
46+
{ name: "\"John\"", age: "30" },
47+
{ name: "Jane", age: "25" },
48+
])
49+
})
4250

4351
})
4452

packages/core/src/csv.ts

+8-1
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,17 @@ export function CSVParse(
2020
options?: {
2121
delimiter?: string
2222
headers?: ElementOrArray<string>
23+
repair?: boolean
2324
}
2425
): object[] {
2526
// Destructure options or provide defaults
26-
const { delimiter, headers, ...rest } = options || {}
27+
const { delimiter, headers, repair, ...rest } = options || {}
2728
const columns = headers ? arrayify(headers) : true
29+
30+
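// Repair common LLM escape errors: rewrite backslash-escaped quotes (\") as doubled quotes (""), then collapse each run of four quotes to two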
31+
if (repair) {
32+
text = text.replace(/\\"/g, '""').replace(/""""/g, '""')
33+
}
2834
// Parse the CSV string based on the provided options
2935
return parse(text, {
3036
autoParse: true, // Automatically parse values to appropriate types
@@ -56,6 +62,7 @@ export function CSVTryParse(
5662
options?: {
5763
delimiter?: string
5864
headers?: ElementOrArray<string>
65+
repair?: boolean
5966
} & TraceOptions
6067
): object[] | undefined {
6168
const { trace } = options || {}

packages/core/src/types/prompt_template.d.ts

+2-7
Original file line numberDiff line numberDiff line change
@@ -1097,6 +1097,7 @@ interface Tokenizer {
10971097
interface CSVParseOptions {
10981098
delimiter?: string
10991099
headers?: string[]
1100+
repair?: boolean
11001101
}
11011102

11021103
interface TextChunk extends WorkspaceFile {
@@ -1939,13 +1940,7 @@ interface CSV {
19391940
* @param options.headers - An array of headers to use. If not provided, headers will be inferred from the first row.
19401941
* @returns An array of objects representing the parsed CSV data.
19411942
*/
1942-
parse(
1943-
text: string,
1944-
options?: {
1945-
delimiter?: string
1946-
headers?: string[]
1947-
}
1948-
): object[]
1943+
parse(text: string, options?: CSVParseOptions): object[]
19491944

19501945
/**
19511946
* Converts an array of objects to a CSV string.

0 commit comments

Comments
 (0)