Skip to content

Commit bad870d

Browse files
authored
ast-grep automatic language selection (#1394)
* prep ts docs * update gitignore rules * automatic language selection * document mapping * typo * revert api change * typo * fix sample * no model needed * fix test
1 parent 0d8a718 commit bad870d

File tree

11 files changed

+207
-61
lines changed

11 files changed

+207
-61
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,5 @@ packages/sample/src/xpai/*.json
3939
packages/sample/src/xpai/*.csv
4040
packages/sample/src/mlads/*
4141
packages/core/src/dbg.ts
42-
42+
TypeScript/
43+
react/

docs/astro.config.mjs

+3
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ is at https://microsoft.github.io/genaiscript/reference/scripts.md
5858
- save generated code in the "./genaisrc" folder with ".genai.mts" extension
5959
`,
6060
pageSeparator: "\n\n=|=|=|=|=|=\n\n",
61+
minify: {
62+
customSelectors: ["picture"]
63+
},
6164
promote: ["index*", "getting-started*", "!*/*"],
6265
customSets: [
6366
{

docs/src/content/docs/reference/scripts/ast-grep.mdx

+28-1
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ This version of `ast-grep` [supports the following built-in languages](https://a
132132
133133
- Html
134134
- JavaScript
135+
- TypeScript
135136
- Tsx
136137
- Css
137-
- TypeScript
138138
139139
The following languages require installing an additional package:
140140
@@ -144,6 +144,9 @@ The following languages require installing an additional package:
144144
npm install -D @ast-grep/lang-c
145145
```
146146
147+
- C++, `@ast-grep/lang-cpp`
148+
- Python, `@ast-grep/lang-python`
149+
- C#, `@ast-grep/lang-csharp`
147150
- SQL, `@ast-grep/lang-sql`
148151
- Angular, `@ast-grep/lang-angular`
149152
@@ -153,6 +156,30 @@ If your language is not supported, go to [ast-grep langs](https://github.com/ast
153156
154157
:::
155158
159+
### Filename extension mapping
160+
161+
The following file extensions are mapped to the corresponding languages:
162+
163+
- HTML: `html`, `htm`
164+
- JavaScript: `cjs`, `mjs`, `js`
165+
- TypeScript: `cts`, `mts`, `ts`
166+
- TSX: `tsx`
167+
- CSS: `css`
168+
- C: `c`, `h`
169+
- C++: `cpp`, `cxx`, `hpp`, `hxx`
170+
- Python: `py`
171+
- C#: `cs`
172+
- SQL: `sql`
173+
174+
### Overriding the language selection
175+
176+
GenAIScript has default mappings from well-known file extensions to languages.
177+
However, you can override this by passing the `lang` option to the `search` method.
178+
179+
```ts '{ lang: "ts" }'
180+
const { matches } = await sg.search("ts", "src/fib.ts", {...}, { lang: "ts" })
181+
```
182+
156183
## Learning ast-grep
157184
158185
There is a learning curve to grasp the query language of `ast-grep`.

docs/src/content/docs/reference/scripts/diagrams.md

+6
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,9 @@ graph LR
6262
````
6363

6464
and it gets rendered automatically once you install the extension.
65+
66+
```mermaid
67+
graph LR
68+
A[Master] --> C[New Commit]
69+
B[Feature Branch] --> C
70+
```

genaisrc/docs.genai.mts

+25-16
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,17 @@ script({
3434
default: true,
3535
description: "Update existing docs.",
3636
},
37+
maxFiles: {
38+
type: "integer",
39+
description: "Maximum number of files to process.",
40+
},
3741
},
3842
})
3943
const { output, dbg, vars } = env
4044
let { files } = env
41-
const { applyEdits, diff, pretty, missing, update } = vars
45+
const { applyEdits, diff, pretty, missing, update, maxFiles } = vars
4246

43-
dbg({ applyEdits, diff, pretty, missing, update })
47+
dbg({ applyEdits, diff, pretty, missing, update, maxFiles })
4448

4549
if (!missing && !update) cancel(`not generating or updating docs, exiting...`)
4650

@@ -60,6 +64,14 @@ if (diffFiles?.length) {
6064
)
6165
dbg(`diff filtered files: ${files.length}`)
6266
}
67+
68+
if (maxFiles && files.length > maxFiles) {
69+
dbg(`random slicing files to ${maxFiles}`)
70+
files = parsers.tidyData(files, {
71+
sliceSample: maxFiles,
72+
}) as WorkspaceFile[]
73+
}
74+
6375
const sg = await host.astGrep()
6476
const stats = []
6577
for (const file of files) {
@@ -123,7 +135,7 @@ async function generateDocs(file: WorkspaceFile, fileStats: any) {
123135
},
124136
},
125137
},
126-
{ diff: gitDiff }
138+
{ diff: gitDiff, applyGitIgnore: false }
127139
)
128140
dbg(`found ${missingDocs.length} missing docs`)
129141
const edits = sg.changeset()
@@ -206,19 +218,16 @@ async function updateDocs(file: WorkspaceFile, fileStats: any) {
206218
const { matches } = await sg.search(
207219
"ts",
208220
file.filename,
209-
{
210-
rule: {
211-
kind: "export_statement",
212-
follows: {
213-
kind: "comment",
214-
stopBy: "neighbor",
215-
},
216-
has: {
217-
kind: "function_declaration",
218-
},
219-
},
220-
},
221-
{ diff: gitDiff }
221+
YAML`
222+
rule:
223+
kind: "export_statement"
224+
follows:
225+
kind: "comment"
226+
stopBy: neighbor
227+
has:
228+
kind: "function_declaration"
229+
`,
230+
{ diff: gitDiff, applyGitIgnore: false }
222231
)
223232
dbg(`found ${matches.length} docs to update`)
224233
const edits = sg.changeset()

genaisrc/update-tree-sitter-queries.genai.mts

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { Octokit } from "octokit"
2+
script({ model: "none" })
23

34
const files: Record<string, string> = {}
45
const downloadScm = async (repo: string, name: string) => {

packages/core/src/astgrep.test.ts

+41
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { beforeEach, describe, test } from "node:test"
22
import assert from "node:assert/strict"
33
import { astGrepFindFiles, astGrepParse } from "./astgrep"
44
import { TestHost } from "./testhost"
5+
import { dedent } from "./indent"
56

67
describe("astgrep", () => {
78
beforeEach(() => {
@@ -35,4 +36,44 @@ describe("astgrep", () => {
3536
const result = await astGrepParse(file, { lang: "js" })
3637
assert.equal(result, undefined)
3738
})
39+
40+
test("parse C++ file", async () => {
41+
const file: WorkspaceFile = {
42+
filename: "test.cpp",
43+
content: dedent`
44+
#include <iostream>
45+
46+
int main() {
47+
std::cout << 'Hello, world!' << std::endl;
48+
return 0;
49+
}
50+
`,
51+
}
52+
const result = await astGrepParse(file)
53+
assert(result)
54+
})
55+
test("parse TypeScript file", async () => {
56+
const file: WorkspaceFile = {
57+
filename: "test.ts",
58+
content: "const x: number = 1;",
59+
}
60+
const result = await astGrepParse(file)
61+
assert(result)
62+
})
63+
test("parse python file", async () => {
64+
const file: WorkspaceFile = {
65+
filename: "test.py",
66+
content: "x = 1",
67+
}
68+
const result = await astGrepParse(file)
69+
assert(result)
70+
})
71+
test("parse C file", async () => {
72+
const file: WorkspaceFile = {
73+
filename: "test.c",
74+
content: "#include <stdio.h>",
75+
}
76+
const result = await astGrepParse(file)
77+
assert(result)
78+
})
3879
})

packages/core/src/astgrep.ts

+67-42
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import debug from "debug"
22
const dbg = debug("genaiscript:astgrep")
3+
const dbgLang = debug("genaiscript:astgrep:lang")
34

45
import { CancellationOptions, checkCancelled } from "./cancellation"
56
import { CancelError, errorMessage } from "./error"
@@ -94,12 +95,10 @@ export async function astGrepFindFiles(
9495
}
9596
const diffFiles = diffResolve(diff)
9697

97-
dbg(`finding files with ${lang} %O`, matcher)
98+
dbg(`search %O`, matcher)
9899
if (diffFiles?.length) dbg(`diff files: ${diffFiles.length}`)
99100
const { findInFiles } = await import("@ast-grep/napi")
100101
checkCancelled(cancellationToken)
101-
const sglang = await resolveLang(lang)
102-
dbg(`resolving language: ${lang}`)
103102

104103
let paths = await host.findFiles(glob, options)
105104
if (!paths?.length) {
@@ -129,6 +128,7 @@ export async function astGrepFindFiles(
129128
const p = new Promise<number>(async (resolve, reject) => {
130129
let i = 0
131130
let n: number = undefined
131+
const sglang = await resolveLang(lang)
132132
n = await findInFiles(
133133
sglang,
134134
{
@@ -230,7 +230,7 @@ export async function astGrepWriteRootEdits(
230230
*/
231231
export async function astGrepParse(
232232
file: WorkspaceFile,
233-
options?: { lang?: SgLang } & CancellationOptions
233+
options?: { lang?: SgLang | Record<string, SgLang> } & CancellationOptions
234234
): Promise<SgRoot> {
235235
const { cancellationToken } = options || {}
236236
if (file.encoding) {
@@ -249,7 +249,6 @@ export async function astGrepParse(
249249
dbg(`parsing file: ${filename}`)
250250
const { parseAsync } = await import("@ast-grep/napi")
251251
const lang = await resolveLang(options?.lang, filename)
252-
dbg(`resolving language for file: ${filename}`)
253252
if (!lang) {
254253
return undefined
255254
}
@@ -259,65 +258,91 @@ export async function astGrepParse(
259258
return root
260259
}
261260

262-
async function resolveLang(lang: SgLang, filename?: string) {
261+
async function resolveLang(
262+
lang: SgLang | Record<string, SgLang>,
263+
filename?: string
264+
) {
263265
const { Lang } = await import("@ast-grep/napi")
264-
if (lang === "html") {
265-
return Lang.Html
266-
}
267-
if (lang === "js") {
268-
return Lang.JavaScript
269-
}
270-
if (lang === "ts") {
271-
return Lang.TypeScript
266+
267+
const norm = (l: string) => l.toLowerCase().replace(/^\./, "")
268+
269+
// pre-compiled with ast-grep
270+
const builtins: any = {
271+
html: Lang.Html,
272+
htm: Lang.Html,
273+
cjs: Lang.JavaScript,
274+
mjs: Lang.JavaScript,
275+
js: Lang.JavaScript,
276+
cts: Lang.TypeScript,
277+
mts: Lang.TypeScript,
278+
ts: Lang.TypeScript,
279+
tsx: Lang.Tsx,
280+
css: Lang.Css,
272281
}
273-
if (lang === "tsx") {
274-
return Lang.Tsx
282+
283+
const dynamics: any = {
284+
h: "c",
285+
c: "c",
286+
cpp: "cpp",
287+
hpp: "cpp",
288+
hxx: "cpp",
289+
cxx: "cpp",
290+
cs: "csharp",
291+
py: "python",
292+
sql: "sql",
275293
}
276-
if (lang === "css") {
277-
return Lang.Css
294+
295+
const forbidden = ["bin", "exe", "dll"]
296+
297+
// user provided a string
298+
if (typeof lang === "string") {
299+
lang = norm(lang)
300+
dbgLang(`resolving language ${lang}`)
301+
const builtin = builtins[lang]
302+
if (builtin) return builtin
303+
else return await loadDynamicLanguage(lang)
278304
}
279-
if (lang) {
280-
return await loadDynamicLanguage(lang.toLowerCase())
305+
306+
if (!filename) {
307+
dbgLang(`filename not provided`)
308+
throw new Error("filename is required to resolve language")
281309
}
282310

283311
if (filename) {
284-
dbg(`resolving language based on filename: ${filename}`)
285-
if (/\.m?js$/i.test(filename)) {
286-
return Lang.JavaScript
287-
}
288-
if (/\.m?ts$/i.test(filename)) {
289-
return Lang.TypeScript
290-
}
291-
if (/\.(j|t)sx$/i.test(filename)) {
292-
return Lang.Tsx
293-
}
294-
if (/\.html$/i.test(filename)) {
295-
return Lang.Html
296-
}
297-
if (/\.css$/i.test(filename)) {
298-
return Lang.Css
299-
}
300-
return await loadDynamicLanguage(
301-
extname(filename).slice(1).toLowerCase()
302-
)
312+
const ext = norm(extname(filename))
313+
dbgLang(`resolving language for ${ext}`)
314+
315+
// known builtins
316+
const builtin = builtins[ext]
317+
if (builtin) return builtin
318+
319+
// known dynamics
320+
const dynamic = dynamics[ext]
321+
if (dynamic) return await loadDynamicLanguage(dynamic)
322+
323+
if (forbidden.includes(ext)) return undefined
324+
325+
// try our luck
326+
return await loadDynamicLanguage(ext)
303327
}
304328

329+
dbgLang(`language not resolved`, { lang, filename })
305330
throw new Error("language not resolved")
306331
}
307332

308333
const loadedDynamicLanguages = new Set<string>()
309334
async function loadDynamicLanguage(langName: string) {
310335
if (!loadedDynamicLanguages.has(langName)) {
311-
dbg(`loading language: ${langName}`)
336+
dbgLang(`loading language: ${langName}`)
312337
const { registerDynamicLanguage } = await import("@ast-grep/napi")
313338
try {
314339
const dynamicLang = (await import(`@ast-grep/lang-${langName}`))
315340
.default
316341
registerDynamicLanguage({ [langName]: dynamicLang })
317342
loadedDynamicLanguages.add(langName)
318-
dbg(`language ${langName} registered `)
343+
dbgLang(`language ${langName} registered `)
319344
} catch (err) {
320-
dbg(`error loading language ${langName}: ${errorMessage(err)}`)
345+
dbgLang(`error loading language ${langName}: ${errorMessage(err)}`)
321346
throw Error(
322347
`@ast-grep/lang-${langName} package failed to load, please install it using 'npm install -D @ast-grep/lang-${langName}'`
323348
)

0 commit comments

Comments
 (0)