Skip to content

Commit 372b10a

Browse files
authored
pdf markdown runtime helper (#1082)
* added helper * increase horizon * more ocr work * added logging of pdf rendering * support formatting re-enabled for 03 models * link docs * adding vision small models * refresh lockfiles
1 parent f3b4443 commit 372b10a

File tree

17 files changed

+594
-479
lines changed

17 files changed

+594
-479
lines changed

Diff for: THIRD_PARTY_LICENSES.md

+23-23
Original file line numberDiff line numberDiff line change
@@ -2862,7 +2862,7 @@ MIT License
28622862

28632863
The following npm package may be included in this product:
28642864

2865-
- genaiscript-vscode@1.97.3
2865+
- genaiscript-vscode@1.98.0
28662866

28672867
This package contains the following license:
28682868

@@ -6447,20 +6447,20 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
64476447

64486448
The following npm packages may be included in this product:
64496449

6450-
- @inquirer/checkbox@4.0.7
6451-
- @inquirer/[email protected].4
6452-
- @inquirer/[email protected].5
6453-
- @inquirer/[email protected].4
6454-
- @inquirer/[email protected].7
6450+
- @inquirer/checkbox@4.1.1
6451+
- @inquirer/[email protected].5
6452+
- @inquirer/[email protected].6
6453+
- @inquirer/[email protected].6
6454+
- @inquirer/[email protected].8
64556455
- @inquirer/[email protected]
6456-
- @inquirer/[email protected].4
6457-
- @inquirer/[email protected].7
6458-
- @inquirer/[email protected].7
6459-
- @inquirer/prompts@7.2.4
6460-
- @inquirer/[email protected].7
6461-
- @inquirer/[email protected].7
6462-
- @inquirer/[email protected].7
6463-
- @inquirer/[email protected].3
6456+
- @inquirer/[email protected].5
6457+
- @inquirer/[email protected].8
6458+
- @inquirer/[email protected].8
6459+
- @inquirer/prompts@7.3.1
6460+
- @inquirer/[email protected].8
6461+
- @inquirer/[email protected].8
6462+
- @inquirer/[email protected].8
6463+
- @inquirer/[email protected].4
64646464

64656465
These packages each contain the following license:
64666466

@@ -8009,15 +8009,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
80098009
The following npm packages may be included in this product:
80108010

80118011
- @esbuild/[email protected]
8012-
- @napi-rs/[email protected].65
8013-
- @napi-rs/[email protected].65
8012+
- @napi-rs/[email protected].66
8013+
- @napi-rs/[email protected].66
80148014
- @tokenizer/[email protected]
80158015
80168016
8017-
- genaiscript-core-internal@1.97.3
8018-
- genaiscript-sample@1.97.3
8019-
- genaiscript-web@1.97.3
8020-
- genaiscript@1.97.3
8017+
- genaiscript-core-internal@1.98.0
8018+
- genaiscript-sample@1.98.0
8019+
- genaiscript-web@1.98.0
8020+
- genaiscript@1.98.0
80218021
80228022
80238023
@@ -8737,7 +8737,7 @@ SOFTWARE.
87378737

87388738
The following npm package may be included in this product:
87398739

8740-
- @napi-rs/[email protected].65
8740+
- @napi-rs/[email protected].66
87418741

87428742
This package contains the following license:
87438743

@@ -9037,7 +9037,7 @@ SOFTWARE.
90379037

90389038
The following npm package may be included in this product:
90399039

9040-
- @lvce-editor/ripgrep@1.5.0
9040+
- @lvce-editor/ripgrep@1.6.0
90419041

90429042
This package contains the following license:
90439043

@@ -10603,7 +10603,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1060310603
The following npm packages may be included in this product:
1060410604

1060510605
- @octokit/[email protected]
10606-
- @octokit/graphql@8.1.2
10606+
- @octokit/graphql@8.2.0
1060710607
- @octokit/[email protected]
1060810608
- @octokit/[email protected]
1060910609

Diff for: docs/src/content/docs/reference/scripts/o-models.mdx

+4
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,7 @@ script({
3636

3737
- `o1-preview`, `o1-mini` do not support streaming
3838
- `o1` models do not support tool calling so GenAIScript uses [fallback tools](/genaiscript/reference/scripts/tools).
39+
40+
## Advice on prompting
41+
42+
OpenAI provides extensive [advice on prompting](https://platform.openai.com/docs/guides/reasoning#advice-on-prompting) for reasoning models.

Diff for: docs/src/content/docs/reference/scripts/system.mdx

+1
Original file line numberDiff line numberDiff line change
@@ -2749,6 +2749,7 @@ Base system prompt
27492749
system({ title: "Base system prompt" })
27502750
$`## Markdown Output
27512751
Respond in Markdown (GitHub Flavored Markdown also supported).`
2752+
if (/o3/.test(env.meta.model)) $`Formatting re-enabled.`
27522753
27532754
`````
27542755

Diff for: docs/yarn.lock

+136-136
Large diffs are not rendered by default.

Diff for: packages/cli/src/info.ts

+13-1
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,18 @@
44
* and resolving model connection info for specific scripts.
55
*/
66

7+
import { re } from "mathjs"
78
import { resolveLanguageModelConfigurations } from "../../core/src/config"
89
import { host, runtimeHost } from "../../core/src/host"
910
import {
1011
ModelConnectionInfo,
12+
resolveModelAlias,
1113
resolveModelConnectionInfo,
1214
} from "../../core/src/models"
1315
import { CORE_VERSION } from "../../core/src/version"
1416
import { YAMLStringify } from "../../core/src/yaml"
1517
import { buildProject } from "./build"
18+
import { kMaxLength } from "buffer"
1619

1720
/**
1821
* Outputs basic system information including node version, platform, architecture, and process ID.
@@ -92,5 +95,14 @@ export async function scriptModelInfo(
9295
}
9396

9497
export async function modelAliasesInfo() {
95-
console.log(YAML.stringify(runtimeHost.modelAliases))
98+
const res = Object.fromEntries(
99+
Object.entries(runtimeHost.modelAliases).map(([k, v]) => [
100+
k,
101+
{
102+
...v,
103+
resolved: resolveModelAlias(k).model,
104+
},
105+
])
106+
)
107+
console.log(YAMLStringify(res))
96108
}

Diff for: packages/cli/src/runtime.ts

+81
Original file line numberDiff line numberDiff line change
@@ -228,3 +228,84 @@ export async function cast(
228228
? { text, data: res.json }
229229
: { text, error: res.error?.message }
230230
}
231+
232+
/**
233+
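* Converts a PDF file to markdown one page at a time, prompting a model (default alias `ocr`) with each page's extracted text and rendered image.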
234+
* @param file PDF file to convert
235+
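* @param options PDF parsing and prompt options; `instructions` adds extra guidance to each page prompt and `ctx` overrides the generation context (defaults to `env.generator`)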
236+
* @returns the parsed `pages`, the rendered `images`, and the per-page `markdowns`
237+
*/
238+
export async function markdownifyPdf(
239+
file: WorkspaceFile,
240+
options?: PromptGeneratorOptions &
241+
Omit<ParsePDFOptions, "renderAsImage"> & {
242+
instructions?: string | PromptGenerator
243+
ctx?: ChatGenerationContext
244+
}
245+
) {
246+
const {
247+
ctx = env.generator,
248+
label = `markdownify PDF`,
249+
model = "ocr",
250+
responseType = "markdown",
251+
systemSafety = true,
252+
instructions,
253+
...rest
254+
} = options || {}
255+
256+
// extract text and render pages as images
257+
const { pages, images = [] } = await parsers.PDF(file, {
258+
...rest,
259+
renderAsImage: true,
260+
})
261+
const markdowns: string[] = []
262+
for (let i = 0; i < pages.length; ++i) {
263+
const page = pages[i]
264+
const image = images[i]
265+
// mix of text and vision
266+
const res = await ctx.runPrompt(
267+
async (_) => {
268+
const previousPages = markdowns.slice(-2).join("\n\n")
269+
if (previousPages.length) _.def("PREVIOUS_PAGES", previousPages)
270+
if (page) _.def("PAGE", page)
271+
if (image)
272+
_.defImages(image, { autoCrop: true, greyscale: true })
273+
_.$`You are an expert at converting PDFs to markdown.
274+
275+
## Task
276+
Your task is to analyze the image and extract textual content in markdown format.
277+
278+
The image is a screenshot of the current page in the PDF document.
279+
We used pdfjs-dist to extract the text of the current page in <PAGE>, use it to help with the conversion.
280+
The text from the previous pages is in <PREVIOUS_PAGES>, use it to ensure consistency in the conversion.
281+
282+
## Instructions
283+
- Ensure markdown text formatting for the extracted text is applied properly by analyzing the image.
284+
- Do not change any content in the original extracted text while applying markdown formatting and do not repeat the extracted text.
285+
- Preserve markdown text formatting if present such as horizontal lines, header levels, footers, bullet points, links/urls, or other markdown elements.
286+
- Extract source code snippets in code fences.
287+
- Do not omit any textual content from the markdown formatted extracted text.
288+
- Do not generate page breaks
289+
- Do not repeat the <PREVIOUS_PAGES> content.
290+
- Do not include any additional explanations or comments in the markdown formatted extracted text.
291+
`
292+
if (image)
293+
$`- For images, generate a short alt-text description.`
294+
if (typeof instructions === "string") _.$`${instructions}`
295+
else if (typeof instructions === "function")
296+
await instructions(_)
297+
},
298+
{
299+
...rest,
300+
model,
301+
label: `${label}: page ${i + 1}`,
302+
responseType,
303+
system: ["system", "system.assistant"],
304+
}
305+
)
306+
if (res.error) throw new Error(res.error?.message)
307+
markdowns.push(res.text)
308+
}
309+
310+
return { pages, images, markdowns }
311+
}

Diff for: packages/core/src/expander.ts

+4
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,10 @@ export async function expandTemplate(
230230
template.topLogprobs || 0
231231
)
232232

233+
// finalize options
234+
env.meta.model = model
235+
Object.freeze(env.meta)
236+
233237
trace.startDetails("💾 script")
234238

235239
traceEnv(model, trace, env)
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
system({ title: "Base system prompt" })
22
$`## Markdown Output
33
Respond in Markdown (GitHub Flavored Markdown also supported).`
4+
if (/o3/.test(env.meta.model)) $`Formatting re-enabled.`

Diff for: packages/core/src/llms.json

+4-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"large": "gpt-4o",
1313
"small": "gpt-4o-mini",
1414
"vision": "gpt-4o",
15+
"vision_small": "gpt-4o-mini",
1516
"embeddings": "text-embedding-3-small",
1617
"reasoning": "o1",
1718
"reasoning_small": "o1-mini",
@@ -38,6 +39,7 @@
3839
"large": "gpt-4o",
3940
"small": "gpt-4o-mini",
4041
"vision": "gpt-4o",
42+
"vision_small": "gpt-4o-mini",
4143
"reasoning": "o1",
4244
"reasoning_small": "o1-mini"
4345
}
@@ -236,10 +238,10 @@
236238
"agent": "large",
237239
"long": "large",
238240
"memory": "small",
239-
"reasoning_small": "reasoning",
240241
"classify": "small",
241242
"summarize": "small",
242-
"cast": "small"
243+
"cast": "small",
244+
"ocr": "vision_small"
243245
},
244246
"pricings": {
245247
"github:gpt-4o-2024-11-20": {

Diff for: packages/core/src/llms.ts

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ export function defaultModelConfigurations(): ModelConfigurations {
88
LARGE_MODEL_ID,
99
SMALL_MODEL_ID,
1010
VISION_MODEL_ID,
11+
"vision_small",
1112
"embeddings",
1213
"reasoning",
1314
"reasoning_small",

Diff for: packages/core/src/models.ts

+20-17
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import {
55
MODEL_PROVIDER_OPENAI,
66
} from "./constants"
77
import { errorMessage } from "./error"
8-
import { host, runtimeHost } from "./host"
8+
import { host, ModelConfiguration, runtimeHost } from "./host"
99
import { MarkdownTrace, TraceOptions } from "./trace"
1010
import { arrayify, assert, logVerbose, toStringList } from "./util"
1111
import { CancellationOptions } from "./cancellation"
@@ -119,6 +119,22 @@ export function traceLanguageModelConnection(
119119
}
120120
}
121121

122+
export function resolveModelAlias(model: string): ModelConfiguration {
123+
const { modelAliases } = runtimeHost
124+
const seen: string[] = []
125+
let res: ModelConfiguration = { model, source: "script" }
126+
while (modelAliases[res.model]) {
127+
let next = modelAliases[res.model]
128+
if (seen.includes(next.model))
129+
throw new Error(
130+
`Circular model alias: ${next.model}, seen ${[...seen].join(",")}`
131+
)
132+
seen.push(next.model)
133+
res = next
134+
}
135+
return res
136+
}
137+
122138
const resolvedModels = new Set<string>()
123139
export async function resolveModelConnectionInfo(
124140
conn: ModelConnectionOptions,
@@ -136,22 +152,9 @@ export async function resolveModelConnectionInfo(
136152
const hint = options?.model || conn.model
137153
// supports candidate if no model hint or hint is a model alias
138154
const supportsCandidates = !hint || !!modelAliases[hint]
139-
let modelId = hint || LARGE_MODEL_ID
140-
let candidates: string[]
141-
// recursively resolve model aliases
142-
{
143-
const seen: string[] = []
144-
while (modelAliases[modelId]) {
145-
const { model: id, candidates: c } = modelAliases[modelId]
146-
if (seen.includes(id))
147-
throw new Error(
148-
`Circular model alias: ${id}, seen ${[...seen].join(",")}`
149-
)
150-
seen.push(modelId)
151-
modelId = id
152-
if (supportsCandidates) candidates = c
153-
}
154-
}
155+
const resolved = resolveModelAlias(hint || LARGE_MODEL_ID)
156+
const modelId = resolved.model
157+
let candidates = supportsCandidates ? resolved.candidates : undefined
155158

156159
const resolveModel = async (
157160
model: string,

Diff for: packages/core/src/pdf.ts

+1
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ async function PDFTryParse(
168168

169169
// Iterate through each page and extract text content
170170
for (let i = 0; i < numPages; i++) {
171+
logVerbose(`pdf: extracting page ${i + 1}`)
171172
const page = await doc.getPage(1 + i) // 1-indexed
172173
const content = await page.getTextContent()
173174
const items: TextItem[] = content.items.filter(

Diff for: packages/core/src/promptrunner.ts

+8-10
Original file line numberDiff line numberDiff line change
@@ -85,16 +85,14 @@ async function resolveExpansionVars(
8585
}
8686

8787
// Create and return an object containing resolved variables
88-
const meta: PromptDefinition & ModelConnectionOptions = Object.freeze(
89-
structuredClone({
90-
id: template.id,
91-
title: template.title,
92-
description: template.description,
93-
group: template.group,
94-
model: template.model,
95-
defTools: template.defTools,
96-
})
97-
)
88+
const meta: PromptDefinition & ModelConnectionOptions = structuredClone({
89+
id: template.id,
90+
title: template.title,
91+
description: template.description,
92+
group: template.group,
93+
model: template.model,
94+
defTools: template.defTools,
95+
}) // frozen later
9896
const res = {
9997
dir: ".",
10098
files,

Diff for: packages/core/src/types/prompt_template.d.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1136,7 +1136,7 @@ interface ExpansionVariables {
11361136
output: OutputTrace
11371137

11381138
/**
1139-
* Metadata of the top-level prompt
1139+
* Resolved metadata
11401140
*/
11411141
meta: PromptDefinition & ModelConnectionOptions
11421142
}

Diff for: packages/sample/genaisrc/mdpdf.genai.mjs

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import { markdownifyPdf } from "genaiscript/runtime"
2+
script({
3+
files: "src/pdf/jacdac.pdf",
4+
})
5+
6+
const res = await markdownifyPdf(env.files[0])
7+
for (const md of res.markdowns) env.output.appendContent(md)

0 commit comments

Comments
 (0)