pdf markdown runtime helper (#1082)

pelikhan · web-flow · commit 372b10a5774b · 2025-02-02T22:30:31.000-08:00
* added helper

* increase horizon

* more ocr work

* added logging of pdf rendering

* support formatting re-enabled for 03 models

* link docs

* adding vision small models

* refresh lockfiles
diff --git a/THIRD_PARTY_LICENSES.md b/THIRD_PARTY_LICENSES.md
@@ -2862,7 +2862,7 @@ MIT License
 
 The following npm package may be included in this product:
 
- - genaiscript-vscode@1.97.3
+ - genaiscript-vscode@1.98.0
 
 This package contains the following license:
 
@@ -6447,20 +6447,20 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 The following npm packages may be included in this product:
 
- - @inquirer/checkbox@4.0.7
- - @inquirer/confirm@5.1.4
- - @inquirer/core@10.1.5
- - @inquirer/editor@4.2.4
- - @inquirer/expand@4.0.7
+ - @inquirer/checkbox@4.1.1
+ - @inquirer/confirm@5.1.5
+ - @inquirer/core@10.1.6
+ - @inquirer/editor@4.2.6
+ - @inquirer/expand@4.0.8
  - @inquirer/figures@1.0.10
- - @inquirer/input@4.1.4
- - @inquirer/number@3.0.7
- - @inquirer/password@4.0.7
- - @inquirer/prompts@7.2.4
- - @inquirer/rawlist@4.0.7
- - @inquirer/search@3.0.7
- - @inquirer/select@4.0.7
- - @inquirer/type@3.0.3
+ - @inquirer/input@4.1.5
+ - @inquirer/number@3.0.8
+ - @inquirer/password@4.0.8
+ - @inquirer/prompts@7.3.1
+ - @inquirer/rawlist@4.0.8
+ - @inquirer/search@3.0.8
+ - @inquirer/select@4.0.8
+ - @inquirer/type@3.0.4
 
 These packages each contain the following license:
 
@@ -8009,15 +8009,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 The following npm packages may be included in this product:
 
  - @esbuild/linux-x64@0.23.1
- - @napi-rs/canvas-linux-x64-gnu@0.1.65
- - @napi-rs/canvas-linux-x64-musl@0.1.65
+ - @napi-rs/canvas-linux-x64-gnu@0.1.66
+ - @napi-rs/canvas-linux-x64-musl@0.1.66
  - @tokenizer/token@0.3.0
  - agent-base@6.0.2
  - eastasianwidth@0.2.0
- - genaiscript-core-internal@1.97.3
- - genaiscript-sample@1.97.3
- - genaiscript-web@1.97.3
- - genaiscript@1.97.3
+ - genaiscript-core-internal@1.98.0
+ - genaiscript-sample@1.98.0
+ - genaiscript-web@1.98.0
+ - genaiscript@1.98.0
  - get-port@1.0.0
  - https-proxy-agent@5.0.1
  - isarray@1.0.0
@@ -8737,7 +8737,7 @@ SOFTWARE.
 
 The following npm package may be included in this product:
 
- - @napi-rs/canvas@0.1.65
+ - @napi-rs/canvas@0.1.66
 
 This package contains the following license:
 
@@ -9037,7 +9037,7 @@ SOFTWARE.
 
 The following npm package may be included in this product:
 
- - @lvce-editor/ripgrep@1.5.0
+ - @lvce-editor/ripgrep@1.6.0
 
 This package contains the following license:
 
@@ -10603,7 +10603,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 The following npm packages may be included in this product:
 
  - @octokit/endpoint@10.1.2
- - @octokit/graphql@8.1.2
+ - @octokit/graphql@8.2.0
  - @octokit/plugin-throttling@9.4.0
  - @octokit/request@9.2.0
 
diff --git a/docs/src/content/docs/reference/scripts/o-models.mdx b/docs/src/content/docs/reference/scripts/o-models.mdx
@@ -36,3 +36,7 @@ script({
 
 - `o1-preview`, `o1-mini` do not support streaming
 - `o1` models do not support tool calling so GenAIScript uses [fallback tools](/genaiscript/reference/scripts/tools).
+
+## Advice on prompting
+
+OpenAI provides extensive [advice on prompting](https://platform.openai.com/docs/guides/reasoning#advice-on-prompting) for reasoning models.
diff --git a/docs/src/content/docs/reference/scripts/system.mdx b/docs/src/content/docs/reference/scripts/system.mdx
@@ -2749,6 +2749,7 @@ Base system prompt
 system({ title: "Base system prompt" })
 $`## Markdown Output
 Respond in Markdown (GitHub Flavored Markdown also supported).`
+if (/o3/.test(env.meta.model)) $`Formatting re-enabled.`
 
 `````
 
diff --git a/docs/yarn.lock b/docs/yarn.lock
diff --git a/packages/cli/src/info.ts b/packages/cli/src/info.ts
@@ -4,15 +4,18 @@
  * and resolving model connection info for specific scripts.
  */
 
+import { re } from "mathjs"
 import { resolveLanguageModelConfigurations } from "../../core/src/config"
 import { host, runtimeHost } from "../../core/src/host"
 import {
     ModelConnectionInfo,
+    resolveModelAlias,
     resolveModelConnectionInfo,
 } from "../../core/src/models"
 import { CORE_VERSION } from "../../core/src/version"
 import { YAMLStringify } from "../../core/src/yaml"
 import { buildProject } from "./build"
+import { kMaxLength } from "buffer"
 
 /**
  * Outputs basic system information including node version, platform, architecture, and process ID.
@@ -92,5 +95,14 @@ export async function scriptModelInfo(
 }
 
 export async function modelAliasesInfo() {
-    console.log(YAML.stringify(runtimeHost.modelAliases))
+    const res = Object.fromEntries(
+        Object.entries(runtimeHost.modelAliases).map(([k, v]) => [
+            k,
+            {
+                ...v,
+                resolved: resolveModelAlias(k).model,
+            },
+        ])
+    )
+    console.log(YAMLStringify(res))
 }
diff --git a/packages/cli/src/runtime.ts b/packages/cli/src/runtime.ts
@@ -228,3 +228,84 @@ export async function cast(
         ? { text, data: res.json }
         : { text, error: res.error?.message }
 }
+
+/**
+ * Converts a PDF to markdown one page at a time: each page's extracted text and rendered image are sent to the model (default alias `ocr`) together with the last two converted pages.
+ * @param file PDF file to convert
+ * @param options PDF parsing and prompt options; `instructions` appends extra guidance to every page prompt, `ctx` replaces `env.generator`
+ * @returns the parsed `pages` and `images`, plus `markdowns` with one entry per page; throws if a page prompt reports an error
+ */
+export async function markdownifyPdf(
+    file: WorkspaceFile,
+    options?: PromptGeneratorOptions &
+        Omit<ParsePDFOptions, "renderAsImage"> & {
+            instructions?: string | PromptGenerator
+            ctx?: ChatGenerationContext
+        }
+) {
+    const {
+        ctx = env.generator,
+        label = `markdownify PDF`,
+        model = "ocr",
+        responseType = "markdown",
+        systemSafety = true,
+        instructions,
+        ...rest
+    } = options || {}
+
+    // extract text and render pages as images
+    const { pages, images = [] } = await parsers.PDF(file, {
+        ...rest,
+        renderAsImage: true,
+    })
+    const markdowns: string[] = []
+    for (let i = 0; i < pages.length; ++i) {
+        const page = pages[i] // text extracted from this page
+        const image = images[i] // rendered page image; may be missing
+        const res = await ctx.runPrompt(
+            async (_) => {
+                const previousPages = markdowns.slice(-2).join("\n\n")
+                if (previousPages.length) _.def("PREVIOUS_PAGES", previousPages)
+                if (page) _.def("PAGE", page)
+                if (image)
+                    _.defImages(image, { autoCrop: true, greyscale: true })
+                _.$`You are an expert at converting PDFs to markdown.
+                
+                ## Task
+                Your task is to analyze the image and extract textual content in markdown format.
+
+                The image is a screenshot of the current page in the PDF document.
+                We used pdfjs-dist to extract the text of the current page in <PAGE>, use it to help with the conversion.
+                The text from the previous pages is in <PREVIOUS_PAGES>, use it to ensure consistency in the conversion.
+
+                ## Instructions
+                - Ensure markdown text formatting for the extracted text is applied properly by analyzing the image.
+                - Do not change any content in the original extracted text while applying markdown formatting and do not repeat the extracted text.
+                - Preserve markdown text formatting if present such as horizontal lines, header levels, footers, bullet points, links/urls, or other markdown elements.
+                - Extract source code snippets in code fences.
+                - Do not omit any textual content from the markdown formatted extracted text.
+                - Do not generate page breaks
+                - Do not repeat the <PREVIOUS_PAGES> content.
+                - Do not include any additional explanations or comments in the markdown formatted extracted text.
+                `
+                if (image)
+                    _.$`- For images, generate a short alt-text description.`
+                if (typeof instructions === "string") _.$`${instructions}`
+                else if (typeof instructions === "function")
+                    await instructions(_)
+            },
+            {
+                ...rest,
+                model,
+                label: `${label}: page ${i + 1}`,
+                responseType,
+                systemSafety,
+                system: ["system", "system.assistant"],
+            }
+        )
+        if (res.error) throw new Error(res.error?.message)
+        markdowns.push(res.text)
+    }
+
+    return { pages, images, markdowns }
+}
diff --git a/packages/core/src/expander.ts b/packages/core/src/expander.ts
@@ -230,6 +230,10 @@ export async function expandTemplate(
         template.topLogprobs || 0
     )
 
+    // finalize options
+    env.meta.model = model
+    Object.freeze(env.meta)
+
     trace.startDetails("💾 script")
 
     traceEnv(model, trace, env)
diff --git a/packages/core/src/genaisrc/system.output_markdown.genai.js b/packages/core/src/genaisrc/system.output_markdown.genai.js
@@ -1,3 +1,4 @@
 system({ title: "Base system prompt" })
 $`## Markdown Output
 Respond in Markdown (GitHub Flavored Markdown also supported).`
+if (/o3/.test(env.meta.model)) $`Formatting re-enabled.`
diff --git a/packages/core/src/llms.json b/packages/core/src/llms.json
@@ -12,6 +12,7 @@
                 "large": "gpt-4o",
                 "small": "gpt-4o-mini",
                 "vision": "gpt-4o",
+                "vision_small": "gpt-4o-mini",
                 "embeddings": "text-embedding-3-small",
                 "reasoning": "o1",
                 "reasoning_small": "o1-mini",
@@ -38,6 +39,7 @@
                 "large": "gpt-4o",
                 "small": "gpt-4o-mini",
                 "vision": "gpt-4o",
+                "vision_small": "gpt-4o-mini",
                 "reasoning": "o1",
                 "reasoning_small": "o1-mini"
             }
@@ -236,10 +238,10 @@
         "agent": "large",
         "long": "large",
         "memory": "small",
-        "reasoning_small": "reasoning",
         "classify": "small",
         "summarize": "small",
-        "cast": "small"
+        "cast": "small",
+        "ocr": "vision_small"
     },
     "pricings": {
         "github:gpt-4o-2024-11-20": {
diff --git a/packages/core/src/llms.ts b/packages/core/src/llms.ts
@@ -8,6 +8,7 @@ export function defaultModelConfigurations(): ModelConfigurations {
         LARGE_MODEL_ID,
         SMALL_MODEL_ID,
         VISION_MODEL_ID,
+        "vision_small",
         "embeddings",
         "reasoning",
         "reasoning_small",
diff --git a/packages/core/src/models.ts b/packages/core/src/models.ts
@@ -5,7 +5,7 @@ import {
     MODEL_PROVIDER_OPENAI,
 } from "./constants"
 import { errorMessage } from "./error"
-import { host, runtimeHost } from "./host"
+import { host, ModelConfiguration, runtimeHost } from "./host"
 import { MarkdownTrace, TraceOptions } from "./trace"
 import { arrayify, assert, logVerbose, toStringList } from "./util"
 import { CancellationOptions } from "./cancellation"
@@ -119,6 +119,22 @@ export function traceLanguageModelConnection(
     }
 }
 
+/** Follows model aliases starting at `model` until a non-alias id is reached; throws on a circular chain. */
+export function resolveModelAlias(model: string): ModelConfiguration {
+    const { modelAliases } = runtimeHost
+    const seen = [model] // seeded so a loop back to the start is caught at once
+    let res: ModelConfiguration = { model, source: "script" }
+    for (let hop = modelAliases[model]; hop; hop = modelAliases[hop.model]) {
+        if (seen.includes(hop.model))
+            throw new Error(
+                `Circular model alias: ${hop.model}, seen ${seen.join(",")}`
+            )
+        seen.push(hop.model)
+        res = hop
+    }
+    return res
+}
+
 const resolvedModels = new Set<string>()
 export async function resolveModelConnectionInfo(
     conn: ModelConnectionOptions,
@@ -136,22 +152,9 @@ export async function resolveModelConnectionInfo(
     const hint = options?.model || conn.model
     // supports candidate if no model hint or hint is a model alias
     const supportsCandidates = !hint || !!modelAliases[hint]
-    let modelId = hint || LARGE_MODEL_ID
-    let candidates: string[]
-    // recursively resolve model aliases
-    {
-        const seen: string[] = []
-        while (modelAliases[modelId]) {
-            const { model: id, candidates: c } = modelAliases[modelId]
-            if (seen.includes(id))
-                throw new Error(
-                    `Circular model alias: ${id}, seen ${[...seen].join(",")}`
-                )
-            seen.push(modelId)
-            modelId = id
-            if (supportsCandidates) candidates = c
-        }
-    }
+    const resolved = resolveModelAlias(hint || LARGE_MODEL_ID)
+    const modelId = resolved.model
+    let candidates = supportsCandidates ? resolved.candidates : undefined
 
     const resolveModel = async (
         model: string,
diff --git a/packages/core/src/pdf.ts b/packages/core/src/pdf.ts
@@ -168,6 +168,7 @@ async function PDFTryParse(
 
         // Iterate through each page and extract text content
         for (let i = 0; i < numPages; i++) {
+            logVerbose(`pdf: extracting page ${i + 1}`)
             const page = await doc.getPage(1 + i) // 1-indexed
             const content = await page.getTextContent()
             const items: TextItem[] = content.items.filter(
diff --git a/packages/core/src/promptrunner.ts b/packages/core/src/promptrunner.ts
@@ -85,16 +85,14 @@ async function resolveExpansionVars(
     }
 
     // Create and return an object containing resolved variables
-    const meta: PromptDefinition & ModelConnectionOptions = Object.freeze(
-        structuredClone({
-            id: template.id,
-            title: template.title,
-            description: template.description,
-            group: template.group,
-            model: template.model,
-            defTools: template.defTools,
-        })
-    )
+    const meta: PromptDefinition & ModelConnectionOptions = structuredClone({
+        id: template.id,
+        title: template.title,
+        description: template.description,
+        group: template.group,
+        model: template.model,
+        defTools: template.defTools,
+    }) // frozen later
     const res = {
         dir: ".",
         files,
diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts
@@ -1136,7 +1136,7 @@ interface ExpansionVariables {
     output: OutputTrace
 
     /**
-     * Metadata of the top-level prompt
+     * Resolved metadata
      */
     meta: PromptDefinition & ModelConnectionOptions
 }
diff --git a/packages/sample/genaisrc/mdpdf.genai.mjs b/packages/sample/genaisrc/mdpdf.genai.mjs
@@ -0,0 +1,7 @@
+import { markdownifyPdf } from "genaiscript/runtime"
+script({
+    files: "src/pdf/jacdac.pdf",
+})
+
+const res = await markdownifyPdf(env.files[0])
+for (const md of res.markdowns) env.output.appendContent(md)
diff --git a/slides/yarn.lock b/slides/yarn.lock
diff --git a/yarn.lock b/yarn.lock

-Original file line number
+Diff line change
 The following npm package may be included in this product:
 - - genaiscript-vscode@1.97.3
 + - genaiscript-vscode@1.98.0
 This package contains the following license:
 The following npm packages may be included in this product:
 - - @inquirer/checkbox@4.0.7
 - - @inquirer/confirm@5.1.4
 - - @inquirer/core@10.1.5
 - - @inquirer/editor@4.2.4
 - - @inquirer/expand@4.0.7
 + - @inquirer/checkbox@4.1.1
 + - @inquirer/confirm@5.1.5
 + - @inquirer/core@10.1.6
 + - @inquirer/editor@4.2.6
 + - @inquirer/expand@4.0.8
  - @inquirer/figures@1.0.10
 - - @inquirer/input@4.1.4
 - - @inquirer/number@3.0.7
 - - @inquirer/password@4.0.7
 - - @inquirer/prompts@7.2.4
 - - @inquirer/rawlist@4.0.7
 - - @inquirer/search@3.0.7
 - - @inquirer/select@4.0.7
 - - @inquirer/type@3.0.3
 + - @inquirer/input@4.1.5
 + - @inquirer/number@3.0.8
 + - @inquirer/password@4.0.8
 + - @inquirer/prompts@7.3.1
 + - @inquirer/rawlist@4.0.8
 + - @inquirer/search@3.0.8
 + - @inquirer/select@4.0.8
 + - @inquirer/type@3.0.4
 These packages each contain the following license:
 The following npm packages may be included in this product:
  - @esbuild/linux-x64@0.23.1
 - - @napi-rs/canvas-linux-x64-gnu@0.1.65
 - - @napi-rs/canvas-linux-x64-musl@0.1.65
 + - @napi-rs/canvas-linux-x64-gnu@0.1.66
 + - @napi-rs/canvas-linux-x64-musl@0.1.66
  - @tokenizer/token@0.3.0
  - agent-base@6.0.2
  - eastasianwidth@0.2.0
 - - genaiscript-core-internal@1.97.3
 - - genaiscript-sample@1.97.3
 - - genaiscript-web@1.97.3
 - - genaiscript@1.97.3
 + - genaiscript-core-internal@1.98.0
 + - genaiscript-sample@1.98.0
 + - genaiscript-web@1.98.0
 + - genaiscript@1.98.0
  - get-port@1.0.0
  - https-proxy-agent@5.0.1
  - isarray@1.0.0
 The following npm package may be included in this product:
 - - @napi-rs/canvas@0.1.65
 + - @napi-rs/canvas@0.1.66
 This package contains the following license:
 The following npm package may be included in this product:
 - - @lvce-editor/ripgrep@1.5.0
 + - @lvce-editor/ripgrep@1.6.0
 This package contains the following license:
 The following npm packages may be included in this product:
  - @octokit/endpoint@10.1.2
 - - @octokit/graphql@8.1.2
 + - @octokit/graphql@8.2.0
  - @octokit/plugin-throttling@9.4.0
  - @octokit/request@9.2.0
Original file line number	Diff line number	Diff line change
`@@ -230,6 +230,10 @@ export async function expandTemplate(`
`230`	`230`	`template.topLogprobs \|\| 0`
`231`	`231`	`)`
`232`	`232`
	`233`	`+ // finalize options`
	`234`	`+ env.meta.model = model`
	`235`	`+ Object.freeze(env.meta)`
	`236`	`+`
`233`	`237`	`trace.startDetails("💾 script")`
`234`	`238`
`235`	`239`	`traceEnv(model, trace, env)`
Original file line number	Diff line number	Diff line change
`@@ -1136,7 +1136,7 @@ interface ExpansionVariables {`
`1136`	`1136`	`output: OutputTrace`
`1137`	`1137`
`1138`	`1138`	`/**`
`1139`		`- * Metadata of the top-level prompt`
	`1139`	`+ * Resolved metadata`
`1140`	`1140`	`*/`
`1141`	`1141`	`meta: PromptDefinition & ModelConnectionOptions`
`1142`	`1142`	`}`