microsoft · pelikhan · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025
diff --git a/docs/src/content/docs/reference/scripts/pdf.md b/docs/src/content/docs/reference/scripts/pdf.md
@@ -33,19 +33,28 @@
 // or analyze page per page, filter pages
 pages.slice(0, 2).forEach((page, i) => {
    def(`PAGE_${i}`, page)
 })
 ```
 
-## Rendering to images
+## Images and figures
+
+GenAIScript automatically extracts bitmap images from PDFs and stores them in the data array. You can use these images to generate prompts. The image are encoded as PNG and may be large.
+
+```js
+const { data } = await parsers.PDF(env.files[0])
+```
+
+## Rendering pages to images
 
 Add the `renderAsImage` option to also reach each page to a PNG image (as a buffer). This buffer can be used with a vision model to perform
 an OCR operation.
 
 ```js wrap
-const { images } = await parsers.PDF(env.files[0], 
-  { renderAsImage: true })
+const { images } = await parsers.PDF(env.files[0], { renderAsImage: true })
 ```
 
+You can control the quality of the rendered image using the `scale` parameter (default is 3).
+
 ## PDFs are messy
 
 The PDF format was never really meant to allow for clean text extraction. The `parsers.PDF` function uses the `pdf-parse` package to extract text from PDFs. This package is not perfect and may fail to extract text from some PDFs. If you have access to the original document, it is recommended to use a more text-friendly format such as markdown or plain text.
diff --git a/packages/core/src/constants.ts b/packages/core/src/constants.ts
@@ -243,6 +243,8 @@ export const MODEL_PRICINGS = Object.freeze<
 export const NEW_SCRIPT_TEMPLATE = `$\`Write a short poem in code.\`
 `
 export const PDF_SCALE = 4
+export const PDF_HASH_LENGTH = 12
+
 export const PDF_MIME_TYPE = "application/pdf"
 export const DOCX_MIME_TYPE =
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
@@ -358,4 +360,6 @@ export const SCHEMA_DEFAULT_FORMAT = "json"
 export const THINK_REGEX = /<think>(.*?)($|<\/think>)/gis
 
 export const MAX_FILE_CONTENT_SIZE = 1024 * 1024 * 2 // 2MB
-export const TEST_CSV_ENTRY_SEPARATOR = /[;|\n]/g
+export const TEST_CSV_ENTRY_SEPARATOR = /[;|\n]/g
+
+export const INVALID_FILENAME_REGEX = /[<>:"/\\|?*\x00-\x1F]+/g
diff --git a/packages/core/src/filecache.ts b/packages/core/src/filecache.ts
@@ -7,8 +7,8 @@ import { stat, writeFile } from "fs/promises"
 import { ensureDir } from "fs-extra"
 
 export async function fileWriteCached(
-    bufferLike: BufferLike,
     dir: string,
+    bufferLike: BufferLike,
     options?: TraceOptions
 ): Promise<string> {
     const bytes = await resolveBufferLike(bufferLike, options)

diff --git a/packages/core/src/parsers.ts b/packages/core/src/parsers.ts
@@ -93,7 +93,7 @@ export async function createParsers(options: {
             }
         },
         PDF: async (file, options) => {
-            if (!file) return { file: undefined, pages: [] }
+            if (!file) return { file: undefined, pages: [], data: [] }
             const opts = {
                 ...(options || {}),
                 trace,
@@ -107,6 +107,7 @@ export async function createParsers(options: {
                 },
                 pages: pages?.map((p) => p.content),
                 images: pages?.map((p) => p.image),
+                data: pages,
             }
         },
         code: async (file, query) => {

diff --git a/packages/core/src/pdf.ts b/packages/core/src/pdf.ts
@@ -4,9 +4,15 @@ import { host } from "./host"
 import { TraceOptions } from "./trace"
 import os from "os"
 import { serializeError } from "./error"
-import { logVerbose, logWarn } from "./util"
-import { PDF_SCALE } from "./constants"
+import { dotGenaiscriptPath, logVerbose, logWarn } from "./util"
+import { INVALID_FILENAME_REGEX, PDF_HASH_LENGTH, PDF_SCALE } from "./constants"
 import { resolveGlobal } from "./globals"
+import { isUint8Array, isUint8ClampedArray } from "util/types"
+import { hash } from "./crypto"
+import { join } from "path"
+import { readFile, writeFile } from "fs/promises"
+import { ensureDir } from "fs-extra"
+import { YAMLStringify } from "./yaml"
 
 let standardFontDataUrl: string
 
@@ -124,10 +130,26 @@ function installPromiseWithResolversShim() {
         })
 }
 
-export interface PDFPage {
-    index: number
-    content: string
-    image?: Buffer
+enum ImageKind {
+    GRAYSCALE_1BPP = 1,
+    RGB_24BPP = 2,
+    RGBA_32BPP = 3,
+}
+
+async function computeHashFolder(
+    filename: string | WorkspaceFile,
+    options: TraceOptions & ParsePDFOptions & { content?: Uint8Array }
+) {
+    const { trace, content, ...rest } = options
+    const h = await hash(
+        [typeof filename === "string" ? { filename } : filename, content, rest],
+        {
+            readWorkspaceFiles: true,
+            version: true,
+            length: PDF_HASH_LENGTH,
+        }
+    )
+    return dotGenaiscriptPath("cache", "pdf", h)
 }
 
 /**
@@ -147,13 +169,42 @@ async function PDFTryParse(
         trace,
         renderAsImage,
         scale = PDF_SCALE,
+        cache,
     } = options || {}
 
+    const folder = await computeHashFolder(fileOrUrl, {
+        content,
+        ...(options || {}),
+    })
+    const resFilename = join(folder, "res.json")
+    const readCache = async () => {
+        if (cache === false) return undefined
+        try {
+            const res = JSON.parse(
+                await readFile(resFilename, {
+                    encoding: "utf-8",
+                })
+            )
+            logVerbose(`pdf: cache hit at ${folder}`)
+            return res
+        } catch {
+            return undefined
+        }
+    }
+
+    {
+        // try cache hit
+        const cached = await readCache()
+        if (cached) return cached
+    }
+
+    logVerbose(`pdf: decoding ${fileOrUrl || ""} in ${folder}`)
+    trace?.itemValue(`pdf: decoding ${fileOrUrl || ""}`, folder)
+    await ensureDir(folder)
     try {
         const pdfjs = await tryImportPdfjs(options)
-        const createCanvas = renderAsImage ? await tryImportCanvas() : undefined
+        const createCanvas = await tryImportCanvas()
         const { getDocument } = pdfjs
-        // Read data from file or use provided content
         const data = content || (await host.readFile(fileOrUrl))
         const loader = await getDocument({
             data,
@@ -168,9 +219,6 @@ async function PDFTryParse(
 
         // Iterate through each page and extract text content
         for (let i = 0; i < numPages; i++) {
-            logVerbose(
-                `pdf: extracting ${fileOrUrl || ""} page ${i + 1} / ${numPages}`
-            )
             const page = await doc.getPage(1 + i) // 1-indexed
             const content = await page.getTextContent()
             const items: TextItem[] = content.items.filter(
@@ -187,9 +235,11 @@ async function PDFTryParse(
                 index: i + 1,
                 content: lines.join("\n"),
             }
+
+            await writeFile(join(folder, `page_${p.index}.txt`), p.content)
             pages.push(p)
 
-            if (createCanvas) {
+            if (createCanvas && renderAsImage) {
                 const viewport = page.getViewport({ scale })
                 const canvas = await createCanvas(
                     viewport.width,
@@ -202,15 +252,118 @@ async function PDFTryParse(
                 })
                 await render.promise
                 const buffer = canvas.toBufferSync("png")
-                p.image = buffer
+                p.image = join(folder, `page_${i + 1}.png`)
+                await writeFile(p.image, buffer)
+            }
+
+            const opList = await page.getOperatorList()
+            const figures: PDFPageImage[] = []
+            for (let j = 0; j < opList.fnArray.length; j++) {
+                const fn = opList.fnArray[j]
+                const args = opList.argsArray[j]
+                if (fn === pdfjs.OPS.paintImageXObject && args) {
+                    const imageObj = args[0]
+                    if (imageObj) {
+                        const img = await new Promise<any>(
+                            (resolve, reject) => {
+                                page.objs.get(imageObj, (r: any) => {
+                                    resolve(r)
+                                })
+                            }
+                        )
+                        const fig = await decodeImage(
+                            p.index,
+                            img,
+                            createCanvas,
+                            imageObj,
+                            folder
+                        )
+                        if (fig) figures.push(fig)
+                    }
+                }
             }
+            p.figures = figures
+
+            logVerbose(
+                `pdf: extracted ${fileOrUrl || ""} page ${i + 1} / ${numPages}, ${p.figures.length ? `${p.figures.length} figures` : ""}`
+            )
         }
-        return { ok: true, pages }
+
+        const res = { ok: true, pages, content: PDFPagesToString(pages) }
+        await writeFile(join(folder, "content.txt"), res.content)
+        await writeFile(resFilename, JSON.stringify(res))
+        return res
     } catch (error) {
+        {
+            // try cache hit
+            const cached = await readCache()
+            if (cached) return cached
+        }
+
         logVerbose(error)
         trace?.error(`reading pdf`, error) // Log error if tracing is enabled
+        await writeFile(
+            join(folder, "error.txt"),
+            YAMLStringify(serializeError(error))
+        )
         return { ok: false, error: serializeError(error) }
     }
+
+    async function decodeImage(
+        pageIndex: number,
+        img: {
+            data: Uint8Array | Uint8ClampedArray
+            width: number
+            height: number
+            kind: ImageKind
+        },
+        createCanvas: (w: number, h: number) => any,
+        imageObj: any,
+        folder: string
+    ) {
+        if (!isUint8ClampedArray(img?.data) && !isUint8Array(img?.data))
+            return undefined
+
+        const { width, height, data: _data, kind } = img
+        const imageData = new ImageData(width, height)
+        for (let y = 0; y < height; y++) {
+            for (let x = 0; x < width; x++) {
+                const dstIdx = (y * width + x) * 4
+                imageData.data[dstIdx + 3] = 255 // A
+                if (kind === ImageKind.GRAYSCALE_1BPP) {
+                    const srcIdx = y * width + x
+                    imageData.data[dstIdx + 0] = _data[srcIdx] // B
+                    imageData.data[dstIdx + 1] = _data[srcIdx] // G
+                    imageData.data[dstIdx + 2] = _data[srcIdx] // R
+                } else {
+                    const srcIdx =
+                        (y * width + x) *
+                        (kind === ImageKind.RGBA_32BPP ? 4 : 3)
+                    imageData.data[dstIdx + 0] = _data[srcIdx] // B
+                    imageData.data[dstIdx + 1] = _data[srcIdx + 1] // G
+                    imageData.data[dstIdx + 2] = _data[srcIdx + 2] // R
+                }
+            }
+        }
+        const canvas = await createCanvas(width, height)
+        const ctx = canvas.getContext("2d")
+        ctx.putImageData(imageData, 0, 0)
+        const buffer = canvas.toBufferSync("png")
+        const fn = join(
+            folder,
+            `page-${pageIndex}-${imageObj.replace(INVALID_FILENAME_REGEX, "")}.png`
+        )
+        await writeFile(fn, buffer)
+
+        return {
+            id: imageObj,
+            width,
+            height,
+            type: "image/png",
+            size: buffer.length,
+            filename: fn,
+        } satisfies PDFPageImage
+    }
 }
 
 /**
@@ -234,7 +387,6 @@ export async function parsePdf(
     filenameOrBuffer: string | Uint8Array,
     options?: ParsePDFOptions & TraceOptions
 ): Promise<{ pages: PDFPage[]; content: string }> {
-    const { filter } = options || {}
     const filename =
         typeof filenameOrBuffer === "string" ? filenameOrBuffer : undefined
     const bytes =
@@ -243,10 +395,6 @@ export async function parsePdf(
             : (filenameOrBuffer as Uint8Array)
     let { pages, ok } = await PDFTryParse(filename, bytes, options)
     if (!ok) return { pages: [], content: "" }
-
-    // Apply filter if provided
-    if (filter)
-        pages = pages.filter((page, index) => filter(index, page.content))
     const content = PDFPagesToString(pages)
     return { pages, content }
 }

diff --git a/packages/core/src/promptcontext.ts b/packages/core/src/promptcontext.ts
@@ -74,7 +74,7 @@ export async function createPromptContext(
                 scope === "run"
                     ? join(runDir, "files")
                     : dotGenaiscriptPath("cache", "files")
-            return await fileWriteCached(f, dir)
+            return await fileWriteCached(dir, f)
         },
         copyFile: (src, dest) => runtimeHost.workspace.copyFile(src, dest),
         cache: (n) => runtimeHost.workspace.cache(n),