Skip to content

Commit e3159ed

Browse files
sinedied, shibbas
authored and committed
refactor(indexer): extract handlers
1 parent 703d278 commit e3159ed

File tree

2 files changed

+6
-7
lines changed

2 files changed

+6
-7
lines changed

packages/indexer/src/lib/document-processor.ts

+2 -7
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
import { type BaseLogger } from 'pino';
22
import { getBlobNameFromFile } from './blob-storage.js';
33
import { type ContentPage, type ContentSection, type Section } from './document.js';
4-
import { extractText, extractTextFromPdf } from './formats/index.js';
54

65
const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
76
const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
@@ -12,11 +11,7 @@ const SECTION_OVERLAP = 100;
1211
export class DocumentProcessor {
1312
formatHandlers = new Map<string, (data: Buffer) => Promise<ContentPage[]>>();
1413

15-
constructor(private logger: BaseLogger) {
16-
this.registerFormatHandler('text/plain', extractText);
17-
this.registerFormatHandler('text/markdown', extractText);
18-
this.registerFormatHandler('application/pdf', extractTextFromPdf);
19-
}
14+
constructor(private logger: BaseLogger) {}
2015

2116
async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) {
2217
const pages = await this.extractText(data, type);
@@ -25,7 +20,7 @@ export class DocumentProcessor {
2520
return { filename, type, category, sections };
2621
}
2722

28-
private registerFormatHandler(type: string, handler: (data: Buffer) => Promise<ContentPage[]>) {
23+
public registerFormatHandler(type: string, handler: (data: Buffer) => Promise<ContentPage[]>) {
2924
this.formatHandlers.set(type, handler);
3025
}
3126

packages/indexer/src/lib/indexer.ts

+4
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import { type AzureClients } from '../plugins/azure.js';
55
import { type OpenAiService } from '../plugins/openai.js';
66
import { wait } from './util/index.js';
77
import { DocumentProcessor } from './document-processor.js';
8+
import { extractText, extractTextFromPdf } from './formats/index.js';
89
import { MODELS_SUPPORTED_BATCH_SIZE } from './model-limits.js';
910
import { BlobStorage } from './blob-storage.js';
1011
import { type Section } from './document.js';
@@ -137,6 +138,9 @@ export class Indexer {
137138
}
138139

139140
const documentProcessor = new DocumentProcessor(this.logger);
141+
documentProcessor.registerFormatHandler('text/plain', extractText);
142+
documentProcessor.registerFormatHandler('text/markdown', extractText);
143+
documentProcessor.registerFormatHandler('application/pdf', extractTextFromPdf);
140144
const document = await documentProcessor.createDocumentFromFile(filename, data, type, category);
141145
const sections = document.sections;
142146
if (options.useVectors) {

0 commit comments

Comments
 (0)