1
1
import { type BaseLogger } from 'pino' ;
2
2
import { getBlobNameFromFile } from './blob-storage.js' ;
3
3
import { type ContentPage , type ContentSection , type Section } from './document.js' ;
4
- import { extractText , extractTextFromPdf } from './formats/index.js' ;
5
4
6
5
// Set of the characters '.', '!' and '?'.
// NOTE(review): presumably used to detect sentence boundaries when splitting text; the consuming code is not visible in this excerpt — confirm.
const SENTENCE_ENDINGS = new Set ( [ '.' , '!' , '?' ] ) ;
7
6
// Set of punctuation, bracket and whitespace characters (comma, semicolon, colon, space, brackets, tab, newline).
// NOTE(review): presumably used as fallback split points between words; the consuming code is not visible in this excerpt — confirm.
const WORD_BREAKS = new Set ( [ ',' , ';' , ':' , ' ' , '(' , ')' , '[' , ']' , '{' , '}' , '\t' , '\n' ] ) ;
@@ -12,11 +11,7 @@ const SECTION_OVERLAP = 100;
12
11
export class DocumentProcessor {
13
12
formatHandlers = new Map < string , ( data : Buffer ) => Promise < ContentPage [ ] > > ( ) ;
14
13
15
/**
 * Creates a processor that keeps the given logger as a private parameter property.
 *
 * The removed version also registered `extractText` for 'text/plain' and
 * 'text/markdown' and `extractTextFromPdf` for 'application/pdf'. The added
 * version registers nothing, so `formatHandlers` starts out as an empty Map
 * and handlers have to be added through `registerFormatHandler`.
 */
- constructor ( private logger : BaseLogger ) {
16
- this . registerFormatHandler ( 'text/plain' , extractText ) ;
17
- this . registerFormatHandler ( 'text/markdown' , extractText ) ;
18
- this . registerFormatHandler ( 'application/pdf' , extractTextFromPdf ) ;
19
- }
14
+ constructor ( private logger : BaseLogger ) { }
20
15
21
16
async createDocumentFromFile ( filename : string , data : Buffer , type : string , category : string ) {
22
17
const pages = await this . extractText ( data , type ) ;
@@ -25,7 +20,7 @@ export class DocumentProcessor {
25
20
return { filename, type, category, sections } ;
26
21
}
27
22
28
/**
 * Stores `handler` in `formatHandlers` under the key `type`.
 * Map.set replaces any handler previously stored for the same key.
 *
 * The visibility changes from `private` to `public` in this diff, so code
 * outside the class can register handlers.
 *
 * @param type - Key the handler is stored under. NOTE(review): the removed
 *   constructor passed MIME types such as 'text/plain' — confirm callers do the same.
 * @param handler - Async function that turns a Buffer into an array of ContentPage.
 */
- private registerFormatHandler ( type : string , handler : ( data : Buffer ) => Promise < ContentPage [ ] > ) {
23
+ public registerFormatHandler ( type : string , handler : ( data : Buffer ) => Promise < ContentPage [ ] > ) {
29
24
this . formatHandlers . set ( type , handler ) ;
30
25
}
31
26
0 commit comments