-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathawsTextract.ts
58 lines (50 loc) · 1.55 KB
/
awsTextract.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import { TextractClient, AnalyzeDocumentCommand } from '@aws-sdk/client-textract';
import { ModelProvider } from './base';
// Pricing reference: https://aws.amazon.com/textract/pricing/
// Layout model: $4 per 1000 pages for the first 1M pages.
const LAYOUT_USD_PER_BATCH = 4;
const PAGES_PER_BATCH = 1000;
/** Cost in USD attributed to a single analysed page. */
const COST_PER_PAGE = LAYOUT_USD_PER_BATCH / PAGES_PER_BATCH;
/**
 * OCR provider backed by the AWS Textract `AnalyzeDocument` API with the
 * `LAYOUT` feature enabled.
 *
 * Region and credentials are read from the `AWS_REGION`, `AWS_ACCESS_KEY_ID`
 * and `AWS_SECRET_ACCESS_KEY` environment variables at construction time.
 */
export class AWSTextractProvider extends ModelProvider {
  private readonly client: TextractClient;

  constructor() {
    super('aws-textract');
    this.client = new TextractClient({
      region: process.env.AWS_REGION,
      credentials: {
        accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
        secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
      },
    });
  }

  /**
   * Downloads the image and runs it through Textract.
   *
   * @param imagePath - URL of the image to analyse; it is retrieved with `fetch`.
   * @returns The recognised text (one detected line per output line) together
   *   with usage data: `duration` is the wall-clock time of the Textract call
   *   in milliseconds (the download is not included) and `totalCost` is the
   *   flat per-page price, since the input is always treated as one page.
   * @throws Error when the image cannot be downloaded (non-2xx response);
   *   any error raised by `fetch` or the Textract client is logged and rethrown.
   */
  async ocr(imagePath: string) {
    try {
      // Download the image bytes that will be sent inline to Textract.
      const response = await fetch(imagePath);
      if (!response.ok) {
        // Without this check an HTTP error page would be submitted as the document.
        throw new Error(
          `Failed to fetch image (${response.status} ${response.statusText}): ${imagePath}`,
        );
      }
      const buffer = Buffer.from(await response.arrayBuffer());

      // Only the Textract request itself is timed.
      const start = performance.now();
      const command = new AnalyzeDocumentCommand({
        Document: {
          Bytes: buffer,
        },
        FeatureTypes: ['LAYOUT'],
      });
      const result = await this.client.send(command);
      const end = performance.now();

      // Textract reports text on both LINE and WORD blocks; keeping only LINE
      // blocks avoids emitting every word a second time after its line.
      const text = (result.Blocks ?? [])
        .filter((block) => block.BlockType === 'LINE' && block.Text)
        .map((block) => block.Text)
        .join('\n');

      return {
        text,
        usage: {
          duration: end - start,
          totalCost: COST_PER_PAGE, // the input is always 1 page.
        },
      };
    } catch (error) {
      console.error('AWS Textract Error:', error);
      throw error;
    }
  }
}