Skip to content

Commit 563ee2d

Browse files
committed
extract text from images
1 parent 4e82eb4 commit 563ee2d

9 files changed

+245
-8
lines changed

Dockerfile

+8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
ARG IMAGE=intersystemsdc/iris-community:latest
22
FROM $IMAGE
33

4+
USER root
5+
6+
RUN apt-get update && \
7+
apt-get install -yq tesseract-ocr && \
8+
apt-get install -yq poppler-utils
9+
10+
USER ${ISC_PACKAGE_MGRUSER}
11+
412
WORKDIR /home/irisowner/dev
513

614
ARG TESTS=0

README.md

+35-2
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
11
## text-extractor
2-
Extracts text from PDF files using embeded python
2+
Extracts text from PDF and PPTX files and from images (PNG, JPEG, ...) using embedded Python
33

44

55
## Installation ZPM
66

7+
1. text-extractor
78
```
89
USER>zpm "install text-extractor"
910
```
1011

12+
2. Images (optional)
13+
This package uses tesseract-ocr to extract text from images. If you plan to extract text from images, you also need to install tesseract-ocr:
14+
`apt-get install tesseract-ocr`
15+
16+
If the text is in a language other than English, you also need the matching language package, for example tesseract-ocr-fra for French: `apt-get install tesseract-ocr-fra`
17+
18+
3. PDF to Image (optional)
19+
This package supports several ways of working with PDF files. One of them converts the PDF pages to images first and then extracts text from those images. If you use this approach, you need to install poppler-utils:
20+
`apt-get install poppler-utils`
1121

1222
## How to work with it
1323

@@ -32,6 +42,29 @@ USER>set pdf = ##class(NSolov.TextExtract.PDF).%New("/full/path/to/file.pdf")
3242
USER>set string = pdf.Extract(0)
3343
```
3444

45+
The examples above ignore images embedded in the .pdf, which may also contain text.
46+
47+
To extract the text and append the text recognized in the embedded images, use:
48+
```
49+
USER>set pdf = ##class(NSolov.TextExtract.PDF).%New("/full/path/to/file.pdf")
50+
USER>set string = pdf.ExtractWithImages(0,"eng")
51+
```
52+
53+
Another option is to save each .pdf page as an image, and then extract the text from those images
54+
```
55+
USER>set pdf = ##class(NSolov.TextExtract.PDF).%New("/full/path/to/file.pdf")
56+
USER>set string = pdf.ExtractAsImages(0,"eng")
57+
```
58+
59+
### IMAGES
60+
61+
To get text from the image:
62+
```
63+
USER>set img = ##class(NSolov.TextExtract.Image).%New("/full/path/to/file.png", "fra")
64+
USER>set string = img.Extract()
65+
```
66+
(The second argument of %New() is the OCR language; it defaults to `eng`.)
67+
3568
### PPTX
3669

3770
To get text from the whole presentation:
@@ -55,5 +88,5 @@ USER>set string = pptx.Extract(0)
5588

5689
### Interoperability
5790

58-
From Interoperability you can use Business Operation `NSolov.TextExtract.BusinessOperation` with request `NSolov.TextExtract.PDFRequest` for pdf and `NSolov.TextExtract.PPTXRequest` for pptx.
91+
From Interoperability you can use Business Operation `NSolov.TextExtract.BusinessOperation` with request `NSolov.TextExtract.PDFRequest` for pdf, `NSolov.TextExtract.PPTXRequest` for pptx and `NSolov.TextExtract.ImageRequest` for images.
5992
The response is `Ens.StringContainer` object.

module.xml

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
<Document name="text-extractor.ZPM">
44
<Module>
55
<Name>text-extractor</Name>
6-
<Version>2.0.0</Version>
7-
<Description>Extracts text from .pdf and .pptx files</Description>
6+
<Version>2.1.0</Version>
7+
<Description>Extracts text from .pdf, images and .pptx files</Description>
88
<Packaging>module</Packaging>
99
<SourcesRoot>src</SourcesRoot>
1010
<Resource Name="NSolov.TextExtract.PKG"/>

requirements.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
pypdf==3.9.1
2-
python-pptx==0.6.21
2+
python-pptx==0.6.21
3+
pytesseract==0.3.10
4+
pdf2image==1.16.3

src/NSolov/TextExtract/BusinessOperation.cls

+24-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,16 @@ Class NSolov.TextExtract.BusinessOperation Extends Ens.BusinessOperation
44
Method ExtractTextFromPDF(pRequest As NSolov.TextExtract.PDFRequest, Output pResponse As Ens.StringContainer) As %Status
55
{
66
Try {
7-
Set extractor = ##class(NSolov.TextExtract.PDF).%New(pRequest.Filename)
8-
Set pResponse = ##class(Ens.StringContainer).%New(extractor.Extract(pRequest.Page))
7+
Set text = ""
8+
Set extractor = ##class(NSolov.TextExtract.PDF).%New( ##class(%File).NormalizeFilenameWithSpaces(pRequest.Filename) )
9+
If (pRequest.UseOCR = "pagesAsImages") {
10+
Set text = extractor.ExtractAsImages(pRequest.Page, pRequest.LanguageOCR)
11+
} ElseIf (pRequest.UseOCR = "fromImages") {
12+
Set text = extractor.ExtractWithImages(pRequest.Page, pRequest.LanguageOCR)
13+
} Else {
14+
Set text = extractor.Extract(pRequest.Page)
15+
}
16+
Set pResponse = ##class(Ens.StringContainer).%New(text)
917
Return $$$OK
1018
} Catch ex {
1119
Return ex.AsStatus()
@@ -23,6 +31,17 @@ Method ExtractTextFromPPTX(pRequest As NSolov.TextExtract.PPTXRequest, Output pR
2331
}
2432
}
2533

34+
Method ExtractTextFromImage(pRequest As NSolov.TextExtract.ImageRequest, Output pResponse As Ens.StringContainer) As %Status
35+
{
36+
Try {
37+
Set extractor = ##class(NSolov.TextExtract.Image).%New(pRequest.Filename, pRequest.Language)
38+
Set pResponse = ##class(Ens.StringContainer).%New(extractor.Extract())
39+
} Catch ex {
40+
Return ex.AsStatus()
41+
}
42+
Return $$$OK
43+
}
44+
2645
XData MessageMap
2746
{
2847
<MapItems>
@@ -32,6 +51,9 @@ XData MessageMap
3251
<MapItem MessageType="NSolov.TextExtract.PPTXRequest">
3352
<Method>ExtractTextFromPPTX</Method>
3453
</MapItem>
54+
<MapItem MessageType="NSolov.TextExtract.ImageRequest">
55+
<Method>ExtractTextFromImage</Method>
56+
</MapItem>
3557
</MapItems>
3658
}
3759

src/NSolov/TextExtract/Image.cls

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Class NSolov.TextExtract.Image Extends NSolov.TextExtract.AbstractExtractor
2+
{
3+
4+
Property Language As %String;
5+
6+
Method %OnNew(filename As %String, language As %String = "eng") As %Status
7+
{
8+
do ##super(filename)
9+
set ..Language = language
10+
return $$$OK
11+
}
12+
13+
/// Extract text from image
14+
Method Extract() As %String [ Language = python ]
15+
{
16+
from pytesseract import pytesseract
17+
18+
str = pytesseract.image_to_string(self.Filename, lang=self.Language)
19+
return str
20+
}
21+
22+
}
+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
Class NSolov.TextExtract.ImageRequest Extends Ens.Request
2+
{
3+
4+
Property Filename As %String(MAXLEN = "");
5+
6+
Property Language As %String [ InitialExpression = "eng" ];
7+
8+
Storage Default
9+
{
10+
<Data name="ImageRequestDefaultData">
11+
<Subscript>"ImageRequest"</Subscript>
12+
<Value name="1">
13+
<Value>Filename</Value>
14+
</Value>
15+
<Value name="2">
16+
<Value>Language</Value>
17+
</Value>
18+
</Data>
19+
<DefaultData>ImageRequestDefaultData</DefaultData>
20+
<Type>%Storage.Persistent</Type>
21+
}
22+
23+
}

src/NSolov/TextExtract/PDF.cls

+114-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
Class NSolov.TextExtract.PDF Extends NSolov.TextExtract.AbstractExtractor
22
{
33

4+
Property TmpDir As %String(MAXLEN = 1000);
5+
46
Method GetNumPages() As %Integer [ Language = python ]
57
{
68
from pypdf import PdfReader
@@ -10,7 +12,118 @@ Method GetNumPages() As %Integer [ Language = python ]
1012
return len(pdf_reader.pages)
1113
}
1214

13-
/// Extract text from a page
15+
/// returns the number of saved images
16+
Method SavePagesAsImages() As %Integer [ Language = python, Private ]
17+
{
18+
19+
from pdf2image import convert_from_path
20+
21+
images = convert_from_path(self.Filename)
22+
for i in range(len(images)):
23+
images[i].save(self.TmpDir+'page'+ str(i).zfill(5) +'.ppm', 'PPM')
24+
25+
return len(images)
26+
}
27+
28+
Method SaveImagesFromPage(pagenumber As %Integer) As %Integer [ Language = python ]
29+
{
30+
from pypdf import PdfReader
31+
import os
32+
import shutil
33+
34+
pdf_reader = PdfReader(self.Filename)
35+
36+
count = 0
37+
38+
try:
39+
if (pagenumber == -1):
40+
# all pages
41+
for page in pdf_reader.pages:
42+
print(page.images)
43+
for image_file_object in page.images:
44+
with open(self.TmpDir+str(count) + image_file_object.name, "wb") as fp:
45+
fp.write(image_file_object.data)
46+
count += 1
47+
else:
48+
page = pdf_reader.pages[pagenumber]
49+
50+
for image_file_object in page.images:
51+
with open(self.TmpDir+str(count) + image_file_object.name, "wb") as fp:
52+
fp.write(image_file_object.data)
53+
count += 1
54+
except:
55+
shutil.rmtree(self.TmpDir)
56+
os.makedirs(self.TmpDir)
57+
return count
58+
}
59+
60+
Method getTextFromOnePageWithImages(pagenum As %Integer = -1, lang = "eng") As %String
61+
{
62+
Set ..TmpDir = ##class(%File).NormalizeDirectory($$$FileTempDir)
63+
Set text = ..Extract(pagenum)
64+
Set imgNum = ..SaveImagesFromPage(pagenum)
65+
Set statement = ##class(%SQL.Statement).%New()
66+
Do statement.%PrepareClassQuery("%File", "FileSet")
67+
Set rs = statement.%Execute(..TmpDir)
68+
While rs.%Next(.sc) {
69+
If $$$ISERR(sc) Quit
70+
Set file = rs.%Get("Name")
71+
Set imgExtractor = ##class(NSolov.TextExtract.Image).%New(file, lang)
72+
Set text = text_$$$NL_imgExtractor.Extract()
73+
}
74+
Do ##class(%File).RemoveDirectoryTree(..TmpDir)
75+
Set ..TmpDir = ""
76+
Return text
77+
}
78+
79+
/// Extract text and text from each image
80+
Method ExtractWithImages(pageNum As %Integer = -1, lang As %String = "eng") As %String
81+
{
82+
Set text = ""
83+
If (pageNum = -1) {
84+
// all pages
85+
Set pnum = ..GetNumPages()
86+
For i=0:1:pnum-1 {
87+
Set text = text _ ..getTextFromOnePageWithImages(i, lang)
88+
}
89+
} Else {
90+
// one page
91+
Set text = ..getTextFromOnePageWithImages(pageNum, lang)
92+
}
93+
Return text
94+
}
95+
96+
/// Save .pdf pages as images, then extract text from images
97+
Method ExtractAsImages(pageNum As %Integer = -1, lang As %String = "eng") As %String
98+
{
99+
Set ..TmpDir = ##class(%File).NormalizeDirectory($$$FileTempDir)
100+
Set imgNum = ..SavePagesAsImages()
101+
Set text = ""
102+
Set statement = ##class(%SQL.Statement).%New()
103+
Do statement.%PrepareClassQuery("%File", "FileSet")
104+
Set rs = statement.%Execute(..TmpDir)
105+
Set i = 0
106+
While rs.%Next(.sc) {
107+
If $$$ISERR(sc) Quit
108+
Set file = rs.%Get("Name")
109+
If (pageNum = -1){
110+
Set imgExtractor = ##class(NSolov.TextExtract.Image).%New(file, lang)
111+
Set text = text_$$$NL_imgExtractor.Extract()
112+
} Else {
113+
If i=pageNum {
114+
Set imgExtractor = ##class(NSolov.TextExtract.Image).%New(file, lang)
115+
Set text = imgExtractor.Extract()
116+
Quit
117+
}
118+
}
119+
Set i = i + 1
120+
}
121+
Do ##class(%File).RemoveDirectoryTree(..TmpDir)
122+
Set ..TmpDir = ""
123+
Return text
124+
}
125+
126+
/// Extracts text only from a page
14127
/// Use -1 to extract text from the whole document
15128
Method Extract(page As %Integer = -1) As %String [ Language = python ]
16129
{

src/NSolov/TextExtract/PDFRequest.cls

+14
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
Class NSolov.TextExtract.PDFRequest Extends Ens.Request
22
{
33

4+
/// How to work with images in .pdf
5+
/// Empty value - extract only text
6+
/// fromImages - save all images from documents and concatenate text with text from images; you must install 'tesseract-ocr' to use this option
7+
/// pagesAsImages - save each page of the .pdf as an image and then extract text from those images; you must install 'tesseract-ocr' and 'poppler-utils'
8+
Property UseOCR(VALUELIST = ",fromImages,pagesAsImages");
9+
10+
Property LanguageOCR [ InitialExpression = "eng" ];
11+
412
Property Filename As %String(MAXLEN = "");
513

614
/// use 0 for the first page
@@ -17,6 +25,12 @@ Storage Default
1725
<Value name="2">
1826
<Value>Page</Value>
1927
</Value>
28+
<Value name="3">
29+
<Value>UseOCR</Value>
30+
</Value>
31+
<Value name="4">
32+
<Value>LanguageOCR</Value>
33+
</Value>
2034
</Data>
2135
<DefaultData>PDFRequestDefaultData</DefaultData>
2236
<Type>%Storage.Persistent</Type>

0 commit comments

Comments
 (0)