Skip to content

Commit fc03a9d

Browse files
Port to python3
Port to python3 testsuite run
1 parent 5e59ead commit fc03a9d

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed: +273 -172 lines changed

Makefile

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1212
# for more details.
1313

14-
PYTHON = python
14+
PYTHON = python3
1515

1616
PREFIX = /usr/local
1717
DESTDIR =

djvu2hocr

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22
# encoding=UTF-8
33

44
# Copyright © 2009-2018 Jakub Wilk <[email protected]>
@@ -14,6 +14,7 @@
1414
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1515
# for more details.
1616

17+
from __future__ import unicode_literals
1718
import sys
1819

1920
basedir = None

doc/dependencies

+2
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@ The following software is needed to run ocrodjvu:
1414

1515
* python-djvulibre_
1616

17+
* python-regex
18+
1719
* subprocess32_
1820

1921
* lxml_ ≥ 2.0

hocr2djvused

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22
# encoding=UTF-8
33

44
# Copyright © 2008-2018 Jakub Wilk <[email protected]>
@@ -14,6 +14,7 @@
1414
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1515
# for more details.
1616

17+
from __future__ import unicode_literals
1718
import sys
1819

1920
basedir = None

lib/__init__.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,11 @@
1+
from __future__ import unicode_literals
12
import sys
23

34
if sys.version_info < (2, 7): # no coverage
45
raise RuntimeError('Python 2.7 is required')
5-
if sys.version_info >= (3, 0): # no coverage
6+
elif sys.version_info >= (3, 3): # no coverage
7+
pass
8+
else:
69
raise RuntimeError('Python 2.X is required')
710

811
# vim:ts=4 sts=4 sw=4 et

lib/cli/__init__.py

+1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1414
# for more details.
1515

16+
from __future__ import unicode_literals
1617
from .. import errors
1718
from .. import utils
1819

lib/cli/djvu2hocr.py

+25-18
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,16 @@
1414
# for more details.
1515

1616
from __future__ import print_function
17+
from __future__ import division
18+
from __future__ import unicode_literals
1719

20+
from builtins import str
21+
from builtins import map
22+
from builtins import range
23+
from past.utils import old_div
24+
from builtins import object
1825
import argparse
19-
import cgi
26+
import html
2027
import locale
2128
import os
2229
import re
@@ -99,7 +106,7 @@ def text(self):
99106
raise TypeError('list of {0} (!= 6) elements'.format(len(self._sexpr))) # no coverage
100107
if not isinstance(self._sexpr[5], sexpr.StringExpression):
101108
raise TypeError('last element is not a string') # no coverage
102-
return unicode(self._sexpr[5].value, 'UTF-8', 'replace')
109+
return self._sexpr[5].value
103110

104111
@property
105112
def children(self):
@@ -153,9 +160,9 @@ def break_chars(char_zone_list, options):
153160
continue
154161
for i, char in enumerate(char_text):
155162
subbox = text_zones.BBox(
156-
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(char_text) + 0.5),
163+
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(char_text)) + 0.5),
157164
bbox.y0,
158-
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * (i + 1) / len(char_text) + 0.5),
165+
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * (i + 1), len(char_text)) + 0.5),
159166
bbox.y1,
160167
)
161168
bbox_list += [subbox]
@@ -172,7 +179,7 @@ def break_chars(char_zone_list, options):
172179
i = j
173180
continue
174181
bbox = text_zones.BBox()
175-
for k in xrange(i, j):
182+
for k in range(i, j):
176183
bbox.update(bbox_list[k])
177184
element = etree.Element('span')
178185
element.set('class', 'ocrx_word')
@@ -196,9 +203,9 @@ def break_plain_text(text, bbox, options):
196203
i = j
197204
continue
198205
subbox = text_zones.BBox(
199-
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * i / len(text) + 0.5),
206+
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * i, len(text)) + 0.5),
200207
bbox.y0,
201-
int(bbox.x0 + (bbox.x1 - bbox.x0) * 1.0 * j / len(text) + 0.5),
208+
int(bbox.x0 + old_div((bbox.x1 - bbox.x0) * 1.0 * j, len(text)) + 0.5),
202209
bbox.y1,
203210
)
204211
element = etree.Element('span')
@@ -244,7 +251,7 @@ def process_zone(parent, zone, last, options):
244251
if child is not None and zone_type == const.TEXT_ZONE_WORD and not last:
245252
child.tail = ' '
246253
self = None
247-
elif isinstance(child_zone, unicode):
254+
elif isinstance(child_zone, str):
248255
text = child_zone
249256
if zone_type >= const.TEXT_ZONE_WORD and options.icu is not None and parent is not None:
250257
# Do word segmentation by hand.
@@ -267,7 +274,7 @@ def process_zone(parent, zone, last, options):
267274
def process_page(page_text, options):
268275
result = process_zone(None, page_text, last=True, options=options)
269276
tree = etree.ElementTree(result)
270-
tree.write(sys.stdout, encoding='UTF-8')
277+
tree.write(sys.stdout)
271278

272279
hocr_header_template = '''\
273280
<?xml version="1.0" encoding="UTF-8"?>
@@ -290,9 +297,9 @@ def process_page(page_text, options):
290297
</html>
291298
'''
292299

293-
def main(argv=sys.argv):
300+
def main(argv=[os.fsencode(arg) for arg in sys.argv]):
294301
options = ArgumentParser().parse_args(argv[1:])
295-
logger.info('Converting {path}:'.format(path=utils.smart_repr(options.path, system_encoding)))
302+
logger.info('Converting {path}:'.format(path=options.path))
296303
if options.pages is None:
297304
djvused = ipc.Subprocess(
298305
['djvused', '-e', 'n', os.path.abspath(options.path)],
@@ -302,9 +309,9 @@ def main(argv=sys.argv):
302309
n_pages = int(djvused.stdout.readline())
303310
finally:
304311
djvused.wait()
305-
options.pages = xrange(1, n_pages + 1)
312+
options.pages = range(1, n_pages + 1)
306313
page_iterator = iter(options.pages)
307-
sed_script = temporary.file(suffix='.djvused')
314+
sed_script = temporary.file(suffix='.djvused', mode='w+',encoding='UTF-8')
308315
for n in options.pages:
309316
print('select {0}; size; print-txt'.format(n), file=sed_script)
310317
sed_script.flush()
@@ -316,17 +323,17 @@ def main(argv=sys.argv):
316323
hocr_header = hocr_header_template.format(
317324
ocr_system=ocr_system,
318325
ocr_capabilities=' '.join(hocr.djvu2hocr_capabilities),
319-
title=cgi.escape(options.title),
320-
css=cgi.escape(options.css),
326+
title=html.escape(options.title),
327+
css=html.escape(options.css),
321328
)
322329
if not options.css:
323330
hocr_header = re.sub(hocr_header_style_re, '', hocr_header, count=1)
324-
sys.stdout.write(hocr_header)
331+
sys.stdout.write(hocr_header.encode('UTF-8'))
325332
for n in page_iterator:
326333
try:
327334
page_size = [
328335
int(str(sexpr.Expression.from_stream(djvused.stdout).value).split('=')[1])
329-
for i in xrange(2)
336+
for i in range(2)
330337
]
331338
options.page_bbox = text_zones.BBox(0, 0, page_size[0], page_size[1])
332339
page_text = sexpr.Expression.from_stream(djvused.stdout)
@@ -335,7 +342,7 @@ def main(argv=sys.argv):
335342
logger.info('- Page #{n}'.format(n=n))
336343
page_zone = Zone(page_text, page_size[1])
337344
process_page(page_zone, options)
338-
sys.stdout.write(hocr_footer)
345+
sys.stdout.write(hocr_footer.encode('UTF-8'))
339346
djvused.wait()
340347

341348
# vim:ts=4 sts=4 sw=4 et

lib/cli/hocr2djvused.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1414
# for more details.
1515

16+
from __future__ import unicode_literals
17+
from builtins import map
1618
import argparse
1719
import sys
1820

@@ -36,7 +38,7 @@ def __init__(self):
3638
self.add_argument('--version', action=version.VersionAction)
3739
self.add_argument('--rotation', dest='rotation', action='store', type=int, default=0, help='page rotation (in degrees)')
3840
def size(s):
39-
return map(int, s.split('x', 1))
41+
return list(map(int, s.split('x', 1)))
4042
self.add_argument('--page-size', metavar='WxH', dest='page_size', action='store', type=size, default=None, help='page size (in pixels)')
4143
group = self.add_argument_group(title='word segmentation options')
4244
group.add_argument('-t', '--details', dest='details', choices=('lines', 'words', 'chars'), action='store', default='words', help='amount of text details to extract')

lib/cli/ocrodjvu.py

+13-7
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,13 @@
1414
# for more details.
1515

1616
from __future__ import print_function
17+
from __future__ import unicode_literals
1718

19+
from future import standard_library
20+
standard_library.install_aliases()
21+
from builtins import str
22+
from builtins import range
23+
from builtins import object
1824
import argparse
1925
import contextlib
2026
import inspect
@@ -258,7 +264,7 @@ def __init__(self):
258264
self.add_argument('--list-engines', action=self.list_engines, nargs=0, help='print list of available OCR engines')
259265
self.add_argument('-l', '--language', dest='language', help='set recognition language')
260266
self.add_argument('--list-languages', action=self.list_languages, nargs=0, help='print list of available languages')
261-
self.add_argument('--render', dest='render_layers', choices=self._render_map.keys(), action='store', default='mask', help='image layers to render')
267+
self.add_argument('--render', dest='render_layers', choices=list(self._render_map.keys()), action='store', default='mask', help='image layers to render')
262268
def pages(x):
263269
return utils.parse_page_numbers(x)
264270
self.add_argument('-p', '--pages', dest='pages', action='store', default=None, type=pages, help='pages to process')
@@ -400,9 +406,9 @@ def init(self, options):
400406
bpp = 24 if self._options.render_layers != djvu.decode.RENDER_MASK_ONLY else 1
401407
self._image_format = self._options.engine.image_format(bpp)
402408

403-
def _temp_file(self, name, auto_remove=True):
409+
def _temp_file(self, name, mode='w+', encoding=locale.getpreferredencoding(),auto_remove=True):
404410
path = os.path.join(self._temp_dir, name)
405-
file = open(path, 'w+b')
411+
file = open(path,mode=mode,encoding=encoding)
406412
if not self._debug and auto_remove:
407413
file = temporary.wrapper(file, file.name)
408414
return file
@@ -417,7 +423,7 @@ def get_output_image(self, nth, page_job):
417423
file = self._temp_file('{n:06}.{ext}'.format(
418424
n=nth,
419425
ext=output_format.extension
420-
))
426+
),mode='wb',encoding=None)
421427
try:
422428
output_format.write_image(page_job, self._options.render_layers, file)
423429
file.flush()
@@ -510,7 +516,7 @@ def page_thread(self, pages, results, condition):
510516

511517
def _process(self, path, pages=None):
512518
self._engine = self._options.engine
513-
logger.info('Processing {path}:'.format(path=utils.smart_repr(path, system_encoding)))
519+
logger.info('Processing {path}:'.format(path=path))
514520
document = self.new_document(djvu.decode.FileURI(path))
515521
document.decoding_job.wait()
516522
if pages is None:
@@ -524,7 +530,7 @@ def _process(self, path, pages=None):
524530
condition = threading.Condition()
525531
threads = [
526532
threading.Thread(target=self.page_thread, args=(pages, results, condition))
527-
for i in xrange(njobs)
533+
for i in range(njobs)
528534
]
529535
def stop_threads():
530536
with condition:
@@ -540,7 +546,7 @@ def stop_threads():
540546
sed_file.write('remove-txt\n')
541547
for page in pages:
542548
try:
543-
file_id = page.file.id.encode(system_encoding)
549+
file_id = page.file.id
544550
except UnicodeError:
545551
pageno = page.n + 1
546552
logger.warning('warning: cannot convert page {n} identifier to locale encoding'.format(n=pageno))

lib/engines/__init__.py

+1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1414
# for more details.
1515

16+
from __future__ import unicode_literals
1617
import pkgutil
1718

1819
def get_engines():

lib/engines/common.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,9 @@
1313
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1414
# for more details.
1515

16+
from __future__ import unicode_literals
17+
from builtins import str
18+
from builtins import object
1619
from .. import utils
1720
from .. import image_io
1821

@@ -33,7 +36,7 @@ def __init__(self, *args, **kwargs):
3336
raise TypeError('{tp}.name must be a string'.format(tp=tpname)) # no coverage
3437
if not issubclass(self.image_format, image_io.ImageFormat):
3538
raise TypeError('{tp}.image_format must be an ImageFormat subclass'.format(tp=tpname)) # no coverage
36-
for key, value in kwargs.iteritems():
39+
for key, value in kwargs.items():
3740
try:
3841
prop = getattr(type(self), key)
3942
if not isinstance(prop, utils.property):
@@ -63,6 +66,6 @@ def save(self, prefix):
6366
file.write(str(self))
6467

6568
def as_stringio(self):
66-
return io.BytesIO(str(self))
69+
return io.StringIO(str(self))
6770

6871
# vim:ts=4 sts=4 sw=4 et

lib/engines/cuneiform.py

+5
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,14 @@
1313
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1414
# for more details.
1515

16+
from __future__ import unicode_literals
1617
import os
1718
import re
1819
import shlex
1920
import warnings
21+
import locale
22+
import sys
23+
import codecs
2024

2125
from . import common
2226
from .. import errors
@@ -62,6 +66,7 @@ def _get_languages(self):
6266
)
6367
except OSError:
6468
raise errors.UnknownLanguageList
69+
cuneiform.stdout=codecs.getreader(sys.stdout.encoding or locale.getpreferredencoding())(cuneiform.stdout)
6570
self._cuneiform_to_iso = {}
6671
self._user_to_cuneiform = {}
6772
try:

lib/engines/dummy.py

+1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1414
# for more details.
1515

16+
from __future__ import unicode_literals
1617
from . import common
1718
from .. import image_io
1819
from .. import text_zones

lib/engines/gocr.py

+3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
# for more details.
1515

1616
from __future__ import division
17+
from __future__ import unicode_literals
1718

19+
from builtins import map
20+
from builtins import object
1821
import functools
1922
import re
2023
import shlex

0 commit comments

Comments
 (0)