14
14
# for more details.
15
15
16
16
from __future__ import print_function
17
+ from __future__ import division
18
+ from __future__ import unicode_literals
17
19
20
+ from builtins import str
21
+ from builtins import map
22
+ from builtins import range
23
+ from past .utils import old_div
24
+ from builtins import object
18
25
import argparse
19
- import cgi
26
+ import html
20
27
import locale
21
28
import os
22
29
import re
@@ -99,7 +106,7 @@ def text(self):
99
106
raise TypeError ('list of {0} (!= 6) elements' .format (len (self ._sexpr ))) # no coverage
100
107
if not isinstance (self ._sexpr [5 ], sexpr .StringExpression ):
101
108
raise TypeError ('last element is not a string' ) # no coverage
102
- return unicode ( self ._sexpr [5 ].value , 'UTF-8' , 'replace' )
109
+ return self ._sexpr [5 ].value
103
110
104
111
@property
105
112
def children (self ):
@@ -153,9 +160,9 @@ def break_chars(char_zone_list, options):
153
160
continue
154
161
for i , char in enumerate (char_text ):
155
162
subbox = text_zones .BBox (
156
- int (bbox .x0 + ( bbox .x1 - bbox .x0 ) * 1.0 * i / len (char_text ) + 0.5 ),
163
+ int (bbox .x0 + old_div (( bbox .x1 - bbox .x0 ) * 1.0 * i , len (char_text ) ) + 0.5 ),
157
164
bbox .y0 ,
158
- int (bbox .x0 + ( bbox .x1 - bbox .x0 ) * 1.0 * (i + 1 ) / len (char_text ) + 0.5 ),
165
+ int (bbox .x0 + old_div (( bbox .x1 - bbox .x0 ) * 1.0 * (i + 1 ), len (char_text ) ) + 0.5 ),
159
166
bbox .y1 ,
160
167
)
161
168
bbox_list += [subbox ]
@@ -172,7 +179,7 @@ def break_chars(char_zone_list, options):
172
179
i = j
173
180
continue
174
181
bbox = text_zones .BBox ()
175
- for k in xrange (i , j ):
182
+ for k in range (i , j ):
176
183
bbox .update (bbox_list [k ])
177
184
element = etree .Element ('span' )
178
185
element .set ('class' , 'ocrx_word' )
@@ -196,9 +203,9 @@ def break_plain_text(text, bbox, options):
196
203
i = j
197
204
continue
198
205
subbox = text_zones .BBox (
199
- int (bbox .x0 + ( bbox .x1 - bbox .x0 ) * 1.0 * i / len (text ) + 0.5 ),
206
+ int (bbox .x0 + old_div (( bbox .x1 - bbox .x0 ) * 1.0 * i , len (text ) ) + 0.5 ),
200
207
bbox .y0 ,
201
- int (bbox .x0 + ( bbox .x1 - bbox .x0 ) * 1.0 * j / len (text ) + 0.5 ),
208
+ int (bbox .x0 + old_div (( bbox .x1 - bbox .x0 ) * 1.0 * j , len (text ) ) + 0.5 ),
202
209
bbox .y1 ,
203
210
)
204
211
element = etree .Element ('span' )
@@ -244,7 +251,7 @@ def process_zone(parent, zone, last, options):
244
251
if child is not None and zone_type == const .TEXT_ZONE_WORD and not last :
245
252
child .tail = ' '
246
253
self = None
247
- elif isinstance (child_zone , unicode ):
254
+ elif isinstance (child_zone , str ):
248
255
text = child_zone
249
256
if zone_type >= const .TEXT_ZONE_WORD and options .icu is not None and parent is not None :
250
257
# Do word segmentation by hand.
@@ -267,7 +274,7 @@ def process_zone(parent, zone, last, options):
267
274
def process_page(page_text, options):
    """Serialize one page's zone tree to standard output as UTF-8 XML.

    page_text is handed to process_zone() as the top-level zone (no parent,
    last=True); the value it returns becomes the root of the element tree
    that is written out.

    The serialized tree is bytes, so it is written to the binary layer of
    sys.stdout when there is one (Python 3) and to sys.stdout itself
    otherwise (Python 2).
    """
    result = process_zone(None, page_text, last=True, options=options)
    tree = etree.ElementTree(result)
    # Push out anything already queued on the text layer so the page markup
    # cannot overtake output written earlier through sys.stdout.write().
    sys.stdout.flush()
    binary_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    # Keep the explicit encoding: it matches the encoding="UTF-8" declared in
    # the hOCR header template; without it the serializer defaults to ASCII.
    tree.write(binary_stdout, encoding='UTF-8')
271
278
272
279
hocr_header_template = '''\
273
280
<?xml version="1.0" encoding="UTF-8"?>
@@ -290,9 +297,9 @@ def process_page(page_text, options):
290
297
</html>
291
298
'''
292
299
293
- def main (argv = sys .argv ):
300
+ def main (argv = [ os . fsencode ( arg ) for arg in sys .argv ] ):
294
301
options = ArgumentParser ().parse_args (argv [1 :])
295
- logger .info ('Converting {path}:' .format (path = utils . smart_repr ( options .path , system_encoding ) ))
302
+ logger .info ('Converting {path}:' .format (path = options .path ))
296
303
if options .pages is None :
297
304
djvused = ipc .Subprocess (
298
305
['djvused' , '-e' , 'n' , os .path .abspath (options .path )],
@@ -302,9 +309,9 @@ def main(argv=sys.argv):
302
309
n_pages = int (djvused .stdout .readline ())
303
310
finally :
304
311
djvused .wait ()
305
- options .pages = xrange (1 , n_pages + 1 )
312
+ options .pages = range (1 , n_pages + 1 )
306
313
page_iterator = iter (options .pages )
307
- sed_script = temporary .file (suffix = '.djvused' )
314
+ sed_script = temporary .file (suffix = '.djvused' , mode = 'w+' , encoding = 'UTF-8' )
308
315
for n in options .pages :
309
316
print ('select {0}; size; print-txt' .format (n ), file = sed_script )
310
317
sed_script .flush ()
@@ -316,17 +323,17 @@ def main(argv=sys.argv):
316
323
hocr_header = hocr_header_template .format (
317
324
ocr_system = ocr_system ,
318
325
ocr_capabilities = ' ' .join (hocr .djvu2hocr_capabilities ),
319
- title = cgi .escape (options .title ),
320
- css = cgi .escape (options .css ),
326
+ title = html .escape (options .title ),
327
+ css = html .escape (options .css ),
321
328
)
322
329
if not options .css :
323
330
hocr_header = re .sub (hocr_header_style_re , '' , hocr_header , count = 1 )
324
- sys .stdout .write (hocr_header )
331
+ sys .stdout .write (hocr_header . encode ( 'UTF-8' ) )
325
332
for n in page_iterator :
326
333
try :
327
334
page_size = [
328
335
int (str (sexpr .Expression .from_stream (djvused .stdout ).value ).split ('=' )[1 ])
329
- for i in xrange (2 )
336
+ for i in range (2 )
330
337
]
331
338
options .page_bbox = text_zones .BBox (0 , 0 , page_size [0 ], page_size [1 ])
332
339
page_text = sexpr .Expression .from_stream (djvused .stdout )
@@ -335,7 +342,7 @@ def main(argv=sys.argv):
335
342
logger .info ('- Page #{n}' .format (n = n ))
336
343
page_zone = Zone (page_text , page_size [1 ])
337
344
process_page (page_zone , options )
338
- sys .stdout .write (hocr_footer )
345
+ sys .stdout .write (hocr_footer . encode ( 'UTF-8' ) )
339
346
djvused .wait ()
340
347
341
348
# vim:ts=4 sts=4 sw=4 et
0 commit comments