@@ -23,11 +23,11 @@ def get_blocks():
23
23
def get_data():
    """Download the Unicode character data file and return it as text.

    Fetches ``UnicodeData.txt`` from unicode.org, logging one message
    before the download starts and one after it has finished.

    Returns:
        The full response body decoded as UTF-8.

    Errors raised by ``request.urlopen`` or by decoding are not caught
    here and propagate to the caller.
    """
    logging.info("Downloading character data...")
    # The context manager closes the HTTP response even when read() or
    # decode() raises, so the connection is never leaked.
    with request.urlopen(
        "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
    ) as req:
        content = req.read().decode("utf-8")
    logging.info("Done")
    return content
33
33
@@ -53,26 +53,34 @@ def load_blocks():
53
53
indices .append ((int (start , 16 ), int (stop , 16 )))
54
54
blocks .append (name .strip ())
55
55
56
def locate_block(code, left=0, right=None):
    """Return the name of the block whose interval contains ``code``.

    Binary search over ``indices``, the list of inclusive
    ``(start, stop)`` code point intervals, with ``blocks`` holding the
    block name at the same position. Assumes ``indices`` is sorted in
    ascending order and its intervals do not overlap.

    Args:
        code: Integer code point to look up.
        left: First index of the search window (inclusive).
        right: End of the search window (exclusive). ``None`` means
            ``len(indices)``, resolved at call time.

    Returns:
        The matching block name, or ``None`` when no interval in the
        window contains ``code``.
    """
    if right is None:
        right = len(indices)
    # Iterative search over the half-open window [left, right). Every
    # step drops the probed interval, so the loop always terminates,
    # including for code points that belong to no block.
    while left < right:
        half = left + (right - left) // 2
        start, end = indices[half]
        if code < start:
            right = half
        elif code > end:
            left = half + 1
        else:
            return blocks[half]
    return None
63
68
64
69
return locate_block
65
70
66
71
67
72
def main ():
73
+ """ Read the character and block data and unite them to a text file containing the following fields:
74
+ `<character name> <character comment> <code> <block name>`
75
+ separated by tab characters.
76
+ """
68
77
get_block = load_blocks ()
69
78
characters = clean (get_data ())
70
79
71
80
logging .info ("Parsing character data..." )
72
-
73
81
output = []
74
82
for line in characters .split ("\n " ):
75
- # Parse the needed data
83
+ # Parse the needed data from the character's line
76
84
attributes = line .strip ().split (";" )
77
85
code = attributes [0 ]
78
86
name = attributes [1 ]
0 commit comments