Skip to content

Commit 64f6441

Browse files
committed
Changed the character block search algo to binary search
1 parent ae19a66 commit 64f6441

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

generate_character_list.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@ def get_blocks():
2323
def get_data():
    """Download the Unicode character data file and return it as text.

    Fetches ``UnicodeData.txt`` from unicode.org, decodes the response body
    with the default codec of ``bytes.decode`` (UTF-8) and returns the
    resulting string. Errors raised by ``urlopen`` are not caught here.
    """
    logging.info("Downloading character data...")
    # The context manager closes the HTTP response even if reading fails.
    with request.urlopen(
        "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
    ) as req:
        content = req.read().decode()
    logging.info("Done")
    return content
3333

@@ -53,26 +53,34 @@ def load_blocks():
5353
indices.append((int(start, 16), int(stop, 16)))
5454
blocks.append(name.strip())
5555

56-
def locate_block(code):
57-
for index, [start, stop] in enumerate(indices):
58-
if code > stop:
59-
continue
60-
else:
61-
if code >= start:
62-
return blocks[index]
56+
def locate_block(code, left=0, right=len(indices)):
    """Return the name of the block containing ``code``, or ``None``.

    Binary search over ``indices``, an ordered list of inclusive
    ``(start, stop)`` intervals; ``blocks`` holds the matching names at
    the same positions.

    code: value to look up; it is compared against the interval bounds.
    left, right: half-open search window ``[left, right)`` into ``indices``.
        The default for ``right`` is evaluated once, when this function is
        defined, so ``indices`` must already be fully populated by then.
    """
    while left < right:
        half = left + (right - left) // 2
        start, end = indices[half]
        if start > code:
            right = half
        elif end < code:
            # ``half`` has been ruled out, so step past it. Reusing it as
            # the lower bound would never shrink a one-element window.
            left = half + 1
        else:
            return blocks[half]
    # The window is empty: no interval contains ``code``.
    return None
6368

6469
return locate_block
6570

6671

6772
def main():
73+
""" Read the character and block data and unite them to a text file containing the following fields:
74+
`<character name> <character comment> <code> <block name>`
75+
separated by tab characters.
76+
"""
6877
get_block = load_blocks()
6978
characters = clean(get_data())
7079

7180
logging.info("Parsing character data...")
72-
7381
output = []
7482
for line in characters.split("\n"):
75-
# Parse the needed data
83+
# Parse the needed data from the character's line
7684
attributes = line.strip().split(";")
7785
code = attributes[0]
7886
name = attributes[1]

0 commit comments

Comments
 (0)