1
+
2
+ from argparse import ArgumentParser , FileType , Action
3
+ import xml .etree .ElementTree as ET
4
+ import sqlite3
5
+ import sys
6
+ import os
7
+
8
# The groups tree is stored in the database using the nested set model.
# See this post for details:
# https://falsinsoft.blogspot.com/2013/01/tree-in-sql-database-nested-set-model.html
11
+
12
# ElementTree-style namespace prefix that is prepended to attribute names
# when reading the kvg:* attributes of <g> and <path> elements.
kvg = "{http://kanjivg.tagaini.net}"

# Integer ids written to the groups.position column. The id of a name is its
# index in the tuple, so new names must only ever be appended at the end.
position_id_list = {
    name: index
    for index, name in enumerate(
        (
            "left",
            "right",
            "top",
            "bottom",
            "nyo",
            "tare",
            "kamae",
            "kamae1",
            "kamae2",
        )
    )
}

# Integer ids written to the groups.radical column (same index rule as above).
radical_id_list = {
    name: index
    for index, name in enumerate(("general", "nelson", "tradit"))
}
30
+
31
def parse_cmdline():
    """Parse the importer's command-line options from ``sys.argv``.

    Returns:
        The argparse namespace with two attributes: ``kanjivgfile``
        (defaults to ``"kanjivg.xml"``) and the mandatory ``sqlitefile``.
    """
    option_specs = (
        (
            "--kanjivgfile",
            {"help": "path to the .xml KanjiVG file", "default": "kanjivg.xml"},
        ),
        (
            "--sqlitefile",
            {"help": "path to the sqlite database to create", "required": True},
        ),
    )

    cli = ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
36
+
37
+
38
def create_database(name):
    """Create a brand-new SQLite file containing the empty KanjiVG schema.

    ".db" is appended to *name* unless it already ends with it. A file that
    already exists at the resulting path is deleted first, so the returned
    database never contains data from a previous run.

    Args:
        name: path of the database file, with or without the ".db" suffix.

    Returns:
        The open ``sqlite3`` connection; closing it is up to the caller.
    """
    db_path = name if name.endswith(".db") else name + ".db"

    # Start from scratch rather than adding tables to an old database.
    if os.path.exists(db_path):
        os.remove(db_path)

    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()

    # Tables first, then the indexes built on them.
    schema = (
        "CREATE TABLE kanji ("
        "character TEXT"
        ")",

        "CREATE TABLE groups ("
        "kanji_id INTEGER,"
        "lft INTEGER,"
        "rgt INTEGER,"
        "sequence INTEGER,"
        "element TEXT DEFAULT NULL,"
        "original TEXT DEFAULT NULL,"
        "position INTEGER DEFAULT NULL,"
        "variant INTEGER DEFAULT NULL,"
        "partial INTEGER DEFAULT NULL,"
        "number INTEGER DEFAULT NULL,"
        "radical INTEGER DEFAULT NULL,"
        "phon TEXT DEFAULT NULL,"
        "tradForm INTEGER DEFAULT NULL,"
        "radicalForm INTEGER DEFAULT NULL"
        ")",

        "CREATE TABLE strokes ("
        "group_id INTEGER,"
        "sequence INTEGER,"
        "type TEXT,"
        "path TEXT"
        ")",

        "CREATE INDEX kanji_index ON kanji (character)",
        "CREATE INDEX groups_index ON groups (kanji_id)",
        "CREATE INDEX tree_index ON groups (lft,rgt)",
        "CREATE INDEX strokes_index ON strokes (group_id)",
    )
    for statement in schema:
        cursor.execute(statement)

    return connection
84
+
85
+
86
def parse_path(path, group_id, database):
    """Insert one ``<path>`` element as a row of the ``strokes`` table.

    Args:
        path: ElementTree element of the stroke; its ``id``, ``kvg:type``
            and ``d`` attributes are read.
        group_id: rowid of the ``groups`` row the stroke belongs to.
        database: open sqlite3 connection receiving the row.
    """
    attributes = path.attrib
    stroke_name = attributes.get("id")

    # Everything after the first 11 characters of the id is read as the
    # stroke number (presumably ids look like "kvg:04e00-s1" -- verify
    # against the input file).
    stroke_number = int(stroke_name[11:])

    row = [
        group_id,
        stroke_number,
        attributes.get(kvg + "type"),
        attributes.get("d"),
    ]
    database.cursor().execute(
        "INSERT INTO strokes (group_id, sequence, type, path) VALUES (?,?,?,?)",
        row,
    )
96
+
97
+
98
def _flag_to_int(text):
    """Map the attribute text "true" to 1 and any other text to 0."""
    return 1 if text == "true" else 0


def parse_group(group, kanji_id, parent_lft, database):
    """Store a ``<g>`` element and, recursively, everything nested in it.

    The group becomes one row of the ``groups`` table. Within the kanji's
    nested-set tree it is inserted directly after the parent's ``lft``, i.e.
    as the parent's first child, so a sibling parsed later ends up with a
    smaller ``lft`` than one parsed earlier; the number parsed from the
    element id is kept in the ``sequence`` column. Child ``<g>`` elements are
    handled by recursion and child ``<path>`` elements are handed to
    ``parse_path`` together with the rowid of the new group.

    Args:
        group: ElementTree element of the group.
        kanji_id: rowid of the owning row in the ``kanji`` table.
        parent_lft: ``lft`` value of the parent group, or 0 for the root
            group of a kanji.
        database: open sqlite3 connection receiving the rows.

    Raises:
        KeyError: if ``kvg:position`` or ``kvg:radical`` holds a value that is
            missing from ``position_id_list`` / ``radical_id_list``.
        ValueError: if the numeric part of the element id is not an integer.
    """
    c = database.cursor()

    group_name = group.attrib.get("id")

    # Ids containing "-g" carry the group number after their first 11
    # characters; any other id (the root group) gets sequence 0.
    sequence = int(group_name[11:]) if "-g" in group_name else 0

    if parent_lft > 0:
        # Open a gap of two positions right after the parent's lft so the new
        # node fits in without overlapping the nodes already stored.
        c.execute("UPDATE groups SET lft = lft + 2 WHERE kanji_id = ? AND lft > ?", [kanji_id, parent_lft])
        c.execute("UPDATE groups SET rgt = rgt + 2 WHERE kanji_id = ? AND rgt > ?", [kanji_id, parent_lft])

    lft = parent_lft + 1
    rgt = parent_lft + 2

    # Optional kvg:* attributes in column order, each with the conversion
    # applied to its text before storing (None stores the text unchanged).
    optional_columns = (
        ("element", None),
        ("original", None),
        ("position", lambda text: position_id_list[text]),
        ("variant", _flag_to_int),
        ("partial", _flag_to_int),
        ("number", None),
        ("radical", lambda text: radical_id_list[text]),
        ("phon", None),
        ("tradForm", _flag_to_int),
        ("radicalForm", _flag_to_int),
    )

    columns = ["kanji_id", "lft", "rgt", "sequence"]
    values = [kanji_id, lft, rgt, sequence]

    # Only attributes actually present are named in the INSERT, so the
    # remaining columns fall back to their DEFAULT NULL.
    for column, convert in optional_columns:
        text = group.attrib.get(kvg + column)
        if text is None:
            continue
        columns.append(column)
        values.append(text if convert is None else convert(text))

    # Column names come from the fixed table above, never from the XML, so
    # formatting them into the statement is safe; values stay parameterized.
    query = "INSERT INTO groups ({}) VALUES ({})".format(
        ", ".join(columns), ",".join("?" * len(values))
    )

    c.execute(query, values)
    group_id = c.lastrowid

    for item in group:
        if item.tag == "g":
            parse_group(item, kanji_id, lft, database)
        elif item.tag == "path":
            parse_path(item, group_id, database)
183
+
184
+
185
def parse_kanji(kanji, database):
    """Store one ``<kanji>`` element and its group tree in the database.

    The element id must have the form ``kvg:kanji_XXXXX`` with five
    hexadecimal digits. Entries whose code point lies outside 0x4E00-0x9FBF
    are skipped silently. An entry is reported and skipped, without writing
    anything, when its id is invalid or when it does not contain exactly one
    child ``<g>`` element.

    Args:
        kanji: ElementTree element of the kanji entry.
        database: open sqlite3 connection receiving the rows.
    """
    kanji_code = kanji.attrib.get("id")

    if kanji_code is None or len(kanji_code) != 15 or not kanji_code.startswith("kvg:kanji_"):
        print("Invalid 'id' attribute")
        return

    try:
        code_point = int(kanji_code[10:15], 16)
    except ValueError:
        # Right prefix and length, but the code part is not hexadecimal.
        print("Invalid 'id' attribute")
        return

    # Only kanji are stored into database
    if not 0x4E00 <= code_point <= 0x9FBF:
        return

    # Check the structure before inserting so that a malformed entry does not
    # leave a kanji row without any group behind.
    if len(kanji) != 1 or kanji[0].tag != "g":
        print("Invalid kanji format")
        return

    c = database.cursor()
    # Parameters must be a sequence; a 1-tuple makes the single binding explicit.
    c.execute("INSERT INTO kanji (character) VALUES (?)", (chr(code_point),))
    kanji_id = c.lastrowid

    parse_group(kanji[0], kanji_id, 0, database)
204
+
205
+
206
def parse_kanjisv(file, database):
    """Import every ``<kanji>`` entry of a KanjiVG XML file into *database*.

    Nothing is imported or committed when the document root is not
    ``<kanjivg>``. Otherwise each direct ``<kanji>`` child is passed to
    ``parse_kanji``, a "#" is printed after every 100 entries as a progress
    mark, and the connection is committed once at the end.

    Args:
        file: path or file object accepted by ``ET.parse``.
        database: open sqlite3 connection receiving the rows.
    """
    document_root = ET.parse(file).getroot()

    if document_root.tag != "kanjivg":
        print("Invalid kanjivg file")
        return

    # findall() with a bare tag name matches direct children only.
    for processed, entry in enumerate(document_root.findall("kanji"), start=1):
        parse_kanji(entry, database)
        if processed % 100 == 0:
            print("#", end="", flush=True)

    database.commit()
223
+
224
+
225
def main():
    """Run the import: create the database, then load the KanjiVG file.

    Both paths come from the command line (see ``parse_cmdline``). The
    connection is closed even when the import raises, so no open handle to
    the database file is left behind.
    """
    args = parse_cmdline()

    print("Create database...", end="\n ", flush=True)
    database = create_database(args.sqlitefile.strip())

    try:
        print("Start importing KanjiVG data:", end="", flush=True)
        parse_kanjisv(args.kanjivgfile, database)
    finally:
        # Reached on success and on failure alike.
        database.close()


if __name__ == '__main__':
    main()
0 commit comments