Skip to content

Commit ef49145

Browse files
committed Nov 19, 2018
Added KanjiVGToSQLite script
1 parent 293f06b commit ef49145

File tree

4 files changed

+303
-0
lines changed

4 files changed

+303
-0
lines changed
 

‎KanjiVGToSQLite/KanjiVGToSQLite.py

+238
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
2+
from argparse import ArgumentParser, FileType, Action
3+
import xml.etree.ElementTree as ET
4+
import sqlite3
5+
import sys
6+
import os
7+
8+
# To store the groups tree in the database, the nested set model
9+
# method has been used. Check this post for details:
10+
# https://falsinsoft.blogspot.com/2013/01/tree-in-sql-database-nested-set-model.html
11+
12+
# XML namespace of the kvg:* attributes, in ElementTree's "{uri}" notation.
kvg = "{http://kanjivg.tagaini.net}"

# Integer code written to groups.position for each kvg:position value.
position_id_list = {
    name: code
    for code, name in enumerate(
        (
            "left",
            "right",
            "top",
            "bottom",
            "nyo",
            "tare",
            "kamae",
            "kamae1",
            "kamae2",
        )
    )
}

# Integer code written to groups.radical for each kvg:radical value.
radical_id_list = {
    name: code for code, name in enumerate(("general", "nelson", "tradit"))
}
30+
31+
def parse_cmdline():
    """Parse the process command line and return the argparse namespace.

    Recognised options:
      --kanjivgfile  path to the .xml KanjiVG file (default "kanjivg.xml")
      --sqlitefile   path to the sqlite database to create (required)
    """
    cli = ArgumentParser()
    option_table = (
        ("--kanjivgfile", {"help": "path to the .xml KanjiVG file", "default": "kanjivg.xml"}),
        ("--sqlitefile", {"help": "path to the sqlite database to create", "required": True}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
36+
37+
38+
def create_database(name):
    """Create a fresh SQLite database with the KanjiVG schema and return it.

    ".db" is appended to *name* unless it already ends with that suffix.
    Any existing file at the resulting path is deleted first, so the
    returned connection always points at an empty, newly created schema
    (tables kanji, groups, strokes plus their indexes).

    The caller owns the returned connection and must close it. If creating
    the schema fails, the connection is closed here before the error is
    re-raised.
    """
    sqlitefile = name
    if not sqlitefile.endswith(".db"):
        sqlitefile = sqlitefile + ".db"

    # Start from scratch: drop a database left over from a previous run.
    try:
        os.remove(sqlitefile)
    except FileNotFoundError:
        pass

    schema = (
        "CREATE TABLE kanji ("
        "character TEXT"
        ")",

        # One row per <g> element; lft/rgt hold the nested-set tree bounds.
        "CREATE TABLE groups ("
        "kanji_id INTEGER,"
        "lft INTEGER,"
        "rgt INTEGER,"
        "sequence INTEGER,"
        "element TEXT DEFAULT NULL,"
        "original TEXT DEFAULT NULL,"
        "position INTEGER DEFAULT NULL,"
        "variant INTEGER DEFAULT NULL,"
        "partial INTEGER DEFAULT NULL,"
        "number INTEGER DEFAULT NULL,"
        "radical INTEGER DEFAULT NULL,"
        "phon TEXT DEFAULT NULL,"
        "tradForm INTEGER DEFAULT NULL,"
        "radicalForm INTEGER DEFAULT NULL"
        ")",

        "CREATE TABLE strokes ("
        "group_id INTEGER,"
        "sequence INTEGER,"
        "type TEXT,"
        "path TEXT"
        ")",

        "CREATE INDEX kanji_index ON kanji (character)",
        "CREATE INDEX groups_index ON groups (kanji_id)",
        "CREATE INDEX tree_index ON groups (lft,rgt)",
        "CREATE INDEX strokes_index ON strokes (group_id)",
    )

    database = sqlite3.connect(sqlitefile)
    try:
        c = database.cursor()
        for statement in schema:
            c.execute(statement)
    except sqlite3.Error:
        # Do not leak the connection when the schema cannot be created.
        database.close()
        raise

    return database
84+
85+
86+
def parse_path(path, group_id, database):
    """Insert one <path> (stroke) element into the strokes table.

    path      -- ElementTree element; its "id", kvg:type and "d" attributes
                 are read.
    group_id  -- row id of the owning group, stored in strokes.group_id.
    database  -- open sqlite3 connection.

    The stroke sequence number is the integer following the last "-s" in
    the element id (assumes ids shaped like "kvg:04e00-s1" -- confirm
    against the KanjiVG file in use).

    Raises ValueError when the id is missing or has no numeric "-s" suffix.
    """
    c = database.cursor()

    path_ref = path.attrib.get("id")
    stroke_type = path.attrib.get(kvg + "type")
    d = path.attrib.get("d")

    if path_ref is None:
        raise ValueError("path element has no 'id' attribute")

    # Take the digits after the last "-s" instead of slicing at a fixed
    # offset, so the result does not depend on the length of the prefix.
    sequence = int(path_ref.rpartition("-s")[2])

    c.execute(
        "INSERT INTO strokes (group_id, sequence, type, path) VALUES (?,?,?,?)",
        [group_id, sequence, stroke_type, d],
    )
96+
97+
98+
def parse_group(group, kanji_id, parent_lft, database):
    """Insert a <g> element and, recursively, its whole subtree.

    group       -- ElementTree <g> element.
    kanji_id    -- row id of the owning kanji, stored in groups.kanji_id.
    parent_lft  -- nested-set "lft" value of the parent group, or 0 when
                   this is the root group of the kanji.
    database    -- open sqlite3 connection.

    The group is stored using the nested set model: it is inserted as the
    first child of its parent (lft = parent_lft + 1, rgt = parent_lft + 2)
    after every boundary of this kanji that lies to the right of
    parent_lft has been shifted by 2 to make room. Only the kvg:*
    attributes actually present on the element are written; the remaining
    columns keep their NULL default.

    Raises ValueError when the element has no id, and KeyError when
    kvg:position or kvg:radical holds a value missing from
    position_id_list / radical_id_list.
    """
    c = database.cursor()
    attrib = group.attrib

    group_ref = attrib.get("id")
    if group_ref is None:
        raise ValueError("g element has no 'id' attribute")

    # Only ids containing "-g" carry a sequence number; any other group
    # (the root one, when called from parse_kanji) is stored with 0.
    sequence = 0
    if "-g" in group_ref:
        sequence = int(group_ref.rpartition("-g")[2])

    if parent_lft > 0:
        # Open a gap of width 2 immediately after the parent's left bound.
        c.execute("UPDATE groups SET lft = lft + 2 WHERE kanji_id = ? AND lft > ?", [kanji_id, parent_lft])
        c.execute("UPDATE groups SET rgt = rgt + 2 WHERE kanji_id = ? AND rgt > ?", [kanji_id, parent_lft])

    lft = parent_lft + 1
    rgt = parent_lft + 2

    def as_flag(text):
        # kvg boolean attributes: "true" -> 1, any other value -> 0.
        return 1 if text == "true" else 0

    # (attribute / column name, converter or None to store the raw text).
    # The database column name equals the kvg attribute name in every case.
    optional_columns = (
        ("element", None),
        ("original", None),
        ("position", position_id_list.__getitem__),
        ("variant", as_flag),
        ("partial", as_flag),
        ("number", None),
        ("radical", radical_id_list.__getitem__),
        ("phon", None),
        ("tradForm", as_flag),
        ("radicalForm", as_flag),
    )

    columns = ["kanji_id", "lft", "rgt", "sequence"]
    values = [kanji_id, lft, rgt, sequence]

    for column, convert in optional_columns:
        raw = attrib.get(kvg + column)
        if raw is None:
            continue
        columns.append(column)
        values.append(raw if convert is None else convert(raw))

    # Column names come from the fixed table above, never from the input
    # file, so joining them into the statement is safe.
    query = "INSERT INTO groups ({}) VALUES ({})".format(
        ", ".join(columns), ",".join("?" * len(values))
    )

    c.execute(query, values)
    group_id = c.lastrowid

    for item in group:
        if item.tag == "g":
            parse_group(item, kanji_id, lft, database)
        elif item.tag == "path":
            parse_path(item, group_id, database)
183+
184+
185+
def parse_kanji(kanji, database):
    """Import one <kanji> element (character row plus its group tree).

    kanji     -- ElementTree <kanji> element whose id must look like
                 "kvg:kanji_XXXXX" (five hexadecimal digits).
    database  -- open sqlite3 connection.

    Elements with a malformed id are reported on stdout and skipped.
    Characters outside U+4E00..U+9FBF are skipped silently. An element
    that does not contain exactly one <g> child is reported and skipped
    without writing anything, so no kanji row is left without groups.
    """
    kanji_code = kanji.attrib.get("id")

    if kanji_code is None or len(kanji_code) != 15 or not kanji_code.startswith("kvg:kanji_"):
        print("Invalid 'id' attribute")
        return

    try:
        code_point = int(kanji_code[10:15], 16)
    except ValueError:
        # Right prefix and length, but the suffix is not hexadecimal.
        print("Invalid 'id' attribute")
        return

    if not 0x4E00 <= code_point <= 0x9FBF:  # Only kanji are stored into database
        return

    # Validate the structure before inserting, so a malformed entry does
    # not leave an orphan row in the kanji table.
    if len(kanji) != 1 or kanji[0].tag != "g":
        print("Invalid kanji format")
        return

    c = database.cursor()
    # Parameters must be a sequence of values: pass a one-element tuple
    # rather than relying on a 1-character string acting as one.
    c.execute("INSERT INTO kanji (character) VALUES (?)", (chr(code_point),))
    kanji_id = c.lastrowid

    parse_group(kanji[0], kanji_id, 0, database)
204+
205+
206+
def parse_kanjisv(file, database):
    """Import every <kanji> child of a KanjiVG XML file, then commit.

    file      -- path or file object accepted by ElementTree.parse.
    database  -- open sqlite3 connection receiving the data.

    Prints "Invalid kanjivg file" and returns without committing when the
    root element is not <kanjivg>. A "#" progress mark is printed after
    every 100 processed kanji elements.
    """
    document_root = ET.parse(file).getroot()

    if document_root.tag != "kanjivg":
        print("Invalid kanjivg file")
        return

    processed = 0
    for entry in document_root:
        if entry.tag != "kanji":
            continue
        parse_kanji(entry, database)
        processed += 1
        if processed % 100 == 0:
            print("#", end="", flush=True)

    database.commit()
223+
224+
225+
def main():
    """Script entry point: create the database and import the KanjiVG file.

    Reads the options via parse_cmdline, (re)creates the SQLite database,
    imports the XML file into it and closes the connection, including when
    the import raises.
    """
    args = parse_cmdline()

    print("Create database...", end="\n", flush=True)
    database = create_database(args.sqlitefile.strip())

    try:
        print("Start importing KanjiVG data:", end="", flush=True)
        parse_kanjisv(args.kanjivgfile, database)
    finally:
        # Release the connection even if parsing or inserting fails.
        database.close()


if __name__ == '__main__':
    main()
+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
2+
<PropertyGroup>
3+
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
4+
<SchemaVersion>2.0</SchemaVersion>
5+
<ProjectGuid>a945977f-57b8-4159-a7b0-f423f2c66a83</ProjectGuid>
6+
<ProjectHome>.</ProjectHome>
7+
<StartupFile>KanjiVGToSQLite.py</StartupFile>
8+
<SearchPath>
9+
</SearchPath>
10+
<WorkingDirectory>.</WorkingDirectory>
11+
<OutputPath>.</OutputPath>
12+
<Name>KanjiVGToSQLite</Name>
13+
<RootNamespace>KanjiVGToSQLite</RootNamespace>
14+
<LaunchProvider>Standard Python launcher</LaunchProvider>
15+
<CommandLineArguments>--sqlitefile=kanjisv.db</CommandLineArguments>
16+
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
17+
</PropertyGroup>
18+
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
19+
<DebugSymbols>true</DebugSymbols>
20+
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
21+
</PropertyGroup>
22+
<PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
23+
<DebugSymbols>true</DebugSymbols>
24+
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
25+
</PropertyGroup>
26+
<ItemGroup>
27+
<Compile Include="KanjiVGToSQLite.py" />
28+
</ItemGroup>
29+
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
30+
<!-- Uncomment the CoreCompile target to enable the Build command in
31+
Visual Studio and specify your pre- and post-build commands in
32+
the BeforeBuild and AfterBuild targets below. -->
33+
<!--<Target Name="CoreCompile" />-->
34+
<Target Name="BeforeBuild">
35+
</Target>
36+
<Target Name="AfterBuild">
37+
</Target>
38+
</Project>

‎KanjiVGToSQLite/KanjiVGToSQLite.sln

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio 15
4+
VisualStudioVersion = 15.0.28010.2050
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "KanjiVGToSQLite", "KanjiVGToSQLite.pyproj", "{A945977F-57B8-4159-A7B0-F423F2C66A83}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{A945977F-57B8-4159-A7B0-F423F2C66A83}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{A945977F-57B8-4159-A7B0-F423F2C66A83}.Release|Any CPU.ActiveCfg = Release|Any CPU
16+
EndGlobalSection
17+
GlobalSection(SolutionProperties) = preSolution
18+
HideSolutionNode = FALSE
19+
EndGlobalSection
20+
GlobalSection(ExtensibilityGlobals) = postSolution
21+
SolutionGuid = {D0FE3BAE-C681-4C35-809D-84567580FA4B}
22+
EndGlobalSection
23+
EndGlobal

‎README.md

+4
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,7 @@ Convert the [KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) to
2626
JMnedictToSQLite
2727
---------
2828
Convert the [JMnedict](https://www.edrdg.org/enamdict/enamdict_doc.html) to sqlite database
29+
30+
KanjiVGToSQLite
31+
---------
32+
Convert the [KanjiVG](https://kanjivg.tagaini.net/) to sqlite database

0 commit comments

Comments
 (0)
Please sign in to comment.