Skip to content

Database

database

This module handles the database connection through the get_db context manager.

get_db()

Context-managed SQLAlchemy session providing access to the database

Yields:

Name Type Description
Session Session

SQLAlchemy Session object.

Raises:

Type Description
EnvironmentError

If the kotobase.db file doesn't exist.

Source code in kotobase/src/kotobase/db/database.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
@contextmanager
def get_db():
    """
    Context manager handing out a `SQLAlchemy` session for database access.

    The session is kept on the module-level `_local` object. The outermost
    `get_db()` block creates it and, on exit, closes and removes it; a block
    entered while a session is already stored reuses that session and leaves
    it open for the block that created it.

    Yields:
      Session (Session): `SQLAlchemy` Session object.

    Raises:
      EnvironmentError: If the database file at `DATABASE_PATH` doesn't exist.
    """
    if not DATABASE_PATH.exists():
        raise EnvironmentError(
            "Couldn't find Database. Try running CLI build or pull command")

    if hasattr(_local, "db"):
        # An enclosing get_db() block owns the session: reuse it untouched.
        yield _local.db
        return

    # Nothing stored yet, so this block creates the session and disposes of it.
    _local.db = Session(_engine(),
                        expire_on_commit=False,
                        autoflush=False)
    try:
        yield _local.db
    finally:
        _local.db.close()
        del _local.db

build_database

This module defines the click command which builds the kotobase.db database using SQLAlchemy.

build(force)

Downloads source files, processes, and builds the Kotobase database.

Source code in kotobase/src/kotobase/db_builder/build_database.py
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
# Rebuilds the database from scratch: download the raw sources, convert them
# to JSON, create the schema and bulk-load every table, then record build
# statistics. The function docstring doubles as the command's `--help` text,
# which is why the longer explanation lives in comments instead.
@click.command('build')
@click.option('--force',
              is_flag=True,
              help="Force re-build even if the file exists."
              )
def build(force):
    """
    Downloads source files, processes, and builds the Kotobase database.
    """

    if DATABASE_PATH.exists() and not force:
        click.echo("Database file already exists. Use --force to re-build.")
        return
    elif DATABASE_PATH.exists() and force:
        try:
            DATABASE_PATH.unlink()
            click.secho("Deleted Old Database File", fg="green")

        except FileNotFoundError:
            # The file vanished between the exists() check and unlink().
            click.secho("Database File Doesn't Exist, Remove '--force' flag.",
                        fg="red",
                        err=True
                        )
            sys.exit(1)
        except PermissionError:
            click.secho("No Permission To Delete Database File",
                        fg="red",
                        err=True
                        )
            sys.exit(1)
        except Exception as e:
            click.secho(
                f"Unexpected Error While Deleting Database File: {e}",
                fg="red",
                err=True
                )
            sys.exit(1)
    session = Session()
    try:
        start = time.perf_counter()
        click.secho("--- Step 1: Downloading raw data files ---", fg="blue")
        download_data()

        click.secho("\n--- Step 2: Processing raw data into JSON ---",
                    fg="blue")
        parse_jmdict()
        parse_jmnedict()
        parse_kanjidic()
        parse_tatoeba()

        click.secho("\n--- Step 3: Building SQLite database ---", fg="blue")
        create_database()
        populate_jmdict(session)
        populate_jmnedict(session)
        populate_kanjidic(session)
        populate_tatoeba(session)
        populate_jlpt(session)
        end = time.perf_counter()
        click.secho("\nDatabase build process complete.",
                    fg="green", bold=True)
        build_time_sec = end - start
        build_date = str(datetime.datetime.now())
        build_file_size_mb = (os.path.getsize(DATABASE_PATH) / 1024) / 1024
        build_log_txt = dedent(
            f"""BUILD_TIME={build_time_sec}
BUILD_DATE={build_date}
SIZE_MB={build_file_size_mb}
""")

        # Log Build Info. Failing to write the log is reported but does not
        # fail the build.
        try:
            DB_BUILD_LOG_PATH.unlink(missing_ok=True)
            DB_BUILD_LOG_PATH.touch()
            DB_BUILD_LOG_PATH.write_text(build_log_txt)
        except Exception as e:
            click.secho(f"Couldn't write log: {e}",
                        fg="yellow")

        click.secho(f"\nBuild Time: {build_time_sec} seconds")
        click.secho(f"\nBuild Date: {build_date}")
        click.secho(f"\nFile Size: {build_file_size_mb} MB")
    finally:
        try:
            # Delete Raw Files. iterdir() raises if the directory is missing,
            # so only walk it when it is actually there.
            if RAW_DATA_DIR.is_dir():
                for p in RAW_DATA_DIR.iterdir():
                    p.unlink(missing_ok=True)
            # Delete JSON Files
            JMDICT_PATH.unlink(missing_ok=True)
            JMNEDICT_PATH.unlink(missing_ok=True)
            KANJIDIC2_PATH.unlink(missing_ok=True)
            TATOEBA_PATH.unlink(missing_ok=True)
        finally:
            # Close Session even when the file cleanup above raised.
            session.close()

create_database()

Click command helper which creates the database and all tables.

Source code in kotobase/src/kotobase/db_builder/build_database.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def create_database() -> None:
    """
    Click command helper which recreates the database file and its schema.

    Replaces any existing file at `DATABASE_PATH` with an empty one, drops and
    re-creates every table registered on `Base.metadata`, and then adds the
    text indexes on the `jmdict_kana` and `jmdict_kanji` tables.
    """
    # Always start from an empty file, even when one already exists.
    DATABASE_PATH.unlink(missing_ok=True)
    DATABASE_PATH.touch()

    metadata = Base.metadata
    metadata.drop_all(engine)
    metadata.create_all(engine)

    index_statements = (
        "CREATE INDEX IF NOT EXISTS idx_kana_text ON jmdict_kana(text)",
        "CREATE INDEX IF NOT EXISTS idx_kanji_text ON jmdict_kanji(text)",
    )
    with engine.begin() as conn:
        for statement in index_statements:
            conn.exec_driver_sql(statement)

    click.echo("Database Created Successfully")

populate_jlpt(session)

Click command helper which populates JLPT tables in the database.

Parameters:

Name Type Description Default
session Session

SQLAlchemy Session object

required
Source code in kotobase/src/kotobase/db_builder/build_database.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
def populate_jlpt(session: SessionType) -> None:
    """
    Click command helper which fills the JLPT tables from bundled JSON files.

    Every `*.json` file in `JLPT_FOLDER` whose stem contains `n<digit>` is
    loaded; the digit becomes the `level` of each row. The file name decides
    the target table ("vocab", "kanji" or "grammar"); files matching none of
    them are read but not inserted. All inserts are committed once at the end.

    Args:
      session (Session): SQLAlchemy Session object
    """

    click.echo("Populating JLPT Tables ...")
    level_re = re.compile(r'n(\d)')
    sources = list(JLPT_FOLDER.glob("*.json"))

    with click.progressbar(sources,
                           label="Processing JLPT Files -> "
                           ) as progress:

        for path in progress:
            found = level_re.search(path.stem)
            if found is None:
                # No level marker in the file name: nothing to import.
                continue
            level = int(found.group(1))

            with open(path, 'r', encoding='utf-8') as handle:
                rows = json.load(handle)

            filename = path.name
            if "vocab" in filename:
                model = JlptVocab
                for row in rows:
                    row['level'] = level
            elif "kanji" in filename:
                model = JlptKanji
                for row in rows:
                    row['level'] = level
                    # Rename the JSON keys to the column names.
                    row['on_yomi'] = row.pop('on')
                    row['kun_yomi'] = row.pop('kun')
            elif "grammar" in filename:
                model = JlptGrammar
                for row in rows:
                    row['level'] = level
                    # The examples list is stored as one newline-joined text.
                    row['examples'] = "\n".join(row['examples'])
            else:
                continue

            session.bulk_insert_mappings(model, rows)

    session.commit()
    click.echo("\nJLPT Tables Populated")

populate_jmdict(session)

Click command helper which populates JMDict tables in the database.

Parameters:

Name Type Description Default
session Session

SQLAlchemy Session object

required
Source code in kotobase/src/kotobase/db_builder/build_database.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def populate_jmdict(session: SessionType) -> None:
    """
    Click command helper which fills the JMDict tables from the JSON file.

    Reads `JMDICT_PATH`, flattens each entry into rows for the entry, kanji,
    kana and sense tables, bulk-inserts them and commits. Only the first
    occurrence of an entry id is used; later duplicates are ignored.

    Args:
      session (Session): SQLAlchemy Session object
    """
    click.echo("Populating JMDict Tables ...")
    with open(JMDICT_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    entries = []
    kanji_entries = []
    kana_entries = []
    sense_entries = []
    seen_ids: set[int] = set()
    with click.progressbar(data,
                           label="\nPreparing JMDict Data -> "
                           ) as bar:
        for item in bar:
            entry_id = item['id']

            # Skip Duplicates
            if entry_id in seen_ids:
                continue
            seen_ids.add(entry_id)

            entries.append({'id': entry_id, 'rank': item.get('rank', 99)})

            # Kanji Readings
            kanji_entries.extend(
                {'entry_id': entry_id,
                 'order': kanji.get('order', 0),
                 'text': kanji['text']}
                for kanji in item.get('kanji', [])
            )

            # Kana Readings
            kana_entries.extend(
                {'entry_id': entry_id,
                 'order': kana.get('order', 0),
                 'text': kana['text']}
                for kana in item.get('kana', [])
            )

            # Senses: part-of-speech tags and glosses are each stored as one
            # comma-separated string.
            sense_entries.extend(
                {'entry_id': entry_id,
                 'order': sense.get('order'),
                 'pos': ", ".join(sense.get('pos', [])),
                 'gloss': ", ".join(sense.get('gloss', []))}
                for sense in item.get('senses', [])
            )

    click.echo("\nInserting JMDict Data -> ")
    for model, rows in ((JMDictEntry, entries),
                        (JMDictKanji, kanji_entries),
                        (JMDictKana, kana_entries),
                        (JMDictSense, sense_entries)):
        session.bulk_insert_mappings(model, rows)
    session.commit()
    click.echo("\nJMDict Tables Populated")

populate_jmnedict(session)

Click command helper which populates JMNeDict tables in the database.

Parameters:

Name Type Description Default
session Session

SQLAlchemy Session object

required
Source code in kotobase/src/kotobase/db_builder/build_database.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def populate_jmnedict(session: SessionType) -> None:
    """
    Click command helper which fills the JMnedict table from the JSON file.

    Reads `JMNEDICT_PATH` and stores one row per entry. Kanji, kana,
    translation types and translations are each flattened into a single
    comma-separated string before the rows are bulk-inserted and committed.

    Args:
      session (Session): SQLAlchemy Session object
    """

    click.echo("Populating JMnedict Table ...")
    with open(JMNEDICT_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    entries = []
    with click.progressbar(data,
                           label="\nPreparing JMnedict Data -> "
                           ) as bar:
        for item in bar:
            # Flatten the per-translation lists into two flat lists.
            types = []
            translations = []
            for group in item['translations']:
                types.extend(group['type'])
            for group in item['translations']:
                translations.extend(group['translation'])

            kanji_text = ", ".join(k['text'] for k in item['kanji'])
            kana_text = ", ".join(k['text'] for k in item['kana'])

            row = {
                'id': item['id'],
                'kanji': kanji_text,
                'kana': kana_text,
                'translation_type': ", ".join(types),
                'translation': ", ".join(translations),
            }
            entries.append(row)

    click.echo("\nInserting JMnedict Entries -> ")
    session.bulk_insert_mappings(JMnedictEntry, entries)
    session.commit()
    click.echo("\nJMnedict Table Populated")

populate_kanjidic(session)

Click command helper which populates KANJIDIC table in the database.

Parameters:

Name Type Description Default
session Session

SQLAlchemy Session object

required
Source code in kotobase/src/kotobase/db_builder/build_database.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
def populate_kanjidic(session: SessionType) -> None:
    """
    Click command helper which fills the KANJIDIC table from the JSON file.

    Reads `KANJIDIC2_PATH` and stores one row per character, keeping the
    `ja_on` readings, the `ja_kun` readings and the `en` meanings as
    comma-separated strings. Only the first listed stroke count is stored.

    Args:
      session (Session): SQLAlchemy Session object
    """

    click.echo("Populating Kanjidic Table ...")
    with open(KANJIDIC2_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    entries = []
    with click.progressbar(data,
                           label="Preparing Kanjidic Data ->"
                           ) as bar:
        for item in bar:
            readings = item['reading_meaning']['readings']
            on_readings = [r['value'] for r in readings
                           if r['type'] == 'ja_on']
            kun_readings = [r['value'] for r in readings
                            if r['type'] == 'ja_kun']
            meanings = [m['value']
                        for m in item['reading_meaning']['meanings']
                        if m['lang'] == 'en']

            # An empty stroke-count list is stored as NULL.
            stroke_counts = item['stroke_count']
            first_stroke_count = stroke_counts[0] if stroke_counts else None

            row = {
                'literal': item['literal'],
                'grade': item.get('grade'),
                'stroke_count': first_stroke_count,
                'jlpt': item.get('jlpt'),
                'on_readings': ", ".join(on_readings),
                'kun_readings': ", ".join(kun_readings),
                'meanings': ", ".join(meanings),
            }
            entries.append(row)

    click.echo("\nInserting Kanjidic Entries -> ")
    session.bulk_insert_mappings(Kanjidic, entries)
    session.commit()
    click.echo("\nKanjidic Table Populated")

populate_tatoeba(session)

Click command helper which populates Tatoeba table in the database.

Parameters:

Name Type Description Default
session Session

SQLAlchemy Session object

required
Source code in kotobase/src/kotobase/db_builder/build_database.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
def populate_tatoeba(session: SessionType) -> None:
    """
    Click command helper which fills the Tatoeba table from the JSON file.

    Reads `TATOEBA_PATH` and stores the `id` and `text` of every sentence;
    any other keys in the JSON records are not written to the table.

    Args:
      session (Session): SQLAlchemy Session object
    """

    click.echo("Populating Tatoeba Table ...")
    with open(TATOEBA_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    with click.progressbar(data,
                           label="\nPreparing Tatoeba Data -> "
                           ) as bar:
        entries = [
            {'id': sentence['id'], 'text': sentence['text']}
            for sentence in bar
        ]

    click.echo("\nInserting Tatoeba Entries ->")
    session.bulk_insert_mappings(TatoebaSentence, entries)
    session.commit()
    click.echo("Tatoeba Table Populated.")

download

This module defines the helper function which downloads the zipped source files and extracts their content for processing.

download_and_extract(url, output_filename)

Click command helper which downloads a file from a URL and extracts it if it is compressed.

Parameters:

Name Type Description Default
url str

File download URL

required
output_filename str

Path of where to save the file.

required
Source code in kotobase/src/kotobase/db_builder/download.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def download_and_extract(url: str,
                         output_filename: str
                         ) -> None:
    """
    Click command helper which downloads a file
    from a URL and extracts it if it is compressed.

    The file is streamed to `RAW_DATA_DIR / output_filename`. When the URL
    ends in `.gz` or `.bz2` the download is decompressed next to it (same
    name without the last suffix) and the compressed file is removed.
    Any failure is reported on stderr and terminates the process with
    exit status 1.

    Args:
      url (str): File download URL
      output_filename (str): Path of where to save the file.
    """
    # Imported here so the block does not depend on the module's import list.
    import shutil

    # URL suffix -> (function opening the archive, colour of the final
    # status line).
    decompressors = {
        ".gz": (gzip.open, "green"),
        ".bz2": (bz2.open, "bright_green"),
    }

    try:
        output_path = RAW_DATA_DIR / output_filename
        # Delete if it already exists
        output_path.unlink(missing_ok=True)
        click.secho(f"Downloading '{url}' ...",
                    fg="blue")

        # The context manager releases the connection even when the transfer
        # fails; the timeout stops a stalled server from hanging the build.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        click.secho("  Download Successful",
                    fg="green")

        for suffix, (opener, done_color) in decompressors.items():
            if not url.endswith(suffix):
                continue

            uncompressed_path = output_path.with_suffix('')
            # Delete if it already exists
            uncompressed_path.unlink(missing_ok=True)
            click.secho(f"  Extracting to '{uncompressed_path}' ...",
                        fg="blue")
            with opener(output_path, "rb") as f_in:
                with open(uncompressed_path, "wb") as f_out:
                    # Copy in chunks instead of holding the whole
                    # decompressed file in memory.
                    shutil.copyfileobj(f_in, f_out)
            output_path.unlink()
            click.secho("  Done!",
                        fg=done_color)
            break
    except Exception as e:
        click.secho(f"Error while downloading '{url}' : {e}",
                    fg="red",
                    err=True
                    )
        sys.exit(1)

main()

Main download function to download all data sources from URLs specified in the config module.

Source code in kotobase/src/kotobase/db_builder/download.py
85
86
87
88
89
90
91
92
93
94
def main() -> None:
    """
    Download every data source listed in the `config` module.

    Creates `RAW_DATA_DIR` if needed, then fetches and extracts the JMdict,
    JMnedict, KANJIDIC2 and Tatoeba archives in that order.
    """
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

    sources = (
        (JMDICT_URL, "JMdict_e.xml.gz"),
        (JMNEDICT_URL, "JMnedict.xml.gz"),
        (KANJIDIC2_URL, "kanjidic2.xml.gz"),
        (TATOEBA_URL, "jpn_sentences.tsv.bz2"),
    )
    for source_url, archive_name in sources:
        download_and_extract(source_url, archive_name)

process_jmdict

This module defines the helper function which processes the raw JMDict XML file into a JSON file using XSLT transform for performance.

parse_jmdict()

Click helper function which parses JMdict_e.xml and saves it as a JSON file using an embedded XSLT.

Source code in kotobase/src/kotobase/db_builder/process_jmdict.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def parse_jmdict() -> None:
    """
    Click helper function which parses JMdict_e.xml
    and saves it as a JSON file using an embedded XSLT.

    The XSLT output is consumed as text, one entry per line, with five
    `|`-separated fields: id, kanji, kana, priority and senses. Items inside
    a field are separated by `~`; senses are separated by `^` and each sense
    is `glosses;parts-of-speech`. Lines that do not have exactly five fields
    are skipped. Each entry is written to the JSON file exactly once.
    """

    raw_path = RAW_JMDICT_PATH
    processed_path = JMDICT_PATH
    # Delete if it already exists
    processed_path.unlink(missing_ok=True)

    processed_path.parent.mkdir(parents=True, exist_ok=True)

    click.echo(f"Parsing '{raw_path.name}' with embedded XSLT ...")

    xml_doc = etree.parse(str(raw_path))
    xslt_doc = etree.parse(BytesIO(XSLT_TRANSFORM))
    transform = etree.XSLT(xslt_doc)

    result_tree = transform(xml_doc)

    entries = []
    lines = str(result_tree).splitlines()

    with click.progressbar(lines,
                           label="Assembling JSON -> ",
                           item_show_func=lambda x: "") as bar:
        for line in bar:
            parts = line.split('|')
            if len(parts) != 5:
                continue

            entry_id, kanji_str, kana_str, pri_str, senses_str = parts
            pri_list = [p for p in pri_str.split('~') if p]
            entry_rank = _rank(pri_list)

            senses = []
            for j, sense_part in enumerate(senses_str.split('^')):
                if not sense_part:
                    continue
                if ';' in sense_part:
                    gloss_str, pos_str = sense_part.split(';', 1)
                else:
                    gloss_str, pos_str = sense_part, ""

                senses.append({
                    "order": j,
                    "gloss": [g for g in gloss_str.split('~') if g],
                    "pos": [p for p in pos_str.split('~') if p]
                })

            # Append once per entry, after all of its senses are collected.
            # Appending inside the sense loop would emit one copy of the
            # entry per sense and drop entries that have no senses.
            entries.append({
                "id": int(entry_id),
                "rank": entry_rank,
                "kanji": [
                    {"text": k, "order": idx}
                    for idx, k in enumerate(kanji_str.split('~')) if k
                    ],
                "kana": [{"text": k, "order": idx}
                         for idx, k in enumerate(kana_str.split('~')) if k
                         ],
                "senses": senses
                })

    click.echo(
        f"\nWriting {len(entries)} entries to '{processed_path.name}' ..."
        )
    with open(processed_path, 'w', encoding='utf-8') as f:
        json.dump(entries, f, ensure_ascii=False)

    click.secho("Successfully Processed JMDict.", fg="green")

process_jmnedict

This module defines the helper function which processes the raw JMNeDict XML file into a JSON file using XSLT transform for performance.

parse_jmnedict()

Click helper function which parses JMNedict_e.xml and saves it as a JSON file using an embedded XSLT.

Source code in kotobase/src/kotobase/db_builder/process_jmnedict.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def parse_jmnedict():
    """
    Click helper function which converts the raw JMnedict XML file
    (`RAW_JMNEDICT_PATH`) to JSON (`JMNEDICT_PATH`) using an embedded XSLT.

    The XSLT output is consumed as text, one entry per line, with four
    `|`-separated fields: id, kanji, kana and translations. Items inside a
    field are separated by `~`; translations are separated by `^` and each
    one is `types;translations`. Lines without exactly four fields are
    skipped.
    """

    source_path = RAW_JMNEDICT_PATH
    target_path = JMNEDICT_PATH
    # Remove any previous output before writing a new one.
    target_path.unlink(missing_ok=True)

    target_path.parent.mkdir(parents=True, exist_ok=True)

    click.echo(f"Parsing '{source_path.name}' with embedded XSLT ...")

    # The XML comes from disk, the stylesheet from the embedded bytes.
    document = etree.parse(str(source_path))
    stylesheet = etree.parse(BytesIO(XSLT_TRANSFORM))
    apply_xslt = etree.XSLT(stylesheet)

    # The transformation itself runs inside lxml.
    transformed = apply_xslt(document)

    # Turn the simplified text output into JSON-ready dictionaries.
    entries = []
    lines = str(transformed).splitlines()

    with click.progressbar(lines,
                           label="Assembling JSON -> ",
                           item_show_func=lambda x: "") as bar:
        for line in bar:
            fields = line.split('|')
            if len(fields) != 4:
                continue
            entry_id, kanji_str, kana_str, trans_str = fields

            translations = []
            for chunk in trans_str.split('^'):
                if not chunk:
                    continue
                # Text before the first ';' holds the types, the rest the
                # translations; without a ';' everything is a type.
                type_str, _, detail_str = chunk.partition(';')
                translations.append({
                    "type": [t for t in type_str.split('~') if t],
                    "translation": [d for d in detail_str.split('~') if d]
                })

            kanji = [{"text": k} for k in kanji_str.split('~') if k]
            kana = [{"text": k} for k in kana_str.split('~') if k]
            entries.append({
                "id": int(entry_id),
                "kanji": kanji,
                "kana": kana,
                "translations": translations
            })

    click.echo(
        f"\nWriting {len(entries)} entries to '{target_path.name}' ...")
    with open(target_path, 'w', encoding='utf-8') as out:
        json.dump(entries, out, ensure_ascii=False)

    click.secho("Successfully Processed JMnedict.", fg="green")

process_kanjidic

This module defines the helper function which processes the raw KANJIDIC2 XML file into a JSON file.

parse_kanjidic()

Click helper function which parses kanjidic.xml and saves it as a JSON file.

Source code in kotobase/src/kotobase/db_builder/process_kanjidic.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def parse_kanjidic() -> None:
    """
    Click helper function which parses kanjidic.xml
    and saves it as a JSON file.

    Streams the `character` elements of `RAW_KANJIDIC2_PATH` and writes one
    JSON object per character to `KANJIDIC2_PATH`, holding its literal,
    codepoints, radicals, misc data (grade, stroke counts, variants,
    frequency, JLPT level), dictionary references, query codes, readings
    and meanings.
    """

    raw_path = RAW_KANJIDIC2_PATH
    processed_path = KANJIDIC2_PATH
    # Delete if it already exists
    processed_path.unlink(missing_ok=True)
    processed_path.parent.mkdir(parents=True, exist_ok=True)

    click.echo(f"Parsing '{raw_path.name}' ...")

    characters = []
    # Use iterparse for memory-efficient parsing
    with click.progressbar(etree.iterparse(raw_path, tag='character'),
                           label="Processing characters -> ",
                           item_show_func=lambda x: ""
                           ) as bar:
        for _, element in bar:
            character = {
                "literal": element.findtext('literal'),
                "codepoint": [
                    {"type": cp.get('cp_type'), "value": cp.text}
                    for cp in element.findall('codepoint/cp_value')
                ],
                "radical": [
                    {"type": rad.get('rad_type'), "value": rad.text}
                    for rad in element.findall('radical/rad_value')
                ],
                "grade": element.findtext('misc/grade'),
                "stroke_count": [
                    sc.text for sc in element.findall('misc/stroke_count')],
                "variants": [
                    {"type": var.get('var_type'), "value": var.text}
                    for var in element.findall('misc/variant')
                ],
                "freq": element.findtext('misc/freq'),
                "jlpt": element.findtext('misc/jlpt'),
                "dic_number": [
                    {"type": dr.get('dr_type'),
                     "m_vol": dr.get('m_vol'),
                     "m_page": dr.get('m_page'),
                     "value": dr.text}
                    for dr in element.findall('dic_number/dic_ref')
                ],
                "query_code": [
                    {"type": qc.get('qc_type'),
                     "skip_misclass": qc.get('skip_misclass'),
                     "value": qc.text}
                    for qc in element.findall('query_code/q_code')
                ],
                "reading_meaning": {
                    "readings": [
                        {"type": r.get('r_type'),
                         "on_type": r.get('on_type'),
                         "r_status": r.get('r_status'),
                         "value": r.text}
                        for r in element.findall(
                            'reading_meaning/rmgroup/reading')
                    ],
                    "meanings": [
                        # A meaning without an m_lang attribute is
                        # recorded as English.
                        {"lang": m.get('m_lang', 'en'),
                         "value": m.text}
                        for m in element.findall(
                            'reading_meaning/rmgroup/meaning')
                    ]
                }
            }

            characters.append(character)
            # Free up memory: empty the processed element and drop the
            # siblings that precede it in the tree.
            element.clear()
            while element.getprevious() is not None:
                del element.getparent()[0]

    # Adjacent literals keep the message on one output line; a backslash
    # continuation inside the string would embed the next line's indentation.
    click.echo(f"\nWriting {len(characters)} characters "
               f"to '{processed_path.name}' ...")
    with open(processed_path, 'w', encoding='utf-8') as f:
        json.dump(characters, f, ensure_ascii=False)

    click.secho("Successfully Processed Kanjidic2.", fg="green")

process_tatoeba

This module defines the helper function which processes the raw Tatoeba Examples TSV file into a JSON file.

parse_tatoeba()

Click helper function which parses tatoeba.tsv and saves it as a JSON file.

Source code in kotobase/src/kotobase/db_builder/process_tatoeba.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def parse_tatoeba() -> None:
    """
    Click helper function which parses tatoeba.tsv
    and saves it as a JSON file.

    Reads `RAW_TATOEBA_PATH` as tab-separated text and writes every row that
    has exactly three columns (id, language, text) to `TATOEBA_PATH`; rows
    with a different column count are skipped.
    """
    raw_path = RAW_TATOEBA_PATH
    processed_path = TATOEBA_PATH
    # Delete if it already exists
    processed_path.unlink(missing_ok=True)
    processed_path.parent.mkdir(parents=True, exist_ok=True)

    click.echo(f"Parsing {raw_path.name}...")

    sentences = []
    with open(raw_path, 'r', encoding='utf-8') as f:
        # Get total number of lines for progress bar
        total_lines = sum(1 for _ in f)
        f.seek(0)

        # QUOTE_NONE: quote characters in sentences are ordinary text.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        with click.progressbar(reader, length=total_lines,
                               label="Processing sentences -> ") as bar:
            for row in bar:
                if len(row) == 3:
                    sentences.append({
                        "id": int(row[0]),
                        "lang": row[1],
                        "text": row[2]
                    })

    # Adjacent literals keep the message on one output line; a backslash
    # continuation inside the string would embed the next line's indentation.
    click.echo(f"\nWriting {len(sentences)} "
               f"sentences to '{processed_path.name}' ...")
    with open(processed_path, 'w', encoding='utf-8') as f:
        json.dump(sentences, f, ensure_ascii=False)

    click.secho("Successfully Processed Tatoeba Sentences.", fg="green")

pull

This module defines the click command which pulls a pre-built database from a public Google Drive folder.

pull_db(force)

Downloads the latest Kotobase database from Google Drive.

Source code in kotobase/src/kotobase/db_builder/pull.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Fetches the pre-built database and its build log with gdown instead of
# building locally. The function docstring doubles as the command's `--help`
# text, which is why the longer explanation lives in comments instead.
@click.command('pull-db')
@click.option('--force',
              is_flag=True,
              help="Force re-download even if the file exists."
              )
def pull_db(force):
    """
    Downloads the latest Kotobase database from Google Drive.
    """

    if DATABASE_PATH.exists() and not force:
        click.echo("Database file already exists. Use --force to re-download.")
        return
    elif DATABASE_PATH.exists() and force:
        try:
            DATABASE_PATH.unlink()
            DB_BUILD_LOG_PATH.unlink(missing_ok=True)
            click.secho("Deleted Old Database File", fg="green")

        except FileNotFoundError:
            # The file vanished between the exists() check and unlink().
            click.secho("Database File Doesn't Exist, Remove '--force' flag.",
                        fg="red",
                        err=True
                        )
            sys.exit(1)
        except PermissionError:
            click.secho("No Permission To Delete Database File",
                        fg="red",
                        err=True
                        )
            sys.exit(1)
        except Exception as e:
            click.secho(
                f"Unexpected Error While Deleting Database File: {e}",
                fg="red",
                err=True
                )
            sys.exit(1)

    click.secho("Pulling Latest From Drive...",
                fg="blue")

    try:
        # Use the file ID directly
        gdown.download(id=DRIVE_FILE_ID,
                       output=str(DATABASE_PATH),
                       quiet=False)
        click.secho("Pulling Build Log...")
        # Also pull build log
        gdown.download(id=DRIVE_LOG_FILE_ID,
                       output=str(DB_BUILD_LOG_PATH),
                       quiet=False)
        click.secho("Database downloaded successfully.", fg="green")
    except Exception as e:
        # Reported on stderr like the other failures in this command.
        click.secho(f"An error occurred: {e}", fg="red", err=True)
        # Adjacent literals keep the hint on one output line; a backslash
        # continuation inside the string would embed the next line's
        # indentation.
        click.echo("Please try building the "
                   "database manually with 'kotobase build'.",
                   err=True)