chore: json dataset

2025-02-09 14:30:25 +06:00
parent 94babaa7aa
commit fd380e250d
4 changed files with 13827 additions and 0 deletions

13477
cn_en_wn_dataset.json Normal file

File diff suppressed because it is too large

89
gen_alpaca.py Normal file

@@ -0,0 +1,89 @@
import sqlite3
import json
import random
from typing import List, Dict, Any
from pathlib import Path
def create_alpaca_dataset(
db_path: str, output_path: str, samples_per_book: int = 155
) -> None:
"""
Create an Alpaca-style JSON dataset for Chinese to English translation.
Args:
db_path: Path to the SQLite database
output_path: Path where the JSON dataset will be saved
samples_per_book: Maximum number of samples to take from each book_id
"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute(
"select distinct book_id from paragraph_chunks where text_en is not null and text_zh is not null"
)
book_ids = [row[0] for row in cursor.fetchall()]
dataset: List[Dict[str, Any]] = []
for book_id in book_ids:
# get samples for current book_id
cursor.execute(
"""
select text_zh, text_en
from paragraph_chunks
where book_id = ?
and text_en is not null
and text_zh is not null
and length(text_zh) > 0
and length(text_en) > 0
""",
(book_id,),
)
samples = cursor.fetchall()
if not samples:
continue
selected_samples = random.sample(samples, min(len(samples), samples_per_book))
# Alpaca format
for zh_text, en_text in selected_samples:
entry = {
"instruction": "Translate the following Chinese text to English:",
"input": zh_text.strip(),
"output": en_text.strip(),
}
dataset.append(entry)
conn.close()
random.shuffle(dataset)
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(dataset, f, ensure_ascii=False, indent=2)
print(f"Dataset created successfully with {len(dataset)} total samples")
print(f"Number of unique books: {len(book_ids)}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="Generate Alpaca-style translation dataset"
)
parser.add_argument(
"--db_path", type=str, required=True, help="Path to SQLite database"
)
parser.add_argument(
"--output_path", type=str, required=True, help="Path for output JSON file"
)
parser.add_argument(
"--samples_per_book",
type=int,
default=155,
help="Maximum number of samples to take from each book_id",
)
args = parser.parse_args()
create_alpaca_dataset(args.db_path, args.output_path, args.samples_per_book)
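
A minimal usage sketch (not part of this commit): the command line and paths below are hypothetical placeholders, but the entry shape mirrors the dictionary built in create_alpaca_dataset above.

# Hypothetical invocation; database and output paths are placeholders:
#   python gen_alpaca.py --db_path books.db --output_path cn_en_wn_dataset.json --samples_per_book 155
# Each element of the output JSON list is expected to look like:
example_entry = {
    "instruction": "Translate the following Chinese text to English:",
    "input": "中文原文段落",  # text_zh from paragraph_chunks, stripped
    "output": "English translation of the paragraph",  # text_en, stripped
}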

249
paragraph_ctx_collect.py Normal file

@@ -0,0 +1,249 @@
from typing import List, Tuple
import sqlite3
import re
def get_chapter_paragraphs(
cursor: sqlite3.Cursor, book_id: str, chapter_id: str
) -> Tuple[List[str], List[str]]:
"""
Gets all paragraphs for a specific chapter.
Returns (english_paragraphs, chinese_paragraphs).
"""
cursor.execute(
"""
select text_en, text_zh
from paragraphs
where book_id = ? and chapter_id = ?
""",
(book_id, chapter_id),
)
en_texts = []
zh_texts = []
for en, zh in cursor.fetchall():
if en and zh: # Skip empty paragraphs
en_texts.append(en.strip())
zh_texts.append(zh.strip())
return en_texts, zh_texts
def get_text_state(text: str) -> tuple[int, bool, bool]:
"""
Analyzes text for continuity markers
Returns (bracket_change, ends_with_colon, incomplete_sentence)
Args:
text: String to analyze
Returns:
tuple containing:
- int: Net change in bracket depth (positive for unclosed, negative for extra closing)
- bool: Whether the text ends with a colon
- bool: Whether the text ends without proper sentence termination
"""
if not text:
return 0, False, False
# count bracket balance
opens = len(re.findall(r"[【「『]", text))
closes = len(re.findall(r"[】」』]", text))
ends_with_punct = bool(re.search(r"[.!?。!?]\s*$", text.rstrip()))
return (opens - closes, text.rstrip().endswith(":"), not ends_with_punct)
def create_chunks(
en_texts: List[str],
zh_texts: List[str],
target_size: int = 1024,
min_size: int = 512,
max_size: int = 2048,
) -> List[Tuple[str, str]]:
"""
Creates parallel text chunks respecting continuity markers and size constraints
Args:
en_texts: List of English text paragraphs
zh_texts: List of corresponding Chinese text paragraphs
target_size: Ideal size for each chunk in characters
min_size: Minimum acceptable chunk size
max_size: Maximum acceptable chunk size
Returns:
List of tuples containing (english_chunk, chinese_chunk)
"""
chunks = []
current_en = []
current_zh = []
current_chars = 0
bracket_depth = 0
i = 0
while i < len(en_texts):
current_text = en_texts[i]
para_chars = len(current_text)
bracket_change, ends_with_colon, incomplete_sentence = get_text_state(
current_text
)
bracket_depth += bracket_change
# check if adding would exceed max_size
if current_chars + para_chars > max_size:
# only split if we're not inside brackets, the sentence is complete, and we've met min_size
if (
bracket_depth <= 0
and not incomplete_sentence
and current_chars >= min_size
):
chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
current_en = []
current_zh = []
current_chars = 0
# add current paragraph
current_en.append(current_text)
current_zh.append(zh_texts[i])
current_chars += para_chars
# can we create a chunk?
next_exists = i + 1 < len(en_texts)
if (
current_chars >= target_size
and bracket_depth <= 0
and not ends_with_colon
and not incomplete_sentence
and next_exists
):
chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
current_en = []
current_zh = []
current_chars = 0
bracket_depth = 0
i += 1
# add remaining text if it meets min_size
if current_chars >= min_size:
chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
return chunks
def create_chunk_table(cursor: sqlite3.Cursor):
"""Creates the paragraph_chunks table if it doesn't exist"""
cursor.execute(
"""
create table if not exists paragraph_chunks (
id integer primary key autoincrement,
book_id text not null,
chapter_id text not null,
chunk_index integer not null,
text_en text,
text_zh text,
char_count integer,
foreign key (book_id, chapter_id) references chapters(book_id, chapter_id),
unique(book_id, chapter_id, chunk_index)
)
"""
)
def store_book_chunks(db_path: str, book_id: str):
"""Process a book and store its chunks in the database"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
create_chunk_table(cursor)
chunks_by_chapter = process_book(db_path, book_id)
for chapter_id, chapter_chunks in chunks_by_chapter:
for i, (en_chunk, zh_chunk) in enumerate(chapter_chunks):
cursor.execute(
"""
insert into paragraph_chunks
(book_id, chapter_id, chunk_index, text_en, text_zh, char_count)
values (?, ?, ?, ?, ?, ?)
on conflict(book_id, chapter_id, chunk_index)
do update set
text_en = excluded.text_en,
text_zh = excluded.text_zh,
char_count = excluded.char_count
""",
(book_id, chapter_id, i, en_chunk, zh_chunk, len(en_chunk)),
)
conn.commit()
conn.close()
def process_book(db_path: str, book_id: str) -> List[Tuple[str, List[Tuple[str, str]]]]:
"""
Process book chapter by chapter, respecting chapter boundaries
Returns list of (chapter_id, chapter_chunks) tuples
"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute(
"""
select distinct chapter_id
from paragraphs
where book_id = ?
order by chapter_id
""",
(book_id,),
)
chapter_ids = [row[0] for row in cursor.fetchall()]
all_chapter_chunks = []
for chapter_id in chapter_ids:
en_texts, zh_texts = get_chapter_paragraphs(cursor, book_id, chapter_id)
if en_texts and zh_texts: # skip empty chapters
chapter_chunks = create_chunks(en_texts, zh_texts)
all_chapter_chunks.append((chapter_id, chapter_chunks))
conn.close()
return all_chapter_chunks
def process_all_books(db_path: str):
"""Process and store chunks for all books in database"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("select book_id from books")
book_ids = [row[0] for row in cursor.fetchall()]
conn.close()
for book_id in book_ids:
print(f"Processing and storing book: {book_id}")
store_book_chunks(db_path, book_id)
if __name__ == "__main__":
import sys
if len(sys.argv) == 3 and sys.argv[1] == "--store":
db_path = sys.argv[2]
process_all_books(db_path)
else:
# quick manual test with sample paragraphs
test_en = [
"On it were words left by Wen Jin's parents:",
"【We learned from the news that you two got married.",
"Take care of each other in the future, if you need anything,",
"talk to us, even though you may not need to.",
"From Mom and Dad.】",
"After reading this, Wen Jin felt:",
"A complex mix of emotions surged through him.",
'Returning home with the parcels, Jiang Wan asked him, "Should the shoes be unpacked?"',
]
test_zh = ["zh" + str(i) for i in range(len(test_en))]
chunks = create_chunks(test_en, test_zh, target_size=1024)
for i, (en, zh) in enumerate(chunks, 1):
print(f"\nChunk {i}:")
print(en)
print("-" * 40)


@@ -21,3 +21,15 @@ create table if not exists paragraphs (
char_count integer,
foreign key (book_id, chapter_id) references chapters(book_id, chapter_id)
);
create table if not exists paragraph_chunks (
id integer primary key autoincrement,
book_id text not null,
chapter_id text not null,
chunk_index integer not null,
text_en text,
text_zh text,
char_count integer,
foreign key (book_id, chapter_id) references chapters(book_id, chapter_id),
unique(book_id, chapter_id, chunk_index)
);
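
A small inspection sketch against the new table, assuming the same hypothetical "books.db" path; the column names come from the schema above.

import sqlite3

# Count chunks and average chunk length per book (hypothetical path).
conn = sqlite3.connect("books.db")
rows = conn.execute(
    "select book_id, count(*), avg(char_count) from paragraph_chunks group by book_id"
)
for book_id, n_chunks, avg_chars in rows:
    print(book_id, n_chunks, round(avg_chars or 0))
conn.close()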