chore: json dataset
13477  cn_en_wn_dataset.json  Normal file
File diff suppressed because it is too large
89  gen_alpaca.py  Normal file
@@ -0,0 +1,89 @@
import sqlite3
import json
import random
from typing import List, Dict, Any
from pathlib import Path


def create_alpaca_dataset(
    db_path: str, output_path: str, samples_per_book: int = 155
) -> None:
    """
    Create an Alpaca-style JSON dataset for Chinese-to-English translation.

    Args:
        db_path: Path to the SQLite database
        output_path: Path where the JSON dataset will be saved
        samples_per_book: Maximum number of samples to take from each book_id
    """
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "select distinct book_id from paragraph_chunks where text_en is not null and text_zh is not null"
    )
    book_ids = [row[0] for row in cursor.fetchall()]

    dataset: List[Dict[str, Any]] = []

    for book_id in book_ids:
        # get samples for the current book_id
        cursor.execute(
            """
            select text_zh, text_en
            from paragraph_chunks
            where book_id = ?
            and text_en is not null
            and text_zh is not null
            and length(text_zh) > 0
            and length(text_en) > 0
            """,
            (book_id,),
        )

        samples = cursor.fetchall()
        if not samples:
            continue
        selected_samples = random.sample(samples, min(len(samples), samples_per_book))
        # Alpaca format
        for zh_text, en_text in selected_samples:
            entry = {
                "instruction": "Translate the following Chinese text to English:",
                "input": zh_text.strip(),
                "output": en_text.strip(),
            }
            dataset.append(entry)

    conn.close()
    random.shuffle(dataset)

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    print(f"Dataset created successfully with {len(dataset)} total samples")
    print(f"Number of unique books: {len(book_ids)}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate Alpaca-style translation dataset"
    )
    parser.add_argument(
        "--db_path", type=str, required=True, help="Path to SQLite database"
    )
    parser.add_argument(
        "--output_path", type=str, required=True, help="Path for output JSON file"
    )
    parser.add_argument(
        "--samples_per_book",
        type=int,
        default=155,
        help="Maximum number of samples to take from each book_id",
    )

    args = parser.parse_args()
    create_alpaca_dataset(args.db_path, args.output_path, args.samples_per_book)
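For orientation, a minimal sketch of how the entry point above might be driven directly from Python rather than through its argparse CLI; the database path below is a placeholder rather than a file shipped with this commit, and the output path simply reuses the dataset name added here:

from gen_alpaca import create_alpaca_dataset

# Placeholder database path; the real location depends on your setup.
create_alpaca_dataset(
    db_path="books.db",
    output_path="cn_en_wn_dataset.json",
    samples_per_book=155,  # same default cap per book_id as the --samples_per_book flag
)

The equivalent CLI call would be: python gen_alpaca.py --db_path books.db --output_path cn_en_wn_dataset.json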
249  paragraph_ctx_collect.py  Normal file
@@ -0,0 +1,249 @@
from typing import List, Tuple
import sqlite3
import re


def get_chapter_paragraphs(
    cursor: sqlite3.Cursor, book_id: str, chapter_id: str
) -> Tuple[List[str], List[str]]:
    """
    Gets all paragraphs for a specific chapter.
    Returns (english_paragraphs, chinese_paragraphs).
    """
    cursor.execute(
        """
        select text_en, text_zh
        from paragraphs
        where book_id = ? and chapter_id = ?
        """,
        (book_id, chapter_id),
    )

    en_texts = []
    zh_texts = []
    for en, zh in cursor.fetchall():
        if en and zh:  # skip empty paragraphs
            en_texts.append(en.strip())
            zh_texts.append(zh.strip())

    return en_texts, zh_texts


def get_text_state(text: str) -> tuple[int, bool, bool]:
    """
    Analyzes text for continuity markers.
    Returns (bracket_change, ends_with_colon, incomplete_sentence).

    Args:
        text: String to analyze

    Returns:
        tuple containing:
        - int: Net change in bracket depth (positive for unclosed, negative for extra closing)
        - bool: Whether the text ends with a colon
        - bool: Whether the text ends without proper sentence termination
    """
    if not text:
        return 0, False, False

    # count bracket balance
    opens = len(re.findall(r"[【「『]", text))
    closes = len(re.findall(r"[】」』]", text))
    ends_with_punct = bool(re.search(r"[.!?。!?]\s*$", text.rstrip()))

    return (opens - closes, text.rstrip().endswith(":"), not ends_with_punct)


def create_chunks(
    en_texts: List[str],
    zh_texts: List[str],
    target_size: int = 1024,
    min_size: int = 512,
    max_size: int = 2048,
) -> List[Tuple[str, str]]:
    """
    Creates parallel text chunks respecting continuity markers and size constraints.

    Args:
        en_texts: List of English text paragraphs
        zh_texts: List of corresponding Chinese text paragraphs
        target_size: Ideal size for each chunk in characters
        min_size: Minimum acceptable chunk size
        max_size: Maximum acceptable chunk size

    Returns:
        List of tuples containing (english_chunk, chinese_chunk)
    """
    chunks = []
    current_en = []
    current_zh = []
    current_chars = 0
    bracket_depth = 0

    i = 0
    while i < len(en_texts):
        current_text = en_texts[i]
        para_chars = len(current_text)
        bracket_change, ends_with_colon, incomplete_sentence = get_text_state(
            current_text
        )
        bracket_depth += bracket_change

        # check if adding would exceed max_size
        if current_chars + para_chars > max_size:
            # only split if we're not inside brackets, the sentence is complete, and min_size is met
            if (
                bracket_depth <= 0
                and not incomplete_sentence
                and current_chars >= min_size
            ):
                chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
                current_en = []
                current_zh = []
                current_chars = 0

        # add the current paragraph
        current_en.append(current_text)
        current_zh.append(zh_texts[i])
        current_chars += para_chars

        # can we create a chunk?
        next_exists = i + 1 < len(en_texts)
        if (
            current_chars >= target_size
            and bracket_depth <= 0
            and not ends_with_colon
            and not incomplete_sentence
            and next_exists
        ):
            chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
            current_en = []
            current_zh = []
            current_chars = 0
            bracket_depth = 0

        i += 1

    # add remaining text if it meets min_size
    if current_chars >= min_size:
        chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))

    return chunks


def create_chunk_table(cursor: sqlite3.Cursor):
    """Creates the paragraph_chunks table if it doesn't exist"""
    cursor.execute(
        """
        create table if not exists paragraph_chunks (
            id integer primary key autoincrement,
            book_id text not null,
            chapter_id text not null,
            chunk_index integer not null,
            text_en text,
            text_zh text,
            char_count integer,
            foreign key (book_id, chapter_id) references chapters(book_id, chapter_id),
            unique(book_id, chapter_id, chunk_index)
        )
        """
    )


def store_book_chunks(db_path: str, book_id: str):
    """Process a book and store its chunks in the database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    create_chunk_table(cursor)
    chunks_by_chapter = process_book(db_path, book_id)

    for chapter_id, chapter_chunks in chunks_by_chapter:
        for i, (en_chunk, zh_chunk) in enumerate(chapter_chunks):
            cursor.execute(
                """
                insert into paragraph_chunks
                (book_id, chapter_id, chunk_index, text_en, text_zh, char_count)
                values (?, ?, ?, ?, ?, ?)
                on conflict(book_id, chapter_id, chunk_index)
                do update set
                    text_en = excluded.text_en,
                    text_zh = excluded.text_zh,
                    char_count = excluded.char_count
                """,
                (book_id, chapter_id, i, en_chunk, zh_chunk, len(en_chunk)),
            )

    conn.commit()
    conn.close()


def process_book(db_path: str, book_id: str) -> List[Tuple[str, List[Tuple[str, str]]]]:
    """
    Process a book chapter by chapter, respecting chapter boundaries.
    Returns a list of (chapter_id, chapter_chunks) tuples.
    """
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    cursor.execute(
        """
        select distinct chapter_id
        from paragraphs
        where book_id = ?
        order by chapter_id
        """,
        (book_id,),
    )

    chapter_ids = [row[0] for row in cursor.fetchall()]
    all_chapter_chunks = []

    for chapter_id in chapter_ids:
        en_texts, zh_texts = get_chapter_paragraphs(cursor, book_id, chapter_id)
        if en_texts and zh_texts:  # skip empty chapters
            chapter_chunks = create_chunks(en_texts, zh_texts)
            all_chapter_chunks.append((chapter_id, chapter_chunks))

    conn.close()
    return all_chapter_chunks


def process_all_books(db_path: str):
    """Process and store chunks for all books in the database"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("select book_id from books")
    book_ids = [row[0] for row in cursor.fetchall()]
    conn.close()

    for book_id in book_ids:
        print(f"Processing and storing book: {book_id}")
        store_book_chunks(db_path, book_id)


if __name__ == "__main__":
    import sys

    if len(sys.argv) == 3 and sys.argv[1] == "--store":
        db_path = sys.argv[2]
        process_all_books(db_path)

    else:
        # test
        test_en = [
            "On it were words left by Wen Jin's parents:",
            "【We learned from the news that you two got married.",
            "Take care of each other in the future, if you need anything,",
            "talk to us, even though you may not need to.",
            "From Mom and Dad.】",
            "After reading this, Wen Jin felt:",
            "A complex mix of emotions surged through him.",
            'Returning home with the parcels, Jiang Wan asked him, "Should the shoes be unpacked?"',
        ]
        test_zh = ["zh" + str(i) for i in range(len(test_en))]

        chunks = create_chunks(test_en, test_zh, target_size=1024)
        for i, (en, zh) in enumerate(chunks, 1):
            print(f"\nChunk {i}:")
            print(en)
            print("-" * 40)
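As a rough usage sketch for the module above (not part of the commit), its functions can also be called directly; the database path and book_id below are placeholders:

from paragraph_ctx_collect import process_all_books, process_book

# Equivalent to `python paragraph_ctx_collect.py --store books.db`; path is a placeholder.
process_all_books("books.db")

# Or build chunks for a single (hypothetical) book without writing them back:
for chapter_id, chapter_chunks in process_book("books.db", "example-book"):
    print(chapter_id, len(chapter_chunks))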
12  schema.sql
@@ -21,3 +21,15 @@ create table if not exists paragraphs (
    char_count integer,
    foreign key (book_id, chapter_id) references chapters(book_id, chapter_id)
);

create table if not exists paragraph_chunks (
    id integer primary key autoincrement,
    book_id text not null,
    chapter_id text not null,
    chunk_index integer not null,
    text_en text,
    text_zh text,
    char_count integer,
    foreign key (book_id, chapter_id) references chapters(book_id, chapter_id),
    unique(book_id, chapter_id, chunk_index)
);
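The unique(book_id, chapter_id, chunk_index) constraint added here is what the on conflict upsert in store_book_chunks targets. A small sanity check of the populated table, with the database path again a placeholder:

import sqlite3

conn = sqlite3.connect("books.db")  # placeholder path
cursor = conn.cursor()
cursor.execute("select book_id, count(*) from paragraph_chunks group by book_id")
for book_id, n_chunks in cursor.fetchall():
    print(book_id, n_chunks)
conn.close()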