# zh-en-wn-dataset/paragraph_split.py
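"""Split aligned chapter texts into English/Chinese paragraph pairs.

Reads chapter texts from the `chapters` table of a SQLite database, recovers
paragraph boundaries from each book's unpacked EPUB XHTML files, aligns the
database lines against those paragraphs, and stores the resulting paragraph
pairs in a `paragraphs` table.
"""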
import sqlite3
import os
import re
from bs4 import BeautifulSoup
from pathlib import Path
import unicodedata
def create_paragraphs_table(conn):
"""Create the paragraphs table with necessary columns and constraints."""
conn.execute(
"""
create table if not exists paragraphs (
id integer primary key autoincrement,
book_id text not null,
chapter_id text not null,
text_en text,
text_zh text,
char_count integer,
foreign key (book_id, chapter_id) references chapters(book_id, chapter_id)
)
"""
)
def normalize_quotes(text):
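    """NFKC-normalize text and map curly/angle quotes and primes to ASCII quotes.

    Example: “it’s” becomes "it's".
    """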
    # normalize to Unicode NFKC (compatibility composed) form
text = unicodedata.normalize("NFKC", text)
quote_map = {
"\u201c": '"', # LEFT DOUBLE QUOTATION MARK
"\u201d": '"', # RIGHT DOUBLE QUOTATION MARK
"\u2018": "'", # LEFT SINGLE QUOTATION MARK
"\u2019": "'", # RIGHT SINGLE QUOTATION MARK
"\u00ab": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
"\u00bb": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
"\u2039": "'", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
"\u203a": "'", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
"\u2032": "'", # PRIME
"\u2033": '"', # DOUBLE PRIME
}
for old, new in quote_map.items():
text = text.replace(old, new)
return text
def strip_paragraph_markers(text):
"""remove p markers like #<# and #>#"""
return re.sub(r"#<#|#>#", "", text).strip()
def normalize_text(text):
"""text normalziations"""
text = normalize_quotes(text)
text = strip_paragraph_markers(text)
return text
def get_paragraphs_from_xhtml(xhtml_path):
"""extract p from html"""
with open(xhtml_path, "r", encoding="utf-8") as f:
content = f.read()
soup = BeautifulSoup(content, "html.parser")
paragraphs = []
p_elements = soup.find_all("p")
for p in p_elements:
text = normalize_text(p.get_text())
if text: # only add non-empty paragraphs
paragraphs.append(text)
    # Fallback: no <p> paragraphs found, so split the raw content on double <br> tags
if not paragraphs:
content = re.sub(r"<br\s*/>", "<br>", content, flags=re.IGNORECASE)
parts = re.split(r"<br>\s*<br>", content, flags=re.IGNORECASE)
for part in parts:
clean_text = BeautifulSoup(part, "html.parser").get_text()
text = normalize_text(clean_text)
if text: # only add non-empty paragraphs
paragraphs.append(text)
return paragraphs
def get_zh_text_for_lines(zh_lines, start_idx, end_idx):
"""Get corresponding Chinese text for given line range."""
return " ".join(zh_lines[start_idx : end_idx + 1])
def extract_paragraphs(text_en, text_zh):
"""
Extract matching paragraphs from English and Chinese texts.
Returns list of paragraphs with normalized text.
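
    Assumes the Chinese text mirrors the English line layout, so English line
    indices can be reused to pull the corresponding Chinese lines.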
"""
paragraphs = []
current_en_lines = []
current_en_indices = []
# split into lines and normalize
en_lines = [line.strip() for line in text_en.split("\n")]
zh_lines = [line.strip() for line in text_zh.split("\n")] if text_zh else []
i = 0
while i < len(en_lines):
line = en_lines[i]
normalized_line = normalize_text(line)
if not normalized_line:
i += 1
continue
current_en_lines.append(normalized_line)
current_en_indices.append(i)
# Look ahead to check if next line is empty or ends the paragraph
next_idx = i + 1
while next_idx < len(en_lines):
next_line = en_lines[next_idx].strip()
if not next_line:
# Empty line - continue current paragraph
next_idx += 1
continue
# If we have Chinese text, check if these lines correspond to a complete thought
if zh_lines:
zh_text = get_zh_text_for_lines(
zh_lines, current_en_indices[0], next_idx - 1
)
if zh_text:
# Found corresponding Chinese text - end paragraph
break
# Add next line to current paragraph
normalized_next = normalize_text(next_line)
current_en_lines.append(normalized_next)
current_en_indices.append(next_idx)
next_idx += 1
# Create paragraph
if current_en_lines:
en_text = " ".join(current_en_lines)
zh_text = ""
if zh_lines:
zh_text = get_zh_text_for_lines(
zh_lines, current_en_indices[0], current_en_indices[-1]
)
paragraphs.append(
{"text_en": en_text, "text_zh": zh_text, "char_count": len(en_text)}
)
# Reset for next paragraph
current_en_lines = []
current_en_indices = []
i = next_idx
return paragraphs
def match_paragraphs(xhtml_paragraphs, db_lines, lines_to_try=3):
"""
Match paragraphs from XHTML with lines from database.
    Tries the first few database lines before giving up, to handle chapter titles and initial dialog.
Args:
xhtml_paragraphs: List of XHTML paragraph texts
db_lines: List of database text lines
lines_to_try: Number of initial lines to try before giving up
Returns:
List of tuples containing (start_idx, end_idx) for matched paragraphs
"""
def find_next_content_line(current_idx):
"""Find next non-empty line and return its index and content."""
while current_idx < len(db_lines):
line = normalize_text(db_lines[current_idx].strip())
if line:
return current_idx, line
current_idx += 1
return current_idx, None
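    # Two cursors walk the inputs in parallel: db_idx over the database lines,
    # xhtml_idx over the XHTML paragraphs. Consecutive db lines found in the same
    # XHTML paragraph are grouped into one (start_idx, end_idx) range.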
matched_indices = []
xhtml_idx = 0
db_idx = 0
tried_lines = 0
while xhtml_idx < len(xhtml_paragraphs) and db_idx < len(db_lines):
# find next non-empty line in db
db_check_idx, db_line = find_next_content_line(db_idx)
if not db_line:
break
        # advance to the next XHTML paragraph that contains this db line
while (
xhtml_idx < len(xhtml_paragraphs)
and db_line not in xhtml_paragraphs[xhtml_idx]
):
xhtml_idx += 1
        # No paragraph matched. If nothing has matched yet, skip this db line and
        # retry from the first paragraph (handles chapter titles etc.); otherwise stop.
if xhtml_idx >= len(xhtml_paragraphs):
if not matched_indices and tried_lines < lines_to_try:
tried_lines += 1
xhtml_idx = 0
db_idx = db_check_idx + 1
continue
break
        # collect all consecutive database lines that belong to this paragraph
start_idx = db_check_idx
current_idx = db_check_idx
while current_idx < len(db_lines):
current_line = normalize_text(db_lines[current_idx].strip())
if current_line and current_line not in xhtml_paragraphs[xhtml_idx]:
break
current_idx += 1
matched_indices.append((start_idx, current_idx - 1))
db_idx = current_idx
xhtml_idx += 1
return matched_indices
def normalize_chapter_id(chapter_id):
"""
Normalize chapter IDs by removing padding and handling special cases.
Examples:
- gfyxjdcz_0001 -> 1
- 00001-1-Swindler -> 1
- wyctUp_0001 -> 1
- ltzz_0002 -> 2
"""
    # for IDs like wyctUp_0001, keep the part after the last underscore
if "_" in chapter_id:
chapter_id = chapter_id.split("_")[-1]
    # keep only the first run of digits, dropping any non-digit prefix/suffix
digits = re.search(r"(\d+)", chapter_id)
if digits:
chapter_id = digits.group(1)
# rm leading zeros
return str(int(chapter_id))
def find_chapter_file(epub_dir, normalized_id):
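    """Find the XHTML file for a chapter by its normalized number.

    Searches the usual EPUB content directories (OEBPS/Text, OEBPS, EPUB) and
    returns the first *.xhtml/*.html file whose first number matches
    normalized_id, or None if nothing matches.
    """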
epub_dir = Path(epub_dir)
search_dirs = [
epub_dir / "OEBPS" / "Text",
epub_dir / "OEBPS",
epub_dir / "EPUB",
]
for directory in search_dirs:
if not directory.exists():
continue
for file_path in directory.glob("*.*html"):
numbers = re.findall(r"\d+", file_path.stem)
if numbers:
file_chapter_num = str(int(numbers[0]))
if file_chapter_num == normalized_id:
return file_path
return None
def preserve_lines(text):
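    """Split text into lines, stripping non-empty lines and leaving empty lines
    untouched so the line count (and line indices) match the original text."""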
return [line.strip() if line.strip() else line for line in text.split("\n")]
def print_matched_paragraphs(text_en_lines, text_zh_lines, matched_indices):
"""
Print matched paragraphs from English and Chinese text, with Chinese translation
immediately following each English paragraph.
Args:
text_en_lines (list): List of English text lines
text_zh_lines (list): List of Chinese text lines
matched_indices (list): List of tuples containing (start_idx, end_idx)
"""
if not matched_indices:
print("No matched paragraphs found.")
return
for start_idx, end_idx in matched_indices:
# Get and join English lines for this range
en_para = " ".join(text_en_lines[start_idx : end_idx + 1])
# Get and join Chinese lines for the same range
zh_para = " ".join(text_zh_lines[start_idx : end_idx + 1])
# Print English followed by Chinese
print(strip_paragraph_markers(en_para))
print(strip_paragraph_markers(zh_para))
print() # Extra newline between pairs
def process_book(conn, epub_base_dir, book_id):
"""Process an entire book and add paragraphs to database."""
epub_dir = Path(epub_base_dir) / book_id
if not epub_dir.exists():
# print(f"Warning: EPUB directory not found for book {book_id}: {epub_dir}")
return
print(f"Processing book {book_id} from: {epub_dir}")
# Get all chapters for this book
chapters = conn.execute(
"select chapter_id, text_en, text_zh from chapters where book_id = ?",
(book_id,),
).fetchall()
print(f"Chapter count: {len(chapters)}")
for chapter_id, text_en, text_zh in chapters:
if not text_en or not text_zh:
print(
f"Warning: Missing content for chapter {chapter_id} in book {book_id}"
)
continue
        # locate the chapter's XHTML file
normalized_id = normalize_chapter_id(chapter_id)
xhtml_path = find_chapter_file(epub_dir, normalized_id)
if not xhtml_path:
print(
f"Warning: Could not find XHTML file for chapter {chapter_id}. normalized_id: {normalized_id}, xhtml_path: {xhtml_path}"
)
continue
        # extract paragraphs from the XHTML file
xhtml_paragraphs = get_paragraphs_from_xhtml(xhtml_path)
# split by \n only, strip only non-empty lines
text_en_lines = preserve_lines(text_en)
text_zh_lines = preserve_lines(text_zh)
        # match paragraphs between the XHTML file and the database text
matched_indices = match_paragraphs(xhtml_paragraphs, text_en_lines)
# print_matched_paragraphs(text_en_lines, text_zh_lines, matched_indices)
matched_pairs = []
for start_idx, end_idx in matched_indices:
en_para = strip_paragraph_markers(
" ".join(text_en_lines[start_idx : end_idx + 1])
)
zh_para = strip_paragraph_markers(
" ".join(text_zh_lines[start_idx : end_idx + 1])
)
matched_pairs.append((en_para, zh_para))
for en_para, zh_para in matched_pairs:
conn.execute(
"""
INSERT INTO paragraphs (book_id, chapter_id, text_en, text_zh, char_count)
VALUES (?, ?, ?, ?, ?)
""",
(book_id, chapter_id, en_para, zh_para, len(en_para)),
)
conn.commit()
def process_all_books(db_path, epub_base_dir):
"""Process all books in the database."""
conn = sqlite3.connect(db_path)
create_paragraphs_table(conn)
books = conn.execute("select book_id from books").fetchall()
for (book_id,) in books:
process_book(conn, epub_base_dir, book_id)
conn.close()
if __name__ == "__main__":
db_path = "parallel_texts.db"
epub_base_dir = "epubs" # base dir
process_all_books(db_path, epub_base_dir)