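"""Populate the `paragraphs` table of a parallel-text database by aligning
English/Chinese chapter text stored in `chapters` with paragraph boundaries
extracted from the corresponding EPUB XHTML files."""
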
import sqlite3
import re
from bs4 import BeautifulSoup
from pathlib import Path
import unicodedata


def create_paragraphs_table(conn):
    """Create the paragraphs table with necessary columns and constraints."""
    conn.execute(
        """
        create table if not exists paragraphs (
            id integer primary key autoincrement,
            book_id text not null,
            chapter_id text not null,
            text_en text,
            text_zh text,
            char_count integer,
            foreign key (book_id, chapter_id) references chapters(book_id, chapter_id)
        )
        """
    )


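# Note: this script reads from existing `books` and `chapters` tables. As a
# hedged sketch (an assumption based on the queries below, not an authoritative
# schema), the columns it relies on look roughly like:
#
#   create table if not exists books (
#       book_id text primary key
#   );
#
#   create table if not exists chapters (
#       book_id text not null,
#       chapter_id text not null,
#       text_en text,
#       text_zh text,
#       primary key (book_id, chapter_id)
#   );

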
def normalize_quotes(text):
    """Normalize unicode to NFKC form and replace curly/angle quotes with ASCII quotes."""
    text = unicodedata.normalize("NFKC", text)

    quote_map = {
        "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK
        "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK
        "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
        "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
        "\u00ab": '"',  # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        "\u00bb": '"',  # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        "\u2039": "'",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        "\u203a": "'",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        "\u2032": "'",  # PRIME
        "\u2033": '"',  # DOUBLE PRIME
    }

    for old, new in quote_map.items():
        text = text.replace(old, new)

    return text


def strip_paragraph_markers(text):
    """Remove paragraph markers such as #<# and #># and strip surrounding whitespace."""
    return re.sub(r"#<#|#>#", "", text).strip()


def normalize_text(text):
    """Apply all text normalizations: quote normalization and paragraph-marker removal."""
    text = normalize_quotes(text)
    text = strip_paragraph_markers(text)
    return text


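# Illustrative example of the normalization pipeline above (doctest-style,
# assuming only the helpers defined in this module):
#
#   >>> normalize_text("#<#\u201cHello,\u201d she said.#>#")
#   '"Hello," she said.'

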
def get_paragraphs_from_xhtml(xhtml_path):
    """Extract paragraph texts from an XHTML chapter file."""
    with open(xhtml_path, "r", encoding="utf-8") as f:
        content = f.read()

    soup = BeautifulSoup(content, "html.parser")
    paragraphs = []

    p_elements = soup.find_all("p")

    for p in p_elements:
        text = normalize_text(p.get_text())
        if text:  # only add non-empty paragraphs
            paragraphs.append(text)

    # Fall back to splitting on double <br> tags when the file has no <p> elements
    if not paragraphs:
        content = re.sub(r"<br\s*/>", "<br>", content, flags=re.IGNORECASE)
        parts = re.split(r"<br>\s*<br>", content, flags=re.IGNORECASE)

        for part in parts:
            clean_text = BeautifulSoup(part, "html.parser").get_text()
            text = normalize_text(clean_text)
            if text:  # only add non-empty paragraphs
                paragraphs.append(text)

    return paragraphs


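# Illustrative example (not executed): for a chapter body containing
#
#   <p>#<#First paragraph.#>#</p>
#   <p>Second paragraph.</p>
#
# get_paragraphs_from_xhtml returns ['First paragraph.', 'Second paragraph.'].
# If a file has no <p> tags at all, the <br> fallback splits on blank lines
# instead, e.g. "line one<br/><br/>line two" yields ['line one', 'line two'].

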
def get_zh_text_for_lines(zh_lines, start_idx, end_idx):
    """Get corresponding Chinese text for given line range."""
    return " ".join(zh_lines[start_idx : end_idx + 1])


def extract_paragraphs(text_en, text_zh):
    """
    Extract matching paragraphs from English and Chinese texts.
    Returns list of paragraphs with normalized text.
    """
    paragraphs = []
    current_en_lines = []
    current_en_indices = []

    # split into lines and normalize
    en_lines = [line.strip() for line in text_en.split("\n")]
    zh_lines = [line.strip() for line in text_zh.split("\n")] if text_zh else []

    i = 0
    while i < len(en_lines):
        line = en_lines[i]
        normalized_line = normalize_text(line)

        if not normalized_line:
            i += 1
            continue

        current_en_lines.append(normalized_line)
        current_en_indices.append(i)

        # Look ahead to check if next line is empty or ends the paragraph
        next_idx = i + 1
        while next_idx < len(en_lines):
            next_line = en_lines[next_idx].strip()
            if not next_line:
                # Empty line - continue current paragraph
                next_idx += 1
                continue

            # If we have Chinese text, check if these lines correspond to a complete thought
            if zh_lines:
                zh_text = get_zh_text_for_lines(
                    zh_lines, current_en_indices[0], next_idx - 1
                )
                if zh_text:
                    # Found corresponding Chinese text - end paragraph
                    break

            # Add next line to current paragraph
            normalized_next = normalize_text(next_line)
            current_en_lines.append(normalized_next)
            current_en_indices.append(next_idx)
            next_idx += 1

        # Create paragraph
        if current_en_lines:
            en_text = " ".join(current_en_lines)
            zh_text = ""
            if zh_lines:
                zh_text = get_zh_text_for_lines(
                    zh_lines, current_en_indices[0], current_en_indices[-1]
                )

            paragraphs.append(
                {"text_en": en_text, "text_zh": zh_text, "char_count": len(en_text)}
            )

            # Reset for next paragraph
            current_en_lines = []
            current_en_indices = []

        i = next_idx

    return paragraphs


def match_paragraphs(xhtml_paragraphs, db_lines, lines_to_try=3):
    """
    Match paragraphs from XHTML with lines from the database.
    Tries the first few lines at the start before giving up, to handle chapter titles and initial dialog.

    Args:
        xhtml_paragraphs: List of XHTML paragraph texts
        db_lines: List of database text lines
        lines_to_try: Number of initial lines to try before giving up

    Returns:
        List of tuples containing (start_idx, end_idx) for matched paragraphs
    """

    def find_next_content_line(current_idx):
        """Find next non-empty line and return its index and content."""
        while current_idx < len(db_lines):
            line = normalize_text(db_lines[current_idx].strip())
            if line:
                return current_idx, line
            current_idx += 1
        return current_idx, None

    matched_indices = []
    xhtml_idx = 0
    db_idx = 0
    tried_lines = 0

    while xhtml_idx < len(xhtml_paragraphs) and db_idx < len(db_lines):
        # find the next non-empty line in the database text
        db_check_idx, db_line = find_next_content_line(db_idx)
        if not db_line:
            break

        # advance to the XHTML paragraph that contains this line
        while (
            xhtml_idx < len(xhtml_paragraphs)
            and db_line not in xhtml_paragraphs[xhtml_idx]
        ):
            xhtml_idx += 1

        # no paragraph matched: retry the first few database lines before giving up
        if xhtml_idx >= len(xhtml_paragraphs):
            if not matched_indices and tried_lines < lines_to_try:
                tried_lines += 1
                xhtml_idx = 0
                db_idx = db_check_idx + 1
                continue
            break

        # collect all database lines that belong to this paragraph
        start_idx = db_check_idx
        current_idx = db_check_idx

        while current_idx < len(db_lines):
            current_line = normalize_text(db_lines[current_idx].strip())
            if current_line and current_line not in xhtml_paragraphs[xhtml_idx]:
                break
            current_idx += 1

        matched_indices.append((start_idx, current_idx - 1))
        db_idx = current_idx
        xhtml_idx += 1

    return matched_indices


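# Illustrative example (not executed), showing that matching is by substring
# containment of each database line in an XHTML paragraph:
#
#   xhtml_paragraphs = ["Chapter One", "It was dark. He waited."]
#   db_lines = ["It was dark.", "He waited.", "A new paragraph."]
#
# match_paragraphs(xhtml_paragraphs, db_lines) returns [(0, 1)]: database lines
# 0-1 both occur in the second XHTML paragraph; the scan then stops because all
# XHTML paragraphs have been consumed.

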
def normalize_chapter_id(chapter_id):
    """
    Normalize chapter IDs by removing padding and handling special cases.
    Examples:
        - gfyxjdcz!_0001 -> 1
        - 00001-1-Swindler -> 1
        - wyctUp_0001 -> 1
        - ltzz_0002 -> 2
    """
    # handle IDs that contain an underscore: keep only the last segment
    if "_" in chapter_id:
        chapter_id = chapter_id.split("_")[-1]

    # strip any non-digit prefix and suffix
    digits = re.search(r"(\d+)", chapter_id)
    if digits:
        chapter_id = digits.group(1)

    # strip leading zeros
    return str(int(chapter_id))


def find_chapter_file(epub_dir, normalized_id):
    """Locate the XHTML file for a chapter by matching the first number in the file name.

    Looks in the common EPUB content directories (OEBPS/Text, OEBPS, EPUB).
    """
    epub_dir = Path(epub_dir)
    search_dirs = [
        epub_dir / "OEBPS" / "Text",
        epub_dir / "OEBPS",
        epub_dir / "EPUB",
    ]

    for directory in search_dirs:
        if not directory.exists():
            continue

        for file_path in directory.glob("*.*html"):
            numbers = re.findall(r"\d+", file_path.stem)
            if numbers:
                file_chapter_num = str(int(numbers[0]))
                if file_chapter_num == normalized_id:
                    return file_path

    return None


def preserve_lines(text):
    """Split text on newlines, stripping non-empty lines and keeping empty lines as-is so indices stay aligned."""
    return [line.strip() if line.strip() else line for line in text.split("\n")]


def print_matched_paragraphs(text_en_lines, text_zh_lines, matched_indices):
    """
    Print matched paragraphs from English and Chinese text, with Chinese translation
    immediately following each English paragraph.

    Args:
        text_en_lines (list): List of English text lines
        text_zh_lines (list): List of Chinese text lines
        matched_indices (list): List of tuples containing (start_idx, end_idx)
    """
    if not matched_indices:
        print("No matched paragraphs found.")
        return

    for start_idx, end_idx in matched_indices:
        # Get and join English lines for this range
        en_para = " ".join(text_en_lines[start_idx : end_idx + 1])
        # Get and join Chinese lines for the same range
        zh_para = " ".join(text_zh_lines[start_idx : end_idx + 1])

        # Print English followed by Chinese
        print(strip_paragraph_markers(en_para))
        print(strip_paragraph_markers(zh_para))
        print()  # Extra newline between pairs


def process_book(conn, epub_base_dir, book_id):
    """Process an entire book and add paragraphs to database."""
    epub_dir = Path(epub_base_dir) / book_id

    if not epub_dir.exists():
        # print(f"Warning: EPUB directory not found for book {book_id}: {epub_dir}")
        return

    print(f"Processing book {book_id} from: {epub_dir}")

    # Get all chapters for this book
    chapters = conn.execute(
        "select chapter_id, text_en, text_zh from chapters where book_id = ?",
        (book_id,),
    ).fetchall()

    print(f"Chapter count: {len(chapters)}")

    for chapter_id, text_en, text_zh in chapters:
        if not text_en or not text_zh:
            print(
                f"Warning: Missing content for chapter {chapter_id} in book {book_id}"
            )
            continue

        # find the chapter's XHTML file
        normalized_id = normalize_chapter_id(chapter_id)
        xhtml_path = find_chapter_file(epub_dir, normalized_id)
        if not xhtml_path:
            print(
                f"Warning: Could not find XHTML file for chapter {chapter_id}. normalized_id: {normalized_id}, xhtml_path: {xhtml_path}"
            )
            continue

        # extract paragraphs from the XHTML file
        xhtml_paragraphs = get_paragraphs_from_xhtml(xhtml_path)

        # split by \n only, stripping non-empty lines so indices stay aligned
        text_en_lines = preserve_lines(text_en)
        text_zh_lines = preserve_lines(text_zh)

        # match paragraphs between the XHTML and the database content
        matched_indices = match_paragraphs(xhtml_paragraphs, text_en_lines)

        # print_matched_paragraphs(text_en_lines, text_zh_lines, matched_indices)
        matched_pairs = []
        for start_idx, end_idx in matched_indices:
            en_para = strip_paragraph_markers(
                " ".join(text_en_lines[start_idx : end_idx + 1])
            )
            zh_para = strip_paragraph_markers(
                " ".join(text_zh_lines[start_idx : end_idx + 1])
            )
            matched_pairs.append((en_para, zh_para))

        for en_para, zh_para in matched_pairs:
            conn.execute(
                """
                INSERT INTO paragraphs (book_id, chapter_id, text_en, text_zh, char_count)
                VALUES (?, ?, ?, ?, ?)
                """,
                (book_id, chapter_id, en_para, zh_para, len(en_para)),
            )

    conn.commit()


def process_all_books(db_path, epub_base_dir):
    """Process all books in the database."""
    conn = sqlite3.connect(db_path)
    create_paragraphs_table(conn)

    books = conn.execute("select book_id from books").fetchall()

    for (book_id,) in books:
        process_book(conn, epub_base_dir, book_id)

    conn.close()


if __name__ == "__main__":
    db_path = "parallel_texts.db"
    epub_base_dir = "epubs"  # base directory of extracted EPUBs, one folder per book_id

    process_all_books(db_path, epub_base_dir)