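"""Build a parallel English/Chinese text corpus from extracted EPUB chapters.

Reads paired chapter files from <epub_dir>/en and <epub_dir>/zh, aligns
their paragraphs, and stores chapters whose paragraph counts match in a
SQLite database of parallel texts.
"""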
import re
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup


def clean_text(text: str) -> str:
    """Collapse all whitespace runs, including line endings, into single spaces."""
    return re.sub(r"\s+", " ", text).strip()


def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from Chinese HTML.

    Most Chinese raws separate paragraphs with a pair of <br> tags rather
    than with <p> elements, so convert <br> to newlines and split on
    blank lines.
    """
    # Drop the chapter heading so it is not treated as a paragraph.
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()

    for br in soup.find_all("br"):
        br.replace_with("\n")

    # html.parser does not synthesize a <body> for bare fragments, so fall
    # back to the whole soup if one is missing.
    content = (soup.body or soup).get_text()
    paragraphs = [clean_text(p) for p in content.split("\n\n") if clean_text(p)]

    return paragraphs
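# Illustrative input/output for extract_zh_paragraphs (hypothetical markup):
#   "<body><h1>第一章</h1>第一段<br/><br/>第二段</body>" -> ["第一段", "第二段"]

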
def extract_en_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from English HTML."""
    # Drop the chapter heading so it is not treated as a paragraph.
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()

    # Remove inline footnote marker spans so they don't pollute paragraph text.
    for footnote in soup.find_all("span", id=re.compile(r"easy-footnote.*")):
        footnote.decompose()

    paragraphs = [
        clean_text(p.get_text()) for p in soup.find_all("p") if clean_text(p.get_text())
    ]

    return paragraphs


def print_debug_info(
    chapter_num: str,
    en_file: Path,
    zh_file: Path,
    en_paragraphs: List[str],
    zh_paragraphs: List[str],
):
    """Print diagnostics for a chapter whose paragraph counts do not match."""
    print(f"\n=== MISMATCH DETECTED IN CHAPTER {chapter_num} ===")
    print(f"English file: {en_file}")
    print(f"Chinese file: {zh_file}")
    print("\nParagraph count:")
    print(f"  English: {len(en_paragraphs)}")
    print(f"  Chinese: {len(zh_paragraphs)}")

    print("\nFirst 3 English paragraphs:")
    for i, p in enumerate(en_paragraphs[:3]):
        print(f"  {i+1}: {p[:100]}...")

    print("\nFirst 3 Chinese paragraphs:")
    for i, p in enumerate(zh_paragraphs[:3]):
        print(f"  {i+1}: {p[:100]}...")

    print("\nRaw Chinese HTML:")
    with open(zh_file, "r", encoding="utf-8") as f:
        content = f.read()
    print(content[:500])


def process_chapter_pair(en_path: Path, zh_path: Path) -> Tuple[List[str], List[str]]:
    """Process a pair of corresponding chapter files."""
    with open(en_path, "r", encoding="utf-8") as f:
        en_soup = BeautifulSoup(f, "html.parser")
    en_paragraphs = extract_en_paragraphs(en_soup)

    with open(zh_path, "r", encoding="utf-8") as f:
        zh_soup = BeautifulSoup(f, "html.parser")
    zh_paragraphs = extract_zh_paragraphs(zh_soup)

    return en_paragraphs, zh_paragraphs


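# The statements below assume a schema along these lines; this is a sketch
# inferred from the queries, not a definition the script creates:
#
#   create table books (book_id text primary key);
#   create table chapters (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       primary key (book_id, chapter_id)
#   );
#   create table paragraphs (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       char_count integer
#   );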
def insert_book_chapters(
    db_path: str, book_id: str, matched_chapters: Dict[str, Tuple[List[str], List[str]]]
):
    """
    Insert chapters and paragraphs into the database for a given book_id.

    Only inserts when the English and Chinese paragraph counts match, since
    paragraph rows are stored as aligned pairs.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()

    try:
        cur.execute("insert or ignore into books (book_id) values (?)", (book_id,))

        for chapter_id, (en_paragraphs, zh_paragraphs) in matched_chapters.items():
            # only process chapters whose paragraph counts match
            if len(en_paragraphs) != len(zh_paragraphs):
                print(
                    f"Skipping chapter {chapter_id} due to paragraph count mismatch: "
                    f"EN: {len(en_paragraphs)}, ZH: {len(zh_paragraphs)}"
                )
                continue

            # join paragraphs for the full chapter text
            chapter_text_en = "\n".join(en_paragraphs)
            chapter_text_zh = "\n".join(zh_paragraphs)

            cur.execute(
                """
                insert into chapters (book_id, chapter_id, text_en, text_zh)
                values (?, ?, ?, ?)
                on conflict (book_id, chapter_id) do update set
                    text_en = excluded.text_en,
                    text_zh = excluded.text_zh
                """,
                (book_id, chapter_id, chapter_text_en, chapter_text_zh),
            )

            # clear any previously inserted paragraphs so reruns stay
            # idempotent, matching the chapter upsert above
            cur.execute(
                "delete from paragraphs where book_id = ? and chapter_id = ?",
                (book_id, chapter_id),
            )

            # insert aligned paragraph pairs
            for en_text, zh_text in zip(en_paragraphs, zh_paragraphs):
                char_count = len(en_text)
                cur.execute(
                    """
                    insert into paragraphs
                        (book_id, chapter_id, text_en, text_zh, char_count)
                    values (?, ?, ?, ?, ?)
                    """,
                    (book_id, chapter_id, en_text, zh_text, char_count),
                )

            print(
                f"Processed chapter {chapter_id} with {len(en_paragraphs)} paragraphs"
            )

        conn.commit()
        print(f"Successfully processed all matching chapters for book {book_id}")

    except Exception as e:
        conn.rollback()
        print(f"Error processing chapters: {str(e)}")
        raise
    finally:
        conn.close()


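# match_chapters returns {chapter_num: (en_paragraphs, zh_paragraphs)}, e.g.
# (hypothetical values): {"0001": (["First paragraph ..."], ["第一段……"])}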
def match_chapters(epub_dir: str) -> Dict[str, Tuple[List[str], List[str]]]:
    """Match and process chapters between the English and Chinese directories."""
    base_dir = Path(epub_dir)
    en_dir = base_dir / "en"
    zh_dir = base_dir / "zh"

    matched_paragraphs = {}

    # Get all English chapter files in order
    en_files = sorted(en_dir.glob("*.xhtml"))

    for en_file in en_files:
        # Construct the corresponding Chinese filename from the
        # four-digit chapter number
        match = re.search(r"(\d{4})", en_file.name)
        if not match:
            print(f"Warning: No chapter number in {en_file.name}")
            continue
        chapter_num = match.group(1)
        zh_file = zh_dir / f"{chapter_num}_.xhtml"

        if not zh_file.exists():
            print(f"Warning: No matching Chinese file for {en_file.name}")
            continue

        try:
            en_paragraphs, zh_paragraphs = process_chapter_pair(en_file, zh_file)

            # Dump diagnostics when paragraph counts diverge badly;
            # disabled for now:
            # if abs(len(en_paragraphs) - len(zh_paragraphs)) > 5:
            #     print_debug_info(
            #         chapter_num, en_file, zh_file, en_paragraphs, zh_paragraphs
            #     )

            print(f"Chapter {chapter_num}:")
            print(f"  English paragraphs: {len(en_paragraphs)}")
            print(f"  Chinese paragraphs: {len(zh_paragraphs)}")

            # Store results
            matched_paragraphs[chapter_num] = (en_paragraphs, zh_paragraphs)

        except Exception as e:
            print(f"Error processing chapter {chapter_num}: {str(e)}")

    return matched_paragraphs


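# Expected on-disk layout (inferred from match_chapters), e.g. for book "1v1h":
#   epubs/1v1h/en/<file containing a four-digit chapter number>.xhtml
#   epubs/1v1h/zh/0001_.xhtml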
def main():
    epub_dir = "epubs/1v1h"
    matched_chapters = match_chapters(epub_dir)
    insert_book_chapters("parallel_texts.db", "1v1h", matched_chapters)


if __name__ == "__main__":
    main()