first commit

2025-02-09 03:07:07 +06:00
commit d060cdba14
6 changed files with 737 additions and 0 deletions


@@ -0,0 +1,198 @@
import re
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup


def clean_text(text: str) -> str:
    """Normalize whitespace and line endings."""
    return re.sub(r"\s+", " ", text).strip()


def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from a Chinese-language chapter's HTML."""
    # Drop the chapter heading so it does not leak into the first paragraph.
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()
    # Paragraphs are delimited by doubled <br/> tags, so turn each <br/> into
    # a newline and split on blank lines.
    for br in soup.find_all("br"):
        br.replace_with("\n")
    content = soup.body.get_text()
    paragraphs = [clean_text(p) for p in content.split("\n\n") if clean_text(p)]
    return paragraphs
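

# A minimal sketch of the input shape this function assumes (the sample HTML
# below is illustrative, not taken from the actual corpus): paragraphs are
# separated by doubled <br/> tags rather than <p> elements.
#
#   html = "<body><h1>Ch 1</h1>First para<br/><br/>Second para</body>"
#   extract_zh_paragraphs(BeautifulSoup(html, "html.parser"))
#   # -> ["First para", "Second para"]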


def extract_en_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from an English-language chapter's HTML."""
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()
    # Strip inline footnote markers so the paragraph text lines up with the
    # Chinese side.
    for footnote in soup.find_all("span", id=re.compile(r"easy-footnote.*")):
        footnote.decompose()
    paragraphs = [
        clean_text(p.get_text()) for p in soup.find_all("p") if clean_text(p.get_text())
    ]
    return paragraphs
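

# Likewise, a sketch of the assumed English input: standard <p> paragraphs
# with inline footnote markers in <span id="easy-footnote-..."> elements
# (the sample below is illustrative):
#
#   html = '<body><h1>Ch 1</h1><p>Text<span id="easy-footnote-1">[1]</span></p></body>'
#   extract_en_paragraphs(BeautifulSoup(html, "html.parser"))
#   # -> ["Text"]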


def print_debug_info(
    chapter_num: str,
    en_file: Path,
    zh_file: Path,
    en_paragraphs: List[str],
    zh_paragraphs: List[str],
):
    """Dump both sides of a chapter whose paragraph counts disagree."""
    print(f"\n=== MISMATCH DETECTED IN CHAPTER {chapter_num} ===")
    print(f"English file: {en_file}")
    print(f"Chinese file: {zh_file}")
    print("\nParagraph count:")
    print(f"  English: {len(en_paragraphs)}")
    print(f"  Chinese: {len(zh_paragraphs)}")
    print("\nFirst 3 English paragraphs:")
    for i, p in enumerate(en_paragraphs[:3]):
        print(f"  {i + 1}: {p[:100]}...")
    print("\nFirst 3 Chinese paragraphs:")
    for i, p in enumerate(zh_paragraphs[:3]):
        print(f"  {i + 1}: {p[:100]}...")
    print("\nRaw Chinese HTML:")
    with open(zh_file, "r", encoding="utf-8") as f:
        content = f.read()
    print(content[:500])


def process_chapter_pair(en_path: Path, zh_path: Path) -> Tuple[List[str], List[str]]:
    """Process a pair of corresponding chapter files into paragraph lists."""
    with open(en_path, "r", encoding="utf-8") as f:
        en_soup = BeautifulSoup(f, "html.parser")
    en_paragraphs = extract_en_paragraphs(en_soup)
    with open(zh_path, "r", encoding="utf-8") as f:
        zh_soup = BeautifulSoup(f, "html.parser")
    zh_paragraphs = extract_zh_paragraphs(zh_soup)
    return en_paragraphs, zh_paragraphs


def insert_book_chapters(
    db_path: str, book_id: str, matched_chapters: Dict[str, Tuple[List[str], List[str]]]
):
    """
    Insert chapters and paragraphs into the database for a given book_id.
    Only inserts when the English and Chinese paragraph counts match.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    try:
        cur.execute("insert or ignore into books (book_id) values (?)", (book_id,))
        for chapter_id, (en_paragraphs, zh_paragraphs) in matched_chapters.items():
            # Only process chapters whose paragraph counts line up 1:1.
            if len(en_paragraphs) != len(zh_paragraphs):
                print(
                    f"Skipping chapter {chapter_id} due to paragraph count mismatch: "
                    f"EN: {len(en_paragraphs)}, ZH: {len(zh_paragraphs)}"
                )
                continue
            # Join paragraphs to form the full chapter text.
            chapter_text_en = "\n".join(en_paragraphs)
            chapter_text_zh = "\n".join(zh_paragraphs)
            cur.execute(
                """
                insert into chapters (book_id, chapter_id, text_en, text_zh)
                values (?, ?, ?, ?)
                on conflict (book_id, chapter_id) do update set
                    text_en = excluded.text_en,
                    text_zh = excluded.text_zh
                """,
                (book_id, chapter_id, chapter_text_en, chapter_text_zh),
            )
            # Insert the aligned paragraph pairs.
            for en_text, zh_text in zip(en_paragraphs, zh_paragraphs):
                char_count = len(en_text)
                cur.execute(
                    """
                    insert into paragraphs
                        (book_id, chapter_id, text_en, text_zh, char_count)
                    values (?, ?, ?, ?, ?)
                    """,
                    (book_id, chapter_id, en_text, zh_text, char_count),
                )
            print(
                f"Processed chapter {chapter_id} with {len(en_paragraphs)} paragraphs"
            )
        conn.commit()
        print(f"Successfully processed all matching chapters for book {book_id}")
    except Exception as e:
        conn.rollback()
        print(f"Error processing chapters: {e}")
        raise
    finally:
        conn.close()
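

# The schema itself is not part of this commit; a minimal sketch that would
# satisfy the statements above (column types and the unique constraint on
# (book_id, chapter_id), which the upsert's conflict target requires, are
# assumptions):
#
#   create table if not exists books (book_id text primary key);
#   create table if not exists chapters (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       unique (book_id, chapter_id)
#   );
#   -- paragraphs carries no unique key here, matching the plain insert above,
#   -- so re-running the import would append duplicate rows.
#   create table if not exists paragraphs (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       char_count integer
#   );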


def match_chapters(epub_dir: str) -> Dict[str, Tuple[List[str], List[str]]]:
    """Match and process chapters between the English and Chinese directories."""
    base_dir = Path(epub_dir)
    en_dir = base_dir / "en"
    zh_dir = base_dir / "zh"
    matched_paragraphs = {}
    # Walk the English chapter files in order.
    en_files = sorted(en_dir.glob("*.xhtml"))
    for en_file in en_files:
        # Derive the Chinese filename from the four-digit chapter number:
        # a file containing "0001" pairs with "0001_.xhtml".
        match = re.search(r"(\d{4})", en_file.name)
        if not match:
            print(f"Warning: no chapter number in {en_file.name}")
            continue
        chapter_num = match.group(1)
        zh_file = zh_dir / f"{chapter_num}_.xhtml"
        if not zh_file.exists():
            print(f"Warning: no matching Chinese file for {en_file.name}")
            continue
        try:
            en_paragraphs, zh_paragraphs = process_chapter_pair(en_file, zh_file)
            # Dump debug output on a large paragraph-count mismatch if needed:
            # if abs(len(en_paragraphs) - len(zh_paragraphs)) > 5:
            #     print_debug_info(
            #         chapter_num, en_file, zh_file, en_paragraphs, zh_paragraphs
            #     )
            print(f"Chapter {chapter_num}:")
            print(f"  English paragraphs: {len(en_paragraphs)}")
            print(f"  Chinese paragraphs: {len(zh_paragraphs)}")
            matched_paragraphs[chapter_num] = (en_paragraphs, zh_paragraphs)
        except Exception as e:
            print(f"Error processing chapter {chapter_num}: {e}")
    return matched_paragraphs


def main():
    epub_dir = "epubs/1v1h"
    matched_chapters = match_chapters(epub_dir)
    insert_book_chapters("parallel_texts.db", "1v1h", matched_chapters)


if __name__ == "__main__":
    main()
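

# A quick sanity check after a run (a sketch, assuming the schema above):
#
#   import sqlite3
#   conn = sqlite3.connect("parallel_texts.db")
#   n, = conn.execute(
#       "select count(*) from paragraphs where book_id = ?", ("1v1h",)
#   ).fetchone()
#   print(f"imported {n} aligned paragraph pairs")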