Files
zh-en-wn-dataset/custom_parser.py
2025-02-10 17:42:04 +06:00

273 lines
9.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import json
from bs4 import BeautifulSoup
from pathlib import Path
import re
# Paragraphs in the Chinese source texts are indented with either an ASCII
# space or the full-width ideographic space (U+3000).  NOTE(review): the
# original file carried invisible U+3000 characters inside its string
# literals (the hosting UI flagged "invisible Unicode characters"); they are
# spelled out as explicit escapes here so the intent is visible — confirm
# against the raw bytes of the original file.
_INDENT_CHARS = (" ", "\u3000")


class ChapterParser:
    """Parse paired Chinese (plain-text) and English (EPUB XHTML) novel chapters."""

    def __init__(self, base_dir):
        # Root directory holding one sub-directory per novel.
        self.base_dir = Path(base_dir)

    def load_format_config(self, novel_dir):
        """Load and return the format.json configuration for *novel_dir*."""
        format_path = self.base_dir / novel_dir / "format.json"
        with open(format_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def get_txt_content(self, novel_dir, config):
        """Split a plain-text novel into chapters based on indentation.

        A non-indented line immediately followed by an indented line is
        treated as a chapter header; indented lines are paragraphs of the
        current chapter.  ``config`` must supply ``path`` (text file
        relative to the novel directory) and ``idx`` (1-based line number
        to start reading from).

        Returns a list of ``{"number", "title", "content"}`` dicts; an
        unreadable file yields an empty list.
        """
        txt_path = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading text file from {txt_path}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            print(f"Debug: Successfully read {len(lines)} lines")
        except Exception as e:
            # Best-effort: report and return nothing rather than crash.
            print(f"Debug: Error reading file: {e}")
            return []
        # Skip lines until reaching the 1-based starting index.
        content_lines = lines[config["idx"] - 1 :]
        chapters = []
        current_chapter = None
        chapter_content = []
        for i, line in enumerate(content_lines):
            line = line.rstrip("\n")  # preserve leading whitespace
            if not line:  # skip empty lines
                continue
            # Root-level line (no ASCII or ideographic-space indentation)?
            if not line.startswith(_INDENT_CHARS):
                # It is a chapter header only when the following line is an
                # indented (paragraph) line.
                if i + 1 < len(content_lines):
                    next_line = content_lines[i + 1].rstrip("\n")
                    if next_line and next_line.startswith(_INDENT_CHARS):
                        # Flush the chapter collected so far.
                        if current_chapter:
                            chapters.append(
                                {
                                    "number": current_chapter["number"],
                                    "title": current_chapter["title"],
                                    "content": chapter_content,
                                }
                            )
                        # Prefer the explicit number in a "第N章" header;
                        # otherwise fall back to sequential numbering.
                        chapter_num_match = re.search(r"第(\d+)章", line)
                        chapter_num = (
                            int(chapter_num_match.group(1))
                            if chapter_num_match
                            else len(chapters) + 1
                        )
                        current_chapter = {
                            "number": chapter_num,
                            "title": line,
                        }
                        chapter_content = []
                        continue
            # Indented line inside a chapter: strip the indentation, keep text.
            if current_chapter and line.startswith(_INDENT_CHARS):
                chapter_content.append(line.lstrip(" \u3000"))
        # Flush the last open chapter.
        if current_chapter:
            chapters.append(
                {
                    "number": current_chapter["number"],
                    "title": current_chapter["title"],
                    "content": chapter_content,
                }
            )
        return chapters

    def get_epub_content(self, novel_dir, config):
        """Extract chapters from a directory of unpacked EPUB XHTML files.

        ``config`` must supply ``path`` (directory of .xhtml/.html files
        relative to the novel directory) and ``idx`` (minimum filename
        chapter number to keep; 0 keeps every file); ``pattern``
        optionally adds a custom filename-number regex tried first.

        Returns a list of ``{"number", "title", "content"}`` dicts where
        ``content`` holds the non-empty <p> texts of each file.
        """
        epub_dir = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading EPUB content from {epub_dir}")

        def extract_number_from_filename(filename, pattern=None):
            """Extract a chapter number from *filename*, trying *pattern* first."""
            patterns = [pattern] if pattern else []
            patterns.extend(
                [
                    r"Section0*(\d+)",  # Section0000.xhtml
                    r"^0*(\d+)[-_]",  # 0000_Book_1, 00001-1
                    r"split_0*(\d+)",  # index_split_001
                ]
            )
            for pat in patterns:
                if pat and (match := re.search(pat, filename)):
                    return int(match.group(1))
            return None

        def get_valid_files():
            """Return sorted XHTML filenames meeting the index criteria."""
            try:
                xhtml_files = sorted(
                    f for f in os.listdir(epub_dir) if f.endswith((".xhtml", ".html"))
                )
                if not xhtml_files:
                    return []
                if config["idx"] == 0:  # 0 means "no filtering"
                    return xhtml_files
                pattern = config.get("pattern")
                return [
                    fname
                    for fname in xhtml_files
                    if (num := extract_number_from_filename(fname, pattern)) is not None
                    and num >= config["idx"]
                ]
            except Exception as e:
                print(f"Debug: Error listing directory: {e}")
                return []

        def parse_chapter(filename, chapter_num):
            """Parse one XHTML file; return a chapter dict or None on failure."""
            try:
                with open(epub_dir / filename, "r", encoding="utf-8") as f:
                    soup = BeautifulSoup(f.read(), "html.parser")
                paragraphs = [
                    p.get_text().strip()
                    for p in soup.find_all("p")
                    if p.get_text().strip()
                ]
                return {
                    "number": chapter_num,
                    "title": f"Chapter {chapter_num}",
                    "content": paragraphs,
                }
            except Exception as e:
                # BUG FIX: the message previously printed a "(unknown)"
                # placeholder instead of the offending file's name.
                print(f"Debug: Error processing {filename}: {e}")
                return None

        # Main processing: chapters are numbered sequentially over the
        # files that survived filtering, not by their filename numbers.
        files_to_process = get_valid_files()
        if not files_to_process:
            print("Debug: No valid files found to process")
            return []
        chapters = []
        for i, filename in enumerate(files_to_process, start=1):
            if chapter := parse_chapter(filename, i):
                chapters.append(chapter)
        return chapters

    def compare_chapters(self, zh_chapters, en_chapters):
        """Compare zh/en chapter lists and return aggregate statistics.

        Chapters are paired positionally.  NOTE(review): ``zip`` truncates
        to the shorter list, so unpaired trailing chapters contribute to
        ``total_chapters`` (taken from the zh side) but to neither
        ``matching_chapters`` nor ``mismatched_chapters`` — confirm this
        is the intended accounting.
        """
        total_chapters = len(zh_chapters)
        print(
            f"\nDebug: Found {len(zh_chapters)} Chinese chapters and {len(en_chapters)} English chapters"
        )
        if total_chapters == 0:
            print("Debug: No chapters found to compare!")
            return {
                "total_chapters": 0,
                "matching_chapters": 0,
                "mismatched_chapters": 0,
                "total_zh_paragraphs": 0,
                "total_en_paragraphs": 0,
                "paragraph_difference": 0,
            }
        matches = 0
        mismatches = 0
        total_zh_paragraphs = 0
        total_en_paragraphs = 0
        for zh_chapter, en_chapter in zip(zh_chapters, en_chapters):
            zh_para_count = len(zh_chapter["content"])
            en_para_count = len(en_chapter["content"])
            total_zh_paragraphs += zh_para_count
            total_en_paragraphs += en_para_count
            if zh_para_count == en_para_count:
                matches += 1
            else:
                mismatches += 1
        return {
            "total_chapters": total_chapters,
            "matching_chapters": matches,
            "mismatched_chapters": mismatches,
            "total_zh_paragraphs": total_zh_paragraphs,
            "total_en_paragraphs": total_en_paragraphs,
            "paragraph_difference": abs(total_zh_paragraphs - total_en_paragraphs),
        }
def main():
    """Entry point: preview and compare the first five zh/en chapters of each novel."""
    chapter_parser = ChapterParser("custom")
    print(f"Debug: Starting parser with base directory: {os.path.abspath('custom')}")
    for novel_dir in ["ast", "desolate_era"]:
        print(f"\n=== Analyzing {novel_dir} ===")
        fmt = chapter_parser.load_format_config(novel_dir)
        # Only the first five chapters of each side are previewed.
        zh_side = chapter_parser.get_txt_content(novel_dir, fmt["zh"])[:5]
        en_side = chapter_parser.get_epub_content(novel_dir, fmt["en"])[:5]
        preview_count = min(5, max(len(zh_side), len(en_side)))
        for idx in range(preview_count):
            print(f"\nChapter {idx + 1}:")
            # Guard clauses: one side may have fewer chapters than the other.
            if idx >= len(zh_side):
                print("ZH chapter missing")
                continue
            if idx >= len(en_side):
                print("EN chapter missing")
                continue
            zh_chapter, en_chapter = zh_side[idx], en_side[idx]
            print(f"ZH Title: {zh_chapter['title']}")
            print(f"EN Title: {en_chapter['title']}")
            print(f"ZH paragraphs: {len(zh_chapter['content'])}")
            print(f"EN paragraphs: {len(en_chapter['content'])}")
            # Walk paragraph pairs; report only missing paragraphs or
            # pairs whose lengths diverge significantly.
            pair_total = max(len(zh_chapter["content"]), len(en_chapter["content"]))
            for pos in range(pair_total):
                zh_para = (
                    zh_chapter["content"][pos]
                    if pos < len(zh_chapter["content"])
                    else None
                )
                en_para = (
                    en_chapter["content"][pos]
                    if pos < len(en_chapter["content"])
                    else None
                )
                if zh_para is None:
                    print(f"\nExtra EN paragraph at position {pos + 1}:")
                    print(f"EN: {en_para[:100]}...")
                elif en_para is None:
                    print(f"\nExtra ZH paragraph at position {pos + 1}:")
                    print(f"ZH: {zh_para[:100]}...")
                elif abs(len(zh_para) - len(en_para)) > 50:  # length-mismatch threshold
                    print(f"\nLength mismatch at paragraph {pos + 1}:")
                    print(f"ZH ({len(zh_para)} chars): {zh_para[:100]}...")
                    print(f"EN ({len(en_para)} chars): {en_para[:100]}...")
# Run the comparison report only when executed as a script, not on import.
if __name__ == "__main__":
    main()