chore: add custom_parser.py for ZH/EN chapter alignment checks
custom_parser.py | 272 (new file)
@@ -0,0 +1,272 @@
import os
import json
import re
from pathlib import Path

from bs4 import BeautifulSoup


class ChapterParser:
    def __init__(self, base_dir):
        self.base_dir = Path(base_dir)

    def load_format_config(self, novel_dir):
        """Load the format.json configuration for a novel directory."""
        format_path = self.base_dir / novel_dir / "format.json"
        with open(format_path, "r", encoding="utf-8") as f:
            return json.load(f)
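    # A sketch of the format.json this expects -- only the keys read elsewhere
    # in this file ("zh", "en", "path", "idx", "pattern") are real; the file
    # names and values below are hypothetical:
    #
    #   {
    #     "zh": {"path": "zh.txt", "idx": 1},
    #     "en": {"path": "en/", "idx": 0, "pattern": "Section0*(\\d+)"}
    #   }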
    def get_txt_content(self, novel_dir, config):
        """Read and parse text content based on indentation patterns."""
        txt_path = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading text file from {txt_path}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            print(f"Debug: Successfully read {len(lines)} lines")
        except Exception as e:
            print(f"Debug: Error reading file: {e}")
            return []

        # Skip lines until reaching the configured starting index (1-based)
        content_lines = lines[config["idx"] - 1 :]

        chapters = []
        current_chapter = None
        chapter_content = []

        # Indentation may be an ASCII space or a full-width space (U+3000),
        # which is common in Chinese text files
        indents = (" ", "　")

        for i, line in enumerate(content_lines):
            line = line.rstrip("\n")  # Preserve leading whitespace
            if not line:  # Skip empty lines
                continue

            # Check if this is a root-level line (no indentation)
            if not line.startswith(indents):
                # A root-level line followed by an indented line is a chapter header
                if i + 1 < len(content_lines):
                    next_line = content_lines[i + 1].rstrip("\n")
                    if next_line and next_line.startswith(indents):
                        # Flush the previous chapter before starting a new one
                        if current_chapter:
                            chapters.append(
                                {
                                    "number": current_chapter["number"],
                                    "title": current_chapter["title"],
                                    "content": chapter_content,
                                }
                            )

                        # Try to extract the chapter number if present
                        chapter_num_match = re.search(r"第(\d+)章", line)
                        chapter_num = (
                            int(chapter_num_match.group(1))
                            if chapter_num_match
                            else len(chapters) + 1
                        )

                        current_chapter = {
                            "number": chapter_num,
                            "title": line,
                        }
                        chapter_content = []
                        continue

            # An indented line inside a chapter is a content paragraph
            if current_chapter and line.startswith(indents):
                chapter_content.append(line.lstrip(" 　"))  # Remove indentation

        # Add the last chapter
        if current_chapter:
            chapters.append(
                {
                    "number": current_chapter["number"],
                    "title": current_chapter["title"],
                    "content": chapter_content,
                }
            )

        return chapters
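    # A sketch of the plain-text layout get_txt_content assumes: titles flush
    # left, body paragraphs indented with ASCII or full-width spaces (the
    # sample text is hypothetical):
    #
    #   第1章 ...        <- unindented title line ("Chapter 1 ...")
    #       paragraph 1  <- indented content
    #       paragraph 2
    #   第2章 ...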
    def get_epub_content(self, novel_dir, config):
        """Parse EPUB XHTML content and extract chapters."""
        epub_dir = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading EPUB content from {epub_dir}")

        def extract_number_from_filename(filename, pattern=None):
            """Extract the chapter number from a filename using the configured pattern or the defaults."""
            patterns = [pattern] if pattern else []
            patterns.extend(
                [
                    r"Section0*(\d+)",  # Section0000.xhtml
                    r"^0*(\d+)[-_]",  # 0000_Book_1, 00001-1
                    r"split_0*(\d+)",  # index_split_001
                ]
            )

            for pat in patterns:
                if pat and (match := re.search(pat, filename)):
                    return int(match.group(1))
            return None

        def get_valid_files():
            """Get the list of XHTML files that meet the index criteria."""
            try:
                xhtml_files = sorted(
                    f for f in os.listdir(epub_dir) if f.endswith((".xhtml", ".html"))
                )

                if not xhtml_files:
                    return []

                if config["idx"] == 0:
                    return xhtml_files

                pattern = config.get("pattern")
                return [
                    fname
                    for fname in xhtml_files
                    if (num := extract_number_from_filename(fname, pattern)) is not None
                    and num >= config["idx"]
                ]

            except Exception as e:
                print(f"Debug: Error listing directory: {e}")
                return []

        def parse_chapter(filename, chapter_num):
            """Parse a single chapter file and extract its paragraphs."""
            try:
                with open(epub_dir / filename, "r", encoding="utf-8") as f:
                    soup = BeautifulSoup(f.read(), "html.parser")

                paragraphs = [
                    p.get_text().strip()
                    for p in soup.find_all("p")
                    if p.get_text().strip()
                ]

                return {
                    "number": chapter_num,
                    "title": f"Chapter {chapter_num}",
                    "content": paragraphs,
                }
            except Exception as e:
                print(f"Debug: Error processing {filename}: {e}")
                return None

        # Main processing: chapters are numbered by position in the sorted
        # file list, not by the number extracted from the filename
        files_to_process = get_valid_files()
        if not files_to_process:
            print("Debug: No valid files found to process")
            return []

        chapters = []
        for i, filename in enumerate(files_to_process, start=1):
            if chapter := parse_chapter(filename, i):
                chapters.append(chapter)

        return chapters
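    # Filename-to-number examples for the default patterns above (the sample
    # names are illustrative, not from any real EPUB):
    #   "Section0012.xhtml"     -> 12
    #   "0003_Book_1.xhtml"     -> 3
    #   "index_split_007.xhtml" -> 7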
    def compare_chapters(self, zh_chapters, en_chapters):
        """Compare Chinese and English chapters and return aggregate statistics."""
        total_chapters = len(zh_chapters)
        print(
            f"\nDebug: Found {len(zh_chapters)} Chinese chapters and {len(en_chapters)} English chapters"
        )

        if total_chapters == 0:
            print("Debug: No chapters found to compare!")
            return {
                "total_chapters": 0,
                "matching_chapters": 0,
                "mismatched_chapters": 0,
                "total_zh_paragraphs": 0,
                "total_en_paragraphs": 0,
                "paragraph_difference": 0,
            }

        matches = 0
        mismatches = 0
        total_zh_paragraphs = 0
        total_en_paragraphs = 0

        # zip() truncates to the shorter list, so unpaired trailing chapters
        # are not counted as mismatches
        for zh_chapter, en_chapter in zip(zh_chapters, en_chapters):
            zh_para_count = len(zh_chapter["content"])
            en_para_count = len(en_chapter["content"])

            total_zh_paragraphs += zh_para_count
            total_en_paragraphs += en_para_count

            if zh_para_count == en_para_count:
                matches += 1
            else:
                mismatches += 1

        return {
            "total_chapters": total_chapters,
            "matching_chapters": matches,
            "mismatched_chapters": mismatches,
            "total_zh_paragraphs": total_zh_paragraphs,
            "total_en_paragraphs": total_en_paragraphs,
            "paragraph_difference": abs(total_zh_paragraphs - total_en_paragraphs),
        }
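    # Illustrative call with toy data (hypothetical, not from the repo):
    #   ChapterParser(".").compare_chapters(
    #       [{"content": ["a", "b"]}], [{"content": ["a"]}]
    #   )
    #   -> total_chapters=1, matching_chapters=0, mismatched_chapters=1,
    #      total_zh_paragraphs=2, total_en_paragraphs=1, paragraph_difference=1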
def main():
    parser = ChapterParser("custom")
    print(f"Debug: Starting parser with base directory: {os.path.abspath('custom')}")

    for novel_dir in ["ast", "desolate_era"]:
        print(f"\n=== Analyzing {novel_dir} ===")

        config = parser.load_format_config(novel_dir)
        zh_chapters = parser.get_txt_content(novel_dir, config["zh"])[:5]
        en_chapters = parser.get_epub_content(novel_dir, config["en"])[:5]

        for i in range(min(5, max(len(zh_chapters), len(en_chapters)))):
            print(f"\nChapter {i+1}:")

            if i >= len(zh_chapters):
                print("ZH chapter missing")
                continue

            if i >= len(en_chapters):
                print("EN chapter missing")
                continue

            zh_ch = zh_chapters[i]
            en_ch = en_chapters[i]

            print(f"ZH Title: {zh_ch['title']}")
            print(f"EN Title: {en_ch['title']}")
            print(f"ZH paragraphs: {len(zh_ch['content'])}")
            print(f"EN paragraphs: {len(en_ch['content'])}")

            # Compare paragraphs pairwise
            max_paras = max(len(zh_ch["content"]), len(en_ch["content"]))
            for p_idx in range(max_paras):
                zh_para = (
                    zh_ch["content"][p_idx] if p_idx < len(zh_ch["content"]) else None
                )
                en_para = (
                    en_ch["content"][p_idx] if p_idx < len(en_ch["content"]) else None
                )

                # Only print if one side is missing or the lengths differ significantly
                if zh_para is None:
                    print(f"\nExtra EN paragraph at position {p_idx+1}:")
                    print(f"EN: {en_para[:100]}...")
                elif en_para is None:
                    print(f"\nExtra ZH paragraph at position {p_idx+1}:")
                    print(f"ZH: {zh_para[:100]}...")
                elif abs(len(zh_para) - len(en_para)) > 50:  # Length-mismatch threshold
                    print(f"\nLength mismatch at paragraph {p_idx+1}:")
                    print(f"ZH ({len(zh_para)} chars): {zh_para[:100]}...")
                    print(f"EN ({len(en_para)} chars): {en_para[:100]}...")


if __name__ == "__main__":
    main()
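# Assumed on-disk layout, inferred from main() above (the zh.txt / en/ names
# come from each novel's format.json and are hypothetical here):
#
#   custom/
#     ast/
#       format.json
#       zh.txt
#       en/
#     desolate_era/
#       format.json
#       ...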