import json
import os
import re
from pathlib import Path


def _is_indented(line):
    """Return True if *line* begins with any whitespace character.

    Chinese novel text conventionally indents paragraphs with full-width
    ideographic spaces (U+3000); ``str.isspace()`` recognizes those as
    well as ASCII spaces and tabs, so all three indentation styles work.
    """
    return bool(line) and line[0].isspace()


class ChapterParser:
    """Parse bilingual novel sources into chapter dictionaries.

    Chinese chapters come from an indented plain-text file, English
    chapters from unpacked EPUB XHTML files.  Both parsers produce the
    same shape: ``{"number": int, "title": str, "content": [str, ...]}``.
    """

    def __init__(self, base_dir):
        # Root directory containing one sub-directory per novel.
        self.base_dir = Path(base_dir)

    def load_format_config(self, novel_dir):
        """Load the format.json configuration for a novel directory.

        Raises FileNotFoundError / json.JSONDecodeError for a missing or
        malformed config; callers treat that as a fatal setup error.
        """
        format_path = self.base_dir / novel_dir / "format.json"
        with open(format_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def get_txt_content(self, novel_dir, config):
        """Read a plain-text novel and split it into chapters.

        A line with no leading whitespace that is immediately followed by
        an indented line is treated as a chapter header; indented lines
        are paragraph content of the current chapter.

        config keys:
            path -- file name relative to the novel directory
            idx  -- 1-based line number where real content starts

        Returns a list of chapter dicts, or [] if the file is unreadable.
        """
        txt_path = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading text file from {txt_path}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            print(f"Debug: Successfully read {len(lines)} lines")
        except (OSError, UnicodeDecodeError) as e:
            # Best-effort: an unreadable source yields an empty result
            # rather than aborting the whole comparison run.
            print(f"Debug: Error reading file: {e}")
            return []

        # Skip the preamble before the configured starting line.
        content_lines = lines[config["idx"] - 1:]

        chapters = []
        current_chapter = None
        chapter_content = []

        def flush():
            # Commit the chapter currently being accumulated, if any.
            if current_chapter:
                chapters.append(
                    {
                        "number": current_chapter["number"],
                        "title": current_chapter["title"],
                        "content": chapter_content,
                    }
                )

        for i, line in enumerate(content_lines):
            line = line.rstrip("\n")  # keep leading whitespace intact
            if not line:
                continue  # blank separator lines carry no information

            if not _is_indented(line):
                # Root-level line: it is a chapter header only when the
                # next line exists and is indented (i.e. holds content).
                # NOTE: originally only ASCII spaces were tested, which
                # silently broke on U+3000-indented Chinese paragraphs.
                if i + 1 < len(content_lines):
                    next_line = content_lines[i + 1].rstrip("\n")
                    if _is_indented(next_line):
                        flush()
                        # Prefer an explicit chapter number ("第N章");
                        # fall back to sequential numbering.
                        m = re.search(r"第(\d+)章", line)
                        number = int(m.group(1)) if m else len(chapters) + 1
                        current_chapter = {"number": number, "title": line}
                        chapter_content = []
                continue

            # Indented line inside a chapter: drop indentation, keep text.
            if current_chapter:
                chapter_content.append(line.lstrip())

        flush()  # don't drop the final chapter
        return chapters

    def get_epub_content(self, novel_dir, config):
        """Parse unpacked EPUB XHTML files into chapter dicts.

        config keys:
            path    -- directory of .xhtml/.html files, relative to the
                       novel directory
            idx     -- minimum chapter number to include (0 = take all)
            pattern -- optional regex with one numeric group used to pull
                       the chapter number out of a filename

        Returns a list of chapter dicts ([] when nothing matches).
        """
        # Imported lazily: bs4 is only needed on the EPUB path, so the
        # text-only workflow still runs when BeautifulSoup is absent.
        from bs4 import BeautifulSoup

        epub_dir = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading EPUB content from {epub_dir}")

        def extract_number_from_filename(filename, pattern=None):
            """Extract a chapter number, trying the configured pattern
            first, then common EPUB file-naming conventions."""
            patterns = [pattern] if pattern else []
            patterns.extend(
                [
                    r"Section0*(\d+)",  # Section0000.xhtml
                    r"^0*(\d+)[-_]",    # 0000_Book_1, 00001-1
                    r"split_0*(\d+)",   # index_split_001
                ]
            )
            for pat in patterns:
                if pat and (match := re.search(pat, filename)):
                    return int(match.group(1))
            return None

        def get_valid_files():
            """List XHTML files whose number meets the idx threshold."""
            try:
                xhtml_files = sorted(
                    f
                    for f in os.listdir(epub_dir)
                    if f.endswith((".xhtml", ".html"))
                )
                if not xhtml_files:
                    return []
                if config["idx"] == 0:
                    return xhtml_files
                pattern = config.get("pattern")
                return [
                    fname
                    for fname in xhtml_files
                    if (num := extract_number_from_filename(fname, pattern))
                    is not None
                    and num >= config["idx"]
                ]
            except OSError as e:
                print(f"Debug: Error listing directory: {e}")
                return []

        def parse_chapter(filename, chapter_num):
            """Parse one chapter file; return a chapter dict or None."""
            try:
                with open(epub_dir / filename, "r", encoding="utf-8") as f:
                    soup = BeautifulSoup(f.read(), "html.parser")
                # Keep only non-empty <p> texts as paragraphs.
                paragraphs = [
                    p.get_text().strip()
                    for p in soup.find_all("p")
                    if p.get_text().strip()
                ]
                return {
                    "number": chapter_num,
                    "title": f"Chapter {chapter_num}",
                    "content": paragraphs,
                }
            except Exception as e:
                # BUG FIX: this message previously printed the literal
                # "(unknown)" instead of the offending file name.
                print(f"Debug: Error processing {filename}: {e}")
                return None

        files_to_process = get_valid_files()
        if not files_to_process:
            print("Debug: No valid files found to process")
            return []

        chapters = []
        # Chapter numbers are assigned sequentially over the kept files.
        for i, filename in enumerate(files_to_process, start=1):
            if chapter := parse_chapter(filename, i):
                chapters.append(chapter)
        return chapters

    def compare_chapters(self, zh_chapters, en_chapters):
        """Compare Chinese and English chapters pairwise.

        Chapters are matched positionally (zip truncates to the shorter
        list); a pair "matches" when its paragraph counts are equal.

        Returns a dict of aggregate statistics.
        """
        total_chapters = len(zh_chapters)
        print(
            f"\nDebug: Found {len(zh_chapters)} Chinese chapters and "
            f"{len(en_chapters)} English chapters"
        )

        if total_chapters == 0:
            print("Debug: No chapters found to compare!")
            return {
                "total_chapters": 0,
                "matching_chapters": 0,
                "mismatched_chapters": 0,
                "total_zh_paragraphs": 0,
                "total_en_paragraphs": 0,
                "paragraph_difference": 0,
            }

        matches = 0
        mismatches = 0
        total_zh_paragraphs = 0
        total_en_paragraphs = 0

        for zh_chapter, en_chapter in zip(zh_chapters, en_chapters):
            zh_para_count = len(zh_chapter["content"])
            en_para_count = len(en_chapter["content"])
            total_zh_paragraphs += zh_para_count
            total_en_paragraphs += en_para_count
            if zh_para_count == en_para_count:
                matches += 1
            else:
                mismatches += 1

        return {
            "total_chapters": total_chapters,
            "matching_chapters": matches,
            "mismatched_chapters": mismatches,
            "total_zh_paragraphs": total_zh_paragraphs,
            "total_en_paragraphs": total_en_paragraphs,
            "paragraph_difference": abs(
                total_zh_paragraphs - total_en_paragraphs
            ),
        }


def main():
    """Compare the first five chapters of each configured novel and
    print per-chapter and per-paragraph discrepancies."""
    parser = ChapterParser("custom")
    print(
        f"Debug: Starting parser with base directory: "
        f"{os.path.abspath('custom')}"
    )

    for novel_dir in ["ast", "desolate_era"]:
        print(f"\n=== Analyzing {novel_dir} ===")
        config = parser.load_format_config(novel_dir)
        # Only the first five chapters are inspected in this debug run.
        zh_chapters = parser.get_txt_content(novel_dir, config["zh"])[:5]
        en_chapters = parser.get_epub_content(novel_dir, config["en"])[:5]

        for i in range(min(5, max(len(zh_chapters), len(en_chapters)))):
            print(f"\nChapter {i+1}:")
            if i >= len(zh_chapters):
                print("ZH chapter missing")
                continue
            if i >= len(en_chapters):
                print("EN chapter missing")
                continue

            zh_ch = zh_chapters[i]
            en_ch = en_chapters[i]
            print(f"ZH Title: {zh_ch['title']}")
            print(f"EN Title: {en_ch['title']}")
            print(f"ZH paragraphs: {len(zh_ch['content'])}")
            print(f"EN paragraphs: {len(en_ch['content'])}")

            # Walk both paragraph lists in lockstep, reporting extras
            # and large length mismatches only.
            max_paras = max(len(zh_ch["content"]), len(en_ch["content"]))
            for p_idx in range(max_paras):
                zh_para = (
                    zh_ch["content"][p_idx]
                    if p_idx < len(zh_ch["content"])
                    else None
                )
                en_para = (
                    en_ch["content"][p_idx]
                    if p_idx < len(en_ch["content"])
                    else None
                )
                if zh_para is None:
                    print(f"\nExtra EN paragraph at position {p_idx+1}:")
                    print(f"EN: {en_para[:100]}...")
                elif en_para is None:
                    print(f"\nExtra ZH paragraph at position {p_idx+1}:")
                    print(f"ZH: {zh_para[:100]}...")
                elif (
                    abs(len(zh_para) - len(en_para)) > 50
                ):  # threshold for length mismatch
                    print(f"\nLength mismatch at paragraph {p_idx+1}:")
                    print(f"ZH ({len(zh_para)} chars): {zh_para[:100]}...")
                    print(f"EN ({len(en_para)} chars): {en_para[:100]}...")


if __name__ == "__main__":
    main()