import json
import os
import re
from pathlib import Path

from bs4 import BeautifulSoup


class ChapterParser:
    def __init__(self, base_dir):
        self.base_dir = Path(base_dir)

    def load_format_config(self, novel_dir):
        """Load format.json configuration for a novel directory."""
        format_path = self.base_dir / novel_dir / "format.json"
        with open(format_path, "r", encoding="utf-8") as f:
            return json.load(f)
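
    # Example format.json (illustrative; the schema is inferred from how the
    # config is consumed in this file: "zh"/"en" sections, each with "path",
    # "idx", and an optional "pattern" used for EPUB filename matching):
    #
    #   {
    #       "zh": {"path": "chapters_zh.txt", "idx": 1},
    #       "en": {"path": "epub_xhtml", "idx": 1, "pattern": "Section0*(\\d+)"}
    #   }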

    def get_txt_content(self, novel_dir, config):
        """Read and parse text content based on indentation patterns."""
        txt_path = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading text file from {txt_path}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            print(f"Debug: Successfully read {len(lines)} lines")
        except Exception as e:
            print(f"Debug: Error reading file: {e}")
            return []

        # Skip lines until reaching the starting index
        content_lines = lines[config["idx"] - 1 :]

        chapters = []
        current_chapter = None
        chapter_content = []

        for i, line in enumerate(content_lines):
            line = line.rstrip("\n")  # Preserve leading whitespace
            if not line:  # Skip empty lines
                continue

            # Check if this is a root-level line (no indentation). Indentation
            # is assumed to be ASCII spaces or the full-width ideographic space
            # (U+3000) commonly used in Chinese prose.
            if not line.startswith(" ") and not line.startswith("　"):
                # Check if next line exists and is indented
                if i + 1 < len(content_lines):
                    next_line = content_lines[i + 1].rstrip("\n")
                    if next_line and (
                        next_line.startswith(" ") or next_line.startswith("　")
                    ):
                        # This is a chapter header
                        if current_chapter:
                            chapters.append(
                                {
                                    "number": current_chapter["number"],
                                    "title": current_chapter["title"],
                                    "content": chapter_content,
                                }
                            )

                        # Try to extract chapter number if present
                        chapter_num_match = re.search(r"第(\d+)章", line)
                        chapter_num = (
                            int(chapter_num_match.group(1))
                            if chapter_num_match
                            else len(chapters) + 1
                        )

                        current_chapter = {
                            "number": chapter_num,
                            "title": line,
                        }
                        chapter_content = []
                        continue

            # If we have a current chapter and the line is indented, add it to
            # its content (both ASCII and full-width spaces count as indentation)
            if current_chapter and (line.startswith(" ") or line.startswith("　")):
                chapter_content.append(line.lstrip(" 　"))  # Remove indentation

        # Add the last chapter
        if current_chapter:
            chapters.append(
                {
                    "number": current_chapter["number"],
                    "title": current_chapter["title"],
                    "content": chapter_content,
                }
            )

        return chapters
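
    # Illustrative layout of the source .txt (hypothetical content): chapter
    # titles sit flush-left (e.g. "第1章 ..."), while the paragraphs that belong
    # to a chapter are indented beneath it, which is how the parser above tells
    # headers and body text apart.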

    def get_epub_content(self, novel_dir, config):
        """Parse EPUB XHTML content and extract chapters."""
        epub_dir = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading EPUB content from {epub_dir}")

        def extract_number_from_filename(filename, pattern=None):
            """Extract chapter number from filename using pattern or default patterns."""
            patterns = [pattern] if pattern else []
            patterns.extend(
                [
                    r"Section0*(\d+)",  # Section0000.xhtml
                    r"^0*(\d+)[-_]",  # 0000_Book_1, 00001-1
                    r"split_0*(\d+)",  # index_split_001
                ]
            )

            for pat in patterns:
                if pat and (match := re.search(pat, filename)):
                    return int(match.group(1))
            return None
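
        # For illustration (hypothetical filenames): "Section0012.xhtml" -> 12,
        # "0005_Book_1.xhtml" -> 5, "index_split_003.html" -> 3. A custom
        # "pattern" from format.json, when present, is tried before the defaults.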

        def get_valid_files():
            """Get list of XHTML files that meet the index criteria."""
            try:
                xhtml_files = sorted(
                    f for f in os.listdir(epub_dir) if f.endswith((".xhtml", ".html"))
                )

                if not xhtml_files:
                    return []

                if config["idx"] == 0:
                    return xhtml_files

                pattern = config.get("pattern")
                return [
                    fname
                    for fname in xhtml_files
                    if (num := extract_number_from_filename(fname, pattern)) is not None
                    and num >= config["idx"]
                ]

            except Exception as e:
                print(f"Debug: Error listing directory: {e}")
                return []

        def parse_chapter(filename, chapter_num):
            """Parse single chapter file and extract content."""
            try:
                with open(epub_dir / filename, "r", encoding="utf-8") as f:
                    soup = BeautifulSoup(f.read(), "html.parser")

                paragraphs = [
                    p.get_text().strip()
                    for p in soup.find_all("p")
                    if p.get_text().strip()
                ]

                return {
                    "number": chapter_num,
                    "title": f"Chapter {chapter_num}",
                    "content": paragraphs,
                }
            except Exception as e:
                print(f"Debug: Error processing {filename}: {e}")
                return None

        # Main processing
        files_to_process = get_valid_files()
        if not files_to_process:
            print("Debug: No valid files found to process")
            return []

        chapters = []
        for i, filename in enumerate(files_to_process, start=1):
            if chapter := parse_chapter(filename, i):
                chapters.append(chapter)

        return chapters
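
    # The "en" side is assumed to be a directory of already-unpacked EPUB XHTML
    # files (e.g. Section0001.xhtml, Section0002.xhtml, ...). Chapters are rebuilt
    # purely from <p> tags, so headings outside <p> elements are not used as
    # titles; chapters are simply numbered in file order.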

    def compare_chapters(self, zh_chapters, en_chapters):
        """Compare Chinese and English chapters and return aggregate statistics."""
        total_chapters = len(zh_chapters)
        print(
            f"\nDebug: Found {len(zh_chapters)} Chinese chapters and {len(en_chapters)} English chapters"
        )

        if total_chapters == 0:
            print("Debug: No chapters found to compare!")
            return {
                "total_chapters": 0,
                "matching_chapters": 0,
                "mismatched_chapters": 0,
                "total_zh_paragraphs": 0,
                "total_en_paragraphs": 0,
                "paragraph_difference": 0,
            }

        matches = 0
        mismatches = 0
        total_zh_paragraphs = 0
        total_en_paragraphs = 0

        for zh_chapter, en_chapter in zip(zh_chapters, en_chapters):
            zh_para_count = len(zh_chapter["content"])
            en_para_count = len(en_chapter["content"])

            total_zh_paragraphs += zh_para_count
            total_en_paragraphs += en_para_count

            if zh_para_count == en_para_count:
                matches += 1
            else:
                mismatches += 1

        return {
            "total_chapters": total_chapters,
            "matching_chapters": matches,
            "mismatched_chapters": mismatches,
            "total_zh_paragraphs": total_zh_paragraphs,
            "total_en_paragraphs": total_en_paragraphs,
            "paragraph_difference": abs(total_zh_paragraphs - total_en_paragraphs),
        }
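
    # Usage sketch (compare_chapters is not wired into main() below; this call is
    # illustrative only):
    #
    #   stats = parser.compare_chapters(zh_chapters, en_chapters)
    #   print(f"{stats['matching_chapters']}/{stats['total_chapters']} chapters match")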


def main():
    parser = ChapterParser("custom")
    print(f"Debug: Starting parser with base directory: {os.path.abspath('custom')}")

    for novel_dir in ["ast", "desolate_era"]:
        print(f"\n=== Analyzing {novel_dir} ===")

        config = parser.load_format_config(novel_dir)
        zh_chapters = parser.get_txt_content(novel_dir, config["zh"])[:5]
        en_chapters = parser.get_epub_content(novel_dir, config["en"])[:5]

        for i in range(min(5, max(len(zh_chapters), len(en_chapters)))):
            print(f"\nChapter {i+1}:")

            if i >= len(zh_chapters):
                print("ZH chapter missing")
                continue

            if i >= len(en_chapters):
                print("EN chapter missing")
                continue

            zh_ch = zh_chapters[i]
            en_ch = en_chapters[i]

            print(f"ZH Title: {zh_ch['title']}")
            print(f"EN Title: {en_ch['title']}")
            print(f"ZH paragraphs: {len(zh_ch['content'])}")
            print(f"EN paragraphs: {len(en_ch['content'])}")

            # Compare paragraphs
            max_paras = max(len(zh_ch["content"]), len(en_ch["content"]))
            for p_idx in range(max_paras):
                zh_para = (
                    zh_ch["content"][p_idx] if p_idx < len(zh_ch["content"]) else None
                )
                en_para = (
                    en_ch["content"][p_idx] if p_idx < len(en_ch["content"]) else None
                )

                # Only print if one is missing or they're significantly different in length
                if zh_para is None:
                    print(f"\nExtra EN paragraph at position {p_idx+1}:")
                    print(f"EN: {en_para[:100]}...")
                elif en_para is None:
                    print(f"\nExtra ZH paragraph at position {p_idx+1}:")
                    print(f"ZH: {zh_para[:100]}...")
                elif (
                    abs(len(zh_para) - len(en_para)) > 50
                ):  # Threshold for length mismatch
                    print(f"\nLength mismatch at paragraph {p_idx+1}:")
                    print(f"ZH ({len(zh_para)} chars): {zh_para[:100]}...")
                    print(f"EN ({len(en_para)} chars): {en_para[:100]}...")


if __name__ == "__main__":
    main()