Files
zh-en-wn-dataset/custom_parser.py
2025-02-10 17:42:04 +06:00

273 lines
9.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import json
from bs4 import BeautifulSoup
from pathlib import Path
import re
# Paragraphs in the Chinese source texts are indented with either an ASCII
# space or the full-width ideographic space (U+3000).  NOTE(review): the
# original file carried invisible U+3000 characters inside its string
# literals (the hosting UI flagged "invisible Unicode characters"); they are
# spelled out as explicit escapes here so the intent is visible — confirm
# against the raw bytes of the original file.
_INDENT_CHARS = (" ", "\u3000")


class ChapterParser:
    """Parse paired Chinese (plain-text) and English (EPUB XHTML) novel chapters."""

    def __init__(self, base_dir):
        # Root directory holding one sub-directory per novel.
        self.base_dir = Path(base_dir)

    def load_format_config(self, novel_dir):
        """Load and return the format.json configuration for *novel_dir*."""
        format_path = self.base_dir / novel_dir / "format.json"
        with open(format_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def get_txt_content(self, novel_dir, config):
        """Split a plain-text novel into chapters based on indentation.

        A non-indented line immediately followed by an indented line is
        treated as a chapter header; indented lines are paragraphs of the
        current chapter.  ``config`` must supply ``path`` (text file
        relative to the novel directory) and ``idx`` (1-based line number
        to start reading from).

        Returns a list of ``{"number", "title", "content"}`` dicts; an
        unreadable file yields an empty list.
        """
        txt_path = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading text file from {txt_path}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            print(f"Debug: Successfully read {len(lines)} lines")
        except Exception as e:
            # Best-effort: report and return nothing rather than crash.
            print(f"Debug: Error reading file: {e}")
            return []
        # Skip lines until reaching the 1-based starting index.
        content_lines = lines[config["idx"] - 1 :]
        chapters = []
        current_chapter = None
        chapter_content = []
        for i, line in enumerate(content_lines):
            line = line.rstrip("\n")  # preserve leading whitespace
            if not line:  # skip empty lines
                continue
            # Root-level line (no ASCII or ideographic-space indentation)?
            if not line.startswith(_INDENT_CHARS):
                # It is a chapter header only when the following line is an
                # indented (paragraph) line.
                if i + 1 < len(content_lines):
                    next_line = content_lines[i + 1].rstrip("\n")
                    if next_line and next_line.startswith(_INDENT_CHARS):
                        # Flush the chapter collected so far.
                        if current_chapter:
                            chapters.append(
                                {
                                    "number": current_chapter["number"],
                                    "title": current_chapter["title"],
                                    "content": chapter_content,
                                }
                            )
                        # Prefer the explicit number in a "第N章" header;
                        # otherwise fall back to sequential numbering.
                        chapter_num_match = re.search(r"第(\d+)章", line)
                        chapter_num = (
                            int(chapter_num_match.group(1))
                            if chapter_num_match
                            else len(chapters) + 1
                        )
                        current_chapter = {
                            "number": chapter_num,
                            "title": line,
                        }
                        chapter_content = []
                        continue
            # Indented line inside a chapter: strip the indentation, keep text.
            if current_chapter and line.startswith(_INDENT_CHARS):
                chapter_content.append(line.lstrip(" \u3000"))
        # Flush the last open chapter.
        if current_chapter:
            chapters.append(
                {
                    "number": current_chapter["number"],
                    "title": current_chapter["title"],
                    "content": chapter_content,
                }
            )
        return chapters

    def get_epub_content(self, novel_dir, config):
        """Extract chapters from a directory of unpacked EPUB XHTML files.

        ``config`` must supply ``path`` (directory of .xhtml/.html files
        relative to the novel directory) and ``idx`` (minimum filename
        chapter number to keep; 0 keeps every file); ``pattern``
        optionally adds a custom filename-number regex tried first.

        Returns a list of ``{"number", "title", "content"}`` dicts where
        ``content`` holds the non-empty <p> texts of each file.
        """
        epub_dir = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading EPUB content from {epub_dir}")

        def extract_number_from_filename(filename, pattern=None):
            """Extract a chapter number from *filename*, trying *pattern* first."""
            patterns = [pattern] if pattern else []
            patterns.extend(
                [
                    r"Section0*(\d+)",  # Section0000.xhtml
                    r"^0*(\d+)[-_]",  # 0000_Book_1, 00001-1
                    r"split_0*(\d+)",  # index_split_001
                ]
            )
            for pat in patterns:
                if pat and (match := re.search(pat, filename)):
                    return int(match.group(1))
            return None

        def get_valid_files():
            """Return sorted XHTML filenames meeting the index criteria."""
            try:
                xhtml_files = sorted(
                    f for f in os.listdir(epub_dir) if f.endswith((".xhtml", ".html"))
                )
                if not xhtml_files:
                    return []
                if config["idx"] == 0:  # 0 means "no filtering"
                    return xhtml_files
                pattern = config.get("pattern")
                return [
                    fname
                    for fname in xhtml_files
                    if (num := extract_number_from_filename(fname, pattern)) is not None
                    and num >= config["idx"]
                ]
            except Exception as e:
                print(f"Debug: Error listing directory: {e}")
                return []

        def parse_chapter(filename, chapter_num):
            """Parse one XHTML file; return a chapter dict or None on failure."""
            try:
                with open(epub_dir / filename, "r", encoding="utf-8") as f:
                    soup = BeautifulSoup(f.read(), "html.parser")
                paragraphs = [
                    p.get_text().strip()
                    for p in soup.find_all("p")
                    if p.get_text().strip()
                ]
                return {
                    "number": chapter_num,
                    "title": f"Chapter {chapter_num}",
                    "content": paragraphs,
                }
            except Exception as e:
                # BUG FIX: the message previously printed a "(unknown)"
                # placeholder instead of the offending file's name.
                print(f"Debug: Error processing {filename}: {e}")
                return None

        # Main processing: chapters are numbered sequentially over the
        # files that survived filtering, not by their filename numbers.
        files_to_process = get_valid_files()
        if not files_to_process:
            print("Debug: No valid files found to process")
            return []
        chapters = []
        for i, filename in enumerate(files_to_process, start=1):
            if chapter := parse_chapter(filename, i):
                chapters.append(chapter)
        return chapters

    def compare_chapters(self, zh_chapters, en_chapters):
        """Compare zh/en chapter lists and return aggregate statistics.

        Chapters are paired positionally.  NOTE(review): ``zip`` truncates
        to the shorter list, so unpaired trailing chapters contribute to
        ``total_chapters`` (taken from the zh side) but to neither
        ``matching_chapters`` nor ``mismatched_chapters`` — confirm this
        is the intended accounting.
        """
        total_chapters = len(zh_chapters)
        print(
            f"\nDebug: Found {len(zh_chapters)} Chinese chapters and {len(en_chapters)} English chapters"
        )
        if total_chapters == 0:
            print("Debug: No chapters found to compare!")
            return {
                "total_chapters": 0,
                "matching_chapters": 0,
                "mismatched_chapters": 0,
                "total_zh_paragraphs": 0,
                "total_en_paragraphs": 0,
                "paragraph_difference": 0,
            }
        matches = 0
        mismatches = 0
        total_zh_paragraphs = 0
        total_en_paragraphs = 0
        for zh_chapter, en_chapter in zip(zh_chapters, en_chapters):
            zh_para_count = len(zh_chapter["content"])
            en_para_count = len(en_chapter["content"])
            total_zh_paragraphs += zh_para_count
            total_en_paragraphs += en_para_count
            if zh_para_count == en_para_count:
                matches += 1
            else:
                mismatches += 1
        return {
            "total_chapters": total_chapters,
            "matching_chapters": matches,
            "mismatched_chapters": mismatches,
            "total_zh_paragraphs": total_zh_paragraphs,
            "total_en_paragraphs": total_en_paragraphs,
            "paragraph_difference": abs(total_zh_paragraphs - total_en_paragraphs),
        }
def main():
    """Entry point: preview and compare the first five zh/en chapters of each novel."""
    chapter_parser = ChapterParser("custom")
    print(f"Debug: Starting parser with base directory: {os.path.abspath('custom')}")
    for novel_dir in ["ast", "desolate_era"]:
        print(f"\n=== Analyzing {novel_dir} ===")
        fmt = chapter_parser.load_format_config(novel_dir)
        # Only the first five chapters of each side are previewed.
        zh_side = chapter_parser.get_txt_content(novel_dir, fmt["zh"])[:5]
        en_side = chapter_parser.get_epub_content(novel_dir, fmt["en"])[:5]
        preview_count = min(5, max(len(zh_side), len(en_side)))
        for idx in range(preview_count):
            print(f"\nChapter {idx + 1}:")
            # Guard clauses: one side may have fewer chapters than the other.
            if idx >= len(zh_side):
                print("ZH chapter missing")
                continue
            if idx >= len(en_side):
                print("EN chapter missing")
                continue
            zh_chapter, en_chapter = zh_side[idx], en_side[idx]
            print(f"ZH Title: {zh_chapter['title']}")
            print(f"EN Title: {en_chapter['title']}")
            print(f"ZH paragraphs: {len(zh_chapter['content'])}")
            print(f"EN paragraphs: {len(en_chapter['content'])}")
            # Walk paragraph pairs; report only missing paragraphs or
            # pairs whose lengths diverge significantly.
            pair_total = max(len(zh_chapter["content"]), len(en_chapter["content"]))
            for pos in range(pair_total):
                zh_para = (
                    zh_chapter["content"][pos]
                    if pos < len(zh_chapter["content"])
                    else None
                )
                en_para = (
                    en_chapter["content"][pos]
                    if pos < len(en_chapter["content"])
                    else None
                )
                if zh_para is None:
                    print(f"\nExtra EN paragraph at position {pos + 1}:")
                    print(f"EN: {en_para[:100]}...")
                elif en_para is None:
                    print(f"\nExtra ZH paragraph at position {pos + 1}:")
                    print(f"ZH: {zh_para[:100]}...")
                elif abs(len(zh_para) - len(en_para)) > 50:  # length-mismatch threshold
                    print(f"\nLength mismatch at paragraph {pos + 1}:")
                    print(f"ZH ({len(zh_para)} chars): {zh_para[:100]}...")
                    print(f"EN ({len(en_para)} chars): {en_para[:100]}...")
# Run the comparison report only when executed as a script, not on import.
if __name__ == "__main__":
    main()