chore: add custom_parser.py for ZH/EN chapter alignment checks
custom_parser.py | 272 (new file)
@@ -0,0 +1,272 @@
import os
import json
import re
from pathlib import Path

from bs4 import BeautifulSoup


class ChapterParser:
    def __init__(self, base_dir):
        self.base_dir = Path(base_dir)

    def load_format_config(self, novel_dir):
        """Load the format.json configuration for a novel directory."""
        format_path = self.base_dir / novel_dir / "format.json"
        with open(format_path, "r", encoding="utf-8") as f:
            return json.load(f)
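    # A sketch of the format.json this expects -- only the keys read elsewhere
    # in this file ("zh", "en", "path", "idx", "pattern") are real; the file
    # names and values below are hypothetical:
    #
    #   {
    #     "zh": {"path": "zh.txt", "idx": 1},
    #     "en": {"path": "en/", "idx": 0, "pattern": "Section0*(\\d+)"}
    #   }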
    def get_txt_content(self, novel_dir, config):
        """Read and parse text content based on indentation patterns."""
        txt_path = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading text file from {txt_path}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            print(f"Debug: Successfully read {len(lines)} lines")
        except Exception as e:
            print(f"Debug: Error reading file: {e}")
            return []

        # Skip lines until reaching the configured starting index (1-based)
        content_lines = lines[config["idx"] - 1 :]

        chapters = []
        current_chapter = None
        chapter_content = []

        # Indentation may be an ASCII space or a full-width space (U+3000),
        # which is common in Chinese text files
        indents = (" ", "　")

        for i, line in enumerate(content_lines):
            line = line.rstrip("\n")  # Preserve leading whitespace
            if not line:  # Skip empty lines
                continue

            # Check if this is a root-level line (no indentation)
            if not line.startswith(indents):
                # A root-level line followed by an indented line is a chapter header
                if i + 1 < len(content_lines):
                    next_line = content_lines[i + 1].rstrip("\n")
                    if next_line and next_line.startswith(indents):
                        # Flush the previous chapter before starting a new one
                        if current_chapter:
                            chapters.append(
                                {
                                    "number": current_chapter["number"],
                                    "title": current_chapter["title"],
                                    "content": chapter_content,
                                }
                            )

                        # Try to extract the chapter number if present
                        chapter_num_match = re.search(r"第(\d+)章", line)
                        chapter_num = (
                            int(chapter_num_match.group(1))
                            if chapter_num_match
                            else len(chapters) + 1
                        )

                        current_chapter = {
                            "number": chapter_num,
                            "title": line,
                        }
                        chapter_content = []
                        continue

            # An indented line inside a chapter is a content paragraph
            if current_chapter and line.startswith(indents):
                chapter_content.append(line.lstrip(" 　"))  # Remove indentation

        # Add the last chapter
        if current_chapter:
            chapters.append(
                {
                    "number": current_chapter["number"],
                    "title": current_chapter["title"],
                    "content": chapter_content,
                }
            )

        return chapters
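    # A sketch of the plain-text layout get_txt_content assumes: titles flush
    # left, body paragraphs indented with ASCII or full-width spaces (the
    # sample text is hypothetical):
    #
    #   第1章 ...        <- unindented title line ("Chapter 1 ...")
    #       paragraph 1  <- indented content
    #       paragraph 2
    #   第2章 ...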
    def get_epub_content(self, novel_dir, config):
        """Parse EPUB XHTML content and extract chapters."""
        epub_dir = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading EPUB content from {epub_dir}")

        def extract_number_from_filename(filename, pattern=None):
            """Extract the chapter number from a filename using the configured pattern or the defaults."""
            patterns = [pattern] if pattern else []
            patterns.extend(
                [
                    r"Section0*(\d+)",  # Section0000.xhtml
                    r"^0*(\d+)[-_]",  # 0000_Book_1, 00001-1
                    r"split_0*(\d+)",  # index_split_001
                ]
            )

            for pat in patterns:
                if pat and (match := re.search(pat, filename)):
                    return int(match.group(1))
            return None

        def get_valid_files():
            """Get the list of XHTML files that meet the index criteria."""
            try:
                xhtml_files = sorted(
                    f for f in os.listdir(epub_dir) if f.endswith((".xhtml", ".html"))
                )

                if not xhtml_files:
                    return []

                if config["idx"] == 0:
                    return xhtml_files

                pattern = config.get("pattern")
                return [
                    fname
                    for fname in xhtml_files
                    if (num := extract_number_from_filename(fname, pattern)) is not None
                    and num >= config["idx"]
                ]

            except Exception as e:
                print(f"Debug: Error listing directory: {e}")
                return []

        def parse_chapter(filename, chapter_num):
            """Parse a single chapter file and extract its paragraphs."""
            try:
                with open(epub_dir / filename, "r", encoding="utf-8") as f:
                    soup = BeautifulSoup(f.read(), "html.parser")

                paragraphs = [
                    p.get_text().strip()
                    for p in soup.find_all("p")
                    if p.get_text().strip()
                ]

                return {
                    "number": chapter_num,
                    "title": f"Chapter {chapter_num}",
                    "content": paragraphs,
                }
            except Exception as e:
                print(f"Debug: Error processing {filename}: {e}")
                return None

        # Main processing: chapters are numbered by position in the sorted
        # file list, not by the number extracted from the filename
        files_to_process = get_valid_files()
        if not files_to_process:
            print("Debug: No valid files found to process")
            return []

        chapters = []
        for i, filename in enumerate(files_to_process, start=1):
            if chapter := parse_chapter(filename, i):
                chapters.append(chapter)

        return chapters
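    # Filename-to-number examples for the default patterns above (the sample
    # names are illustrative, not from any real EPUB):
    #   "Section0012.xhtml"     -> 12
    #   "0003_Book_1.xhtml"     -> 3
    #   "index_split_007.xhtml" -> 7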
    def compare_chapters(self, zh_chapters, en_chapters):
        """Compare Chinese and English chapters and return aggregate statistics."""
        total_chapters = len(zh_chapters)
        print(
            f"\nDebug: Found {len(zh_chapters)} Chinese chapters and {len(en_chapters)} English chapters"
        )

        if total_chapters == 0:
            print("Debug: No chapters found to compare!")
            return {
                "total_chapters": 0,
                "matching_chapters": 0,
                "mismatched_chapters": 0,
                "total_zh_paragraphs": 0,
                "total_en_paragraphs": 0,
                "paragraph_difference": 0,
            }

        matches = 0
        mismatches = 0
        total_zh_paragraphs = 0
        total_en_paragraphs = 0

        # zip() truncates to the shorter list, so unpaired trailing chapters
        # are not counted as mismatches
        for zh_chapter, en_chapter in zip(zh_chapters, en_chapters):
            zh_para_count = len(zh_chapter["content"])
            en_para_count = len(en_chapter["content"])

            total_zh_paragraphs += zh_para_count
            total_en_paragraphs += en_para_count

            if zh_para_count == en_para_count:
                matches += 1
            else:
                mismatches += 1

        return {
            "total_chapters": total_chapters,
            "matching_chapters": matches,
            "mismatched_chapters": mismatches,
            "total_zh_paragraphs": total_zh_paragraphs,
            "total_en_paragraphs": total_en_paragraphs,
            "paragraph_difference": abs(total_zh_paragraphs - total_en_paragraphs),
        }
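    # Illustrative call with toy data (hypothetical, not from the repo):
    #   ChapterParser(".").compare_chapters(
    #       [{"content": ["a", "b"]}], [{"content": ["a"]}]
    #   )
    #   -> total_chapters=1, matching_chapters=0, mismatched_chapters=1,
    #      total_zh_paragraphs=2, total_en_paragraphs=1, paragraph_difference=1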
def main():
    parser = ChapterParser("custom")
    print(f"Debug: Starting parser with base directory: {os.path.abspath('custom')}")

    for novel_dir in ["ast", "desolate_era"]:
        print(f"\n=== Analyzing {novel_dir} ===")

        config = parser.load_format_config(novel_dir)
        zh_chapters = parser.get_txt_content(novel_dir, config["zh"])[:5]
        en_chapters = parser.get_epub_content(novel_dir, config["en"])[:5]

        for i in range(min(5, max(len(zh_chapters), len(en_chapters)))):
            print(f"\nChapter {i+1}:")

            if i >= len(zh_chapters):
                print("ZH chapter missing")
                continue

            if i >= len(en_chapters):
                print("EN chapter missing")
                continue

            zh_ch = zh_chapters[i]
            en_ch = en_chapters[i]

            print(f"ZH Title: {zh_ch['title']}")
            print(f"EN Title: {en_ch['title']}")
            print(f"ZH paragraphs: {len(zh_ch['content'])}")
            print(f"EN paragraphs: {len(en_ch['content'])}")

            # Compare paragraphs pairwise
            max_paras = max(len(zh_ch["content"]), len(en_ch["content"]))
            for p_idx in range(max_paras):
                zh_para = (
                    zh_ch["content"][p_idx] if p_idx < len(zh_ch["content"]) else None
                )
                en_para = (
                    en_ch["content"][p_idx] if p_idx < len(en_ch["content"]) else None
                )

                # Only print if one side is missing or the lengths differ significantly
                if zh_para is None:
                    print(f"\nExtra EN paragraph at position {p_idx+1}:")
                    print(f"EN: {en_para[:100]}...")
                elif en_para is None:
                    print(f"\nExtra ZH paragraph at position {p_idx+1}:")
                    print(f"ZH: {zh_para[:100]}...")
                elif abs(len(zh_para) - len(en_para)) > 50:  # Length-mismatch threshold
                    print(f"\nLength mismatch at paragraph {p_idx+1}:")
                    print(f"ZH ({len(zh_para)} chars): {zh_para[:100]}...")
                    print(f"EN ({len(en_para)} chars): {en_para[:100]}...")


if __name__ == "__main__":
    main()
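# Assumed on-disk layout, inferred from main() above (the zh.txt / en/ names
# come from each novel's format.json and are hypothetical here):
#
#   custom/
#     ast/
#       format.json
#       zh.txt
#       en/
#     desolate_era/
#       format.json
#       ...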