chore: asdas

This commit is contained in:
2025-02-10 17:42:04 +06:00
parent fd380e250d
commit 9746aad58a
3987 changed files with 775441 additions and 16 deletions

272
custom_parser.py Normal file
View File

@@ -0,0 +1,272 @@
import json
import os
import re
from itertools import zip_longest
from pathlib import Path

from bs4 import BeautifulSoup
class ChapterParser:
    """Extract chapter/paragraph structure from a novel's text and EPUB sources.

    Every novel lives in its own sub-directory of ``base_dir``. The per-source
    config dicts handed to the ``get_*_content`` methods are read for:

    * ``"path"``    -- location of the source, relative to the novel directory
    * ``"idx"``     -- first line (text) or first file number (EPUB) to keep
    * ``"pattern"`` -- optional regex (EPUB only); its group 1 is taken as the
      number embedded in a file name

    Chapters are returned as dicts of the shape
    ``{"number": int, "title": str, "content": [paragraph, ...]}``.
    """

    # Matches the numeral in headers such as "第12章".
    _CHAPTER_NUMBER_RE = re.compile(r"第(\d+)章")

    # Fallback file-name patterns, tried in order after any configured pattern.
    _DEFAULT_NUMBER_PATTERNS = (
        r"Section0*(\d+)",  # Section0000.xhtml
        r"^0*(\d+)[-_]",  # 0000_Book_1, 00001-1
        r"split_0*(\d+)",  # index_split_001
    )

    def __init__(self, base_dir):
        """Remember ``base_dir`` (str or path-like) as the root of all novels."""
        self.base_dir = Path(base_dir)

    def load_format_config(self, novel_dir):
        """Load and return the parsed ``format.json`` of a novel directory.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        format_path = self.base_dir / novel_dir / "format.json"
        with open(format_path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _is_indented(line):
        """Return True when ``line`` starts with any whitespace character.

        ``str.isspace`` covers ASCII space, tab and the ideographic space
        (U+3000), so differently indented text files are all recognised.
        """
        return line[:1].isspace()

    def get_txt_content(self, novel_dir, config):
        """Split an indentation-structured text file into chapters.

        A line without leading whitespace that is immediately followed by an
        indented line is a chapter header; indented lines that follow belong
        to that chapter. Other un-indented lines, blank lines and
        whitespace-only lines are ignored.

        Args:
            novel_dir: Sub-directory of ``base_dir`` holding the novel.
            config: Mapping with ``"path"`` (text file, relative to the novel
                directory) and ``"idx"`` (1-based number of the first line to
                consider; values below 1 mean "start at the first line").

        Returns:
            A list of chapter dicts in file order, or ``[]`` when the file
            cannot be read. ``"number"`` comes from a ``第N章`` marker in the
            header when present, otherwise from the chapter's position.
        """
        txt_path = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading text file from {txt_path}")
        try:
            with open(txt_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Debug: Error reading file: {e}")
            return []
        print(f"Debug: Successfully read {len(lines)} lines")

        # Clamp the start: a plain ``idx - 1`` turns idx == 0 into a negative
        # slice start, which would keep only the last line of the file.
        start = max(config["idx"] - 1, 0)
        # Strip only the newline; leading whitespace carries the structure.
        content_lines = [raw.rstrip("\n") for raw in lines[start:]]

        chapters = []
        current_chapter = None
        for i, line in enumerate(content_lines):
            # Whitespace-only lines are skipped as well, so they do not turn
            # into empty paragraphs that distort paragraph counts.
            if not line.strip():
                continue

            if self._is_indented(line):
                # Indented text before the first header has no chapter.
                if current_chapter is not None:
                    current_chapter["content"].append(line.lstrip())
                continue

            # Un-indented line: a header only if the very next line is indented.
            next_line = content_lines[i + 1] if i + 1 < len(content_lines) else ""
            if not self._is_indented(next_line):
                continue

            number_match = self._CHAPTER_NUMBER_RE.search(line)
            current_chapter = {
                "number": (
                    int(number_match.group(1))
                    if number_match
                    else len(chapters) + 1
                ),
                "title": line,
                "content": [],
            }
            chapters.append(current_chapter)
        return chapters

    @staticmethod
    def _natural_sort_key(filename):
        """Build a sort key that compares digit runs by numeric value.

        ``re.split`` with a capturing group alternates text and digit tokens,
        always starting with a (possibly empty) text token, so tokens at the
        same position always have the same type. The raw name is the
        tie-breaker for names that differ only in zero padding.
        """
        tokens = [
            int(token) if position % 2 else token
            for position, token in enumerate(re.split(r"(\d+)", filename))
        ]
        return tokens, filename

    @classmethod
    def _extract_file_number(cls, filename, pattern=None):
        """Return the number embedded in ``filename``, or None if none matches.

        ``pattern`` (if given) is tried first, then the default patterns.
        Group 1 of the first matching pattern is converted with ``int``.
        """
        candidates = [pattern] if pattern else []
        candidates.extend(cls._DEFAULT_NUMBER_PATTERNS)
        for candidate in candidates:
            if match := re.search(candidate, filename):
                return int(match.group(1))
        return None

    def _select_epub_files(self, epub_dir, config):
        """List the ``.xhtml``/``.html`` files of ``epub_dir`` to be parsed.

        Files are returned in natural order, so ``Section2`` precedes
        ``Section10`` even without zero padding. With ``config["idx"] == 0``
        every file is kept; otherwise only files whose embedded number is at
        least ``config["idx"]``. Any failure is reported and yields ``[]``.
        """
        try:
            names = sorted(
                (
                    name
                    for name in os.listdir(epub_dir)
                    if name.endswith((".xhtml", ".html"))
                ),
                key=self._natural_sort_key,
            )
            if not names:
                return []
            first_number = config["idx"]
            if first_number == 0:
                return names
            pattern = config.get("pattern")
            return [
                name
                for name in names
                if (number := self._extract_file_number(name, pattern)) is not None
                and number >= first_number
            ]
        except Exception as e:
            # Deliberately broad: besides I/O errors this also covers a
            # malformed configured regex or one without a capture group.
            print(f"Debug: Error listing directory: {e}")
            return []

    @staticmethod
    def _parse_epub_chapter(epub_dir, filename, chapter_num):
        """Parse one chapter file; return its chapter dict or None on failure.

        The content is the stripped text of every ``<p>`` element that is not
        empty after stripping. The title is always ``"Chapter <chapter_num>"``.
        """
        try:
            with open(epub_dir / filename, "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f.read(), "html.parser")
            paragraphs = [
                text for p in soup.find_all("p") if (text := p.get_text().strip())
            ]
        except Exception as e:
            # Deliberately broad: a single unreadable or unparsable file must
            # not abort the whole book.
            print(f"Debug: Error processing {filename}: {e}")
            return None
        return {
            "number": chapter_num,
            "title": f"Chapter {chapter_num}",
            "content": paragraphs,
        }

    def get_epub_content(self, novel_dir, config):
        """Parse the XHTML/HTML files of an unpacked EPUB into chapters.

        Args:
            novel_dir: Sub-directory of ``base_dir`` holding the novel.
            config: Mapping with ``"path"`` (directory of chapter files,
                relative to the novel directory), ``"idx"`` (0 keeps every
                file, otherwise the smallest file number to keep) and an
                optional ``"pattern"`` regex for extracting file numbers.

        Returns:
            A list of chapter dicts. Chapters are numbered by the position of
            their file among the selected files, so a file that fails to
            parse leaves a gap in the numbering. ``[]`` if nothing is found.
        """
        epub_dir = self.base_dir / novel_dir / config["path"]
        print(f"\nDebug: Reading EPUB content from {epub_dir}")

        files_to_process = self._select_epub_files(epub_dir, config)
        if not files_to_process:
            print("Debug: No valid files found to process")
            return []

        chapters = []
        for chapter_num, filename in enumerate(files_to_process, start=1):
            chapter = self._parse_epub_chapter(epub_dir, filename, chapter_num)
            if chapter is not None:
                chapters.append(chapter)
        return chapters

    def compare_chapters(self, zh_chapters, en_chapters):
        """Compare paragraph counts of Chinese and English chapters pairwise.

        Chapters are paired by position. A chapter without a counterpart in
        the other language is counted as a mismatch and its paragraphs are
        still included in the totals, so ``matching_chapters`` plus
        ``mismatched_chapters`` always equals ``total_chapters``.

        Args:
            zh_chapters: Chapter dicts of the Chinese source.
            en_chapters: Chapter dicts of the English source.

        Returns:
            A dict with the keys ``total_chapters`` (length of the longer
            list), ``matching_chapters``, ``mismatched_chapters``,
            ``total_zh_paragraphs``, ``total_en_paragraphs`` and
            ``paragraph_difference`` (absolute difference of the two totals).
        """
        total_chapters = max(len(zh_chapters), len(en_chapters))
        print(
            f"\nDebug: Found {len(zh_chapters)} Chinese chapters and {len(en_chapters)} English chapters"
        )
        if total_chapters == 0:
            print("Debug: No chapters found to compare!")

        matches = 0
        total_zh_paragraphs = 0
        total_en_paragraphs = 0
        for zh_chapter, en_chapter in zip_longest(zh_chapters, en_chapters):
            zh_para_count = len(zh_chapter["content"]) if zh_chapter is not None else 0
            en_para_count = len(en_chapter["content"]) if en_chapter is not None else 0
            total_zh_paragraphs += zh_para_count
            total_en_paragraphs += en_para_count
            # Only a real pair can match; an unpaired empty chapter must not
            # be mistaken for a match against the missing side's zero.
            if (
                zh_chapter is not None
                and en_chapter is not None
                and zh_para_count == en_para_count
            ):
                matches += 1

        return {
            "total_chapters": total_chapters,
            "matching_chapters": matches,
            "mismatched_chapters": total_chapters - matches,
            "total_zh_paragraphs": total_zh_paragraphs,
            "total_en_paragraphs": total_en_paragraphs,
            "paragraph_difference": abs(total_zh_paragraphs - total_en_paragraphs),
        }
def _report_paragraph_differences(
    zh_paragraphs, en_paragraphs, length_threshold, preview_chars
):
    """Print paragraphs that are unpaired or differ strongly in length.

    Paragraphs are paired by position. Nothing is printed for a pair whose
    lengths differ by at most ``length_threshold`` characters. Printed
    paragraphs are cut to their first ``preview_chars`` characters.
    """
    paired = zip_longest(zh_paragraphs, en_paragraphs)
    for position, (zh_para, en_para) in enumerate(paired, start=1):
        if zh_para is None:
            print(f"\nExtra EN paragraph at position {position}:")
            print(f"EN: {en_para[:preview_chars]}...")
        elif en_para is None:
            print(f"\nExtra ZH paragraph at position {position}:")
            print(f"ZH: {zh_para[:preview_chars]}...")
        elif abs(len(zh_para) - len(en_para)) > length_threshold:
            print(f"\nLength mismatch at paragraph {position}:")
            print(f"ZH ({len(zh_para)} chars): {zh_para[:preview_chars]}...")
            print(f"EN ({len(en_para)} chars): {en_para[:preview_chars]}...")


def main(
    base_dir="custom",
    novel_dirs=("ast", "desolate_era"),
    *,
    max_chapters=5,
    length_threshold=50,
    preview_chars=100,
):
    """Print a side-by-side chapter/paragraph report for each novel.

    For every novel the ``format.json`` config is loaded, its ``"zh"`` entry
    is parsed as text and its ``"en"`` entry as EPUB, and the leading chapters
    of both are compared position by position.

    Args:
        base_dir: Directory containing one sub-directory per novel.
        novel_dirs: Names of the novel sub-directories to analyse.
        max_chapters: Number of leading chapters inspected per novel.
        length_threshold: Character-count difference above which a paragraph
            pair is reported as a length mismatch.
        preview_chars: Number of characters shown of each reported paragraph.

    The defaults reproduce the previously hard-coded behaviour.
    """
    parser = ChapterParser(base_dir)
    print(f"Debug: Starting parser with base directory: {os.path.abspath(base_dir)}")

    for novel_dir in novel_dirs:
        print(f"\n=== Analyzing {novel_dir} ===")
        config = parser.load_format_config(novel_dir)
        zh_chapters = parser.get_txt_content(novel_dir, config["zh"])[:max_chapters]
        en_chapters = parser.get_epub_content(novel_dir, config["en"])[:max_chapters]

        # Both lists are already capped, so pairing them up to the longer one
        # never exceeds ``max_chapters`` iterations.
        paired = zip_longest(zh_chapters, en_chapters)
        for chapter_no, (zh_ch, en_ch) in enumerate(paired, start=1):
            print(f"\nChapter {chapter_no}:")
            if zh_ch is None:
                print("ZH chapter missing")
                continue
            if en_ch is None:
                print("EN chapter missing")
                continue

            print(f"ZH Title: {zh_ch['title']}")
            print(f"EN Title: {en_ch['title']}")
            print(f"ZH paragraphs: {len(zh_ch['content'])}")
            print(f"EN paragraphs: {len(en_ch['content'])}")
            _report_paragraph_differences(
                zh_ch["content"], en_ch["content"], length_threshold, preview_chars
            )


if __name__ == "__main__":
    main()