chore: more!

This commit is contained in:
2025-02-11 13:28:12 +06:00
parent 28342e0ace
commit befdc9c945
2068 changed files with 102392 additions and 908 deletions

View File

@@ -183,26 +183,26 @@ class EpubChapterParser:
# start_idx = config["idx"]
# last_num = max(num for _, num in files_with_nums)
# existing_nums = {num for _, num in files_with_nums}
# missing_nums = []
# for expected_num in range(start_idx, last_num + 1):
# if expected_num not in existing_nums:
# prev_file = next(((f, n) for f, n in files_with_nums if n < expected_num), None)
# next_file = next(((f, n) for f, n in files_with_nums if n > expected_num), None)
# missing_nums.append({
# 'missing_num': expected_num,
# 'prev_file': prev_file[0] if prev_file else None,
# 'next_file': next_file[0] if next_file else None
# })
# if missing_nums:
# logger.warning(f"Found {len(missing_nums)} gaps in file sequence:")
# for gap in missing_nums:
# logger.warning(
# f"Missing number {gap['missing_num']} "
# f"(between files: {gap['prev_file']} and {gap['next_file']})"
# )
# )
return [fname for fname, _ in files_with_nums]
@@ -225,6 +225,9 @@ class EpubChapterParser:
paragraphs = []
for p in soup.find_all("p"):
for br in p.find_all("br"):
br.replace_with("\n")
# Remove <sup> (footnote markers) and <a> elements from the paragraph
for element in p.find_all(["sup", "a"]):
element.decompose()
@@ -270,11 +273,26 @@ class ChapterParser:
"""Parse EPUB content into chapters."""
epub_dir = self.base_dir / novel_dir / config["path"]
valid_files = self.epub_parser.get_valid_files(epub_dir, config)
chapters = []
for i, filename in enumerate(valid_files, start=1):
if chapter := self.epub_parser.parse_chapter_file(epub_dir / filename, i):
chapters.append(chapter)
if not (
chapter := self.epub_parser.parse_chapter_file(epub_dir / filename, i)
):
continue
if lshift := config.get("lshiftp", 0):
chapter.content = chapter.content[lshift:]
if not chapter.content:
continue
if config.get("include_title"):
if not chapter.content or not (
re.match(config.get("title_pattern", ""), chapter.content[0])
):
chapter.content = [chapter.title] + chapter.content
chapters.append(chapter)
return chapters
@@ -289,15 +307,14 @@ def print_chapter_titles(
logger.info(f"zh: {len(zh_chapters)}")
logger.info(f"en: {len(en_chapters)}")
logger.info("\n=== Chapter Title Comparison ===")
logger.info(f"{'English':<50} | {'Chinese'}")
logger.info("-" * 80)
# logger.info("\n=== Chapter Title Comparison ===")
# logger.info(f"{'English':<50} | {'Chinese'}")
# logger.info("-" * 80)
# for i in range(max_chapters):
# en_title = en_chapters[i].title if i < len(en_chapters) else "N/A"
# zh_title = zh_chapters[i].title if i < len(zh_chapters) else "N/A"
# logger.info(f"{en_title:<50} | {zh_title}")
for i in range(max_chapters):
en_title = en_chapters[i].title if i < len(en_chapters) else "N/A"
zh_title = zh_chapters[i].title if i < len(zh_chapters) else "N/A"
logger.info(f"{en_title:<50} | {zh_title}")
def create_db_entries(
@@ -325,8 +342,8 @@ def create_db_entries(
(
novel_dir,
str(zh_chap.number),
"\n".join(zh_chap.content),
"\n".join(en_chap.content),
"\n\n".join(zh_chap.content),
"\n\n".join(en_chap.content),
),
)
@@ -350,7 +367,7 @@ def main():
epub_dirs = [
d for d in os.listdir("custom") if os.path.isdir(os.path.join("custom", d))
]
# epub_dirs = ["ast"]
# epub_dirs = ["warlock"]
for novel_dir in epub_dirs:
logger.info(f"\n=== Analyzing {novel_dir} ===")