chore: more!
This commit is contained in:
@@ -183,26 +183,26 @@ class EpubChapterParser:
|
||||
# start_idx = config["idx"]
|
||||
# last_num = max(num for _, num in files_with_nums)
|
||||
# existing_nums = {num for _, num in files_with_nums}
|
||||
|
||||
|
||||
# missing_nums = []
|
||||
# for expected_num in range(start_idx, last_num + 1):
|
||||
# if expected_num not in existing_nums:
|
||||
# prev_file = next(((f, n) for f, n in files_with_nums if n < expected_num), None)
|
||||
# next_file = next(((f, n) for f, n in files_with_nums if n > expected_num), None)
|
||||
|
||||
|
||||
# missing_nums.append({
|
||||
# 'missing_num': expected_num,
|
||||
# 'prev_file': prev_file[0] if prev_file else None,
|
||||
# 'next_file': next_file[0] if next_file else None
|
||||
# })
|
||||
|
||||
|
||||
# if missing_nums:
|
||||
# logger.warning(f"Found {len(missing_nums)} gaps in file sequence:")
|
||||
# for gap in missing_nums:
|
||||
# logger.warning(
|
||||
# f"Missing number {gap['missing_num']} "
|
||||
# f"(between files: {gap['prev_file']} and {gap['next_file']})"
|
||||
# )
|
||||
# )
|
||||
|
||||
return [fname for fname, _ in files_with_nums]
|
||||
|
||||
@@ -225,6 +225,9 @@ class EpubChapterParser:
|
||||
|
||||
paragraphs = []
|
||||
for p in soup.find_all("p"):
|
||||
for br in p.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
|
||||
# sup (footnotes) and a elements
|
||||
for element in p.find_all(["sup", "a"]):
|
||||
element.decompose()
|
||||
@@ -270,11 +273,26 @@ class ChapterParser:
|
||||
"""Parse EPUB content into chapters."""
|
||||
epub_dir = self.base_dir / novel_dir / config["path"]
|
||||
valid_files = self.epub_parser.get_valid_files(epub_dir, config)
|
||||
|
||||
chapters = []
|
||||
|
||||
for i, filename in enumerate(valid_files, start=1):
|
||||
if chapter := self.epub_parser.parse_chapter_file(epub_dir / filename, i):
|
||||
chapters.append(chapter)
|
||||
if not (
|
||||
chapter := self.epub_parser.parse_chapter_file(epub_dir / filename, i)
|
||||
):
|
||||
continue
|
||||
|
||||
if lshift := config.get("lshiftp", 0):
|
||||
chapter.content = chapter.content[lshift:]
|
||||
if not chapter.content:
|
||||
continue
|
||||
|
||||
if config.get("include_title"):
|
||||
if not chapter.content or not (
|
||||
re.match(config.get("title_pattern", ""), chapter.content[0])
|
||||
):
|
||||
chapter.content = [chapter.title] + chapter.content
|
||||
|
||||
chapters.append(chapter)
|
||||
|
||||
return chapters
|
||||
|
||||
@@ -289,15 +307,14 @@ def print_chapter_titles(
|
||||
logger.info(f"zh: {len(zh_chapters)}")
|
||||
logger.info(f"en: {len(en_chapters)}")
|
||||
|
||||
logger.info("\n=== Chapter Title Comparison ===")
|
||||
logger.info(f"{'English':<50} | {'Chinese'}")
|
||||
logger.info("-" * 80)
|
||||
|
||||
# logger.info("\n=== Chapter Title Comparison ===")
|
||||
# logger.info(f"{'English':<50} | {'Chinese'}")
|
||||
# logger.info("-" * 80)
|
||||
|
||||
# for i in range(max_chapters):
|
||||
# en_title = en_chapters[i].title if i < len(en_chapters) else "N/A"
|
||||
# zh_title = zh_chapters[i].title if i < len(zh_chapters) else "N/A"
|
||||
# logger.info(f"{en_title:<50} | {zh_title}")
|
||||
for i in range(max_chapters):
|
||||
en_title = en_chapters[i].title if i < len(en_chapters) else "N/A"
|
||||
zh_title = zh_chapters[i].title if i < len(zh_chapters) else "N/A"
|
||||
logger.info(f"{en_title:<50} | {zh_title}")
|
||||
|
||||
|
||||
def create_db_entries(
|
||||
@@ -325,8 +342,8 @@ def create_db_entries(
|
||||
(
|
||||
novel_dir,
|
||||
str(zh_chap.number),
|
||||
"\n".join(zh_chap.content),
|
||||
"\n".join(en_chap.content),
|
||||
"\n\n".join(zh_chap.content),
|
||||
"\n\n".join(en_chap.content),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -350,7 +367,7 @@ def main():
|
||||
epub_dirs = [
|
||||
d for d in os.listdir("custom") if os.path.isdir(os.path.join("custom", d))
|
||||
]
|
||||
# epub_dirs = ["ast"]
|
||||
# epub_dirs = ["warlock"]
|
||||
|
||||
for novel_dir in epub_dirs:
|
||||
logger.info(f"\n=== Analyzing {novel_dir} ===")
|
||||
|
||||
Reference in New Issue
Block a user