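chore: more changes, incl. custom_parser.py (keep <br> line breaks, add lshiftp/include_title options, re-enable chapter title comparison log, join paragraphs with blank lines)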

2025-02-11 13:28:12 +06:00
parent 28342e0ace
commit befdc9c945
2068 changed files with 102392 additions and 908 deletions
--- a/custom_parser.py
+++ b/custom_parser.py
@@ -183,26 +183,26 @@ class EpubChapterParser:
            #     start_idx = config["idx"]
            #     last_num = max(num for _, num in files_with_nums)
            #     existing_nums = {num for _, num in files_with_nums}
-                
+
            #     missing_nums = []
            #     for expected_num in range(start_idx, last_num + 1):
            #         if expected_num not in existing_nums:
            #             prev_file = next(((f, n) for f, n in files_with_nums if n < expected_num), None)
            #             next_file = next(((f, n) for f, n in files_with_nums if n > expected_num), None)
-                        
+
            #             missing_nums.append({
            #                 'missing_num': expected_num,
            #                 'prev_file': prev_file[0] if prev_file else None,
            #                 'next_file': next_file[0] if next_file else None
            #             })
-                
+
            #     if missing_nums:
            #         logger.warning(f"Found {len(missing_nums)} gaps in file sequence:")
            #         for gap in missing_nums:
            #             logger.warning(
            #                 f"Missing number {gap['missing_num']} "
            #                 f"(between files: {gap['prev_file']} and {gap['next_file']})"
-                        # )
+            # )

            return [fname for fname, _ in files_with_nums]

@@ -225,6 +225,9 @@ class EpubChapterParser:

            paragraphs = []
            for p in soup.find_all("p"):
+                for br in p.find_all("br"):
+                    br.replace_with("\n")
+
                # sup (footnotes) and a elements
                for element in p.find_all(["sup", "a"]):
                    element.decompose()
@@ -270,11 +273,26 @@ class ChapterParser:
        """Parse EPUB content into chapters."""
        epub_dir = self.base_dir / novel_dir / config["path"]
        valid_files = self.epub_parser.get_valid_files(epub_dir, config)
-
        chapters = []
+
        for i, filename in enumerate(valid_files, start=1):
-            if chapter := self.epub_parser.parse_chapter_file(epub_dir / filename, i):
-                chapters.append(chapter)
+            if not (
+                chapter := self.epub_parser.parse_chapter_file(epub_dir / filename, i)
+            ):
+                continue
+
+            if lshift := config.get("lshiftp", 0):
+                chapter.content = chapter.content[lshift:]
+                if not chapter.content:
+                    continue
+
+            if config.get("include_title"):
+                if not chapter.content or not (
+                    re.match(config.get("title_pattern", ""), chapter.content[0])
+                ):
+                    chapter.content = [chapter.title] + chapter.content
+
+            chapters.append(chapter)

        return chapters

@@ -289,15 +307,14 @@ def print_chapter_titles(
    logger.info(f"zh: {len(zh_chapters)}")
    logger.info(f"en: {len(en_chapters)}")

+    logger.info("\n=== Chapter Title Comparison ===")
+    logger.info(f"{'English':<50} | {'Chinese'}")
+    logger.info("-" * 80)

-    # logger.info("\n=== Chapter Title Comparison ===")
-    # logger.info(f"{'English':<50} | {'Chinese'}")
-    # logger.info("-" * 80)
-
-    # for i in range(max_chapters):
-    #     en_title = en_chapters[i].title if i < len(en_chapters) else "N/A"
-    #     zh_title = zh_chapters[i].title if i < len(zh_chapters) else "N/A"
-    #     logger.info(f"{en_title:<50} | {zh_title}")
+    for i in range(max_chapters):
+        en_title = en_chapters[i].title if i < len(en_chapters) else "N/A"
+        zh_title = zh_chapters[i].title if i < len(zh_chapters) else "N/A"
+        logger.info(f"{en_title:<50} | {zh_title}")


 def create_db_entries(
@@ -325,8 +342,8 @@ def create_db_entries(
                (
                    novel_dir,
                    str(zh_chap.number),
-                    "\n".join(zh_chap.content),
-                    "\n".join(en_chap.content),
+                    "\n\n".join(zh_chap.content),
+                    "\n\n".join(en_chap.content),
                ),
            )

@@ -350,7 +367,7 @@ def main():
    epub_dirs = [
        d for d in os.listdir("custom") if os.path.isdir(os.path.join("custom", d))
    ]
-    # epub_dirs = ["ast"]
+    # epub_dirs = ["warlock"]

    for novel_dir in epub_dirs:
        logger.info(f"\n=== Analyzing {novel_dir} ===")