chore: readme a bit

This commit is contained in:
2025-02-09 04:04:55 +06:00
parent d060cdba14
commit 94babaa7aa
4 changed files with 193 additions and 2 deletions

View File

@@ -12,7 +12,10 @@ def clean_text(text: str) -> str:
def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
"""extract paragraphs from zh HTML"""
"""
Most Chinese raws separate paragraphs with two <br> tags rather than
wrapping them in <p> elements.
"""
if h1_tag := soup.find("h1"):
h1_tag.decompose()