diff --git a/.gitignore b/.gitignore
index 38f7f4b..9249410 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 epubs/
 *.db
+*.db.bak
 train.en
 train.zh
diff --git a/README.md b/README.md
index eb88220..84aaa61 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,37 @@
 # chinese -> english finetuning datasets
 
 train.en and train.zh are from [here](https://www.dropbox.com/scl/fo/dtrf3pe1vfbo5nse16648/ANLqlv3ascANpkdnYF_w4Jk/V1/TRAIN?dl=0&rlkey=486vbn17qra1ez91btj0n4xu2&subfolder_nav_tracking=1)
-TODO: mirror
+The [actual dataset and .sqlite file](https://mega.nz/folder/byoFHRST#Mcn6-mU5spHxPg0nMlRS3w) are mirrored here.
+
+It's missing the epubs dir I used for paragraph rebuilding... I accidentally deleted the dir, sorry :c
+What I did was Google a sentence from chapter 1 of a novel, scrape 50-60 chapters from either Webnovel or some aggregator, then unzip the epub into a directory named after the `book_id`.
+
+GuoFeng dataset chapter spread:
+
+```sql
+select book_id, count(*) as chapter_count
+from chapters
+group by book_id
+order by chapter_count desc;
+```
+
+| book_id        | chapter_count |
+| -------------- | ------------- |
+| 45-jsys        | 2262          |
+| 93-yzsslfmmd   | 1733          |
+| 2-xzltq        | 1718          |
+| 19-ysmmjwn     | 1546          |
+| 52-mfwz        | 1254          |
+| 86-wzxajddyx   | 1188          |
+| 34-xwdrcsh     | 1172          |
+| 25-dgfsngm     | 942           |
+| 53-gmzz        | 798           |
+| 6-yh1frhjqjysy | 763           |
+| 141-fyyysndy   | 745           |
+| 37-scrj        | 539           |
+| 95-cjjyyhy     | 516           |
+| 99-jjl         | 220           |
+
+There are 6 more books with 100 chapters each, 22 with 60, and the rest have 50 or less; the full spread is in `docs/guofeng_chapter_spread.csv`.
+
+However, I didn't import many epubs: there are 153 books in the dataset in total, and the most important part of the [GuoFeng-Webnovel](https://github.com/longyuewangdcu/GuoFeng-Webnovel) dataset is the Chinese raws and the more or less _decent_ mapping between paragraphs (there are some mistakes, which sucks). I used 19 epubs and not many of the paragraphs actually matched.
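For context on where `docs/guofeng_chapter_spread.csv` below comes from: it looks like just the README query dumped to CSV. A minimal sketch, assuming the sqlite file from the mega folder is saved locally as `guofeng.db` (the real filename isn't stated in the diff); the `chapters` table and `book_id` column are from the query above:

```python
import csv
import sqlite3

# guofeng.db is a guessed filename for the sqlite dump from the mega link;
# chapters / book_id match the README's spread query
conn = sqlite3.connect("guofeng.db")
rows = conn.execute(
    "select book_id, count(*) as chapter_count "
    "from chapters group by book_id order by chapter_count desc"
).fetchall()

with open("docs/guofeng_chapter_spread.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["book_id", "chapter_count"])
    writer.writerows(rows)
```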
diff --git a/docs/guofeng_chapter_spread.csv b/docs/guofeng_chapter_spread.csv
new file mode 100644
index 0000000..da136ac
--- /dev/null
+++ b/docs/guofeng_chapter_spread.csv
@@ -0,0 +1,154 @@
+book_id,chapter_count
+45-jsys,2262
+93-yzsslfmmd,1733
+2-xzltq,1718
+19-ysmmjwn,1546
+52-mfwz,1254
+86-wzxajddyx,1188
+34-xwdrcsh,1172
+25-dgfsngm,942
+53-gmzz,798
+6-yh1frhjqjysy,763
+141-fyyysndy,745
+37-scrj,539
+95-cjjyyhy,516
+99-jjl,220
+100-jdxx,100
+149-ajnszwj9csan,100
+151-gfsy,100
+152-dwyx,100
+153-ldyb98k,100
+154-nsxhn,100
+155-dlgl,60
+157-bfrgbfn,60
+158-myqc,60
+159-mrzj,60
+160-dsgyzdn,60
+161-nssjwydgm,60
+162-wajxs,60
+163-sjsdhgwz,60
+164-gxswa,60
+165-ysxw,60
+167-wdbz,60
+169-wgdd,60
+170-fqmy,60
+171-cshtclwrm,60
+172-ymyx,60
+173-cssyxtq,60
+174-nswjbddxd,60
+175-frnmjydl,60
+176-jtxbygy,60
+177-syfjytlnl,60
+178-cszknxs,60
+179-jxdn,60
+1-jlws,50
+10-ssdyzsy,50
+101-wclxzsz,50
+104-msdhl,50
+11-tdtsg,50
+113-dzysxt,50
+114-wyzjzfs,50
+115-kjqdhgst,50
+116-yhzx,50
+121-dfwxqwdlgsmw,50
+124-jdltxs!,50
+125-nndjjgjjlzt,50
+127-dylr,50
+13-sfydxl,50
+130-zshwclqcdzzj,50
+132-gfyxjdcz!,50
+133-sjsdyx,50
+136-wyzsjlys,50
+137-zd,50
+139-cfdw,50
+140-yjqzyds,50
+145-wyzqmlz,50
+147-wnyxs,50
+16-cjsjy,50
+18-jdj,50
+20-zqdxt,50
+21-zzjh,50
+24-xtdhkjwb,50
+29-lwdnx,50
+3-srj,50
+33-yxnq,50
+35-kbds,50
+36-wdyzhgj,50
+39-dwrm,50
+4-mrly,50
+40-jbw,50
+41-sjgl,50
+42-fxsppxhn,50
+46-yghknct,50
+48-jcsw,50
+50-nswdllbw,50
+51-zlhz,50
+54-cywlznrbhd,50
+55-dxcwj,50
+56-nbdyjct,50
+59-ywxcbjn,50
+61-htzb,50
+64-sh,50
+65-ywsczwn,50
+69-txdj,50
+7-sm,50
+70-wdlyzclp,50
+71-hmjqzsxysn,50
+72-bslrnz,50
+75-sghndhm,50
+76-dyxl,50
+77-ndjdxbl,50
+78-ddz,50
+8-csjms,50
+80-ayjy,50
+87-wdyty4xs,50
+88-wyhdqql,50
+89-wddshss,50
+9-msgys,50
+91-wxdknsjx,50
+94-yssjdst,50
+102-bgwzsl,49
+103-kc1s,49
+106-wdzsmjyx,49
+109-ljbyg,49
+118-wdzhwkyxjn,49
+119-fszl,49
+122-yh1fzzsxbns,49
+17-xzfb,49
+27-gmlgdhj,49
+31-yxxgzyscks,49
+43-lnyhs,49
+67-scjh,49
+68-mcxw,49
+96-dqrsztxml,49
+98-wyctUp,49
+111-xldg,48
+12-wskj,48
+134-kcsfcyxks,48
+135-czyxsj,48
+22-ltzz,48
+28-xjnsswb,48
+30-kt,48
+47-gjjf,48
+58-xbdhkjxt,48
+73-ylwsyed,48
+74-ajnsnxb!,48
+81-tywgjqc,48
+129-gwltq,47
+148-hlwzw,47
+97-jmtj,47
+38-wbxttgl,46
+1v1h,44
+85-xsjqlwq,43
+92-nxsmmxhn,43
+146-zlgk,41
+120-yzjd,40
+117-sgjy,39
+123-whrqysj,39
+131-cyyh,39
+142-wyzwss,39
+143-qnyxsjs,39
+84-cqsyjgxgtaj,39
+138-zhyd,37
+14-ftjq,11
+126-mwbbbdd,4
diff --git a/paragraph_split_custom_zh.py b/paragraph_split_custom_zh.py
index d22d3fa..c3ca94a 100644
--- a/paragraph_split_custom_zh.py
+++ b/paragraph_split_custom_zh.py
@@ -12,7 +12,10 @@ def clean_text(text: str) -> str:
 
 
 def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
-    """extract paragraphs from zh HTML"""
+    """
+    most chinese raws are split with 2 <br> tags rather than
+    by <p> elements, so split on those instead
+    """
     if h1_tag := soup.find("h1"):
         h1_tag.decompose()
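The body of `extract_zh_paragraphs` isn't shown in this diff, so here is a minimal sketch of the double-`<br>` splitting the new docstring describes; the `BeautifulSoup`-plus-regex approach is an assumption, not necessarily what the actual function does:

```python
import re
from typing import List

from bs4 import BeautifulSoup


def extract_zh_paragraphs_sketch(html: str) -> List[str]:
    """split zh chapter HTML into paragraphs on runs of <br> tags"""
    soup = BeautifulSoup(html, "html.parser")
    # drop the chapter title, as the real function does
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()
    # turn every <br> into a newline, then treat blank lines
    # (i.e. two consecutive <br>s) as paragraph boundaries
    for br in soup.find_all("br"):
        br.replace_with("\n")
    text = soup.get_text()
    return [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]


print(extract_zh_paragraphs_sketch("<h1>第一章</h1>第一段<br/><br/>第二段"))
# ['第一段', '第二段']
```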