chore: readme a bit

2025-02-09 04:04:55 +06:00
parent d060cdba14
commit 94babaa7aa
4 changed files with 193 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 epubs/
 *.db
+*.db.bak
 train.en
 train.zh
--- a/README.md
+++ b/README.md
@@ -1,4 +1,37 @@
 # chinese -> english finetuning datasets

 train.en and train.zh are from [here](https://www.dropbox.com/scl/fo/dtrf3pe1vfbo5nse16648/ANLqlv3ascANpkdnYF_w4Jk/V1/TRAIN?dl=0&rlkey=486vbn17qra1ez91btj0n4xu2&subfolder_nav_tracking=1)  
-TODO: mirror
+The [actual dataset and .sqlite file](https://mega.nz/folder/byoFHRST#Mcn6-mU5spHxPg0nMlRS3w) are mirrored on MEGA.  
+It's missing the epubs dir I used for paragraph rebuilding... I accidentally deleted the dir, sorry :c  
+What I did was Google a sentence from chapter 1 of a novel, scrape 50-60 chapters from either Webnovel or some aggregator, and then unzip the resulting epub into a directory under `epubs/` named after its `book_id`.
+
+GuoFeng dataset chapter spread:
+
+```sql
+select book_id, count(*) as chapter_count
+from chapters
+group by book_id
+order by chapter_count desc;
+```
+
+| book_id        | chapter_count |
+| -------------- | ------------- |
+| 45-jsys        | 2262          |
+| 93-yzsslfmmd   | 1733          |
+| 2-xzltq        | 1718          |
+| 19-ysmmjwn     | 1546          |
+| 52-mfwz        | 1254          |
+| 86-wzxajddyx   | 1188          |
+| 34-xwdrcsh     | 1172          |
+| 25-dgfsngm     | 942           |
+| 53-gmzz        | 798           |
+| 6-yh1frhjqjysy | 763           |
+| 141-fyyysndy   | 745           |
+| 37-scrj        | 539           |
+| 95-cjjyyhy     | 516           |
+| 99-jjl         | 220           |
+
+There are 6 more with 100 chapters and 22 more with 60 chapters; the rest have 50 or fewer.
+
+However, I didn't import many epubs. There are 153 books in the dataset in total, and the most important part of the
+[GuoFeng-Webnovel](https://github.com/longyuewangdcu/GuoFeng-Webnovel) dataset is the Chinese raws and the more or less _decent_ mapping between paragraphs (there are some mistakes, which sucks). I used 19 epubs, and not many of the paragraphs actually matched.
--- a/docs/guofeng_chapter_spread.csv
+++ b/docs/guofeng_chapter_spread.csv
@@ -0,0 +1,154 @@
+book_id,chapter_count
+45-jsys,2262
+93-yzsslfmmd,1733
+2-xzltq,1718
+19-ysmmjwn,1546
+52-mfwz,1254
+86-wzxajddyx,1188
+34-xwdrcsh,1172
+25-dgfsngm,942
+53-gmzz,798
+6-yh1frhjqjysy,763
+141-fyyysndy,745
+37-scrj,539
+95-cjjyyhy,516
+99-jjl,220
+100-jdxx,100
+149-ajnszwj9csan,100
+151-gfsy,100
+152-dwyx,100
+153-ldyb98k,100
+154-nsxhn,100
+155-dlgl,60
+157-bfrgbfn,60
+158-myqc,60
+159-mrzj,60
+160-dsgyzdn,60
+161-nssjwydgm,60
+162-wajxs,60
+163-sjsdhgwz,60
+164-gxswa,60
+165-ysxw,60
+167-wdbz,60
+169-wgdd,60
+170-fqmy,60
+171-cshtclwrm,60
+172-ymyx,60
+173-cssyxtq,60
+174-nswjbddxd,60
+175-frnmjydl,60
+176-jtxbygy,60
+177-syfjytlnl,60
+178-cszknxs,60
+179-jxdn,60
+1-jlws,50
+10-ssdyzsy,50
+101-wclxzsz,50
+104-msdhl,50
+11-tdtsg,50
+113-dzysxt,50
+114-wyzjzfs,50
+115-kjqdhgst,50
+116-yhzx,50
+121-dfwxqwdlgsmw,50
+124-jdltxs！,50
+125-nndjjgjjlzt,50
+127-dylr,50
+13-sfydxl,50
+130-zshwclqcdzzj,50
+132-gfyxjdcz！,50
+133-sjsdyx,50
+136-wyzsjlys,50
+137-zd,50
+139-cfdw,50
+140-yjqzyds,50
+145-wyzqmlz,50
+147-wnyxs,50
+16-cjsjy,50
+18-jdj,50
+20-zqdxt,50
+21-zzjh,50
+24-xtdhkjwb,50
+29-lwdnx,50
+3-srj,50
+33-yxnq,50
+35-kbds,50
+36-wdyzhgj,50
+39-dwrm,50
+4-mrly,50
+40-jbw,50
+41-sjgl,50
+42-fxsppxhn,50
+46-yghknct,50
+48-jcsw,50
+50-nswdllbw,50
+51-zlhz,50
+54-cywlznrbhd,50
+55-dxcwj,50
+56-nbdyjct,50
+59-ywxcbjn,50
+61-htzb,50
+64-sh,50
+65-ywsczwn,50
+69-txdj,50
+7-sm,50
+70-wdlyzclp,50
+71-hmjqzsxysn,50
+72-bslrnz,50
+75-sghndhm,50
+76-dyxl,50
+77-ndjdxbl,50
+78-ddz,50
+8-csjms,50
+80-ayjy,50
+87-wdyty4xs,50
+88-wyhdqql,50
+89-wddshss,50
+9-msgys,50
+91-wxdknsjx,50
+94-yssjdst,50
+102-bgwzsl,49
+103-kc1s,49
+106-wdzsmjyx,49
+109-ljbyg,49
+118-wdzhwkyxjn,49
+119-fszl,49
+122-yh1fzzsxbns,49
+17-xzfb,49
+27-gmlgdhj,49
+31-yxxgzyscks,49
+43-lnyhs,49
+67-scjh,49
+68-mcxw,49
+96-dqrsztxml,49
+98-wyctUp,49
+111-xldg,48
+12-wskj,48
+134-kcsfcyxks,48
+135-czyxsj,48
+22-ltzz,48
+28-xjnsswb,48
+30-kt,48
+47-gjjf,48
+58-xbdhkjxt,48
+73-ylwsyed,48
+74-ajnsnxb！,48
+81-tywgjqc,48
+129-gwltq,47
+148-hlwzw,47
+97-jmtj,47
+38-wbxttgl,46
+1v1h,44
+85-xsjqlwq,43
+92-nxsmmxhn,43
+146-zlgk,41
+120-yzjd,40
+117-sgjy,39
+123-whrqysj,39
+131-cyyh,39
+142-wyzwss,39
+143-qnyxsjs,39
+84-cqsyjgxgtaj,39
+138-zhyd,37
+14-ftjq,11
+126-mwbbbdd,4
--- a/paragraph_split_custom_zh.py
+++ b/paragraph_split_custom_zh.py
@@ -12,7 +12,10 @@ def clean_text(text: str) -> str:


 def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
-    """extract paragraphs from zh HTML"""
+    """
+    most chinese raws are split with 2 br tags rather than
+    by <p> elements so.. yeah
+    """
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()