From ba62357f03dd10783c50a99a659a66677c103da2 Mon Sep 17 00:00:00 2001 From: kuwoyuki Date: Thu, 13 Feb 2025 18:37:06 +0600 Subject: [PATCH] chore: readme --- README.md | 15 ++++++++-- img/sequence_lengths.png | Bin 0 -> 20898 bytes sequence_len.py | 59 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 img/sequence_lengths.png create mode 100644 sequence_len.py diff --git a/README.md b/README.md index ca9f77e..22e00cd 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,20 @@ # chinese -> english finetuning datasets +## dataset_v3.0_alpaca_noinstr.json +![sequence distribution](./img/sequence_lengths.png "Sequence distribution") + +- 487M +- Dataset size: 37243 samples +- Maximum sequence length: 13760 +- Average sequence length: 3123.26 + + +## + train.en and train.zh are from [here](https://www.dropbox.com/scl/fo/dtrf3pe1vfbo5nse16648/ANLqlv3ascANpkdnYF_w4Jk/V1/TRAIN?dl=0&rlkey=486vbn17qra1ez91btj0n4xu2&subfolder_nav_tracking=1) the [actual dataset and .sqlite file](https://mega.nz/folder/byoFHRST#Mcn6-mU5spHxPg0nMlRS3w) -It's missing the epubs dir I used for paragraph rebuilding... I accidentally deleted the dir, sorry :c -What I did was Google a sentence from the chapter 1 of a novel and just scrape 50-60 chapters from either Webnovel or some aggregator, then unzip it into epub with the directory name set to `book_id`. 
+ + GuoFeng dataset chapter spread: diff --git a/img/sequence_lengths.png b/img/sequence_lengths.png new file mode 100644 index 0000000000000000000000000000000000000000..cdf4d31ae3c75bd264a57fb1eace32d16029ee1f GIT binary patch literal 20898 zcmeIacU)EJmMyx?S<4bdQJ_>n0Ra&ZB^wbWN688zd6PjhYN@57D1rn@O3o@uPE`s@ z&Pk$@vq+Ns#<#1^>ArpX^*#5GzOV1=%df1mV6)b@zA)#QV~n}(Dac7~*~qYwLZNJt zx^PyBLRr;Mp{%^Lel5N;&Q!*V{|MQgyJV+qdCku8ij5&f_KKaAxuu=CiT*DRhBme) zmQ+4&-lN<{kNje6XJ=(A#KU9p_fK$J+8FWp+FY{1RW?{%(6FUYwp}5AR>X?Om{2H+ zTcysPQn?;H(<YwP$I0xpCL(V2b3OeFZP})dWiyv-ur}`2D4_vrP5Anp?fT(UPSv zPNfEOlq!dOkniE0@(nz3RW3@A{kF2HCI{orSDUR@-=3e56mMwcGBz#9^Xu!%nzqku zTx>VXZOpU2p2_CVTeKcSp-`@?u*(b5{zh4k=f&5h8=hOqe?8ay?Hiu|_Z$CT&xce^ z(`$nso6P%$T8^#OcSke2w)QOs37B=;WWK%p`%V33_nFbQox66a9cjtZ{`iPdIYl>@ zZr{Em{-sf;Zmo67V%0u1J=(^J|5ev@Dy&VSP`FpDV5B6U9v>gK$<(eWt*n%P@#4k( z`}b=SQu5*>B)xZ)IBeZ_BK)<%M~;&xGZh-t1I$N9^mBfEeQ@8~TRq=>akAZIEY7Sk zxi6`_qM~7Ppte3ygKu_zp7ZEYg@+6RN?~GdZw)>^(jT51wEf8F*grRvGGn*5@7>kn zpC2*`H`H2CC>!sk$Lyt0=t9Q3K5~kPXvfHhshOI_HzaCAVALihzU-QBjh_VZn|!T% z#3ZVD@Vr+>D~Yb6hsu{O1q2+OPV>?UsLmDuhpmMnRXZZ<;5AZ`RNf3UqKT>k%w*lHN=e1Cv-tpQVnt-=(CavPRNc$WZ!H`*d%G z9y)YLa(SUoeCNJ>i8ehY155K`jXe1aU)6T(+7&(8mfbf$+&r|9+P*Ps?c$H|E(RgH zSkt=L{d#(OI*x7j&OLtWSG}wgY%{e7JM!~)acjp8rR3}8<>lc+Po8mUsePk&e>6F1 zH#OX>eCpJx?-LWXTLfFfn^TR1yPO)k#+p+lH*DA-H|8=k%4OvyzBu`mB~JGF@!6uB zoE()*8^d(N`I)iCE^qPV@bF(^W*j@*1UrnDXY!Zhuz5JtlY`?`6QX~=wa#qzJNCb} zX6m(v%8tAfPY!W%UNk83Qf$kz?-?3OU=T1h{_*WgNlA%KLbmVN_2s3-!RFMM(`U|* zpMLvhH8s>I-`w0RCwu<rR7o(GPvW5JQd?+e1t&NtQ zb$?`37VyQ@$SK7(cd9`vUODEGtDBpB*F!j?pQl!Iiy}_{Mg&48PeG7HOWFVqo9a_GS$_A0>$7unra0}I z`R=(aaX2Ro;_f-2Zqpi51GSRr?am1-1HI2J9U8`4Ec4^e+}%1@7x$FOWtex(nl(DF zQs{NBpW^nvC@;SFwx+gLz~VFix%207!kbUlc<^#Zr>E{Qup|3>YOtQnA5K_&PpQ9I z)2l0=^F8vTr%5OEyU>-C?B_&fYt_0VT4_0s{jp9`?(Hio{Kiv~aSro|6}wjl@Dv zVUA4jxjvnlu;TLa`o)=!6a;`c`7klls_+|W=FKnTlw$@zGJDkhaeGtSAWyHbm{q4I zHEV#)@aolAd@NosKW}=hBW2B|UHu>K?f5=EUX4?2H$Re2c7XHOdxum~ugSVi)vGg{ 
zw29oxBob><8?7@f*5ic7q1(GRAvX4K?qtnHA?vTdw!2Qqw%L?&&EdSYE%6=3DO6Nc zL=5rMb?Vvawlv!pB;xc2ixc6#ICrw6-fgBW2HQz(ac)W(f2N8Lg&jVAyul({US#}N z5{Y+ms`cm2v~#{nKWW?B-(MX;#)J6JZ&Je|I$3pQZniHhgilIeU;nK|%keF{k4jv* zaz$ERzVZG}PG27%{l#*+6`k=06pDI?M(VXaQrEB_`;s=SUw^bbNWiw;dk-YIwTE(y<*0=JSPPjoP*ki_6OF zo>e{Q-qi$(>QY!wZ{K7F>{5v8n+;aBp*#Mlv>=74i5*a(Y$u*5e zUc8x^nH7$VbaYpR97oNsl?7bEZtzb@GpY<_V`2HQfZ$3k6}BI|x^m^pGOS|jT(F>J zsIu+C4{POgs?NdUL_9!J<%2+Pg!D>Z4%HIZ*&ctYJsy$Yr)+L&D8!I{p3!lrp&UPp zF9@4xZPhmL;;j^&31&xBnjQbh+*Ty%wfhRHZ$^`(Abaa2uaz?m%|6z zveR3>*VU=)`01z4G!ZQ3g=1YGy&V%Q8WICds$broSjt~sOnGuOTK0MGk<#K~-jUDm zN@w^O>FJF&&h@p1in@5QC?iDL{`hvqX!cN;t3FF04sP9UgEGtwl_k($&wci0$0|zr zb*uZr6ki3L)cnr2)>W%ku}X!xelIo*+c=Eq$0X{kZReIwZBu!1dQIQr`)%Thk01ZU zrSYcwO`6B+XGb&+@bK`6&UQbgmfqdU$YJ31j9X_gP2|`yCHuiT>vlntnzKtwOJsj7 z+X?^v`|l)+rY71A)Hr`HK4d@K#G9m(J?-P-i0GW>ni=Y`Zhz@yn?VtWiGkw%14a9|iu+MMlhn+d5kx^Uq)vNeo^k#SW4AGkB~ zMM&tlwRVe2zPnqxMY>s|oX=b5Dbw^h@%cx#wzluAzMO7D%An3@oS1hhZW$#(9Q!@X zLJ*@rdGh4k*9Sb@E@6@4+^dQ|eNyPF2x)5=>*B*lk;Qwdf(?(P;Nh< zrEMXH;Q%BChxTzvW;d0woQX%yQ-e|451x;8o1b>>e{py6+xu+^Dsd0ym)p#eGkblg z>(x_qMbl>xiV>o?zMEZ{8EJ{Bs!~W_-uHOvgsPb3g1LZMLs|MGJk63xL36rg*kFBv z|Jb~6hXePsl$4b74;i8qBhC+W{(eg{+aW1hJ}lX3qBoAL3gWkYf7N~jwSLS;4IWRm z(;3k|L~O7j>G_y*?+_rwo4KcvI@t~;05W}|s^XR{PD{hWVs`zHkMZ#>Pi{BPb{MJY zEq~7Aob7Mgn5^T|y&(BMb766@t$luWwr8lPH7o?r#?HcW7OQ2Skf4=5$~xmXIuk(O zFp<`WUFWasEQQdns-W;#Go6~`GM1~3jH_&A^^MhX8|7Q$PgN_zSI>1UFU1^`e5C!2 zdBL#6x0+th^+kPxI_LiVr%Io3)!t=xk98RNJdj!`_SE0MF^t-$l)N|p(wer_7w5V@ zw?7z&I~r^$+#9^N#OXA4kC6RfWI(`f0n@q*6TRiiuC6(@yFM6ri#{9E>=iK*cNo*x z8FuaN?TxqYDhfJgD1BoJAGLDPyVPM#r6!N&Ovr6v7RxCZH%rN#_GwVBnu`)oKErdk zD%3^Z^XuH6#nh3IMbWJc{1;~09K#3{!hstKGzzW$b;FLgru7_HvBV6`HI(0g9a)vy zw?w^s`I7sRdb|qzJIfATV7f+McI8QINnj^M&0Obq2M!*r+*`q|-d_Dlq-gg@du|KY z3AMyauL~sa5(N0ptkI!ePulXGX?;y!m`5^Tc^2cjj@Jfv`E++9l|+?47vvNaY#cf4 ze8p07uB&dAO*Y6Ft2H;(sHghQR8=)r;S^56eJ`(tS}_Cwob8sG4BNg6*LU>7%sAcE zh&^?AZu6&SGvhSh#bbTnVLxtasE<>Y1VEmzb<5Lsiut0Rpq5mXrkJ3{m4vA2bmU5C zTef3g0#{bEehSTS-}Y>dRebq0)-rlM#)& 
zF0xpn#*y?6_u`?}cV_+sqi|Is66ou{B^3IKUENfkk6~zhA3R|~z zU)9Yun}Q7w$ps(&R`=sUh_@t@(^-O4kqAR984t;a)}d(Oq268gAkhMIH8l_uW{SdO zwrJwfPvYM{9h8=lsYBYDsK1JXSQg0V|F+Hc+_`hb9UXc+c|Tstg^7h&E)0)NDK$9F zO`4>SabP8#B0o1T4UYZ3XvUt=H(B!ehHL7d? zbmRwK?c?u5IyVJCC18qFbEvEazkPW>hr}|8BqhAeUYqS`nY%PMMB<%~j}P6?Kfgql z=jjlrt|_dP!~qeoY!{gtX;Dv5Pf>e*+`{+i(+Eak2jlh}C!{c^8-gy`j}eL#$pcXH z7ajVlE!J&gs#}@=syMdlF>)a)IH6IIk%tlNEIz+2o?#aEh(^(qH4y!r zpB$(5TrE^BjW5qV2%bGTAM5mB7Z-~Z0lNq*-AH0M8#Dggm&`z0%Xu<1W2^Ns4BTSC>!Sn6kFEjz(gPMiyi{aiZz1Wk-7sb`jMc zS&na}!y}(vD=jfjDeC^76IS6GD3S9pF5B^mCm|sgy3PZvwXuo^$=DGZjIq2|iXQGY zdbIE4*E*dN#~jNJvDzdpL1nBO`3$n6#n-Q2Wz^K3Dcer=E7GO|$g-=e%OGAQPM;79 z6D6D}0mH+?o80rAykBwl*XOyp#89zy**G{#R#3Q zFQ+qRqRH7N{g? z)@NvQ8;N3(Is3;=e98iN0al9$3TETxuBc1V>%e6zaLWLMgG^EVZ4?Dp?V?LolHgx1 z=OS^%;_Q-?M5rbCbmgx{v@;a8iTwy~NYaWqZt?kg79&Msdnbb(n6T1d5vQ6FH&np7 zE+Ybmh*h&#^Pu)MdLmEcM#^fc| zk>8F=+4bc#Mvt!}60~no3RbQr^YUNl8=WHqYZczyet01xv2rA2x6Y>!GCWTsl0eAU zzsw_8!2kP=|NZlk`;!-=uo}02>4+*y8>0mz`}+?bC?gZAA^{bC_;8qutK5Q-ot+(I zlp0DC5F`Ry_8ylpstC4DKmq?SNYIi{NcBlt>Im6|uU*J8dg@C^@UWl^tpQ~g(wJpG z6#N>08$znCC)vwkM3WKx={C{H$9$nkdj0*zhDJt1;CyR_n^Si(Gp87Zxi!Rrz4Lw< z5mBFJp~ZUWkXcpuDN=kqAE&}RmNIYNyjl0wL|GJg&3SGhM!;eC67rI9 zZ&~2RZQB}w9$BS;7^*OMetZjyL0gJD*E^a1jHhSa~?kYDem6=`%2fYMT6Jttqab5loOO=A1sgH zje4g|mt37P|5MebQv?TL8c@^>lnZv$p$NAdN=hFD-y$t5>j$o4<*HR}Vms*QB2n2( zDS@umI#Ic|a&NYREUpe#%Bj7bO(r1R)05Jt8gc$%epWWQw&#HY_R-tnfCCjN&-qPW zA_~Xh4;C>LGm2bkqB0)lpNX~rTq zY9ih9<=OVh5%h83z-b>k1Zpm=w8NsJZE+gFlBfgfUtKv9(>rY6UEiHVi*{Cq2gB)a z#)LXcgY>U&&vgL}rZ)ht&ez}H9~?{~o|w>cXYOpS#%}TR^^FAaKrWB3uB z^die)Bp#U31avNgsPkK7(C(ojPFf}IzBFru0!&-jVYrH*Rb&;DZ=Jt~w%U`2k=Xx* zVe<4hFbMJ?!j27udL>*e;B0lHo<|b?F6{$;GF~Z4nw^d9yoQDbQ8RFzFRtx?_z&oL z)Sz0BCk4yPe*Cx^;bKhxc>mt-@#AnpFwad6#Ovl}wLA4OGQB3MA4Powbh7JN8|cEY zCTmu&?*H`UpqjQf)A0je-iW0;>f7yiB>U3$NSfg)!oZL(HCk#zQeCltr+$?6AYS1Y`~Go(lI(Y|UNVoTY;W9PsaD5I+3H)}XTE>9|b zFqbI0+<$yom%F?)Zwit?$gV%UFVsa2%AA6OLmG?8za)zK@KdX%#ya%2i7&h&+QgH^ 
zHRQ&0Oe>xDlXQyVfnsTUcIeOiKrQ{Jl0)_=Xh|c+HN_<*BxAH}BGdDE?C8<1jZZP5d^t2i%wl}G%kXOD;XQE_Zvu@o*pPRRCO|;dE zjxC1O|Cpbrzdot$10kwsY;0_rmP?!Ndg`wKoNk#y;y&~rr+4PfN}y+~-SIDzB<;+a z*|LfdVLum+}XV6B`P|?exN!~z~#USa!F-@i>yPcA!RZ2Es{^qdov|v zWSoG^CG9geJ(7UvuJiuxHkw9=4Q_%WUHZa>!|jZ;7)>X%(wR3<$UYtyj{^B;XlTgJ z!69v#Gk%9}6h(631NsELeE0jm{icv<(?diW)ak+kU%q_7MXJCv3AoMK5gCfGwkWJc zUSA}W?s>ZVSkZ2I!5d$n1_s6gC)W5t}l1XpeBu7>(GB^T9M? zZlrY_H3=q6$Flo2yFU+YT#jg)=b z_cr%~dGciehgn!ul=aY$TSpVZ9Vv^~3DLHbuL|q2@V?$&X&-=`hp zH*9!P+VkD+&rSbx{MmJrNFE$wV!Ei!r|dY%MLmB`J}bUXR9IL`u){SqriFH+@T==4 zb#t9fz_W%@Y2)0q(J|3K}$7oh^$gt=~Vl@*}i*=$dLw%s)C$! z)vFpaG*z%_TR(Igb%1grJ+rc~7%wf%Zd|`!0pjjd)tPN3&|Q5ZgM;^hHRU{U;*y@* zlphg!K}*ndmGMG)*2VD)87P_vcgXDe?FN}-%mR@?pzhwDxJ{cIks?Dc-cagSQx(vc zxeMP*oqv3}OZfQmKt4&5t5{4x7!y(Y=N&uhv+T{dbh1=HP^J)_7d8kfkJf@wK=y_g zXYO`)bw$L4xog5iL0x`Q6{ma%x{fglLi#p!BqykwcHN)0V`k|`<@D~n+P=YmD)ZkD zPZAr%3v&;5cXu0)wq^>sFJv4zaKINR9ilDgv12uHaZtYj8sa{DI4h+Dew;y&I{d4x z6j1cPt(#~MnQYEFM>m6VO81+}G~%+(WEM#fNERVN_R@rQhti%1lmT~vhM1Vx#?71U z61HvJNJ2adyMRDFS8(6MzK1%oY#>{}r>g`WGu$}L30nuCJE?-o9<(1Kcaj)quU;WK zHa8(c{6RK0Z_=o3@l8!l7d0DEQ^Wv!5D9V9t|JN%n*m<5erwYW-LQkrxHZH2udVs* zkhGmdn2^2cjZ#c}0!UV!nk-~)74UUZN z9`s?TQ>mTjP9v>CT-!d%*{5@fPH9R7{Mp0H<2wOaBx}qKs*s}o-{5jEG!0S^r=g?)FUi9 zgP3aulgENJ)OTs3tDQbQ-X6$62BN@^dG!ENY;E6bVE3)_ltpWXVx`DL*}0TMKr7Ju zYEfAs_JtE+3ggL|s1RmdwB~RF2}`$QM+DeG)b|4q1>3E}h%WoTlC8H^(%3i8gtPbD zw>o<>iMF=;bEryVE2=`gv}@FHbc#)QY1opw)oH8H{Y{mpK;Ky*P2(Kg6a(UvPbZH+ z@r?YLscHiFib$b8k)V&`K(8>3uFr4{_9GwA4#Y#s%gYnW{_VALhQ95_d-m)xZAnW4 z2b+Mp(>d)F`JBW(#h1*f$1fL97)6~`TGA|Pp>x?oyNA2;S7pt`7^)#!aBz=3EieYdn~V(=!{567kmhN^Xb!tZH|BLXA|k}?UR_>xGV4P zl$4eEmiE+yIJU>FShd!;z;i{`c+uW^kmqjjP=r!BSztYs9mIWbxg~pN!ShC-UstNknH-sSGxz zgq?r1kKF(cz#ba{I7xK^9DX;Omi~QNFusdQI5BVGcU*)A=yR@1x)(24=TL|X{R0i; zk`z{jdt3UQh!6>$qMBGD@g<${v(wWPw7;64`5VE^x;-xkWnDJWa$tcDj zIM2}bN}(qTI#RpiVs5Cvh$ak;IVpQI>*7amW=B(KZ;7xEslZax2=dgvvtR}Bja6KS zkOTg}FC-)>C2vj|QmY!A6U0#%a}BwJ@XrWle!;n+av 
zFQu6Ww#@(dah*KaUj|fyN*3okmK%ak+SVoO2usx@YNXboUMFKSZOeK`{EdsS#mp^C zRPF?w3Max~W?yJ6sOluRj!Qm%l!O;c<@)susES9V2wEaq6pk|o2SpASDYSW=lfEq*WM@i1YgU5)TA z>%Dl1Usr7DhXG03D0it{5UCX)yuP#Pkv3}n=WF{%H-F5{m2`A;jL^uHSKAuF17njB zFbkL2J;-mJ+I&n%^2ZEI4*&9tgs97O-Q)dd2>1mqHA8wJ)%Q?C5~~y(#s1jKvEcAc zYa*pUsn;U{@1&Ee{4^BCBQUuEYW2VV$OHpm z8*$Rcu)&goi<6^5&G@obHdI6dG{9ExOC&LRCQz_=H+XyH%@!6G_zj9Smk026

Zr zjt}ca?u=03(Tp=49?OfYGqk+(YVUt)bfo-GLoO|TdL}3~{q}3=Txf5bEtx2Za zN3aiY`-+9K&!7f;iyP&Zx3z-5mjuO>O`F{iHDgv?ttS z-04AJ&WKD#_&Rt}ou`M1uRgQhygf&Qn8mTPi82lnI4;(43%RqxW@M{wbOL03xw99$ z-fQK@L$X1U>UwUfh{tuH03wKh0>}eAY-5^bH}DeH0TXb7q@9A8j$lOpxTmtLtd5iy zNM0l;qvPOB5lSy|&rdVkXg7L$y-aie0Bl}hcojMWc+aDPuO)yCmHGvrCD_5_5wA5f ztX>dfAE2!n%HGefuH4d0GiR3~P60at^8D0w6`}Tv{pegpv`qLxiA)DP8;Q~oEOQ~( zCmPO87TI@-XDwl^QGsH9LJU`|SV25QD8gB#kR?ck`bQ=U4DrY%N^@}02M4uKvt?1d z(vk+L*Kpi(3oML4GV<}&{w*M`$lVj5!kzkW!!NF|YvPBh@_Mbv7RwM~};Sj8I^)Ob~!`=bc zpcG6-K9NuY#$c*d1kfM}*$qTVkeFK**y67*a%uOb+r^Zsb^%VJSa1zOV-u%sQRV~p z2)O>Rg3V3aZEjK#&qK}Z*@c_SxVyVo7?Hg|bOG!c^}i}j?BDu5+t#nIB1$x}*E{3t z(*7^82|*P~ zrI|Cn(OYks+t~}BKY?NVFnI#9NF%r^1No95hM?z^B}$RlP#_9|_b|iiNzO7jlk4D` zoa(PW<3oN0^k_d0ZvAiD4~n!hN*`eIU)!m_eHep3x^iwO3&>yqebNS#nuuTt=Y(58aYZ;+5hFMU4c>Lhoewc}vzu=UMY4Kd&ub%sNR|VQP7{y#K`Eja01K)i~ z4|pM@edo@uL1*h( zT=FsE`6n0({uYzbW~tq2M6|V`p`o-RM~}XW)y%mH!efE;GJ9{4T-k%%fW5-JJk$Er z;SV`Y39-F*4)!^G;WbnLiJ6(1gdsu(J_EaB3il79^S8eUoc#L-`_%*XpSgoRAT@-x zI$6QCNHiP`AU9X&w9z8HIvWFkhy77D4i3$hg1$02lfDJ|3C~Mh3_`9M{2$$>;Cjat{h3+ zyyO{|8M3cFQQ#-QitRz!lxp7mM!&6tvl>2nGq|>8FjHj^u0wjt9vJNtb8SKZDzzJH zZ*xUq)(Zylc&R^+V7B9!gDIO_urcYtLdRV>D*Q6|#VsLYmBSqx5(K`nrPb>FmR|G% z9Y-%yR2(Rc0dyDfxud#+$LcE3r9Mh$)1`^9ON>>{9PqkyZMN2 zP7~3;%EH7ue0lWRWx~YT;;vwwP@Z0bKf;q&Cl6~M!CG@lrQ3A_S$SdOE26^7t1X9vplY8j6-JfN}$(eH%~EDI{KHEjwV zNkOr(u`k=BI6sp{GvcB~k5C!>6{D-|dJAns-RDefOS#g69YxSkuzKCP=dc+xYdphK zAxM=I=Qgat;tStyJ-K?^ys5M9h)z}=%)_amknsZ|oD)+upSn{^-e}g)Mi>8N1{~C0 zG`u-#O+zLNgq>rzLBL-5lY-`SeIg=gbzl<``kWL7KU?6T(CaT1X2zxMUi=i$2f!!b zQ-<~uFW$0?)+ak!7tmzEM|uXEv+NT{U!ax*c00G6wdCvKiV8LK1ki9f>8JpMgNYYy zYOWulr4npXDPUF~JJKuR6#J)EHBffuPoXN2G!Mg6Esd}aL&|ey+i9WirU9j!9?)DM zc6nouoW_IE9uWu4bFQUhDH=JD@chn5n<3+1EDl4F8gbbIy#x-x)?PN)?hteBOfew9 zM0PjqY`JpK3T!x&v^xnqj@F?KPq!}H>0H)q?{h+{EdJ>7_K76+Hb@_&frX{FXjkFM zZ3}y58uvk@Lc155xP1cX<mgqCOV8CCg(3X(`<*3uY&V~CxS({q1aecq1@hd+Z42x{rB$TmrWs@?X8)% zQ%K+@*nYD`0rZu#XhWDn5de>yGwIU<5|2Sop%VDMXOqi$Et?jb{eptxF)x`9U0hre 
zuoM0HJ*))vKddKc9i0g3Adg6sKf(Jji9Gqx+Vt{7-xpVbvf41>O(Hwbeza8s+m<|{ z^FWk;Hw>b<$(sV}-stY{1OeWc`-=@j5Om7&`+4jy6U;mP!i@-?PFak>tSCuHzC)3F zSPF_{AR*6)Dh+e|`!+vwq=(vd%&y_+gT(OFa#OZ#;YDd_C1>X>V#tL`;pF0C z249miRDvEnJvL@<;Td!*5SL53A?8j25_jbCOq*_hW%9o%=kg+FZ)GDDo|LTKiJ_bJ(_oYlAkF{V*MfBU%KHAT>#Pmt<6iGTcHN7sfpBq+PfZD6W_l6kkD`X;#lfUH zMiuGRrs#>IWA`$Ws{z#i{v5gB6JP2kM_L-|5;bZQHQ$w(w`Nq6o@7jmk;s^}E+I9g z3u_c#OS+{XFnXu8u7CezR$+vM0TCao9V_M`?_L-zU(u#6t8}n%0QNhA4kSo2gKnM~R1AWIbf%U*&j+Bh(8cl&V2X$*gOk;k zc0=5eDq*B6(g&?T01{^|SxW>cLCf5!XD!#UnVfn9jbtFlksp8r5Ja%HOPEEDltBxy zEQDxseh2U(hLh0CwP?tME4@{pBuIfqZ3Cc|OWpGhtfggT{IEju1C{cFSct}wT|?_8 z^~=pnb|&+Tk9rks6KX!zoOJy%9}!?X^i7@9+F10{cDQ15pzpv9^7 z*sMxuPlW3*1ycufD@z>;f}<5%m-M0&yEtCzVkP|51#gHkMuww?4d6W+c94zPt~E#!4GdT5p1=(1@!vxNx!*Zl2 zh=mw+i5CX=*U!(-zC9~{am13eV?x#^7I_$S6VOL3i#KxLS%uR3_{F04xO7^APWyl=-rrJghld-7V%6s%l!}A5>2<4sKBv%@HAs|l!05I?(pOGQ{>8<`S<|1*>N9PP z&}DYs=dJsqGjToPIH(}nlV)l1Dw_%;0Aa-~j6y1?H%QZoJzOB7nAM6Ou)9uvI_PbT zIGhFlZXJ?JX*)B8az6y6Yte3lT}LimhV*R=0;3G0HHV&VLehqp2TTMIxH^J`>`O*$ z0X}$_=Nj|T&PIYR2_DogFqX&2a!o8@sRS`5y98SQBB(*$q5w=Nt!x3JO%K8oJ_ifH zVBm!}Sv-4k6^#@lUX=m7?^lDU9XoU8OgTWr#@5CM&Hgtp9Ak#h<%R*dr)_VUwbnH1 zszAhf0ArkDUY|1Iv79b1%a$~}pMnI*qTLe&ki!v{ZVyCXhz7e8V;{ZtbWOX3wzVMK zhRlEl(Xi92_NE z2jwbhwUzQov+PL55z9;xJaFKv&s@_R!v{7tHg@2_vn-4d7%LoD;t*2FJ7*@u;7A)7 z81RXN1(0<7APDkU?WIrU@&5|?fb@gZr5eenPl+xsOw2Yk!km{LIx*)LiJU(e{hVr( zAU?BrJ2-a~1vV_Z9U6iN3SDg06iRR*UKQoo?i53IfapXSAL$GNBM^bNc?_ZWBzj!- zE)n^5fg1V`OV%b+)&oJDW| z&Khp&=!C}a6Ypedd#X-Zp7wo!H!wegsA$#vxplZ*L}4llAw^T#aO(_Yfs*3l%wf1T zE_0Gl>*GE@B8b!JQA=KD1jkb>-X-)o!#VZZo8k#b3n-@TNAFew`ffz9u|gUS3jMb61iMJX>(|?z#A0qBi@SPEJ{OR zvVZjUq7wMX+B3KaaXG-ity_n8WW3r#SfoYk!RtVJq@xRDh&%y#u* zzwVo>_el6b+YsJAbB3rFM90M zeE4v}+qCr4rz)VGRwl)DlmfpJ!;0MXAb;uNVQVnOdvInS33Yv)ge!_PI-&d-7*IpL zP@>7KYEFpec4*Iz*Uq#dE?J~tH;qc4b(BtNni)uK77vilR?wEF5gfJrP7cBFcEB_rXkM@cGa=oAh%j?80$&B|1Dm)_tlsYGOHu?1B GoBt0PpvyV{ literal 0 HcmV?d00001 diff --git a/sequence_len.py b/sequence_len.py new file mode 100644 index 0000000..ad4733c 
# sequence_len.py -- new file added by this patch (--- /dev/null, +++ b/sequence_len.py)
import json

from torchtune.data import Message
from torchtune.models.qwen2 import qwen2_tokenizer
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

from prompts.translation import TranslateTemplate


def analyze_sequence_lengths(
    vocab_path,
    merges_path,
    json_path,
    plot_path="sequence_lengths.png",
):
    """Report token-length statistics for a JSON dataset and plot a histogram.

    Each sample is turned into a user message (``sample["input"]``) and an
    assistant message (``sample["output"]``), passed through
    ``TranslateTemplate`` and tokenized with the Qwen2 tokenizer. The
    sequence length of a sample is the number of tokens returned by
    ``tokenizer.tokenize_messages``.

    Args:
        vocab_path: Path to the tokenizer vocabulary file, forwarded to
            ``qwen2_tokenizer``.
        merges_path: Path to the tokenizer merges file, forwarded to
            ``qwen2_tokenizer``.
        json_path: Path to a UTF-8 JSON file holding a sequence of samples,
            each with ``"input"`` and ``"output"`` keys.
        plot_path: Where to save the histogram image. Defaults to
            ``"sequence_lengths.png"`` (the original hard-coded location).
            Pass ``None`` to skip plotting (matplotlib is then not imported).

    Returns:
        A ``(max_len, lengths)`` tuple: the longest sequence length
        (``0`` for an empty dataset) and the list of per-sample lengths in
        dataset order.
    """
    # Load Qwen2 tokenizer
    tokenizer = qwen2_tokenizer(vocab_path, merges_path)
    translate_template = TranslateTemplate()

    with open(json_path, "r", encoding="utf-8") as f:
        dataset = json.load(f)

    lengths = []
    for sample in tqdm(dataset):
        # Convert sample to messages
        msgs = [
            Message(role="user", content=sample["input"]),
            Message(role="assistant", content=sample["output"]),
        ]
        templated_msgs = translate_template(msgs)

        # Tokenize messages; the second element of the returned pair is
        # not needed for length statistics.
        tokens, _ = tokenizer.tokenize_messages(templated_msgs)
        lengths.append(len(tokens))

    # Guard the empty-dataset case: the original divided by len(lengths)
    # unconditionally and raised ZeroDivisionError.
    max_len = max(lengths, default=0)
    avg_len = sum(lengths) / len(lengths) if lengths else 0.0

    print(f"\nDataset size: {len(dataset)} samples")
    print(f"Maximum sequence length: {max_len}")
    print(f"Average sequence length: {avg_len:.2f}")

    # Optional: Plot distribution
    if plot_path is not None:
        # Imported lazily so the statistics can be computed without
        # matplotlib when no plot is requested.
        import matplotlib.pyplot as plt

        plt.figure(figsize=(10, 6))
        plt.hist(lengths, bins=50)
        plt.title("Distribution of Sequence Lengths")
        plt.xlabel("Sequence Length")
        plt.ylabel("Count")
        plt.savefig(plot_path)  # or .jpg
        plt.close()

    return max_len, lengths


# Example usage
# NOTE(review): this runs whenever the module is executed or imported, and
# the paths are machine-specific; consider a `__main__` guard if this file
# is ever imported from elsewhere.
vocab_path = "/home/mira/models/Qwen2.5-7B-Base/vocab.json"
merges_path = "/home/mira/models/Qwen2.5-7B-Base/merges.txt"
dataset = "/home/mira/models/datasets/GuoFeng/datasets/dataset_v3.0_alpaca_noinstr.json"

max_len, lengths = analyze_sequence_lengths(vocab_path, merges_path, dataset)