diff --git a/README.md b/README.md
index ca9f77e..22e00cd 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,20 @@
 # chinese -> english finetuning datasets
+## dataset_v3.0_alpaca_noinstr.json
+![sequence distribution](./img/sequence_lengths.png "Sequence distribution")
+
+- File size: 487M
+- Dataset size: 37243 samples
+- Maximum sequence length: 13760 tokens
+- Average sequence length: 3123.26 tokens
+
+
+##
+
 
 train.en and train.zh are from [here](https://www.dropbox.com/scl/fo/dtrf3pe1vfbo5nse16648/ANLqlv3ascANpkdnYF_w4Jk/V1/TRAIN?dl=0&rlkey=486vbn17qra1ez91btj0n4xu2&subfolder_nav_tracking=1)
 
 the [actual dataset and .sqlite file](https://mega.nz/folder/byoFHRST#Mcn6-mU5spHxPg0nMlRS3w)
-It's missing the epubs dir I used for paragraph rebuilding... I accidentally deleted the dir, sorry :c
-What I did was Google a sentence from the chapter 1 of a novel and just scrape 50-60 chapters from either Webnovel or some aggregator, then unzip it into epub with the directory name set to `book_id`.
+
+
 
 GuoFeng dataset chapter spread:
diff --git a/img/sequence_lengths.png b/img/sequence_lengths.png
new file mode 100644
index 0000000..cdf4d31
Binary files /dev/null and b/img/sequence_lengths.png differ
diff --git a/sequence_len.py b/sequence_len.py
new file mode 100644
index 0000000..ad4733c
--- /dev/null
+++ b/sequence_len.py
@@ -0,0 +1,59 @@
+from torchtune.data import Message
+from torchtune.models.qwen2 import qwen2_tokenizer
+from prompts.translation import TranslateTemplate
+from tqdm import tqdm
+from tqdm.contrib.concurrent import process_map
+import json
+
+
+def analyze_sequence_lengths(vocab_path, merges_path, json_path):
+    # Load Qwen2 tokenizer
+    tokenizer = qwen2_tokenizer(vocab_path, merges_path)
+    translate_template = TranslateTemplate()
+
+    with open(json_path, "r", encoding="utf-8") as f:
+        dataset = json.load(f)
+
+    max_len = 0
+    lengths = []
+
+    for sample in tqdm(dataset):
+        # Convert sample to messages
+        msgs = [
+            Message(role="user", content=sample["input"]),
+            Message(role="assistant", content=sample["output"]),
+        ]
+
+        templated_msgs = translate_template(msgs)
+
+        # Tokenize messages
+        tokens, mask = tokenizer.tokenize_messages(templated_msgs)
+        seq_len = len(tokens)
+        lengths.append(seq_len)
+        max_len = max(max_len, seq_len)
+
+    avg_len = sum(lengths) / len(lengths)
+    print(f"\nDataset size: {len(dataset)} samples")
+    print(f"Maximum sequence length: {max_len}")
+    print(f"Average sequence length: {avg_len:.2f}")
+
+    # Optional: Plot distribution
+    import matplotlib.pyplot as plt
+
+    plt.figure(figsize=(10, 6))
+    plt.hist(lengths, bins=50)
+    plt.title("Distribution of Sequence Lengths")
+    plt.xlabel("Sequence Length")
+    plt.ylabel("Count")
+    plt.savefig("sequence_lengths.png")  # or .jpg
+    plt.close()
+
+    return max_len, lengths
+
+
+# Example usage
+vocab_path = "/home/mira/models/Qwen2.5-7B-Base/vocab.json"
+merges_path = "/home/mira/models/Qwen2.5-7B-Base/merges.txt"
+dataset_path = "/home/mira/models/datasets/GuoFeng/datasets/dataset_v3.0_alpaca_noinstr.json"
+
+max_len, lengths = analyze_sequence_lengths(vocab_path, merges_path, dataset_path)
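
Note on the dataset layout: `sequence_len.py` reads each record's `input` and `output` fields, so `dataset_v3.0_alpaca_noinstr.json` is assumed to be a flat JSON list of alpaca-style records with no `instruction` field, with `input` presumably holding the Chinese source and `output` the English translation (per the repo title). A minimal sketch for sanity-checking that layout before running the script; the relative path is illustrative:

```python
# Minimal sketch: verify the assumed alpaca-style (no-instruction) layout of
# dataset_v3.0_alpaca_noinstr.json before tokenizing it with sequence_len.py.
# The path below is illustrative; point it at your local copy.
import json

with open("datasets/dataset_v3.0_alpaca_noinstr.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"{len(dataset)} samples")      # README reports 37243
sample = dataset[0]
print(sorted(sample.keys()))          # expected: ['input', 'output']
print(sample["input"][:80])           # Chinese source text (assumed)
print(sample["output"][:80])          # English translation (assumed)
```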