feat: json export

Delete stats.py
Add files via upload
2025-02-13 17:26:51 +06:00 · 2022-07-11 10:25:02 +08:00 · 2022-07-11 09:32:20 +08:00 · 2022-07-11 09:31:51 +08:00 · 2022-07-11 09:26:45 +08:00 · 2022-07-11 09:24:09 +08:00
8 changed files with 45678 additions and 246 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
 {
    "python.languageServer": "None"
 }
--- a/README.md
+++ b/README.md
@@ -23,10 +23,11 @@ Please check  [Dev-metadata](./dev/meta_data.tsv) and [Test-metadata](./test/met
 ##### Table 2. Statistics of MAC
-| Data     | # Src_Sents | # Tgt_Sents | # Src_Tokens | # Tgt_Tokens |
+| Data     | # Src_Sents | # Tgt_Sents | # Src_Tokens | # Tgt_Tokens | # 1-1 (%)    |
-| -------- |:----------- | ----------- | ------------ | ------------ |
+| -------- |:----------- | ----------- | ------------ | ------------ | ------------ |
-| MAC-Dev  | 1,391       | 2,505       | 37,024       | 47,959       |
+| MAC-Dev  | 1,444       | 1,947       | 21,911       | 31,374       | 817 (61.5)   |
-| MAC-Test | 4,875       | 6,610       | 91,971       | 121,306      |
+| MAC-Test | 4,799       | 6,573       | 73,635       | 105,407      | 2,628 (59.8) |
 | **Total**    | 6,243       | 8,520       | 95,546       | 136,781      | 3,445 (60.2) |
 ## Manual Alignment and Inter-Coder Agreement
@@ -48,7 +49,7 @@ The observed differences are then resolved through discussions between the annot
 All the cases of annotator differences and the corresponding resolutions have been recorded in an Excel file [anno_disagreement.xlsx](./test/anno_disagreement.xlsx). The final alignments verified by both annotators are saved in the directory [dev/Intertext](./dev/Intertext) and [test/Intertext](./test/Intertext).
-We use the set-based metric Jaccard Index as suggested by Artstein & Poesio (2008) to measure the Inter-Coder Agreement (ICA):
+We use the script [compute_ica.py](./compute_ica.py), which implements the set-based metric Jaccard Index as suggested by Artstein & Poesio (2008), to measure the Inter-Coder Agreement (ICA):
 ```bash
 python compute_ica.py -a1 test/intertext_01 -a2 test/intertext_02
--- a/intertext2json.py
+++ b/intertext2json.py
@@ -0,0 +1,115 @@
 from ast import Tuple
 import os
 import json
 import argparse
 from typing import Any
 from xml.etree.ElementTree import parse
 def mkdir(file_path: str):
    """Ensure the parent directory of *file_path* exists.

    Args:
        file_path: Path to a file whose containing directory should be
            created. A bare file name (no directory component) is a no-op.

    Raises:
        FileExistsError: If the parent path exists but is not a directory.
    """
    directory = os.path.dirname(file_path)
    if directory:
        # exist_ok=True avoids the check-then-create race of testing
        # os.path.exists() before calling makedirs().
        os.makedirs(directory, exist_ok=True)
 def main():
    """Command-line entry point.

    Reads every alignment file in the input directory whose name contains
    both "_zh." and "_en.xml", loads the two documents named by its
    ``fromDoc`` / ``toDoc`` root attributes, builds the aligned pairs and
    writes all of them to a single JSON output file.
    """
    cli = argparse.ArgumentParser(description="Convert Intertext to Alpaca JSON")
    cli.add_argument(
        "-i",
        "--input",
        type=str,
        required=True,
        help="Input directory for Intertext alignments.",
    )
    cli.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=True,
        help="Output file for JSON files.",
    )
    opts = cli.parse_args()
    src_dir = opts.input
    target = opts.output_file

    # Create the output directory up front, before any input is read.
    mkdir(target)

    names = sorted(
        name for name in os.listdir(src_dir) if "_zh." in name and "_en.xml" in name
    )

    collected = []
    for name in names:
        print(f"Processing {name}...")
        link_path = os.path.join(src_dir, name)
        link_root = parse(link_path).getroot()

        zh_name = link_root.get("fromDoc")
        en_name = link_root.get("toDoc")
        if not (zh_name and en_name):
            # Without both document references the file cannot be resolved.
            print(f"Warning: Missing fromDoc or toDoc in {name}")
            continue

        zh_sentences = get_sents(os.path.join(src_dir, zh_name))
        en_sentences = get_sents(os.path.join(src_dir, en_name))
        beads = get_alignments(link_path)
        collected += create_pairs(zh_sentences, en_sentences, beads)

    write_json(collected, target)
    print(f"Created {target} with {len(collected)} pairs")
 def create_pairs(zh_sents, en_sents, alignments):
    """Turn alignment beads into a list of ``{"input", "output"}`` dicts.

    Args:
        zh_sents: Sentences of the source document.
        en_sents: Sentences of the target document.
        alignments: Iterable of ``(zh_indices, en_indices)`` tuples.

    Returns:
        One dict per alignment for which both sides resolve to a non-empty
        string; alignments with an empty side are dropped.
    """
    resolved = (
        (find_sent_by_id(zh_idx, zh_sents), find_sent_by_id(en_idx, en_sents))
        for zh_idx, en_idx in alignments
    )
    return [
        {"input": zh_text, "output": en_text}
        for zh_text, en_text in resolved
        if zh_text and en_text
    ]
 def write_json(pairs, out_file):
    """Serialize *pairs* to *out_file* as UTF-8 JSON.

    Non-ASCII characters are written verbatim (not ``\\u``-escaped) and the
    output is indented by two spaces.
    """
    with open(out_file, mode="w", encoding="utf-8") as handle:
        json.dump(pairs, fp=handle, indent=2, ensure_ascii=False)
 def find_sent_by_id(idx, sents):
    """Return the text covered by the index list *idx* within *sents*.

    The sentences from ``idx[0]`` through ``idx[-1]`` (inclusive) are joined
    with single spaces and the result is stripped. An empty *idx* yields an
    empty string.
    """
    if len(idx) == 0:
        return ""
    first, last = idx[0], idx[-1]
    # Only the endpoints matter: the whole contiguous span is taken.
    span = sents[first : last + 1]
    joined = " ".join(span)
    return joined.strip()
 def get_alignments(file: str):
    """Read the alignment links from the XML file at *file*.

    Every ``link`` element directly under the root that has a non-empty
    ``xtargets`` attribute is split on ";" into two halves. The first half
    is treated as the English side and the second as the Chinese side.

    Returns:
        A list of ``(zh_indices, en_indices)`` tuples, in document order.
    """
    beads = []
    for node in parse(file).iterfind("link"):
        targets = node.get("xtargets")
        if not targets:
            continue
        en_part, zh_part = targets.split(";")
        zh_ids = parse_link(zh_part)
        en_ids = parse_link(en_part)
        beads.append((zh_ids, en_ids))
    return beads
 def parse_link(link) -> list[int]:
    """Convert one side of an ``xtargets`` value into zero-based indices.

    *link* is a space-separated list of colon-separated ids; the second
    colon-separated field of each id is read as an integer and decremented
    by one. An empty *link* gives an empty list.
    """
    indices = []
    if len(link) == 0:
        return indices
    for token in link.split(" "):
        number = token.split(":")[1]
        indices.append(int(number) - 1)
    return indices
 def get_sents(file: str) -> list[str]:
    """Return the text of every ``p/s`` element in the XML file at *file*.

    Args:
        file: Path to an XML document whose root contains ``p`` elements
            with ``s`` children.

    Returns:
        The sentence texts in document order. An ``s`` element with no text
        contributes an empty string, so positions stay aligned with the
        element order and every item is a ``str``.
    """
    doc = parse(file)
    # Element.text is None for an empty element; normalise it to "" so that
    # callers joining the sentences never hit a TypeError.
    return [sent.text or "" for sent in doc.iterfind("p/s")]
 if __name__ == "__main__":
    main()
--- a/json/dev_aligned_pairs.json
+++ b/json/dev_aligned_pairs.json
--- a/json/merged_aligned_pairs.json
+++ b/json/merged_aligned_pairs.json
--- a/json/test_aligned_pairs.json
+++ b/json/test_aligned_pairs.json
--- a/merge_json.sh
+++ b/merge_json.sh
@@ -0,0 +1,23 @@
 #!/bin/sh
 # Concatenate two JSON arrays into one output file using jq.
 # Usage: merge_json.sh input1.json input2.json output.json
 if [ "$#" -ne 3 ]; then
    echo "Usage: $0 input1.json input2.json output.json" >&2
    exit 1
 fi
 i1="$1"
 i2="$2"
 out="$3"
 echo "input 1 len: $(jq 'length' "$i1")"
 echo "input 2 len: $(jq 'length' "$i2")"
 # Write to a temporary file first: redirecting straight to "$out" would
 # truncate it before jq reads the inputs, destroying the data when the
 # output path is the same file as one of the inputs.
 tmp="$out.tmp.$$"
 if jq -s '.[0] + .[1]' "$i1" "$i2" >"$tmp" && mv "$tmp" "$out"; then
    echo "merged files into $out"
    echo "output len: $(jq 'length' "$out")"
 else
    # Do not leave a partial temporary file behind on failure.
    rm -f "$tmp"
    echo "error merging files" >&2
    exit 1
 fi
Author	SHA1	Message	Date
kuwoyuki	0cbbf66733	feat: json export	2025-02-13 17:26:51 +06:00
bfsujason	64858bb9b6	Delete stats.py	2022-07-11 10:25:02 +08:00
bfsujason	5480fc31dd	Add files via upload	2022-07-11 09:32:20 +08:00
bfsujason	7bf23b5537	Delete dev-anno.002_zh.002_en.tsv	2022-07-11 09:31:51 +08:00
bfsujason	aea80bb055	Update stats.py	2022-07-11 09:26:45 +08:00
bfsujason	8e797120f4	Update stats.py	2022-07-11 09:24:09 +08:00
bfsujason	8cfbfe54cb	Add files via upload	2022-07-11 09:23:21 +08:00
bfsujason	553791ba36	Update README.md	2022-07-11 09:17:52 +08:00
bfsujason	5e3eaee5ba	Update README.md	2022-07-11 09:12:16 +08:00
nlpfun	54e23edc6f	Update README.md	2022-06-20 10:22:58 +08:00