Compare commits
10 Commits
ad3f2f5f96
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
0cbbf66733
|
|||
|
|
64858bb9b6 | ||
|
|
5480fc31dd | ||
|
|
7bf23b5537 | ||
|
|
aea80bb055 | ||
|
|
8e797120f4 | ||
|
|
8cfbfe54cb | ||
|
|
553791ba36 | ||
|
|
5e3eaee5ba | ||
|
|
54e23edc6f |
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"python.languageServer": "None"
|
||||||
|
}
|
||||||
11
README.md
11
README.md
@@ -23,10 +23,11 @@ Please check [Dev-metadata](./dev/meta_data.tsv) and [Test-metadata](./test/met
|
|||||||
|
|
||||||
##### Table 2. Statistics of MAC
|
##### Table 2. Statistics of MAC
|
||||||
|
|
||||||
| Data | # Src_Sents | # Tgt_Sents | # Src_Tokens | # Tgt_Tokens |
|
| Data | # Src_Sents | # Tgt_Sents | # Src_Tokens | # Tgt_Tokens | # 1-1 (%) |
|
||||||
| -------- |:----------- | ----------- | ------------ | ------------ |
|
| -------- |:----------- | ----------- | ------------ | ------------ | ------------ |
|
||||||
| MAC-Dev | 1,391 | 2,505 | 37,024 | 47,959 |
|
| MAC-Dev | 1,444 | 1,947 | 21,911 | 31,374 | 817 (61.5) |
|
||||||
| MAC-Test | 4,875 | 6,610 | 91,971 | 121,306 |
|
| MAC-Test | 4,799 | 6,573 | 73,635 | 105,407 | 2,628 (59.8) |
|
||||||
|
| **Total** | 6,243 | 8,520 | 95,546 | 136,781 | 3,445 (60.2) |
|
||||||
|
|
||||||
## Manual Alignment and Inter-Coder Agreement
|
## Manual Alignment and Inter-Coder Agreement
|
||||||
|
|
||||||
@@ -48,7 +49,7 @@ The observed differences are then resolved through discussions between the annot
|
|||||||
|
|
||||||
All the cases of annotator differences and the corresponding resolutions have been recorded in an Excel file [anno_disagreement.xlsx](./test/anno_disagreement.xlsx). The final alignments verified by both annotators are saved in the directory [dev/Intertext](./dev/Intertext) and [test/Intertext](./test/Intertext).
|
All the cases of annotator differences and the corresponding resolutions have been recorded in an Excel file [anno_disagreement.xlsx](./test/anno_disagreement.xlsx). The final alignments verified by both annotators are saved in the directory [dev/Intertext](./dev/Intertext) and [test/Intertext](./test/Intertext).
|
||||||
|
|
||||||
We use the set-based metric Jaccard Index as suggested by Artstein & Poesio (2008) to measure the Inter-Coder Agreement (ICA):
|
We use the script [compute_ica.py](./compute_ica.py), which implements the set-based metric Jaccard Index as suggested by Artstein & Poesio (2008), to measure the Inter-Coder Agreement (ICA):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python compute_ica.py -a1 test/intertext_01 -a2 test/intertext_02
|
python compute_ica.py -a1 test/intertext_01 -a2 test/intertext_02
|
||||||
|
|||||||
115
intertext2json.py
Normal file
115
intertext2json.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
from ast import Tuple
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from typing import Any
|
||||||
|
from xml.etree.ElementTree import parse
|
||||||
|
|
||||||
|
|
||||||
|
def mkdir(file_path: str) -> None:
    """Ensure the parent directory of *file_path* exists.

    Args:
        file_path: Path to a file whose containing directory should exist.
            A bare file name (no directory component) is a no-op.
    """
    directory = os.path.dirname(file_path)
    if directory:
        # exist_ok=True avoids the check-then-create race of testing
        # os.path.exists() before calling makedirs().
        os.makedirs(directory, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Convert a directory of Intertext alignments into a single JSON file.

    Command-line options:
        -i / --input        directory that holds the alignment files and the
                            documents they reference.
        -o / --output_file  path of the JSON file to write.

    Every file in the input directory whose name contains both ``_zh.`` and
    ``_en.xml`` is treated as an alignment file. Its root ``fromDoc`` and
    ``toDoc`` attributes name the Chinese and English documents, which are
    looked up in the same input directory.
    """
    arg_parser = argparse.ArgumentParser(
        description="Convert Intertext to Alpaca JSON"
    )
    arg_parser.add_argument(
        "-i",
        "--input",
        type=str,
        required=True,
        help="Input directory for Intertext alignments.",
    )
    arg_parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=True,
        help="Output file for JSON files.",
    )
    args = arg_parser.parse_args()

    # Make sure the output location exists before doing any work.
    mkdir(args.output_file)

    candidates = (
        name
        for name in os.listdir(args.input)
        if "_zh." in name and "_en.xml" in name
    )

    collected = []
    for name in sorted(candidates):
        print(f"Processing {name}...")
        link_path = os.path.join(args.input, name)

        link_root = parse(link_path).getroot()
        source_doc = link_root.get("fromDoc")
        target_doc = link_root.get("toDoc")
        if not (source_doc and target_doc):
            print(f"Warning: Missing fromDoc or toDoc in {name}")
            continue

        zh_sents = get_sents(os.path.join(args.input, source_doc))
        en_sents = get_sents(os.path.join(args.input, target_doc))
        collected += create_pairs(zh_sents, en_sents, get_alignments(link_path))

    write_json(collected, args.output_file)
    print(f"Created {args.output_file} with {len(collected)} pairs")
|
||||||
|
|
||||||
|
|
||||||
|
def create_pairs(zh_sents, en_sents, alignments):
    """Turn alignment beads into ``{"input": ..., "output": ...}`` records.

    Args:
        zh_sents: Sentence texts of the Chinese document.
        en_sents: Sentence texts of the English document.
        alignments: Iterable of ``(zh_indices, en_indices)`` beads.

    Returns:
        A list of dicts with the Chinese text under ``"input"`` and the
        English text under ``"output"``. Beads whose text is empty on either
        side are left out.
    """
    records = []
    for zh_bead, en_bead in alignments:
        source_text = find_sent_by_id(zh_bead, zh_sents)
        target_text = find_sent_by_id(en_bead, en_sents)
        # A record needs text on both sides.
        if not source_text or not target_text:
            continue
        records.append({"input": source_text, "output": target_text})
    return records
|
||||||
|
|
||||||
|
|
||||||
|
def write_json(pairs, out_file):
    """Write *pairs* to *out_file* as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(out_file, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(pairs, ensure_ascii=False, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
def find_sent_by_id(idx, sents):
    """Return the joined text of the sentences covered by one side of a bead.

    Args:
        idx: Zero-based sentence indices. Only the first and last entries are
            used; every sentence in that inclusive span is taken.
        sents: Sentence texts. Entries may be ``None`` or empty.

    Returns:
        The span's sentences joined with single spaces and stripped, or ``""``
        when *idx* is empty or no sentence in the span has text.
    """
    if not idx:
        return ""
    span = sents[idx[0] : idx[-1] + 1]
    # Entries without text (None or "") are skipped: None would make
    # str.join raise TypeError, and "" would leave doubled spaces.
    return " ".join(text for text in span if text).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_alignments(file: str):
    """Read the alignment beads from an alignment XML file.

    Each ``link`` element found directly under the root contributes one bead,
    taken from its ``xtargets`` attribute. The attribute is split on ``;``:
    the part before it is parsed as the English side and the part after it as
    the Chinese side. Links without an ``xtargets`` value are ignored.

    Returns:
        A list of ``(zh_indices, en_indices)`` tuples.
    """
    beads = []
    for link_elem in parse(file).iterfind("link"):
        targets = link_elem.get("xtargets")
        if not targets:
            continue
        en_part, zh_part = targets.split(";")
        zh_ids = parse_link(zh_part)
        en_ids = parse_link(en_part)
        beads.append((zh_ids, en_ids))
    return beads
|
||||||
|
|
||||||
|
|
||||||
|
def parse_link(link) -> list[int]:
    """Parse one side of an ``xtargets`` value into zero-based indices.

    Args:
        link: Whitespace-separated items of the form ``<prefix>:<number>``,
            e.g. ``"1:3 1:4"``. May be empty.

    Returns:
        The number after the colon of each item, minus one, in order.
        An empty or whitespace-only *link* yields ``[]``.
    """
    # split() with no argument tolerates leading, trailing and repeated
    # whitespace, which split(" ") would turn into empty items.
    return [int(item.split(":")[1]) - 1 for item in link.split()]
|
||||||
|
|
||||||
|
|
||||||
|
def get_sents(file: str) -> list[str]:
    """Read the sentence texts from a document XML file.

    Sentences are the ``s`` elements inside the ``p`` elements directly under
    the root, in document order.

    Args:
        file: Path to the XML document.

    Returns:
        One string per ``s`` element. An element with no text contributes
        ``""`` so that list positions stay aligned with the element order.
    """
    # Element.text is None for an empty element; normalise it to "" so the
    # result really is a list of strings.
    return [sent.text or "" for sent in parse(file).iterfind("p/s")]
|
||||||
|
|
||||||
|
|
||||||
|
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
5266
json/dev_aligned_pairs.json
Normal file
5266
json/dev_aligned_pairs.json
Normal file
File diff suppressed because it is too large
Load Diff
22642
json/merged_aligned_pairs.json
Normal file
22642
json/merged_aligned_pairs.json
Normal file
File diff suppressed because it is too large
Load Diff
17382
json/test_aligned_pairs.json
Normal file
17382
json/test_aligned_pairs.json
Normal file
File diff suppressed because it is too large
Load Diff
23
merge_json.sh
Executable file
23
merge_json.sh
Executable file
@@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/sh
# Merge two JSON files into one with jq's `+` operator and report the
# `length` of each input and of the result.
#
# Usage: merge_json.sh input1.json input2.json output.json

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 input1.json input2.json output.json" >&2
    exit 1
fi

i1="$1"
i2="$2"
out="$3"

# Fail early with a clear message instead of a cascade of jq errors.
if ! command -v jq >/dev/null 2>&1; then
    echo "error: jq is required but was not found in PATH" >&2
    exit 1
fi

for f in "$i1" "$i2"; do
    if [ ! -r "$f" ]; then
        echo "error: cannot read input file: $f" >&2
        exit 1
    fi
done

echo "input 1 len: $(jq 'length' "$i1")"
echo "input 2 len: $(jq 'length' "$i2")"

# Write to a temporary file first: redirecting straight into "$out" would
# truncate it before jq reads, destroying the data when the output path is
# also one of the inputs, and would leave a partial file behind on failure.
tmp="$out.tmp.$$"
trap 'rm -f "$tmp"' EXIT

if jq -s '.[0] + .[1]' "$i1" "$i2" >"$tmp" && mv "$tmp" "$out"; then
    echo "merged files into $out"
    echo "output len: $(jq 'length' "$out")"
else
    echo "error merging files" >&2
    exit 1
fi
|
||||||
Reference in New Issue
Block a user