Compare commits

...

10 Commits

Author SHA1 Message Date
0cbbf66733 feat: json export 2025-02-13 17:26:51 +06:00
bfsujason
64858bb9b6 Delete stats.py 2022-07-11 10:25:02 +08:00
bfsujason
5480fc31dd Add files via upload 2022-07-11 09:32:20 +08:00
bfsujason
7bf23b5537 Delete dev-anno.002_zh.002_en.tsv 2022-07-11 09:31:51 +08:00
bfsujason
aea80bb055 Update stats.py 2022-07-11 09:26:45 +08:00
bfsujason
8e797120f4 Update stats.py 2022-07-11 09:24:09 +08:00
bfsujason
8cfbfe54cb Add files via upload 2022-07-11 09:23:21 +08:00
bfsujason
553791ba36 Update README.md 2022-07-11 09:17:52 +08:00
bfsujason
5e3eaee5ba Update README.md 2022-07-11 09:12:16 +08:00
nlpfun
54e23edc6f Update README.md 2022-06-20 10:22:58 +08:00
8 changed files with 45678 additions and 246 deletions

3
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,3 @@
{
"python.languageServer": "None"
}

View File

@@ -1,6 +1,6 @@
# MAC
MAC is a manually aligned Chinese-English parallel corpus of literary texts, consisting of chapters sampled from six Chinese novels and their English translations.
MAC is a manually aligned Chinese-English parallel corpus of literary texts, consisting of chapters sampled from six Chinese novels and their English translations.
Although MAC was initially created for evaluating the performance of automatic sentence aligners such as [Gale-Church](https://aclanthology.org/J93-1004/), [Hunalign](http://mokk.bme.hu/en/resources/hunalign/), [Bleualign](https://github.com/rsennrich/Bleualign), [Vecalign](https://github.com/thompsonb/vecalign) and [Bertalign](https://github.com/bfsujason/bertalign), the corpus can also be used in the study of contrastive linguistics, the difference between translated and non-translated language, translation strategies, etc.
@@ -23,10 +23,11 @@ Please check [Dev-metadata](./dev/meta_data.tsv) and [Test-metadata](./test/met
##### Table 2. Statistics of MAC
| Data | # Src_Sents | # Tgt_Sents | # Src_Tokens | # Tgt_Tokens |
| -------- |:----------- | ----------- | ------------ | ------------ |
| MAC-Dev | 1,391 | 2,505 | 37,024 | 47,959 |
| MAC-Test | 4,875 | 6,610 | 91,971 | 121,306 |
| Data | # Src_Sents | # Tgt_Sents | # Src_Tokens | # Tgt_Tokens | # 1-1 (%) |
| -------- |:----------- | ----------- | ------------ | ------------ | ------------ |
| MAC-Dev | 1,444 | 1,947 | 21,911 | 31,374 | 817 (61.5) |
| MAC-Test | 4,799 | 6,573 | 73,635 | 105,407 | 2,628 (59.8) |
| **Total** | 6,243 | 8,520 | 95,546 | 136,781 | 3,445 (60.2) |
## Manual Alignment and Inter-Coder Agreement
@@ -48,7 +49,7 @@ The observed differences are then resolved through discussions between the annot
All the cases of annotator differences and the corresponding resolutions have been recorded in an Excel file [anno_disagreement.xlsx](./test/anno_disagreement.xlsx). The final alignments verified by both annotators are saved in the directory [dev/Intertext](./dev/Intertext) and [test/Intertext](./test/Intertext).
We use the set-based metric Jaccard Index as suggested by Artstein & Poesio (2008) to measure the Inter-Coder Agreement (ICA):
We use the script [compute_ica.py](./compute_ica.py), which implements the set-based metric Jaccard Index as suggested by Artstein & Poesio (2008), to measure the Inter-Coder Agreement (ICA):
```bash
python compute_ica.py -a1 test/intertext_01 -a2 test/intertext_02

115
intertext2json.py Normal file
View File

@@ -0,0 +1,115 @@
from ast import Tuple
import os
import json
import argparse
from typing import Any
from xml.etree.ElementTree import parse
def mkdir(file_path: str) -> None:
    """Ensure the parent directory of *file_path* exists.

    Args:
        file_path: Path of a file that is about to be written. Only its
            directory component is created; the file itself is not touched.

    A bare file name (no directory component) is a no-op.
    """
    directory = os.path.dirname(file_path)
    if directory:
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() first and makes repeated calls harmless.
        os.makedirs(directory, exist_ok=True)
def main():
    """Convert every Intertext alignment file in a directory to one JSON file.

    Command-line options:
        -i / --input        directory holding the alignment XML files and the
                            documents they reference.
        -o / --output_file  path of the JSON file to write.

    Alignment files are those whose name contains both "_zh." and "_en.xml".
    Each one names its two documents through the root attributes ``fromDoc``
    and ``toDoc``; files lacking either attribute are reported and skipped.
    """
    cli = argparse.ArgumentParser(description="Convert Intertext to Alpaca JSON")
    cli.add_argument(
        "-i",
        "--input",
        type=str,
        required=True,
        help="Input directory for Intertext alignments.",
    )
    cli.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=True,
        help="Output file for JSON files.",
    )
    args = cli.parse_args()
    src_dir = args.input
    destination = args.output_file

    # Create the output directory up front, before any file is processed.
    mkdir(destination)

    collected = []
    for name in sorted(os.listdir(src_dir)):
        if "_zh." not in name or "_en.xml" not in name:
            continue

        print(f"Processing {name}...")
        align_path = os.path.join(src_dir, name)

        header = parse(align_path).getroot()
        zh_name = header.get("fromDoc")
        en_name = header.get("toDoc")
        if not (zh_name and en_name):
            print(f"Warning: Missing fromDoc or toDoc in {name}")
            continue

        zh_sents = get_sents(os.path.join(src_dir, zh_name))
        en_sents = get_sents(os.path.join(src_dir, en_name))
        beads = get_alignments(align_path)
        collected.extend(create_pairs(zh_sents, en_sents, beads))

    write_json(collected, destination)
    print(f"Created {destination} with {len(collected)} pairs")
def create_pairs(zh_sents, en_sents, alignments):
    """Build input/output records from aligned sentence indices.

    Args:
        zh_sents: Chinese sentences, indexed by position.
        en_sents: English sentences, indexed by position.
        alignments: iterable of ``(zh_indices, en_indices)`` pairs.

    Returns:
        A list of ``{"input": zh_text, "output": en_text}`` dicts. An
        alignment is dropped when either side resolves to an empty string.
    """
    resolved = (
        (find_sent_by_id(zh_idx, zh_sents), find_sent_by_id(en_idx, en_sents))
        for zh_idx, en_idx in alignments
    )
    return [
        {"input": zh_text, "output": en_text}
        for zh_text, en_text in resolved
        if zh_text and en_text  # both sentences should exist
    ]
def write_json(pairs, out_file):
    """Serialize *pairs* to *out_file* as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(out_file, mode="w", encoding="utf-8") as handle:
        json.dump(pairs, handle, **dump_options)
def find_sent_by_id(idx, sents):
    """Return the text covered by the index list *idx* within *sents*.

    The sentences from ``idx[0]`` through ``idx[-1]`` (inclusive) are joined
    with single spaces and the result is stripped. An empty *idx* yields an
    empty string.
    """
    if len(idx) == 0:
        return ""
    first, last = idx[0], idx[-1]
    joined = " ".join(sents[first : last + 1])
    return joined.strip()
def get_alignments(file: str):
    """Read the alignment links from an Intertext alignment file.

    Each ``<link>`` element directly under the root carries an ``xtargets``
    attribute made of two ";"-separated id lists; the first is taken as the
    English side and the second as the Chinese side. Links without a
    non-empty ``xtargets`` are ignored.

    Returns:
        A list of ``(zh_bead, en_bead)`` tuples, each bead being the index
        list produced by ``parse_link``.
    """
    beads = []
    for node in parse(file).iterfind("link"):
        xtargets = node.get("xtargets")
        if not xtargets:
            continue
        en_part, zh_part = xtargets.split(";")
        beads.append((parse_link(zh_part), parse_link(en_part)))
    return beads
def parse_link(link: str) -> list[int]:
    """Convert one side of an ``xtargets`` value into zero-based indices.

    Args:
        link: whitespace-separated ids of the form ``"<prefix>:<number>"``,
            e.g. ``"1:3 1:4"``. May be empty.

    Returns:
        The number after the colon of every id, minus one. An empty or
        whitespace-only *link* gives an empty list.
    """
    # split() with no argument drops leading/trailing/repeated whitespace;
    # split(" ") would yield empty items and fail on item.split(":")[1].
    return [int(item.split(":")[1]) - 1 for item in link.split()]
def get_sents(file: str) -> list[str]:
    """Return the text of every ``<s>`` element found under ``<p>`` in *file*.

    Args:
        file: path to an XML document whose root contains ``<p>`` elements
            that in turn contain ``<s>`` elements.

    Returns:
        One string per ``<s>`` element, in document order.
    """
    doc = parse(file)
    # Element.text is None for an empty <s/>; map it to "" so the result is
    # really a list of str and positions still line up with sentence order.
    return [sent.text or "" for sent in doc.iterfind("p/s")]
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

5266
json/dev_aligned_pairs.json Normal file

File diff suppressed because it is too large Load Diff

22642
json/merged_aligned_pairs.json Normal file

File diff suppressed because it is too large Load Diff

17382
json/test_aligned_pairs.json Normal file

File diff suppressed because it is too large Load Diff

23
merge_json.sh Executable file
View File

@@ -0,0 +1,23 @@
#!/bin/sh
# Concatenate two JSON files with jq's `+` operator and write the result.
#
# Usage: merge_json.sh input1.json input2.json output.json
#
# Prints the jq `length` of each input and of the merged output.
# Exits with status 1 on wrong usage or when the merge fails.

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 input1.json input2.json output.json" >&2
    exit 1
fi

i1="$1"
i2="$2"
out="$3"

echo "input 1 len: $(jq 'length' "$i1")"
echo "input 2 len: $(jq 'length' "$i2")"

# Write to a temporary file first: redirecting straight to "$out" would
# truncate it before jq runs, destroying an input that is also the output
# and leaving a partial file behind when jq fails.
tmp="$out.tmp.$$"

if jq -s '.[0] + .[1]' "$i1" "$i2" >"$tmp" && mv "$tmp" "$out"; then
    echo "merged files into $out"
    echo "output len: $(jq 'length' "$out")"
else
    rm -f "$tmp"
    echo "error merging files" >&2
    exit 1
fi