Update utils

This commit is contained in:
nlpfun
2021-11-30 23:59:44 +08:00
parent 9556ab29b8
commit d440977dcb
9 changed files with 2667 additions and 0 deletions

63
utils/sent_splitter.py Normal file
View File

@@ -0,0 +1,63 @@
# 2021/11/30
# bfsujason@163.com
"""
Usage:
python utils/sent_splitter.py \
    -i utils/zh_raw \
    -o utils/zh \
    -l zh
"""
import os
import re
import shutil
import argparse
import pysbd
def main():
    """Command-line entry point: sentence-split every file in a directory.

    Parses ``-i/--input``, ``-o/--output`` and ``-l/--language``, recreates
    the output directory with ``make_dir`` (which deletes any existing
    contents), then, for each regular file in the input directory, writes its
    sentences one per line to a same-named file in the output directory.

    Exits through ``parser.error`` (status 2) when the input directory does
    not exist, or when the output directory is the input directory or one of
    its parents -- wiping the output would then destroy the inputs.
    """
    parser = argparse.ArgumentParser(description='Split multilingual sentences using pySBD')
    parser.add_argument('-i', '--input', type=str, required=True, help='Directory for raw files.')
    parser.add_argument('-o', '--output', type=str, required=True, help='Directory for split files.')
    parser.add_argument('-l', '--language', type=str, required=True, help='ISO 639-1 language code.')
    args = parser.parse_args()

    # Validate everything before make_dir(), which removes the output tree.
    if not os.path.isdir(args.input):
        parser.error('Input directory does not exist: {}'.format(args.input))
    in_dir = os.path.realpath(args.input)
    out_dir = os.path.realpath(args.output)
    # os.path.join(out_dir, '') appends a trailing separator so that
    # '/data/zh' is not mistaken for a parent of '/data/zh_raw'.
    if in_dir == out_dir or in_dir.startswith(os.path.join(out_dir, '')):
        parser.error('Output directory must not be, or contain, the input directory.')

    # Build the segmenter first so that a failure here (e.g. a language code
    # pySBD rejects) happens before the old output is deleted.
    splitter = pysbd.Segmenter(language=args.language, clean=False)
    make_dir(args.output)

    # Sorted for a reproducible processing order; os.listdir order is arbitrary.
    for name in sorted(os.listdir(args.input)):
        src = os.path.join(args.input, name)
        if not os.path.isfile(src):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        print("Splitting file {} ...".format(name))
        sents = split_sents(src, splitter)
        write_sents(os.path.join(args.output, name), sents)
def write_sents(fp, sents):
    """Write *sents* to the file at *fp* as UTF-8 text, one sentence per line.

    The file is created or truncated; every sentence is followed by a newline.
    """
    with open(fp, 'wt', encoding='utf-8') as out:
        out.writelines(sent + '\n' for sent in sents)
def split_sents(fp, splitter):
    """Return all sentences of the file at *fp* as one flat list.

    The file is read into paragraphs with ``get_paras``; each paragraph is
    passed to ``splitter.segment`` and the resulting sentences are
    concatenated in paragraph order.
    """
    return [
        sentence
        for paragraph in get_paras(fp)
        for sentence in splitter.segment(paragraph)
    ]
def get_paras(fp):
    """Read a UTF-8 text file and return its non-empty lines as paragraphs.

    Each line is stripped of leading/trailing whitespace; lines that are
    empty after stripping are dropped, and every run of whitespace inside a
    kept line is collapsed to a single space.

    Args:
        fp: Path of the text file to read.

    Returns:
        A list of normalized paragraph strings, in file order.
    """
    paras = []
    # 'utf-8-sig' decodes plain UTF-8 identically but drops a leading BOM,
    # which would otherwise stay glued to the first paragraph.
    with open(fp, 'rt', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if line:
                # Raw string: '\s' in a normal literal is an invalid escape.
                paras.append(re.sub(r'\s+', ' ', line))
    return paras
def make_dir(path):
    """Make *path* a fresh, empty directory.

    A directory already present at *path* is removed together with all of
    its contents and then created again; missing parents are created too.
    """
    stale = os.path.isdir(path)
    if stale:
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()