Update utils
This commit is contained in:
63
utils/sent_splitter.py
Normal file
63
utils/sent_splitter.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# 2021/11/30
|
||||
# bfsujason@163.com
|
||||
|
||||
"""
|
||||
Usage:
|
||||
|
||||
python utils/sent_splitter.py \
|
||||
-i utils/zh_raw
|
||||
-o utils/zh
|
||||
-l zh
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import argparse
|
||||
import pysbd
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Split multilingual sentences using pySBD')
|
||||
parser.add_argument('-i', '--input', type=str, required=True, help='Directory for raw files.')
|
||||
parser.add_argument('-o', '--output', type=str, required=True, help='Directory for split files.')
|
||||
parser.add_argument('-l', '--language', type=str, required=True, help='ISO 639-1 language code.')
|
||||
args = parser.parse_args()
|
||||
|
||||
make_dir(args.output)
|
||||
splitter = pysbd.Segmenter(language=args.language, clean=False)
|
||||
for file in os.listdir(args.input):
|
||||
print("Splitting file {} ...".format(file))
|
||||
sents = split_sents(os.path.join(args.input, file), splitter)
|
||||
write_sents(os.path.join(args.output, file), sents)
|
||||
|
||||
def write_sents(fp, sents):
|
||||
with open(fp, 'wt', encoding='utf-8') as f:
|
||||
for sent in sents:
|
||||
f.write(sent + '\n')
|
||||
|
||||
def split_sents(fp, splitter):
|
||||
paras = get_paras(fp)
|
||||
sents_in_para = []
|
||||
for para in paras:
|
||||
cur_sents = splitter.segment(para)
|
||||
sents_in_para.append(cur_sents)
|
||||
sents = [j for sub in sents_in_para for j in sub]
|
||||
return sents
|
||||
|
||||
def get_paras(fp):
|
||||
paras = []
|
||||
with open(fp, 'rt', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line:
|
||||
line = re.sub('\s+', ' ', line)
|
||||
paras.append(line)
|
||||
return paras
|
||||
|
||||
def make_dir(path):
|
||||
if os.path.isdir(path):
|
||||
shutil.rmtree(path)
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user