Files
bertalign/utils/embed.py
2021-05-18 00:03:45 +08:00

33 lines
1005 B
Python

import os
import argparse
import time
import numpy as np
from sentence_transformers import SentenceTransformer
def _main():
    """Command-line entry point: embed the sentences of a text file.

    Parses ``-i/--input`` and ``-o/--output`` from the command line, loads
    the ``LaBSE`` SentenceTransformer model and hands both paths to
    ``embed_texts``.

    Raises:
        SystemExit: raised by argparse (exit status 2) when a required
            option is missing or the command line is otherwise invalid.
    """
    parser = argparse.ArgumentParser(
        # The first positional parameter of ArgumentParser is ``prog``, so
        # the summary must be passed by keyword to appear as the description
        # instead of replacing the program name in the usage line.
        description='Convert sentences into vectors.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Both paths are mandatory. Without ``required=True`` a missing option
    # is only noticed after the model has been loaded, when ``open(None)``
    # fails with an unhelpful TypeError.
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='input file.')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='output file.')
    args = parser.parse_args()
    model = SentenceTransformer('LaBSE')
    embed_texts(model, args.input, args.output)
def get_sents(TEXT):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        TEXT: path of the file to read.

    Returns:
        One string per line of the file, in file order, with leading and
        trailing whitespace (including the newline) removed. Blank lines are
        kept as empty strings; an empty file yields an empty list.
    """
    with open(TEXT, 'r', encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]
def embed_texts(model, fin, fout):
    """Encode every line of ``fin`` with ``model`` and dump the result to ``fout``.

    Args:
        model: object whose ``encode`` method accepts a list of strings; the
            value it returns must provide a ``tofile`` method (presumably a
            NumPy array -- confirm against the model in use).
        fin: path of the input text file, read through ``get_sents``.
        fout: destination passed unchanged to ``tofile``.

    Prints a progress message before starting and the elapsed wall-clock
    time in seconds once the output has been written.
    """
    started_at = time.time()
    print("Embedding text {} ...".format(fin))
    # Read, encode and write in one pipeline; nothing else needs the
    # intermediate sentence list or the embedding object.
    model.encode(get_sents(fin)).tofile(fout)
    elapsed = time.time() - started_at
    print("It takes {} seconds".format(elapsed))
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    _main()