33 lines
1005 B
Python
33 lines
1005 B
Python
import os
|
|
import argparse
|
|
import time
|
|
import numpy as np
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
def _main():
    """Parse the command line, load the LaBSE model and embed the input file.

    The ``-i/--input`` and ``-o/--output`` paths are read from ``sys.argv``
    and passed, together with a ``SentenceTransformer('LaBSE')`` instance,
    to ``embed_texts``.
    """
    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description, so the summary text has to be passed by keyword; otherwise
    # it replaces the program name in the usage line.
    parser = argparse.ArgumentParser(
        description='Convert sentences into vectors.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Both paths are mandatory. Without ``required=True`` a missing option is
    # only noticed after the model has been loaded, when ``open(None)`` fails.
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='input file.')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='output file.')
    args = parser.parse_args()

    model = SentenceTransformer('LaBSE')
    embed_texts(model, args.input, args.output)
|
|
|
|
def get_sents(TEXT):
    """Return the lines of the UTF-8 text file ``TEXT`` as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from
    every line. Blank lines are kept as empty strings, so the result has one
    entry per line of the file.
    """
    with open(TEXT, 'r', encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]
|
|
|
|
def embed_texts(model, fin, fout):
    """Encode every line of ``fin`` with ``model`` and dump the result to ``fout``.

    Args:
        model: object exposing ``encode(list_of_str)``; the caller in this
            file passes a ``SentenceTransformer``.
        fin: path of the text file read through ``get_sents``.
        fout: destination handed to the ``tofile`` method of the encoded result.

    Prints a progress message before encoding and the elapsed wall-clock
    time afterwards.
    """
    started = time.time()
    print("Embedding text {} ...".format(fin))

    sentences = get_sents(fin)
    vectors = model.encode(sentences)
    # NOTE(review): ``vectors`` is presumably a NumPy array; ``ndarray.tofile``
    # writes raw values without shape or dtype, so readers must know both.
    vectors.tofile(fout)

    print("It takes {} seconds".format(time.time() - started))
|
|
|
|
# Run the conversion only when this file is executed as a script, so that
# importing the module does not parse arguments or load the model.
if __name__ == '__main__':
    _main()
|