first commit
This commit is contained in:
32
utils/embed.py
Normal file
32
utils/embed.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import os
|
||||
import argparse
|
||||
import time
|
||||
import numpy as np
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
def _main():
|
||||
parser = argparse.ArgumentParser('Convert sentences into vectors.',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('-i', '--input', type=str, help='input file.')
|
||||
parser.add_argument('-o', '--output', type=str, help='output file.')
|
||||
args = parser.parse_args()
|
||||
model = SentenceTransformer('LaBSE')
|
||||
embed_texts(model, args.input, args.output)
|
||||
|
||||
def get_sents(TEXT):
|
||||
sents = []
|
||||
with open(TEXT, 'r', encoding="utf-8") as f:
|
||||
for line in f:
|
||||
sents.append(line.strip())
|
||||
return sents
|
||||
|
||||
def embed_texts(model, fin, fout):
|
||||
t_0 = time.time()
|
||||
print("Embedding text {} ...".format(fin))
|
||||
txt = get_sents(fin)
|
||||
embed = model.encode(txt)
|
||||
embed.tofile(fout)
|
||||
print("It takes {} seconds".format(time.time() - t_0))
|
||||
|
||||
if __name__ == '__main__':
|
||||
_main()
|
||||
Reference in New Issue
Block a user