word2vec-uk / pipeline.py
Dmitry Chaplinsky
Nope
b30bec4
raw
history blame
720 Bytes
from gensim.models import KeyedVectors
from typing import List, Dict
class PreTrainedPipeline:
    """Nearest-neighbour lookup over the ``lang-uk/word2vec-uk`` word vectors.

    The vector file is fetched from the Hugging Face Hub when the pipeline is
    constructed. Calling the instance with a word returns the keys the model
    ranks as most similar to it, packed into a single ``generated_text``
    string.
    """

    # Where the vectors live on the Hugging Face Hub.
    REPO_ID = "lang-uk/word2vec-uk"
    FILENAME = "ubercorpus.cased.tokenized.300d"
    # How many neighbours are requested per query.
    TOP_N = 30
    # Text placed between consecutive neighbours in the output string.
    SEPARATOR = ", \n\n"

    def __init__(self, path: str = "") -> None:
        """Download the vector file and load it into ``self.model``.

        Args:
            path (:obj:`str`):
                Accepted for interface compatibility but not used here; the
                vectors are always resolved via ``REPO_ID`` / ``FILENAME``.
        """
        # Imported lazily so the dependency is only needed when a pipeline
        # is actually instantiated.
        from huggingface_hub import hf_hub_download

        vectors_file = hf_hub_download(
            repo_id=self.REPO_ID,
            filename=self.FILENAME,
        )
        # binary=False: the file is read as the text word2vec format.
        self.model = KeyedVectors.load_word2vec_format(vectors_file, binary=False)

    def __call__(self, inputs: str) -> List[Dict]:
        """Return the keys most similar to the given word.

        Args:
            inputs (:obj:`str`):
                The query word; surrounding whitespace is stripped before
                the lookup.
        Return:
            A :obj:`list` holding exactly one :obj:`dict` whose
            ``"generated_text"`` value is the ``TOP_N`` neighbour keys joined
            with ``SEPARATOR``. Similarity scores are discarded.

        Any error raised by ``self.model.most_similar`` (presumably a
        ``KeyError`` for an out-of-vocabulary word -- confirm against the
        gensim version in use) is propagated unchanged.
        """
        word = inputs.strip()
        neighbours = self.model.most_similar(word, topn=self.TOP_N)
        # Each entry is a (key, similarity) pair; only the key is reported.
        generated = self.SEPARATOR.join(str(key) for key, _similarity in neighbours)
        return [{"generated_text": generated}]