|
|
|
import os |
|
from tqdm import tqdm |
|
from langchain.vectorstores import FAISS |
|
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, CohereEmbeddings |
|
from retry import retry |
|
from key import CoherenceKey, OpenaiKey |
|
|
|
|
|
# Root directory under which FAISS indexes are saved.
OUTPUT_DIR = './output/'
|
|
|
|
|
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
    """Add a single document to ``store``, retrying on failure.

    The ``retry`` decorator is configured with ``tries=10`` and ``delay=60``.

    Args:
        store: Object exposing ``add_texts(texts, metadatas=...)``.
        i: Document whose ``page_content`` and ``metadata`` attributes are
            passed to ``store.add_texts`` as one-element lists.
    """
    store.add_texts([i.page_content], metadatas=[i.metadata])
|
|
|
|
|
def doc2vec(docs, model, folder_name=None):
    """Embed ``docs`` into a FAISS index and save it to disk.

    The index is seeded with the first document; the remaining documents are
    then added one at a time via ``store_add_texts_with_retry``. If adding a
    document raises, the error is printed, embedding stops, and whatever has
    been indexed so far is still saved.

    Args:
        docs: Non-empty sequence of documents. It is sliced, not modified.
        model: Embedding backend: ``'openai'``, ``'mpnet'`` or ``'cohere'``.
        folder_name: Optional sub-folder of ``OUTPUT_DIR`` to save into; when
            falsy the index is saved directly in ``OUTPUT_DIR``.

    Raises:
        ValueError: If ``docs`` is empty or ``model`` is not supported.
    """
    # Validate up front so bad input fails before any embedding work starts.
    if not docs:
        raise ValueError('Cannot build a FAISS index from an empty document list')

    # Lambdas keep construction lazy: only the selected backend is built.
    embedding_factories = {
        'openai': lambda: OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY')),
        'mpnet': lambda: HuggingFaceEmbeddings(),
        'cohere': lambda: CohereEmbeddings(cohere_api_key=CoherenceKey),
    }
    make_embeddings = embedding_factories.get(model)
    if make_embeddings is None:
        raise ValueError(f'Embedding Model {model} not supported')

    out_dir = os.path.join(OUTPUT_DIR, folder_name) if folder_name else OUTPUT_DIR

    print(f'Building faiss Index from {len(docs)} docs')
    # Slice instead of pop(0) so the caller's sequence is left untouched.
    seed_docs = [docs[0]]
    rest = docs[1:]

    print(f'Dumping FAISS to {out_dir}')
    db = FAISS.from_documents(seed_docs, make_embeddings())

    progress = tqdm(rest, desc="Embedding 🦖", unit="docs", total=len(rest),
                    bar_format='{l_bar}{bar}| Time Left: {remaining}')
    for added, doc in enumerate(progress):
        try:
            store_add_texts_with_retry(db, doc)
        except Exception as e:
            # Deliberate best-effort: report the failure and stop, keeping
            # the partial index (saved once, below).
            print(e)
            print("Error on ", doc)
            print("Saving progress")
            print(f"stopped at {added} out of {len(rest)}")
            break

    db.save_local(out_dir)