import logging
import os
from pathlib import Path

import lancedb
import openai
import tiktoken
from sentence_transformers import CrossEncoder, SentenceTransformer

# Cross-encoder used to rerank retrieved documents against the query
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', max_length=512)


def rerank_documents(query, documents):
    """Score each (query, document) pair with the cross-encoder and return documents sorted by relevance."""
    scores = cross_encoder.predict([(query, d) for d in documents])
    return [pair[1] for pair in sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)]


EMB_MODEL_NAME = ""
DB_TABLE_NAME = ""

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Enable multiple retrievers
retrievers = {}


def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))


def trim(text, length=8190):
    """Collapse whitespace and drop trailing words until the text fits within the token limit."""
    text = ' '.join(text.split()).replace('<|endoftext|>', '')
    while num_tokens_from_string(text) > length:
        text = ' '.join(text.split()[:-10])
    return text


def openai_embedding(text, key=None):
    """Embed the trimmed text with OpenAI's text-embedding-ada-002 model."""
    client = openai.OpenAI(api_key=key)
    trimmed = trim(text)
    rs = client.embeddings.create(input=[trimmed], model="text-embedding-ada-002")
    return rs.data[0].embedding


minilm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
gtelarge = SentenceTransformer('thenlper/gte-large')

retrievers['MiniLM'] = lambda t, key: minilm.encode(t)
retrievers['GteLarge'] = lambda t, key: gtelarge.encode(t)
retrievers['OpenAI'] = openai_embedding

# db: connect to the LanceDB store one level above this file and open every existing table
db_uri = os.path.join(Path(__file__).parents[1], ".lancedb")
db = lancedb.connect(db_uri)
tables = {}
for table_name in db.table_names():
    tables[table_name] = db.open_table(table_name)
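
# --- Usage sketch (illustrative; not part of the original module) ---
# Assumes a LanceDB table has already been populated with MiniLM embeddings and a
# "text" column; the table and column names below are hypothetical.
#
# query = "how do I configure the retriever?"
# query_vector = retrievers['MiniLM'](query, None)
# hits = tables[DB_TABLE_NAME].search(query_vector).limit(20).to_list()
# candidates = [hit['text'] for hit in hits]
# top_docs = rerank_documents(query, candidates)[:5]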