import wikipedia
import numpy as np
import faiss
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load Sentence Transformer model
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

# Function to fetch Wikipedia content
def get_wikipedia_content(topic):
    try:
        page = wikipedia.page(topic)
        return page.content
    except wikipedia.exceptions.PageError:
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Ambiguous topic. Options: {e.options}")
        return None

# Function to split text into overlapping token chunks
def split_text(text, chunk_size=256, chunk_overlap=20):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == len(tokens):
            break
        start = end - chunk_overlap
    return chunks

# Function to create FAISS index
def create_faiss_index(chunks):
    embeddings = embedding_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings
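
# Example usage: a minimal sketch of how the pieces above might fit together.
# The topic string, query string, and k=3 below are illustrative assumptions,
# not part of the functions defined above.
if __name__ == "__main__":
    content = get_wikipedia_content("Artificial intelligence")
    if content:
        # Chunk the article and build an L2 index over the chunk embeddings
        chunks = split_text(content)
        index, _ = create_faiss_index(chunks)

        # Embed the query and retrieve the 3 nearest chunks by L2 distance
        query_embedding = embedding_model.encode(["What is machine learning?"])
        distances, indices = index.search(np.array(query_embedding, dtype="float32"), 3)
        for rank, idx in enumerate(indices[0], start=1):
            print(f"[{rank}] {chunks[idx][:200]}")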