import wikipedia
import numpy as np
import faiss
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load the Sentence Transformer embedding model and its matching tokenizer
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
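# Sanity check (illustrative, not part of the original pipeline):
# all-mpnet-base-v2 produces 768-dimensional embeddings, and the FAISS
# index built below must be created with that same dimension.
#   embedding_model.encode(["hello"]).shape  # -> (1, 768)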
# Function to fetch Wikipedia content
def get_wikipedia_content(topic):
    try:
        page = wikipedia.page(topic)
        return page.content
    except wikipedia.exceptions.PageError:
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Ambiguous topic. Options: {e.options}")
        return None
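# Example usage (illustrative; any topic string works the same way):
#   content = get_wikipedia_content("Alan Turing")
# content is None when the page is missing or the topic is ambiguous.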
# Function to split text into overlapping token chunks
def split_text(text, chunk_size=256, chunk_overlap=20):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == len(tokens):
            break
        start = end - chunk_overlap
    return chunks
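# Note: consecutive chunks share chunk_overlap tokens, so a sentence that
# straddles a chunk boundary appears in both chunks. chunk_overlap must
# stay smaller than chunk_size, otherwise start would never advance and
# the loop above would not terminate.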
# Function to create a FAISS index over the chunk embeddings
def create_faiss_index(chunks):
    embeddings = embedding_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings
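# Minimal end-to-end sketch of how these pieces fit together. search_index
# is a hypothetical helper added here for illustration (not from the
# original code): it embeds the query with the same model and returns the
# top_k nearest chunks from the FAISS index.
def search_index(index, chunks, query, top_k=3):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    return [chunks[i] for i in indices[0]]

if __name__ == "__main__":
    content = get_wikipedia_content("Alan Turing")
    if content:
        chunks = split_text(content)
        index, _ = create_faiss_index(chunks)
        for chunk in search_index(index, chunks, "What is the Turing test?"):
            print(chunk[:200], "...")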