import wikipedia
import numpy as np
import faiss
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
# Load Sentence Transformer model
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
# Function to fetch Wikipedia content
def get_wikipedia_content(topic):
    try:
        page = wikipedia.page(topic)
        return page.content
    except wikipedia.exceptions.PageError:
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Ambiguous topic. Options: {e.options}")
        return None
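# Usage sketch (hedged: "Alan Turing" below is an arbitrary example topic, not from the original file).
# The helper returns None for missing or ambiguous pages, so callers should check before chunking:
#     content = get_wikipedia_content("Alan Turing")
#     if content is not None:
#         print(content[:300])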
# Function to split text into chunks
def split_text(text, chunk_size=256, chunk_overlap=20):
    # Tokenize the full text, then slide a window of `chunk_size` tokens across it,
    # stepping back by `chunk_overlap` tokens so adjacent chunks share context.
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == len(tokens):
            break
        start = end - chunk_overlap
    return chunks
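# Usage sketch (hedged: the sample text is an arbitrary illustration, not from the original file):
#     chunks = split_text("Some long article text ...", chunk_size=256, chunk_overlap=20)
#     print(len(chunks), "chunks; first chunk:", chunks[0][:100])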
# Function to create FAISS index
def create_faiss_index(chunks):
    embeddings = embedding_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings
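# Illustrative retrieval helper (an addition, not part of the original upload): shows how the
# FAISS index built above can be queried. The names `search_index`, `query`, and `top_k` are
# assumptions made for this sketch.
def search_index(query, index, chunks, top_k=3):
    # Embed the query with the same model used for the chunks, then take the top_k nearest
    # neighbours by L2 distance and map their indices back to the original text chunks.
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding, dtype="float32"), top_k)
    return [chunks[i] for i in indices[0]]

# Example end-to-end wiring (the topic and query strings below are arbitrary examples):
if __name__ == "__main__":
    content = get_wikipedia_content("Artificial intelligence")
    if content is not None:
        chunks = split_text(content)
        index, _ = create_faiss_index(chunks)
        for chunk in search_index("What is machine learning?", index, chunks):
            print(chunk[:200], "\n---")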