# build_index.py
import os

import requests
import torch
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredHTMLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
DOCS_PATH = "docs"
INDEX_PATH = "faiss_index"
def fetch_html_with_timeout(url: str, timeout: int = 5) -> list[Document]:
    """
    Download the page content with a timeout, then parse it with UnstructuredHTMLLoader.
    Returns a list of Documents (usually one per page; they can be split further later).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise HTTPError on non-2xx responses
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []

    # Write the HTML to a temporary file so we can load it with UnstructuredHTMLLoader
    # (the loader expects a file path; an in-memory parse is possible, but this keeps it simple).
    temp_filename = "temp_html_file.html"
    with open(temp_filename, "w", encoding="utf-8") as f:
        f.write(response.text)

    try:
        loader = UnstructuredHTMLLoader(temp_filename)
        docs = loader.load()  # returns a list of Document objects
    finally:
        os.remove(temp_filename)  # clean up the temporary file

    # Record the original URL (not the temp file path) as the source.
    for doc in docs:
        doc.metadata["source"] = url
    return docs
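# Minimal sketch of the in-memory alternative mentioned in the comment above:
# parse the fetched HTML with BeautifulSoup instead of round-tripping through a
# temp file. This function is an illustrative assumption (it is not called by
# build_faiss_index) and requires the beautifulsoup4 package.
def fetch_html_in_memory(url: str, timeout: int = 5) -> list[Document]:
    from bs4 import BeautifulSoup  # local import so the sketch stays optional
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []
    # Strip tags and keep only the visible text.
    text = BeautifulSoup(response.text, "html.parser").get_text(separator="\n", strip=True)
    return [Document(page_content=text, metadata={"source": url})]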
def load_web_docs(urls: list[str], timeout: int = 5) -> list[Document]:
    all_docs = []
    for url in urls:
        print(f"Fetching: {url}")
        docs_from_url = fetch_html_with_timeout(url, timeout=timeout)
        all_docs.extend(docs_from_url)
    return all_docs
def load_documents(docs_path=DOCS_PATH):
    all_docs = []
    for file_name in os.listdir(docs_path):
        file_path = os.path.join(docs_path, file_name)
        print(f"Processing file: {file_name}")  # debug log

        # 1) Text files
        if file_name.lower().endswith(".txt"):
            print("  -> Loading as .txt")
            loader = TextLoader(file_path, encoding="utf-8")
            loaded_docs = loader.load()
            all_docs.extend(loaded_docs)
            print(f"  -> Loaded {len(loaded_docs)} docs from {file_name}")

        # 2) PDF files
        elif file_name.lower().endswith(".pdf"):
            print("  -> Loading as .pdf")
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load_and_split()
            all_docs.extend(pdf_docs)
            print(f"  -> Loaded {len(pdf_docs)} docs from {file_name}")

        # 3) URL lists (one URL per line)
        elif file_name.lower().endswith(".urls"):
            print("  -> Loading as .urls")
            with open(file_path, "r", encoding="utf-8") as f:
                urls = [line.strip() for line in f if line.strip()]
            print(f"  -> Found {len(urls)} URLs in {file_name}")
            if urls:
                web_docs = load_web_docs(urls, timeout=5)
                print(f"  -> Loaded {len(web_docs)} web docs from URLs")
                all_docs.extend(web_docs)

        else:
            print("  -> Skipped: unrecognized file type.")
    return all_docs
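# Illustrative layout that load_documents() assumes for DOCS_PATH (the file names
# below are hypothetical examples, not files shipped with this repo):
#
#   docs/
#     about_me.txt    -> loaded with TextLoader
#     resume.pdf      -> loaded and split per page with PyPDFLoader
#     profiles.urls   -> one URL per line, fetched via fetch_html_with_timeout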
def build_faiss_index():
    documents = load_documents()

    # Split into overlapping chunks so retrieval can return focused passages.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splitted_docs = text_splitter.split_documents(documents)

    # Use the GPU when available, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": device}
    )

    vectorstore = FAISS.from_documents(splitted_docs, embeddings)
    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Vector index saved to {INDEX_PATH}")
if __name__ == "__main__":
    build_faiss_index()