# build_index.py
import os

import requests
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredHTMLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

DOCS_PATH = "docs"
INDEX_PATH = "faiss_index"

def fetch_html_with_timeout(url: str, timeout=5) -> list[Document]:
    """
    Download the page content with a timeout, then parse it with UnstructuredHTMLLoader.
    Returns a list of Documents (one or more, depending on how the page is parsed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise HTTPError for 4xx/5xx responses
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []

    # Write the HTML to a temporary file so we can load it with UnstructuredHTMLLoader.
    # (unstructured works on files; an in-memory approach is possible, but this keeps it simple.)
    temp_filename = "temp_html_file.html"
    with open(temp_filename, "w", encoding="utf-8") as f:
        f.write(response.text)

    loader = UnstructuredHTMLLoader(temp_filename)
    docs = loader.load()  # returns a list of Document objects

    # Tag each document with the original URL rather than the temp file path.
    for doc in docs:
        doc.metadata["source"] = url
    return docs
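
# The comment above notes that an in-memory approach is possible. The helper
# below is a minimal sketch of that alternative, assuming BeautifulSoup (bs4)
# is installed; fetch_html_in_memory is illustrative and not used by the
# pipeline. It trades unstructured's HTML parsing for plain-text extraction,
# which avoids the temporary file.
def fetch_html_in_memory(url: str, timeout=5) -> list[Document]:
    from bs4 import BeautifulSoup  # optional dependency, only needed here
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []
    text = BeautifulSoup(response.text, "html.parser").get_text(separator="\n")
    return [Document(page_content=text, metadata={"source": url})]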

def load_web_docs(urls: list[str], timeout=5) -> list[Document]:
    all_docs = []
    for url in urls:
        print(f"Fetching: {url}")
        docs_from_url = fetch_html_with_timeout(url, timeout=timeout)
        all_docs.extend(docs_from_url)
    return all_docs

def load_documents(docs_path=DOCS_PATH):
    all_docs = []
    for file_name in os.listdir(docs_path):
        file_path = os.path.join(docs_path, file_name)
        print(f"Processing file: {file_name}")  # Debug log

        # 1) Text files
        if file_name.lower().endswith(".txt"):
            print(" -> Loading as .txt")
            loader = TextLoader(file_path, encoding="utf-8")
            loaded_docs = loader.load()
            all_docs.extend(loaded_docs)
            print(f" -> Loaded {len(loaded_docs)} docs from {file_name}")

        # 2) PDF
        elif file_name.lower().endswith(".pdf"):
            print(" -> Loading as .pdf")
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load_and_split()
            all_docs.extend(pdf_docs)
            print(f" -> Loaded {len(pdf_docs)} docs from {file_name}")

        # 3) URL lists
        elif file_name.lower().endswith(".urls"):
            print(" -> Loading as .urls")
            with open(file_path, "r", encoding="utf-8") as f:
                urls = [line.strip() for line in f if line.strip()]
            print(f" -> Found {len(urls)} URLs in {file_name}")
            if urls:
                web_docs = load_web_docs(urls, timeout=5)
                print(f" -> Loaded {len(web_docs)} web docs from URLs")
                all_docs.extend(web_docs)

        else:
            print(" -> Skipped: unrecognized file type.")

    return all_docs
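
# Illustrative layout for the docs/ folder (the file names below are hypothetical):
#   docs/
#     notes.txt       -> loaded with TextLoader
#     handbook.pdf    -> loaded with PyPDFLoader
#     links.urls      -> plain text, one URL per line, e.g.:
#                          https://example.com/getting-started
#                          https://example.com/faq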

def build_faiss_index():
    documents = load_documents()

    # Split documents into overlapping ~500-character chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splitted_docs = text_splitter.split_documents(documents)

    # device="cuda" assumes a GPU is available; switch to "cpu" otherwise.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda"},
    )
    vectorstore = FAISS.from_documents(splitted_docs, embeddings)

    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Vector index saved to {INDEX_PATH}")

if __name__ == "__main__":
    build_faiss_index()
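
# A minimal sketch of how the saved index might be queried (not part of the
# build script). It assumes the same embedding model used at build time;
# allow_dangerous_deserialization is required by recent langchain_community
# releases when loading a locally pickled FAISS docstore.
#
# query_embeddings = HuggingFaceEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-L6-v2"
# )
# store = FAISS.load_local(
#     INDEX_PATH, query_embeddings, allow_dangerous_deserialization=True
# )
# for hit in store.similarity_search("How do I configure the service?", k=3):
#     print(hit.metadata.get("source"), hit.page_content[:200])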