# build_index.py
import os

import requests
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredHTMLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

DOCS_PATH = "docs"
INDEX_PATH = "faiss_index"

def fetch_html_with_timeout(url: str, timeout=5) -> list[Document]:
    """
    Download the page content with a timeout, then parse it with UnstructuredHTMLLoader.
    Returns a list of Documents (one or more, depending on how the page is parsed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise HTTPError for 4xx/5xx responses
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []

    # Write the HTML to a temporary file so we can load it with UnstructuredHTMLLoader.
    # (unstructured works on files; an in-memory approach is possible, but this keeps it simple.)
    temp_filename = "temp_html_file.html"
    with open(temp_filename, "w", encoding="utf-8") as f:
        f.write(response.text)

    loader = UnstructuredHTMLLoader(temp_filename)
    docs = loader.load()  # returns a list of Document objects

    # Tag each document with the original URL rather than the temp file path.
    for doc in docs:
        doc.metadata["source"] = url
    return docs
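
# The comment above notes that an in-memory approach is possible. The helper
# below is a minimal sketch of that alternative, assuming BeautifulSoup (bs4)
# is installed; fetch_html_in_memory is illustrative and not used by the
# pipeline. It trades unstructured's HTML parsing for plain-text extraction,
# which avoids the temporary file.
def fetch_html_in_memory(url: str, timeout=5) -> list[Document]:
    from bs4 import BeautifulSoup  # optional dependency, only needed here
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []
    text = BeautifulSoup(response.text, "html.parser").get_text(separator="\n")
    return [Document(page_content=text, metadata={"source": url})]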

def load_web_docs(urls: list[str], timeout=5) -> list[Document]:
    all_docs = []
    for url in urls:
        print(f"Fetching: {url}")
        docs_from_url = fetch_html_with_timeout(url, timeout=timeout)
        all_docs.extend(docs_from_url)
    return all_docs

def load_documents(docs_path=DOCS_PATH):
    all_docs = []
    for file_name in os.listdir(docs_path):
        file_path = os.path.join(docs_path, file_name)
        print(f"Processing file: {file_name}")  # Debug log

        # 1) Text files
        if file_name.lower().endswith(".txt"):
            print(" -> Loading as .txt")
            loader = TextLoader(file_path, encoding="utf-8")
            loaded_docs = loader.load()
            all_docs.extend(loaded_docs)
            print(f" -> Loaded {len(loaded_docs)} docs from {file_name}")

        # 2) PDF
        elif file_name.lower().endswith(".pdf"):
            print(" -> Loading as .pdf")
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load_and_split()
            all_docs.extend(pdf_docs)
            print(f" -> Loaded {len(pdf_docs)} docs from {file_name}")

        # 3) URL lists
        elif file_name.lower().endswith(".urls"):
            print(" -> Loading as .urls")
            with open(file_path, "r", encoding="utf-8") as f:
                urls = [line.strip() for line in f if line.strip()]
            print(f" -> Found {len(urls)} URLs in {file_name}")
            if urls:
                web_docs = load_web_docs(urls, timeout=5)
                print(f" -> Loaded {len(web_docs)} web docs from URLs")
                all_docs.extend(web_docs)

        else:
            print(" -> Skipped: unrecognized file type.")

    return all_docs
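
# Illustrative layout for the docs/ folder (the file names below are hypothetical):
#   docs/
#     notes.txt       -> loaded with TextLoader
#     handbook.pdf    -> loaded with PyPDFLoader
#     links.urls      -> plain text, one URL per line, e.g.:
#                          https://example.com/getting-started
#                          https://example.com/faq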

def build_faiss_index():
    documents = load_documents()

    # Split documents into overlapping ~500-character chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splitted_docs = text_splitter.split_documents(documents)

    # device="cuda" assumes a GPU is available; switch to "cpu" otherwise.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda"},
    )
    vectorstore = FAISS.from_documents(splitted_docs, embeddings)

    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Vector index saved to {INDEX_PATH}")

if __name__ == "__main__":
    build_faiss_index()
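
# A minimal sketch of how the saved index might be queried (not part of the
# build script). It assumes the same embedding model used at build time;
# allow_dangerous_deserialization is required by recent langchain_community
# releases when loading a locally pickled FAISS docstore.
#
# query_embeddings = HuggingFaceEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-L6-v2"
# )
# store = FAISS.load_local(
#     INDEX_PATH, query_embeddings, allow_dangerous_deserialization=True
# )
# for hit in store.similarity_search("How do I configure the service?", k=3):
#     print(hit.metadata.get("source"), hit.page_content[:200])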