import os import zipfile import torch import faiss import numpy as np import gradio as gr from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline from sentence_transformers import SentenceTransformer from langchain.document_loaders import TextLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.embeddings import HuggingFaceEmbeddings from langchain.vectorstores import FAISS as LangChainFAISS from langchain.docstore import InMemoryDocstore from langchain.schema import Document from langchain.llms import HuggingFacePipeline from huggingface_hub import login from huggingface_hub import upload_file # Extract the Knowledge Base ZIP if os.path.exists("md_knowledge_base.zip"): with zipfile.ZipFile("md_knowledge_base.zip", "r") as zip_ref: zip_ref.extractall("md_knowledge_base") print("✅ Knowledge base extracted.") # Load Markdown Files KB_PATH = "md_knowledge_base" files = [os.path.join(dp, f) for dp, _, fn in os.walk(KB_PATH) for f in fn if f.endswith(".md")] docs = [doc for f in files for doc in TextLoader(f, encoding="utf-8").load()] print(f"✅ Loaded {len(docs)} documents.") # Chunking def get_dynamic_chunk_size(text): if len(text) < 1000: return 300 elif len(text) < 5000: return 500 else: return 1000 chunks = [] for doc in docs: chunk_size = get_dynamic_chunk_size(doc.page_content) chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=100) chunks.extend(chunk_splitter.split_documents([doc])) texts = [chunk.page_content for chunk in chunks] # Vectorstore (FAISS) embed_model_id = "sentence-transformers/all-MiniLM-L6-v2" embedder = SentenceTransformer(embed_model_id) embeddings = embedder.encode(texts, show_progress_bar=False) dim = embeddings.shape[1] index = faiss.IndexFlatL2(dim) index.add(np.array(embeddings, dtype="float32")) docs = [Document(page_content=t) for t in texts] docstore = InMemoryDocstore({str(i): docs[i] for i in range(len(docs))}) id_map = {i: str(i) for i in range(len(docs))} embed_fn = HuggingFaceEmbeddings(model_name=embed_model_id) vectorstore = LangChainFAISS( index=index, docstore=docstore, index_to_docstore_id=id_map, embedding_function=embed_fn ) print("✅ FAISS vectorstore ready.") # Load Falcon-e-1B-Instruct model_id = "tiiuae/falcon-e-1b-instruct" tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.bfloat16 ).to("cuda" if torch.cuda.is_available() else "cpu") text_gen_pipeline = pipeline( "text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16, device=0 if torch.cuda.is_available() else -1, return_full_text=False, do_sample=False, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id ) llm = HuggingFacePipeline(pipeline=text_gen_pipeline) def truncate_context(context, max_length=1024): tokens = tokenizer.encode(context) if len(tokens) > max_length: tokens = tokens[:max_length] return tokenizer.decode(tokens, skip_special_tokens=True) def format_prompt(context, question): return ( "You are the Hull University Assistant—a friendly, knowledgeable chatbot dedicated to " "helping students with questions about courses, admissions, tuition fees, and student life. " "Use ONLY the information provided in the context below to answer the question. " "If the answer cannot be found in the context, reply: \"I’m sorry, but I don’t have that " "information available right now.\"\n\n" f"Context:\n{truncate_context(context)}\n\n" f"Student Question: {question}\n" "Assistant Answer:" ) def answer_fn(question): docs = vectorstore.similarity_search(question, k=5) if not docs: return "I'm sorry, I couldn't find any relevant information for your query." context = "\n\n".join(d.page_content for d in docs) prompt = format_prompt(context, question) try: response = llm.invoke(prompt).strip() return response except Exception as e: return f"An error occurred: {e}" # Gradio Interface def chat_fn(user_message, history): bot_response = answer_fn(user_message) history = history + [(user_message, bot_response)] return history, history with gr.Blocks() as demo: gr.Markdown("## 📘 University of Hull Assistant") chatbot = gr.Chatbot() state = gr.State([]) user_input = gr.Textbox(placeholder="Ask a question about University of Hull...", show_label=False) user_input.submit(fn=chat_fn, inputs=[user_input, state], outputs=[chatbot, state]) demo.launch()