import textwrap
import warnings

import faiss
import numpy as np
import torch

warnings.filterwarnings("ignore")

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# ================== PDF Handling Functions ==================
def pdf_to_text(path):
    """Extract text from a PDF, falling back to OCR for scanned pages."""
    try:
        txt = extract_text(path) or ""
    except Exception:
        txt = ""
    # If the embedded text layer is missing or very short, assume a scanned
    # PDF and OCR each rendered page instead.
    if len(txt.strip()) < 200:
        try:
            pages = convert_from_path(path, dpi=200)
            ocr_all = [pytesseract.image_to_string(img) for img in pages]
            txt = "\n".join(ocr_all)
        except Exception:
            txt = ""
    return txt
def chunk_text(text, max_chars=800):
    """Greedily pack paragraphs into chunks of at most max_chars characters."""
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush the buffer first so chunks stay in document order.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        else:
            if len(buf) + len(p) + 1 <= max_chars:
                buf = (buf + "\n" + p).strip()
            else:
                if buf:
                    chunks.append(buf)
                buf = p
    if buf:
        chunks.append(buf)
    # Drop fragments too short to be useful as retrieval context.
    return [c for c in chunks if len(c) > 80]
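
# Example (sketch): chunk_text("A " * 1000) packs the single long paragraph
# into whitespace-split pieces of at most ~800 characters; the final filter
# then discards any fragment of 80 characters or fewer.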
# ================== Load Embeddings + Model ==================
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embeddings for retrieval

model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(device)
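
# ================== Retrieval Index (Sketch) ==================
# faiss, numpy, and the PDF helpers above are otherwise unused here. A minimal
# way to wire them into an index would be the following; PDF_PATH is a
# hypothetical placeholder for a source document, not part of this app.
#
# PDF_PATH = "practicals.pdf"  # hypothetical path, assumption
# chunks = chunk_text(pdf_to_text(PDF_PATH))
# vecs = embed_model.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)
# index = faiss.IndexFlatIP(vecs.shape[1])  # inner product == cosine on unit vectors
# index.add(vecs.astype(np.float32))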
# ================== Chat Function ==================
def chat_fn(message, history=None):
    # history is supplied by gr.ChatInterface but unused here; each question
    # is answered independently.
    prompt = f"Answer clearly and exam-ready:\n\nQuestion:\n{message}"
    inputs = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
    return tok.decode(out[0], skip_special_tokens=True).strip()
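
# A retrieval-augmented variant (sketch, assuming the index and chunks from
# the sketch above were built):
#
# def rag_chat_fn(message, history=None):
#     q = embed_model.encode([message], convert_to_numpy=True, normalize_embeddings=True)
#     _, ids = index.search(q.astype(np.float32), 3)  # top-3 nearest chunks
#     context = "\n\n".join(chunks[i] for i in ids[0] if i != -1)
#     prompt = f"Answer using the context below.\n\nContext:\n{context}\n\nQuestion:\n{message}"
#     inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
#     out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
#     return tok.decode(out[0], skip_special_tokens=True).strip()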
# ================== Gradio Interface ==================
iface = gr.ChatInterface(
    fn=chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10).",
)

iface.launch()
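
# Gradio also supports launch(share=True) if a temporary public URL is needed,
# e.g. when running outside Spaces.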