import textwrap
import warnings

import faiss
import numpy as np
import torch

warnings.filterwarnings("ignore")

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# ================== PDF Handling Functions ==================
def pdf_to_text(path):
    """Extract text from a PDF, falling back to OCR for scanned pages."""
    try:
        txt = extract_text(path) or ""
    except Exception:
        txt = ""
    # If the embedded text layer is missing or very short, assume a scanned
    # PDF and OCR each rendered page instead.
    if len(txt.strip()) < 200:
        try:
            pages = convert_from_path(path, dpi=200)
            ocr_all = [pytesseract.image_to_string(img) for img in pages]
            txt = "\n".join(ocr_all)
        except Exception:
            txt = ""
    return txt
def chunk_text(text, max_chars=800):
    """Greedily pack paragraphs into chunks of at most max_chars characters."""
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    chunks, buf = [], ""
    for p in paras:
        if len(p) > max_chars:
            # Flush the buffer first so chunks stay in document order.
            if buf:
                chunks.append(buf)
                buf = ""
            for piece in textwrap.wrap(p, width=max_chars, break_long_words=False):
                chunks.append(piece.strip())
        else:
            if len(buf) + len(p) + 1 <= max_chars:
                buf = (buf + "\n" + p).strip()
            else:
                if buf:
                    chunks.append(buf)
                buf = p
    if buf:
        chunks.append(buf)
    # Drop fragments too short to be useful as retrieval context.
    return [c for c in chunks if len(c) > 80]
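
# Example (sketch): chunk_text("A " * 1000) packs the single long paragraph
# into whitespace-split pieces of at most ~800 characters; the final filter
# then discards any fragment of 80 characters or fewer.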
# ================== Load Embeddings + Model ==================
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embeddings for retrieval

model_id = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(device)
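
# ================== Retrieval Index (Sketch) ==================
# faiss, numpy, and the PDF helpers above are otherwise unused here. A minimal
# way to wire them into an index would be the following; PDF_PATH is a
# hypothetical placeholder for a source document, not part of this app.
#
# PDF_PATH = "practicals.pdf"  # hypothetical path, assumption
# chunks = chunk_text(pdf_to_text(PDF_PATH))
# vecs = embed_model.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)
# index = faiss.IndexFlatIP(vecs.shape[1])  # inner product == cosine on unit vectors
# index.add(vecs.astype(np.float32))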
# ================== Chat Function ==================
def chat_fn(message, history=None):
    # history is supplied by gr.ChatInterface but unused here; each question
    # is answered independently.
    prompt = f"Answer clearly and exam-ready:\n\nQuestion:\n{message}"
    inputs = tok(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
    return tok.decode(out[0], skip_special_tokens=True).strip()
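
# A retrieval-augmented variant (sketch, assuming the index and chunks from
# the sketch above were built):
#
# def rag_chat_fn(message, history=None):
#     q = embed_model.encode([message], convert_to_numpy=True, normalize_embeddings=True)
#     _, ids = index.search(q.astype(np.float32), 3)  # top-3 nearest chunks
#     context = "\n\n".join(chunks[i] for i in ids[0] if i != -1)
#     prompt = f"Answer using the context below.\n\nContext:\n{context}\n\nQuestion:\n{message}"
#     inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
#     out = gen_model.generate(**inputs, max_new_tokens=120, num_beams=4, do_sample=False)
#     return tok.decode(out[0], skip_special_tokens=True).strip()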
# ================== Gradio Interface ==================
iface = gr.ChatInterface(
    fn=chat_fn,
    title="💬 Practical Chatbot",
    description="Ask about Physics & Chemistry Practicals (Class 9–10).",
)

iface.launch()
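
# Gradio also supports launch(share=True) if a temporary public URL is needed,
# e.g. when running outside Spaces.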