Spaces:

Sazid2
/

Assamese

Sleeping

App Files Files Community

Sazid2 committed 22 days ago

Commit

86a2d4b

verified ·

1 Parent(s): cf6f71f

Update app.py

Browse files

Files changed (1) hide show

app.py +282 -209

app.py CHANGED Viewed

@@ -1,225 +1,298 @@
-import streamlit as st
 import os
 import sqlite3
 from datetime import datetime
-import PyPDF2
-from sentence_transformers import SentenceTransformer
-import faiss
 import numpy as np
-from transformers import pipeline
 import pytesseract
-from PIL import Image
 import sympy as sp
-import io
-# App configuration
-st.set_page_config(
-    page_title="Jajabor – SEBA Class 10 Tutor",
-    page_icon="🧭",
-    layout="wide"
-)
-# Initialize session state
-if 'chat_history' not in st.session_state:
-    st.session_state.chat_history = []
-if 'username' not in st.session_state:
-    st.session_state.username = ""
-if 'tutor' not in st.session_state:
-    st.session_state.tutor = None
-class SimpleTutor:
-    def __init__(self):
-        self.llm = None
-        self.embedding_model = None
-        self.index = None
-        self.corpus_chunks = []
-        self._load_models()
-        self.load_pdfs()
-    def _load_models(self):
-        try:
-            self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
-            self.llm = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)
-        except Exception as e:
-            st.error(f"Model loading error: {e}")
-    def load_pdfs(self):
-        pdf_dir = "pdfs/class10"
-        if not os.path.exists(pdf_dir):
-            return
-        all_texts = []
-        for fname in os.listdir(pdf_dir):
-            if fname.lower().endswith('.pdf'):
-                path = os.path.join(pdf_dir, fname)
-                try:
-                    reader = PyPDF2.PdfReader(path)
-                    text = ""
-                    for page in reader.pages:
-                        text += page.extract_text() or ""
-                    if text.strip():
-                        all_texts.append(text)
-                except Exception as e:
-                    st.error(f"Error reading {fname}: {e}")
-        self.corpus_chunks = []
-        for text in all_texts:
-            chunks = self._split_text(text)
-            self.corpus_chunks.extend(chunks)
-        if self.corpus_chunks and self.embedding_model:
-            try:
-                embs = self.embedding_model.encode(self.corpus_chunks).astype("float32")
-                dim = embs.shape[1]
-                self.index = faiss.IndexFlatL2(dim)
-                self.index.add(embs)
-            except Exception as e:
-                st.error(f"FAISS error: {e}")
-    def _split_text(self, text, chunk_size=400):
-        if not text:
-            return []
-        return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size) if text[i:i+chunk_size].strip()]
-    def answer_question(self, question):
-        if not question.strip():
-            return "অনুগ্ৰহ কৰি এটা প্ৰশ্ন সোধক।"
-        if self._is_math_question(question):
-            return self._solve_math(question)
-        context = ""
-        if self.index and self.corpus_chunks:
-            relevant_chunks = self._find_relevant_chunks(question)
-            if relevant_chunks:
-                context = "\n".join(relevant_chunks[:2])
-        if self.llm:
-            try:
-                prompt = f"প্ৰশ্ন: {question}\n\nসংদৰ্ভ: {context}\n\nসহায়ক উত্তৰ:" if context else f"প্ৰশ্ন: {question}\n\nউত্তৰ:"
-                response = self.llm(prompt, max_new_tokens=150, temperature=0.3)
-                return response[0]['generated_text']
-            except Exception as e:
-                return f"উত্তৰ তৈয়াৰ কৰোঁতে সমস্যা: {e}"
-        else:
-            return "মই আপোনাৰ প্ৰশ্নটো বুজিলোঁ। অধ্যয়নৰ বাবে শুভেচ্ছা!"
-    def _is_math_question(self, text):
-        math_indicators = ['+', '-', '*', '/', '=', 'x', 'y', 'গণিত', 'সমীকৰণ']
-        return any(indicator in text.lower() for indicator in math_indicators)
-    def _solve_math(self, expr):
-        try:
-            expr = expr.strip().replace('^', '**')
-            if '=' in expr:
-                parts = expr.split('=')
-                if len(parts) == 2:
-                    left = sp.sympify(parts[0].strip())
-                    right = sp.sympify(parts[1].strip())
-                    equation = sp.Eq(left, right)
-                    solutions = sp.solve(equation)
-                    if solutions:
-                        return f"সমীকৰণ: {equation}\n\nসমাধান: {solutions}"
-            else:
-                expr_sym = sp.sympify(expr)
-                simplified = sp.simplify(expr_sym)
-                return f"প্ৰকাশ: {expr}\n\nসৰলীকৃত: {simplified}"
-        except Exception as e:
-            return f"গণিত সমাধানত সমস্যা: {e}"
-    def _find_relevant_chunks(self, question, k=3):
-        if not self.corpus_chunks:
-            return []
-        if self.index and self.embedding_model:
             try:
-                q_vec = self.embedding_model.encode([question]).astype("float32")
-                D, I = self.index.search(q_vec, k)
-                return [self.corpus_chunks[i] for i in I[0] if 0 <= i < len(self.corpus_chunks)]
             except Exception:
-                pass
         return []
-def extract_text_from_image(uploaded_file):
     try:
-        image = Image.open(uploaded_file)
-        text = pytesseract.image_to_string(image)
-        return text.strip()
     except Exception as e:
-        return ""
-# Main app
-def main():
-    st.title("🧭 জাজাবৰ – SEBA Class 10 AI Tutor")
-    # Sidebar
-    with st.sidebar:
-        st.header("👤 লগিন")
-        username = st.text_input("আপোনাৰ নাম", value=st.session_state.username)
-        if username and username != st.session_state.username:
-            st.session_state.username = username
-            st.success(f"লগিন successful: {username}")
-        st.header("📷 ছবিৰ পৰা পাঠ")
-        uploaded_image = st.file_uploader("ছবি আপলোড কৰক", type=['png', 'jpg', 'jpeg'])
-        st.header("💡 টিপছ")
-        st.info("""
-        - নাম লিখি প্ৰশ্ন সোধক
-        - ছবি আপলোড কৰিলে OCR ৰ সহায়ত পাঠ পঢ়িব
-        - বিষয়সমূহ:
-          - অসমীয়া
-          - ইংৰাজী
-          - গণিত
-          - বিজ্ঞান
-          - সামাজিক বিজ্ঞান
-        """)
-    # Initialize tutor
-    if st.session_state.tutor is None:
-        with st.spinner('জাজাবৰক সাজু কৰি থকা হৈছে...'):
-            st.session_state.tutor = SimpleTutor()
-    # Main chat area
-    st.header("💬 জাজাবৰৰ সৈতে কথোপকথন")
-    # Display chat history
-    for i, (question, answer) in enumerate(st.session_state.chat_history):
-        with st.chat_message("user"):
-            st.write(question)
-        with st.chat_message("assistant"):
-            st.write(answer)
-    # Chat input
-    if prompt := st.chat_input("আপোনাৰ প্ৰশ্ন ইয়াত লিখক..."):
-        if not st.session_state.username:
-            st.error("⚠️ প্ৰথমে আপোনাৰ নাম লিখক")
-            st.stop()
-        # Add user message to chat
-        st.session_state.chat_history.append((prompt, ""))
-        # Process OCR if image uploaded
-        full_question = prompt
-        if uploaded_image:
-            ocr_text = extract_text_from_image(uploaded_image)
             if ocr_text:
-                full_question += f"\n[ছবিৰ পাঠ: {ocr_text}]"
-        # Get AI response
-        with st.spinner('জাজাবৰে চিন্তা কৰি আছে...'):
-            response = st.session_state.tutor.answer_question(full_question)
-        # Update chat history
-        st.session_state.chat_history[-1] = (prompt, response)
-        # Rerun to update display
-        st.rerun()
-    # Clear chat button
-    if st.button("🧹 কথোপকথন পৰিষ্কাৰ কৰক"):
-        st.session_state.chat_history = []
-        st.rerun()
 if __name__ == "__main__":
-    main()

+# app.py
+"""
+Jajabor – Minimal safe version (no FAISS, no torch, no transformers)
+- Retrieval: TF-IDF (scikit-learn)
+- PDF reading: PyPDF2
+- OCR: pytesseract
+- Math: sympy
+- UI: Gradio
+"""
 import os
+import io
 import sqlite3
 from datetime import datetime
+import traceback
+from PyPDF2 import PdfReader
+from PIL import Image
+import gradio as gr
 import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import linear_kernel
 import pytesseract
 import sympy as sp
+# ---------- CONFIG ----------
+APP_NAME = "Jajabor – Minimal (TF-IDF retrieval)"
+BASE_DIR = os.path.abspath(os.path.dirname(__file__))
+PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
+DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")
+CHUNK_SIZE = 600
+CHUNK_OVERLAP = 120
+TOP_K = 3
+# ---------- DB ----------
+def init_db(path=DB_PATH):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    conn = sqlite3.connect(path)
+    cur = conn.cursor()
+    cur.execute("""CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY AUTOINCREMENT, username TEXT UNIQUE, created_at TEXT)""")
+    cur.execute("""CREATE TABLE IF NOT EXISTS interactions (id INTEGER PRIMARY KEY AUTOINCREMENT, user_id INTEGER, timestamp TEXT, query TEXT, answer TEXT, is_math INTEGER)""")
+    conn.commit()
+    conn.close()
+def get_or_create_user(username):
+    username = username.strip()
+    if not username:
+        return None
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+    cur.execute("SELECT id FROM users WHERE username=?", (username,))
+    row = cur.fetchone()
+    if row:
+        uid = row[0]
+    else:
+        cur.execute("INSERT INTO users (username, created_at) VALUES (?, ?)", (username, datetime.utcnow().isoformat()))
+        conn.commit()
+        uid = cur.lastrowid
+    conn.close()
+    return uid
+def log_interaction(user_id, query, answer, is_math):
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+    cur.execute("INSERT INTO interactions (user_id, timestamp, query, answer, is_math) VALUES (?, ?, ?, ?, ?)",
+                (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0))
+    conn.commit()
+    conn.close()
+init_db()
+# ---------- PDF reading ----------
+def extract_text_from_pdf(pdf_path):
+    """Return the text of a PDF's pages joined with newlines (best effort).
+
+    A page whose extraction raises is skipped. If opening or iterating the
+    PDF fails, the error is printed and whatever text was collected so far
+    (possibly an empty string) is returned instead of raising.
+    """
+    pages = []
+    try:
+        reader = PdfReader(pdf_path)
+        for page in reader.pages:
             try:
+                # Fall back to "" when extract_text() returns a falsy value.
+                txt = page.extract_text() or ""
+                pages.append(txt)
             except Exception:
+                continue
+    except Exception as e:
+        print("PDF read error:", e)
+    return "\n".join(pages)
+def load_all_pdfs(pdf_dir):
+    """Read every ``.pdf`` file directly inside *pdf_dir*.
+
+    Returns:
+        A ``(texts, metas)`` pair of parallel lists: the extracted text of
+        each PDF and a ``{"source": filename}`` dict for it. Both lists are
+        empty when *pdf_dir* is not a directory.
+    """
+    texts = []
+    metas = []
+    if not os.path.isdir(pdf_dir):
+        print("PDF_DIR not found:", pdf_dir)
+        return texts, metas
+    # sorted() keeps the file order, and therefore chunk order, deterministic.
+    for fname in sorted(os.listdir(pdf_dir)):
+        if fname.lower().endswith(".pdf"):
+            path = os.path.join(pdf_dir, fname)
+            print("Reading:", path)
+            text = extract_text_from_pdf(path)
+            texts.append(text)
+            metas.append({"source": fname})
+    return texts, metas
+def split_text_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+    if not text:
         return []
+    chunks = []
+    step = chunk_size - overlap
+    i = 0
+    while i < len(text):
+        chunk = text[i:i+chunk_size]
+        if chunk.strip():
+            chunks.append(chunk)
+        i += max(step, 1)
+    return chunks
+# ---------- Build TF-IDF index ----------
+# Import-time index build: read every PDF, chunk it, and fit a TF-IDF matrix
+# over the chunks. corpus_chunks[i] and corpus_metas[i] describe the same chunk.
+print("Loading PDFs and building TF-IDF index...")
+all_texts, all_metas = load_all_pdfs(PDF_DIR)
+corpus_chunks = []
+corpus_metas = []
+for text, meta in zip(all_texts, all_metas):
+    chs = split_text_into_chunks(text)
+    corpus_chunks.extend(chs)
+    # Every chunk of a file shares that file's metadata dict.
+    corpus_metas.extend([meta] * len(chs))
+if len(corpus_chunks) == 0:
+    print("No PDF chunks found. Upload PDFs into pdfs/class10/")
+# Both stay None when there are no chunks or fitting fails; retrieve_tfidf
+# checks for that and returns no results.
+vectorizer = None
+tfidf_matrix = None
+if corpus_chunks:
     try:
+        # NOTE(review): stop_words="english" and the default tokenizer are
+        # English-oriented; confirm they tokenise Assamese text acceptably.
+        vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
+        tfidf_matrix = vectorizer.fit_transform(corpus_chunks)
+        print("TF-IDF ready. Chunks:", len(corpus_chunks))
     except Exception as e:
+        print("Failed to build TF-IDF:", e)
+        vectorizer = None
+        tfidf_matrix = None
+def retrieve_tfidf(query, top_k=TOP_K):
+    """Return up to *top_k* corpus chunks most similar to *query*.
+
+    Returns:
+        A list of ``{"score", "text", "meta"}`` dicts ordered from highest to
+        lowest score. Chunks with a score of zero or less are dropped, and the
+        list is empty when no index was built.
+    """
+    if tfidf_matrix is None or vectorizer is None:
+        return []
+    qv = vectorizer.transform([query])
+    # One similarity score per corpus chunk.
+    sims = linear_kernel(qv, tfidf_matrix).flatten()
+    # argsort is ascending, so reverse it and keep the first top_k indices.
+    idxs = sims.argsort()[::-1][:top_k]
+    results = []
+    for idx in idxs:
+        if sims[idx] <= 0:
+            continue
+        results.append({"score": float(sims[idx]), "text": corpus_chunks[idx], "meta": corpus_metas[idx]})
+    return results
+# ---------- OCR and math ----------
+def ocr_from_image(img: Image.Image):
+    """Run Tesseract OCR on *img* and return the stripped text.
+
+    Tries the ``asm+eng`` language pair first, then pytesseract's default
+    language, and returns ``""`` if both attempts raise.
+    """
+    try:
+        img = img.convert("RGB")
+    except Exception:
+        # Conversion is best-effort; fall through with the image as given.
+        pass
+    try:
+        text = pytesseract.image_to_string(img, lang="asm+eng")
+    except Exception:
+        try:
+            # Retry without an explicit language.
+            text = pytesseract.image_to_string(img)
+        except Exception:
+            text = ""
+    return text.strip()
+def is_likely_math(text: str) -> bool:
+    if not text:
+        return False
+    math_chars = set("0123456789+-*/=^()%")
+    if any(ch in text for ch in math_chars):
+        return True
+    kws = ["গণিত", "সমীকৰণ", "বীজগণিত", "math", "solve", "equation"]
+    return any(k in text for k in kws)
+def solve_math_expression(expr: str):
+    """Solve an equation or simplify an expression and return display text.
+
+    ``^`` is treated as exponentiation. Text containing ``=`` is split at the
+    first ``=`` and solved as an equation; anything else is simplified. Any
+    failure returns a fixed "could not understand" message instead of raising.
+    """
+    try:
+        expr = expr.replace("^", "**")
+        # SECURITY NOTE(review): sympify() evaluates its string argument and
+        # `expr` comes straight from user input / OCR. Consider restricting
+        # or validating the input before parsing.
+        if "=" in expr:
+            left, right = expr.split("=", 1)
+            eq = sp.Eq(sp.sympify(left), sp.sympify(right))
+            sol = sp.solve(eq)
+            return "ধাপ-ধাপে সমাধান (সংক্ষেপ):\n" + str(sol)
+        else:
+            simp = sp.simplify(sp.sympify(expr))
+            return f"সরলীকৰণ: {simp}"
+    except Exception:
+        return "গণিতীয় অভিব্যক্তি বুজা যায় নাই — দয়া কৰি সঠিকভাৱে লিখক।"
+# ---------- Answering (extractive) ----------
+def answer_with_retrieval(query, chat_history):
+    """Build an extractive answer from the top TF-IDF matches for *query*.
+
+    Each matching chunk is prefixed with its source file name and the chunks
+    are joined with blank lines. Returns a fixed "no information found"
+    message when nothing matches. *chat_history* is accepted but not used.
+    """
+    results = retrieve_tfidf(query, top_k=TOP_K)
+    if not results:
+        return "পাঠ্যপুথি সম্বন্ধীয় তথ্য নহল; দয়া কৰি অধিক স্পষ্টকৈ সোধক।"
+    # Combine top chunks as extractive answer (shorten if too long)
+    answer_parts = []
+    for r in results:
+        txt = r["text"].strip()
+        if len(txt) > 800:
+            # Cut at 800 characters, then back up to the last newline so the
+            # excerpt does not end mid-line.
+            txt = txt[:800].rsplit("\n", 1)[0] + "…"
+        answer_parts.append(f"[Source: {r['meta'].get('source','textbook')}] \n{txt}")
+    return "\n\n".join(answer_parts)
+# ---------- Chat logic ----------
+def login_user(username, user_state):
+    username = (username or "").strip()
+    if not username:
+        return user_state, "⚠️ অনুগ্ৰহ কৰি লগিনৰ বাবে এটা নাম লিখক।"
+    user_id = get_or_create_user(username)
+    user_state = {"username": username, "user_id": user_id}
+    total, math_count = 0, 0
+    try:
+        total, math_count = (lambda uid: (lambda c,m: (c,m))( * (lambda cur: (cur.execute("SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?", (uid,)), cur.fetchone())[1] ) )(uid) )(user_id)
+    except Exception:
+        total, math_count = get_or_create_user(username) and (0,0)
+    stats = f"👤 {username}\n📊 মোট প্ৰশ্ন: {total}\n🧮 গণিত: {math_count}"
+    return user_state, stats
+def chat_logic(username, text_input, image_input, audio_input, chat_history, user_state):
+    if chat_history is None:
+        chat_history = []
+    if not user_state or not user_state.get("user_id"):
+        sys_msg = "⚠️ প্ৰথমে লগিন কৰক।"
+        chat_history = chat_history + [[text_input or "", sys_msg]]
+        return chat_history, user_state, ""
+    user_id = user_state["user_id"]
+    final_query_parts = []
+    ocr_text = ""
+    if image_input:
+        try:
+            if isinstance(image_input, str):
+                img = Image.open(image_input)
+            else:
+                raw = image_input.read()
+                img = Image.open(io.BytesIO(raw))
+            ocr_text = ocr_from_image(img)
             if ocr_text:
+                final_query_parts.append(ocr_text)
+        except Exception:
+            pass
+    if text_input:
+        final_query_parts.append(text_input)
+    if not final_query_parts:
+        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক বা ছবি আপলোড কৰক।"
+        chat_history = chat_history + [["", sys_msg]]
+        return chat_history, user_state, ""
+    full_query = "\n".join(final_query_parts)
+    is_math_flag = is_likely_math(full_query)
+    if is_math_flag:
+        math_answer = solve_math_expression(full_query)
+        # Use extractive retrieval to provide supporting text and then math result
+        retrieval = answer_with_retrieval(full_query, chat_history)
+        final_answer = f"{retrieval}\n\nগণিত সমাধান:\n{math_answer}"
+    else:
+        final_answer = answer_with_retrieval(full_query, chat_history)
+    log_interaction(user_id, full_query, final_answer, is_math_flag)
+    display_q = text_input or ocr_text or "(image)"
+    chat_history = chat_history + [[display_q, final_answer]]
+    return chat_history, user_state, ""
+# ---------- Gradio UI ----------
+with gr.Blocks(title=APP_NAME) as demo:
+    gr.Markdown("# 🧭 Jajabor – Minimal TF-IDF Tutor (Free)")
+    user_state = gr.State({})
+    with gr.Row():
+        with gr.Column(scale=1):
+            username_inp = gr.Textbox(label="নাম / ইউজাৰ আইডি", placeholder="e.g. abu10")
+            login_btn = gr.Button("Login")
+            stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।")
+        with gr.Column(scale=3):
+            chat = gr.Chatbot(label="জাজাবৰ", height=480)
+            text_inp = gr.Textbox(label="আপোনাৰ প্ৰশ্ন লিখক", lines=2)
+            with gr.Row():
+                image_inp = gr.Image(label="📷 ছবি (Optional)", type="filepath")
+                audio_inp = gr.Audio(label="🎙️ (Optional)", type="filepath")
+            ask_btn = gr.Button("সোধক")
+    login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])
+    ask_btn.click(chat_logic, inputs=[username_inp, text_inp, image_inp, audio_inp, chat, user_state], outputs=[chat, user_state, None])
+    text_inp.submit(chat_logic, inputs=[username_inp, text_inp, image_inp, audio_inp, chat, user_state], outputs=[chat, user_state, None])
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)