ADKU committed
Commit 7ef58b7 · verified · Parent: 23c923c

Update app.py

Files changed (1)
  1. app.py +195 -103
app.py CHANGED
@@ -8,6 +8,8 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModel
 import google.generativeai as genai
 import logging
+from PyPDF2 import PdfReader
+import io
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
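The two new imports support the PDF-upload path added further down. As far as this diff shows, only `PdfReader` is actually used; the sketch below is a hypothetical standalone use illustrating both imports (`"paper.pdf"` is a placeholder path, not part of the commit):

```python
# Hypothetical sketch of what the new imports enable; PdfReader also
# accepts a file path or file object directly, as the committed code does.
import io
from PyPDF2 import PdfReader

with open("paper.pdf", "rb") as f:            # placeholder path
    reader = PdfReader(io.BytesIO(f.read()))  # io.BytesIO wraps raw bytes as a stream
text = "".join(page.extract_text() or "" for page in reader.pages)
```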
@@ -64,8 +66,8 @@ except Exception as e:
     logger.error(f"Model loading failed: {e}")
     raise
 
-# Generate SciBERT embeddings
-def generate_embeddings_sci_bert(texts, batch_size=32):
+# Generate SciBERT embeddings (optimized with larger batch size)
+def generate_embeddings_sci_bert(texts, batch_size=64):  # Increased batch size for efficiency
     try:
         all_embeddings = []
         for i in range(0, len(texts), batch_size):
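The hunk truncates the body of `generate_embeddings_sci_bert`. For orientation, a minimal sketch of a batched SciBERT embedder, assuming mean pooling over the last hidden state (the actual pooling used in app.py is not visible in this diff):

```python
# Minimal batched SciBERT embedder; mean pooling is an assumption.
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

def embed(texts, batch_size=64):
    out = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            enc = tokenizer(batch, padding=True, truncation=True,
                            max_length=512, return_tensors="pt")
            hidden = model(**enc).last_hidden_state            # (B, T, 768)
            mask = enc["attention_mask"].unsqueeze(-1).float()  # (B, T, 1)
            pooled = (hidden * mask).sum(1) / mask.sum(1)       # mean over real tokens
            out.append(pooled.cpu().numpy())
    return np.vstack(out)
```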
@@ -94,7 +96,7 @@ except Exception as e:
     logger.error(f"FAISS index creation failed: {e}")
     raise
 
-# Hybrid search function (return indices instead of truncated strings)
+# Hybrid search function (unchanged from original)
 def get_relevant_papers(query):
     if not query.strip():
         return [], "Please enter a search query."
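The ranking that `get_relevant_papers` applies (visible in the next hunk) unions the FAISS hits with the top BM25 hits, then orders the pooled candidates by BM25 score. A toy run of that fusion with hypothetical scores:

```python
import numpy as np

bm25_scores = np.array([0.1, 2.3, 0.7, 1.5])  # hypothetical per-document scores
faiss_hits = [0, 2]                            # hypothetical dense-retrieval indices
bm25_top = np.argsort(bm25_scores)[::-1][:2]   # highest BM25 first -> [1, 3]
pool = set(faiss_hits) | set(bm25_top)         # union of both retrievers
ranked = sorted(pool, key=lambda i: -bm25_scores[i])  # -> [1, 3, 2, 0]
```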
@@ -106,73 +108,127 @@ def get_relevant_papers(query):
         bm25_top_indices = np.argsort(bm25_scores)[::-1][:5]
         combined_indices = list(set(indices[0]) | set(bm25_top_indices))
         ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
-        # Return formatted strings for dropdown and indices for full data
         papers = [f"{i+1}. {df.iloc[idx]['title']} - Abstract: {df.iloc[idx]['abstract'][:200]}..." for i, idx in enumerate(ranked_results[:5])]
         return papers, ranked_results[:5], "Search completed."
     except Exception as e:
         logger.error(f"Search failed: {e}")
         return [], [], "Search failed. Please try again."
 
-# Gemini API QA function with full context
-def answer_question(selected_index, question, history):
-    if selected_index is None:
-        return [(question, "Please select a paper first!")], history
+# Process uploaded PDF for RAG
+def process_uploaded_pdf(file):
+    try:
+        pdf_reader = PdfReader(file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text() or ""
+        cleaned_text = clean_text(text)
+        chunks = [cleaned_text[i:i+1000] for i in range(0, len(cleaned_text), 1000)]  # Chunk for efficiency
+        embeddings = generate_embeddings_sci_bert(chunks)
+        faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
+        faiss_index.add(embeddings.astype(np.float32))
+        tokenized_chunks = [chunk.split() for chunk in chunks]
+        bm25_rag = BM25Okapi(tokenized_chunks)
+        return {"chunks": chunks, "embeddings": embeddings, "faiss_index": faiss_index, "bm25": bm25_rag}, "Document processed successfully"
+    except Exception as e:
+        logger.error(f"PDF processing failed: {e}")
+        return None, "Failed to process document"
+
+# Hybrid search for RAG
+def get_relevant_chunks(query, uploaded_doc):
+    if not query.strip():
+        return [], "Please enter a question."
+    try:
+        query_embedding = generate_embeddings_sci_bert([query])
+        distances, indices = uploaded_doc["faiss_index"].search(query_embedding.astype(np.float32), 3)
+        bm25_scores = uploaded_doc["bm25"].get_scores(query.split())
+        combined_indices = list(set(indices[0]) | set(np.argsort(bm25_scores)[::-1][:3]))
+        ranked_results = sorted(combined_indices, key=lambda idx: -bm25_scores[idx])
+        return [uploaded_doc["chunks"][idx] for idx in ranked_results[:3]], "Retrieval completed."
+    except Exception as e:
+        logger.error(f"RAG retrieval failed: {e}")
+        return [], "Retrieval failed."
+
+# Unified QA function
+def answer_question(mode, selected_index, question, history, uploaded_doc=None):
     if not question.strip():
         return [(question, "Please ask a question!")], history
     if question.lower() in ["exit", "done"]:
-        return [("Conversation ended.", "Select a new paper or search again!")], []
+        return [("Conversation ended.", "Start a new conversation!")], []
 
     try:
-        # Get full paper data from DataFrame using index
-        paper_data = df.iloc[selected_index]
-        title = paper_data["title"]
-        abstract = paper_data["abstract"]  # Full abstract, not truncated
-        authors = ", ".join(paper_data["authors"])
-        doi = paper_data["doi"]
-
-        # Build prompt with all fields
-        prompt = (
-            "You are Dr. Sage, the world's most brilliant and reliable research assistant, specializing in machine learning, deep learning, and agriculture. "
-            "Your goal is to provide concise, accurate, and well-structured answers based on the given paper's details. "
-            "When asked about tech stacks or methods, follow these guidelines:\n"
-            "1. If the abstract explicitly mentions technologies (e.g., Python, TensorFlow), list them precisely with brief explanations.\n"
-            "2. If the abstract is vague (e.g., 'machine learning techniques'), infer the most likely tech stacks based on the context of crop prediction and modern research practices, and explain your reasoning.\n"
-            "3. Always respond in a clear, concise format—use bullet points for lists (e.g., tech stacks) and short paragraphs for explanations.\n"
-            "4. If the question requires prior conversation context, refer to it naturally to maintain coherence.\n"
-            "5. If the abstract lacks enough detail, supplement with plausible, domain-specific suggestions and note they are inferred.\n"
-            "6. Avoid speculation or fluff—stick to facts or educated guesses grounded in the field.\n\n"
-            "Here’s the paper:\n"
-            f"Title: {title}\n"
-            f"Authors: {authors}\n"
-            f"Abstract: {abstract}\n"
-            f"DOI: {doi}\n\n"
-        )
-
-        # Add history if present
-        if history:
-            prompt += "Previous conversation (use for context):\n"
-            for user_q, bot_a in history[-2:]:
-                prompt += f"User: {user_q}\nAssistant: {bot_a}\n"
-
-        prompt += f"Now, answer this question: {question}"
-
-        logger.info(f"Prompt sent to Gemini API: {prompt[:200]}...")
-
-        # Call Gemini API (Gemini 1.5 Flash)
-        model = genai.GenerativeModel("gemini-1.5-flash")
-        response = model.generate_content(prompt)
-        answer = response.text.strip()
-
-        # Fallback for poor responses
-        if not answer or len(answer) < 15:
-            answer = (
-                "The abstract doesn’t provide specific technologies, but based on crop prediction with machine learning and deep learning, likely tech stacks include:\n"
-                "- Python: Core language for ML/DL.\n"
-                "- TensorFlow or PyTorch: Frameworks for deep learning models.\n"
-                "- Scikit-learn: For traditional ML algorithms.\n"
-                "- Pandas/NumPy: For data handling and preprocessing."
+        if mode == "research":
+            if selected_index is None:
+                return [(question, "Please select a paper first!")], history
+            paper_data = df.iloc[selected_index]
+            title = paper_data["title"]
+            abstract = paper_data["abstract"]
+            authors = ", ".join(paper_data["authors"])
+            doi = paper_data["doi"]
+            prompt = (
+                "You are Dr. Sage, the world's most brilliant and reliable research assistant, specializing in machine learning, deep learning, and agriculture. "
+                "Your goal is to provide concise, accurate, and well-structured answers based on the given paper's details. "
+                "When asked about tech stacks or methods, follow these guidelines:\n"
+                "1. If the abstract explicitly mentions technologies (e.g., Python, TensorFlow), list them precisely with brief explanations.\n"
+                "2. If the abstract is vague (e.g., 'machine learning techniques'), infer the most likely tech stacks based on the context of crop prediction and modern research practices, and explain your reasoning.\n"
+                "3. Always respond in a clear, concise format—use bullet points for lists (e.g., tech stacks) and short paragraphs for explanations.\n"
+                "4. If the question requires prior conversation context, refer to it naturally to maintain coherence.\n"
+                "5. If the abstract lacks enough detail, supplement with plausible, domain-specific suggestions and note they are inferred.\n"
+                "6. Avoid speculation or fluff—stick to facts or educated guesses grounded in the field.\n\n"
+                "Here’s the paper:\n"
+                f"Title: {title}\n"
+                f"Authors: {authors}\n"
+                f"Abstract: {abstract}\n"
+                f"DOI: {doi}\n\n"
+            )
+            if history:
+                prompt += "Previous conversation (use for context):\n"
+                for user_q, bot_a in history[-2:]:
+                    prompt += f"User: {user_q}\nAssistant: {bot_a}\n"
+            prompt += f"Now, answer this question: {question}"
+            model = genai.GenerativeModel("gemini-1.5-flash")
+            response = model.generate_content(prompt)
+            answer = response.text.strip()
+            if not answer or len(answer) < 15:
+                answer = (
+                    "The abstract doesn’t provide specific technologies, but based on crop prediction with machine learning and deep learning, likely tech stacks include:\n"
+                    "- Python: Core language for ML/DL.\n"
+                    "- TensorFlow or PyTorch: Frameworks for deep learning models.\n"
+                    "- Scikit-learn: For traditional ML algorithms.\n"
+                    "- Pandas/NumPy: For data handling and preprocessing."
+                )
+
+        elif mode == "rag":
+            if uploaded_doc is None:
+                return [(question, "Please upload a document first!")], history
+            relevant_chunks, _ = get_relevant_chunks(question, uploaded_doc)
+            context = "\n".join(relevant_chunks)
+            prompt = (
+                "You are an expert AI assistant specializing in answering questions based on uploaded documents. "
+                "Provide concise, accurate answers based on the following document content:\n"
+                f"Content: {context}\n\n"
             )
-
+            if history:
+                prompt += "Previous conversation (use for context):\n"
+                for user_q, bot_a in history[-2:]:
+                    prompt += f"User: {user_q}\nAssistant: {bot_a}\n"
+            prompt += f"Now, answer this question: {question}"
+            model = genai.GenerativeModel("gemini-1.5-flash")
+            response = model.generate_content(prompt)
+            answer = response.text.strip()
+
+        else:  # general mode
+            prompt = (
+                "You are a highly knowledgeable AI assistant. Answer the following question concisely and accurately:\n"
+            )
+            if history:
+                prompt += "Previous conversation (use for context):\n"
+                for user_q, bot_a in history[-2:]:
+                    prompt += f"User: {user_q}\nAssistant: {bot_a}\n"
+            prompt += f"Question: {question}"
+            model = genai.GenerativeModel("gemini-1.5-flash")
+            response = model.generate_content(prompt)
+            answer = response.text.strip()
+
         history.append((question, answer))
         return history, history
     except Exception as e:
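Taken together, the new helpers form a small RAG pipeline: chunk the PDF, index the chunks in both FAISS and BM25, retrieve per question, and prompt Gemini with the retrieved context. A hypothetical end-to-end call sequence (`"my_paper.pdf"` is a placeholder path; assumes the Gemini API key is already configured as elsewhere in app.py):

```python
doc_state, status = process_uploaded_pdf("my_paper.pdf")  # placeholder path
print(status)                                             # "Document processed successfully"

chunks, _ = get_relevant_chunks("What dataset is used?", doc_state)
print(chunks[0][:100])                                    # first retrieved chunk, truncated

history, _ = answer_question("rag", None, "What dataset is used?", [], uploaded_doc=doc_state)
print(history[-1][1])                                     # Gemini's answer grounded in the chunks
```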
@@ -183,70 +239,106 @@ def answer_question(selected_index, question, history):
 # Gradio UI
 with gr.Blocks(
     css="""
-    .chatbot {height: 600px; overflow-y: auto;}
-    .sidebar {width: 300px;}
-    #main {display: flex; flex-direction: row;}
+    .chatbot {height: 500px; overflow-y: auto; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
+    .sidebar {width: 350px; padding: 15px; background: #f8f9fa; border-radius: 10px;}
+    #main {display: flex; flex-direction: row; gap: 20px; padding: 20px;}
+    .tab-content {padding: 20px; background: #ffffff; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);}
+    .gr-button {background: #007bff; color: white; border-radius: 5px; transition: background 0.3s;}
+    .gr-button:hover {background: #0056b3;}
+    h1 {color: #007bff; text-align: center; margin-bottom: 20px;}
     """,
-    theme=gr.themes.Default(primary_hue="blue")
+    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
 ) as demo:
-    gr.Markdown("# ResearchGPT - Paper Search & Chat")
+    gr.Markdown("# Triad: ResearchGPT, RAG, & General Chat")
+
     with gr.Row(elem_id="main"):
-        # Sidebar for search
-        with gr.Column(scale=1, min_width=300, elem_classes="sidebar"):
-            gr.Markdown("### Search Papers")
-            query_input = gr.Textbox(label="Enter your search query", placeholder="e.g., machine learning in healthcare")
-            search_btn = gr.Button("Search")
-            paper_dropdown = gr.Dropdown(label="Select a Paper", choices=[], interactive=True)
-            search_status = gr.Textbox(label="Search Status", interactive=False)
-
-            # States to store paper choices and indices
-            paper_choices_state = gr.State([])
-            paper_indices_state = gr.State([])
+        # Sidebar
+        with gr.Column(scale=1, min_width=350, elem_classes="sidebar"):
+            mode_tabs = gr.Tabs()
+            with mode_tabs:
+                # Research Mode (unchanged backend)
+                with gr.TabItem("Research Mode"):
+                    gr.Markdown("### Search Papers")
+                    query_input = gr.Textbox(label="Enter your search query", placeholder="e.g., machine learning in healthcare")
+                    search_btn = gr.Button("Search")
+                    paper_dropdown = gr.Dropdown(label="Select a Paper", choices=[], interactive=True)
+                    search_status = gr.Textbox(label="Search Status", interactive=False)
+                    paper_choices_state = gr.State([])
+                    paper_indices_state = gr.State([])
+
+                    search_btn.click(
+                        fn=get_relevant_papers,
+                        inputs=query_input,
+                        outputs=[paper_choices_state, paper_indices_state, search_status]
+                    ).then(
+                        fn=lambda choices: gr.update(choices=choices, value=None),
+                        inputs=paper_choices_state,
+                        outputs=paper_dropdown
+                    )
+
+                # RAG Mode
+                with gr.TabItem("RAG Mode"):
+                    gr.Markdown("### Upload Document")
+                    file_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
+                    upload_status = gr.Textbox(label="Upload Status", interactive=False)
+                    uploaded_doc_state = gr.State(None)
+                    file_upload.change(
+                        fn=process_uploaded_pdf,
+                        inputs=file_upload,
+                        outputs=[uploaded_doc_state, upload_status]
+                    )
+
+                # General Mode
+                with gr.TabItem("General Chat"):
+                    gr.Markdown("Ask anything, powered by Gemini!")
 
-        search_btn.click(
-            fn=get_relevant_papers,
-            inputs=query_input,
-            outputs=[paper_choices_state, paper_indices_state, search_status]
-        ).then(
-            fn=lambda choices: gr.update(choices=choices, value=None),
-            inputs=paper_choices_state,
-            outputs=paper_dropdown
-        )
-
         # Main chat area
-        with gr.Column(scale=3):
-            gr.Markdown("### Chat with Selected Paper")
-            selected_paper = gr.Textbox(label="Selected Paper", interactive=False)
+        with gr.Column(scale=3, elem_classes="tab-content"):
+            gr.Markdown("### Chat Area")
+            selected_display = gr.Markdown(label="Selected Context", value="Select a mode to begin!")
             chatbot = gr.Chatbot(label="Conversation", elem_classes="chatbot")
             question_input = gr.Textbox(label="Ask a question", placeholder="e.g., What methods are used?")
             chat_btn = gr.Button("Send")
 
-            # State to store conversation history and selected index
             history_state = gr.State([])
             selected_index_state = gr.State(None)
 
-            # Update selected paper and index
-            def update_selected_paper(choice, indices):
-                if choice is None:
-                    return "", None
-                index = int(choice.split(".")[0]) - 1  # Extract rank (e.g., "1." -> 0)
-                selected_idx = indices[index]
-                return choice, selected_idx
+            def update_display(mode, choice, indices, uploaded_doc):
+                if mode == "research" and choice:
+                    index = int(choice.split(".")[0]) - 1
+                    selected_idx = indices[index]
+                    paper = df.iloc[selected_idx]
+                    return f"**{paper['title']}**<br>DOI: [{paper['doi']}](https://doi.org/{paper['doi']})", selected_idx
+                elif mode == "rag" and uploaded_doc:
+                    return "Uploaded Document Ready", None
+                elif mode == "general":
+                    return "General Chat Mode", None
+                return "Select a mode to begin!", None
 
-            paper_dropdown.change(
-                fn=update_selected_paper,
-                inputs=[paper_dropdown, paper_indices_state],
-                outputs=[selected_paper, selected_index_state]
+            mode_tabs.select(
+                fn=lambda tab: ("research" if tab == "Research Mode" else "rag" if tab == "RAG Mode" else "general"),
+                inputs=None,
+                outputs=None,
+                _js="tab => tab"
+            ).then(
+                fn=update_display,
+                inputs=[mode_tabs, paper_dropdown, paper_indices_state, uploaded_doc_state],
+                outputs=[selected_display, selected_index_state]
             ).then(
                 fn=lambda: [],
                 inputs=None,
-                outputs=chatbot
+                outputs=[chatbot, history_state]
             )
-
-            # Handle chat
+
+            paper_dropdown.change(
+                fn=update_display,
+                inputs=[mode_tabs, paper_dropdown, paper_indices_state, uploaded_doc_state],
+                outputs=[selected_display, selected_index_state]
+            )
+
             chat_btn.click(
                 fn=answer_question,
-                inputs=[selected_index_state, question_input, history_state],
+                inputs=[mode_tabs, selected_index_state, question_input, history_state, uploaded_doc_state],
                 outputs=[chatbot, history_state]
             ).then(
                 fn=lambda: "",