sugiv committed
Commit 452c0e2
Parent: 170c3c0

Adding initial set of files

Files changed (4)
  1. app.py +175 -0
  2. backend.py +163 -0
  3. enriched_pdf.pkl +3 -0
  4. pdf_classes.py +25 -0
app.py ADDED
@@ -0,0 +1,175 @@
+ import gradio as gr
+ from backend import get_answer
+ import pickle
+ import re
+ import os
+
+ from pdf_classes import PDFSegment, PDFPage, RichPDFDocument
+
+ def load_enriched_pdf(file_path):
+     with open(file_path, 'rb') as f:
+         return pickle.load(f)
+
+ # Load the enriched PDF
+ enriched_pdf = load_enriched_pdf('enriched_pdf.pkl')
+
+
+ # Access API tokens from environment variables
+ jina_api_token = os.getenv('JINA_API_TOKEN')
+ gpt4_api_key = os.getenv('GPT4_API_KEY')
+ pinecone_api_key = os.getenv('PINECONE_API_KEY')
+
+ # Initialize Pinecone with environment variables
+ os.environ["PINECONE_API_KEY"] = pinecone_api_key
+ os.environ["PINECONE_ENVIRONMENT"] = "us-east-1"
+
+ # Sample data for papers (5 papers for the grid)
+ papers = [
+     {"id": "1", "title": "Attention Is All You Need", "authors": "Vaswani et al.", "year": 2017},
+     {"id": "2", "title": "BERT", "authors": "Devlin et al.", "year": 2018},
+     {"id": "3", "title": "GPT-3", "authors": "Brown et al.", "year": 2020},
+     {"id": "4", "title": "Transformer-XL", "authors": "Dai et al.", "year": 2019},
+     {"id": "5", "title": "T5", "authors": "Raffel et al.", "year": 2020},
+ ]
+
+ predefined_questions = {
+     '1': [
+         "Explain Equation 1 in layman's terms and explain each of its components.",
+         'List the authors who contributed to the paper in order, from left to right, then top to bottom.',
+         'Explain Figure 2 from left to right, and describe the flow of the diagram.',
+         'Explain the position-wise feed-forward networks and Equation 2.',
+         'Please summarize the findings from Table 1.',
+         'Explain the optimizer used and explain Equation 3.',
+         'What is the BLEU score for the Transformer model in Table 2?',
+         'What does Figure 1 illustrate about the overall architecture of the Transformer model?',
+         'How does Figure 2 depict the difference between Scaled Dot-Product Attention and Multi-Head Attention?',
+         'Based on Figure 1, how many encoder and decoder layers are used in the Transformer model?',
+         'What mathematical formula is shown in Figure 2 for Scaled Dot-Product Attention?',
+         'According to Table 1, how does the complexity of Self-Attention compare to Recurrent and Convolutional layers?',
+         'What does Table 2 reveal about the BLEU scores and training costs of the Transformer compared to other models?',
+         "How does Table 3 visualize the impact of different model variations on the Transformer's performance?",
+         'What does Equation 3 in the paper represent, and how is it visually presented?',
+         'Can you describe the sinusoidal function used for positional encoding as shown in the equations in Section 3.5?',
+         "How does Figure 1 illustrate the flow of information in the Transformer's encoder-decoder structure?"
+     ]
+ }
+
+
+ css = """
+ body { font-family: Arial, sans-serif; }
+ .container { max-width: 800px; margin: 0 auto; padding: 20px; }
+ .hero { text-align: center; margin-bottom: 30px; }
+ .paper-grid { display: grid; grid-template-columns: repeat(5, 1fr); gap: 10px; margin-bottom: 30px; }
+ .paper-tile { background-color: white; border: 2px solid #ddd; border-radius: 8px; padding: 10px; cursor: pointer; transition: all 0.3s; }
+ .paper-tile:hover { transform: translateY(-5px); box-shadow: 0 5px 15px rgba(0,0,0,0.1); }
+ .paper-tile.selected { border-color: #007bff; background-color: #e6f3ff; }
+ .paper-tile h3 { margin-top: 0; font-size: 14px; }
+ .paper-tile p { margin: 5px 0; font-size: 12px; color: #666; }
+ #chat-area { background-color: white; border-radius: 8px; padding: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
+ """
+
+ def update_predefined_questions(paper_id):
+     if paper_id in predefined_questions:
+         return gr.Dropdown(choices=predefined_questions[paper_id], visible=True)
+     return gr.Dropdown(choices=[], visible=False)
+
+ def format_answer(answer):
+     # Convert LaTeX-style math delimiters to Markdown-style for gr.Markdown
+     answer = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', answer)
+     answer = re.sub(r'\\\((.*?)\\\)', r'$\1$', answer)
+
+     # Put headers and standalone bold lines in their own paragraphs
+     lines = answer.split('\n')
+     formatted_lines = []
+     for line in lines:
+         if line.startswith('###'):
+             formatted_lines.append(f"\n{line}\n")
+         elif line.startswith('**') and line.endswith('**'):
+             formatted_lines.append(f"\n{line}\n")
+         else:
+             formatted_lines.append(line)
+
+     # Join lines back together
+     formatted_answer = '\n'.join(formatted_lines)
+
+     # Add spacing around double-backslash-delimited math blocks
+     formatted_answer = re.sub(r'(\\\\.*?\\\\)', r'\n\1\n', formatted_answer)
+
+     return formatted_answer
+
+ def update_chat_area(paper_id, predefined_question):
+     if not paper_id:
+         return "Please select a paper first."
+
+     selected_paper = next((p for p in papers if p['id'] == paper_id), None)
+     if not selected_paper:
+         return "Invalid paper selection."
+
+     if selected_paper['id'] != '1':
+         return "This paper will be supported soon."
+
+     if not predefined_question:
+         return "Please select a predefined question."
+
+     # Call the backend function to get the answer
+     answer = get_answer(predefined_question, enriched_pdf, jina_api_token, gpt4_api_key)
+     return format_answer(answer) if answer else "Failed to generate an answer. Please try again."
+
+ with gr.Blocks(css=css) as demo:
+     gr.HTML('''
+     <div class="hero">
+         <h1>AI Paper Q&A</h1>
+         <p>Select a paper and ask questions about it</p>
+     </div>
+     ''')
+
+     paper_id_input = gr.Textbox(visible=False)
+
+     with gr.Row():
+         paper_tiles = gr.Radio(
+             choices=[f"{p['title']} ({p['authors']}, {p['year']})" for p in papers],
+             label="Select a paper",
+             info="Choose one of the papers to ask questions about."
+         )
+
+     predefined_question_dropdown = gr.Dropdown(label="Select a predefined question", choices=[], visible=False)
+     custom_question_input = gr.Textbox(
+         label="Or ask your own question here...",
+         value="Will be supported later after adding a prompt guard",
+         interactive=False
+     )
+
+     submit_btn = gr.Button("Submit")
+
+     chat_output = gr.Markdown(label="Answer")
+
+     def update_chat_area_with_loading(paper_id, predefined_question):
+         # Display a loading message while the backend processes the question
+         loading_message = "**Generating answer...**"
+
+         # Yield the loading message first so the UI shows progress
+         yield loading_message
+
+         # Call the actual function and yield its result
+         yield update_chat_area(paper_id, predefined_question)
+
+     paper_tiles.change(
+         fn=lambda x: next((p['id'] for p in papers if f"{p['title']} ({p['authors']}, {p['year']})" == x), None),
+         inputs=[paper_tiles],
+         outputs=[paper_id_input]
+     )
+
+     paper_id_input.change(
+         fn=update_predefined_questions,
+         inputs=[paper_id_input],
+         outputs=[predefined_question_dropdown]
+     )
+
+     submit_btn.click(
+         fn=update_chat_area_with_loading,
+         inputs=[paper_id_input, predefined_question_dropdown],
+         outputs=[chat_output]
+     )
+
+ if __name__ == '__main__':
+     demo.launch()
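
Note that app.py's `format_answer` inverts the delimiter conversion applied by backend.py's `format_answer` (see below): the backend normalizes Markdown-style math (`$...$`, `$$...$$`) to LaTeX-style (`\(...\)`, `\[...\]`), and the app converts it back for `gr.Markdown` rendering. A minimal standalone sketch of that round trip; the helper names and sample string here are made up for illustration:

```python
import re

# backend.py direction: $$...$$ -> \[...\] and $...$ -> \(...\)
def to_latex_style(s: str) -> str:
    s = re.sub(r'\$\$(.*?)\$\$', r'\\[\1\\]', s)
    return re.sub(r'\$(.*?)\$', r'\\(\1\\)', s)

# app.py direction: \[...\] -> $$...$$ and \(...\) -> $...$
def to_markdown_style(s: str) -> str:
    s = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', s)
    return re.sub(r'\\\((.*?)\\\)', r'$\1$', s)

sample = "The attention weights are $softmax(QK^T / sqrt(d_k))V$."
assert to_markdown_style(to_latex_style(sample)) == sample  # the two passes cancel out
```

Since `get_answer` already returns backend-formatted text, the double pass is mostly a no-op on delimiters; the header-spacing logic, however, runs twice.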
backend.py ADDED
@@ -0,0 +1,163 @@
+ import requests
+ import numpy as np
+ import json
+ import os
+ import re
+ import base64
+ import io
+
+ from openai import OpenAI
+ from pinecone import Pinecone
+ from PIL import Image as PILImage
+ from docarray import BaseDoc
+ from docarray import DocList
+ from docarray.typing import ImageTensor, NdArray
+ from typing import List, Dict, Optional
+ from pdf_classes import RichPDFDocument
+
+ # Access environment variables individually and pass them as separate arguments
+ pc = Pinecone(
+     api_key=os.environ["PINECONE_API_KEY"],
+     environment=os.environ["PINECONE_ENVIRONMENT"]
+ )
+ print("Connected to Pinecone")
+ index = pc.Index("rich-pdf-late-chunks")
+
+
+ def create_question_embedding(question: str, api_token: str) -> np.ndarray:
+     url = 'https://api.jina.ai/v1/embeddings'
+     headers = {
+         'Content-Type': 'application/json',
+         'Authorization': f'Bearer {api_token}'
+     }
+     data = {
+         "model": "jina-clip-v1",
+         "input": [{"text": question}]
+     }
+     response = requests.post(url, headers=headers, json=data)
+     if response.status_code == 200:
+         result = response.json()
+         return np.array(result['data'][0]['embedding'])
+     else:
+         raise Exception(f"Error creating embedding: {response.text}")
+
+
+ def create_few_shot_prompt(question: str, rich_pdf: RichPDFDocument, pinecone_index, api_token: str, top_k: int = 3):
+     prompt = f"Question: {question}\n\n"
+     prompt += "Here are relevant excerpts from the document:\n\n"
+     image_data = []
+     included_pages = set()
+
+     question_embedding = create_question_embedding(question, api_token)
+     results = pinecone_index.query(vector=question_embedding.tolist(), top_k=top_k, include_metadata=True)
+
+     for i, match in enumerate(results['matches'], 1):
+         metadata = match['metadata']
+         segment_types = metadata['segment_types'].split(',')
+         page_numbers = [int(pn) for pn in metadata['page_numbers'].split(',')]
+
+         # Handle potential JSON decoding errors
+         try:
+             contents = json.loads(metadata['contents'])
+         except json.JSONDecodeError:
+             contents = [metadata['contents']]  # Treat as a single content item if JSON decoding fails
+
+         prompt += f"Excerpt {i}:\n"
+         prompt += f"Pages: {', '.join(map(str, page_numbers))}\n"
+         prompt += f"Types: {', '.join(segment_types)}\n"
+
+         for j, content in enumerate(contents, 1):
+             if isinstance(content, str) and '[Image' in content:
+                 prompt += f"Image content {j}: {content}\n"
+             else:
+                 prompt += f"Text content {j}: {content[:200]}...\n"  # Limit text content to 200 characters
+
+         prompt += "\n"
+
+         # Add only one full-page screenshot as a reference
+         if not included_pages and page_numbers:
+             page_num = page_numbers[0]
+             prompt += f"\nFull-page context for Page {page_num + 1}: [Full-page screenshot]\n"
+             buffered = io.BytesIO()
+             PILImage.fromarray(rich_pdf.pages[page_num].screenshot).save(buffered, format="PNG")
+             img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+             image_data.append({
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/png;base64,{img_base64}",
+                     "detail": "low"
+                 }
+             })
+             included_pages.add(page_num)
+
+     prompt += "\nInstructions for answering the question:\n"
+     prompt += "1. Carefully review all provided excerpts.\n"
+     prompt += "2. Use the full-page screenshot to understand the overall context.\n"
+     prompt += "3. Refer to specific excerpts in your answer when applicable.\n"
+     prompt += "4. If the question asks for specific information, provide a clear and concise answer.\n"
+     prompt += "5. If the answer isn't directly stated, use the context to infer the most likely answer.\n\n"
+     prompt += f"Now, please answer the following question based on the provided information:\n{question}\n"
+
+     return prompt, image_data
+
+
+ def query_gpt4o(question: str, rich_pdf, pinecone_index, api_token: str, gpt4_api_key: str):
+     client = OpenAI(api_key=gpt4_api_key)
+
+     prompt, image_data = create_few_shot_prompt(question, rich_pdf, pinecone_index, api_token)
+     content_list = [{"type": "text", "text": prompt}] + image_data
+
+     try:
+         response = client.chat.completions.create(
+             model="gpt-4o",  # Vision-capable model; images are passed as image_url content parts
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are an advanced AI assistant capable of analyzing various types of documents, including but not limited to research papers, financial reports, and general texts. Your task is to provide accurate and relevant answers to questions by carefully examining both textual and visual information provided from the document. When appropriate, cite specific excerpts, images, or page numbers in your responses. Explain your reasoning clearly, especially when making inferences or connections between different parts of the document."
+                 },
+                 {
+                     "role": "user",
+                     "content": content_list
+                 }
+             ],
+             max_tokens=500  # Increased token limit for more detailed responses
+         )
+         return response.choices[0].message.content
+     except Exception as e:
+         print(f"Failed to execute GPT-4o query: {e}")
+         return None
+
+
+ def format_answer(answer):
+     # Convert Markdown-style math delimiters to LaTeX-style ($$..$$ -> \[..\], $..$ -> \(..\))
+     answer = re.sub(r'\$\$(.*?)\$\$', r'\\[\1\\]', answer)
+     answer = re.sub(r'\$(.*?)\$', r'\\(\1\\)', answer)
+
+     # Put headers and standalone bold lines in their own paragraphs
+     lines = answer.split('\n')
+     formatted_lines = []
+     for line in lines:
+         if line.startswith('###'):
+             formatted_lines.append(f"\n{line}\n")
+         elif line.startswith('**') and line.endswith('**'):
+             formatted_lines.append(f"\n{line}\n")
+         else:
+             formatted_lines.append(line)
+
+     # Join lines back together
+     formatted_answer = '\n'.join(formatted_lines)
+
+     # Add spacing around double-backslash-delimited math blocks
+     formatted_answer = re.sub(r'(\\\\.*?\\\\)', r'\n\1\n', formatted_answer)
+
+     return formatted_answer
+
+
+ # Example usage function
+ def get_answer(question: str, enriched_pdf: RichPDFDocument, jina_api_token: str, gpt4_api_key: str):
+     answer_generated = query_gpt4o(question, enriched_pdf, index, jina_api_token, gpt4_api_key)
+
+     if answer_generated:
+         return format_answer(answer_generated)
+     else:
+         return None
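
`create_few_shot_prompt` assumes each Pinecone match carries `segment_types` and `page_numbers` as comma-separated strings and `contents` as a JSON-encoded list (with a plain-string fallback). The indexing script is not part of this commit, so the following upsert is a hypothetical sketch of that metadata shape, using the module-level `index` from backend.py:

```python
import json

# Hypothetical example chunk; the real pipeline would use a jina-clip-v1
# embedding (768-d) computed for the chunk rather than this placeholder.
chunk_embedding = [0.0] * 768

index.upsert(vectors=[{
    "id": "chunk-0",
    "values": chunk_embedding,
    "metadata": {
        "segment_types": "text,image",  # read back with .split(',')
        "page_numbers": "0,1",          # comma-separated, 0-based page indices
        "contents": json.dumps([         # JSON list; a raw string is the fallback path
            "The dominant sequence transduction models are based on...",
            "[Image 1: Transformer architecture diagram]",
        ]),
    },
}])
```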
enriched_pdf.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e35595825f67b68880e9c7b3dec8ca15a3d92422a27e911974df592d89b39a8b
+ size 30402603
pdf_classes.py ADDED
@@ -0,0 +1,25 @@
+ # pdf_classes.py
+ from docarray import BaseDoc
+ from docarray import DocList
+ from docarray.typing import ImageTensor, NdArray
+ from typing import Dict, Optional
+
+ class PDFSegment(BaseDoc):
+     page_number: int
+     segment_type: str  # 'text', 'image', 'table', or 'hybrid'
+     content: Optional[str]
+     image: Optional[ImageTensor]
+     position: Dict[str, int]  # {x, y, width, height}
+     relationships: Dict[str, Optional[str]]  # {'prev': id, 'next': id, 'parent': id}
+     embedding: Optional[NdArray[768]]
+
+ class PDFPage(BaseDoc):
+     page_number: int
+     screenshot: ImageTensor
+     embedding: Optional[NdArray[768]] = None
+
+ class RichPDFDocument(BaseDoc):
+     file_path: str
+     num_pages: int
+     segments: DocList[PDFSegment]
+     pages: DocList[PDFPage]
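
For orientation, a minimal sketch of loading and inspecting the pickled document with these classes; pdf_classes must be importable so pickle can resolve the types, and the field access follows the definitions above (the printed values are illustrative):

```python
import pickle

from pdf_classes import RichPDFDocument  # needed for unpickling

with open('enriched_pdf.pkl', 'rb') as f:
    doc: RichPDFDocument = pickle.load(f)

print(doc.file_path, doc.num_pages)

for seg in doc.segments[:3]:
    # segment_type is one of 'text', 'image', 'table', 'hybrid'
    print(seg.page_number, seg.segment_type, (seg.content or '')[:60])

# Each page stores a full-page screenshot as an ndarray-like ImageTensor
print(doc.pages[0].screenshot.shape)
```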