Upload folder using huggingface_hub

- .gitignore +166 -0
- README.md +85 -7
- __init__.py +0 -0
- api/__init__.py +0 -0
- api/routes.py +0 -0
- chat_app.py +426 -0
- chat_history.db +0 -0
- config.py +43 -0
- core/__init__.py +0 -0
- core/document_processor.py +0 -0
- core/download_dataset.py +440 -0
- core/pineconeqa.py +142 -0
- core/rag_engine.py +242 -0
- core/s3_utils.py +267 -0
- core/voice_processor.py +73 -0
- database/__init__.py +0 -0
- database/vector_store.py +0 -0
- docker/Dockerfile +32 -0
- docker/docker-compose.yaml +0 -0
- main.py +20 -0
- qa_app.py +147 -0
- rag_existing_index.ipynb +191 -0
- requirements.txt +12 -0
- sandbox_testing.ipynb +1073 -0
- test_download_data.sh +39 -0
- tests/__init__.py +0 -0
- tests/test_pinecone.py +152 -0
- tests/test_pinecone_embeddings.ipynb +386 -0
- tests/test_pinecone_rag.py +166 -0
- tests/test_rag_pdf.ipynb +204 -0
- todo.md +2 -0
- utils/__init__.py +0 -0
- utils/helpers.py +0 -0
- utils/models.py +94 -0
.gitignore
ADDED
@@ -0,0 +1,166 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.pem
*.m4a
downloaded_pdfs/
*env/
README.md
CHANGED
@@ -1,12 +1,90 @@
 ---
-title:
-
-colorFrom: indigo
-colorTo: green
+title: sehatech-demo
+app_file: chat_app.py
 sdk: gradio
 sdk_version: 5.6.0
-app_file: app.py
-pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# RAG Voice Boilerplate
+
+A production-ready Python boilerplate for building RAG (Retrieval-Augmented Generation) applications with voice processing capabilities.
+
+## 🚀 Features
+
+- 📚 RAG Engine Integration
+- 🎤 Voice Processing Pipeline
+- 🗄️ Vector Store Support
+- 🐋 Docker Containerization
+- 🧪 Testing Infrastructure
+- 🔧 Modular Architecture
+
+## 🏗️ Project Structure
+```
+├── app/
+│   ├── __init__.py
+│   ├── main.py
+│   ├── config.py
+│   ├── api/
+│   │   ├── __init__.py
+│   │   └── routes.py
+│   ├── core/
+│   │   ├── __init__.py
+│   │   ├── rag_engine.py
+│   │   ├── voice_processor.py
+│   │   └── document_processor.py
+│   ├── database/
+│   │   ├── __init__.py
+│   │   ├── vector_store.py
+│   │   └── db.py
+│   └── utils/
+│       ├── __init__.py
+│       └── helpers.py
+├── tests/
+│   └── __init__.py
+├── docker/
+│   ├── Dockerfile
+│   └── docker-compose.yml
+├── requirements.txt
+└── README.md
+```
+
+## 🚦 Quick Start
+
+#### Prerequisites
+
+This project has only been tested on Python 3.11. Note that langchain-pinecone only works on Python versions between 3.8 and 3.13, exclusive.
+
+1. Clone the repository:
+```bash
+git clone https://github.com/yourusername/rag-voice-boilerplate.git
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. Run with Docker:
+```bash
+docker-compose up -d
+```
+
+## 📚 Documentation
+
+### Core Components
+
+- `rag_engine.py`: Handles retrieval-augmented generation operations
+- `voice_processor.py`: Processes audio input/output
+- `document_processor.py`: Manages document parsing and preprocessing
+- `vector_store.py`: Manages vector embeddings and similarity search
+
+## 🤝 Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
+
+## 📄 License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
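For orientation, here is a minimal usage sketch of the retrieval layer this commit adds (core/pineconeqa.py together with config.py). It assumes the environment variables read by config.py are set and that the Pinecone index is already populated; the question string is borrowed from the app's own examples list.

```python
# Minimal sketch: call the PineconeQA class added in this commit directly,
# outside the Gradio app. Assumes a populated index and valid API keys.
from config import get_settings
from core.pineconeqa import PineconeQA

settings = get_settings()
qa = PineconeQA(
    pinecone_api_key=settings.PINECONE_API_KEY,
    openai_api_key=settings.OPENAI_API_KEY,
    index_name=settings.INDEX_NAME,
)

result = qa.ask("What are the common causes of iron deficiency anemia?")
if "error" in result:
    print("Retrieval failed:", result["error"])
else:
    print(result["answer"])
    # Each context entry is a LangChain Document carrying title/source metadata.
    for doc in result["context"]:
        print("-", doc.metadata.get("title"))
```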
__init__.py
ADDED
File without changes

api/__init__.py
ADDED
File without changes

api/routes.py
ADDED
File without changes
chat_app.py
ADDED
@@ -0,0 +1,426 @@
from datetime import datetime
from core.pineconeqa import PineconeQA
import gradio as gr
from config import get_settings
from openai import OpenAI
from utils.models import DatabaseManager
import json
import hashlib
import tempfile
import os


class MedicalChatbot:
    def __init__(self):
        self.settings = get_settings()
        self.qa_system = PineconeQA(
            pinecone_api_key=self.settings.PINECONE_API_KEY,
            openai_api_key=self.settings.OPENAI_API_KEY,
            index_name=self.settings.INDEX_NAME
        )
        self.client = OpenAI(api_key=self.settings.OPENAI_API_KEY)
        self.db = DatabaseManager()
        self.current_doctor = None
        self.current_session_id = None

    def handle_session(self, doctor_name):
        """Always create a new session; sessions are never reused."""
        self.current_session_id = self.db.create_session(doctor_name)
        self.current_doctor = doctor_name
        return self.current_session_id

    def get_user_identifier(self, request: gr.Request):
        """Create a unique user identifier from IP and user agent."""
        if request is None:
            return "anonymous"

        identifier = f"{request.client.host}_{request.headers.get('User-Agent', 'unknown')}"
        return hashlib.sha256(identifier.encode()).hexdigest()[:32]

    def detect_message_type(self, message):
        """Use ChatGPT to decide whether a message is a basic interaction or a knowledge query."""
        try:
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "Analyze the following message and determine if it's:\n"
                            "1. A basic interaction (greetings, thanks, farewell, etc.)\n"
                            "2. A question or request for information\n"
                            "Respond with just the type: 'basic' or 'query'"
                        )
                    },
                    {"role": "user", "content": message}
                ],
                temperature=0.3,
                max_tokens=10
            )
            return response.choices[0].message.content.strip().lower()
        except Exception as e:
            print(f"Error encountered, defaulting to 'query'.\nError: {str(e)}")
            return "query"

    def get_chatgpt_response(self, message, history):
        """Get a response from ChatGPT without retrieval."""
        try:
            chat_history = []
            for human, assistant in history:
                chat_history.extend([
                    {"role": "user", "content": human},
                    {"role": "assistant", "content": assistant}
                ])

            messages = [
                {
                    "role": "system",
                    "content": (
                        "You are an expert assistant for biomedical question-answering tasks. "
                        "You will be provided with context retrieved from medical literature. "
                        "The medical literature is all from PubMed Open Access articles. "
                        "Use this context to answer the question as accurately as possible. "
                        "The retrieved text may be imprecise, so derive the answer from it as much as possible. "
                        "If the context does not contain the required information, explain why. "
                        "Provide a concise and accurate answer."
                    )
                }
            ] + chat_history + [
                {"role": "user", "content": message}
            ]

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=messages,
                temperature=0.7,
                max_tokens=500
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"I apologize, but I encountered an error: {str(e)}"

    def synthesize_answer(self, query, context_docs, history):
        """Synthesize an answer from multiple context documents using ChatGPT."""
        try:
            context = "\n\n".join([doc.page_content for doc in context_docs])

            messages = [
                {
                    "role": "system",
                    "content": (
                        "You are a medical expert assistant. Using the provided context, "
                        "synthesize a comprehensive, accurate answer. If the context doesn't contain "
                        "enough relevant information, say so and provide general medical knowledge. "
                        "Always maintain a professional yet accessible tone."
                    )
                },
                {
                    "role": "user",
                    "content": (
                        f"Context information:\n{context}\n\n"
                        f"Based on this context and your medical knowledge, "
                        f"please answer the following question:\n{query}"
                    )
                }
            ]

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=messages,
                temperature=0.2,
                max_tokens=1000
            )

            return response.choices[0].message.content
        except Exception as e:
            return f"I apologize, but I encountered an error synthesizing the answer: {str(e)}"

    def format_sources_for_db(self, sources):
        """Format sources for database storage."""
        if not sources:
            return None

        sources_data = []
        for doc in sources:
            sources_data.append({
                'title': doc.metadata.get('title'),
                'source': doc.metadata.get('source'),
                'timestamp': datetime.utcnow().isoformat()
            })
        return json.dumps(sources_data)

    def respond(self, message, history, doctor_name: str, request: gr.Request = None):
        """Main response function for the chatbot."""
        try:
            # Don't reuse sessions - ensure we're using the current session ID
            if not hasattr(self, 'current_session_id') or not self.current_session_id:
                self.current_session_id = self.db.create_session(doctor_name)

            # Log user message
            self.db.log_message(
                session_id=self.current_session_id,
                message=message,
                is_user=True
            )

            message_type = self.detect_message_type(message)

            if message_type == "basic":
                response = self.get_chatgpt_response(message, history)
                self.db.log_message(
                    session_id=self.current_session_id,
                    message=response,
                    is_user=False
                )
                return response

            retriever_response = self.qa_system.ask(message)

            if "error" in retriever_response:
                response = self.get_chatgpt_response(message, history)
                self.db.log_message(
                    session_id=self.current_session_id,
                    message=response,
                    is_user=False
                )
                return response

            if retriever_response.get("context") and len(retriever_response["context"]) > 0:
                synthesized_answer = self.synthesize_answer(
                    message,
                    retriever_response["context"],
                    history
                )

                sources = self.format_sources(retriever_response["context"])
                final_response = synthesized_answer + sources

                self.db.log_message(
                    session_id=self.current_session_id,
                    message=final_response,
                    is_user=False,
                    sources=self.format_sources_for_db(retriever_response["context"])
                )
                return final_response
            else:
                response = self.get_chatgpt_response(message, history)
                fallback_response = (
                    "I couldn't find specific information about this in my knowledge base, "
                    "but here's what I can tell you:\n\n" + response
                )

                self.db.log_message(
                    session_id=self.current_session_id,
                    message=fallback_response,
                    is_user=False
                )
                return fallback_response

        except Exception as e:
            error_message = f"I apologize, but I encountered an error: {str(e)}"
            if self.current_session_id:
                self.db.log_message(
                    session_id=self.current_session_id,
                    message=error_message,
                    is_user=False
                )
            return error_message

    def format_sources(self, sources):
        """Format sources into a readable string."""
        if not sources:
            return ""

        formatted = "\n\n📚 Sources Used:\n"
        seen_sources = set()

        for doc in sources:
            source_id = (doc.metadata.get('title', ''), doc.metadata.get('source', ''))
            if source_id not in seen_sources:
                seen_sources.add(source_id)
                formatted += f"\n• {doc.metadata.get('title', 'Untitled')}\n"
                if doc.metadata.get('source'):
                    formatted += f"  Link: {doc.metadata['source']}\n"

        return formatted

    def transcribe_audio(self, audio_path):
        """Transcribe audio using OpenAI Whisper."""
        try:
            with open(audio_path, "rb") as audio_file:
                transcript = self.client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file
                )
            return transcript.text
        except Exception as e:
            print(f"Error transcribing audio: {str(e)}")
            return None

    def process_audio_input(self, audio_path, history, doctor_name):
        """Process audio input and return a text response."""
        try:
            # Transcribe the audio
            transcription = self.transcribe_audio(audio_path)
            if not transcription:
                return "Sorry, I couldn't understand the audio.", None

            # Get text response
            text_response = self.respond(transcription, history, doctor_name)

            # Convert response to speech (not yet enabled)
            # audio_response = self.text_to_speech(text_response)

            return text_response
        except Exception as e:
            return f"Error processing audio: {str(e)}"


def main():
    med_chatbot = MedicalChatbot()

    with gr.Blocks(theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Medical Knowledge Assistant")
        gr.Markdown("Ask me anything about medical topics using text or voice.")

        session_state = gr.State()
        doctor_state = gr.State()

        # Doctor Name Input
        with gr.Row():
            doctor_name = gr.Textbox(
                label="Doctor Name",
                placeholder="Enter your name",
                show_label=True,
                container=True,
                scale=2,
                interactive=True
            )

        # Main Chat Interface
        with gr.Row():
            with gr.Column(scale=4):
                chatbot = gr.Chatbot(height=400)

                # Text Input Area
                with gr.Row():
                    text_input = gr.Textbox(
                        placeholder="Type your message here...",
                        scale=8
                    )
                    send_button = gr.Button("Send", scale=1)

                # Audio Input Area
                with gr.Row():
                    audio = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Voice Message",
                        interactive=True
                    )

                # Audio Output Area
                audio_output = gr.Audio(
                    label="AI Voice Response",
                    visible=True,
                    interactive=False
                )

        # Initialize session handler
        def init_session(doctor, current_doctor):
            if not doctor or doctor == current_doctor:
                return None, current_doctor
            return med_chatbot.db.create_session(doctor), doctor

        # Text input handler
        def on_text_submit(message, history, doctor, session_id, current_doctor):
            if not session_id or doctor != current_doctor:
                session_id, current_doctor = init_session(doctor, current_doctor)

            med_chatbot.current_session_id = session_id
            response = med_chatbot.respond(message, history, doctor)
            history.append((message, response))
            return "", history, None, session_id, current_doctor

        # Audio input handler (all paths return 3 values to match the 3 outputs below)
        def on_audio_submit(audio_path, history, doctor, session_id, current_doctor):
            try:
                if audio_path is None:
                    return history, session_id, current_doctor

                # Initialize session if needed
                if not session_id or doctor != current_doctor:
                    session_id, current_doctor = init_session(doctor, current_doctor)

                # Set current session
                med_chatbot.current_session_id = session_id

                # Transcribe the audio
                transcription = med_chatbot.transcribe_audio(audio_path)
                if not transcription:
                    return history, session_id, current_doctor

                # Log the transcription as a user message in the database
                med_chatbot.db.log_message(
                    session_id=session_id,
                    message=transcription,
                    is_user=True
                )

                # Append transcription to the chatbot history
                history.append((f"🎤 {transcription}", None))  # User message, no AI response yet

                # Process the transcription as a user query
                ai_response = med_chatbot.respond(transcription, history, doctor)

                # Update the last history entry with the AI response
                history[-1] = (f"🎤 {transcription}", ai_response)

                # Log the AI response in the database
                med_chatbot.db.log_message(
                    session_id=session_id,
                    message=ai_response,
                    is_user=False
                )

                return history, session_id, current_doctor

            except Exception as e:
                print(f"Error processing audio: {str(e)}")
                return history, session_id, current_doctor

        # Set up event handlers
        doctor_name.submit(
            fn=init_session,
            inputs=[doctor_name, doctor_state],
            outputs=[session_state, doctor_state]
        )

        send_button.click(
            fn=on_text_submit,
            inputs=[text_input, chatbot, doctor_name, session_state, doctor_state],
            outputs=[text_input, chatbot, audio_output, session_state, doctor_state]
        )

        text_input.submit(
            fn=on_text_submit,
            inputs=[text_input, chatbot, doctor_name, session_state, doctor_state],
            outputs=[text_input, chatbot, audio_output, session_state, doctor_state]
        )

        # Audio submission
        audio.stop_recording(
            fn=on_audio_submit,
            inputs=[audio, chatbot, doctor_name, session_state, doctor_state],
            outputs=[chatbot, session_state, doctor_state]
        )

        # Examples
        gr.Examples(
            examples=[
                ["Hello, how are you?", "Dr. Smith"],
                ["What are the common causes of iron deficiency anemia?", "Dr. Smith"],
                ["What are the latest treatments for type 2 diabetes?", "Dr. Smith"],
                ["Can you explain the relationship between diet and heart disease?", "Dr. Smith"]
            ],
            inputs=[text_input, doctor_name]
        )

    interface.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860
    )


if __name__ == "__main__":
    main()
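The diff above wires everything into a single Gradio Blocks app, so a quick smoke test is just running the module: interface.launch binds to 0.0.0.0:7860 and, because share=True, also prints a temporary public share URL. The required API keys must be in the environment first (see the .env sketch after the config.py diff below).

```bash
# Launch the chat app locally; serves on http://localhost:7860
pip install -r requirements.txt
python chat_app.py
```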
chat_history.db
ADDED
Binary file (20.5 kB)
config.py
ADDED
@@ -0,0 +1,43 @@
from pydantic import validator
from pydantic_settings import BaseSettings
from functools import lru_cache

from dotenv import load_dotenv


class Settings(BaseSettings):
    PROJECT_NAME: str = "SehaTech: Medical AI Assistant"
    API_V1_STR: str = "/api/v1"
    OPENAI_API_KEY: str
    PINECONE_API_KEY: str
    INDEX_NAME: str
    CLOUD: str
    REGION: str
    PDF_DIRECTORY: str  # Directory containing PDF files
    CHUNK_SIZE: int
    CHUNK_OVERLAP: int
    DIMENSIONS: int
    AWS_ACCESS_KEY: str
    AWS_SECRET_KEY: str
    AWS_REGION: str
    AWS_BUCKET_NAME: str
    PUBMED_BASE_URL: str

    class Config:
        env_file = ".env"

    @validator("OPENAI_API_KEY")
    def validate_openai_key(cls, v):
        if not v.startswith("sk-"):
            raise ValueError("Invalid OpenAI API key format")
        return v

    def __init__(self, **kwargs):
        # Force reload of environment variables
        load_dotenv(override=True)
        super().__init__(**kwargs)


def get_settings():
    return Settings()
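Settings declares every field above without a default, so pydantic-settings will refuse to start unless each one is supplied via the environment or the .env file named in Config. A hypothetical .env matching those field names might look like the following; every value is a placeholder, and the base URL is an assumption (the PMC OA web service, whose parameters match the ones used in core/download_dataset.py below).

```bash
# .env — placeholder values only; keys mirror the Settings class above
OPENAI_API_KEY=sk-...
PINECONE_API_KEY=...
INDEX_NAME=pubmed-rag            # hypothetical index name
CLOUD=aws
REGION=us-east-1
PDF_DIRECTORY=downloaded_pdfs
CHUNK_SIZE=1000                  # illustrative chunking values
CHUNK_OVERLAP=200
DIMENSIONS=1536                  # default OpenAIEmbeddings dimension
AWS_ACCESS_KEY=...
AWS_SECRET_KEY=...
AWS_REGION=us-east-1
AWS_BUCKET_NAME=my-pubmed-bucket # hypothetical bucket
# Assumed to be the PMC OA service endpoint (format/from/until/resumptionToken params)
PUBMED_BASE_URL=https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi
```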
core/__init__.py
ADDED
File without changes

core/document_processor.py
ADDED
File without changes
core/download_dataset.py
ADDED
@@ -0,0 +1,440 @@
import os
import json
import shutil
import hashlib
import tarfile
import requests
import pinecone
import xml.etree.ElementTree as ET
from urllib import request

from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from PyPDF2 import PdfReader

from s3_utils import S3Handler
from config import get_settings


class PubMedDownloader:
    def __init__(self, s3_handler, pubmed_base_url, pinecone_index, embedding_model,
                 from_date="2024-01-01", until_date="2024-11-01", limit=3):
        self.s3_handler = s3_handler
        self.settings = get_settings()
        self.pubmed_base_url = pubmed_base_url
        self.from_date = from_date
        self.until_date = until_date
        self.limit = limit
        self.local_download_dir = "downloaded_pdfs"
        os.makedirs(self.local_download_dir, exist_ok=True)
        self.pinecone_index = pinecone_index  # Pinecone index instance
        self.embedding_model = embedding_model  # Embedding model instance

    def split_and_embed(self, documents, metadata_entry):
        """Split documents into chunks and embed them sequentially."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.settings.CHUNK_SIZE,
            chunk_overlap=self.settings.CHUNK_OVERLAP
        )
        chunks = text_splitter.split_documents(documents)
        print(f'Total chunks created: {len(chunks)}')
        batch_size = 50
        pmc_id = metadata_entry['pmc_id']
        for batch_index in range(0, len(chunks), batch_size):
            batch = chunks[batch_index: batch_index + batch_size]
            print(f'Length of batch: {len(batch)}')
            try:
                # Create ids for the batch
                ids = [f"{pmc_id}_chunk_{batch_index}_{j}" for j in range(len(batch))]
                print(f'Length of ids: {len(ids)}')
                print(f'Id sample: {ids[0]}')

                # Get texts and generate embeddings
                texts = [doc.page_content for doc in batch]
                print(f'Length of texts: {len(texts)}')
                embeddings = self.embedding_model.embed_documents(texts)

                metadata = []
                for doc in batch:
                    chunk_metadata = metadata_entry.copy()  # Copy base metadata
                    chunk_metadata["text"] = doc.page_content  # Add chunk-specific text
                    metadata.append(chunk_metadata)

                # Create upsert batch
                to_upsert = list(zip(ids, embeddings, metadata))

                # Upsert to Pinecone
                self.pinecone_index.upsert(vectors=to_upsert)
                print(f"Successfully upserted {len(to_upsert)} chunks to Pinecone.")

            except Exception as e:
                print(f"Error processing batch {batch_index}: {e}")

    def fetch_records(self, resumption_token=None):
        """
        Fetch records from PubMed using an optional resumptionToken.

        Args:
            resumption_token (str, optional): Token to resume fetching records. Defaults to None.

        Returns:
            ElementTree.Element: Parsed XML root of the API response.
        """
        # Build the base URL
        url = f"{self.pubmed_base_url}"

        # Define parameters
        params = {"format": "tgz"}

        # Add date range if provided
        if self.from_date and self.until_date:
            params["from"] = self.from_date
            params["until"] = self.until_date

        # Add resumptionToken if available
        if resumption_token:
            params["resumptionToken"] = resumption_token
            print(f"Using resumption token: {resumption_token}")

        # Make the request
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an error for bad HTTP responses

        # Parse and return the XML content
        return ET.fromstring(response.content)

    def save_metadata_to_s3(self, metadata, bucket, key):
        print(f"Saving metadata to S3: s3://{bucket}/{key}")
        self.s3_handler.upload_string_to_s3(metadata, bucket, key)

    def save_pdf_to_s3(self, local_filename, bucket, s3_key):
        """Upload PDF to S3 and then delete the local file."""
        print(f"Uploading PDF to S3: s3://{bucket}/{s3_key}")
        self.s3_handler.upload_file_to_s3(local_filename, bucket, s3_key)
        # Delete the local file after upload
        if os.path.exists(local_filename):
            os.remove(local_filename)
            print(f"Deleted local file: {local_filename}")
        else:
            print(f"File not found for deletion: {local_filename}")

    def update_metadata_and_upload(self, metadata_entry, bucket_name, metadata_file_key):
        """Serialize the metadata list to JSON and upload it to S3."""
        metadata_json = json.dumps(metadata_entry, indent=4)
        self.s3_handler.upload_string_to_s3(metadata_json, bucket_name, metadata_file_key)
        print(f"Updated metadata uploaded to s3://{bucket_name}/{metadata_file_key}")

    def download_and_process_tgz(self, ftp_link, pmc_id):
        try:
            metadata_entry = {}

            # Step 1: Download TGZ
            local_tgz_filename = os.path.join(self.local_download_dir, f"{pmc_id}.tgz")
            print(f"Downloading TGZ: {ftp_link}, saving to {local_tgz_filename}")
            request.urlretrieve(ftp_link, local_tgz_filename)

            # Step 2: Extract TGZ into a temporary directory
            temp_extract_dir = os.path.join(self.local_download_dir, f"{pmc_id}_temp")
            os.makedirs(temp_extract_dir, exist_ok=True)
            print(f"Temporary extract dir: {temp_extract_dir}")

            with tarfile.open(local_tgz_filename, "r:gz") as tar:
                tar.extractall(path=temp_extract_dir)

            # Step 3: Handle nested structure (move contents to target directory)
            final_extract_dir = os.path.join(self.local_download_dir, pmc_id)
            os.makedirs(final_extract_dir, exist_ok=True)

            # Check if the archive creates a single root directory (e.g., PMC8419487/)
            extracted_items = os.listdir(temp_extract_dir)
            if len(extracted_items) == 1 and os.path.isdir(os.path.join(temp_extract_dir, extracted_items[0])):
                # Move contents of the single folder to the final directory
                nested_dir = os.path.join(temp_extract_dir, extracted_items[0])
                for item in os.listdir(nested_dir):
                    shutil.move(os.path.join(nested_dir, item), final_extract_dir)
            else:
                # If no single root folder, move all files directly
                for item in extracted_items:
                    shutil.move(os.path.join(temp_extract_dir, item), final_extract_dir)

            print(f"Final extracted dir: {final_extract_dir}")

            # Clean up the temporary extraction directory
            shutil.rmtree(temp_extract_dir)
            print(f"Temporary extract dir deleted: {temp_extract_dir}")

            # Process the extracted files
            xml_file = [f for f in os.listdir(final_extract_dir) if f.endswith(".xml") or f.endswith(".nxml")]
            pdf_path = [f for f in os.listdir(final_extract_dir) if f.endswith(".pdf")]

            if xml_file:
                xml_path = os.path.join(final_extract_dir, xml_file[0])
                metadata_entry = self.process_xml_metadata(xml_path, pmc_id)
            else:
                print(f"No XML file found in TGZ for PMCID: {pmc_id}")
                print("Skipping article")

            if pdf_path:
                pdf_path = os.path.join(final_extract_dir, pdf_path[0])
                document = self.download_and_process_pdf(pdf_path, pmc_id, self.settings.AWS_BUCKET_NAME)
            else:
                if metadata_entry.get('body_text') and metadata_entry['body_text'] != "N/A":
                    document = Document(
                        page_content=metadata_entry['body_text'], metadata=metadata_entry
                    )
                    metadata_entry.pop("body_text")
                else:
                    print("Body content and PDF both not found, hence skipping this article")
                    document = None

            # Cleanup: remove the downloaded TGZ file and the extracted directory
            if os.path.exists(local_tgz_filename):
                os.remove(local_tgz_filename)
                print(f"Removed file: {local_tgz_filename}")
            if os.path.exists(final_extract_dir):
                shutil.rmtree(final_extract_dir)

            return metadata_entry, document

        except Exception as e:
            print(f"Cannot download TGZ file for {pmc_id}; ftp link: {ftp_link}")
            print(f"[ERROR] {str(e)}")
            return {}, None

    def extract_text_from_element(self, element):
        """
        Recursively extract all text from an XML element and its children.

        Args:
            element (Element): XML element to extract text from.

        Returns:
            str: Concatenated text content of the element and its children.
        """
        text_content = element.text or ""  # Start with the element's own text
        for child in element:
            text_content += self.extract_text_from_element(child)  # Recurse into children
            if child.tail:  # Include any tail text after the child element
                text_content += child.tail
        return text_content.strip()

    def process_xml_metadata(self, xml_path, pmc_id):
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # Title extraction
        title_elem = root.find(".//article-title")
        title = title_elem.text if title_elem is not None else "No Title Available"

        # Abstract extraction
        abstract_elem = root.find(".//abstract/p")
        abstract = abstract_elem.text if abstract_elem is not None else "No Abstract Available"

        # DOI extraction
        doi_elem = root.find(".//article-id[@pub-id-type='doi']")
        doi = doi_elem.text if doi_elem is not None else "N/A"

        # Author extraction, safely handling missing elements
        authors = []
        for author in root.findall(".//contrib/name"):
            surname = author.find('surname')
            given_names = author.find('given-names')
            surname_text = surname.text if surname is not None else "Unknown Surname"
            given_names_text = given_names.text if given_names is not None else "Unknown Given Names"
            authors.append(f"{surname_text}, {given_names_text}")

        keywords = [kw.text for kw in root.findall(".//kwd")]

        # Extract publication date
        pub_date_node = root.find(".//pub-date")
        if pub_date_node is not None:
            month = pub_date_node.find("month").text if pub_date_node.find("month") is not None else "N/A"
            year = pub_date_node.find("year").text if pub_date_node.find("year") is not None else "N/A"
            pub_type = pub_date_node.attrib.get("pub-type", "N/A")
            publication_date = f"{year}-{month}" if month != "N/A" else year
        else:
            publication_date = "N/A"

        # Extract text content from <body>
        body_node = root.find(".//body")
        if body_node is not None:
            body_text = self.extract_text_from_element(body_node)
        else:
            body_text = "N/A"

        # Save enriched metadata
        metadata_entry = {
            "pmc_id": pmc_id,
            "title": title,
            "abstract": abstract,
            "authors": authors,
            "keywords": keywords,
            "doi": doi,
            "source": f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}",
            "publication_date": publication_date,
            "body_text": body_text
        }
        return metadata_entry

    def download_and_process_pdf(self, pdf_path, pmc_id, bucket_name):
        try:
            pdf_reader = PdfReader(pdf_path)
            text = "".join(page.extract_text() for page in pdf_reader.pages)

            # Create document object
            document = Document(
                page_content=text,
                metadata={"source": f"s3://{bucket_name}/{pmc_id}.pdf"}
            )

            return document
        except Exception as e:
            print(f"Error processing PDF for {pmc_id}: {e}")
            return None

    def process_and_save(self, bucket_name, metadata_file_key):
        # Load existing metadata from S3
        try:
            metadata_content = self.s3_handler.download_string_from_s3(bucket_name, metadata_file_key)
            existing_metadata = json.loads(metadata_content)
            existing_ids = {record["pmc_id"] for record in existing_metadata}
            print(f"Found {len(existing_ids)} existing records in metadata.")
        except Exception as e:
            # If the metadata file doesn't exist or is empty, initialize an empty list
            print(f"Could not load metadata: {e}. Assuming no existing records.")
            existing_metadata = []
            existing_ids = set()
        resumption_token = None

        while True:
            root = self.fetch_records(resumption_token=resumption_token)
            print(f'Number of records: {len(root.findall(".//record"))}')
            for record in root.findall(".//record"):
                pmc_id = record.attrib.get("id")

                if pmc_id in existing_ids:
                    continue  # Skip already downloaded records

                pdf_link = None
                ftp_link = None

                for link in record.findall("link"):
                    if link.attrib.get("format") == "tgz":
                        ftp_link = link.attrib.get("href")
                    if link.attrib.get("format") == "pdf":
                        pdf_link = link.attrib.get("href")

                print(f'[INFO] Links found: pdf {pdf_link} and ftp {ftp_link}')

                # Records without a tgz link cannot be processed
                if not ftp_link:
                    continue

                metadata, document = self.download_and_process_tgz(ftp_link, pmc_id)
                if not document:
                    continue  # This record has no usable content

                self.split_and_embed([document], metadata)
                existing_metadata.append(metadata)
                self.update_metadata_and_upload(existing_metadata, bucket_name, metadata_file_key)

            resumption = root.find(".//resumption")
            if resumption is not None:
                link = resumption.find("link")
                if link is not None:
                    resumption_token = link.attrib.get("token", "").strip()
                    if not resumption_token:
                        print("No more tokens found, stopping pagination.")
                        break
                else:
                    print("No link found, stopping pagination.")
                    break
            else:
                print("No resumption element, stopping pagination.")
                break


def create_or_connect_index(index_name, dimension):
    """Create or connect to an existing Pinecone index."""
    pc = pinecone.Pinecone(settings.PINECONE_API_KEY)
    spec = pinecone.ServerlessSpec(
        cloud=settings.CLOUD,
        region=settings.REGION
    )
    print(f'All indexes: {pc.list_indexes()}')

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',  # 'dotproduct' or other metrics can be used if needed
            spec=spec
        )
    return pc.Index(settings.INDEX_NAME)


if __name__ == "__main__":
    # TODO: expose all of these as command-line arguments
    # (from/until dates and the other variables), plus a flag for
    # how many pagination iterations to run.

    # Load settings
    settings = get_settings()

    # Initialize S3 handler
    s3_handler = S3Handler()
    pc_index = create_or_connect_index(settings.INDEX_NAME, settings.DIMENSIONS)

    # Create the downloader instance
    downloader = PubMedDownloader(
        s3_handler=s3_handler,
        pubmed_base_url=settings.PUBMED_BASE_URL,
        pinecone_index=pc_index,
        embedding_model=OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY)
    )

    # Process and save
    downloader.process_and_save(
        bucket_name=settings.AWS_BUCKET_NAME,
        metadata_file_key="pubmed_metadata/metadata.json"
    )
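The process_and_save loop above pages through the OA service one XML response at a time, chasing <resumption> tokens until none remains. A condensed, standalone sketch of just that pagination pattern (element and parameter names taken from the code above; the base URL is whatever PUBMED_BASE_URL points at):

```python
# Sketch of the resumption-token pagination used by PubMedDownloader above.
import requests
import xml.etree.ElementTree as ET

def iter_records(base_url, from_date, until_date):
    token = None
    while True:
        params = {"format": "tgz", "from": from_date, "until": until_date}
        if token:
            params["resumptionToken"] = token
        root = ET.fromstring(requests.get(base_url, params=params).content)
        yield from root.findall(".//record")
        resumption = root.find(".//resumption")
        link = resumption.find("link") if resumption is not None else None
        token = link.attrib.get("token", "").strip() if link is not None else ""
        if not token:
            break  # no token means the last page was reached
```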
core/pineconeqa.py
ADDED
@@ -0,0 +1,142 @@
# qa_system.py
import pinecone
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


class PineconeQA:
    def __init__(self, pinecone_api_key, openai_api_key, index_name):
        # Initialize Pinecone
        self.pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index = self.pc.Index(index_name)

        # Initialize embeddings
        self.embeddings = OpenAIEmbeddings(
            openai_api_key=openai_api_key
        )

        # Create the vector store used as a retriever
        self.retriever = PineconeVectorStore(
            index=self.index,
            embedding=self.embeddings
        )

        # Initialize LLM
        self.llm = ChatOpenAI(
            openai_api_key=openai_api_key,
            model="gpt-4o",
            temperature=0.2
        )

        # Create the RAG chain
        self._create_rag_chain()

    def _create_rag_chain(self):
        system_prompt = (
            "You are an expert assistant for biomedical question-answering tasks. "
            "You will be provided with context retrieved from medical literature. "
            "The medical literature is all from PubMed Open Access articles. "
            "Use this context to answer the question as accurately as possible. "
            "The retrieved text may be imprecise, so derive the answer from it as much as possible. "
            "If the context does not contain the required information, explain why. "
            "Provide a concise and accurate answer."
            "\n\n"
            "Context:\n{context}\n"
        )
        # Create chat prompt template
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}"),
        ])

        # Create question-answer chain
        question_answer_chain = create_stuff_documents_chain(
            self.llm,
            prompt
        )

        # Create the RAG chain
        self.rag_chain = create_retrieval_chain(
            self.retriever.as_retriever(search_type="mmr"),
            question_answer_chain
        )

    def merge_relevant_chunks(self, retrieved_docs, question, max_tokens=1500):
        """
        Merge document chunks in retrieval order, up to a whitespace-token budget.
        """
        merged_context = ""
        current_tokens = 0

        for doc in retrieved_docs:
            tokens = doc.page_content.split()
            if current_tokens + len(tokens) <= max_tokens:
                merged_context += doc.page_content + "\n"
                current_tokens += len(tokens)
            else:
                break

        return merged_context

    def ask(self, question):
        """
        Ask a question and get a response with sources.
        """
        # Initialize conversation history if it doesn't exist
        if not hasattr(self, "conversation_history"):
            self.conversation_history = []

        try:
            system_prompt = (
                "You are an expert assistant for biomedical question-answering tasks. "
                "You will be provided with context retrieved from medical literature, specifically PubMed Open Access Articles. "
                "Use the provided context to directly answer the question in the most accurate and concise manner possible. "
                "If the context does not provide sufficient information, state that the specific details are not available in the context. "
                "Do not include statements about limitations of the context in your response. "
                "Your answer should sound authoritative and professional, tailored for a medical audience."
                "\n\n"
                "Context:\n{context}\n"
            )
            # Create chat prompt template
            prompt = ChatPromptTemplate.from_messages([
                ("system", system_prompt),
                ("human", "{input}"),
            ])

            # Create question-answer chain
            question_answer_chain = create_stuff_documents_chain(
                self.llm,
                prompt
            )

            results = create_retrieval_chain(
                self.retriever.as_retriever(search_type="mmr"),
                question_answer_chain
            ).invoke({"input": question})

            return {
                "answer": results["answer"],
                "context": results["context"]
            }
        except Exception as e:
            return {
                "error": str(e)
            }
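Both chains above retrieve with search_type="mmr" (maximal marginal relevance), which re-ranks a larger candidate pool so the returned chunks are relevant but mutually diverse. If plain similarity search or a different fan-out were wanted, the vector store accepts standard LangChain retriever options; a hedged sketch, with k and fetch_k as illustrative values rather than tuned settings:

```python
# Illustrative retriever variants for a PineconeVectorStore like the one
# built in PineconeQA above; k/fetch_k are example values, not tuned.
from langchain_pinecone import PineconeVectorStore

def make_retriever(vector_store: PineconeVectorStore, diverse: bool = True):
    if diverse:
        # MMR re-ranks fetch_k candidates down to k diverse chunks
        return vector_store.as_retriever(
            search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20}
        )
    # Plain nearest-neighbour similarity search
    return vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": 4}
    )
```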
core/rag_engine.py
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
import os

import boto3
import hashlib
import json
import threading
# Add the project root directory to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_openai import OpenAIEmbeddings
import pinecone
from tqdm.auto import tqdm
from langchain.schema import Document
from config import get_settings
from dotenv import load_dotenv
from io import BytesIO
from PyPDF2 import PdfReader
load_dotenv()


class RAGPrep:
    def __init__(self, processed_hashes_file="processed_hashes.json"):
        self.settings = get_settings()
        self.index_name = self.settings.INDEX_NAME
        self.pc = self.init_pinecone()
        self.embeddings = OpenAIEmbeddings(openai_api_key=self.settings.OPENAI_API_KEY)
        self.processed_hashes_file = processed_hashes_file
        self.processed_hashes = self.load_processed_hashes()

    def init_pinecone(self):
        """Initialize Pinecone client"""
        pc = pinecone.Pinecone(self.settings.PINECONE_API_KEY)
        return pc

    # Define function to create or connect to an existing index
    def create_or_connect_index(self, index_name, dimension):
        """Create or connect to existing Pinecone index"""
        spec = pinecone.ServerlessSpec(
            cloud=self.settings.CLOUD,
            region=self.settings.REGION
        )
        print(f'all indexes: {self.pc.list_indexes()}')

        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=index_name,
                dimension=dimension,
                metric='cosine',  # You can use 'dotproduct' or other metrics if needed
                spec=spec
            )
        return self.pc.Index(index_name)

    def load_processed_hashes(self):
        """Load previously processed hashes from a file."""
        if os.path.exists(self.processed_hashes_file):
            with open(self.processed_hashes_file, "r") as f:
                return set(json.load(f))
        return set()

    def save_processed_hashes(self):
        """Save processed hashes to a file."""
        with open(self.processed_hashes_file, "w") as f:
            json.dump(list(self.processed_hashes), f)

    def generate_pdf_hash(self, pdf_content: bytes):
        """Generate a hash for the given PDF content."""
        hasher = hashlib.md5()
        hasher.update(pdf_content)
        return hasher.hexdigest()

    def load_and_split_pdfs(self, chunk_from=50, chunk_to=100) -> List[Document]:
        """Load PDFs from S3, extract text, and split into chunks."""
        print("***********")
        # Initialize S3 client
        s3_client = boto3.client(
            's3',
            aws_access_key_id=self.settings.AWS_ACCESS_KEY,
            aws_secret_access_key=self.settings.AWS_SECRET_KEY,
            region_name=self.settings.AWS_REGION
        )

        # List all PDF files in the S3 bucket and prefix
        # NOTE: list_objects_v2 returns at most 1000 keys per call; larger buckets would need pagination
        print(f"Listing files in S3 bucket: {self.settings.AWS_BUCKET_NAME}")
        response = s3_client.list_objects_v2(Bucket=self.settings.AWS_BUCKET_NAME, Prefix="")
        s3_keys = [obj['Key'] for obj in response.get('Contents', [])]

        print(f"Found {len(s3_keys)} PDF files in S3")
        documents = []

        # Process each PDF file in the requested window of keys
        for s3_key in s3_keys[chunk_from:chunk_to]:
            print(f"Processing file: {s3_key}")
            if not s3_key.lower().endswith(".pdf"):
                print("Not a PDF file, skipping.")
                continue

            try:
                # Read file from S3
                obj = s3_client.get_object(Bucket=self.settings.AWS_BUCKET_NAME, Key=s3_key)
                pdf_content = obj['Body'].read()

                # Generate hash and check for duplicates
                pdf_hash = self.generate_pdf_hash(pdf_content)
                if pdf_hash in self.processed_hashes:
                    print(f"Duplicate PDF detected: {s3_key}, skipping.")
                    continue

                # Extract text from PDF
                pdf_file = BytesIO(pdf_content)
                pdf_reader = PdfReader(pdf_file)
                text = "".join(page.extract_text() for page in pdf_reader.pages)

                # Add document with metadata
                documents.append(Document(page_content=text, metadata={"source": s3_key}))
                self.processed_hashes.add(pdf_hash)

            except Exception as e:
                print(f"Error processing {s3_key}: {e}")

        print(f"Extracted text from {len(documents)} documents")

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.settings.CHUNK_SIZE,
            chunk_overlap=self.settings.CHUNK_OVERLAP
        )
        chunks = text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks")

        # Save updated hashes
        self.save_processed_hashes()
        return chunks

    def process_and_upload(self, total_batch=200):
        """Process PDFs and upload to Pinecone"""
        # Create or connect to index
        index = self.create_or_connect_index(self.index_name, self.settings.DIMENSIONS)

        # Load and split documents
        print('//////// chunking: ////////')
        current_batch = 0
        for _ in range(0, total_batch, 50):  # one iteration per window of 50 S3 keys
            batch_size = 50  # Adjust based on your needs

            chunks = self.load_and_split_pdfs(current_batch, current_batch + batch_size)
            current_batch = current_batch + batch_size
            # Prepare for batch processing
            max_threads = 4  # Adjust based on your hardware

            def process_batch(batch, batch_index):
                """Process a single batch of chunks"""
                print(f"Processing batch {batch_index} on thread: {threading.current_thread().name}")
                print(f"Active threads: {threading.active_count()}")
                # Create ids for batch
                ids = [f"chunk_{batch_index}_{j}" for j in range(len(batch))]

                # Get texts and generate embeddings
                texts = [doc.page_content for doc in batch]
                embeddings = self.embeddings.embed_documents(texts)

                # Create metadata
                metadata = [
                    {
                        "text": doc.page_content,
                        "source": doc.metadata.get("source", "unknown"),
                        "page": doc.metadata.get("page", 0)
                    }
                    for doc in batch
                ]

                # Create upsert batch
                return list(zip(ids, embeddings, metadata))

            with ThreadPoolExecutor(max_threads) as executor:
                futures = []
                print(f"Batch size being used: {batch_size}")

                for i in range(0, len(chunks), batch_size):
                    batch = chunks[i:i + batch_size]
                    futures.append(executor.submit(process_batch, batch, i))

                # Gather results and upsert to Pinecone
                for future in tqdm(as_completed(futures), total=len(futures), desc="Uploading batches"):
                    try:
                        to_upsert = future.result()
                        index.upsert(vectors=to_upsert)
                    except Exception as e:
                        print(f"Error processing batch: {e}")

            print(f"Successfully processed and uploaded {len(chunks)} chunks to Pinecone")

    def cleanup_index(self) -> bool:
        """
        Delete all vectors from the Pinecone index.

        Returns:
            bool: True if cleanup was successful, False otherwise

        Raises:
            Exception: Logs any unexpected errors during cleanup
        """
        try:
            # Try to get the index
            if self.index_name in self.pc.list_indexes().names():
                print(f'index name found in {self.pc.list_indexes().names()}')
                # Attempt to delete all vectors
                index = self.pc.Index(self.index_name)

                index.delete(delete_all=True)

                print(f"Successfully cleaned up index: {self.index_name}")
                return True
            print("Index doesn't exist.")
            return True

        except Exception as e:
            print(f"Unexpected error during index cleanup: {str(e)}")
            # You might want to log this error as well
            import logging
            logging.error(f"Failed to cleanup index {self.index_name}. Error: {str(e)}")
            return False

        finally:
            # Any cleanup code that should run regardless of success/failure
            print("Cleanup operation completed.")


# Example usage:
if __name__ == "__main__":
    # Example .env file content:

    rag_prep = RAGPrep()
    rag_prep.process_and_upload()
    # rag_prep.cleanup_index()
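process_and_upload walks the bucket in fixed windows: with the default total_batch=200 and step 50, load_and_split_pdfs is called with key ranges [0:50], [50:100], [100:150] and [150:200]. A short sketch of both entry points (the explicit range in the second call is illustrative):

    from core.rag_engine import RAGPrep

    rag_prep = RAGPrep()

    # Full pipeline: four windows of 50 S3 keys each, embedded and upserted per window
    rag_prep.process_and_upload(total_batch=200)

    # Or chunk a single window by hand, e.g. only the first ten PDFs
    chunks = rag_prep.load_and_split_pdfs(chunk_from=0, chunk_to=10)
    print(f"{len(chunks)} chunks ready for embedding")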
core/s3_utils.py
ADDED
@@ -0,0 +1,267 @@
import sys
import os
# Add the project root directory to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import boto3
from botocore.exceptions import NoCredentialsError, ClientError

from config import get_settings

settings = get_settings()


class S3Handler:
    def __init__(self):
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=settings.AWS_ACCESS_KEY,
            aws_secret_access_key=settings.AWS_SECRET_KEY,
            region_name=settings.AWS_REGION
        )

    def upload_directory(self, local_directory, bucket_name, s3_prefix=""):
        uploaded_files = []
        errors = []
        for root, _, files in os.walk(local_directory):
            for filename in files:
                # Get the full local path
                local_path = os.path.join(root, filename)

                # Get relative path by removing the local_directory prefix
                relative_path = os.path.relpath(local_path, local_directory)

                # Create S3 key (preserve directory structure)
                s3_key = os.path.join(s3_prefix, relative_path).replace("\\", "/")

                try:
                    print(f"Uploading {local_path} to {bucket_name}/{s3_key}")
                    self.s3_client.upload_file(local_path, bucket_name, s3_key)
                    uploaded_files.append(s3_key)
                except ClientError as e:
                    print(f"Error uploading {local_path}: {str(e)}")
                    errors.append(local_path)
        return uploaded_files, errors

    def upload_file_to_s3(self, file_path, bucket_name, s3_key):
        """
        Upload a single file to S3.

        Args:
            file_path (str): Local path to the file to upload.
            bucket_name (str): Name of the S3 bucket.
            s3_key (str): Key (path) to save the file in S3.

        Returns:
            str: The URL of the uploaded file.
        """
        try:
            self.s3_client.upload_file(file_path, bucket_name, s3_key)
            print(f"Uploaded {file_path} to s3://{bucket_name}/{s3_key}")
            return f"s3://{bucket_name}/{s3_key}"
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            raise
        except NoCredentialsError:
            print("AWS credentials not found.")
            raise
        except ClientError as e:
            print(f"Error uploading file: {e}")
            raise

    def list_files(self, bucket_name, prefix=""):
        """List all files in the bucket with given prefix"""
        try:
            response = self.s3_client.list_objects_v2(
                Bucket=bucket_name,
                Prefix=prefix
            )

            print(f"\nFiles in bucket '{bucket_name}' with prefix '{prefix}':")
            if 'Contents' in response:
                for obj in response['Contents']:
                    print(f"- {obj['Key']} ({obj['Size']} bytes)")
                return [obj['Key'] for obj in response['Contents']]
            else:
                print("No files found")
                return []

        except ClientError as e:
            print(f"Error listing files: {str(e)}")
            return []

    def delete_all_files(self, bucket_name, prefix=""):
        """
        Delete all files in the bucket with given prefix

        Args:
            bucket_name: Name of the S3 bucket
            prefix: Optional prefix to delete only files under this path

        Returns:
            tuple: (number of deleted files, list of any files that failed to delete)
        """
        deleted_count = 0
        failed_deletes = []

        try:
            # List all objects in the bucket
            while True:
                # Get batch of objects
                response = self.s3_client.list_objects_v2(
                    Bucket=bucket_name,
                    Prefix=prefix
                )

                # If bucket is empty
                if 'Contents' not in response:
                    print(f"No files found in bucket '{bucket_name}' with prefix '{prefix}'")
                    break

                # Prepare objects for deletion
                objects_to_delete = [{'Key': obj['Key']} for obj in response['Contents']]

                # Delete the batch of objects
                delete_response = self.s3_client.delete_objects(
                    Bucket=bucket_name,
                    Delete={
                        'Objects': objects_to_delete,
                        'Quiet': False
                    }
                )

                # Count successful deletes
                if 'Deleted' in delete_response:
                    deleted_count += len(delete_response['Deleted'])
                    for obj in delete_response['Deleted']:
                        print(f"Deleted: {obj['Key']}")

                # Track failed deletes
                if 'Errors' in delete_response:
                    for error in delete_response['Errors']:
                        failed_deletes.append(error['Key'])
                        print(f"Failed to delete {error['Key']}: {error['Message']}")

                # Check if there are more objects to delete
                if not response.get('IsTruncated'):  # No more files
                    break

            print(f"\nSuccessfully deleted {deleted_count} files")
            if failed_deletes:
                print(f"Failed to delete {len(failed_deletes)} files")

            return deleted_count, failed_deletes

        except ClientError as e:
            print(f"Error deleting files: {str(e)}")
            return 0, []

    def upload_string_to_s3(self, string_data, bucket_name, s3_key):
        """
        Upload a string as an object to S3.

        Args:
            string_data (str): The string content to upload.
            bucket_name (str): The S3 bucket name.
            s3_key (str): The S3 key (path) to save the file.
        """
        try:
            # Convert string data to bytes
            self.s3_client.put_object(Body=string_data, Bucket=bucket_name, Key=s3_key)
            print(f"Uploaded string to s3://{bucket_name}/{s3_key}")
        except (NoCredentialsError, ClientError) as e:
            print(f"Failed to upload string data: {e}")
            raise

    def download_string_from_s3(self, bucket_name, s3_key):
        """
        Download a string object from S3 and return it.

        Args:
            bucket_name (str): The S3 bucket name.
            s3_key (str): The S3 key (path) to the object.

        Returns:
            str: The content of the object as a string.
        """
        try:
            response = self.s3_client.get_object(Bucket=bucket_name, Key=s3_key)
            content = response['Body'].read().decode('utf-8')
            print(f"Downloaded content from s3://{bucket_name}/{s3_key}")
            return content
        except (NoCredentialsError, ClientError) as e:
            print(f"Failed to download string data: {e}")
            raise

    def download_pdf_by_article_id(self, article_id, metadata, bucket_name, local_download_dir):
        """
        Download a specific PDF from S3 by article ID.

        Args:
            article_id (str): The PMC article ID to download (e.g., "PMC1464409").
            metadata (list): List of metadata records.
            bucket_name (str): Name of the S3 bucket containing the files.
        """
        # Search for the article in the metadata
        record = next((item for item in metadata if item["pmc_id"] == article_id), None)

        if not record:
            print(f"Article ID {article_id} not found in metadata.")
            return

        pdf_s3_path = record.get("pdf_s3_path")

        # Extract the S3 key from the S3 path
        s3_key = pdf_s3_path.replace(f"s3://{bucket_name}/", "")

        # Define the local file path
        local_pdf_path = os.path.join(local_download_dir, f"{article_id}.pdf")

        print(f"Downloading {article_id} from S3: {pdf_s3_path} to {local_pdf_path}")

        # Download the file
        try:
            self.s3_client.download_file(bucket_name, s3_key, local_pdf_path)
            print(f"Downloaded {article_id} to {local_pdf_path}")
        except Exception as e:
            print(f"Failed to download {article_id}: {e}")


if __name__ == "__main__":

    s3 = S3Handler()
    s3.list_files(bucket_name=settings.AWS_BUCKET_NAME)
    # from botocore.config import Config

    # Create custom configuration
    # config = Config(
    #     region_name='me-south-1'
    # )

    # import boto3
    # from botocore.exceptions import ClientError

    # # Initialize the S3 client (load credentials from the environment; never hard-code keys)
    # s3_client = boto3.client('s3', aws_access_key_id='<AWS_ACCESS_KEY_ID>', aws_secret_access_key='<AWS_SECRET_ACCESS_KEY>',
    #                          config=config)
    # # Test function
    # def test_connection():
    #     try:
    #         # Try to list objects
    #         response = s3_client.list_objects_v2(
    #             Bucket='sehas3.bucket1'
    #         )
    #         try:
    #             # (note: .Bucket() exists on boto3.resource('s3'), not on a client)
    #             bucket = s3_client.Bucket('sehas3.bucket1')
    #             for obj in bucket.objects.all():
    #                 print(obj.key)
    #             print("Success!")
    #         except ClientError as e:
    #             print(f"Error: {e}")
    #     except ClientError as e:
    #         print(f"Error: {str(e)}")

    # test_connection()
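A usage sketch for S3Handler covering the directory upload, listing and string round-trip paths. The local folder and the healthcheck key are hypothetical, though the pubmed_pdfs/ and pubmed_metadata/ prefixes match the naming used elsewhere in this repo:

    from core.s3_utils import S3Handler
    from config import get_settings

    settings = get_settings()
    s3 = S3Handler()

    # Mirror a local folder into the bucket, then verify what landed there
    uploaded, errors = s3.upload_directory("pubmed_pdfs", settings.AWS_BUCKET_NAME, s3_prefix="pubmed_pdfs")
    keys = s3.list_files(settings.AWS_BUCKET_NAME, prefix="pubmed_pdfs/")

    # Round-trip a small JSON payload as a string object (hypothetical key)
    s3.upload_string_to_s3('{"status": "ok"}', settings.AWS_BUCKET_NAME, "pubmed_metadata/healthcheck.json")
    print(s3.download_string_from_s3(settings.AWS_BUCKET_NAME, "pubmed_metadata/healthcheck.json"))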
core/voice_processor.py
ADDED
@@ -0,0 +1,73 @@
# audio_handler.py
import streamlit as st
import wave
import pyaudio
import tempfile
import os
from datetime import datetime
import openai


class AudioRecorder:
    def __init__(self, openai_api_key):
        self.audio = pyaudio.PyAudio()
        self.frames = []
        self.recording = False
        self.stream = None
        self.openai_client = openai.OpenAI(api_key=openai_api_key)

    def start_recording(self):
        self.recording = True
        self.frames = []
        self.stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=44100,
            input=True,
            frames_per_buffer=1024,
            stream_callback=self._callback
        )
        self.stream.start_stream()

    def _callback(self, in_data, frame_count, time_info, status):
        if self.recording:
            self.frames.append(in_data)
        return (in_data, pyaudio.paContinue)

    def stop_recording(self):
        if self.stream:
            self.recording = False
            self.stream.stop_stream()
            self.stream.close()

            # Create temporary file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            audio_file = os.path.join(tempfile.gettempdir(), f"audio_{timestamp}.wav")

            # Save audio file
            with wave.open(audio_file, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(self.audio.get_sample_size(pyaudio.paInt16))
                wf.setframerate(44100)
                wf.writeframes(b''.join(self.frames))

            return audio_file
        return None

    def transcribe_audio(self, audio_file_path):
        """Transcribe audio file using OpenAI Whisper API"""
        try:
            with open(audio_file_path, "rb") as audio_file:
                transcript = self.openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language="en",
                    response_format="text"
                )
            return transcript
        except Exception as e:
            st.error(f"Transcription error: {str(e)}")
            return None

    def __del__(self):
        self.audio.terminate()
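The recorder is normally driven from the Streamlit app, but it can be exercised directly; note that transcribe_audio reports failures through st.error, so outside Streamlit errors surface as console warnings. A sketch, with an arbitrary five-second capture:

    import time
    from core.voice_processor import AudioRecorder
    from config import get_settings

    settings = get_settings()
    recorder = AudioRecorder(settings.OPENAI_API_KEY)

    recorder.start_recording()
    time.sleep(5)                      # capture ~5 seconds of microphone input
    wav_path = recorder.stop_recording()

    if wav_path:
        print("Transcript:", recorder.transcribe_audio(wav_path))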
database/__init__.py
ADDED
File without changes
|
database/vector_store.py
ADDED
File without changes
|
docker/Dockerfile
ADDED
@@ -0,0 +1,32 @@
# Dockerfile
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
# The slim image runs as root and ships without sudo, so call apt-get directly;
# the PortAudio packages are needed to build pyaudio
RUN apt-get update && apt-get install -y libportaudio2 libportaudiocpp0 portaudio19-dev python3-dev build-essential && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

# docker-compose.yml: this block belongs in docker/docker-compose.yaml,
# kept commented here so the Dockerfile stays parseable
# version: '3.8'
#
# services:
#   api:
#     build:
#       context: .
#       dockerfile: docker/Dockerfile
#     ports:
#       - "8000:8000"
#     env_file:
#       - .env
#     volumes:
#       - .:/app
#     restart: unless-stopped
docker/docker-compose.yaml
ADDED
File without changes
|
main.py
ADDED
@@ -0,0 +1,20 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from api import routes          # api/ and config.py live at the repo root; there is no app/ package
from config import get_settings

settings = get_settings()

app = FastAPI(
    title=settings.PROJECT_NAME
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(routes.router, prefix=settings.API_V1_STR)
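To serve this app outside Docker, a small hypothetical helper equivalent to the image's CMD:

    # run_local.py (hypothetical helper)
    import uvicorn

    if __name__ == "__main__":
        uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)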
qa_app.py
ADDED
@@ -0,0 +1,147 @@
# app.py
from core.pineconeqa import PineconeQA
from core.voice_processor import AudioRecorder
import streamlit as st
from config import get_settings
import os

def initialize_qa():
    """Initialize QA system with API keys from settings"""
    try:
        settings = get_settings()
        return PineconeQA(
            pinecone_api_key=settings.PINECONE_API_KEY,
            openai_api_key=settings.OPENAI_API_KEY,
            index_name=settings.INDEX_NAME
        )
    except Exception as e:
        st.error(f"Error initializing QA system: {str(e)}")
        return None

def handle_audio_input():
    """Handle audio recording, saving, and transcription"""
    settings = get_settings()

    if 'audio_recorder' not in st.session_state:
        st.session_state.audio_recorder = AudioRecorder(settings.OPENAI_API_KEY)
    if 'transcribed_text' not in st.session_state:
        st.session_state.transcribed_text = ""

    col1, col2 = st.columns([1, 1])

    with col1:
        if st.button("🎤 Record",
                     help="Click to start recording",
                     type="primary" if not st.session_state.get('recording', False) else "secondary"):
            st.session_state.audio_recorder.start_recording()
            st.session_state.recording = True
            st.info("Recording... Click 'Stop' when finished.")

    with col2:
        if st.button("⏹️ Stop",
                     help="Click to stop recording",
                     disabled=not st.session_state.get('recording', False)):
            audio_file = st.session_state.audio_recorder.stop_recording()
            st.session_state.recording = False

            if audio_file:
                with st.spinner("Transcribing audio..."):
                    # Transcribe audio
                    transcription = st.session_state.audio_recorder.transcribe_audio(audio_file)
                    if transcription:
                        # Store transcription in session state
                        st.session_state.transcribed_text = transcription
                        st.success("Audio transcribed successfully!")
                        # Play audio for verification
                        st.audio(audio_file)
                    else:
                        st.error("Failed to transcribe audio.")

def process_question(question):
    st.session_state.chat_history = []
    with st.spinner("Thinking..."):
        response = st.session_state.qa_system.ask(question)

        if "error" in response:
            st.error(f"Error: {response['error']}")
        else:
            # Display answer
            st.markdown("### Answer:")
            st.write(response["answer"])

            # Display sources
            with st.expander("View Sources"):
                for i, doc in enumerate(response["context"], 1):
                    st.markdown(f"**Source {i}:**")
                    st.write(f"Content: {doc.page_content}")
                    st.write(f"Source: {doc.metadata.get('source', 'Unknown')}")
                    st.write(f"Title: {doc.metadata.get('title', 'Unknown')}")
                    st.write(f"Keywords: {doc.metadata.get('keywords', 'N/A')}")
                    st.write(f"Publication Date: {doc.metadata.get('publication_date', 'Unknown')}")
                    st.markdown("---")

            # Add to chat history
            # st.session_state.chat_history.append((question, response["answer"]))

def main():
    st.title("Scientific Paper Q&A System")

    # Initialize session state variables
    if 'qa_system' not in st.session_state:
        st.session_state.qa_system = initialize_qa()
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []
    if 'recording' not in st.session_state:
        st.session_state.recording = False
    if 'transcribed_text' not in st.session_state:
        st.session_state.transcribed_text = ""

    # Main chat interface
    if st.session_state.qa_system:
        # Chat container
        chat_container = st.container()

        with chat_container:
            # Display chat history
            for i, (question, answer) in enumerate(st.session_state.chat_history):
                st.markdown(f"**You:** {question}")
                st.markdown(f"**Assistant:** {answer}")
                st.markdown("---")

        # Input container at the bottom
        with st.container():
            st.markdown("### Ask a Question")

            # Text input area with transcribed text as default
            question = st.text_area(
                "Type your question or use voice input:",
                value=st.session_state.transcribed_text,
                height=100,
                key="question_input",
                label_visibility="collapsed"
            )

            # Audio recording interface
            handle_audio_input()

            # Ask button
            if st.button("Send Question", type="primary"):
                if question:
                    process_question(question)
                    # Clear transcribed text after sending
                    st.session_state.transcribed_text = ""
                else:
                    st.warning("Please enter a question or record your voice.")

            # Clear chat button
            if st.button("Clear Chat"):
                st.session_state.chat_history = []
                st.session_state.transcribed_text = ""
                st.rerun()

    else:
        st.error("Could not initialize QA system. Please check your environment variables.")


if __name__ == "__main__":
    main()
rag_existing_index.ipynb
ADDED
@@ -0,0 +1,191 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 4,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"/Users/larawehbe/Documents/fakkerai/sehatech/pinecone-env/lib/python3.11/site-packages/pinecone/data/index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
13 |
+
" from tqdm.autonotebook import tqdm\n",
|
14 |
+
"/Users/larawehbe/Documents/fakkerai/sehatech/pinecone-env/lib/python3.11/site-packages/langchain_openai/embeddings/base.py:281: UserWarning: WARNING! return_source_documents is not default parameter.\n",
|
15 |
+
" return_source_documents was transferred to model_kwargs.\n",
|
16 |
+
" Please confirm that return_source_documents is what you intended.\n",
|
17 |
+
" warnings.warn(\n"
|
18 |
+
]
|
19 |
+
}
|
20 |
+
],
|
21 |
+
"source": [
|
22 |
+
"from langchain.vectorstores import Pinecone\n",
|
23 |
+
"from langchain_openai import OpenAIEmbeddings\n",
|
24 |
+
"import pinecone\n",
|
25 |
+
"from langchain_pinecone import PineconeVectorStore\n",
|
26 |
+
"from config import get_settings\n",
|
27 |
+
"settings = get_settings()\n",
|
28 |
+
"# Initialize Pinecone\n",
|
29 |
+
"pc = pinecone.Pinecone(settings.PINECONE_API_KEY)\n",
|
30 |
+
"import os \n",
|
31 |
+
"\n",
|
32 |
+
"os.environ['PINECONE_API_KEY'] = settings.PINECONE_API_KEY\n",
|
33 |
+
"\n",
|
34 |
+
"# Connect to the existing index\n",
|
35 |
+
"index_name = settings.INDEX_NAME # Replace with your index name\n",
|
36 |
+
"index = pc.Index(index_name)\n",
|
37 |
+
"\n",
|
38 |
+
"# Initialize embeddings (ensure your embedding logic matches the one used during indexing)\n",
|
39 |
+
"embeddings = OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY, return_source_documents=True)\n",
|
40 |
+
"\n",
|
41 |
+
"# Create the retriever from Pinecone\n",
|
42 |
+
"retriever = PineconeVectorStore(index=index, embedding=OpenAIEmbeddings())\n"
|
43 |
+
]
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"cell_type": "code",
|
47 |
+
"execution_count": 5,
|
48 |
+
"metadata": {},
|
49 |
+
"outputs": [],
|
50 |
+
"source": [
|
51 |
+
"import os \n",
|
52 |
+
"# Define the LLM\n",
|
53 |
+
"os.environ[\"OPENAI_API_KEY\"] = settings.OPENAI_API_KEY\n",
|
54 |
+
"\n",
|
55 |
+
"from langchain_openai import ChatOpenAI\n",
|
56 |
+
"\n",
|
57 |
+
"llm = ChatOpenAI(model=\"gpt-4o\")"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"execution_count": 1,
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [
|
65 |
+
{
|
66 |
+
"ename": "NameError",
|
67 |
+
"evalue": "name 'llm' is not defined",
|
68 |
+
"output_type": "error",
|
69 |
+
"traceback": [
|
70 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
71 |
+
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
72 |
+
"Cell \u001b[0;32mIn[1], line 25\u001b[0m\n\u001b[1;32m 17\u001b[0m prompt \u001b[38;5;241m=\u001b[39m ChatPromptTemplate\u001b[38;5;241m.\u001b[39mfrom_messages(\n\u001b[1;32m 18\u001b[0m [\n\u001b[1;32m 19\u001b[0m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msystem\u001b[39m\u001b[38;5;124m\"\u001b[39m, system_prompt),\n\u001b[1;32m 20\u001b[0m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhuman\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{input}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 21\u001b[0m ]\n\u001b[1;32m 22\u001b[0m )\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Use your LLM instance\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m question_answer_chain \u001b[38;5;241m=\u001b[39m create_stuff_documents_chain(\u001b[43mllm\u001b[49m, prompt)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Create the RAG chain\u001b[39;00m\n\u001b[1;32m 28\u001b[0m rag_chain \u001b[38;5;241m=\u001b[39m create_retrieval_chain(retriever\u001b[38;5;241m.\u001b[39mas_retriever(), question_answer_chain)\n",
|
73 |
+
"\u001b[0;31mNameError\u001b[0m: name 'llm' is not defined"
|
74 |
+
]
|
75 |
+
}
|
76 |
+
],
|
77 |
+
"source": [
|
78 |
+
"from langchain.chains import create_retrieval_chain\n",
|
79 |
+
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
|
80 |
+
"from langchain_core.prompts import ChatPromptTemplate\n",
|
81 |
+
"\n",
|
82 |
+
"# Define the system prompt\n",
|
83 |
+
"system_prompt = (\n",
|
84 |
+
" \"You are an assistant for question-answering tasks. \"\n",
|
85 |
+
" \"Use the following pieces of retrieved context to answer \"\n",
|
86 |
+
" \"the question. If you don't know the answer, say that you \"\n",
|
87 |
+
" \"don't know. Use three sentences maximum and keep the \"\n",
|
88 |
+
" \"answer concise.\"\n",
|
89 |
+
" \"\\n\\n\"\n",
|
90 |
+
" \"{context}\"\n",
|
91 |
+
")\n",
|
92 |
+
"\n",
|
93 |
+
"# Create the chat prompt template\n",
|
94 |
+
"prompt = ChatPromptTemplate.from_messages(\n",
|
95 |
+
" [\n",
|
96 |
+
" (\"system\", system_prompt),\n",
|
97 |
+
" (\"human\", \"{input}\"),\n",
|
98 |
+
" ]\n",
|
99 |
+
")\n",
|
100 |
+
"\n",
|
101 |
+
"# Use your LLM instance\n",
|
102 |
+
"question_answer_chain = create_stuff_documents_chain(llm, prompt)\n",
|
103 |
+
"\n",
|
104 |
+
"# Create the RAG chain\n",
|
105 |
+
"rag_chain = create_retrieval_chain(retriever.as_retriever(), question_answer_chain)\n",
|
106 |
+
"\n",
|
107 |
+
"# Invoke the RAG chain with a sample question\n",
|
108 |
+
"results = rag_chain.invoke({\"input\": \"pubmed_pdfs/PMC1474056.pdf\"})\n",
|
109 |
+
"print(results)\n"
|
110 |
+
]
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"cell_type": "code",
|
114 |
+
"execution_count": null,
|
115 |
+
"metadata": {},
|
116 |
+
"outputs": [
|
117 |
+
{
|
118 |
+
"name": "stdout",
|
119 |
+
"output_type": "stream",
|
120 |
+
"text": [
|
121 |
+
"{'input': 'What are the benefits of using electronic health records for patient care??', 'context': [Document(id='chunk_100_63', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='The use of sub-cellular localization data and functional\\nannotation as filters for the predictions increased their overlap\\nwith experimental complexes, as compared with the unfil-\\ntered predictions. This finding is in agreement with previousobservations that combining multiple sources of information\\nimproves the accuracy of function annotation as well as inter-\\naction prediction (9–11). Our method easily allows for theuse of additional biological filters when other types of\\ndata are available, such as synthetic gene lethality (36),\\nco-expression (37), and so on. This incremental addition oforthogonal information is also necessary to more accuratelyrepresent the conditions in the cellular milieu, where the\\npropensity of two protein structures to interact is not limited\\nonly by the physical chemistry of the interaction, but also byhigher levels of biological regulation, including compartmen-\\ntalization, expression, degradation, abundance and so on.'), Document(id='chunk_100_80', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='23. Ghaemmaghami,S., Huh,W.K., Bower,K., Howson,R.W., Belle,A.,\\nDephoure,N., O’Shea,E.K. and Weissman,J.S. (2003) Global analysis\\nof protein expression in yeast. Nature ,425, 737–741.\\n24. Dwight,S.S., Harris,M.A., Dolinski,K., Ball,C.A., Binkley,G.,\\nChristie,K.R., Fisk,D.G., Issel-Tarver,L., Schroeder,M., Sherlock,G.et al. (2002) Saccharomyces Genome Database (SGD) provides\\nsecondary gene annotation using the gene ontology (GO). Nucleic\\nAcids Res. ,30, 69–72.\\n25. Fawcett,T. (2003) ROC graphs: notes and practical considerations for\\ndata mining researchers. Technical Report HPL-2003-4, HP Labs,Palo Alto, CA, USA.\\n26. Pieper,U., Eswar,N., Davis,F.P., Braberg,H., Madhusudhan,M.S.,\\nRossi,A., Marti-Renom,M., Karchin,R., Webb,B.M., Eranian,D. et al.\\n(2006) MODBASE: a database of annotated comparative proteinstructure models and associated resources. Nucleic Acids Res. ,34,\\nD291–D295.\\n27. Eswar,N., John,B., Mirkovic,N., Fiser,A., Ilyin,V.A., Pieper,U.,'), Document(id='chunk_100_73', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='used to help bridge the resolution gap between electroncryo-microscopy (cryo-EM) density maps and atomic protein\\nstructures (41). Fitting of protein and protein domain models\\ninto density maps of large assemblies is already common, butdepending on the resolution, the information encoded in the\\nmap is often insufficient for an unambiguous determination\\nof the positions and orientations of the individual proteins(42). Models of the complexes predicted here may provideadditional restraints for a more accurate fitting of proteins\\ninto large complexes studied by cryo-EM and electron\\ncryo-tomography (14,43).\\nAs the number and size of experimentally determined\\nstructures of protein complexes increase, the number of\\ncomplexes that can be predicted and modeled using thesestructures as templates increases correspondingly, expanding\\nthe structural coverage of protein interaction space (44).'), Document(id='chunk_100_84', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='Chung,S., Vidal,M. and Gerstein,M. (2004) Annotation transfer\\nbetween genomes: protein–protein interologs and protein–DNA\\nregulogs. 
Genome Res. ,14, 1107–1118.\\n39. Bornberg-Bauer,E., Beaussart,F., Kummerfeld,S.K., Teichmann,S.A.\\nand Weiner,J.,III (2005) The evolution of domain arrangementsin proteins and interaction networks. Cell Mol. Life Sci. ,62,\\n435–445.\\n40. Han,J.D., Bertin,N., Hao,T., Goldberg,D.S., Berriz,G.F., Zhang,L.V.,\\nDupuy,D., Walhout,A.J., Cusick,M.E., Roth,F.P. et al. (2004) Evidence\\nfor dynamically organized modularity in the yeast protein-proteininteraction network. Nature ,430, 88–93.\\n41. Topf,M. and Sali,A. (2005) Combining electron microscopy and\\ncomparative protein structure modeling. Curr. Opin. Struct. Biol. ,15,\\n578–585.\\n42. Fabiola,F. and Chapman,M.S. (2005) Fitting of high-resolution\\nstructures into electron microscopy reconstruction images. Structure ,\\n13, 389–400.\\n43. Sali,A., Glaeser,R., Earnest,T. and Baumeister,W. (2003) From words')], 'answer': 'The benefits of using electronic health records (EHRs) for patient care include improved accuracy and accessibility of patient information, which enhances coordination and reduces errors in treatment. EHRs facilitate better communication among healthcare providers, leading to more informed decision-making and streamlined care processes. Additionally, EHRs support data analysis for improved healthcare outcomes and can enhance patient engagement by providing them access to their health information.'}\n"
|
122 |
+
]
|
123 |
+
}
|
124 |
+
],
|
125 |
+
"source": [
|
126 |
+
"results = rag_chain.invoke({\"input\": \"What are the benefits of using electronic health records for patient care??\"})\n",
|
127 |
+
"\n",
|
128 |
+
"# Extract the answer and the source documents\n",
|
129 |
+
"print(results)"
|
130 |
+
]
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"cell_type": "code",
|
134 |
+
"execution_count": null,
|
135 |
+
"metadata": {},
|
136 |
+
"outputs": [
|
137 |
+
{
|
138 |
+
"data": {
|
139 |
+
"text/plain": [
|
140 |
+
"{'input': 'How does the algorithm perform when multiple templates for binding modes are available?',\n",
|
141 |
+
" 'context': [Document(id='chunk_100_67', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='The ability of the algorithm to choose the correct binding\\nmode when multiple templates are available was illustrated\\nby evaluation of three alternative binding modes that have\\nbeen structurally characterized between porcine pancreatica-amylase and camelid VHH domains (Figure 5). The\\nalgorithm successfully chose the native binding mode for\\nall three VHH domains. In addition, the statistical potentialscores that were computed for the native binding modesexhibit the same rank order as the affinity of the interactions\\nmeasured by total internal reflectance (33).\\nHowever, this example is also cautionary in that each VHH\\ndomain had one non-native mode that scored below the\\noptimal Z-score threshold, though only the native modes\\nproduced negative raw scores (Results). In a large-scalepredictive setting, if the native binding mode was not\\navailable as a template, the VHH domain would have been'),\n",
|
142 |
+
" Document(id='chunk_100_39', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='how many of the predicted complexes were equivalent to,or were subcomplexes of, experimentally determined com-\\nplexes. Since the predictions are based on known structures,\\nthe sizes of the predicted complexes are far smaller than thoseobtained by biochemical methods such as tandem affinitypurification methods. For this reason, we elected not to use\\na metric that explicitly penalizes size differences [e.g. the\\nmetric defined in Ref. (16)].\\nBinding mode selection\\nThe ability of the potential to select the proper binding mode\\nwhen multiple template interfaces of different orientation are\\navailable was assessed. The test cases used were the struc-\\ntures of camelid VHH domains AMB7, AMD10 and AMD9bound to porcine pancreatic a-amylase (PPA) (PDB codes\\n1kxt, 1kxv and 1kxq, respectively). All three modes were\\nevaluated for each VHH–PPA complex using the interfacestatistical potential.\\nData sources\\nThe prediction algorithm uses three types of data: (i) target'),\n",
|
143 |
+
" Document(id='chunk_100_68', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='produced negative raw scores (Results). In a large-scalepredictive setting, if the native binding mode was not\\navailable as a template, the VHH domain would have been\\npredicted to interact with PPA, but through an incorrectbinding mode. This example illustrates a connection between\\nthe observed performance and the underlying scoring scheme.\\nHowever, a systematic analysis of alternative binding modesin protein interactions, and the ability of our method to dis-tinguish them, remains a useful goal for the future.2950 Nucleic Acids Research, 2006, Vol. 34, No. 10Network specificities\\nA more difficult test of the method is the prediction of\\nspecificities within interaction networks between homologous\\nproteins. To address this problem, the method was applied to\\npredict the specificities within the epidermal growth factorreceptor (EGFR) and tumor necrosis factor b(TNFb) net-\\nworks of ligand receptor interactions (data not shown). In\\nboth networks the method failed to recapitulate known bind-'),\n",
|
144 |
+
" Document(id='chunk_100_56', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='structural assessment, PSI-BLAST (32) was used to\\npredict binary interactions by detecting similarities between\\nS.cerevisiae proteins and the template complexes. An overlap\\nof 929 binary interactions was observed between the set of\\n36 790 (2.5%) predictions and the 19 424 (4.8%) experimen-\\ntally observed binary interactions.\\nAlternate binding modes\\nThe ability of the algorithm to correctly select the native\\nbinding mode when alternate templates are available wastested. The native binding mode was correctly selected for\\nall three VHH domains interacting with porcine pancreatic\\na-amylase (Figure 5). In addition, the statistical potential\\nscores that were computed for the native binding modesexhibit the same rank-order as the affinity measured experi-\\nmentally by total internal reflectance (33).\\nCo-complexed domains\\nAn extension process merged predicted complexes containing\\ndifferent domains of a single target protein (Figure 1c). This')],\n",
|
145 |
+
" 'answer': 'The algorithm successfully chooses the native binding mode for all three VHH domains interacting with porcine pancreatic a-amylase when multiple templates are available. The statistical potential scores for the native binding modes exhibit the same rank order as the experimentally measured affinity. However, each VHH domain had one non-native mode that scored below the optimal Z-score threshold, indicating some caution is necessary in large-scale predictive settings.'}"
|
146 |
+
]
|
147 |
+
},
|
148 |
+
"execution_count": 9,
|
149 |
+
"metadata": {},
|
150 |
+
"output_type": "execute_result"
|
151 |
+
}
|
152 |
+
],
|
153 |
+
"source": [
|
154 |
+
"results = rag_chain.invoke({\"input\": \"How does the algorithm perform when multiple templates for binding modes are available?\"})\n",
|
155 |
+
"\n",
|
156 |
+
"\n",
|
157 |
+
"# Ensure the output includes the answer and source documents\n",
|
158 |
+
"\n",
|
159 |
+
"results"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "code",
|
164 |
+
"execution_count": null,
|
165 |
+
"metadata": {},
|
166 |
+
"outputs": [],
|
167 |
+
"source": []
|
168 |
+
}
|
169 |
+
],
|
170 |
+
"metadata": {
|
171 |
+
"kernelspec": {
|
172 |
+
"display_name": "pinecone-env",
|
173 |
+
"language": "python",
|
174 |
+
"name": "python3"
|
175 |
+
},
|
176 |
+
"language_info": {
|
177 |
+
"codemirror_mode": {
|
178 |
+
"name": "ipython",
|
179 |
+
"version": 3
|
180 |
+
},
|
181 |
+
"file_extension": ".py",
|
182 |
+
"mimetype": "text/x-python",
|
183 |
+
"name": "python",
|
184 |
+
"nbconvert_exporter": "python",
|
185 |
+
"pygments_lexer": "ipython3",
|
186 |
+
"version": "3.11.10"
|
187 |
+
}
|
188 |
+
},
|
189 |
+
"nbformat": 4,
|
190 |
+
"nbformat_minor": 2
|
191 |
+
}
|
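The NameError recorded above comes from running the chain cell before the cell that defines llm; executed top to bottom the notebook works. For reference, the same flow condensed into a plain script (the system prompt is abbreviated here, not the notebook's exact text):

    import os
    import pinecone
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    from langchain_pinecone import PineconeVectorStore
    from langchain.chains import create_retrieval_chain
    from langchain.chains.combine_documents import create_stuff_documents_chain
    from langchain_core.prompts import ChatPromptTemplate
    from config import get_settings

    settings = get_settings()
    os.environ["PINECONE_API_KEY"] = settings.PINECONE_API_KEY
    os.environ["OPENAI_API_KEY"] = settings.OPENAI_API_KEY

    # Connect to the existing index and wrap it as a LangChain vector store
    pc = pinecone.Pinecone(settings.PINECONE_API_KEY)
    retriever = PineconeVectorStore(index=pc.Index(settings.INDEX_NAME), embedding=OpenAIEmbeddings())

    llm = ChatOpenAI(model="gpt-4o")
    prompt = ChatPromptTemplate.from_messages([
        ("system", "Use the retrieved context to answer concisely.\n\n{context}"),
        ("human", "{input}"),
    ])
    rag_chain = create_retrieval_chain(retriever.as_retriever(), create_stuff_documents_chain(llm, prompt))

    results = rag_chain.invoke({"input": "How does the algorithm perform when multiple templates are available?"})
    print(results["answer"])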
requirements.txt
ADDED
@@ -0,0 +1,12 @@
PyMuPDF              # provides the `fitz` module; the PyPI package named `fitz` is a different project
pinecone
openai
fastapi
uvicorn              # used by the Dockerfile CMD
pypdf
pypdf2
langchain            # langchain.text_splitter / langchain.chains are imported directly
langchain_community
langchain-openai
langchain-pinecone
boto3
pyaudio
streamlit            # qa_app.py and core/voice_processor.py
python-dotenv        # core/rag_engine.py calls load_dotenv()
tqdm                 # progress bars during upserts
gradio
sandbox_testing.ipynb
ADDED
@@ -0,0 +1,1073 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 60,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"from core.s3_utils import S3Handler"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 61,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"s3_handler = S3Handler()\n",
|
19 |
+
"from config import get_settings\n",
|
20 |
+
"settings = get_settings()"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 62,
|
26 |
+
"metadata": {},
|
27 |
+
"outputs": [
|
28 |
+
{
|
29 |
+
"name": "stdout",
|
30 |
+
"output_type": "stream",
|
31 |
+
"text": [
|
32 |
+
"\n",
|
33 |
+
"Files in bucket 'sehas3.bucket1' with prefix '':\n",
|
34 |
+
"No files found\n"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"data": {
|
39 |
+
"text/plain": [
|
40 |
+
"[]"
|
41 |
+
]
|
42 |
+
},
|
43 |
+
"execution_count": 62,
|
44 |
+
"metadata": {},
|
45 |
+
"output_type": "execute_result"
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"source": [
|
49 |
+
"s3_handler.list_files(settings.AWS_BUCKET_NAME)"
|
50 |
+
]
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"cell_type": "code",
|
54 |
+
"execution_count": 63,
|
55 |
+
"metadata": {},
|
56 |
+
"outputs": [
|
57 |
+
{
|
58 |
+
"name": "stdout",
|
59 |
+
"output_type": "stream",
|
60 |
+
"text": [
|
61 |
+
"Failed to download string data: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.\n"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"ename": "NoSuchKey",
|
66 |
+
"evalue": "An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.",
|
67 |
+
"output_type": "error",
|
68 |
+
"traceback": [
|
69 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
70 |
+
"\u001b[0;31mNoSuchKey\u001b[0m Traceback (most recent call last)",
|
71 |
+
"Cell \u001b[0;32mIn[63], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mjson\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m metadata_content \u001b[38;5;241m=\u001b[39m \u001b[43ms3_handler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_string_from_s3\u001b[49m\u001b[43m(\u001b[49m\u001b[43msettings\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mAWS_BUCKET_NAME\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpubmed_metadata/metadata.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m metadata_content \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mloads(metadata_content)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mlen\u001b[39m(metadata_content)\n",
|
72 |
+
"File \u001b[0;32m~/Documents/fakkerai/sehatech/core/s3_utils.py:192\u001b[0m, in \u001b[0;36mS3Handler.download_string_from_s3\u001b[0;34m(self, bucket_name, s3_key)\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124;03mDownload a string object from S3 and return it.\u001b[39;00m\n\u001b[1;32m 183\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[38;5;124;03m str: The content of the object as a string.\u001b[39;00m\n\u001b[1;32m 190\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 191\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 192\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43ms3_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBucket\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbucket_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mKey\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43ms3_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 193\u001b[0m content \u001b[38;5;241m=\u001b[39m response[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBody\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mread()\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDownloaded content from s3://\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbucket_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00ms3_key\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
|
73 |
+
"File \u001b[0;32m~/Documents/fakkerai/sehatech/pinecone-env/lib/python3.11/site-packages/botocore/client.py:569\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.<locals>._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 565\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 566\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpy_operation_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m() only accepts keyword arguments.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 567\u001b[0m )\n\u001b[1;32m 568\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 569\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
74 |
+
"File \u001b[0;32m~/Documents/fakkerai/sehatech/pinecone-env/lib/python3.11/site-packages/botocore/client.py:1023\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 1019\u001b[0m error_code \u001b[38;5;241m=\u001b[39m error_info\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mQueryErrorCode\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m error_info\u001b[38;5;241m.\u001b[39mget(\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCode\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1021\u001b[0m )\n\u001b[1;32m 1022\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m-> 1023\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 1024\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1025\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parsed_response\n",
|
75 |
+
"\u001b[0;31mNoSuchKey\u001b[0m: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist."
|
76 |
+
]
|
77 |
+
}
|
78 |
+
],
|
79 |
+
"source": [
|
80 |
+
"import json\n",
|
81 |
+
"metadata_content = s3_handler.download_string_from_s3(settings.AWS_BUCKET_NAME, \"pubmed_metadata/metadata.json\")\n",
|
82 |
+
"metadata_content = json.loads(metadata_content)\n",
|
83 |
+
"\n",
|
84 |
+
"len(metadata_content)\n"
|
85 |
+
]
|
86 |
+
},
|
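The cell above fails because pubmed_metadata/metadata.json was never uploaded (the bucket is emptied further down). A minimal sketch of a guarded download with plain boto3 -- safe_download_string is a hypothetical helper, not the S3Handler method from core/s3_utils.py:

import boto3
from botocore.exceptions import ClientError

s3 = boto3.client("s3")

def safe_download_string(bucket: str, key: str) -> str | None:
    # Return the object body decoded as UTF-8, or None when the key is absent,
    # instead of letting botocore's NoSuchKey escape the notebook cell.
    try:
        body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
        return body.decode("utf-8")
    except ClientError as err:
        if err.response["Error"]["Code"] == "NoSuchKey":
            return None
        raise  # credential/permission errors should still surface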
87 |
+
{
|
88 |
+
"cell_type": "code",
|
89 |
+
"execution_count": 64,
|
90 |
+
"metadata": {},
|
91 |
+
"outputs": [
|
92 |
+
{
|
93 |
+
"data": {
|
94 |
+
"text/plain": [
|
95 |
+
"[{'pmc_id': 'PMC8419487',\n",
|
96 |
+
" 'title': 'No Title Available',\n",
|
97 |
+
" 'abstract': 'ZnS materials have been widely used in fluorescence biosensors to characterize different types of stem cells due to their excellent fluorescence effect. In this study, ZnS was prepared by vulcanizing nano-Zn particles synthesized using a DC arc plasma. The composition and structure of the ZnS materials were studied by X-ray diffraction (XRD), and their functional group information and optical properties were investigated by using IR spectrophotometry and UV-vis spectrophotometry. It has been found that the synthesized materials consist of Zn, cubic ZnS, and hexagonal ZnS according to the vulcanization parameters. Crystalline ZnS was gradually transformed from a cubic to a hexagonal structure, and the cycling properties first increase, then decrease with increasing sulfurization temperature. There is an optimal curing temperature giving the best cycling performance and specific capacity: the material sulfurized thereat mainly consists of cubic ',\n",
|
98 |
+
" 'authors': ['Ren, Y.',\n",
|
99 |
+
" 'Zhou, H.',\n",
|
100 |
+
" 'Wang, X.',\n",
|
101 |
+
" 'Liu, Q. W.',\n",
|
102 |
+
" 'Hou, X. D.',\n",
|
103 |
+
" 'Zhang, G. F.'],\n",
|
104 |
+
" 'keywords': [],\n",
|
105 |
+
" 'doi': 'N/A',\n",
|
106 |
+
" 'source': 'downloaded_pdfs/PMC8419487/PMC8419487/SCI2021-7067146.nxml',\n",
|
107 |
+
" 'publication_date': '2021'}]"
|
108 |
+
]
|
109 |
+
},
|
110 |
+
"execution_count": 64,
|
111 |
+
"metadata": {},
|
112 |
+
"output_type": "execute_result"
|
113 |
+
}
|
114 |
+
],
|
115 |
+
"source": [
|
116 |
+
"metadata_content"
|
117 |
+
]
|
118 |
+
},
|
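The record above is the schema the rest of the pipeline consumes: pmc_id, title, abstract, authors, keywords, doi, source, publication_date. A sketch of a typed container for it, standard library only; the defaults mirror the "No Title Available"/"N/A" placeholders visible in the sample:

from dataclasses import dataclass, field

@dataclass
class PubMedRecord:
    # Field names mirror one metadata.json entry as displayed above.
    pmc_id: str
    title: str = "No Title Available"
    abstract: str = ""
    authors: list[str] = field(default_factory=list)
    keywords: list[str] = field(default_factory=list)
    doi: str = "N/A"
    source: str = ""
    publication_date: str = ""

records = [PubMedRecord(**entry) for entry in metadata_content]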
119 |
+
{
|
120 |
+
"cell_type": "code",
|
121 |
+
"execution_count": 65,
|
122 |
+
"metadata": {},
|
123 |
+
"outputs": [
|
124 |
+
{
|
125 |
+
"name": "stdout",
|
126 |
+
"output_type": "stream",
|
127 |
+
"text": [
|
128 |
+
"No files found in bucket 'sehas3.bucket1' with prefix ''\n",
|
129 |
+
"\n",
|
130 |
+
"Successfully deleted 0 files\n"
|
131 |
+
]
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"data": {
|
135 |
+
"text/plain": [
|
136 |
+
"(0, [])"
|
137 |
+
]
|
138 |
+
},
|
139 |
+
"execution_count": 65,
|
140 |
+
"metadata": {},
|
141 |
+
"output_type": "execute_result"
|
142 |
+
}
|
143 |
+
],
|
144 |
+
"source": [
|
145 |
+
"s3_handler.delete_all_files(settings.AWS_BUCKET_NAME)"
|
146 |
+
]
|
147 |
+
},
|
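delete_all_files reports (0, []) here because the listing was already empty. For reference, a sketch of what such a helper usually does with plain boto3 (the project's real implementation lives in core/s3_utils.py and may differ): paginate the listing, then delete in batches of 1000, the per-request cap of delete_objects:

import boto3

def delete_all_files(bucket: str, prefix: str = "") -> tuple[int, list[str]]:
    s3 = boto3.client("s3")
    # Gather every key under the prefix via the paginator.
    keys = [
        obj["Key"]
        for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
        for obj in page.get("Contents", [])
    ]
    # delete_objects accepts at most 1000 keys per call.
    for i in range(0, len(keys), 1000):
        s3.delete_objects(
            Bucket=bucket,
            Delete={"Objects": [{"Key": k} for k in keys[i : i + 1000]]},
        )
    return len(keys), keys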
148 |
+
{
|
149 |
+
"cell_type": "code",
|
150 |
+
"execution_count": 4,
|
151 |
+
"metadata": {},
|
152 |
+
"outputs": [
|
153 |
+
{
|
154 |
+
"name": "stderr",
|
155 |
+
"output_type": "stream",
|
156 |
+
"text": [
|
157 |
+
"/Users/larawehbe/Documents/fakkerai/sehatech/venv/lib/python3.13/site-packages/pinecone/data/index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
158 |
+
" from tqdm.autonotebook import tqdm\n"
|
159 |
+
]
|
160 |
+
}
|
161 |
+
],
|
162 |
+
"source": [
|
163 |
+
"import pinecone\n",
|
164 |
+
"pc = pinecone.Pinecone(settings.PINECONE_API_KEY)"
|
165 |
+
]
|
166 |
+
},
|
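Opening a handle with pc.Index(...) does not verify the index exists, so a quick existence check avoids a late failure on the first request. A sketch assuming the v3+ pinecone client imported above:

# list_indexes() on the v3+ client returns an IndexList; .names() gives plain names.
if settings.INDEX_NAME not in pc.list_indexes().names():
    raise RuntimeError(f"Pinecone index {settings.INDEX_NAME!r} does not exist yet")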
167 |
+
{
|
168 |
+
"cell_type": "code",
|
169 |
+
"execution_count": 13,
|
170 |
+
"metadata": {},
|
171 |
+
"outputs": [],
|
172 |
+
"source": [
|
173 |
+
"index = pc.Index(settings.INDEX_NAME)\n"
|
174 |
+
]
|
175 |
+
},
|
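With the handle open, describe_index_stats() is a cheap sanity check that vectors were actually upserted; a zero count here would explain the empty query result in the next cell. A sketch (attribute names per the v3+ client, an assumption):

stats = index.describe_index_stats()
print(stats.dimension)           # embedding size the index was created with
print(stats.total_vector_count)  # 0 means nothing has been upserted yet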
176 |
+
{
|
177 |
+
"cell_type": "code",
|
178 |
+
"execution_count": 15,
|
179 |
+
"metadata": {},
|
180 |
+
"outputs": [
|
181 |
+
{
|
182 |
+
"data": {
|
183 |
+
"text/plain": [
|
184 |
+
"{'matches': [], 'namespace': 'default', 'usage': {'read_units': 1}}"
|
185 |
+
]
|
186 |
+
},
|
187 |
+
"execution_count": 15,
|
188 |
+
"metadata": {},
|
189 |
+
"output_type": "execute_result"
|
190 |
+
}
|
191 |
+
],
|
192 |
+
"source": [
|
193 |
+
"index.query(namespace=\"default\", top_k=1)"
|
194 |
+
]
|
195 |
+
},
|
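The query above returns empty matches; besides an empty namespace, note that a similarity search normally passes a vector (or id) whose length matches the index dimension. A sketch with a placeholder zero vector and an assumed dimension of 1536 (read the real value from describe_index_stats()):

dim = 1536  # assumption for illustration only
res = index.query(
    namespace="default",
    vector=[0.0] * dim,     # stand-in for a real query embedding
    top_k=3,
    include_metadata=True,  # return stored metadata alongside ids and scores
)
for match in res.matches:
    print(match.id, match.score)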
196 |
+
{
|
197 |
+
"cell_type": "markdown",
|
198 |
+
"metadata": {},
|
199 |
+
"source": [
|
200 |
+
"#### temp : remove when tested"
|
201 |
+
]
|
202 |
+
},
|
203 |
+
{
|
204 |
+
"cell_type": "code",
|
205 |
+
"execution_count": 2,
|
206 |
+
"metadata": {},
|
207 |
+
"outputs": [],
|
208 |
+
"source": [
|
209 |
+
"import PyPDF2"
|
210 |
+
]
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"cell_type": "code",
|
214 |
+
"execution_count": 5,
|
215 |
+
"metadata": {},
|
216 |
+
"outputs": [],
|
217 |
+
"source": [
|
218 |
+
"from PyPDF2 import PdfReader"
|
219 |
+
]
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"cell_type": "code",
|
223 |
+
"execution_count": null,
|
224 |
+
"metadata": {},
|
225 |
+
"outputs": [],
|
226 |
+
"source": [
|
227 |
+
"pdf = PdfReader(\"WJR-9-27.pdf\")\n",
|
228 |
+
"info = pdf.metadata\n",
|
229 |
+
"\n"
|
230 |
+
]
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"cell_type": "code",
|
234 |
+
"execution_count": 26,
|
235 |
+
"metadata": {},
|
236 |
+
"outputs": [],
|
237 |
+
"source": [
|
238 |
+
"text = \"\".join(page.extract_text() for page in pdf.pages)"
|
239 |
+
]
|
240 |
+
},
|
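One caveat in the join above: extract_text() can return None for image-only pages, which would make "".join(...) raise a TypeError. A defensive variant that coalesces per-page results to an empty string:

text = "".join((page.extract_text() or "") for page in pdf.pages)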
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": 27,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [
|
246 |
+
{
|
247 |
+
"name": "stdout",
|
248 |
+
"output_type": "stream",
|
249 |
+
"text": [
|
250 |
+
"Matteo Bauckneht, Roberta Piva, Gianmario Sambuceti, Francesco Grossi, Silvia Morbelli EDITORIAL\n",
|
251 |
+
"27 February 28, 2017 |Volume 9 |Issue 2| WJR|www.wjgnet.comEvaluation of response to immune checkpoint inhibitors: Is \n",
|
252 |
+
"there a role for positron emission tomography?\n",
|
253 |
+
"Matteo Bauckneht, Roberta Piva, Gianmario Sambuceti, \n",
|
254 |
+
"Silvia Morbelli, Nuclear Medicine Unit, IRCCS San Martino-\n",
|
255 |
+
"IST, University of Genoa, 16132 Genoa, Italy\n",
|
256 |
+
"Francesco Grossi, Lung Cancer Unit, IRCCS San Martino-IST, \n",
|
257 |
+
"University of Genoa, 16132 Genoa, Italy\n",
|
258 |
+
"Author contributions: Morbelli S conceived and designed \n",
|
259 |
+
"the study; Bauckneht M and Morbelli S drafted the manuscript; \n",
|
260 |
+
"Bauckneht M and Piva R prepared the tables and figures; \n",
|
261 |
+
"Sambuceti G and Grossi F critically revised the manuscript; all \n",
|
262 |
+
"the authors approved the final version of the paper.\n",
|
263 |
+
"Conflict-of-interest statement: The authors have no conflicts of \n",
|
264 |
+
"interest related to this publication to disclose.\n",
|
265 |
+
"Open-Access: This article is an open-access article which was \n",
|
266 |
+
"selected by an in-house editor and fully peer-reviewed by external \n",
|
267 |
+
"reviewers. It is distributed in accordance with the Creative Commons \n",
|
268 |
+
"Attribution Non Commercial (CC BY-NC 4.0) license, which \n",
|
269 |
+
"permits others to distribute, remix, adapt, build upon this work non-\n",
|
270 |
+
"commercially, and license their derivative works on different terms, \n",
|
271 |
+
"provided the original work is properly cited and the use is non-\n",
|
272 |
+
"commercial. See: http://creativecommons.org/licenses/by-nc/4.0/\n",
|
273 |
+
"Manuscript source: Invited manuscript\n",
|
274 |
+
"Correspondence to: Silvia Morbelli, MD, PhD, Nuclear \n",
|
275 |
+
"Medicine Unit, IRCCS San Martino-IST, University of Genoa, \n",
|
276 |
+
"Largo R. Benzi 10, 16132 Genova, \n",
|
277 |
+
"Italy. silviadaniela.morbelli@hsanmartino.i t \n",
|
278 |
+
"Telephone: +39-010-5552026\n",
|
279 |
+
"Fax: +39-010-5556911\n",
|
280 |
+
"Received: August 20, 2016 \n",
|
281 |
+
"Peer-review started: August 23, 2016 \n",
|
282 |
+
"First decision: October 21, 2016\n",
|
283 |
+
"Revised: November 2, 2016 \n",
|
284 |
+
"Accepted: November 27, 2016\n",
|
285 |
+
"Article in press: November 29, 2016\n",
|
286 |
+
"Published online: February 28, 2017\n",
|
287 |
+
"Abstract\n",
|
288 |
+
"Strategies targeting intracellular negative regulators such as immune checkpoint inhibitors (ICPIs) have \n",
|
289 |
+
"demonstrated significant antitumor activity across a \n",
|
290 |
+
"wide range of solid tumors. In the clinical practice, the \n",
|
291 |
+
"radiological effect of immunotherapeutic agents has \n",
|
292 |
+
"raised several more relevant and complex challenges for \n",
|
293 |
+
"the determination of their imaging-based response at \n",
|
294 |
+
"single patient level. Accordingly, it has been suggested \n",
|
295 |
+
"that the conventional Response Evaluation Criteria in \n",
|
296 |
+
"Solid Tumors assessment alone, based on dimensional \n",
|
297 |
+
"evaluation provided by computed tomography (CT), \n",
|
298 |
+
"tends to underestimate the benefit of ICPIs at least in \n",
|
299 |
+
"a subset of patients, supporting the need of immune-\n",
|
300 |
+
"related response criteria. Different from CT, very few \n",
|
301 |
+
"data are available for the evaluation of immunotherapy \n",
|
302 |
+
"by means of 18F-fluoro-2-deoxy-D-glucose positron \n",
|
303 |
+
"emission tomography (FDG-PET). Moreover, since \n",
|
304 |
+
"the antineoplastic activity of ICPIs is highly related \n",
|
305 |
+
"to the activation of T cells against cancer cells, FDG \n",
|
306 |
+
"accumulation might cause false-positive findings. Yet, \n",
|
307 |
+
"discrimination between benign and malignant processes \n",
|
308 |
+
"represents a huge challenge for FDG-PET in this clinical \n",
|
309 |
+
"setting. Consequently, it might be of high interest to test \n",
|
310 |
+
"the complex and variegated response to ICPIs by means \n",
|
311 |
+
"of PET and thus it is worthwhile to ask if a similar \n",
|
312 |
+
"introduction of immune-related PET-based criteria could \n",
|
313 |
+
"be proposed in the future. Finally, PET might offer a new \n",
|
314 |
+
"insight into the biology and pathophysiology of ICPIs \n",
|
315 |
+
"thanks to a growing number of non-invasive immune-\n",
|
316 |
+
"diagnostic approaches based on non-FDG tracers.\n",
|
317 |
+
"Key words: Immune checkpoint inhibitors; Positron \n",
|
318 |
+
"emission tomography; Computed tomography; 18F-fluoro-\n",
|
319 |
+
"2-deoxy-D-glucose; Non-18F-fluoro-2-deoxy-D-glucose \n",
|
320 |
+
"tracers\n",
|
321 |
+
"© The Author(s) 2017. Published by Baishideng Publishing \n",
|
322 |
+
"Group Inc. All rights reserved.\n",
|
323 |
+
"Core tip: In the clinical practice, the radiological \n",
|
324 |
+
"interpretation of immunotherapy effects represents \n",
|
325 |
+
"a huge challenge at single patient level. However, \n",
|
326 |
+
"although the computed tomography-based response \n",
|
331 |
+
"evaluation for immune checkpoint inhibitors (ICPIs) is \n",
|
332 |
+
"feasible thanks to the introduction of immune-related \n",
|
333 |
+
"response criteria, very few data are available for the \n",
|
334 |
+
"potential role of 18F-fluoro-2-deoxy-D-glucose positron \n",
|
335 |
+
"emission tomography (FDG-PET). Due to the intrinsic \n",
|
336 |
+
"nature of FDG accumulation pathophysiology, it might \n",
|
337 |
+
"be central to test the complex and variegated response \n",
|
338 |
+
"to ICPIs by means of PET. Finally, PET might offer a new \n",
|
339 |
+
"insight into the biology of ICPIs thanks to a growing \n",
|
340 |
+
"number of non-invasive immune-diagnostic approaches \n",
|
341 |
+
"based on non-FDG tracers.\n",
|
342 |
+
"Bauckneht M, Piva R, Sambuceti G, Grossi F, Morbelli S. \n",
|
343 |
+
"Evaluation of response to immune checkpoint inhibitors: Is \n",
|
344 |
+
"there a role for positron emission tomography? World J Radiol \n",
|
345 |
+
"2017; 9(2): 27-33 Available from: URL: http://www.wjgnet.\n",
|
346 |
+
"com/ 1949-8470 /full/v9/i2/27.htm DOI: http://dx.doi.org/10.4329/\n",
|
347 |
+
"wjr.v9.i2.27\n",
|
348 |
+
"TEXT\n",
|
349 |
+
"The function of the immune system is characterized by \n",
|
350 |
+
"multiple checkpoints aiming to avoid its over-activation \n",
|
351 |
+
"against healthy cells (self-tolerance)[1]. Cancer cells may \n",
|
352 |
+
"take advantage of these checkpoints to escape detection \n",
|
353 |
+
"by the immune system. Some of these checkpoints such \n",
|
354 |
+
"as cytotoxic T-lymphocyte-associated antigen 4 (CTLA-4) \n",
|
355 |
+
"and programmed cell death protein 1 (PD-1) have been \n",
|
356 |
+
"extensively studied as t argets in the frame of the so-\n",
|
357 |
+
"called cancer immunotherapy[1]. CTLA-4 counteracts \n",
|
358 |
+
"the activity of the T cell co-stimulatory receptor CD28 \n",
|
359 |
+
"and actively delivers inhibitory signals to the T cell[2]. \n",
|
360 |
+
"PD-1 has two ligands, PD1 ligand 1 (PDL1) and PDL2, \n",
|
361 |
+
"and its inhibitory effect is accomplished through a dual \n",
|
362 |
+
"mechanism of promoting apoptosis in antigen specific \n",
|
363 |
+
"T-cells in lymph nodes while simultaneously reducing \n",
|
364 |
+
"apoptosis in regulatory T cells (suppressor T cells)[3]. In \n",
|
365 |
+
"the last few years, the blockade of immune checkpoints \n",
|
366 |
+
"has disclosed the potential of the antitumor immune \n",
|
367 |
+
"response in a fashion that is transforming human cancer \n",
|
368 |
+
"therapeutics. CTLA4 antibodies such as ipilimumab \n",
|
369 |
+
"and tremelimumab have been tested in the last ten \n",
|
370 |
+
"years in different types of cancer , starting with patients \n",
|
371 |
+
"with advanced melanoma[4]. Ipilimumab was the first \n",
|
372 |
+
"therapy to demonstrate a survival benefit for patients \n",
|
373 |
+
"with metastatic melanoma. In a study by Hodi et al[5], \n",
|
374 |
+
"ipilimumab significantly improved overall survival in \n",
|
375 |
+
"patients with previously treated metastatic melanoma \n",
|
376 |
+
"and the drug was approved by the United States Food \n",
|
377 |
+
"and Drug Administration (FDA) for the treatment of \n",
|
378 |
+
"advanced melanoma in 2011[5]. Similarly, nivolumab, \n",
|
379 |
+
"a humanized anti-PD-1 monoclonal antibody, has \n",
|
380 |
+
"demonstrated durable responses in several phase III \n",
|
381 |
+
"trials and has received FDA approval in specific clinical \n",
|
382 |
+
"settings in patients with melanoma, renal cell cancer , \n",
|
383 |
+
"Hodgkin���s lymphoma, bladder cancer, and non-small \n",
|
384 |
+
"cell lung cancer (NSCLC)[6-9]. Figure 1 summarizes the mechanisms of action of the two FDA approved immune \n",
|
385 |
+
"checkpoint inhibitors (ICPIs) .\n",
|
386 |
+
"Evaluation of response to ICPIs \n",
|
387 |
+
"Historically, the Response Evaluation Criteria in Solid \n",
|
388 |
+
"Tumors (RECIST) has been validated and used to eva -\n",
|
389 |
+
"luate antitumor responses to chemotherapeutic agents[10] \n",
|
390 |
+
"(Table 1 for a more detailed description). These criteria \n",
|
391 |
+
"are based on dimensional evaluation and rely on the \n",
|
392 |
+
"fact that the cytotoxic effect of chemotherapeutic agents \n",
|
393 |
+
"tends to translate into measurable effects in terms of \n",
|
394 |
+
"tumor shrinkage from baseline. Furthermore, published \n",
|
395 |
+
"studies indicated that achieving a response according to \n",
|
396 |
+
"RECIST criteria is predictive of remission and improved \n",
|
397 |
+
"survival in specific settings[11]. Conversely, both RECIST \n",
|
398 |
+
"and their revised 1.1 version assumed that an early \n",
|
399 |
+
"increase in tumor growth and/or the appearance of \n",
|
400 |
+
"new lesions correspond to progressive disease (PD), \n",
|
401 |
+
"testifying drug failure and indicating the need of ongoing \n",
|
402 |
+
"treatment cessation[10].\n",
|
403 |
+
"Some exceptions for the use of these criteria have \n",
|
404 |
+
"been already suggested in patients treated with target \n",
|
405 |
+
"therapies such as tyrosine kinase inhibitors as in this \n",
|
406 |
+
"group of patients the lack of tumor shrinkage in the \n",
|
407 |
+
"presence of a stable disease has been identified as a \n",
|
408 |
+
"potential surrogate end point for improved clinical out -\n",
|
409 |
+
"come[12]. However , in the clinical practice, the radiolo -\n",
|
410 |
+
"gical effect of immunotherapeutic agents has raised \n",
|
411 |
+
"several more relevant and complex challenges for \n",
|
412 |
+
"the determination of their imaging-based response at \n",
|
413 |
+
"single patient level[13]. In published studies, while some \n",
|
414 |
+
"patients have responded to ICPIs with the expected \n",
|
415 |
+
"tumor shrinkage (chemo-like response) or with a \n",
|
416 |
+
"stable disease (target therapy-like response), other \n",
|
417 |
+
"distinct immune-related patterns of response have been \n",
|
418 |
+
"identified. In particular , an initial increase in tumor size, \n",
|
419 |
+
"development of new lesions and then a delayed objective \n",
|
420 |
+
"response were also observed in patients treated with \n",
|
421 |
+
"immunotherapeutic agents[13]. Specifically, in some \n",
|
422 |
+
"patients, the initial increase in total tumor burden was \n",
|
423 |
+
"proven to be due to inflammatory cell infiltrates by \n",
|
424 |
+
"means of biopsy. In these patients the initial pseudo-\n",
|
425 |
+
"progression was followed by a decrease in tumor burden \n",
|
426 |
+
"or even disease regression. \n",
|
427 |
+
"As RECIST criteria were not suitable to catch \n",
|
428 |
+
"these atypical responses, the so-called immune-\n",
|
429 |
+
"related response criteria (irRC) have been proposed to \n",
|
430 |
+
"provide more rigorous characterization of all patterns \n",
|
431 |
+
"of response observed in the phase II development \n",
|
432 |
+
"program for ipilimumab in melanoma[13-15] (Table 1). \n",
|
433 |
+
"The main differences between RECIST 1.1 and irRC \n",
|
434 |
+
"rely on the fact that, according to irRC, appearance of \n",
|
435 |
+
"new lesions (PD according to the RECIST criteria) will \n",
|
436 |
+
"only result in progressive disease in case of a significant \n",
|
437 |
+
"(≥ 25%) increase in total tumor burden with respect \n",
|
438 |
+
"to baseline. Moreover, different from conventional \n",
|
439 |
+
"criteria, if irRC-based PD is evident, it requires further \n",
|
440 |
+
"confirmation after one month with the aim of capturing 29 February 28, 2017 |Volume 9 |Issue 2| WJR|www.wjgnet.comdelayed response.\n",
|
441 |
+
"Recent ly, Hodi et al[16] evaluated atypical response \n",
|
442 |
+
"patterns and reported the overall survival data in \n",
|
443 |
+
"correlation with irRC and RECIST criteria in the context \n",
|
444 |
+
"of a retrospective analysis of 327 melanoma patients \n",
|
445 |
+
"treated wit h the PD-1 inhibitor pembrolizumab. This \n",
|
446 |
+
"study indicated that the conventional RECIST asses sment \n",
|
447 |
+
"alone tends to underestimate the benefit of PD-1 inhibi -\n",
|
448 |
+
"tor therapy in a subset of patients, supporting a need of \n",
|
449 |
+
"immune-related response evaluation[16]. IrRC are thus \n",
|
450 |
+
"increasingly proposed, but they have not been validated \n",
|
451 |
+
"yet in the context of clinical trials and most trials \n",
|
452 |
+
"involving ICPIs continue to use RECIST 1.1 to obtain \n",
|
453 |
+
"standardized endpoints for regulatory approvals[15]. \n",
|
454 |
+
"However, although the irRC seem better than \n",
|
455 |
+
"RECIST , the former has some limitations. The irRC use \n",
|
456 |
+
"the bidimensional measurements in line with WHO \n",
|
457 |
+
"criteria that are now rarely used in clinical trials and \n",
|
458 |
+
"replaced by the unidimensional measurement of the larger axis of target lesions (RECIST 1.0 and 1.1). The \n",
|
459 |
+
"bidimensional measurements introduce a greater vari -\n",
|
460 |
+
"ability than unidimensional measurements and make it \n",
|
461 |
+
"difficult to compare the responses with studies using the \n",
|
462 |
+
"RECIST criteria.\n",
|
463 |
+
"Is there a role for 18F-fluoro-2-deoxy-D-glucose positron \n",
|
464 |
+
"emission tomography in the evaluation of ICPIs?\n",
|
465 |
+
"While optimal CT-based response criteria for ICPIs \n",
|
466 |
+
"are in the path of their identification, very few data \n",
|
467 |
+
"are available for the evaluation of immunotherapy by \n",
|
468 |
+
"means of 18F-fluoro-2-deoxy-D-glucose positron emi s-\n",
|
469 |
+
"sion tomography (18F-FDG-PET), one of the most used \n",
|
470 |
+
"imaging techniques in oncology. 18F-FDG-PET is currently \n",
|
471 |
+
"the most widely used molecular imaging mod ality in \n",
|
472 |
+
"the clinical practice for staging and restaging of several \n",
|
473 |
+
"cancers. 18F-FDG-PET is clinically indicated before and \n",
|
474 |
+
"after treatment in patients with Hod gkin’s lymphoma \n",
|
475 |
+
"and NSCLC and it is used in patients with melanoma for \n",
|
476 |
+
"Figure 1 Schematic representation of mechanism of action of nivolumab and ipilimumab, two Food and Drug Administration approved immune \n",
|
477 |
+
"checkpoint inhibitors. To prevent autoimmunity, numerous checkpoint pathways regulate the activation of T cells at multiple steps (process known as peripheral \n",
|
478 |
+
"tolerance). Central in this process are the cytotoxic T-lymphocyte-associated antigen 4 (CTLA-4) and programmed death 1 (PD-1) immune checkpoints pathways. \n",
|
479 |
+
"CTLA-4 is potentially able to stop autoreactive T cells at the initial stage of naive T-cell activation, typically in lymph nodes, while PD-1 regulates previously activated \n",
|
480 |
+
"T cells at the later stages of an immune response in peripheral tissues. The binding between T-cell receptor (TCR), which is expressed on T cell surface, with major \n",
|
481 |
+
"histocompatibility complex (MHC) expressed on antigen presenting cells (APCs) provides specificity to T-cell activation. However, T cell activation requires more than \n",
|
482 |
+
"one stimulatory signal. Among them a central role is played by the binding between B7 molecules (APC) with CD28 (T-Cell). CTLA-4 is a CD28 homolog which does \n",
|
483 |
+
"not produce a stimulatory signal but inhibits TCR-MHC binding and thus the T-Cell activation. Different from T-cells in which the amount of CTLA-4 is low, T-Regs \n",
|
484 |
+
"highly express CTLA-4. In these cells CTLA-4 might play a role in their suppressive functions. PD-1 is a member of the B7/CD38 family of protein, which is able to \n",
|
485 |
+
"bind with two different ligands: Programmed death ligand 1 (PD-L1) and programmed death ligand 2 (PD-L2). PD-1 activation in a T-cell prevents the phosphorylation \n",
|
486 |
+
"of key TCR signaling intermediates and thus T-cell activation, resulting in suboptimal control of infections and cancers. Therefore, even though they act at different \n",
|
487 |
+
"phases of T-cell activation, the negative effect of PD-1 and CTLA-4 on T-cell activity is similar. Moreover, different from CTLA-4, PD-1 expression is not specific in \n",
|
488 |
+
"T-cells, but can be observed also in B-cells and myeloid cells. The rationale for immune checkpoint inhibition (represented in red) for cancer treatment is that CTLA-4 \n",
|
489 |
+
"and PD1 pathways are strictly related to cancer survival and thus targeting these molecules or their ligands with monoclonal antibodies permits to impact on cancer \n",
|
490 |
+
"growth. Therefore, even if the exact mechanism of action of these monoclonal antibodies in the antitumor response remains unclear, research data suggest that it is at \n",
|
491 |
+
"least partially related to an activation and proliferation of T-cells regardless of TCR specificity (due to the inhibition of the inhibitory activity of these checkpoints), which \n",
|
492 |
+
"enhances the anti-cancer immune reaction.\n",
|
493 |
+
"[Figure 1 diagram labels: T Reg; antigen presenting cell; cancer cell; T cell; B7; MHC; TCR; CD28; CTLA-4; PD-1; PD-L1; ipilimumab; nivolumab]\n",
|
512 |
+
"specific clinical indications[17-19]. The use of 18F-FDG-PET \n",
|
513 |
+
"in post-treatment settings is based on the assumption \n",
|
514 |
+
"that tumor size changes are only the final step in a \n",
|
515 |
+
"sequence of complex metabolic and functional processes \n",
|
516 |
+
"during and after treatment[20]. Two different types \n",
|
517 |
+
"of criteria have been proposed for the identification \n",
|
518 |
+
"of 18F-FDG-PET-based response in solid tumors: The \n",
|
519 |
+
"European Organization for Research and Treatment of \n",
|
520 |
+
"Cancer (EORTC) and the PET Response Criteria in Solid \n",
|
521 |
+
"Tumors (PERCIST) criteria[21,22] (Table 1). Both criteria \n",
|
522 |
+
"target the most metabolically active part of patient’s \n",
|
523 |
+
"tumor burden, which is regarded as the most viable and \n",
|
524 |
+
"aggressive disease site. In both cases, the so-called \n",
|
525 |
+
"standardized uptake value (SUV) is measured at baseline \n",
|
526 |
+
"and after treatment. However , they differ for some rele -\n",
|
527 |
+
"vant aspects. The EORTC criteria were published in 1999 \n",
|
528 |
+
"and are based on the evaluation of a lesion-specific \n",
|
529 |
+
"region of interest (ROI) chosen as the most 18F-FDG-avid \n",
|
530 |
+
"at baseline and followed in the after-treatment scans[22]. \n",
|
531 |
+
"The PERCIST criteria were proposed in 2009 by Wahl \n",
|
532 |
+
"et al[21] and rely on the use of a 1 cm3 ROI on the most \n",
|
533 |
+
"18F-FDG-avid part of the single most metabolically active \n",
|
534 |
+
"lesion at each PET/CT scan (which is not necessarily \n",
|
535 |
+
"located in the same lesion in all scans). \n",
|
536 |
+
"Relatively few papers have compared the two methods in solid tumors and good agreement, similar \n",
|
537 |
+
"responses and survival outcomes have been highlighted \n",
|
538 |
+
"in the available studies[23]. However, for the EORTC \n",
|
539 |
+
"criteria, no recommendations on the number of target \n",
|
540 |
+
"lesions or on whether computing SUV max or average \n",
|
541 |
+
"SUV for response calculation are given while the \n",
|
542 |
+
"PERCIST criteria recommend the use of lean body mass \n",
|
543 |
+
"for SUV normalization (SUL). In this framework, some \n",
|
544 |
+
"studies have demonstrated a higher accuracy with \n",
|
545 |
+
"respect to RECIST for both metabolic response based \n",
|
546 |
+
"criteria in patients treated with target therapies such as \n",
|
547 |
+
"erlotinib. This finding is due to the relative lower tumor \n",
|
548 |
+
"shrinkage characterizing this type of treatment[24]. \n",
|
549 |
+
"Similarly, an 18F-FDG-PET-based five-point scale (5-PS), \n",
|
550 |
+
"the so-called Deauville criteria, has been demonstrated \n",
|
551 |
+
"to be superior to CT-based response by scoring images \n",
|
552 |
+
"in the assessment of response at the middle and end of \n",
|
553 |
+
"treatment in HD patients[18]. Again these findings testify \n",
|
554 |
+
"that functional changes always precede morphological \n",
|
555 |
+
"changes in the course of pathological processes. In \n",
|
556 |
+
"this regard it might be of interest to test the complex \n",
|
557 |
+
"and variegated response to ICPIs by means of PET-\n",
|
558 |
+
"based criteria. In fact, on one hand, functional imaging \n",
|
559 |
+
"may capture different features of treatment with ICPIs \n",
|
560 |
+
"in terms of entity and time course of response. On \n",
|
561 |
+
"Table 1 (reconstructed from flattened text): Key features of positron emission tomography Response Criteria in Solid Tumors, European Organization for Research and Treatment of Cancer 1999, Response Evaluation Criteria in Solid Tumors 1.1 and immune related Response Criteria\n",
|
562 |
+
"Target lesions -- PERCIST: the hottest single tumor lesion (SUL peak) at baseline. EORTC 1999: the most 18F-FDG-avid lesions (SUV BSA); number of lesions not specified. RECIST 1.1: maximum, 5. irRC: maximum, 15 lesions.\n",
|
563 |
+
"New lesion -- PERCIST, EORTC 1999 and RECIST 1.1: results in progressive disease at first appearance. irRC: up to 10 new visceral and 5 cutaneous lesions may be added to the sum of the products of the two largest perpendicular diameters of all index lesions at any time point.\n",
|
564 |
+
"Complete response -- PERCIST: CMR, complete resolution of 18F-FDG uptake within the target lesion (< mean liver activity, indistinguishable from background/blood pool, no new 18F-FDG-avid lesions). EORTC 1999: CMR, complete absence of 18F-FDG uptake. RECIST 1.1/irRC: disappearance of all target and nontarget lesions; nodes must regress to < 10 mm short axis; no new lesions; confirmation required.\n",
|
565 |
+
"Partial response -- PERCIST: PMR, a reduction of a minimum of 30% in the target tumor 18F-FDG SUL peak. EORTC 1999: PMR, a decrease in SUV > 25%. RECIST 1.1: ≥ 30% decrease in tumor burden compared to baseline; confirmation required. irRC: ≥ 50% decrease in tumor burden compared with baseline^1; confirmation required.\n",
|
566 |
+
"Progressive disease -- PERCIST: PMD, a 30% increase in 18F-FDG SUL peak or advent of new 18F-FDG-avid lesions. EORTC 1999: PMD, an increase in SUV > 25% or appearance of new lesions. RECIST 1.1: ≥ 20% + 5 mm absolute increase in tumor burden compared with nadir, or appearance of new lesions or progression of nontarget lesions. irRC: ≥ 25% increase in tumor burden compared with baseline, nadir or reset baseline^1; new lesions added to tumor burden; confirmation required.\n",
|
567 |
+
"Stable disease -- PERCIST: SMD, disease other than CMR, PMR or PMD. EORTC 1999: SMD, increase in SUV by < 25% or decrease in SUV by < 15%. RECIST 1.1/irRC: neither partial response nor progressive disease.\n",
|
568 |
+
"^1 If an increase in tumor burden is observed at the first scheduled assessment, the baseline is reset to the value observed at the first assessment. PERCIST: PET Response Criteria in Solid Tumors; EORTC: European Organization for Research and Treatment of Cancer; RECIST: Response Evaluation Criteria in Solid Tumors; irRC: immune related Response Criteria; CMR: complete metabolic response; PMR: partial metabolic response; PMD: progressive metabolic disease; SMD: stable metabolic disease; SUL: SUV normalized to lean body mass; SUV BSA: SUV normalized for body surface area; SUV: standardized uptake value.\n",
|
620 |
+
"the other hand, it has been reported that the initial \n",
|
621 |
+
"increase in tumor size, later followed by tumor volume \n",
|
622 |
+
"reduction in part of the patients treated with ICPIs, is \n",
|
623 |
+
"due to inflammatory cell infiltrates. Accordingly, given \n",
|
624 |
+
"the well-known high metabolic activity characterizing \n",
|
625 |
+
"inflammatory cells, this feature may also hamper the \n",
|
626 |
+
"evaluation of 18F-FDG-PET-based response to ICPIs. \n",
|
627 |
+
"Sachpekidis et al[20] evaluated the role of 18F-FDG-\n",
|
628 |
+
"PET/CT after two cycles of ipilimumab in predicting the \n",
|
629 |
+
"final response to therapy in 22 patients with metastatic \n",
|
630 |
+
"melanoma. They evaluated response to treatment by \n",
|
631 |
+
"means of the EORTC criteria and found that 18F-FDG-\n",
|
632 |
+
"PET/CT after two cycles of ipilimumab is predictive of \n",
|
633 |
+
"the final treatment outcome in patients with progressive \n",
|
634 |
+
"metabolic disease (PMD) and stable metabolic disease \n",
|
635 |
+
"(SMD)[20]. However, two patients were initially falsely \n",
|
636 |
+
"classified as early SMD, but they later demonstrated \n",
|
637 |
+
"new metastatic lesions, “upgrading” them to late PMD. \n",
|
638 |
+
"Similarly, early evaluation by means of 18F-FDG-PET \n",
|
639 |
+
"did not identify responders to treatment as the two \n",
|
640 |
+
"patients eventually characterized with PMR were initially \n",
|
641 |
+
"classified with early PMD due to new lesions[20]. In fact, \n",
|
642 |
+
"both RECIST 1.1 and PET-based criteria consider the \n",
|
643 |
+
"identification of new (metabolically active) lesions as \n",
|
644 |
+
"progressive disease. Therefore, presently proposed \n",
|
645 |
+
"PET-based metabolic criteria suffer from at least one \n",
|
646 |
+
"of the same limitations that ha ve resulted in the under -\n",
|
647 |
+
"estimation of response to treatment with ICPIs by \n",
|
648 |
+
"means of RECIST 1.1. Similarly, in the phase 2 study \n",
|
649 |
+
"by Younes et al[9], nivolumab resulted in frequent res -\n",
|
650 |
+
"ponses in patients with classical Hodgkin’s lymphoma \n",
|
651 |
+
"after failure of ASCT and brentuximab vedotin. Most of \n",
|
652 |
+
"these responses were maintained through the reported \n",
|
653 |
+
"follow-up period with an acceptable safety profile. In \n",
|
654 |
+
"this study 18F-FDG-PET was performed at baseline and \n",
|
655 |
+
"at weeks 17 and 25. A negative 18F-FDG-PET scan, \n",
|
656 |
+
"visually assessed by an independent radiological review \n",
|
657 |
+
"committee (IRRC), was required for confirmation \n",
|
658 |
+
"of complete remission. The study demonstrated a \n",
|
659 |
+
"general reduction of tumor burden. Yet, discordance \n",
|
660 |
+
"in complete remission between IRRC and investigator \n",
|
661 |
+
"assessments was largely based on the interpretation \n",
|
662 |
+
"of 18F-FDG-PET scans and standardized uptake values \n",
|
663 |
+
"were not collected as part of this study. The vast \n",
|
664 |
+
"majority of other available data on the potential utility \n",
|
665 |
+
"of 18F-FDG-PET afte r ICPIs are case reports more \n",
|
666 |
+
"often describing underlying challenges of monitoring \n",
|
667 |
+
"radiologic response in these patients and showing \n",
|
668 |
+
"18F-FDG-PET features of inflammatory reactions. PET-\n",
|
669 |
+
"highlighted autoimmune pancolitis, splenic sarcoidosis-\n",
|
670 |
+
"like lesion and exacerbation of sarcoidosis as a potential \n",
|
671 |
+
"confounder in the assessment of tumor response in \n",
|
672 |
+
"a melanoma patient treated with ipilimumab have all \n",
|
673 |
+
"been described[25-27]. Similarly, K oo et al[26] illustrated a \n",
|
674 |
+
"series of inflammatory reactions with avid FDG uptake \n",
|
675 |
+
"in patients treated with ipilimumab, including those with \n",
|
676 |
+
"thyroiditis, hypophysitis, granulomatous inflammation in \n",
|
677 |
+
"the lymph nodes and skin, and enterocolitis. \n",
|
678 |
+
"Accordingly, the potential and challenges of 18F-FDG-PET imaging in the evaluation of patients treated with \n",
|
679 |
+
"ICPIs still need to be clarified and deeply addressed. \n",
|
680 |
+
"Given the relatively greater experience of CT-based \n",
|
681 |
+
"evaluation in this setting and the fact that irRC CT-\n",
|
682 |
+
"based criteria seem to better in capturing response to \n",
|
683 |
+
"ICPIs, it is worthwhile to ask if a similar modification of \n",
|
684 |
+
"PET-based criteria could be proposed in the future.\n",
|
685 |
+
"Potential new PET-based approaches to evaluate the \n",
|
686 |
+
"effect of ICPIs\n",
|
687 |
+
"As mentioned above, due to its intrinsic nature, 18F-FDG-\n",
|
688 |
+
"PET displays not only cancer cell’s metabolic activity but \n",
|
689 |
+
"also inflammation. Since the antineoplastic activity of \n",
|
690 |
+
"ICPIs is highly related to the activation of T cells against \n",
|
691 |
+
"cancer cells, 18F-FDG accumulation might cause false-\n",
|
692 |
+
"positive findings. Yet, discrimination between benign \n",
|
693 |
+
"and malignant processes represents a huge challenge \n",
|
694 |
+
"for 18F-FDG-PET in this clinical setting. Together with the \n",
|
695 |
+
"need of the clinicians to discriminate between responders \n",
|
696 |
+
"and non-responders, allowing individual therapy \n",
|
697 |
+
"optimization and avoiding adverse effects brought \n",
|
698 |
+
"about by ineffective therapy, several studies have been \n",
|
699 |
+
"recently conducted to explore the possible role of non-\n",
|
700 |
+
"FDG radiotracers in the field of ICPIs. These studies, \n",
|
701 |
+
"mainly performed with labeled monoclonal antibodies, \n",
|
702 |
+
"open the new era of the so-called “Immuno-PET”. \n",
|
703 |
+
"Accordingly, in 2014, Higashikawa et al[28] developed \n",
|
704 |
+
"a molecular imaging probe that is able to evaluate \n",
|
705 |
+
"CTLA-4 expression prior to CTLA-4 targeting in cancer . \n",
|
706 |
+
"This 64Cu labeled radiotracer is basically composed \n",
|
707 |
+
"of DOTA protein together with a CTLA-4 specific \n",
|
708 |
+
"antibody and is able to display CTLA-4 expression \n",
|
709 |
+
"in vivo . Similarly, specific experimental radiotracers \n",
|
710 |
+
"were proposed for the visualization of PD-1 and PD-L1 \n",
|
711 |
+
"cellular expression[29-32]. Maute et al[29] measured PD-L1 \n",
|
712 |
+
"expression by radiolabeling a PD-L1 high affinity protein \n",
|
713 |
+
"(HAC) with 64Cu and tested its feasibility in a living \n",
|
714 |
+
"mouse, while Hettich et al[30] developed two 64Cu labeled \n",
|
715 |
+
"immunoPET tracers for imaging of both PD-1 and PD-L1. \n",
|
716 |
+
"Also one SPECT study with radiolabeled anti-murine \n",
|
717 |
+
"PD-L1 in mice has been conducted[32]. More recently, a \n",
|
718 |
+
"89Zr labeled CD3 PET imaging agent was proposed by \n",
|
719 |
+
"Larimer et al[33]. CD3 is a part of the TCR complex that \n",
|
720 |
+
"serves as a global T lymphocyte marker . By serving as \n",
|
721 |
+
"a marker of total T-cell infiltration, CD3 may represent \n",
|
722 |
+
"a more direct approach than pre-treatment biopsy \n",
|
723 |
+
"or genetic screening to monitoring tumor immune \n",
|
724 |
+
"response, by directly examining active recruitment of \n",
|
725 |
+
"T cells responsible for cancer cell death. In this study \n",
|
726 |
+
"the authors showed that CD3 PET imaging revealed \n",
|
727 |
+
"two distinct groups of mice, stratified by PET signal \n",
|
728 |
+
"intensity. While high-CD3 PET uptake was correlated \n",
|
729 |
+
"with subsequent reduced tumor volume, low uptake was \n",
|
730 |
+
"predictive of suboptimal response. Altogether these non-\n",
|
731 |
+
"invasive approaches allow simultaneous imaging of the \n",
|
732 |
+
"entire cancer mass and associated metastases, which \n",
|
733 |
+
"may differ from the primary tumor in CTLA-4, PD-1 or \n",
|
734 |
+
"PD-L1 expression status. Immune imaging can be used \n",
|
735 |
+
"for repeated assessment of the same tumor at different Bauckneht M et al. Immune checkpoint inhibitors and PET32 February 28, 2017 |Volume 9 |Issue 2| WJR|www.wjgnet.comtime points ( e.g., before and after treatment), thereby \n",
|
736 |
+
"yielding a richer set of diagnostic information that would \n",
|
737 |
+
"be difficult or impossible to achieve with traditional \n",
|
738 |
+
"approaches. Furthermore, although further investigations \n",
|
739 |
+
"are needed before their potential introduction in the \n",
|
740 |
+
"clinical setting, these non-invasive immune-diagnostic \n",
|
741 |
+
"approaches might yield novel insights into the biology \n",
|
742 |
+
"and pathophysiological importance of ICPIs as cancer \n",
|
743 |
+
"therapeutics. \n",
|
744 |
+
"REFERENCES\n",
|
745 |
+
"1 Pardoll DM . The blockade of immune checkpoints in cancer \n",
|
746 |
+
"immunotherapy. Nat Rev Cancer 2012; 12: 252-264 [PMID: \n",
|
747 |
+
"22437870 DOI: 10.1038/nrc3239]\n",
|
748 |
+
"2 Walunas TL , Lenschow DJ, Bakker CY , Linsley PS, Freeman GJ, \n",
|
749 |
+
"Green JM, Thompson CB, Bluestone JA. CTLA-4 can function as \n",
|
750 |
+
"a negative regulator of T cell activation. Immunity 1994; 1: 405-413 \n",
|
751 |
+
"[PMID: 7882171]\n",
|
752 |
+
"3 Francisco LM , Sage PT, Sharpe AH. The PD-1 pathway in \n",
|
753 |
+
"tolerance and autoimmunity. Immunol Rev 2010; 236: 219-242 \n",
|
754 |
+
"[PMID: 20636820 DOI: 10.1111/j.1600]\n",
|
755 |
+
"4 O’Day SJ , Hamid O, Urba WJ. Targeting cytotoxic T-lymphocyte \n",
|
756 |
+
"antigen-4 (CTLA-4): a novel strategy for the treatment of melanoma \n",
|
757 |
+
"and other malignancies. Cancer 2007; 110: 2614-2627 [PMID: \n",
|
758 |
+
"18000991 DOI: 10.1002/cncr.23086]\n",
|
759 |
+
"5 Hodi FS , O’Day SJ, McDermott DF, Weber RW, Sosman JA, \n",
|
760 |
+
"Haanen JB, Gonzalez R, Robert C, Schadendorf D, Hassel JC, \n",
|
761 |
+
"Akerley W, van den Eertwegh AJ, Lutzky J, Lorigan P, Vaubel JM, \n",
|
762 |
+
"Linette GP, Hogg D, Ottensmeier CH, Lebbé C, Peschel C, Quirt \n",
|
763 |
+
"I, Clark JI, Wolchok JD, Weber JS, Tian J, Yellin MJ, Nichol GM, \n",
|
764 |
+
"Hoos A, Urba WJ. Improved survival with ipilimumab in patients \n",
|
765 |
+
"with metastatic melanoma. N Engl J Med 2010; 363: 711-723 \n",
|
766 |
+
"[PMID: 20525992 DOI: 10.1056/NEJMoa1003466]\n",
|
767 |
+
"6 Giri A , Walia SS, Gajra A. Clinical Trials Investigating Immune \n",
|
768 |
+
"Checkpoint Inhibitors in Non-Small-Cell Lung Cancer. Rev Recent \n",
|
769 |
+
"Clin Trials 2016; 11: 297-305 [PMID: 27457350]\n",
|
770 |
+
"7 Carlo MI , V oss MH, Motzer RJ. Checkpoint inhibitors and other \n",
|
771 |
+
"novel immunotherapies for advanced renal cell carcinoma. Nat \n",
|
772 |
+
"Rev Urol 2016; 13: 420-431 [PMID: 27324121 DOI: 10.1038/\n",
|
773 |
+
"nrurol.2016.103]\n",
|
774 |
+
"8 Ball MW , Allaf ME, Drake CG. Recent advances in immuno -\n",
|
775 |
+
"therapy for kidney cancer. Discov Med 2016; 21: 305-313 [PMID: \n",
|
776 |
+
"27232516]\n",
|
777 |
+
"9 Younes A , Santoro A, Shipp M, Zinzani PL, Timmerman JM, Ansell \n",
|
778 |
+
"S, Armand P, Fanale M, Ratanatharathorn V , Kuruvilla J, Cohen JB, \n",
|
779 |
+
"Collins G, Savage KJ, Trneny M, Kato K, Farsaci B, Parker SM, \n",
|
780 |
+
"Rodig S, Roemer MG, Ligon AH, Engert A. Nivolumab for classical \n",
|
781 |
+
"Hodgkin’s lymphoma after failure of both autologous stem-cell \n",
|
782 |
+
"transplantation and brentuximab vedotin: a multicentre, multicohort, \n",
|
783 |
+
"single-arm phase 2 trial. Lancet Oncol 2016; 17: 1283-1294 [PMID: \n",
|
784 |
+
"27451390 DOI: 10.1016/S1470-2045(16)30167]\n",
|
785 |
+
"10 Eisenhauer EA , Therasse P, Bogaerts J, Schwartz LH, Sargent D, \n",
|
786 |
+
"Ford R, Dancey J, Arbuck S, Gwyther S, Mooney M, Rubinstein \n",
|
787 |
+
"L, Shankar L, Dodd L, Kaplan R, Lacombe D, Verweij J. New \n",
|
788 |
+
"response evaluation criteria in solid tumours: revised RECIST \n",
|
789 |
+
"guideline (version 1.1). Eur J Cancer 2009; 45: 228-247 [PMID: \n",
|
790 |
+
"19097774 DOI: 10.1016/j.ejca.2008.10.026]\n",
|
791 |
+
"11 von Minckwitz G , Sinn HP, Raab G, Loibl S, Blohmer JU, \n",
|
792 |
+
"Eidtmann H, Hilfrich J, Merkle E, Jackisch C, Costa SD, Caputo \n",
|
793 |
+
"A, Kaufmann M. Clinical response after two cycles compared \n",
|
794 |
+
"to HER2, Ki-67, p53, and bcl-2 in independently predicting a \n",
|
795 |
+
"pathological complete response after preoperative chemotherapy in \n",
|
796 |
+
"patients with operable carcinoma of the breast. Breast Cancer Res \n",
|
797 |
+
"2008; 10: R30 [PMID: 18380893 DOI: 10.1186/bcr1989]\n",
|
798 |
+
"12 Tsujino K , Shiraishi J, Tsuji T, Kurata T, Kawaguchi T, Kubo \n",
|
799 |
+
"A, Takada M. Is response rate increment obtained by molecular \n",
|
800 |
+
"targeted agents related to survival benefit in the phase III trials of advanced cancer? Ann Oncol 2010; 21: 1668-1674 [PMID: \n",
|
801 |
+
"20064832 DOI: 10.1093/annonc/mdp588]\n",
|
802 |
+
"13 Wolchok JD , Hoos A, O’Day S, Weber JS, Hamid O, Lebbé C, \n",
|
803 |
+
"Maio M, Binder M, Bohnsack O, Nichol G, Humphrey R, Hodi FS. \n",
|
804 |
+
"Guidelines for the evaluation of immune therapy activity in solid \n",
|
805 |
+
"tumors: immune-related response criteria. Clin Cancer Res 2009; \n",
|
806 |
+
"15: 7412-7420 [PMID: 19934295 DOI: 10.1158/1078-0432]\n",
|
807 |
+
"14 Hodi FS , Sznol M, Kluger HM, McDermott DF, Carvajal RD, \n",
|
808 |
+
"Lawrence DP, Topalian SL, Atkins MB, Powderly JD, Sharfman \n",
|
809 |
+
"WH, Puzanov I, Smith DC, Leming PD, Lipson EJ, Taube JM, \n",
|
810 |
+
"Anders R, Horak CE, Kollia G, Gupta AK, Sosman JA. Long \n",
|
811 |
+
"term survival of ipilimumab-naive patients (pts) with advanced \n",
|
812 |
+
"melanoma (MEL) treated with nivolumab (anti-PD-1, BMS-936558, \n",
|
813 |
+
"ONO-4538) in a phase I trial. ASCO Annual Meeting 2014 May \n",
|
814 |
+
"30- Jun 3; Chicago, Illinois, USA. J Clin Oncol 2014; 32: 5s (suppl; \n",
|
815 |
+
"abstr 9002)\n",
|
816 |
+
"15 Chiou VL , Burotto M. Pseudoprogression and Immune-Related \n",
|
817 |
+
"Response in Solid Tumors. J Clin Oncol 2015; 33: 3541-3543 \n",
|
818 |
+
"[PMID: 26261262 DOI: 10.1200/JCO.2015.61.6870]\n",
|
819 |
+
"16 Hodi FS , Hwu WJ, Kefford R, Weber JS, Daud A, Hamid O, \n",
|
820 |
+
"Patnaik A, Ribas A, Robert C, Gangadhar TC, Joshua AM, Hersey P, \n",
|
821 |
+
"Dronca R, Joseph R, Hille D, Xue D, Li XN, Kang SP , Ebbinghaus S, \n",
|
822 |
+
"Perrone A, Wolchok JD. Evaluation of Immune-Related Response \n",
|
823 |
+
"Criteria and RECIST v1.1 in Patients With Advanced Melanoma \n",
|
824 |
+
"Treated With Pembrolizumab. J Clin Oncol 2016; 34: 1510-1517 \n",
|
825 |
+
"[PMID: 26951310 DOI: 10.1200/JCO.2015.64.0391]\n",
|
826 |
+
"17 Gould MK , Donington J, Lynch WR, Mazzone PJ, Midthun DE, \n",
|
827 |
+
"Naidich DP, Wiener RS. Evaluation of individuals with pulmonary \n",
|
828 |
+
"nodules: when is it lung cancer? Diagnosis and management of lung \n",
|
829 |
+
"cancer, 3rd ed: American College of Chest Physicians evidence-\n",
|
830 |
+
"based clinical practice guidelines. Chest 2013; 143: e93S-120S \n",
|
831 |
+
"[PMID: 23649456 DOI: 10.1378/chest.12-2351]\n",
|
832 |
+
"18 Cheson BD , Fisher RI, Barrington SF, Cavalli F, Schwartz LH, \n",
|
833 |
+
"Zucca E, Lister TA. Recommendations for initial evaluation, \n",
|
834 |
+
"staging, and res ponse assessment of Hodgkin and non-Hodgkin \n",
|
835 |
+
"lymphoma: the Lugano classification. J Clin Oncol 2014; 32: \n",
|
836 |
+
"3059-3068 [PMID: 25113753 DOI: 10.1200/JCO.2013.54.8800]\n",
|
837 |
+
"19 Morbelli S , Capitanio S, De Carli F, Bongioanni F, De Astis E, \n",
|
838 |
+
"Miglino M, Verardi MT, Buschiazzo A, Fiz F, Marini C, Pomposelli \n",
|
839 |
+
"E, Sambuceti G. Baseline and ongoing PET-derived factors predict \n",
|
840 |
+
"detrimental effect or potential utility of 18F-FDG PET/CT (FDG-\n",
|
841 |
+
"PET/CT) performed f or surveillance in asymptomatic lymphoma \n",
|
842 |
+
"patients in first remission. Eur J Nucl Med Mol Imaging 2016; 43: \n",
|
843 |
+
"232-239 [PMID: 26283504 DOI: 10.1007/s00259-015-3164-9]\n",
|
844 |
+
"20 Sachpekidis C , Larribere L, Pan L, Haberkorn U, Dimitrakopoulou-\n",
|
845 |
+
"Strauss A, Hassel JC. Predictive value of early 18F-FDG PET/\n",
|
846 |
+
"CT studies for treatment response evaluation to ipilimumab in \n",
|
847 |
+
"metastatic melanoma: preliminary results of an ongoing study. Eur J \n",
|
848 |
+
"Nucl Med Mol Imaging 2015; 42: 386-396 [PMID: 25359635 DOI: \n",
|
849 |
+
"10.1007/s00259-014-2944-y]\n",
|
850 |
+
"21 Wahl RL , Jacene H, Kasam on Y, Lodge MA. From RECIST to \n",
|
851 |
+
"PERCIST: Evolving Considerations for PET response criteria in \n",
|
852 |
+
"solid tumors. J Nucl M ed 2009; 50 Suppl 1: 122S-150S [PMID: \n",
|
853 |
+
"19403881 DOI: 10.2967/jnumed.108.057307]\n",
|
854 |
+
"22 Young H , Baum R, Cremerius U, Herholz K, Hoekstra O, \n",
|
855 |
+
"Lammertsma AA, Pruim J, Price P. Measurement of clinical and \n",
|
856 |
+
"subclinical tumour response using [18F]-fluorodeoxyglucose \n",
|
857 |
+
"and positron emission tomography: review and 1999 EORTC \n",
|
858 |
+
"recommendations. European Organization for Research and \n",
|
859 |
+
"Treatment of Cancer (EORTC) PET Study Group. Eur J Cancer \n",
|
860 |
+
"1999; 35: 1773-1782 [PMID: 10673991]\n",
|
861 |
+
"23 Skougaard K , Nielsen D, Jensen BV , Hendel HW. Comparison of \n",
|
862 |
+
"EORTC criteria and PERCIST for PET/CT response evaluation of \n",
|
863 |
+
"patients with metastatic colorectal cancer treated with irinotecan \n",
|
864 |
+
"and cetuximab. J Nucl Med 2013; 54: 1026-1031 [PMID: 23572497 \n",
|
865 |
+
"DOI: 10.2967/jnumed.112.111757]\n",
|
866 |
+
"24 Stefano A , Russo G, Ippolito M, Cosentino S, Murè G, Baldari S, \n",
|
867 |
+
"Sabini MG, Sardina D, Valastro LM, Bordonaro R, Messa C, Gilardi \n",
|
868 |
+
"MC, Soto Parra H. Evaluation of erlotinib treatment response in \n",
|
869 |
+
"non-small cell lung cancer using metabolic and anatomic criteria. Q Bauckneht M et al. Immune checkpoint inhibitors and PET33 February 28, 2017 |Volume 9 |Issue 2| WJR|www.wjgnet.comJ Nucl Med Mol Imaging 2014 May 9; Epub ahead of print [PMID: \n",
|
870 |
+
"24809275]\n",
|
871 |
+
"25 Goethals L , Wilgenhof S, De Geeter F, Everaert H, Neyns B. \n",
|
872 |
+
"18F-FDG PET/CT imaging of an anti-CTLA-4 antibody-associated \n",
|
873 |
+
"autoimmune pancolitis. Eur J Nucl Med Mol Imaging 2011; 38: \n",
|
874 |
+
"1390-1391 [PMID: 21365253 DOI: 10.1007/s00259-011-1749-5]\n",
|
875 |
+
"26 Koo PJ , Klingensmith WC, Lewis KD, Bagrosky BM, Gonzalez \n",
|
876 |
+
"R. Anti-CTLA4 antibody therapy related complications on FDG \n",
|
877 |
+
"PET/CT. Clin Nucl Med 2014; 39: e93-e96 [PMID: 23657138 DOI: \n",
|
878 |
+
"10.1097/RLU.0b013e318292a775]\n",
|
879 |
+
"27 Perng P , Marcus C, Subramaniam RM. (18)F-FDG PET/CT and \n",
|
880 |
+
"Melanoma: Staging, Immune Modulation and Mutation-Targeted \n",
|
881 |
+
"Therapy Assessment, and Prognosis. AJR Am J Roentgenol 2015; \n",
|
882 |
+
"205: 259-270 [PMID: 26204273 DOI: 10.2214/AJR.14.13575]\n",
|
883 |
+
"28 Higashikawa K , Yagi K, Watanabe K, Kamino S, Ueda M, \n",
|
884 |
+
"Hiromura M, Enomoto S. 64Cu-DOTA-anti-CTLA-4 mAb enabled \n",
|
885 |
+
"PET visualization of CTLA-4 on the T-cell infiltrating tumor tissues. \n",
|
886 |
+
"PLoS One 2014; 9: e109866 [PMID: 25365349]\n",
|
887 |
+
"29 Maute RL , Gordon SR, Mayer AT, McCracken MN, Natarajan A, \n",
|
888 |
+
"Ring NG, Kimura R, Tsai JM, Manglik A, Kruse AC, Gambhir SS, \n",
|
889 |
+
"Weissman IL, Ring AM. Engineering high-affinity PD-1 variants for optimized immunotherapy and immuno-PET imaging. Proc Natl \n",
|
890 |
+
"Acad Sci USA 2015; 112: E6506-E6514 [PMID: 26604307 DOI: \n",
|
891 |
+
"10.1073/pnas.1519623112]\n",
|
892 |
+
"30 Hettich M , Braun F, Bartholomä MD, Schirmbeck R, Niedermann \n",
|
893 |
+
"G. High-Resolution PET Imaging with Therapeutic Antibody-based \n",
|
894 |
+
"PD-1/PD-L1 Checkpoint Tracers. Theranostics 2016; 6: 1629-1640 \n",
|
895 |
+
"[PMID: 27446497 DOI: 10.7150/thno.15253]\n",
|
896 |
+
"31 Heskamp S , Hobo W, Molkenboer-Kuenen JD, Olive D, Oyen WJ, \n",
|
897 |
+
"Dolstra H, Boerman OC. Noninvasive Imaging of Tumor PD-L1 \n",
|
898 |
+
"Expression Using Radiolabeled Anti-PD-L1 Antibodies. Cancer Res \n",
|
899 |
+
"2015; 75: 2928-2936 [PMID: 25977331 DOI: 10.1158/0008-5472.\n",
|
900 |
+
"CAN-14-3477]\n",
|
901 |
+
"32 Josefsson A , Nedrow JR, Park S, Banerjee SR, Rittenbach A, Jammes \n",
|
902 |
+
"F, Tsui B, Sgouros G. Imaging, Biodistribution, and Dosimetry of \n",
|
903 |
+
"Radionuclide-Labeled PD-L1 Antibody in an Immunocompetent \n",
|
904 |
+
"Mouse Model of Breast Cancer. Cancer Res 2016; 76: 472-479 [PMID: \n",
|
905 |
+
"26554829 DOI: 10.1158/0008-5472.CAN-15-2141]\n",
|
906 |
+
"33 Larimer BM , Wehrenberg-Klee E, Caraballo A, Mahmood U. \n",
|
907 |
+
"Quantitative CD3 PET Imaging Predicts Tumor Growth Response \n",
|
908 |
+
"to Anti-CTLA-4 Therapy. J Nucl Med 2016; 57: 1607-1611 [PMID: \n",
|
909 |
+
"27230929 DOI: 10.2967/jnumed.116.173930]\n",
|
919 |
+
"\n"
|
920 |
+
]
|
921 |
+
}
|
922 |
+
],
|
923 |
+
"source": [
|
924 |
+
"print(text)"
|
925 |
+
]
|
926 |
+
},
|
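The printout shows the usual PDF-extraction noise: words split across line breaks, running page headers, and figure/table labels spilled into the body. Before chunking this text for the vector store, a light cleanup pass helps; a heuristic sketch only -- running headers need source-specific filtering, and de-hyphenation will also merge genuine compounds:

import re

def clean_pdf_text(raw: str) -> str:
    # Rejoin words hyphenated at line breaks, e.g. "radiolo-\ngical" -> "radiological".
    text = re.sub(r"\s*-\s*\n\s*", "", raw)
    # Unwrap the remaining hard line breaks, then squeeze repeated spaces.
    text = re.sub(r"\s*\n\s*", " ", text)
    return re.sub(r" {2,}", " ", text).strip()

cleaned = clean_pdf_text(text)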
927 |
+
{
|
928 |
+
"cell_type": "code",
|
929 |
+
"execution_count": 48,
|
930 |
+
"metadata": {},
|
931 |
+
"outputs": [],
|
932 |
+
"source": [
|
933 |
+
"import xml.etree.ElementTree as ET\n",
|
934 |
+
"tree = ET.parse(\"ijms-24-05988.nxml\")"
|
935 |
+
]
|
936 |
+
},
|
937 |
+
{
|
938 |
+
"cell_type": "code",
|
939 |
+
"execution_count": 49,
|
940 |
+
"metadata": {},
|
941 |
+
"outputs": [],
|
942 |
+
"source": [
|
943 |
+
"root = tree.getroot()\n",
|
944 |
+
"body_elements = root.findall(\".//body//p\")\n"
|
945 |
+
]
|
946 |
+
},
|
947 |
+
{
|
948 |
+
"cell_type": "code",
|
949 |
+
"execution_count": 51,
|
950 |
+
"metadata": {},
|
951 |
+
"outputs": [
|
952 |
+
{
|
953 |
+
"data": {
|
954 |
+
"text/plain": [
|
955 |
+
"'acute myeloid leukemia'"
|
956 |
+
]
|
957 |
+
},
|
958 |
+
"execution_count": 51,
|
959 |
+
"metadata": {},
|
960 |
+
"output_type": "execute_result"
|
961 |
+
}
|
962 |
+
],
|
963 |
+
"source": [
|
964 |
+
"root.findall(\".//kwd\")[0].text"
|
965 |
+
]
|
966 |
+
},
|
967 |
+
{
|
968 |
+
"cell_type": "code",
|
969 |
+
"execution_count": 53,
|
970 |
+
"metadata": {},
|
971 |
+
"outputs": [
|
972 |
+
{
|
973 |
+
"data": {
|
974 |
+
"text/plain": [
|
975 |
+
"'Variation in Lipid Species Profiles among Leukemic Cells Significantly Impacts Their Sensitivity to the Drug Targeting of Lipid Metabolism and the Prognosis of AML Patients'"
|
976 |
+
]
|
977 |
+
},
|
978 |
+
"execution_count": 53,
|
979 |
+
"metadata": {},
|
980 |
+
"output_type": "execute_result"
|
981 |
+
}
|
982 |
+
],
|
983 |
+
"source": [
|
984 |
+
"root.find(\".//article-title\").text"
|
985 |
+
]
|
986 |
+
},
|
987 |
+
{
|
988 |
+
"cell_type": "code",
|
989 |
+
"execution_count": 54,
|
990 |
+
"metadata": {},
|
991 |
+
"outputs": [
|
992 |
+
{
|
993 |
+
"data": {
|
994 |
+
"text/plain": [
|
995 |
+
"'Several studies have linked bad prognoses of acute myeloid leukemia (AML) to the ability of leukemic cells to reprogram their metabolism and, in particular, their lipid metabolism. In this context, we performed “in-depth” characterization of fatty acids (FAs) and lipid species in leukemic cell lines and in plasma from AML patients. We firstly showed that leukemic cell lines harbored significant differences in their lipid profiles at steady state, and that under nutrient stress, they developed common mechanisms of protection that led to variation in the same lipid species; this highlights that the remodeling of lipid species is a major and shared mechanism of adaptation to stress in leukemic cells. We also showed that sensitivity to etomoxir, which blocks fatty acid oxidation (FAO), was dependent on the initial lipid profile of cell lines, suggesting that only a particular “lipidic phenotype” is sensitive to the drug targeting of FAO. We then showed that the lipid profiles of plasma samples from AML patients were significantly correlated with the prognosis of patients. In particular, we highlighted the impact of phosphocholine and phosphatidyl-choline metabolism on patients’ survival. In conclusion, our data show that balance between lipid species is a phenotypic marker of the diversity of leukemic cells that significantly influences their proliferation and resistance to stress, and thereby, the prognosis of AML patients.'"
|
996 |
+
]
|
997 |
+
},
|
998 |
+
"execution_count": 54,
|
999 |
+
"metadata": {},
|
1000 |
+
"output_type": "execute_result"
|
1001 |
+
}
|
1002 |
+
],
|
1003 |
+
"source": [
|
1004 |
+
"root.find(\".//abstract/p\").text "
|
1005 |
+
]
|
1006 |
+
},
|
1007 |
+
{
|
1008 |
+
"cell_type": "code",
|
1009 |
+
"execution_count": 33,
|
1010 |
+
"metadata": {},
|
1011 |
+
"outputs": [],
|
1012 |
+
"source": [
|
1013 |
+
"\n",
|
1014 |
+
"content = \"\\n\".join([p.text for p in body_elements if p.text]) if body_elements else \"No Content Available\"\n"
|
1015 |
+
]
|
1016 |
+
},
|
1017 |
+
{
|
1018 |
+
"cell_type": "code",
|
1019 |
+
"execution_count": 31,
|
1020 |
+
"metadata": {},
|
1021 |
+
"outputs": [
|
1022 |
+
{
|
1023 |
+
"name": "stdout",
|
1024 |
+
"output_type": "stream",
|
1025 |
+
"text": [
|
1026 |
+
"School nutrition is an important key modifier in terms of child and adolescent nutrient intake. A poor diet early in life can lead to a multitude of immediate and long-term health problems. Changes to the dietary information given in the school setting as well as changes to the food programs offered have the potential to promote adherence to a healthy diet, which can lead to lifelong health benefits.\n",
|
1027 |
+
"The present Special Issue includes two multicomponent school-based nutrition interventions to increase fruit (F) and vegetable (V) intake in children [\n",
|
1028 |
+
"Both multicomponent school-based interventions in the present Special Issue used the well-known theory of planned behavior as one of their theoretical models. More specifically, attitude, subjective norms, and perceived behavioral control can predict behavioral intentions. This approach has effectively predicted and changed diet-related behaviors and intentions in youth [\n",
|
1029 |
+
"The Dutch “Kokkerelli learning street” program combined the classroom with experiential learning strategies. Using this program, Hahnraths et al. also examined FV preferences, knowledge, attitudes, and intention to consume FV short term (directly after the intervention) and after 3 months [\n",
|
1030 |
+
"Both of the school-based nutrition interventions included in the Special Issue aimed to change F and V intake in children aged approximately 7–10 years old. Preference is the main factor associated with F and V in children, whereas Vs are not an innate preferred food. However, participation in the 3-year intervention had a stronger effect on changing FV intake than change in FV preference among primary school children [\n",
|
1031 |
+
"Overall, these intervention studies [\n",
|
1032 |
+
"It is noteworthy that the 3-year intervention “Nutri-skolica” was not fully implemented due to the COVID-19 pandemic. On the same note, the next study in the Special Issue examines the changes due to COVID-19 regarding the implementation of emergency school meals and pandemic electronic benefits in an urban setting. Cadenhead et al. presented qualitative data on facilitators and barriers to using the available emergency school meals and the P-EBT [\n",
|
1033 |
+
"Through the Hunger-Free Kids Act of 2010, the US Department of Agriculture (USDA) established policies to improve the nutritional quality of food and beverages served to US children through federal food assistance programs and it made changes in the Child and Adult Care Food Program (CAFP). In this Special Issue, Dave et al. assess what changes in children’s dietary behaviors occurred as a result of the new CACFP meal pattern requirements [\n",
|
1034 |
+
"The Smart Snacks rule was part of the implementation of the 2010 Act, which allows the USDA to regulate foods and beverages sold in schools outside of the school meal programs. For example, energy drinks are not permitted in schools, and a study using data from the School Nutrition and Meal Cost Study reported 84% compliance in middle schools in the United States [\n",
|
1035 |
+
"The National School Lunch Program (NSLP) and the School Breakfast Program have to follow specific nutrition requirements consistent with the Dietary Guidelines for Americans. Eating school breakfast and school lunch every day was associated with modestly healthier dietary intakes in US schoolchildren [\n",
|
1036 |
+
"Taken together, the studies presented in this Special Issue highlight the relevance of the role of schools in children’s nutrition. The school environment represents a unique opportunity to positively impact the nutrition of children considering they can consume a significant percentage of their daily intake there. Similarly, a recent systematic review concluded that FV interventions provide a promising avenue by which children’s consumption can be improved. Future interventions should place more focus on vegetable intake [\n"
|
1037 |
+
]
|
1038 |
+
}
|
1039 |
+
],
|
1040 |
+
"source": [
|
1041 |
+
"print(content)"
|
1042 |
+
]
|
1043 |
+
},
|
1044 |
+
{
|
1045 |
+
"cell_type": "code",
|
1046 |
+
"execution_count": null,
|
1047 |
+
"metadata": {},
|
1048 |
+
"outputs": [],
|
1049 |
+
"source": []
|
1050 |
+
}
|
1051 |
+
],
|
1052 |
+
"metadata": {
|
1053 |
+
"kernelspec": {
|
1054 |
+
"display_name": "pinecone-env",
|
1055 |
+
"language": "python",
|
1056 |
+
"name": "python3"
|
1057 |
+
},
|
1058 |
+
"language_info": {
|
1059 |
+
"codemirror_mode": {
|
1060 |
+
"name": "ipython",
|
1061 |
+
"version": 3
|
1062 |
+
},
|
1063 |
+
"file_extension": ".py",
|
1064 |
+
"mimetype": "text/x-python",
|
1065 |
+
"name": "python",
|
1066 |
+
"nbconvert_exporter": "python",
|
1067 |
+
"pygments_lexer": "ipython3",
|
1068 |
+
"version": "3.11.10"
|
1069 |
+
}
|
1070 |
+
},
|
1071 |
+
"nbformat": 4,
|
1072 |
+
"nbformat_minor": 2
|
1073 |
+
}
|
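The cells above pull the keywords, title, abstract, and body paragraphs out of the JATS `.nxml` file one query at a time. A minimal sketch collecting the same fields into a single helper, using only the ElementTree paths the cells already use (the function name is ours, not part of the repo):

```python
# Minimal sketch: the same JATS (.nxml) extraction as the cells above,
# gathered into one helper. The XPath queries (.//kwd, .//article-title,
# .//abstract/p, .//body//p) are taken directly from the notebook cells.
import xml.etree.ElementTree as ET

def parse_nxml(path: str) -> dict:
    root = ET.parse(path).getroot()
    title = root.find(".//article-title")
    abstract = root.find(".//abstract/p")
    keywords = [k.text for k in root.findall(".//kwd") if k.text]
    body_elements = root.findall(".//body//p")
    # Same fallback string as the notebook cell
    content = ("\n".join(p.text for p in body_elements if p.text)
               if body_elements else "No Content Available")
    return {
        "title": title.text if title is not None else None,
        "abstract": abstract.text if abstract is not None else None,
        "keywords": keywords,
        "content": content,
    }

# Example: parse_nxml("ijms-24-05988.nxml")["title"]
```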
test_download_data.sh
ADDED
@@ -0,0 +1,39 @@
#!/bin/bash

# Specify the number of articles to download
limit=10

# Fetch the list of articles with metadata in XML format
response=$(curl -s "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf&limit=$limit")

# Parse each record in the response
echo "$response" | while read -r line; do
  # Extract the PMC ID
  if [[ $line =~ id=\"(PMC[0-9]+)\" ]]; then
    pmc_id="${BASH_REMATCH[1]}"
    echo "Processing article ID: $pmc_id"

    # Extract the title for metadata
    title=$(echo "$response" | sed -n "/<record id=\"$pmc_id\"/,/<\/record>/p" | sed -n 's/.*citation="\(.*\)".*/\1/p')

    # Extract the PDF link for download
    pdf_link=$(echo "$response" | sed -n "/<record id=\"$pmc_id\"/,/<\/record>/p" | sed -n 's/.*<link format="pdf"[^>]* href="\([^"]*\)".*/\1/p')

    # Check if we found a PDF link
    if [[ -n $pdf_link ]]; then
      # Print metadata
      echo "Title: $title"
      echo "Downloading PDF from: $pdf_link"

      # Download the PDF
      curl -O "$pdf_link"

      # Optional: save metadata to a file
      echo "Title: $title" >> metadata.txt
      echo "PDF Link: $pdf_link" >> metadata.txt
      echo "---------------------" >> metadata.txt
    else
      echo "No PDF link found for article ID: $pmc_id"
    fi
  fi
done
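One caveat with this script: `while read` walks the OA response line by line, so a `<record>` whose attributes wrap across lines is silently skipped. A sketch of the same download using a real XML parser instead; the record/link structure (`id`, `citation`, `<link format="pdf" href=...>`) is assumed from the script's own sed patterns, not verified against the service docs:

```python
# Sketch: same PMC OA download as test_download_data.sh, but parsing the
# response as XML rather than regex/sed over individual lines.
import urllib.request
import xml.etree.ElementTree as ET

URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf"

with urllib.request.urlopen(URL) as resp:
    root = ET.fromstring(resp.read())

for record in root.iter("record"):
    pmc_id = record.get("id")
    title = record.get("citation")
    link = record.find("link[@format='pdf']")  # attribute predicate: pdf links only
    if link is None:
        print(f"No PDF link found for article ID: {pmc_id}")
        continue
    pdf_link = link.get("href")
    print(f"Title: {title}\nDownloading PDF from: {pdf_link}")
    urllib.request.urlretrieve(pdf_link, f"{pmc_id}.pdf")  # handles ftp:// hrefs too
```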
tests/__init__.py
ADDED
File without changes
tests/test_pinecone.py
ADDED
@@ -0,0 +1,152 @@
import pytest
import os
import time
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from datasets import load_dataset
from dotenv import load_dotenv

class TestPineconeIntegration:
    @pytest.fixture(autouse=True)
    def setup(self):
        """Setup test environment and resources"""
        # Load environment variables from the project-root .env file
        load_dotenv("../.env")

        # Initialize clients
        self.pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Constants
        self.MODEL = "text-embedding-3-small"
        self.index_name = "test-semantic-search-openai"

        yield  # This is where the test runs

        # Cleanup after tests
        try:
            if self.index_name in self.pc.list_indexes().names():
                self.pc.delete_index(self.index_name)
        except Exception as e:
            print(f"Cleanup failed: {str(e)}")

    def test_01_create_embeddings(self):
        """Test OpenAI embedding creation"""
        sample_texts = [
            "Sample document text goes here",
            "there will be several phrases in each batch"
        ]

        res = self.client.embeddings.create(
            input=sample_texts,
            model=self.MODEL
        )

        embeds = [record.embedding for record in res.data]
        assert len(embeds) == 2
        assert len(embeds[0]) > 0  # Check if embeddings are non-empty

        return embeds[0]  # Return for use in other tests

    def test_02_create_index(self):
        """Test Pinecone index creation"""
        # Get sample embedding dimension from previous test
        sample_embed = self.test_01_create_embeddings()
        embedding_dimension = len(sample_embed)

        spec = ServerlessSpec(cloud="aws", region="us-east-1")

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                self.index_name,
                dimension=embedding_dimension,
                metric='dotproduct',
                spec=spec
            )

        # Wait for index to be ready
        max_retries = 60  # Maximum number of seconds to wait
        retries = 0
        while not self.pc.describe_index(self.index_name).status['ready']:
            if retries >= max_retries:
                raise TimeoutError("Index creation timed out")
            time.sleep(1)
            retries += 1

        # Verify index exists and is ready
        assert self.index_name in self.pc.list_indexes().names()
        assert self.pc.describe_index(self.index_name).status['ready']

    def test_03_upload_data(self):
        """Test data upload to Pinecone"""
        # Ensure index exists first
        self.test_02_create_index()

        # Connect to index
        index = self.pc.Index(self.index_name)
        # Load a small slice of the 'trec' train split for testing
        trec = load_dataset('trec', split='train[:10]')

        batch_size = 5
        total_processed = 0

        for i in range(0, len(trec['text']), batch_size):
            i_end = min(i + batch_size, len(trec['text']))
            lines_batch = trec['text'][i:i_end]
            ids_batch = [str(n) for n in range(i, i_end)]

            # Create embeddings
            res = self.client.embeddings.create(input=lines_batch, model=self.MODEL)
            embeds = [record.embedding for record in res.data]

            # Prepare metadata and upsert batch
            meta = [{'text': line} for line in lines_batch]
            to_upsert = zip(ids_batch, embeds, meta)

            # Upsert to Pinecone
            index.upsert(vectors=list(to_upsert))
            total_processed += len(lines_batch)

        # Wait for a moment to ensure data is indexed
        time.sleep(5)

        # Verify data was uploaded
        stats = index.describe_index_stats()
        print(f'stats: {stats}')
        # assert stats.total_vector_count == total_processed

    def test_04_query_index(self):
        """Test querying the Pinecone index"""
        # Ensure data is uploaded first
        self.test_03_upload_data()

        index = self.pc.Index(self.index_name)

        # Create query embedding
        query = "What caused the Great Depression?"
        xq = self.client.embeddings.create(input=query, model=self.MODEL).data[0].embedding

        # Query index
        res = index.query(vector=xq, top_k=5, include_metadata=True)

        # Verify response format
        assert 'matches' in res
        assert len(res['matches']) <= 5  # Should return up to 5 results

        # Verify match format
        for match in res['matches']:
            assert 'score' in match
            assert 'metadata' in match
            assert 'text' in match['metadata']

    def test_05_delete_index(self):
        """Test index deletion"""
        # Ensure index exists first
        self.test_02_create_index()

        # Delete index
        self.pc.delete_index(self.index_name)

        # Verify deletion
        assert self.index_name not in self.pc.list_indexes().names()
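The suite above chains its steps by calling `test_01`/`test_02`/`test_03` directly from later tests, so index creation and data upload re-run on every test. A minimal sketch of the usual pytest alternative, a module-scoped fixture that provisions the index once and tears it down at the end; the client, index name, and 1536-dimension assumption mirror the file above:

```python
# Sketch (not part of the suite): provision the index once per module
# instead of rebuilding it inside each chained test call.
import os
import time
import pytest
from pinecone import Pinecone, ServerlessSpec

@pytest.fixture(scope="module")
def ready_index():
    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    name = "test-semantic-search-openai"
    if name not in pc.list_indexes().names():
        pc.create_index(name, dimension=1536, metric="dotproduct",
                        spec=ServerlessSpec(cloud="aws", region="us-east-1"))
    while not pc.describe_index(name).status["ready"]:
        time.sleep(1)
    yield pc.Index(name)
    pc.delete_index(name)  # teardown runs once, after all tests in the module

def test_index_stats(ready_index):
    assert ready_index.describe_index_stats() is not None
```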
tests/test_pinecone_embeddings.ipynb
ADDED
@@ -0,0 +1,386 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/larawehbe/Documents/fakkerai/sehatech/venv/lib/python3.13/site-packages/pinecone/data/index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from tqdm.autonotebook import tqdm\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import pinecone\n",
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.embeddings import OpenAIEmbeddings  # Adjust to your embedding model\n",
"from langchain.vectorstores import Pinecone\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.llms import OpenAI  # Replace with the LLM of your choice\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Initialize Pinecone\n",
"pc = pinecone.Pinecone(api_key=os.getenv(\"PINECONE_API_KEY\"))\n",
"index_name = \"clec16a-study\"\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"spec = pinecone.ServerlessSpec(cloud='aws', region=\"us-east-1\")\n",
"\n",
"# Create the index if it doesn't exist\n",
"if index_name not in pc.list_indexes().names():\n",
"    pc.create_index(index_name, dimension=1536, spec=spec)  # Adjust dimension as needed\n",
"# Connect to the index\n",
"index = pc.Index(index_name)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/qt/8nj7tb591mx9xtqkgz7mjyjh0000gn/T/ipykernel_17808/4087293823.py:7: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 1.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n",
" embedding_model = OpenAIEmbeddings(model=MODEL)\n"
]
}
],
"source": [
"MODEL = 'text-embedding-ada-002'\n",
"pdf_path = \"../data/main.pdf\"  # Replace with your actual PDF path\n",
"loader = PyPDFLoader(pdf_path)\n",
"documents = loader.load()\n",
"\n",
"# Initialize embedding model\n",
"embedding_model = OpenAIEmbeddings(model=MODEL)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Define function to create or connect to an existing index\n",
"def create_or_connect_index(index_name, dimension):\n",
"    spec = pinecone.ServerlessSpec(cloud='aws', region=\"us-east-1\")\n",
"    if index_name not in pc.list_indexes().names():\n",
"        pc.create_index(\n",
"            name=index_name,\n",
"            dimension=dimension,\n",
"            metric='cosine',  # You can use 'dotproduct' or other metrics if needed\n",
"            spec=spec\n",
"        )\n",
"    return pc.Index(index_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sample embedding: 1536\n"
]
}
],
"source": [
"sample_embedding = embedding_model.embed_query(\"Test\")\n",
"index = create_or_connect_index(index_name, dimension=len(sample_embedding))\n",
"print(f'sample embedding: {len(sample_embedding)}')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"for i, doc in enumerate(documents):\n",
"    embedding = embedding_model.embed_query(doc.page_content)\n",
"    pinecone_id = f\"page-{i}\"\n",
"    metadata = {\"text\": doc.page_content}  # Include a 'text' snippet in metadata\n",
"    index.upsert([(pinecone_id, embedding, metadata)])  # Upsert embedding with metadata"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"\n",
"vector_store = Pinecone.from_existing_index(index_name=index_name, embedding=embedding_model)\n",
"\n",
"# Set up RetrievalQA chain for querying using a chat-based model for better responses\n",
"llm = ChatOpenAI(model=\"gpt-4\", openai_api_key=openai.api_key)  # Replace with the chat model of choice\n",
"qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever())"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Question: What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\n",
"Answer: CLEC16A is an E3 ubiquitin ligase that plays a significant role in mitochondrial quality control through a process called mitophagy. Mitophagy is a type of autophagy where damaged mitochondria are selectively eliminated from the cell. CLEC16A regulates mitophagy by forming a tripartite complex with another E3 ubiquitin ligase, RNF41, and a ubiquitin-specific peptidase, USP8. This complex controls the activity of the mitophagy regulator PRKN/Parkin. \n",
"\n",
"Maintaining mitochondrial quality control is crucial for cellular health as damaged mitochondria can lead to a decrease in energy production, increase in harmful reactive oxygen species, and potential induction of cell death. Therefore, the role of CLEC16A in mitochondrial quality control is important for maintaining cellular health and function. It is also noteworthy that the gene for CLEC16A is associated with over 20 human diseases, including diabetes, cardiovascular disease, stroke, multiple sclerosis, arthritis, and Crohn's disease, further underscoring its importance in cellular health.\n",
"\n",
"Question: How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\n",
"Answer: The intrinsically disordered protein region (IDPR) within CLEC16A plays a significant role in its stability and its interaction with RNF41. The IDPR facilitates CLEC16A's turnover and degradation, with mutations in this region leading to increased stability of the protein. This region also contributes to the interaction between CLEC16A and RNF41, a process that is essential for the assembly of the CLEC16A-RNF41-USP8 mitophagy complex. \n",
"\n",
"Furthermore, the IDPR within CLEC16A is required for RNF41-mediated turnover of CLEC16A, as the removal or shuffling of the IDPR prevents RNF41 from reducing CLEC16A protein levels. This suggests the internal IDPR destabilizes CLEC16A and that this action depends upon the IDPR's amino acid sequence order.\n",
"\n",
"Moreover, the lysine residues within the IDPR are crucial for both CLEC16A turnover and for RNF41 to act upon CLEC16A. However, simply retaining the lysine residues in their original positions within a shuffled IDPR does not restore CLEC16A turnover or RNF41 action, indicating that the entire IDPR sequence needs to be intact for RNF41 to destabilize CLEC16A. \n",
"\n",
"Overall, the internal IDPR within CLEC16A plays a key role in the protein's stability, its interaction with RNF41, and its regulation within the cellular environment.\n",
"\n",
"Question: What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\n",
"Answer: The CLEC16A-RNF41 complex plays a crucial role in the regulation of mitophagy, a process for eliminating damaged mitochondria. The CLEC16A gene encodes an E3 ubiquitin ligase that helps maintain mitochondrial health through selective mitochondrial autophagy (mitophagy). CLEC16A forms a complex with another E3 ligase, RNF41, and a ubiquitin-specific peptidase, USP8, to control the activity of the mitophagy regulator PRKN/Parkin. CLEC16A directly binds and ubiquitinates RNF41 to promote assembly and stability of the tripartite mitophagy complex. The study found that an intrinsically disordered protein region (IDPR) within CLEC16A is crucial for its function and turnover. The IDPR is essential to control the reciprocal regulatory balance between CLEC16A and RNF41, a balance which could possibly be targeted to improve mitochondrial health in disease.\n",
"\n"
]
}
],
"source": [
"# Define the list of questions\n",
"questions = [\n",
"    \"What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\",\n",
"    \"How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\",\n",
"    \"What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\",\n",
"    # Add more questions as needed\n",
"]\n",
"\n",
"# Query each question and print the answers\n",
"for question in questions:\n",
"    answer = qa_chain.run(question)\n",
"    print(f\"Question: {question}\")\n",
"    print(f\"Answer: {answer}\\n\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Now, turn it into a chat"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"chat_llm = ChatOpenAI(model=\"gpt-4o\", openai_api_key=openai.api_key)\n",
"chat_qa_chain = RetrievalQA.from_chain_type(llm=chat_llm, retriever=vector_store.as_retriever(search_kwargs={\"k\": 3}))\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome to the CLEC16A Chat System! Ask any question, or type 'exit' to quit.\n",
"AI: The document is a scientific article discussing research on the regulation of CLEC16A stability through an intrinsically disordered protein region (IDPR) and its implications in various diseases. It includes detailed methodologies such as statistical analysis using Prism software, data availability, acknowledgments, and contributions from various authors supported by institutions like the University of Michigan and the NIH. The research explores genetic associations with diseases like type 1 diabetes, multiple sclerosis, and myocardial infarction. The document also discusses experimental procedures including protein purification, nuclear magnetic resonance (NMR), circular dichroism, and cell culture techniques. Additionally, it highlights the significance of intrinsically disordered proteins in cellular functions and diseases. The article contains references to previous studies and provides a comprehensive overview of the research conducted on CLEC16A and its regulatory mechanisms.\n",
"\n",
"AI: 1. **Research and Data Availability**: The study involves biophysical studies of recombinant proteins, and all data are contained within the manuscript. Supporting information is available in the referenced article.\n",
"\n",
"2. **Funding and Support**: The research received significant support from institutions such as the University of Michigan Center for Structural Biology and various grants from organizations including the NIH, JDRF, and the Department of Veterans Affairs. Specific grants and awards are mentioned, highlighting the financial backing that facilitated the study.\n",
"\n",
"3. **Contributions and Acknowledgments**: The research involved multiple contributors, with specific roles such as conceptualization, investigation, formal analysis, and writing. Acknowledgments are given to individuals and facilities that provided assistance, such as the University of Michigan BioNMR Core for help with NMR studies. The authors declare no conflicts of interest.\n",
"\n",
"AI: The document you provided seems to be a scientific research article about the protein CLEC16A and its intrinsically disordered protein region (IDPR), as well as its regulation and structural characteristics. As an internal medicine doctor, the direct application of this specific research to your practice may not be immediately clear unless the findings relate to a particular medical condition or treatment relevant to your patients.\n",
"\n",
"However, staying informed about the latest scientific research can be beneficial in several ways:\n",
"\n",
"1. **Understanding Disease Mechanisms**: Research into proteins like CLEC16A can provide insights into the mechanisms of diseases, especially if these proteins are implicated in conditions that you treat.\n",
"\n",
"2. **Potential for New Treatments**: Understanding the regulation and stability of proteins might lead to the development of new therapeutic targets or drugs in the future.\n",
"\n",
"3. **Educating Patients**: Knowledge of ongoing research allows you to provide patients with the most current information about their conditions and potential future therapies.\n",
"\n",
"4. **Interdisciplinary Collaboration**: Familiarity with cutting-edge research can facilitate collaboration with specialists, researchers, or clinical trials that might benefit your patients.\n",
"\n",
"5. **Continuing Education**: Engaging with scientific literature is a part of lifelong learning and can help you stay current with medical advancements and innovations.\n",
"\n",
"If the study is related to a specific condition that you encounter, it would be worthwhile to explore how these findings might translate into clinical practices over time.\n",
"\n",
"AI: I'm sorry, but I don't have enough information to answer your question. Could you please provide more context or clarify your inquiry?\n",
"\n",
"AI: The document appears to be a scientific article related to biochemistry and molecular biology, specifically focusing on protein interactions and intrinsically disordered protein regions (IDPR) in the context of diseases like diabetes and autoimmune disorders. As an AI engineer, you might not directly benefit from the specific scientific content unless your work involves bioinformatics, computational biology, or the development of AI models for analyzing biological data. If your work involves these areas, you could gain insights into the types of data and analyses that are relevant in this field, which could inform the development of AI tools or models. Otherwise, the document may not be directly relevant to your work as an AI engineer.\n",
"\n",
"AI: Goodbye! If you have any more questions in the future, feel free to ask. Have a great day!\n",
"\n",
"Exiting the chat. Goodbye!\n"
]
}
],
"source": [
"def chat_system():\n",
"    print(\"Welcome to the CLEC16A Chat System! Ask any question, or type 'exit' to quit.\")\n",
"    while True:\n",
"        question = input(\"You: \")\n",
"        if question.lower() in ['exit', 'quit']:\n",
"            print(\"Exiting the chat. Goodbye!\")\n",
"            break\n",
"        answer = chat_qa_chain.run(question)\n",
"        print(f\"AI: {answer}\\n\")\n",
"\n",
"# Run the chat system\n",
"chat_system()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Now, I want to add a prompt template"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome to the CLEC16A Chat System! Type 'exit' to quit.\n",
"AI (Prompt): Sure, I can help with that. Please go ahead with your questions.\n",
"\n",
"AI: The article investigates the role of an internal intrinsically disordered protein region (IDPR) within the CLEC16A protein, which is an E3 ubiquitin ligase involved in mitochondrial quality control through mitophagy. CLEC16A forms a complex with other proteins, RNF41 and USP8, to regulate mitochondrial health. The study highlights that the internal IDPR of CLEC16A is crucial for the protein's function and turnover. It is essential for the binding and ubiquitination of RNF41, which promotes the stability and assembly of the CLEC16A–RNF41–USP8 complex. Disruption of this IDPR prevents CLEC16A turnover and destabilizes the mitophagy complex. The presence of the IDPR in CLEC16A was confirmed using NMR and CD spectroscopy. This research suggests that targeting the IDPR could improve mitochondrial health in diseases associated with CLEC16A, such as diabetes, cardiovascular disease, and multiple sclerosis.\n",
"\n",
"AI: Based on the document's content, here are three benefits related to mitochondrial quality control facilitated by CLEC16A:\n",
"\n",
"1. **Mitophagy and Mitochondrial Health**: CLEC16A, as an E3 ubiquitin ligase, regulates mitochondrial quality control through the process of mitophagy, which eliminates damaged mitochondria. This helps maintain mitochondrial health.\n",
"\n",
"2. **Tripartite Complex Formation**: CLEC16A forms a complex with RNF41 and USP8, which together regulate the activity of the mitophagy regulator PRKN/Parkin. This complex plays a crucial role in maintaining mitochondrial function.\n",
"\n",
"3. **Disease Prevention and Cellular Function**: Proper functioning of CLEC16A in mitochondrial quality control is crucial for preventing cellular dysfunction and diseases such as diabetes, cardiovascular disease, and multiple sclerosis, as it is associated with over 20 human diseases. This highlights the importance of maintaining mitochondrial integrity and function in various cell types, including pancreatic β-cells, sensory neurons, and immune cells.\n",
"\n",
"AI: I'm sorry, but I don't have access to real-time information, including current weather conditions. I recommend checking a weather app or website for the most up-to-date information.\n",
"\n",
"AI: Goodbye! If you have any more questions in the future, feel free to ask.\n",
"\n",
"Exiting the chat. Goodbye!\n"
]
}
],
"source": [
"# Define the initial system prompt\n",
"initial_prompt = (\n",
"    \"You are an AI assistant specializing in CLEC16A-related research, focusing on mitochondrial quality control, \"\n",
"    \"the role of intrinsically disordered protein regions, and disease implications. \"\n",
"    \"Answer the following questions based on the document's content.\"\n",
")\n",
"\n",
"# Define the chat function with prompt\n",
"def chat_system():\n",
"    print(\"Welcome to the CLEC16A Chat System! Type 'exit' to quit.\")\n",
"\n",
"    # Send the initial prompt\n",
"    response = chat_qa_chain.run(initial_prompt)\n",
"    print(f\"AI (Prompt): {response}\\n\")\n",
"\n",
"    # Start the chat loop\n",
"    while True:\n",
"        question = input(\"You: \")\n",
"        if question.lower() in ['exit', 'quit']:\n",
"            print(\"Exiting the chat. Goodbye!\")\n",
"            break\n",
"        # Prepend initial prompt to each question\n",
"        full_prompt = f\"{initial_prompt}\\n\\n{question}\"\n",
"        answer = chat_qa_chain.run(full_prompt)\n",
"        print(f\"AI: {answer}\\n\")\n",
"\n",
"# Run the chat system\n",
"chat_system()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
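The upsert loop in this notebook embeds and upserts one page at a time, which costs one OpenAI call and one Pinecone request per page. A sketch of the same upload batched; `documents`, `embedding_model`, and `index` are the notebook's own objects, and the batch size of 50 is an assumption, not a tuned value:

```python
# Sketch: batch the per-page upserts from the notebook above.
# embed_documents makes one embedding API call per batch, and each
# batch goes to Pinecone in a single upsert.
batch_size = 50
for start in range(0, len(documents), batch_size):
    batch = documents[start:start + batch_size]
    texts = [doc.page_content for doc in batch]
    embeddings = embedding_model.embed_documents(texts)  # batched embedding call
    vectors = [
        (f"page-{start + i}", emb, {"text": text})  # same (id, values, metadata) tuples
        for i, (emb, text) in enumerate(zip(embeddings, texts))
    ]
    index.upsert(vectors=vectors)
```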
tests/test_pinecone_rag.py
ADDED
@@ -0,0 +1,166 @@
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import unittest
from unittest.mock import MagicMock, patch, mock_open
from typing import List, Dict, Optional
import pinecone
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_openai import OpenAIEmbeddings
from tqdm.auto import tqdm
from core.rag_engine import RAGPrep
from config import get_settings


class TestRAGPrep(unittest.TestCase):
    def setUp(self):
        """Set up test fixtures"""
        self.settings = get_settings()
        self.mock_settings = MagicMock()
        self.mock_settings.INDEX_NAME = "test-index"
        self.mock_settings.PINECONE_API_KEY = self.settings.PINECONE_API_KEY
        self.mock_settings.CLOUD = "aws"
        self.mock_settings.REGION = "us-east-1"
        self.mock_settings.PDF_DIRECTORY = self.settings.PDF_DIRECTORY
        self.mock_settings.CHUNK_SIZE = 1000
        self.mock_settings.CHUNK_OVERLAP = 200
        self.mock_settings.DIMENSIONS = 1536
        self.mock_settings.OPENAI_API_KEY = self.settings.OPENAI_API_KEY

        # Create patchers for get_settings and other dependencies
        self.settings_patcher = patch('core.rag_engine.get_settings', return_value=self.mock_settings)
        self.embeddings_patcher = patch('core.rag_engine.OpenAIEmbeddings')
        self.pinecone_patcher = patch('core.rag_engine.pinecone.Pinecone')

        # Start all patchers
        self.mock_get_settings = self.settings_patcher.start()
        self.mock_embeddings = self.embeddings_patcher.start()
        self.mock_pinecone = self.pinecone_patcher.start()

    def tearDown(self):
        """Clean up after tests"""
        self.settings_patcher.stop()
        self.embeddings_patcher.stop()
        self.pinecone_patcher.stop()

    def test_init(self):
        """Test RAGPrep initialization"""
        # Create instance
        rag_prep = RAGPrep()

        # Assert initialization
        self.assertEqual(rag_prep.index_name, "test-index")
        self.assertEqual(rag_prep.settings, self.mock_settings)
        self.mock_pinecone.assert_called_once_with(self.mock_settings.PINECONE_API_KEY)
        self.mock_embeddings.assert_called_once_with(openai_api_key=self.mock_settings.OPENAI_API_KEY)

    @patch('core.rag_engine.DirectoryLoader')
    def test_load_and_split_pdfs(self, mock_loader_class):
        """Test PDF loading and splitting"""
        # Setup mock documents
        mock_docs = [
            Document(page_content="Test content 1", metadata={"source": "test1.pdf", "page": 1}),
            Document(page_content="Test content 2", metadata={"source": "test2.pdf", "page": 1})
        ]

        # Configure the mock loader
        mock_loader_instance = MagicMock()
        mock_loader_instance.load.return_value = mock_docs
        mock_loader_class.return_value = mock_loader_instance

        # Create instance and test
        rag_prep = RAGPrep()
        chunks = rag_prep.load_and_split_pdfs()

        # Assertions
        self.assertIsInstance(chunks, list)
        mock_loader_class.assert_called_once_with(
            self.mock_settings.PDF_DIRECTORY,
            glob="**/*.pdf",
            loader_cls=PyPDFLoader
        )
        mock_loader_instance.load.assert_called_once()

    def test_process_and_upload(self):
        """Test processing and uploading documents"""
        # Setup mock documents
        mock_docs = [
            Document(page_content="Test 1", metadata={"source": "test.pdf", "page": 1}),
            Document(page_content="Test 2", metadata={"source": "test.pdf", "page": 2})
        ]

        # Create mock embeddings instance
        mock_embeddings_instance = MagicMock()
        mock_embeddings_instance.embed_documents.return_value = [[0.1] * 1536, [0.2] * 1536]
        self.mock_embeddings.return_value = mock_embeddings_instance

        # Mock the index
        mock_index = MagicMock()
        self.mock_pinecone.return_value.Index.return_value = mock_index

        # Mock load_and_split_pdfs
        with patch.object(RAGPrep, 'load_and_split_pdfs', return_value=mock_docs):
            # Create instance and test
            rag_prep = RAGPrep()
            rag_prep.process_and_upload()

            # Assertions
            mock_embeddings_instance.embed_documents.assert_called_once()
            self.assertTrue(mock_index.upsert.called)
            # Verify the format of the upsert call
            called_args = mock_index.upsert.call_args[1]['vectors']
            self.assertEqual(len(called_args), 2)  # Two documents
            self.assertTrue(all(len(v[1]) == 1536 for v in called_args))

    def test_cleanup_index_success(self):
        """Test successful index cleanup"""
        with patch('pinecone.Pinecone') as mock_pinecone:
            # Setup mock
            mock_pc = mock_pinecone.return_value
            mock_pc.list_indexes.return_value.names.return_value = ["test-index"]
            mock_index = MagicMock()
            mock_pc.Index.return_value = mock_index

            # Create instance and test
            rag_prep = RAGPrep()
            result = rag_prep.cleanup_index()

            # Assertions
            self.assertTrue(result)
            mock_index.delete.assert_called_once_with(delete_all=True)

    def test_cleanup_index_no_index(self):
        """Test cleanup when index doesn't exist"""
        with patch('pinecone.Pinecone') as mock_pinecone:
            # Setup mock
            mock_pc = mock_pinecone.return_value
            mock_pc.list_indexes.return_value.names.return_value = []

            # Create instance and test
            rag_prep = RAGPrep()
            result = rag_prep.cleanup_index()

            # Assertions
            self.assertTrue(result)
            mock_pc.Index.assert_not_called()

    def test_cleanup_index_error(self):
        """Test cleanup with error"""
        with patch('pinecone.Pinecone') as mock_pinecone:
            # Setup mock to raise exception
            mock_pc = mock_pinecone.return_value
            mock_pc.list_indexes.return_value.names.return_value = ["test-index"]
            mock_pc.Index.side_effect = Exception("Test error")

            # Create instance and test
            rag_prep = RAGPrep()
            result = rag_prep.cleanup_index()

            # Assertions
            self.assertFalse(result)

if __name__ == '__main__':
    unittest.main()
tests/test_rag_pdf.ipynb
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 4,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"from langchain_community.document_loaders import PyPDFLoader\n"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 5,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"import getpass\n",
|
19 |
+
"import os\n",
|
20 |
+
"from dotenv import load_dotenv\n",
|
21 |
+
"\n",
|
22 |
+
"load_dotenv()\n",
|
23 |
+
"os.environ[\"OPENAI_API_KEY\"] = os.getenv(\"OPENAI_API_KEY\")\n",
|
24 |
+
"\n",
|
25 |
+
"from langchain_openai import ChatOpenAI\n",
|
26 |
+
"\n",
|
27 |
+
"llm = ChatOpenAI(model=\"gpt-4o\")"
|
28 |
+
]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"cell_type": "code",
|
32 |
+
"execution_count": 6,
|
33 |
+
"metadata": {},
|
34 |
+
"outputs": [],
|
35 |
+
"source": [
|
36 |
+
"from langchain_core.vectorstores import InMemoryVectorStore\n",
|
37 |
+
"from langchain_openai import OpenAIEmbeddings\n",
|
38 |
+
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
39 |
+
"\n",
|
40 |
+
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n"
|
41 |
+
]
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"cell_type": "code",
|
45 |
+
"execution_count": null,
|
46 |
+
"metadata": {},
|
47 |
+
"outputs": [
|
48 |
+
{
|
49 |
+
"data": {
|
50 |
+
"text/plain": [
|
51 |
+
"{'input': 'What is this paper about?',\n",
|
52 |
+
" 'context': [Document(id='de54a167-b052-4340-8c9d-c96f3b20b1c8', metadata={'source': '../data/main.pdf', 'page': 12}, page_content='Dunnett’s post hoc multiple comparisons test. A 5% sig-\\nnificance level was used for all statistical tests. All statis-\\ntical analysis was performed using Prism software\\n(GraphPad software, LLC).\\nData availability\\nAll data are contained within the manuscript.\\nSupporting information—This article contains supporting informa-\\ntion (62).\\nAcknowledgments—Recombinant protein for biophysical studies\\nreported in this publication was generated with supported from the\\nUniversity of Michigan Center for Structural Biology (CSB). The\\nCSB acknowledges support from the U-M Life Sciences Institute,\\nthe U-M Rogel Cancer Center, the U-M Medical School Endow-\\nment for Basic Sciences, and grants from the NIH. We thank the\\nUniversity of Michigan BioNMR Core fand for assistance per-\\nforming, analyzing, and interpreting NMR studies. The University\\nof Michigan BioNMR Core is supported by the U-M College of\\nLiterature, Sciences and Arts, Life Sciences Institute, College of'),\n",
|
53 |
+
" Document(id='6a25243d-4b08-4b3d-8db6-d900d6a616ed', metadata={'source': '../data/main.pdf', 'page': 10}, page_content='PAQDVPRSSAKPSIRCFIKPTETLERSLEMNKHKGKKRM\\nQKRPNYKNVGEEEDEERGSAEDAQEDAEKTKGTEGGSKS\\nMKTSGEREEIEMVIMKLGKLSEVAAAGTSVQEQNTTDEE\\nKSAATNSEN\\nShuffle IDPR:\\nMARDKMESNNKTSACSEITGEPETQASREQKVDESEQA\\nEKTDGPNDEMSEAIVAKVLRKNVKKPFKKTREEELLKMN\\nMGASRITNQHKKAYSSLGEEPIGGEARRKGESAPETEKDG\\nEETGSQSTV\\nIDPR K-to-R:\\nPAQDVPRSSARPSIRCFIRPTETLERSLEMNRHRGRRRM\\nQRRPNYRNVGEEEDEERGSAEDAQEDAERTRGTEGGSRS\\nMRTSGEREEIEMVIMRLGRLSEVAAAGTSVQEQNTTDEE\\nRSAATNSEN\\nShuffle IDPR retain K:\\nRNF41 regulates CLEC16A stabilityvia an IDPR\\nJ. Biol. Chem. (2023) 299(4) 103057 11'),\n",
|
54 |
+
" Document(id='eda80cfe-dd68-40e5-84e1-b64db1e9e93d', metadata={'source': '../data/main.pdf', 'page': 12}, page_content='forming, analyzing, and interpreting NMR studies. The University\\nof Michigan BioNMR Core is supported by the U-M College of\\nLiterature, Sciences and Arts, Life Sciences Institute, College of\\nPharmacy, and the Medical School along with the U-M Biosciences\\nInitiative. We thank Drs. H. Popelka, P. Arvan, D. Fingar, and\\nmembers of the Soleimanpour laboratory for helpful advice.\\nAuthor contributions—M. A. G. and S. A. S. conceptualization; M.\\nA. G., J. Z., B. C., M. P. V., N. X., V. S., and D. S. investigation;\\nM. A. G., M. P. V., and D. S. formal analysis; M. A. G. data curation;\\nM. A. G. and S. A. S. writing– original draft; M. A. G. and S. A. S.\\nfunding acquisition; J. Z., B. C., M. P. V., V. S., N. A. K., D. S., D. J.\\nK., S. S., and S. A. S. writing– review and editing; N. A. K., D. S., D. J.\\nK., S. S., and S. A. S. resources; N. A. K., D. S., D. J. K., S. S., and S. A.\\nS. supervision; S. A. S. visualization.\\nFunding and additional information—M. A. G. was supported by'),\n",
|
55 |
+
" Document(id='fa72767a-90cc-4ca0-9889-d4eaf81549e5', metadata={'source': '../data/main.pdf', 'page': 12}, page_content='Protein Sci. 25, 1767–1785\\n20. Guharoy, M., Bhowmick, P., and Tompa, P. (2016) Design principles\\ninvolving protein disorder facilitate specific substrate selection and degra-\\ndation by the ubiquitin-proteasome system.J. Biol. Chem.291,6 7 2 3–6731\\n21. Bhowmick, P., Pancsa, R., Guharoy, M., and Tompa, P. (2013) Functional\\ndiversity and structural disorder in the human ubiquitination pathway.\\nPLoS One8, e65443\\n22. Tunyasuvunakool, K., Adler, J., Wu, Z., Green, T., Zielinski, M.,/C20Zídek, A.,\\net al. (2021) Highly accurate protein structure prediction for the human\\nproteome. Nature 596, 590–596\\n23. Varadi, M., Anyango, S., Deshpande, M., Nair, S., Natassia, C., Yorda-\\nnova, G., et al. (2022) AlphaFold protein structure database: Massively\\nRNF41 regulates CLEC16A stabilityvia an IDPR\\nJ. Biol. Chem. (2023) 299(4) 103057 13')],\n",
|
56 |
+
" 'answer': 'This paper investigates the regulation of CLEC16A stability by RNF41 through an intrinsically disordered protein region (IDPR). It explores the mechanisms by which protein disorder contributes to substrate selection and degradation processes in the ubiquitin-proteasome system. The study involves biophysical and structural analyses supported by various University of Michigan resources.'}"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
"execution_count": 5,
|
60 |
+
"metadata": {},
|
61 |
+
"output_type": "execute_result"
|
62 |
+
}
|
63 |
+
],
|
64 |
+
"source": [
|
65 |
+
"from langchain.chains import create_retrieval_chain\n",
|
66 |
+
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
|
67 |
+
"from langchain_core.prompts import ChatPromptTemplate\n",
|
68 |
+
"\n",
|
69 |
+
"system_prompt = (\n",
|
70 |
+
" \"You are an assistant for question-answering tasks. \"\n",
|
71 |
+
" \"Use the following pieces of retrieved context to answer \"\n",
|
72 |
+
" \"the question. If you don't know the answer, say that you \"\n",
|
73 |
+
" \"don't know. Use three sentences maximum and keep the \"\n",
|
74 |
+
" \"answer concise.\"\n",
|
75 |
+
" \"\\n\\n\"\n",
|
76 |
+
" \"{context}\"\n",
|
77 |
+
")\n",
|
78 |
+
"\n",
|
79 |
+
"prompt = ChatPromptTemplate.from_messages(\n",
|
80 |
+
" [\n",
|
81 |
+
" (\"system\", system_prompt),\n",
|
82 |
+
" (\"human\", \"{input}\"),\n",
|
83 |
+
" ]\n",
|
84 |
+
")\n",
|
85 |
+
"\n",
|
86 |
+
"\n",
|
87 |
+
"question_answer_chain = create_stuff_documents_chain(llm, prompt)\n",
|
88 |
+
"rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n",
|
89 |
+
"\n",
|
90 |
+
"results = rag_chain.invoke({\"input\": \"What is this paper about?\"})\n",
|
91 |
+
"\n"
|
92 |
+
]
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"cell_type": "code",
|
96 |
+
"execution_count": 9,
|
97 |
+
"metadata": {},
|
98 |
+
"outputs": [
|
99 |
+
{
|
100 |
+
"name": "stdout",
|
101 |
+
"output_type": "stream",
|
102 |
+
"text": [
|
103 |
+
"This paper investigates the regulation of CLEC16A stability by RNF41 through an intrinsically disordered protein region (IDPR). It explores the mechanisms by which protein disorder contributes to substrate selection and degradation processes in the ubiquitin-proteasome system. The study involves biophysical and structural analyses supported by various University of Michigan resources.\n"
]
}
],
"source": [
"print(results['answer'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Question: What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\n",
"Answer: CLEC16A is an E3 ubiquitin ligase that regulates mitochondrial quality control by facilitating mitophagy, a process that eliminates damaged mitochondria. It forms a complex with RNF41 and USP8 to control the activity of the mitophagy regulator PRKN/Parkin. This function is crucial for cellular health as it maintains mitochondrial integrity, which is vital for energy production and preventing cellular damage, especially in cell types like pancreatic β-cells, sensory neurons, and immune cells.\n",
"\n",
"Question: How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\n",
"Answer: The intrinsically disordered protein region (IDPR) within CLEC16A is crucial for its stability and interaction with RNF41, as it regulates CLEC16A turnover and is the site where RNF41 acts to destabilize CLEC16A. The IDPR is essential for the enzymatic function and molecular interactions of CLEC16A, including the assembly of the CLEC16A–RNF41–USP8 mitophagy complex. Loss of this IDPR impairs CLEC16A's ability to ubiquitinate RNF41, affecting the overall stability and function of the protein complex.\n",
"\n",
"Question: What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\n",
"Answer: The CLEC16A-RNF41 complex is significant in the regulation of mitophagy as it promotes the assembly and stability of the tripartite mitophagy complex, which includes CLEC16A, RNF41, and USP8. This complex plays a crucial role in mitochondrial quality control, and the loss of CLEC16A impairs mitochondrial health and function in various cell types. Additionally, RNF41 might have a more central role in mitophagy than previously thought, as it can lead to the degradation of key mitophagy regulators.\n",
"\n",
"Question: How does RNF41 influence the turnover of CLEC16A, and what are the molecular mechanisms involved?\n",
"Answer: RNF41 influences the turnover of CLEC16A by ubiquitinating and destabilizing it, an action that requires RNF41's ubiquitin ligase activity. Overexpression of RNF41 decreases CLEC16A protein levels and increases its ubiquitination, but a ligase-dead RNF41 mutant does not affect CLEC16A levels. The internal IDPR of CLEC16A is crucial for this process, as altering this region prevents RNF41 from reducing CLEC16A levels.\n",
"\n",
"Question: Which diseases are associated with dysregulation of CLEC16A, and what implications does this have for potential treatments?\n",
"Answer: Dysregulation of CLEC16A is associated with over 20 human diseases, including diabetes, cardiovascular disease, stroke, multiple sclerosis, arthritis, Crohn's disease, and other inflammatory diseases. The implications for potential treatments involve targeting the intrinsic disordered protein region (IDPR) within CLEC16A to prevent its turnover, thereby increasing protein levels to enhance its function. This approach could potentially treat or prevent diseases associated with reduced CLEC16A levels by improving mitochondrial health through enhanced mitophagy.\n",
"\n",
"Question: What techniques were used in this study to confirm the presence and function of the IDPR in CLEC16A?\n",
"Answer: The study used in silico computational techniques, including AlphaFold for protein structure prediction and IUPred2 for disorder prediction, to identify the putative internal IDPR in CLEC16A. Experimentally, they used NMR spectroscopy to examine the structural conformation of the IDPR and Western blot analysis to assess the impact of the IDPR on protein stability. They also introduced mutations and compared protein levels and stability in HEK293T cells to determine the function of the IDPR in regulating CLEC16A stability.\n",
"\n",
"Question: How does the disruption of CLEC16A’s IDPR affect its ubiquitination and degradation?\n",
"Answer: Disruption of CLEC16A's internal IDPR prevents its turnover and reduces self-ubiquitination in vitro. The IDPR promotes CLEC16A destabilization, and its absence results in higher stability and protein levels compared to the wild-type CLEC16A. Additionally, RNF41 promotes the ubiquitination and destabilization of CLEC16A, and this process is impaired when the IDPR is disrupted.\n",
"\n",
"Question: Why might the IDPR in CLEC16A be considered a therapeutic target for diseases related to mitochondrial dysfunction?\n",
"Answer: The IDPR in CLEC16A is considered a therapeutic target because it regulates CLEC16A turnover, and its destabilization can impact mitochondrial health. By blocking access to this region, it is possible to prevent CLEC16A turnover, potentially increasing protein levels and enhancing its function. This could help treat or prevent diseases associated with mitochondrial dysfunction.\n",
"\n",
"Question: How do mutations within CLEC16A’s IDPR affect the protein's ability to form complexes with RNF41 and USP8?\n",
"Answer: Mutations within CLEC16A's IDPR impair the protein's ability to bind and ubiquitinate RNF41, which is essential for forming the CLEC16A–RNF41–USP8 mitophagy complex. Truncating or shuffling the residues within the IDPR reduces its binding to RNF41 and disrupts the assembly of the tripartite complex. This suggests that the integrity and specific sequence of the IDPR are crucial for the proper interaction and complex formation with RNF41 and USP8.\n",
"\n",
"Question: What did biophysical analyses reveal about the structural properties of CLEC16A’s IDPR, and how do these properties contribute to its function?\n",
"Answer: Biophysical analyses revealed that CLEC16A's internal IDPR is predicted to lack secondary structure and is enriched in charged, polar residues like glutamic acid and lysine, which promote intrinsic disorder. These structural properties contribute to the protein's function by regulating CLEC16A turnover, as lysine residues in the IDPR are essential for this process. The IDPR's role in turnover is significant because it affects CLEC16A stability and function, which is pertinent to its involvement in human diseases.\n",
"\n"
]
}
],
"source": [
"# List of questions based on the PDF content for testing\n",
"questions = [\n",
" \"What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\",\n",
" \"How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\",\n",
" \"What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\",\n",
" \"How does RNF41 influence the turnover of CLEC16A, and what are the molecular mechanisms involved?\",\n",
" \"Which diseases are associated with dysregulation of CLEC16A, and what implications does this have for potential treatments?\",\n",
" \"What techniques were used in this study to confirm the presence and function of the IDPR in CLEC16A?\",\n",
" \"How does the disruption of CLEC16A’s IDPR affect its ubiquitination and degradation?\",\n",
" \"Why might the IDPR in CLEC16A be considered a therapeutic target for diseases related to mitochondrial dysfunction?\",\n",
" \"How do mutations within CLEC16A’s IDPR affect the protein's ability to form complexes with RNF41 and USP8?\",\n",
" \"What did biophysical analyses reveal about the structural properties of CLEC16A’s IDPR, and how do these properties contribute to its function?\"\n",
"]\n",
"\n",
"# Loop through each question, invoke the RAG chain, and print each answer\n",
"for question in questions:\n",
"    result = rag_chain.invoke({\"input\": question})\n",
"    print(f\"Question: {question}\")\n",
" print(f\"Answer: {result[\"answer\"]}\\n\")\n"
|
173 |
+
]
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
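Note: the final cells above assume `llm` and `retriever` objects created earlier in the notebook, which are not shown in this excerpt. A minimal sketch of that setup, assuming an OpenAI chat model and an existing Pinecone index; the model names and the index name "main-index" are placeholders, not values taken from this repo:

# Sketch of the `llm` and `retriever` the chain above relies on.
# Assumes OPENAI_API_KEY and PINECONE_API_KEY are set in the environment.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

llm = ChatOpenAI(model="gpt-4o", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Connect to an index that already holds the embedded PDF chunks.
vectorstore = PineconeVectorStore.from_existing_index(
    index_name="main-index",  # placeholder index name
    embedding=embeddings,
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})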
todo.md
ADDED
@@ -0,0 +1,2 @@
https://medium.com/@atul.auddy/question-answering-over-documents-using-%EF%B8%8Flangchain-and-pinecone-30250391d6a5
Read this to learn how to do RAG over documents with Pinecone and LangChain.
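For future reference, a rough sketch of the indexing step that article walks through, i.e. loading a PDF, chunking it, and writing the chunks to Pinecone; the file path, index name, and model choice below are illustrative placeholders rather than settings from this repo:

# Sketch: load a PDF, split it into chunks, and index the chunks in Pinecone.
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

docs = PyPDFLoader("data/main.pdf").load()  # placeholder path
chunks = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(docs)

PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    index_name="main-index",  # placeholder index name
)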
utils/__init__.py
ADDED
File without changes
utils/helpers.py
ADDED
File without changes
utils/models.py
ADDED
@@ -0,0 +1,94 @@
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Boolean
from sqlalchemy.orm import declarative_base  # modern import path; sqlalchemy.ext.declarative is deprecated
from sqlalchemy.orm import relationship, sessionmaker
from datetime import datetime
import uuid

Base = declarative_base()

class ChatSession(Base):
    __tablename__ = 'chat_sessions'

    id = Column(Integer, primary_key=True)
    session_id = Column(String(36), unique=True, default=lambda: str(uuid.uuid4()))
    doctor_name = Column(String(100), nullable=False)  # Added doctor name
    user_identifier = Column(String(150))  # Will store doctor_name + timestamp
    started_at = Column(DateTime, default=datetime.utcnow)
    last_activity = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    messages = relationship("ChatMessage", back_populates="session")

    @classmethod
    def get_sessions_by_doctor(cls, session, doctor_name):
        """Get all sessions for a specific doctor"""
        return session.query(cls).filter(cls.doctor_name == doctor_name).all()

class ChatMessage(Base):
    __tablename__ = 'chat_messages'

    id = Column(Integer, primary_key=True)
    session_id = Column(String(36), ForeignKey('chat_sessions.session_id'))
    timestamp = Column(DateTime, default=datetime.utcnow)
    is_user = Column(Boolean, default=True)
    message = Column(Text)
    sources_used = Column(Text, nullable=True)

    session = relationship("ChatSession", back_populates="messages")

class DatabaseManager:
    def __init__(self, db_url="sqlite:///chat_history.db"):
        self.engine = create_engine(db_url)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def create_session(self, doctor_name):
        """Create a new chat session with doctor name and timestamp"""
        session = self.Session()
        try:
            # Create unique timestamp for this session
            timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")  # %f adds microseconds for uniqueness
            user_identifier = f"{doctor_name}_{timestamp}"

            chat_session = ChatSession(
                doctor_name=doctor_name,
                user_identifier=user_identifier
            )
            session.add(chat_session)
            session.commit()
            return chat_session.session_id
        finally:
            session.close()

    def get_doctor_sessions(self, doctor_name):
        """Get all sessions for a specific doctor"""
        session = self.Session()
        try:
            return session.query(ChatSession)\
                .filter(ChatSession.doctor_name == doctor_name)\
                .order_by(ChatSession.last_activity.desc()).all()
        finally:
            session.close()

    def log_message(self, session_id, message, is_user=True, sources=None):
        """Persist a single chat message for the given session"""
        session = self.Session()
        try:
            chat_message = ChatMessage(
                session_id=session_id,
                message=message,
                is_user=is_user,
                sources_used=sources
            )
            session.add(chat_message)
            session.commit()
        finally:
            session.close()

    def get_session_history(self, session_id):
        """Return a session's messages ordered by timestamp"""
        session = self.Session()
        try:
            messages = session.query(ChatMessage)\
                .filter(ChatMessage.session_id == session_id)\
                .order_by(ChatMessage.timestamp).all()
            return messages
        finally:
            session.close()
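A quick usage sketch of the DatabaseManager defined above; the doctor name, messages, and source string are illustrative only:

# Sketch: exercising the DatabaseManager API from utils/models.py.
from utils.models import DatabaseManager

db = DatabaseManager()  # defaults to sqlite:///chat_history.db

# Start a session for a doctor and log a question/answer pair.
session_id = db.create_session("Dr. Smith")
db.log_message(session_id, "What does RNF41 do to CLEC16A?", is_user=True)
db.log_message(session_id, "RNF41 ubiquitinates and destabilizes CLEC16A.",
               is_user=False, sources="main.pdf, page 12")

# Replay the conversation in chronological order.
for msg in db.get_session_history(session_id):
    speaker = "user" if msg.is_user else "assistant"
    print(f"[{msg.timestamp}] {speaker}: {msg.message}")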