first commit
- Config/model_config.json +46 -0
- Dockerfile +44 -0
- docker-compose.yml +11 -0
- librarymed/.DS_Store +0 -0
- librarymed/.gitkeep +1 -0
- librarymed/__init__.py +0 -0
- librarymed/huggingface/DejaVu/DejaVuSansCondensed-Bold.ttf +0 -0
- librarymed/huggingface/DejaVu/DejaVuSansCondensed-Oblique.ttf +0 -0
- librarymed/huggingface/DejaVu/DejaVuSansCondensed.ttf +0 -0
- librarymed/huggingface/DejaVu/readme.txt +40 -0
- librarymed/huggingface/RAG_utils_huggingface.py +995 -0
- librarymed/huggingface/app_huggingface.py +304 -0
- librarymed/kromin/RAG_utils.py +983 -0
- librarymed/kromin/__init__.py +0 -0
- librarymed/kromin/app_librarymed.py +169 -0
- librarymed/local/RAG_utils.py +979 -0
- librarymed/local/__init__.py +0 -0
- librarymed/local/app_local.py +160 -0
- librarymed/local/templates/index.html +187 -0
- librarymed/local/templates/upload_and_results.html +227 -0
- librarymed/main.py +22 -0
- requirements.txt +41 -0
Config/model_config.json
ADDED
@@ -0,0 +1,46 @@
{
    "pdf_processing": {
        "extract_images": false,
        "infer_table_structure": true,
        "strategy": "fast",
        "chunking_strategy": "by_title",
        "model_name": "yolox",
        "max_characters": 10000,
        "combine_text_under_n_chars": 100
    },
    "allowed_extensions": "pdf",
    "embeddings": "huggingface",
    "embeddings_model": "BAAI/bge-small-en-v1.5",
    "llm_model": "gpt-4",
    "model_temp": 0.2,
    "max_tokens": 512,
    "context_window": 5000,
    "UPLOAD_FOLDER": "../path/to/upload/folder",
    "GPT_PROMPT_PATH": "data/prompts/prompt_gpt.txt",
    "MISTRAL_PROMPT_PATH": "data/prompts/prompt_mistral.txt",
    "INFO_PROMPT_PATH": "data/prompts/prompt_info.txt",
    "peer_review_journals_path": "data/prompts/peer_review_journals.txt",
    "eq_network_journals_path": "data/prompts/eq_network_journals.txt",
    "queries": ["Does the article share any data or code? Look for terms related to supplementary materials or reproducibility.",
                "Has the study or any data in the article been registered in advance?",
                "Does the article adhere to specific reporting guidelines such as ISRCTN, CONSORT, PRISMA, MOOSE, STARD, ARRIVE, STROBE, SPIRIT, CARE, AGREE, SRQR, SQUIRE, MDAR, REMARK?",
                "Is the article's methodology described in detail, including where, when, how, what, and who?",
                "Are the data collection processes described in detail, including where, when, how, what, and who?",
                "Does the article provide a detailed description of the sample, including size, demographics, recruitment, and criteria?",
                "Does the article describe the data analysis process in detail?",
                "Does the article discuss measures taken to avoid or minimize systematic bias?",
                "Has the article been published in a journal?"],
    "criteria": [
        "Data and code sharing.",
        "Has anything in the article been registered (in advance)?",
        "Does the article follow any reporting guidelines?",
        "Description of methodology",
        "Data collection processes",
        "Sample description. eg. size, demographics, recruitment, in-/exclusion criteria",
        "Data analysis process",
        "Measures to minimize systematic bias",
        "Peer Review"],
    "journal_query": "Is the given research paper published in any of the following journals: {}?",
    "author_query": "Give me details about the institutions (like university or hospital) and contact details (eg. email) of the corresponding author.",
    "title_query": "Output title of the paper."
}
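
A minimal sketch (not part of the commit) of how this config is consumed through the ConfigManager class added in RAG_utils_huggingface.py further down; the Config/ path here is an assumption, and app_huggingface.py itself loads the file as "model_config.json" relative to its own directory:

# Sketch: reading model_config.json via the ConfigManager from this commit.
from RAG_utils_huggingface import ConfigManager

config_manager = ConfigManager()
config_manager.load_config("model", "Config/model_config.json")  # path is an assumption

pdf_processing = config_manager.get_config_value("model", "pdf_processing")
queries = config_manager.get_config_value("model", "queries")
print(pdf_processing["chunking_strategy"], len(queries))  # -> by_title 9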
Dockerfile
ADDED
@@ -0,0 +1,44 @@
# Use the official Python image as the base image
FROM python:3.9

# Install system dependencies (OCR, PDF rendering, and OpenGL for image handling)
RUN apt-get update && apt-get install -y \
    python3-pip \
    tesseract-ocr \
    libtesseract-dev \
    libgl1-mesa-glx \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory in the container
WORKDIR /app

# Copy the dependencies file to the working directory
COPY requirements.txt .

# Install dependencies
RUN pip install --trusted-host pypi.python.org -r requirements.txt

# Copy the content of the local src directory to the working directory
COPY . .

# Create a non-root user to run the application
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory in the user's home directory
WORKDIR $HOME/app
COPY --chown=user . $HOME/app

# Expose the port on which the Flask app will run
EXPOSE 80

# Define environment variable
ENV NAME World

# Command to run on container start
CMD ["python", "librarymed/main.py"]
docker-compose.yml
ADDED
@@ -0,0 +1,11 @@
version: '3.8'

services:
  flask-app:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      - .:/app
    ports:
      - "80:80"
librarymed/.DS_Store
ADDED
Binary file (6.15 kB)
librarymed/.gitkeep
ADDED
@@ -0,0 +1 @@
librarymed/__init__.py
ADDED
File without changes
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Bold.ttf
ADDED
Binary file (632 kB)
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Oblique.ttf
ADDED
Binary file (576 kB)
librarymed/huggingface/DejaVu/DejaVuSansCondensed.ttf
ADDED
Binary file (644 kB)
librarymed/huggingface/DejaVu/readme.txt
ADDED
@@ -0,0 +1,40 @@
Congratulations, you have successfully downloaded the font file!

This font is provided to you by Fonts2u.com – the largest online
repository of free fonts for Windows and Mac.


How to install this font on your computer?

For Windows 7 / Vista users:

- Right-click the font file(s) and choose "Install".

For users of previous Windows versions:

- Copy the included file(s) into a default Windows font folder
  (usually C:\WINDOWS\FONTS or C:\WINNT\FONTS)

For Mac users:

Mac OS X 10.3 or above (including the FontBook)

- Double-click the font file and hit the "Install font" button at
  the bottom of the preview.

Mac OS X

- Either copy the font file(s) to /Library/Fonts (for all users),
  or to /Users/Your_username/Library/Fonts (for you only).

Mac OS 9 or earlier

- You have to convert the font file(s) you have downloaded.
  Drag the font suitcases into the System folder. The system
  will offer to add them to the Fonts folder.

For Linux users:

- Copy the font file(s) to /usr/share/fonts
ADDED
@@ -0,0 +1,995 @@
|
import os
import re
import json
import torch

import openai
import logging
import asyncio
import aiohttp
import pandas as pd
import numpy as np
import evaluate
import qdrant_client
import requests
from pypdf import PdfReader
from pydantic import BaseModel, Field
from typing import Any, List, Tuple, Set, Dict, Optional, Union
from sklearn.metrics.pairwise import cosine_similarity

from unstructured.partition.pdf import partition_pdf

import llama_index
from llama_index import PromptTemplate
from llama_index.retrievers import VectorIndexRetriever, BaseRetriever, BM25Retriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import get_response_synthesizer
from llama_index.schema import NodeWithScore
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.embeddings import OpenAIEmbedding
from llama_index.llms import HuggingFaceLLM
from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.postprocessor import SentenceTransformerRerank, LLMRerank

from tempfile import NamedTemporaryFile

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a logger object
logger = logging.getLogger(__name__)


class ConfigManager:
    """
    A class to manage loading and accessing configuration settings.

    Attributes:
        configs (dict): Dictionary mapping configuration names to loaded settings.

    Methods:
        load_config(config_name: str, config_path: str): Loads a named configuration from a JSON file.
        get_config_value(config_name: str, key: str): Retrieves a specific configuration value.
    """

    def __init__(self):
        self.configs = {}

    def load_config(self, config_name: str, config_path: str) -> None:
        """
        Loads configuration settings from a specified JSON file into a named configuration.

        Args:
            config_name (str): The name to assign to this set of configurations.
            config_path (str): The path to the configuration file.

        Raises:
            FileNotFoundError: If the config file is not found.
            json.JSONDecodeError: If there is an error parsing the config file.
        """
        try:
            with open(config_path, 'r') as f:
                self.configs[config_name] = json.load(f)
        except FileNotFoundError:
            logging.error(f"Config file not found at {config_path}")
            raise
        except json.JSONDecodeError as e:
            logging.error(f"Error decoding config file: {e}")
            raise

    def get_config_value(self, config_name: str, key: str) -> str:
        """
        Retrieves a specific configuration value.

        Args:
            config_name (str): The name of the loaded configuration set.
            key (str): The key for the configuration setting.

        Returns:
            str: The value of the configuration setting.

        Raises:
            ValueError: If the key is not found or is set to a placeholder value.
        """
        value = self.configs.get(config_name, {}).get(key)
        if value is None or value == "ENTER_YOUR_TOKEN_HERE":
            raise ValueError(f"Please set your '{key}' in the config.json file.")
        return value


class base_utils:
    """
    A utility class providing miscellaneous static methods for processing and analyzing text data,
    particularly from PDF documents and filenames. This class also includes methods for file operations.

    This class encapsulates the functionality of extracting key information from text, such as scores,
    reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
    and reading content from files.

    Attributes:
        None (This class contains only static methods and does not maintain any state)

    Methods:
        extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
            Extracts a score and reasoning from a given text using regular expressions.

        extract_id_from_filename(filename: str) -> Optional[int]:
            Extracts an ID from a given filename based on a specified pattern.

        find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
            Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.

        read_from_file(file_path: str) -> str:
            Reads the content of a file and returns it as a string.
    """

    @staticmethod
    def read_from_file(file_path: str) -> str:
        """
        Reads the content of a file and returns it as a string.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The content of the file.
        """
        with open(file_path, 'r') as prompt_file:
            prompt = prompt_file.read()
        return prompt

    @staticmethod
    def extract_id_from_filename(filename: str) -> Optional[int]:
        """
        Extracts an ID from a filename, assuming the format 'Id_{I}.pdf', where {I} is the ID.

        Args:
            filename (str): The filename from which to extract the ID.

        Returns:
            int: The extracted ID as an integer, or None if the pattern is not found.
        """
        match = re.search(r'Id_(\d+).pdf', filename)
        if match:
            return int(match.group(1))  # Convert to integer if ID is numeric
        else:
            return None

    @staticmethod
    def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
        """
        Extracts the score and the longest reasoning from a given text using regular expressions.

        Args:
            text (str): The text from which to extract the score and reasoning.

        Returns:
            dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
        """
        # Define regular expression patterns for score and reasoning
        score_pattern = r"Score: (\d+)"
        reasoning_pattern = r"Reasoning: (\S.+)"

        # Extract score using regular expressions
        score_match = re.search(score_pattern, text)

        # Extract all reasoning matches
        reasoning_matches = re.findall(reasoning_pattern, text, re.DOTALL)

        # Find the longest reasoning match
        longest_reasoning = max(reasoning_matches, key=len) if reasoning_matches else None

        # Extract and return the results
        extracted_data = {
            "score": score_match.group(1) if score_match else None,
            "reasoning": longest_reasoning.strip() if longest_reasoning else None
        }

        return extracted_data

    @staticmethod
    def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
        """
        Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.

        Args:
            pdf_filename (str): The filename of the PDF.
            dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.

        Returns:
            pandas.Series or str: The matched row from the dataframe or a message indicating
                                  that no matching row or invalid filename was found.
        """
        pdf_id = base_utils.extract_id_from_filename(pdf_filename)
        if pdf_id is not None:
            # Assuming the first column contains the ID
            matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
            if not matched_row.empty:
                return matched_row
            else:
                return "No matching row found."
        else:
            return "Invalid file name."


class PDFProcessor_Unstructured:
    """
    A class to process PDF files, providing functionalities for extracting, categorizing,
    and merging elements from a PDF file.

    This class is designed to handle unstructured PDF documents, particularly useful for
    tasks involving text extraction, categorization, and data processing within PDFs.

    Attributes:
        file_path (str): The full path to the PDF file.
        folder_path (str): The directory path where the PDF file is located.
        file_name (str): The name of the PDF file.
        texts (List[str]): A list to store extracted text chunks.
        tables (List[str]): A list to store extracted tables.

    Methods:
        extract_pdf_elements() -> List:
            Extracts images, tables, and text chunks from a PDF file.

        categorize_elements(raw_pdf_elements: List) -> None:
            Categorizes extracted elements from a PDF into tables and texts.

        merge_chunks() -> List[str]:
            Merges text chunks based on punctuation and character case criteria.

        should_skip_chunk(chunk: str) -> bool:
            Determines if a chunk should be skipped based on its content.

        should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
            Determines if the current chunk should be merged with the next one.

        process_pdf() -> Tuple[List[str], List[str]]:
            Processes the PDF by extracting, categorizing, and merging elements.

        process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
            Processes an uploaded PDF file to extract and categorize text and tables.
    """

    def __init__(self, config: Dict[str, any]):
        self.file_path = None
        self.folder_path = None
        self.file_name = None
        self.texts = []
        self.tables = []
        self.config = config if config is not None else self.default_config()
        logger.info(f"Initialized PDFProcessor_Unstructured for file: {self.file_name}")

    @staticmethod
    def default_config() -> Dict[str, any]:
        """
        Returns the default configuration for PDF processing.

        Returns:
            Dict[str, any]: Default configuration options.
        """
        return {
            "extract_images": False,
            "infer_table_structure": True,
            "chunking_strategy": "by_title",
            "max_characters": 10000,
            "combine_text_under_n_chars": 100,
            "strategy": "fast",
            "model_name": "yolox"
        }

    def extract_pdf_elements(self) -> List:
        """
        Extracts images, tables, and text chunks from a PDF file.

        Returns:
            List: A list of extracted elements from the PDF.
        """
        logger.info("Starting extraction of PDF elements.")
        try:
            extracted_elements = partition_pdf(
                filename=self.file_path,
                extract_images_in_pdf=False,
                infer_table_structure=True,
                chunking_strategy="by_title",
                strategy="fast",
                max_characters=10000,
                combine_text_under_n_chars=100,
                image_output_dir_path=self.folder_path,
            )
            logger.info("Extraction of PDF elements completed successfully.")
            return extracted_elements
        except Exception as e:
            logger.error(f"Error extracting PDF elements: {e}", exc_info=True)
            raise

    def categorize_elements(self, raw_pdf_elements: List) -> None:
        """
        Categorizes extracted elements from a PDF into tables and texts.

        Args:
            raw_pdf_elements (List): A list of elements extracted from the PDF.
        """
        logger.debug("Starting categorization of PDF elements.")
        for element in raw_pdf_elements:
            element_type = str(type(element))
            if "unstructured.documents.elements.Table" in element_type:
                self.tables.append(str(element))
            elif "unstructured.documents.elements.CompositeElement" in element_type:
                self.texts.append(str(element))

        logger.debug("Categorization of PDF elements completed.")

    def merge_chunks(self) -> List[str]:
        """
        Merges text chunks based on punctuation and character case criteria.

        Returns:
            List[str]: A list of merged text chunks.
        """
        logger.debug("Starting merging of text chunks.")

        merged_chunks = []
        skip_next = False

        for i, current_chunk in enumerate(self.texts[:-1]):
            if skip_next:
                # The previous iteration already merged this chunk; skip it.
                skip_next = False
                continue

            next_chunk = self.texts[i + 1]

            if self.should_skip_chunk(current_chunk):
                continue

            if self.should_merge_with_next(current_chunk, next_chunk):
                merged_chunks.append(current_chunk + " " + next_chunk)
                skip_next = True
            else:
                merged_chunks.append(current_chunk)

        if self.texts and not skip_next:
            merged_chunks.append(self.texts[-1])

        logger.debug("Merging of text chunks completed.")

        return merged_chunks

    @staticmethod
    def should_skip_chunk(chunk: str) -> bool:
        """
        Determines if a chunk should be skipped based on its content.

        Args:
            chunk (str): The text chunk to be evaluated.

        Returns:
            bool: True if the chunk should be skipped, False otherwise.
        """
        return (not chunk or
                chunk.lower().startswith(("figure", "fig", "table")) or
                not chunk[0].isalnum() or
                re.match(r'^\d+\.', chunk))

    @staticmethod
    def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
        """
        Determines if the current chunk should be merged with the next one.

        Args:
            current_chunk (str): The current text chunk.
            next_chunk (str): The next text chunk.

        Returns:
            bool: True if the chunks should be merged, False otherwise.
        """
        return (current_chunk.endswith(",") or
                (current_chunk[-1].islower() and next_chunk[0].islower()))

    def extract_title_from_pdf(self, uploaded_file):
        """
        Extracts the title from a PDF file's metadata.

        This function reads the metadata of a PDF file using pypdf and attempts to
        extract the title. If the title is present in the metadata, it is returned.
        Otherwise, a default message indicating that the title was not found is returned.

        Parameters:
            uploaded_file (file): A file object or a path to the PDF file from which
                                  to extract the title. The file must be opened in binary mode.

        Returns:
            str: The title of the PDF file as a string. If no title is found, returns
                 'Title not found'.
        """
        # Initialize PDF reader
        pdf_reader = PdfReader(uploaded_file)

        # Extract document information
        meta = pdf_reader.metadata

        # Retrieve title from document information
        title = meta.title if meta and meta.title else 'Title not found'
        return title

    def process_pdf(self) -> Tuple[List[str], List[str]]:
        """
        Processes the PDF by extracting, categorizing, and merging elements.

        Returns:
            Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
        """
        logger.info("Starting processing of the PDF.")
        try:
            raw_pdf_elements = self.extract_pdf_elements()
            self.categorize_elements(raw_pdf_elements)
            merged_chunks = self.merge_chunks()
            return merged_chunks, self.tables
        except Exception as e:
            logger.error(f"Error processing PDF: {e}", exc_info=True)
            raise

    def process_pdf_file(self, uploaded_file):
        """
        Process an uploaded PDF file.

        If a new file is uploaded, the previously stored file is deleted.
        The method updates the file path, processes the PDF, and returns the results.

        Parameters:
            uploaded_file: The new PDF file uploaded for processing.

        Returns:
            The results of processing the PDF file: merged text chunks, tables, and the title.
        """
        # Delete the previous file if it exists
        if self.file_path and os.path.exists(self.file_path):
            try:
                os.remove(self.file_path)
                logging.debug(f"Previous file {self.file_path} deleted.")
            except Exception as e:
                logging.warning(f"Error deleting previous file: {e}", exc_info=True)

        # Process the new file
        self.file_path = str(uploaded_file)
        self.folder_path = os.path.dirname(self.file_path)
        logging.info(f"Starting to process the PDF file: {self.file_path}")

        try:
            logging.debug(f"Processing PDF at {self.file_path}")
            results = self.process_pdf()
            title = self.extract_title_from_pdf(self.file_path)
            logging.info("PDF processing completed successfully.")
            return (*results, title)
        except Exception as e:
            logging.error(f"Error processing PDF file: {e}", exc_info=True)
            raise


class HybridRetriever(BaseRetriever):
    """
    A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
    Inherits from BaseRetriever.

    This class uses two different retrieval methods and merges their results to provide a
    comprehensive set of documents in response to a query. It ensures diversity in the
    retrieved documents by leveraging the strengths of both retrieval methods.

    Attributes:
        vector_retriever: An instance of a vector-based retriever.
        bm25_retriever: An instance of a BM25 retriever.

    Methods:
        __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
        _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
        _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        super().__init__()
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        logger.info("HybridRetriever initialized with vector and BM25 retrievers.")

    def _retrieve(self, query: str, **kwargs) -> List:
        """
        Retrieves and combines results from both vector and BM25 retrievers.

        Args:
            query: The query string for document retrieval.
            **kwargs: Additional keyword arguments for retrieval.

        Returns:
            List: Combined list of unique nodes retrieved from both methods.
        """
        logger.info(f"Retrieving documents for query: {query}")
        try:
            bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
            vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
            combined_nodes = self._combine_results(bm25_nodes, vector_nodes)

            logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
            return combined_nodes
        except Exception as e:
            logger.error(f"Error in retrieval: {e}")
            raise

    @staticmethod
    def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
        """
        Combines and de-duplicates results from BM25 and vector retrievers.

        Args:
            bm25_nodes: Nodes retrieved from BM25 retriever.
            vector_nodes: Nodes retrieved from vector retriever.

        Returns:
            List: Combined list of unique nodes.
        """
        node_ids: Set = set()
        combined_nodes = []

        for node in bm25_nodes + vector_nodes:
            if node.node_id not in node_ids:
                combined_nodes.append(node)
                node_ids.add(node.node_id)

        return combined_nodes


class PDFQueryEngine:
    """
    A class to handle the process of setting up a query engine and performing queries on PDF documents.

    This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
    indexes, hybrid retrievers, response synthesizers, and executing queries on the set up engine.

    Attributes:
        documents (List): A list of documents to be indexed.
        llm (Language Model): The language model to be used for embeddings and queries.
        embed_model: The embedding model used for indexing and retrieval.
        qa_prompt_tmpl (str): Template for creating query prompts.

    Methods:
        setup_query_engine(): Sets up the query engine with all necessary components.
        evaluate_with_llm(): Executes the predefined queries and accumulates the results.
    """

    def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
        self.documents = documents
        self.llm = llm
        self.embed_model = embed_model
        self.qa_prompt_tmpl = qa_prompt_tmpl
        self.base_utils = base_utils()
        self.config_manager = ConfigManager()

        logger.info("PDFQueryEngine initialized.")

    def format_example(self, example):
        """
        Formats a few-shot example into a string.

        Args:
            example (dict): A dictionary containing 'query', 'score', and 'reasoning' for the few-shot example.

        Returns:
            str: Formatted few-shot example text.
        """
        return "Example:\nQuery: {}\nScore: {}\nReasoning: {}\n".format(
            example['query'], example['score'], example['reasoning']
        )

    def setup_query_engine(self):
        """
        Sets up the query engine by initializing and configuring the embedding model, service context, index,
        hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.

        Returns:
            Any: The configured query engine.
        """
        client = qdrant_client.QdrantClient(
            # :memory: mode is useful for fast and light-weight experiments;
            # it does not require Qdrant to be deployed anywhere,
            # but it requires qdrant-client >= 1.1.1
            location=":memory:"
            # otherwise set the Qdrant instance address with:
            # uri="http://<host>:<port>"
            # and set the API key for Qdrant Cloud:
            # api_key="<qdrant-api-key>",
        )
        try:
            logger.info("Initializing the service context for query engine setup.")
            service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
            vector_store = QdrantVectorStore(client=client, collection_name="med_library")
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            logger.info("Creating an index from documents.")
            index = VectorStoreIndex.from_documents(documents=self.documents, storage_context=storage_context, service_context=service_context)
            nodes = service_context.node_parser.get_nodes_from_documents(self.documents)

            logger.info("Setting up vector and BM25 retrievers.")
            vector_retriever = index.as_retriever(similarity_top_k=3)
            bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)
            hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

            logger.info("Configuring the response synthesizer with the prompt template.")
            qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
            response_synthesizer = get_response_synthesizer(
                service_context=service_context,
                text_qa_template=qa_prompt,
                response_mode="compact",
            )

            logger.info("Assembling the query engine with reranker and synthesizer.")
            reranker = SentenceTransformerRerank(top_n=3, model="BAAI/bge-reranker-base")
            query_engine = RetrieverQueryEngine.from_args(
                retriever=hybrid_retriever,
                node_postprocessors=[reranker],
                response_synthesizer=response_synthesizer,
            )

            logger.info("Query engine setup complete.")
            return query_engine
        except Exception as e:
            logger.error(f"Error during query engine setup: {e}")
            raise

    def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any, queries: List[str]) -> Tuple[int, int, float, List[str]]:
        """
        Evaluate documents using a language model based on various criteria.

        Args:
            reg_result (Any): Result related to registration.
            peer_result (Any): Result related to peer review.
            guidelines_result (Any): Result related to following guidelines.
            queries (List[str]): A list of queries to be processed.

        Returns:
            Tuple[int, int, float, List[str]]: The total score, the number of criteria met,
            the score as a percentage, and the reasoning for each query.
        """
        logger.info("Starting evaluation with LLM.")
        self.config_manager.load_config("few_shot", "few_shot.json")
        query_engine = self.setup_query_engine()

        total_score = 0
        criteria_met = 0
        reasoning = []

        for j, query in enumerate(queries):
            # Handle special cases based on the value of j and other conditions
            if j == 1 and reg_result:
                extracted_data = {"score": 1, "reasoning": reg_result[0]}
            elif j == 2 and guidelines_result:
                extracted_data = {"score": 1, "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
            elif j == 8 and (guidelines_result or peer_result):
                extracted_data = {"score": 1, "reasoning": "The article is published in a peer-reviewed journal."}
            else:
                # Execute the query
                result = query_engine.query(query).response
                extracted_data = self.base_utils.extract_score_reasoning(result)

            # Validate and accumulate the scores
            extracted_data_score = 0 if extracted_data.get("score") is None else int(extracted_data.get("score"))
            if extracted_data_score > 0:
                criteria_met += 1
            reasoning.append(extracted_data["reasoning"])
            total_score += extracted_data_score

        score_percentage = (float(total_score) / len(queries)) * 100
        logger.info("Evaluation completed.")
        return total_score, criteria_met, score_percentage, reasoning


class MixtralLLM(CustomLLM):
    """
    A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.

    Attributes:
        context_window (int): Number of tokens used for context during inference.
        num_output (int): Number of tokens to generate as output.
        temperature (float): Sampling temperature for token generation.
        model_name (str): Name of the model on Hugging Face's model hub.
        api_key (str): API key for authenticating with the Hugging Face API.

    Methods:
        metadata: Retrieves metadata about the model.
        do_hf_call: Makes an API call to the Hugging Face model.
        complete: Generates a complete response for a given prompt.
        stream_complete: Streams a series of token completions for a given prompt.
    """
    context_window: int = Field(..., description="Number of tokens used for context during inference.")
    num_output: int = Field(..., description="Number of tokens to generate as output.")
    temperature: float = Field(..., description="Sampling temperature for token generation.")
    model_name: str = Field(..., description="Name of the model on Hugging Face's model hub.")
    api_key: str = Field(..., description="API key for authenticating with the Hugging Face API.")

    @property
    def metadata(self) -> LLMMetadata:
        """
        Retrieves metadata for the Mixtral LLM.

        Returns:
            LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
        """
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def do_hf_call(self, prompt: str) -> str:
        """
        Makes an API call to the Hugging Face model and retrieves the generated response.

        Args:
            prompt (str): The input prompt for the model.

        Returns:
            str: The text generated by the model in response to the prompt.

        Raises:
            Exception: If the API call fails or returns an error.
        """
        data = {
            "inputs": prompt,
            "parameters": {"temperature": self.temperature}
        }

        # Makes a POST request to the Hugging Face API to get the model's response
        response = requests.post(
            f'https://api-inference.huggingface.co/models/{self.model_name}',
            headers={
                'authorization': f'Bearer {self.api_key}',
                'content-type': 'application/json',
            },
            json=data,
            stream=True
        )

        # Checks for a successful response and parses the generated text
        if response.status_code != 200 or not response.json() or 'error' in response.json():
            logger.error(f"Error from Hugging Face API: {response}")
            return "Unable to answer for technical reasons."
        full_txt = response.json()[0]['generated_text']
        # Finds the section of the text following the context separator
        offset = full_txt.find("---------------------")
        ss = full_txt[offset:]
        # Extracts the actual answer from the response
        offset = ss.find("Answer:")
        return ss[offset + 7:].strip()

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """
        Generates a complete response for a given prompt using the Hugging Face API.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments for the completion.

        Returns:
            CompletionResponse: The complete response from the model.
        """
        response = self.do_hf_call(prompt)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """
        Streams a series of token completions as a response for the given prompt.

        This method is useful for streaming responses where each token is generated sequentially.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments for the streaming completion.

        Yields:
            CompletionResponseGen: A generator yielding each token in the completion response.
        """
        # Yields a stream of tokens as the completion response for the given prompt
        response = ""
        for token in self.do_hf_call(prompt):
            response += token
            yield CompletionResponse(text=response, delta=token)


class KeywordSearch():
    def __init__(self, chunks):
        self.chunks = chunks

    def find_journal_name(self, response: str, journal_list: list) -> bool:
        """
        Searches for a journal name in a given response string.

        This function iterates through a list of known journal names and checks whether any
        of them is present in the response string.

        Args:
            response (str): The response string to search for a journal name.
            journal_list (list): A list of journal names to search within the response.

        Returns:
            bool: True if any journal name from the list is found in the response, False otherwise.
        """
        response_lower = response.lower()
        for journal in journal_list:
            journal_lower = journal.lower()

            if journal_lower in response_lower:
                return True

        return False

    def check_registration(self):
        """
        Check chunks of text for various registration numbers or URLs of registries.
        Returns the sentence containing a registration number, or, if none is found,
        the chunks containing registry URLs.

        Returns:
            list of str: List of matching sentences or chunks, or an empty list if no matches are found.
        """
        # Patterns for different registration types
        patterns = {
            "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
            "ISRCTN": r"(ISRCTN\d{8})",
            "EudraCT": r"(\d{4}-\d{6}-\d{2})",
            "UMIN-CTR": r"(UMIN\d{9})",
            "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
        }

        # Registry URLs
        registry_urls = [
            "www.anzctr.org.au",
            "anzctr.org.au",
            "www.clinicaltrials.gov",
            "clinicaltrials.gov",
            "www.ISRCTN.org",
            "ISRCTN.org",
            "www.umin.ac.jp/ctr/index/htm",
            "umin.ac.jp/ctr/index/htm",
            "www.onderzoekmetmensen.nl/en",
            "onderzoekmetmensen.nl/en",
            "eudract.ema.europa.eu",
            "www.eudract.ema.europa.eu"
        ]

        # Check each chunk for registration numbers
        for chunk in self.chunks:
            # Split chunk into sentences
            sentences = re.split(r'(?<=[.!?]) +', chunk)

            # Check each sentence for any registration number
            for sentence in sentences:
                for pattern in patterns.values():
                    if re.search(pattern, sentence):
                        return [sentence]  # Return immediately if a registration number is found

        # If no registration number found, check for URLs in chunks
        matching_chunks = []
        for chunk in self.chunks:
            if any(url in chunk for url in registry_urls):
                matching_chunks.append(chunk)

        return matching_chunks


class StringExtraction():
    """
    A class to handle the extraction of query strings from complete LLM responses.

    This class encapsulates extracting the original ground truth from a labelled-data CSV and
    query strings from responses. Note that LLMs may format answers differently depending on
    the model or the prompting technique; in such cases extract_original_prompt may not give
    satisfactory results, and writing a custom string-extraction method is the best option.

    Methods:
        extract_original_prompt(result): Splits a response into its binary answer and its reasoning.
        extraction_ground_truth(paper_name, labelled_data): Recovers binary and explanation ground truths for a paper.
    """

    def extract_original_prompt(self, result):
        r1 = result.response.strip().split("\n")
        binary_response = ""
        explanation_response = ""
        for r in r1:
            if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
                binary_response = r
            elif r.find("Reasoning:") >= 0:
                cut = r.find(":")
                explanation_response += r[cut + 1:].strip()

        return binary_response, explanation_response

    def extraction_ground_truth(self, paper_name, labelled_data):
        id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
        id_row = labelled_data[labelled_data["id"] == id]
        ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
        binary_ground_truth = []
        explanation_ground_truth = []
        for g in ground_truth:
            if len(g) > 0:
                binary_ground_truth.append("Yes")
                explanation_ground_truth.append(g)
            else:
                binary_ground_truth.append("No")
                explanation_ground_truth.append("The article does not provide any relevant information.")
        return binary_ground_truth, explanation_ground_truth


class EvaluationMetrics():
    """
    This class encapsulates the evaluation methods that have been used in the project.

    Attributes:
        explanation_response: a list of detailed responses from the LLM model corresponding to each query
        explanation_ground_truth: the list of ground truths corresponding to each query

    Methods:
        metric_cosine_similarity(): Cosine similarity between response and ground-truth embeddings.
        metric_rouge(): ROUGE scores of responses against ground truths.
        binary_accuracy(): Exact-match accuracy of binary responses against binary ground truths.
    """

    def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
        self.explanation_response = explanation_response
        self.explanation_ground_truth = explanation_ground_truth
        self.embedding_model = embedding_model

    def metric_cosine_similarity(self):
        ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
        explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
        return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))

    def metric_rouge(self):
        rouge = evaluate.load("rouge")
        results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
        return results

    def binary_accuracy(self, binary_response, binary_ground_truth):
        count = 0
        if len(binary_response) != len(binary_ground_truth):
            return "The arrays to be compared have different lengths."
        else:
            for i in range(len(binary_response)):
                if binary_response[i] == binary_ground_truth[i]:
                    count += 1
            return np.round(count / len(binary_response), 2)
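
For orientation, a minimal end-to-end sketch of how the classes above fit together (not part of the commit; the real driver is app_huggingface.py below). Assumptions: the dependencies from requirements.txt are installed, OPENAI_API_KEY is set, "paper.pdf" and the prompt file exist, and a few_shot.json sits next to the module, since evaluate_with_llm loads it:

# Sketch: running the RAG pipeline from this module on one PDF.
from llama_index import Document
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import OpenAI
from RAG_utils_huggingface import PDFProcessor_Unstructured, PDFQueryEngine, KeywordSearch, base_utils

processor = PDFProcessor_Unstructured(PDFProcessor_Unstructured.default_config())
chunks, tables, title = processor.process_pdf_file("paper.pdf")  # sample filename is an assumption

documents = [Document(text=t) for t in chunks + tables]
llm = OpenAI(model="gpt-4", temperature=0.2, max_tokens=512)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
qa_prompt_tmpl = base_utils.read_from_file("data/prompts/prompt_gpt.txt")

engine = PDFQueryEngine(documents, llm, embed_model, qa_prompt_tmpl)
reg_result = KeywordSearch(chunks).check_registration()
queries = ["Does the article share any data or code?"]  # normally the nine queries from model_config.json
total, met, pct, reasoning = engine.evaluate_with_llm(reg_result, None, None, queries)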
librarymed/huggingface/app_huggingface.py
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging
import os

import gradio as gr
import openai
from fpdf import FPDF
from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.llms import OpenAI

from RAG_utils_huggingface import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils, \
    ConfigManager

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a logger object
logger = logging.getLogger(__name__)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

config_manager = ConfigManager()
# config_manager.load_config("api", "Config/api_config.json")
config_manager.load_config("model", "model_config.json")

openai.api_key = os.environ['OPENAI_API_KEY']  # config_manager.get_config_value("api", "OPENAI_API_KEY")
hf_token = os.environ['HF_TOKEN']  # config_manager.get_config_value("api", "HF_TOKEN")

# PDF rendering and chunking parameters
pdf_processing_config = config_manager.get_config_value("model", "pdf_processing")

ALLOWED_EXTENSIONS = config_manager.get_config_value("model", "allowed_extensions")
embed = config_manager.get_config_value("model", "embeddings")
embed_model_name = config_manager.get_config_value("model", "embeddings_model")

# llm_model = config_manager.get_config_value("model", "llm_model")
model_temperature = config_manager.get_config_value("model", "model_temp")
output_token_size = config_manager.get_config_value("model", "max_tokens")
model_context_window = config_manager.get_config_value("model", "context_window")

gpt_prompt_path = config_manager.get_config_value("model", "GPT_PROMPT_PATH")
mistral_prompt_path = config_manager.get_config_value("model", "MISTRAL_PROMPT_PATH")
info_prompt_path = config_manager.get_config_value("model", "INFO_PROMPT_PATH")

peer_review_journals_path = config_manager.get_config_value("model", "peer_review_journals_path")
eq_network_journals_path = config_manager.get_config_value("model", "eq_network_journals_path")

queries = config_manager.get_config_value("model", "queries")
criteria = config_manager.get_config_value("model", "criteria")
num_criteria = len(queries)

author_query = config_manager.get_config_value("model", "author_query")
journal_query = config_manager.get_config_value("model", "journal_query")
# Helper function to check if the file extension is allowed
def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def generate_score_bar(score, num_criteria):
    # Convert and round the score from the n-criteria scale to a 100-point scale
    score_out_of_100 = round((score / num_criteria) * 100)

    # Determine the color and text based on the original score
    if score == 9:
        color = "#4CAF50"  # green
        text = "Very good"
    elif score in [7, 8]:
        color = "#FFEB3B"  # yellow
        text = "Good"
    elif score in [5, 6]:
        color = "#FF9800"  # orange
        text = "Ok"
    elif score in [3, 4]:
        color = "#F44336"  # red
        text = "Bad"
    else:  # score < 3
        color = "#800000"  # maroon
        text = "Very bad"

    # Create the HTML for the score bar
    score_bar_html = f"""
    <div style="background-color: #ddd; border-radius: 10px; position: relative; height: 20px; width: 100%;">
        <div style="background-color: {color}; height: 100%; border-radius: 10px; width: {score_out_of_100}%;"></div>
    </div>
    <p style="color: {color};">{text}</p> <!-- Display the text -->
    """
    return score_bar_html
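A quick sanity check of generate_score_bar (an illustrative sketch, not part of the committed file): with the default nine criteria, a score of 7 maps to a 78% yellow bar labelled "Good".

# Illustrative sketch, not part of app_huggingface.py.
html = generate_score_bar(7, 9)
assert 'width: 78%' in html and 'Good' in html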
class PDF(FPDF):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Load the DejaVu font files
        self.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
        self.add_font('DejaVu', 'B', 'DejaVuSansCondensed-Bold.ttf', uni=True)
        self.add_font('DejaVu', 'I', 'DejaVuSansCondensed-Oblique.ttf', uni=True)

    def header(self):
        self.set_font('DejaVu', 'B', 12)
        self.cell(0, 10, 'Paper Analysis Report', 0, 1, 'C')

    def footer(self):
        self.set_y(-15)
        self.set_font('DejaVu', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')


def create_pdf_report(title, author_info, score, criteria, reasoning_list, output_path):
    pdf = PDF()
    pdf.add_page()

    # Set margins
    pdf.set_left_margin(10)
    pdf.set_right_margin(10)

    # Title
    pdf.set_font("DejaVu", 'B', 14)
    pdf.cell(0, 10, "Title:", 0, 1)
    pdf.set_font("DejaVu", '', 12)
    pdf.multi_cell(0, 10, title, 0, 1)

    # Author Information
    pdf.set_font("DejaVu", 'B', 14)
    pdf.cell(0, 10, "Author Information:", 0, 1)
    pdf.set_font("DejaVu", '', 12)
    pdf.multi_cell(0, 10, author_info, 0, 1)

    # Score
    pdf.set_font("DejaVu", 'B', 14)
    pdf.cell(0, 10, "Score:", 0, 1)
    pdf.set_font("DejaVu", '', 12)
    pdf.multi_cell(0, 10, score, 0, 1)

    # Reasoning - each criterion gets a bold green heading followed by its reasoning
    for heading, reasoning in zip(criteria, reasoning_list):
        pdf.set_font("DejaVu", 'B', 14)
        pdf.set_text_color(0, 128, 0)  # Green color
        pdf.multi_cell(0, 10, heading, 0, 1)
        pdf.set_text_color(0, 0, 0)  # Reset to black color
        pdf.set_font("DejaVu", '', 12)
        pdf.multi_cell(0, 10, reasoning, 0, 1)

    # Save the PDF to the specified output path
    pdf.output(output_path)

    return output_path  # Return the path to the generated report
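A minimal smoke test for the PDF class and create_pdf_report (illustrative only; every value below is a dummy, and the three DejaVu .ttf files must sit in the working directory):

# Illustrative sketch, not part of app_huggingface.py; all inputs are made up.
create_pdf_report(
    title="A made-up paper title",
    author_info="J. Doe et al., Example University",
    score="78/100",
    criteria=["Data and code sharing."],
    reasoning_list=["The authors link a public repository."],
    output_path="/tmp/sample_report.pdf",
)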
def check_title_for_review(uploaded_files):
    title_message = "All articles are valid for review."
    if not uploaded_files:
        title_message = "No files uploaded or upload canceled."
    else:
        for uploaded_file in uploaded_files:
            pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
            title = pdf_processor.extract_title_from_pdf(uploaded_file)
            if 'review' in title.lower():
                title_message = "One or more files are review papers. Hence the evaluation may not be accurate."

    return title_message
def process_pdf(uploaded_files, llm_model, n_criteria=num_criteria):
    # Initialize aggregation variables
    final_score = 0
    final_reasoning = []
    final_score_bar_html = ""
    final_author_info_html = ""
    final_title_info_html = ""
    output_files = []
    for i, uploaded_file in enumerate(uploaded_files):
        # Process the PDF file
        file_name_without_extension = os.path.splitext(os.path.basename(uploaded_file))[0]

        pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
        merged_chunks, tables, title = pdf_processor.process_pdf_file(uploaded_file)
        documents = [Document(text=t) for t in merged_chunks]

        # Prompts and Queries
        utils = base_utils()

        info_prompt = utils.read_from_file(info_prompt_path)

        # LLM Model choice
        try:
            if llm_model == "Model 1":
                llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=output_token_size)
                general_prompt = utils.read_from_file(gpt_prompt_path)

            elif llm_model == "Model 2":
                if any(param is None for param in
                       [model_context_window, output_token_size, model_temperature, hf_token]):
                    raise ValueError("All parameters are required for Mistral LLM.")

                llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
                                 temperature=model_temperature, model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
                                 api_key=hf_token)
                general_prompt = utils.read_from_file(mistral_prompt_path)
            else:
                raise ValueError(f"Unsupported language model: {llm_model}")

        except Exception as e:
            logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
            raise  # Or handle the exception as needed

        # Embedding model choice for RAG
        try:
            if embed == "openai":
                embed_model = OpenAIEmbedding(model="text-embedding-3-large")

            elif embed == "huggingface":
                # Use the specified model name
                embed_model = HuggingFaceEmbedding(embed_model_name)

            else:
                raise ValueError(f"Unsupported embedding model: {embed}")

        except Exception as e:
            logger.error(f"Error initializing embedding model: {e}", exc_info=True)
            raise

        peer_review_journals = utils.read_from_file(peer_review_journals_path)
        eq_network_journals = utils.read_from_file(eq_network_journals_path)

        peer_review_journals_list = peer_review_journals.split('\n')
        eq_network_journals_list = eq_network_journals.split('\n')

        modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
            peer_review_journals_list) + "?"

        info_llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=100)
        pdf_info_query = PDFQueryEngine(documents, info_llm, embed_model, info_prompt)
        info_query_engine = pdf_info_query.setup_query_engine()
        journal_result = info_query_engine.query(modified_journal_query).response
        author_result = info_query_engine.query(author_query).response

        pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, general_prompt)

        # Check for prior registration and journal matches
        nlp_methods = KeywordSearch(merged_chunks)
        eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
        peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
        registration_result = nlp_methods.check_registration()

        # Evaluate the criteria with the chosen LLM
        total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
            registration_result, peer_journal_result, eq_journal_result, queries)

        # Convert reasoning list to plain text
        # reasoning_text = "\n".join([f"{idx + 1}. {reason}" for idx, reason in enumerate(reasoning)])

        # Generate the score bar HTML
        score_bar_html = generate_score_bar(total_score, n_criteria)
        scaled_total_score = str(round((total_score / n_criteria) * 100)) + "/100"

        output_dir = "/tmp"
        output_path = os.path.join(output_dir, f"{file_name_without_extension}_report.pdf")

        create_pdf_report(title, author_result, scaled_total_score, criteria, reasoning, output_path)
        output_files.append(output_path)

    # Construct the processing message
    processing_message = f"Processing complete. {len(uploaded_files)} reports generated. Please download your reports below."

    return processing_message, output_files
    # Return the score as a string and the reasoning as HTML
    # return str(round((total_score / n_criteria) * 100)) + "/100", score_bar_html, reasoning_html, author_info_html, title_info_html
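An end-to-end sketch of process_pdf (illustrative; the path is made up, and OPENAI_API_KEY plus, for Model 2, HF_TOKEN must be set):

# Illustrative sketch, not part of app_huggingface.py; the path is hypothetical.
message, reports = process_pdf(["/tmp/Id_1.pdf"], "Model 1")
print(message)   # "Processing complete. 1 reports generated. ..."
print(reports)   # ["/tmp/Id_1_report.pdf"]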
with gr.Blocks(theme=gr.themes.Glass(
        text_size="sm",
        font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"],
        primary_hue="neutral",
        secondary_hue="gray")) as demo:
    gr.Markdown("## Med Library")
    with gr.Row():
        file_upload = gr.File(label="Choose papers", file_types=['.pdf'], file_count="multiple")

    title_check_output = gr.Textbox(label="Warnings", interactive=False)
    file_upload.change(fn=check_title_for_review, inputs=file_upload, outputs=title_check_output)

    with gr.Row():
        model_choice = gr.Dropdown(["Model 1", "Model 2"], label="Choose a model", value="Model 1")
        submit_button = gr.Button("Evaluate")

    processing_message_output = gr.Textbox(label="Processing Status", interactive=False)
    report_download_links = gr.File(label="Download Reports", type="filepath", file_count="multiple")

    submit_button.click(
        fn=process_pdf,
        inputs=[file_upload, model_choice],
        outputs=[processing_message_output, report_download_links]
    )


demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
librarymed/kromin/RAG_utils.py
ADDED
@@ -0,0 +1,983 @@
import json
import logging
import os
import re
import time
from tempfile import NamedTemporaryFile
from typing import Any, List, Tuple, Set, Dict, Optional, Union

import evaluate
import numpy as np
import pandas as pd
import requests
from llama_index import PromptTemplate
from llama_index import VectorStoreIndex, ServiceContext
from llama_index import get_response_synthesizer
from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback
from llama_index.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.retrievers import BaseRetriever, BM25Retriever
from sklearn.metrics.pairwise import cosine_similarity
from unstructured.partition.pdf import partition_pdf
from pypdf import PdfReader


# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a logger object
logger = logging.getLogger(__name__)


class ConfigManager:
    """
    A class to manage loading and accessing named sets of configuration settings.

    Attributes:
        configs (dict): Dictionary mapping a configuration name to its settings.

    Methods:
        load_config(config_name: str, config_path: str): Loads a configuration from a JSON file.
        get_config_value(config_name: str, key: str): Retrieves a specific configuration value.
    """

    def __init__(self):
        self.configs = {}

    def load_config(self, config_name: str, config_path: str) -> None:
        """
        Loads configuration settings from a specified JSON file into a named configuration.

        Args:
            config_name (str): The name to assign to this set of configurations.
            config_path (str): The path to the configuration file.

        Raises:
            FileNotFoundError: If the config file is not found.
            json.JSONDecodeError: If there is an error parsing the config file.
        """
        try:
            with open(config_path, 'r') as f:
                self.configs[config_name] = json.load(f)
        except FileNotFoundError:
            logging.error(f"Config file not found at {config_path}")
            raise
        except json.JSONDecodeError as e:
            logging.error(f"Error decoding config file: {e}")
            raise

    def get_config_value(self, config_name: str, key: str) -> str:
        """
        Retrieves a specific configuration value.

        Args:
            config_name (str): The name of the configuration set to read from.
            key (str): The key for the configuration setting.

        Returns:
            str: The value of the configuration setting.

        Raises:
            ValueError: If the key is not found or is set to a placeholder value.
        """
        value = self.configs.get(config_name, {}).get(key)
        if value is None or value == "ENTER_YOUR_TOKEN_HERE":
            raise ValueError(f"Please set your '{key}' in the config.json file.")
        return value
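A short usage sketch for ConfigManager (illustrative, not part of the committed file):

# Illustrative sketch, not part of RAG_utils.py.
cfg = ConfigManager()
cfg.load_config("model", "Config/model_config.json")
model_temperature = cfg.get_config_value("model", "model_temp")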
class base_utils:
    """
    A utility class providing miscellaneous static methods for processing and analyzing text data,
    particularly from PDF documents and filenames. This class also includes methods for file operations.

    This class encapsulates the functionality of extracting key information from text, such as scores,
    reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
    and reading content from files.

    Attributes:
        None (This class contains only static methods and does not maintain any state)

    Methods:
        extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
            Extracts a score and reasoning from a given text using regular expressions.

        extract_id_from_filename(filename: str) -> Optional[int]:
            Extracts an ID from a given filename based on a specified pattern.

        find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
            Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.

        read_from_file(file_path: str) -> str:
            Reads the content of a file and returns it as a string.
    """

    @staticmethod
    def read_from_file(file_path: str) -> str:
        """
        Reads the content of a file and returns it as a string.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The content of the file.
        """
        with open(file_path, 'r') as prompt_file:
            prompt = prompt_file.read()
        return prompt

    @staticmethod
    def extract_id_from_filename(filename: str) -> Optional[int]:
        """
        Extracts an ID from a filename, assuming a specific format ('Id_{I}.pdf', where {I} is the ID).

        Args:
            filename (str): The filename from which to extract the ID.

        Returns:
            int: The extracted ID as an integer, or None if the pattern is not found.
        """
        # Assuming the file name is in the format 'Id_{I}.pdf', where {I} is the ID
        match = re.search(r'Id_(\d+)\.pdf', filename)
        if match:
            return int(match.group(1))  # Convert to integer if ID is numeric
        else:
            return None

    @staticmethod
    def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
        """
        Extracts score and reasoning from a given text using regular expressions.

        Args:
            text (str): The text from which to extract the score and reasoning.

        Returns:
            dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
        """
        # Define regular expression patterns for score and reasoning
        score_pattern = r"Score: (\d+)"
        reasoning_pattern = r"Reasoning: (.+)"

        # Extract data using regular expressions
        score_match = re.search(score_pattern, text)
        reasoning_match = re.search(reasoning_pattern, text, re.DOTALL)  # re.DOTALL allows '.' to match newlines

        # Extract and return the results
        extracted_data = {
            "score": score_match.group(1) if score_match else None,
            "reasoning": reasoning_match.group(1).strip() if reasoning_match else None
        }

        return extracted_data

    @staticmethod
    def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
        """
        Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.

        Args:
            pdf_filename (str): The filename of the PDF.
            dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.

        Returns:
            pandas.Series or str: The matched row from the dataframe or a message indicating
                                  that no matching row or invalid filename was found.
        """
        pdf_id = base_utils.extract_id_from_filename(pdf_filename)
        if pdf_id is not None:
            # Assuming the first column contains the ID
            matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
            if not matched_row.empty:
                return matched_row
            else:
                return "No matching row found."
        else:
            return "Invalid file name."
+
class PDFProcessor_Unstructured:
|
206 |
+
"""
|
207 |
+
A class to process PDF files, providing functionalities for extracting, categorizing,
|
208 |
+
and merging elements from a PDF file.
|
209 |
+
|
210 |
+
This class is designed to handle unstructured PDF documents, particularly useful for
|
211 |
+
tasks involving text extraction, categorization, and data processing within PDFs.
|
212 |
+
|
213 |
+
Attributes:
|
214 |
+
file_path (str): The full path to the PDF file.
|
215 |
+
folder_path (str): The directory path where the PDF file is located.
|
216 |
+
file_name (str): The name of the PDF file.
|
217 |
+
texts (List[str]): A list to store extracted text chunks.
|
218 |
+
tables (List[str]): A list to store extracted tables.
|
219 |
+
|
220 |
+
|
221 |
+
Methods:
|
222 |
+
extract_pdf_elements() -> List:
|
223 |
+
Extracts images, tables, and text chunks from a PDF file.
|
224 |
+
|
225 |
+
categorize_elements(raw_pdf_elements: List) -> None:
|
226 |
+
Categorizes extracted elements from a PDF into tables and texts.
|
227 |
+
|
228 |
+
merge_chunks() -> List[str]:
|
229 |
+
Merges text chunks based on punctuation and character case criteria.
|
230 |
+
|
231 |
+
should_skip_chunk(chunk: str) -> bool:
|
232 |
+
Determines if a chunk should be skipped based on its content.
|
233 |
+
|
234 |
+
should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
|
235 |
+
Determines if the current chunk should be merged with the next one.
|
236 |
+
|
237 |
+
process_pdf() -> Tuple[List[str], List[str]]:
|
238 |
+
Processes the PDF by extracting, categorizing, and merging elements.
|
239 |
+
|
240 |
+
process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
|
241 |
+
Processes an uploaded PDF file to extract and categorize text and tables.
|
242 |
+
"""
|
243 |
+
|
244 |
+
def __init__(self, config: Dict[str, any]):
|
245 |
+
self.file_path = None
|
246 |
+
self.folder_path = None
|
247 |
+
self.file_name = None
|
248 |
+
self.texts = []
|
249 |
+
self.tables = []
|
250 |
+
self.config = config if config is not None else self.default_config()
|
251 |
+
logger.info(f"Initialized PdfProcessor_Unstructured for file: {self.file_name}")
|
252 |
+
|
253 |
+
@staticmethod
|
254 |
+
def default_config() -> Dict[str, any]:
|
255 |
+
"""
|
256 |
+
Returns the default configuration for PDF processing.
|
257 |
+
|
258 |
+
Returns:
|
259 |
+
Dict[str, any]: Default configuration options.
|
260 |
+
"""
|
261 |
+
return {
|
262 |
+
"extract_images": False,
|
263 |
+
"infer_table_structure": True,
|
264 |
+
"chunking_strategy": "by_title",
|
265 |
+
"max_characters": 10000,
|
266 |
+
"combine_text_under_n_chars": 100,
|
267 |
+
"strategy": "auto",
|
268 |
+
"model_name": "yolox"
|
269 |
+
}
|
270 |
+
|
271 |
+
def extract_pdf_elements(self) -> List:
|
272 |
+
"""
|
273 |
+
Extracts images, tables, and text chunks from a PDF file.
|
274 |
+
|
275 |
+
Returns:
|
276 |
+
List: A list of extracted elements from the PDF.
|
277 |
+
"""
|
278 |
+
logger.info("Starting extraction of PDF elements.")
|
279 |
+
try:
|
280 |
+
extracted_elements = partition_pdf(
|
281 |
+
filename=self.file_path,
|
282 |
+
extract_images_in_pdf=False,
|
283 |
+
infer_table_structure=True,
|
284 |
+
chunking_strategy="by_title",
|
285 |
+
max_characters=10000,
|
286 |
+
combine_text_under_n_chars=100,
|
287 |
+
image_output_dir_path=self.folder_path,
|
288 |
+
# strategy="fast",
|
289 |
+
)
|
290 |
+
logger.info("Extraction of PDF elements completed successfully.")
|
291 |
+
return extracted_elements
|
292 |
+
except Exception as e:
|
293 |
+
raise NotImplementedError(f"Error extracting PDF elements: {e}")
|
294 |
+
|
295 |
+
def categorize_elements(self, raw_pdf_elements: List) -> None:
|
296 |
+
"""
|
297 |
+
Categorizes extracted elements from a PDF into tables and texts.
|
298 |
+
|
299 |
+
Args:
|
300 |
+
raw_pdf_elements (List): A list of elements extracted from the PDF.
|
301 |
+
"""
|
302 |
+
logger.debug("Starting categorization of PDF elements.")
|
303 |
+
for element in raw_pdf_elements:
|
304 |
+
element_type = str(type(element))
|
305 |
+
if "unstructured.documents.elements.Table" in element_type:
|
306 |
+
self.tables.append(str(element))
|
307 |
+
elif "unstructured.documents.elements.CompositeElement" in element_type:
|
308 |
+
self.texts.append(str(element))
|
309 |
+
|
310 |
+
logger.debug("Categorization of PDF elements completed.")
|
311 |
+
|
312 |
+
def merge_chunks(self) -> List[str]:
|
313 |
+
"""
|
314 |
+
Merges text chunks based on punctuation and character case criteria.
|
315 |
+
|
316 |
+
Returns:
|
317 |
+
List[str]: A list of merged text chunks.
|
318 |
+
"""
|
319 |
+
logger.debug("Starting merging of text chunks.")
|
320 |
+
|
321 |
+
merged_chunks = []
|
322 |
+
skip_next = False
|
323 |
+
|
324 |
+
for i, current_chunk in enumerate(self.texts[:-1]):
|
325 |
+
next_chunk = self.texts[i + 1]
|
326 |
+
|
327 |
+
if self.should_skip_chunk(current_chunk):
|
328 |
+
continue
|
329 |
+
|
330 |
+
if self.should_merge_with_next(current_chunk, next_chunk):
|
331 |
+
merged_chunks.append(current_chunk + " " + next_chunk)
|
332 |
+
skip_next = True
|
333 |
+
else:
|
334 |
+
merged_chunks.append(current_chunk)
|
335 |
+
|
336 |
+
if not skip_next:
|
337 |
+
merged_chunks.append(self.texts[-1])
|
338 |
+
|
339 |
+
logger.debug("Merging of text chunks completed.")
|
340 |
+
|
341 |
+
return merged_chunks
|
342 |
+
|
343 |
+
@staticmethod
|
344 |
+
def should_skip_chunk(chunk: str) -> bool:
|
345 |
+
"""
|
346 |
+
Determines if a chunk should be skipped based on its content.
|
347 |
+
|
348 |
+
Args:
|
349 |
+
chunk (str): The text chunk to be evaluated.
|
350 |
+
|
351 |
+
Returns:
|
352 |
+
bool: True if the chunk should be skipped, False otherwise.
|
353 |
+
"""
|
354 |
+
return (chunk.lower().startswith(("figure", "fig", "table")) or
|
355 |
+
not chunk[0].isalnum() or
|
356 |
+
re.match(r'^\d+\.', chunk))
|
357 |
+
|
358 |
+
@staticmethod
|
359 |
+
def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
|
360 |
+
"""
|
361 |
+
Determines if the current chunk should be merged with the next one.
|
362 |
+
|
363 |
+
Args:
|
364 |
+
current_chunk (str): The current text chunk.
|
365 |
+
next_chunk (str): The next text chunk.
|
366 |
+
|
367 |
+
Returns:
|
368 |
+
bool: True if the chunks should be merged, False otherwise.
|
369 |
+
"""
|
370 |
+
return (current_chunk.endswith(",") or
|
371 |
+
(current_chunk[-1].islower() and next_chunk[0].islower()))
|
372 |
+
|
373 |
+
def process_pdf(self) -> Tuple[List[str], List[str]]:
|
374 |
+
"""
|
375 |
+
Processes the PDF by extracting, categorizing, and merging elements.
|
376 |
+
|
377 |
+
Returns:
|
378 |
+
Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
|
379 |
+
is_research_paper: A boolean indicating if the paper is a research paper or not.
|
380 |
+
"""
|
381 |
+
is_review_paper = False
|
382 |
+
logger.info("Starting processing of the PDF.")
|
383 |
+
try:
|
384 |
+
time_extract = time.time()
|
385 |
+
raw_pdf_elements = self.extract_pdf_elements()
|
386 |
+
logger.info(
|
387 |
+
f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements extracted in {time.time() - time_extract:.2f} seconds.")
|
388 |
+
|
389 |
+
time_review = time.time()
|
390 |
+
for element in raw_pdf_elements:
|
391 |
+
text = element.text.split()
|
392 |
+
for word in text:
|
393 |
+
if word.lower() == 'review':
|
394 |
+
logger.warning("!!! this seems to be a review paper and not a research paper. this demo "
|
395 |
+
"analyses only research papers.")
|
396 |
+
is_review_paper = True
|
397 |
+
logging.info(
|
398 |
+
f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF review check completed in {time.time() - time_review:.2f} seconds.")
|
399 |
+
|
400 |
+
time_categorize = time.time()
|
401 |
+
self.categorize_elements(raw_pdf_elements)
|
402 |
+
logger.info(
|
403 |
+
f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements categorized in {time.time() - time_categorize:.2f} seconds.")
|
404 |
+
|
405 |
+
time_merge = time.time()
|
406 |
+
merged_chunks = self.merge_chunks()
|
407 |
+
logger.info(
|
408 |
+
f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF text chunks merged in {time.time() - time_merge:.2f} seconds.")
|
409 |
+
return merged_chunks, self.tables
|
410 |
+
except Exception as e:
|
411 |
+
raise NotImplementedError(f"Error processing PDF: {e}")
|
412 |
+
|
413 |
+
def process_pdf_file(self, uploaded_file):
|
414 |
+
"""
|
415 |
+
Process an uploaded PDF file.
|
416 |
+
|
417 |
+
If a new file is uploaded, the previously stored file is deleted.
|
418 |
+
The method updates the file path, processes the PDF, and returns the results.
|
419 |
+
|
420 |
+
Parameters:
|
421 |
+
uploaded_file: The new PDF file uploaded for processing.
|
422 |
+
|
423 |
+
Returns:
|
424 |
+
The results of processing the PDF file.
|
425 |
+
"""
|
426 |
+
|
427 |
+
logger.info(f"Starting to process the PDF file: {uploaded_file.filename}")
|
428 |
+
|
429 |
+
with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
430 |
+
uploaded_file.save(temp_file.name)
|
431 |
+
self.file_path = temp_file.name
|
432 |
+
self.folder_path = os.path.dirname(self.file_path)
|
433 |
+
|
434 |
+
try:
|
435 |
+
logger.debug(f"Processing PDF at {self.file_path}")
|
436 |
+
results = self.process_pdf()
|
437 |
+
title = self.extract_title_from_pdf(self.file_path)
|
438 |
+
logger.info("PDF processing completed successfully.")
|
439 |
+
return (*results, title)
|
440 |
+
|
441 |
+
except Exception as e:
|
442 |
+
logger.error(f"Error processing PDF file: {e}", exc_info=True)
|
443 |
+
raise
|
444 |
+
finally:
|
445 |
+
try:
|
446 |
+
os.remove(self.file_path)
|
447 |
+
logger.debug(f"Temporary file {self.file_path} deleted.")
|
448 |
+
except Exception as e:
|
449 |
+
logger.warning(f"Error deleting temporary file: {e}", exc_info=True)
|
450 |
+
|
451 |
+
def extract_title_from_pdf(self, uploaded_file):
|
452 |
+
"""
|
453 |
+
Extracts the title from a PDF file's metadata.
|
454 |
+
|
455 |
+
This function reads the metadata of a PDF file using PyPDF2 and attempts to
|
456 |
+
extract the title. If the title is present in the metadata, it is returned.
|
457 |
+
Otherwise, a default message indicating that the title was not found is returned.
|
458 |
+
|
459 |
+
Parameters:
|
460 |
+
uploaded_file (file): A file object or a path to the PDF file from which
|
461 |
+
to extract the title. The file must be opened in binary mode.
|
462 |
+
|
463 |
+
Returns:
|
464 |
+
str: The title of the PDF file as a string. If no title is found, returns
|
465 |
+
'Title not found'.
|
466 |
+
"""
|
467 |
+
# Initialize PDF reader
|
468 |
+
pdf_reader = PdfReader(uploaded_file)
|
469 |
+
|
470 |
+
# Extract document information
|
471 |
+
meta = pdf_reader.metadata
|
472 |
+
|
473 |
+
# Retrieve title from document information
|
474 |
+
title = meta.title if meta and meta.title else 'Title not found'
|
475 |
+
return title
|
476 |
+
|
477 |
+
|
478 |
+
|
479 |
+
|
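The chunk heuristics above are easiest to see on toy inputs (illustrative, not part of the committed file):

# Illustrative sketch, not part of RAG_utils.py.
print(PDFProcessor_Unstructured.should_skip_chunk("Figure 3: study flow diagram"))       # True
print(PDFProcessor_Unstructured.should_merge_with_next("ends with a comma,", "Next"))    # True
print(PDFProcessor_Unstructured.should_merge_with_next("lowercase tail", "lower head"))  # True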
class HybridRetriever(BaseRetriever):
    """
    A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
    Inherits from BaseRetriever.

    This class uses two different retrieval methods and merges their results to provide a
    comprehensive set of documents in response to a query. It ensures diversity in the
    retrieved documents by leveraging the strengths of both retrieval methods.

    Attributes:
        vector_retriever: An instance of a vector-based retriever.
        bm25_retriever: An instance of a BM25 retriever.

    Methods:
        __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
        _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
        _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        super().__init__()
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        logger.info("HybridRetriever initialized with vector and BM25 retrievers.")

    def _retrieve(self, query: str, **kwargs) -> List:
        """
        Retrieves and combines results from both vector and BM25 retrievers.

        Args:
            query: The query string for document retrieval.
            **kwargs: Additional keyword arguments for retrieval.

        Returns:
            List: Combined list of unique nodes retrieved from both methods.
        """
        logger.info(f"Retrieving documents for query: {query}")
        try:
            bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
            vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
            combined_nodes = self._combine_results(bm25_nodes, vector_nodes)

            logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
            return combined_nodes
        except Exception as e:
            logger.error(f"Error in retrieval: {e}")
            raise

    @staticmethod
    def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
        """
        Combines and de-duplicates results from BM25 and vector retrievers.

        Args:
            bm25_nodes: Nodes retrieved from BM25 retriever.
            vector_nodes: Nodes retrieved from vector retriever.

        Returns:
            List: Combined list of unique nodes.
        """
        node_ids: Set = set()
        combined_nodes = []

        for node in bm25_nodes + vector_nodes:
            if node.node_id not in node_ids:
                combined_nodes.append(node)
                node_ids.add(node.node_id)

        return combined_nodes
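The de-duplication in _combine_results keeps the first occurrence of each node_id, so BM25 hits take precedence over vector hits (illustrative, not part of the committed file):

# Illustrative sketch, not part of RAG_utils.py.
from types import SimpleNamespace
a, b = SimpleNamespace(node_id="n1"), SimpleNamespace(node_id="n2")
merged = HybridRetriever._combine_results([a, b], [b, a])
print([n.node_id for n in merged])  # ['n1', 'n2']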
class PDFQueryEngine:
    """
    A class to handle the process of setting up a query engine and performing queries on PDF documents.

    This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
    indexes, hybrid retrievers, response synthesizers, and executing queries on the set up engine.

    Attributes:
        documents (List): A list of documents to be indexed.
        llm (Language Model): The language model to be used for queries.
        embed_model: The embedding model used for indexing and retrieval.
        qa_prompt_tmpl (str): Template for creating query prompts.

    Methods:
        setup_query_engine(): Sets up the query engine with all necessary components.
        evaluate_with_llm(): Evaluates the document against the predefined queries.
    """

    def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):

        self.documents = documents
        self.llm = llm
        self.embed_model = embed_model
        self.qa_prompt_tmpl = qa_prompt_tmpl
        self.base_utils = base_utils()

        logger.info("PDFQueryEngine initialized.")

    def setup_query_engine(self):
        """
        Sets up the query engine by initializing and configuring the embedding model, service context, index,
        hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.

        Returns:
            Any: The configured query engine.
        """

        try:
            logger.info("Initializing the service context for query engine setup.")
            service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)

            logger.info("Creating an index from documents.")
            index = VectorStoreIndex.from_documents(documents=self.documents, service_context=service_context)
            nodes = service_context.node_parser.get_nodes_from_documents(self.documents)

            logger.info("Setting up vector and BM25 retrievers.")
            vector_retriever = index.as_retriever(similarity_top_k=5)
            bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
            hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

            logger.info("Configuring the response synthesizer with the prompt template.")
            qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
            response_synthesizer = get_response_synthesizer(
                service_context=service_context,
                text_qa_template=qa_prompt,
                response_mode="compact",
            )

            logger.info("Assembling the query engine with reranker and synthesizer.")
            reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")
            query_engine = RetrieverQueryEngine.from_args(
                retriever=hybrid_retriever,
                node_postprocessors=[reranker],
                response_synthesizer=response_synthesizer,
            )

            logger.info("Query engine setup complete.")
            return query_engine
        except Exception as e:
            logger.error(f"Error during query engine setup: {e}")
            raise

    def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any,
                          queries: List[str]) -> Tuple[int, int, float, List[str], Dict[int, Dict[str, Any]]]:
        """
        Evaluate documents using a language model based on various criteria.

        Args:
            reg_result (Any): Result related to registration.
            peer_result (Any): Result related to peer review.
            guidelines_result (Any): Result related to following guidelines.
            queries (List[str]): A list of queries to be processed.

        Returns:
            Tuple[int, int, float, List[str], Dict[int, Dict[str, Any]]]: The total score, the number of
            criteria met, the score as a percentage, the reasoning per query, and a per-query results dict.
        """

        logger.info("Starting evaluation with LLM.")
        query_engine = self.setup_query_engine()

        total_score = 0
        criteria_met = 0
        reasoning = []
        results = {}

        for j, query in enumerate(queries):
            # Predefine extracted_data to handle the default case
            extracted_data = None

            # Handle special cases based on the value of j and other conditions
            if j == 1 and reg_result:
                extracted_data = {"score": 1, "reasoning": reg_result[0]}
            elif j == 2 and guidelines_result:
                extracted_data = {"score": 1,
                                  "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
            elif j == 8 and (guidelines_result or peer_result):
                extracted_data = {"score": 1, "reasoning": "The article is published in a peer reviewed journal."}

            # Handle the default case if none of the special conditions were met
            if extracted_data is None:
                result = query_engine.query(query).response
                extracted_data = self.base_utils.extract_score_reasoning(result)

            if extracted_data['score'] and int(extracted_data["score"]) > 0:
                criteria_met += 1
                total_score += int(extracted_data["score"])

            reasoning.append(extracted_data["reasoning"])
            results[j] = {
                "reasoning": extracted_data["reasoning"],
                "score": int(extracted_data["score"]) if extracted_data['score'] else 0
            }

        score_percentage = (float(total_score) / len(queries)) * 100
        logger.info("Evaluation completed.")
        return total_score, criteria_met, score_percentage, reasoning, results
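A condensed usage sketch for PDFQueryEngine (illustrative; documents, llm, embed_model and general_prompt are stand-ins, built as in process_pdf in app_huggingface.py above):

# Illustrative sketch, not part of RAG_utils.py; all four inputs are stand-ins.
engine = PDFQueryEngine(documents, llm, embed_model, general_prompt).setup_query_engine()
print(engine.query("Does the paper report its sample size?").response)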
class MixtralLLM(CustomLLM):
    """
    A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.

    Attributes:
        context_window (int): Number of tokens used for context during inference.
        num_output (int): Number of tokens to generate as output.
        temperature (float): Sampling temperature for token generation.
        model_name (str): Name of the model on Hugging Face's model hub.
        api_key (str): API key for authenticating with the Hugging Face API.

    Methods:
        metadata: Retrieves metadata about the model.
        do_hf_call: Makes an API call to the Hugging Face model.
        complete: Generates a complete response for a given prompt.
        stream_complete: Streams a series of token completions for a given prompt.
    """

    def __init__(self, context_window: int, num_output: int, temperature: float, model_name: str, api_key: str):
        """
        Initialize the MixtralLLM class with specific configuration values.

        Args:
            context_window (int): The number of tokens to consider for context during LLM inference.
            num_output (int): The number of tokens to generate in the output.
            temperature (float): The sampling temperature to use for generating tokens.
            model_name (str): The name of the model to be used from Hugging Face's model hub.
            api_key (str): The API key for authentication with Hugging Face's inference API.
        """
        super().__init__()
        self.context_window = context_window
        self.num_output = num_output
        self.temperature = temperature
        self.model_name = model_name
        self.api_key = api_key

    @property
    def metadata(self) -> LLMMetadata:
        """
        Retrieves metadata for the Mixtral LLM.

        Returns:
            LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
        """
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def do_hf_call(self, prompt: str) -> str:
        """
        Makes an API call to the Hugging Face model and retrieves the generated response.

        Args:
            prompt (str): The input prompt for the model.

        Returns:
            str: The text generated by the model in response to the prompt, or a fallback
            message if the API call fails.
        """
        data = {
            "inputs": prompt,
            "parameters": {"temperature": self.temperature}
        }

        # Makes a POST request to the Hugging Face API to get the model's response
        response = requests.post(
            f'https://api-inference.huggingface.co/models/{self.model_name}',
            headers={
                'authorization': f'Bearer {self.api_key}',
                'content-type': 'application/json',
            },
            json=data,
            stream=True
        )

        # Checks for a successful response and parses the generated text
        if response.status_code != 200 or not response.json() or 'error' in response.json():
            logger.error(f"Error from Hugging Face API: {response}")
            return "Unable to answer for technical reasons."
        full_txt = response.json()[0]['generated_text']
        # Finds the section of the text following the context separator
        offset = full_txt.find("---------------------")
        ss = full_txt[offset:]
        # Extracts the actual answer from the response
        offset = ss.find("Answer:")
        return ss[offset + 7:].strip()

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """
        Generates a complete response for a given prompt using the Hugging Face API.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments for the completion.

        Returns:
            CompletionResponse: The complete response from the model.
        """
        response = self.do_hf_call(prompt)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        """
        Streams a series of token completions as a response for the given prompt.

        Note that do_hf_call returns the full generated string, so this method
        simulates streaming by yielding the response one character at a time.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments for the streaming completion.

        Yields:
            CompletionResponseGen: A generator yielding each token in the completion response.
        """
        response = ""
        for token in self.do_hf_call(prompt):
            response += token
            yield CompletionResponse(text=response, delta=token)
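A minimal call into MixtralLLM (illustrative; the parameter values are examples only, and a valid Hugging Face token must be available in HF_TOKEN):

# Illustrative sketch, not part of RAG_utils.py; requires a real HF_TOKEN.
mixtral = MixtralLLM(context_window=5000, num_output=512, temperature=0.2,
                     model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
                     api_key=os.environ["HF_TOKEN"])
print(mixtral.complete("Is the sample size reported? Answer:").text)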
class KeywordSearch():
    def __init__(self, chunks):
        self.chunks = chunks

    def find_journal_name(self, response: str, journal_list: list) -> bool:
        """
        Searches for a journal name in a given response string.

        This function iterates through a list of known journal names and checks if any of these
        names are present in the response string.

        Args:
            response (str): The response string to search for a journal name.
            journal_list (list): A list of journal names to search within the response.

        Returns:
            bool: True if any journal name from the list is found in the response, False otherwise.
        """
        response_lower = response.lower()
        for journal in journal_list:
            journal_lower = journal.lower()

            if journal_lower in response_lower:
                return True

        return False

    def check_registration(self):
        """
        Check chunks of text for various registration numbers or URLs of registries.
        Returns the sentence containing a registration number, or if not found,
        returns chunks containing registry URLs.

        Returns:
            list of str: List of matching sentences or chunks, or an empty list if no matches are found.
        """

        # Patterns for different registration types
        patterns = {
            "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
            "ISRCTN": r"(ISRCTN\d{8})",
            "EudraCT": r"(\d{4}-\d{6}-\d{2})",
            "UMIN-CTR": r"(UMIN\d{9})",
            "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
        }

        # Registry URLs
        registry_urls = [
            "www.anzctr.org.au",
            "anzctr.org.au",
            "www.clinicaltrials.gov",
            "clinicaltrials.gov",
            "www.ISRCTN.org",
            "ISRCTN.org",
            "www.umin.ac.jp/ctr/index/htm",
            "umin.ac.jp/ctr/index/htm",
            "www.onderzoekmetmensen.nl/en",
            "onderzoekmetmensen.nl/en",
            "eudract.ema.europa.eu",
            "www.eudract.ema.europa.eu"
        ]

        # Check each chunk for registration numbers
        for chunk in self.chunks:
            # Split chunk into sentences
            sentences = re.split(r'(?<=[.!?]) +', chunk)

            # Check each sentence for any registration number
            for sentence in sentences:
                for pattern in patterns.values():
                    if re.search(pattern, sentence):
                        return [sentence]  # Return immediately if a registration number is found

        # If no registration number found, check for URLs in chunks
        matching_chunks = []
        for chunk in self.chunks:
            if any(url in chunk for url in registry_urls):
                matching_chunks.append(chunk)

        return matching_chunks
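check_registration scans sentence by sentence and short-circuits on the first registry-number hit; a deterministic example (illustrative, not part of the committed file):

# Illustrative sketch, not part of RAG_utils.py.
ks = KeywordSearch(["The trial was registered (NCT 12345678) before enrolment. Methods follow."])
print(ks.check_registration())
# ['The trial was registered (NCT 12345678) before enrolment.']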
class StringExtraction():
    """
    A class to handle the extraction of query strings from complete LLM responses.

    This class encapsulates the functionality of extracting the original ground truth from a labelled
    data csv and query strings from responses. Note that LLMs may format answers differently depending
    on the model or the prompting technique; in such cases extract_original_prompt may not give
    satisfactory results, and writing a custom string extraction method is the better option.

    Methods:
        extract_original_prompt(): Splits a response into its binary answer and its reasoning.
        extraction_ground_truth(): Reads the binary and free-text ground truth for a paper from labelled data.
    """

    def extract_original_prompt(self, result):
        r1 = result.response.strip().split("\n")
        binary_response = ""
        explanation_response = ""
        for r in r1:
            if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
                binary_response = r
            elif r.find("Reasoning:") >= 0:
                cut = r.find(":")
                explanation_response += r[cut + 1:].strip()

        return binary_response, explanation_response

    def extraction_ground_truth(self, paper_name, labelled_data):
        id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
        id_row = labelled_data[labelled_data["id"] == id]
        ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
        binary_ground_truth = []
        explanation_ground_truth = []
        for g in ground_truth:
            if len(g) > 0:
                binary_ground_truth.append("Yes")
                explanation_ground_truth.append(g)
            else:
                binary_ground_truth.append("No")
                explanation_ground_truth.append("The article does not provide any relevant information.")
        return binary_ground_truth, explanation_ground_truth
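extract_original_prompt only needs an object with a .response attribute, so it can be exercised with a stub (illustrative, not part of the committed file):

# Illustrative sketch, not part of RAG_utils.py.
from types import SimpleNamespace
mock = SimpleNamespace(response="Yes, data are shared.\nReasoning: A repository link is given.")
print(StringExtraction().extract_original_prompt(mock))
# ('Yes, data are shared.', 'A repository link is given.')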
class EvaluationMetrics():
    """
    This class encapsulates the evaluation methods that have been used in the project.

    Attributes:
        explanation_response: A list of detailed responses from the LLM model, one per query.
        explanation_ground_truth: The list of ground-truth explanations, one per query.
        embedding_model: The sentence embedding model used for the cosine-similarity metric.

    Methods:
        metric_cosine_similarity(): Cosine similarity between response and ground-truth embeddings.
        metric_rouge(): ROUGE scores of the responses against the ground truth.
        binary_accuracy(): Fraction of binary answers that match the binary ground truth.
    """

    def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
        self.explanation_response = explanation_response
        self.explanation_ground_truth = explanation_ground_truth
        self.embedding_model = embedding_model

    def metric_cosine_similarity(self):
        ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
        explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
        return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))

    def metric_rouge(self):
        rouge = evaluate.load("rouge")
        results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
        return results

    def binary_accuracy(self, binary_response, binary_ground_truth):
        count = 0
        if len(binary_response) != len(binary_ground_truth):
            return "Arrays which are to be compared have different lengths."
        else:
            for i in range(len(binary_response)):
                if binary_response[i] == binary_ground_truth[i]:
                    count += 1
            return np.round(count / len(binary_response), 2)
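binary_accuracy on a toy pair of answer lists (illustrative, not part of the committed file):

# Illustrative sketch, not part of RAG_utils.py.
em = EvaluationMetrics(explanation_response=[], explanation_ground_truth=[], embedding_model=None)
print(em.binary_accuracy(["Yes", "No", "Yes"], ["Yes", "Yes", "Yes"]))  # 0.67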
librarymed/kromin/__init__.py
ADDED
File without changes
librarymed/kromin/app_librarymed.py
ADDED
@@ -0,0 +1,169 @@
import logging
import os

import openai
from flask import Flask, flash, request, redirect, jsonify
from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.llms import OpenAI

from kromin.RAG_utils import ConfigManager
from kromin.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils
from dotenv import load_dotenv

load_dotenv()

app = Flask(__name__)

app.config['SECRET_KEY'] = 'librarymed super secret key'
os.environ["TOKENIZERS_PARALLELISM"] = "false"
config_manager = ConfigManager()
config_manager.load_config("model", "Config/model_config.json")
app.config['user_config'] = config_manager


def allowed_file(filename, allowed_extensions):
    """ Helper function to check if the file extension is allowed """
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions


@app.route('/', methods=['GET'])
def __get__():
    score = 0
    criteria_met = 0
    title = ""
    author_info = ""
    reasoning = ""

    return jsonify({
        'title': title,
        'author': author_info,
        'score': score,
        'num_criteria_met': criteria_met,
        'reasoning': reasoning
    })


@app.route('/upload', methods=['POST'])
def __post__():

    config = app.config['user_config']
    openai.api_key = os.getenv('OPENAI_API_KEY')
    hf_token = os.getenv('HF_TOKEN')
    embed = config.get_config_value("model", "embeddings")
    embed_model_name = config.get_config_value("model", "embeddings_model")
    llm_model = config.get_config_value("model", "llm_model")
    model_temperature = config.get_config_value("model", "model_temp")
    output_token_size = config.get_config_value("model", "max_tokens")
    model_context_window = config.get_config_value("model", "context_window")
    gpt_prompt_path = config.get_config_value("model", "GPT_PROMPT_PATH")
    mistral_prompt_path = config.get_config_value("model", "MISTRAL_PROMPT_PATH")
    info_prompt_path = config.get_config_value("model", "INFO_PROMPT_PATH")
    peer_review_journals_path = config.get_config_value("model", "peer_review_journals_path")
    eq_network_journals_path = config.get_config_value("model", "eq_network_journals_path")
    queries = config.get_config_value("model", "queries")
    num_criteria = len(config.get_config_value("model", "criteria"))
    author_query = config.get_config_value("model", "author_query")
    journal_query = config.get_config_value("model", "journal_query")

    prompt_path = gpt_prompt_path if gpt_prompt_path else mistral_prompt_path

    utils = base_utils()

    # Check if the post request has the file part
    if 'file' not in request.files:
        flash('No file part')
        return jsonify({'error': 'No file part given in the request'}), 500
    file = request.files['file']
    # If the user does not select a file, the browser also submits an empty part without a filename
    if file.filename == '':
        flash('No selected file')
        return jsonify({'error': 'Empty filename given'}), 500
    if file and allowed_file(file.filename, config.get_config_value("model", "allowed_extensions")):
        try:
            # Process the PDF file
            pdf_processor = PDFProcessor_Unstructured(config.get_config_value("model", "pdf_processing"))
            merged_chunks, tables, title = pdf_processor.process_pdf_file(file)
            documents = [Document(text=t) for t in merged_chunks]

            # LLM Model choice
            if 'gpt' in llm_model.lower():  # tested with "gpt-4" and "gpt-3.5-turbo"
                llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)
                prompt_template = utils.read_from_file(gpt_prompt_path)

            elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
                if any(param is None for param in
                       [model_context_window, output_token_size, model_temperature, hf_token]):
                    raise ValueError("All parameters are required for the Mistral LLM.")

                llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
                                 temperature=model_temperature, model_name=llm_model, api_key=hf_token)
                prompt_template = utils.read_from_file(mistral_prompt_path)

            else:
                raise NotImplementedError(f"Error initializing language model '{llm_model}'")

            # Embedding model choice for RAG
            try:
                if embed == "openai":
                    embed_model = OpenAIEmbedding()

                elif embed == "huggingface":
                    if embed_model_name is None:
                        # Fall back to the default model if no name is provided
                        embed_model_name = "BAAI/bge-small-en-v1.5"
                    embed_model = HuggingFaceEmbedding(embed_model_name)

                else:
                    raise ValueError(f"Unsupported embedding model: {embed}")

            except Exception as e:
                raise NotImplementedError(f"Error initializing embedding model: {e}")

            # Prompts and Queries
            prompt_template = utils.read_from_file(prompt_path)
            info_prompt = utils.read_from_file(info_prompt_path)

            peer_review_journals = utils.read_from_file(peer_review_journals_path)
            eq_network_journals = utils.read_from_file(eq_network_journals_path)

            peer_review_journals_list = peer_review_journals.split('\n')
            eq_network_journals_list = eq_network_journals.split('\n')

            modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
                peer_review_journals_list) + "?"

            pdf_info_query = PDFQueryEngine(documents, llm, embed_model, info_prompt)
            info_query_engine = pdf_info_query.setup_query_engine()
            journal_result = info_query_engine.query(modified_journal_query).response
            author_info = info_query_engine.query(author_query).response

            pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, prompt_template)

            # Keyword-based checks: journal membership and prior registration
            nlp_methods = KeywordSearch(merged_chunks)
            eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
            peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
            registration_result = nlp_methods.check_registration()

            # Evaluate the criteria with the LLM
            total_score, criteria_met, score_percentage, reasoning, results = pdf_criteria_query.evaluate_with_llm(
                registration_result, peer_journal_result, eq_journal_result, queries)
            score = f"{round((total_score / num_criteria) * 100)}/100"

        except Exception as e:
            logging.exception("An error occurred while processing the file.")
            # Consider adding a user-friendly message or redirect
            flash('An error occurred while processing the file.')
            return jsonify({'error': str(e)}), 500

        return jsonify({
            'title': title,
            'author': author_info,
            'score': score,
            'num_criteria_met': criteria_met,
            'reasoning': reasoning,
            'results': results
        })
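For local testing, the /upload endpoint can be exercised with a short client script along these lines; the host, port, and sample file name are assumptions:

import requests

# Assumes the Flask app is running locally on its default port 5000.
with open("Id_42.pdf", "rb") as f:
    resp = requests.post("http://127.0.0.1:5000/upload", files={"file": f})

print(resp.status_code)
print(resp.json())  # title, author, score, num_criteria_met, reasoning, results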
librarymed/local/RAG_utils.py ADDED
@@ -0,0 +1,979 @@
"""Utility functions for working with the RAG model"""

import json
import logging
import os
import re
import time
from tempfile import NamedTemporaryFile
from typing import Any, List, Tuple, Set, Dict, Optional, Union

import evaluate
import numpy as np
import pandas as pd
import requests
from llama_index import PromptTemplate
from llama_index import VectorStoreIndex, ServiceContext
from llama_index import get_response_synthesizer
from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback
from llama_index.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.retrievers import BaseRetriever, BM25Retriever
from sklearn.metrics.pairwise import cosine_similarity
from unstructured.partition.pdf import partition_pdf
from pypdf import PdfReader


# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a logger object
logger = logging.getLogger(__name__)
class ConfigManager:
    """
    A class to manage loading and accessing configuration settings.

    Attributes:
        configs (dict): Dictionary holding one or more named sets of configuration settings.

    Methods:
        load_config(config_name: str, config_path: str): Loads a configuration from a given JSON file.
        get_config_value(config_name: str, key: str): Retrieves a specific configuration value.
    """

    def __init__(self):
        self.configs = {}

    def load_config(self, config_name: str, config_path: str) -> None:
        """
        Loads configuration settings from a specified JSON file into a named configuration.

        Args:
            config_name (str): The name to assign to this set of configurations.
            config_path (str): The path to the configuration file.

        Raises:
            FileNotFoundError: If the config file is not found.
            json.JSONDecodeError: If there is an error parsing the config file.
        """
        try:
            with open(config_path, 'r') as f:
                self.configs[config_name] = json.load(f)
        except FileNotFoundError:
            logging.error(f"Config file not found at {config_path}")
            raise
        except json.JSONDecodeError as e:
            logging.error(f"Error decoding config file: {e}")
            raise

    def get_config_value(self, config_name: str, key: str) -> str:
        """
        Retrieves a specific configuration value.

        Args:
            config_name (str): The name of the configuration set to read from.
            key (str): The key for the configuration setting.

        Returns:
            str: The value of the configuration setting.

        Raises:
            ValueError: If the key is not found or is set to a placeholder value.
        """
        value = self.configs.get(config_name, {}).get(key)
        if value is None or value == "ENTER_YOUR_TOKEN_HERE":
            raise ValueError(f"Please set your '{key}' in the config.json file.")
        return value
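A minimal usage sketch for ConfigManager, mirroring how the Flask apps in this commit load the model config:

config_manager = ConfigManager()
config_manager.load_config("model", "Config/model_config.json")
print(config_manager.get_config_value("model", "llm_model"))  # e.g. "gpt-4"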
class base_utils:
    """
    A utility class providing miscellaneous static methods for processing and analyzing text data,
    particularly from PDF documents and filenames. This class also includes methods for file operations.

    This class encapsulates the functionality of extracting key information from text, such as scores,
    reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
    and reading content from files.

    Attributes:
        None (This class contains only static methods and does not maintain any state)

    Methods:
        extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
            Extracts a score and reasoning from a given text using regular expressions.

        extract_id_from_filename(filename: str) -> Optional[int]:
            Extracts an ID from a given filename based on a specified pattern.

        find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
            Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.

        read_from_file(file_path: str) -> str:
            Reads the content of a file and returns it as a string.
    """

    @staticmethod
    def read_from_file(file_path: str) -> str:
        """
        Reads the content of a file and returns it as a string.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The content of the file.
        """
        with open(file_path, 'r') as prompt_file:
            prompt = prompt_file.read()
        return prompt

    @staticmethod
    def extract_id_from_filename(filename: str) -> Optional[int]:
        """
        Extracts an ID from a filename, assuming a specific format ('Id_{I}.pdf', where {I} is the ID).

        Args:
            filename (str): The filename from which to extract the ID.

        Returns:
            int: The extracted ID as an integer, or None if the pattern is not found.
        """
        # Assuming the file name is in the format 'Id_{I}.pdf', where {I} is the ID
        match = re.search(r'Id_(\d+).pdf', filename)
        if match:
            return int(match.group(1))  # Convert to integer if ID is numeric
        else:
            return None

    @staticmethod
    def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
        """
        Extracts score and reasoning from a given text using regular expressions.

        Args:
            text (str): The text from which to extract the score and reasoning.

        Returns:
            dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
        """
        # Define regular expression patterns for score and reasoning
        score_pattern = r"Score: (\d+)"
        reasoning_pattern = r"Reasoning: (.+)"

        # Extract data using regular expressions
        score_match = re.search(score_pattern, text)
        reasoning_match = re.search(reasoning_pattern, text, re.DOTALL)  # re.DOTALL allows '.' to match newlines

        # Extract and return the results
        extracted_data = {
            "score": score_match.group(1) if score_match else None,
            "reasoning": reasoning_match.group(1).strip() if reasoning_match else None
        }

        return extracted_data

    @staticmethod
    def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
        """
        Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.

        Args:
            pdf_filename (str): The filename of the PDF.
            dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.

        Returns:
            pandas.Series or str: The matched row from the dataframe or a message indicating
                                  that no matching row or invalid filename was found.
        """
        pdf_id = base_utils.extract_id_from_filename(pdf_filename)
        if pdf_id is not None:
            # Assuming the first column contains the ID
            matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
            if not matched_row.empty:
                return matched_row
            else:
                return "No matching row found."
        else:
            return "Invalid file name."
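For reference, a small illustration of extract_score_reasoning on a hypothetical LLM answer:

utils = base_utils()
sample = "Score: 1\nReasoning: The authors share their code on GitHub."
print(utils.extract_score_reasoning(sample))
# {'score': '1', 'reasoning': 'The authors share their code on GitHub.'}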
class PDFProcessor_Unstructured:
    """
    A class to process PDF files, providing functionalities for extracting, categorizing,
    and merging elements from a PDF file.

    This class is designed to handle unstructured PDF documents and is particularly useful for
    tasks involving text extraction, categorization, and data processing within PDFs.

    Attributes:
        file_path (str): The full path to the PDF file.
        folder_path (str): The directory path where the PDF file is located.
        file_name (str): The name of the PDF file.
        texts (List[str]): A list to store extracted text chunks.
        tables (List[str]): A list to store extracted tables.

    Methods:
        extract_pdf_elements() -> List:
            Extracts images, tables, and text chunks from a PDF file.

        categorize_elements(raw_pdf_elements: List) -> None:
            Categorizes extracted elements from a PDF into tables and texts.

        merge_chunks() -> List[str]:
            Merges text chunks based on punctuation and character case criteria.

        should_skip_chunk(chunk: str) -> bool:
            Determines if a chunk should be skipped based on its content.

        should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
            Determines if the current chunk should be merged with the next one.

        process_pdf() -> Tuple[List[str], List[str]]:
            Processes the PDF by extracting, categorizing, and merging elements.

        process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
            Processes an uploaded PDF file to extract and categorize text and tables.
    """

    def __init__(self, config: Dict[str, Any]):
        self.file_path = None
        self.folder_path = None
        self.file_name = None
        self.texts = []
        self.tables = []
        self.config = config if config is not None else self.default_config()
        logger.info(f"Initialized PDFProcessor_Unstructured for file: {self.file_name}")

    @staticmethod
    def default_config() -> Dict[str, Any]:
        """
        Returns the default configuration for PDF processing.

        Returns:
            Dict[str, Any]: Default configuration options.
        """
        return {
            "extract_images": False,
            "infer_table_structure": True,
            "chunking_strategy": "by_title",
            "max_characters": 10000,
            "combine_text_under_n_chars": 100,
            "strategy": "auto",
            "model_name": "yolox"
        }

    def extract_pdf_elements(self) -> List:
        """
        Extracts images, tables, and text chunks from a PDF file.

        Returns:
            List: A list of extracted elements from the PDF.
        """
        logger.info("Starting extraction of PDF elements.")
        try:
            extracted_elements = partition_pdf(
                filename=self.file_path,
                extract_images_in_pdf=False,
                infer_table_structure=True,
                chunking_strategy="by_title",
                max_characters=10000,
                combine_text_under_n_chars=100,
                image_output_dir_path=self.folder_path,
                # strategy="fast",
            )
            logger.info("Extraction of PDF elements completed successfully.")
            return extracted_elements
        except Exception as e:
            raise NotImplementedError(f"Error extracting PDF elements: {e}")

    def categorize_elements(self, raw_pdf_elements: List) -> None:
        """
        Categorizes extracted elements from a PDF into tables and texts.

        Args:
            raw_pdf_elements (List): A list of elements extracted from the PDF.
        """
        logger.debug("Starting categorization of PDF elements.")
        for element in raw_pdf_elements:
            element_type = str(type(element))
            if "unstructured.documents.elements.Table" in element_type:
                self.tables.append(str(element))
            elif "unstructured.documents.elements.CompositeElement" in element_type:
                self.texts.append(str(element))

        logger.debug("Categorization of PDF elements completed.")

    def merge_chunks(self) -> List[str]:
        """
        Merges text chunks based on punctuation and character case criteria.

        Returns:
            List[str]: A list of merged text chunks.
        """
        logger.debug("Starting merging of text chunks.")

        merged_chunks = []
        skip_next = False

        for i, current_chunk in enumerate(self.texts[:-1]):
            # Skip a chunk that was already merged into its predecessor
            if skip_next:
                skip_next = False
                continue

            next_chunk = self.texts[i + 1]

            if self.should_skip_chunk(current_chunk):
                continue

            if self.should_merge_with_next(current_chunk, next_chunk):
                merged_chunks.append(current_chunk + " " + next_chunk)
                skip_next = True
            else:
                merged_chunks.append(current_chunk)

        if not skip_next:
            merged_chunks.append(self.texts[-1])

        logger.debug("Merging of text chunks completed.")

        return merged_chunks

    @staticmethod
    def should_skip_chunk(chunk: str) -> bool:
        """
        Determines if a chunk should be skipped based on its content.

        Args:
            chunk (str): The text chunk to be evaluated.

        Returns:
            bool: True if the chunk should be skipped, False otherwise.
        """
        return (chunk.lower().startswith(("figure", "fig", "table")) or
                not chunk[0].isalnum() or
                re.match(r'^\d+\.', chunk))

    @staticmethod
    def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
        """
        Determines if the current chunk should be merged with the next one.

        Args:
            current_chunk (str): The current text chunk.
            next_chunk (str): The next text chunk.

        Returns:
            bool: True if the chunks should be merged, False otherwise.
        """
        return (current_chunk.endswith(",") or
                (current_chunk[-1].islower() and next_chunk[0].islower()))

    def process_pdf(self) -> Tuple[List[str], List[str]]:
        """
        Processes the PDF by extracting, categorizing, and merging elements.

        Returns:
            Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
        """
        is_review_paper = False
        logger.info("Starting processing of the PDF.")
        try:
            time_extract = time.time()
            raw_pdf_elements = self.extract_pdf_elements()
            logger.info(f"PDF elements extracted in {time.time() - time_extract:.2f} seconds.")

            time_review = time.time()
            for element in raw_pdf_elements:
                text = element.text.split()
                for word in text:
                    if word.lower() == 'review':
                        logger.warning("This seems to be a review paper rather than a research paper; "
                                       "this demo analyses only research papers.")
                        is_review_paper = True
            logging.info(f"PDF review check completed in {time.time() - time_review:.2f} seconds.")

            time_categorize = time.time()
            self.categorize_elements(raw_pdf_elements)
            logger.info(f"PDF elements categorized in {time.time() - time_categorize:.2f} seconds.")

            time_merge = time.time()
            merged_chunks = self.merge_chunks()
            logger.info(f"PDF text chunks merged in {time.time() - time_merge:.2f} seconds.")
            return merged_chunks, self.tables
        except Exception as e:
            raise NotImplementedError(f"Error processing PDF: {e}")

    def process_pdf_file(self, uploaded_file):
        """
        Process an uploaded PDF file.

        If a new file is uploaded, the previously stored file is deleted.
        The method updates the file path, processes the PDF, and returns the results.

        Parameters:
            uploaded_file: The new PDF file uploaded for processing.

        Returns:
            The results of processing the PDF file.
        """

        logger.info(f"Starting to process the PDF file: {uploaded_file.filename}")

        with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            uploaded_file.save(temp_file.name)
            self.file_path = temp_file.name
            self.folder_path = os.path.dirname(self.file_path)

        try:
            logger.debug(f"Processing PDF at {self.file_path}")
            results = self.process_pdf()
            title = self.extract_title_from_pdf(self.file_path)
            logger.info("PDF processing completed successfully.")
            return (*results, title)

        except Exception as e:
            logger.error(f"Error processing PDF file: {e}", exc_info=True)
            raise
        finally:
            try:
                os.remove(self.file_path)
                logger.debug(f"Temporary file {self.file_path} deleted.")
            except Exception as e:
                logger.warning(f"Error deleting temporary file: {e}", exc_info=True)

    def extract_title_from_pdf(self, uploaded_file):
        """
        Extracts the title from a PDF file's metadata.

        This function reads the metadata of a PDF file using pypdf and attempts to
        extract the title. If the title is present in the metadata, it is returned.
        Otherwise, a default message indicating that the title was not found is returned.

        Parameters:
            uploaded_file (file): A file object or a path to the PDF file from which
                                  to extract the title. The file must be opened in binary mode.

        Returns:
            str: The title of the PDF file as a string. If no title is found, returns
                 'Title not found'.
        """
        # Initialize PDF reader
        pdf_reader = PdfReader(uploaded_file)

        # Extract document information
        meta = pdf_reader.metadata

        # Retrieve title from document information
        title = meta.title if meta and meta.title else 'Title not found'
        return title
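A quick illustration of the chunk-skipping and chunk-merging heuristics above, on hypothetical chunks:

processor = PDFProcessor_Unstructured(config=None)  # falls back to default_config()
processor.texts = [
    "Methods were pre-registered,",   # ends with a comma, so it is merged with the next chunk
    "and the protocol is public.",
    "Figure 2: Flow diagram",         # starts with "Figure", so it is skipped
    "Results are reported in full.",
]
print(processor.merge_chunks())
# ['Methods were pre-registered, and the protocol is public.', 'Results are reported in full.']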
class HybridRetriever(BaseRetriever):
    """
    A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
    Inherits from BaseRetriever.

    This class uses two different retrieval methods and merges their results to provide a
    comprehensive set of documents in response to a query. It ensures diversity in the
    retrieved documents by leveraging the strengths of both retrieval methods.

    Attributes:
        vector_retriever: An instance of a vector-based retriever.
        bm25_retriever: An instance of a BM25 retriever.

    Methods:
        __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
        _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
        _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        super().__init__()
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        logger.info("HybridRetriever initialized with vector and BM25 retrievers.")

    def _retrieve(self, query: str, **kwargs) -> List:
        """
        Retrieves and combines results from both vector and BM25 retrievers.

        Args:
            query: The query string for document retrieval.
            **kwargs: Additional keyword arguments for retrieval.

        Returns:
            List: Combined list of unique nodes retrieved from both methods.
        """
        logger.info(f"Retrieving documents for query: {query}")
        try:
            bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
            vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
            combined_nodes = self._combine_results(bm25_nodes, vector_nodes)

            logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
            return combined_nodes
        except Exception as e:
            logger.error(f"Error in retrieval: {e}")
            raise

    @staticmethod
    def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
        """
        Combines and de-duplicates results from BM25 and vector retrievers.

        Args:
            bm25_nodes: Nodes retrieved from the BM25 retriever.
            vector_nodes: Nodes retrieved from the vector retriever.

        Returns:
            List: Combined list of unique nodes.
        """
        node_ids: Set = set()
        combined_nodes = []

        for node in bm25_nodes + vector_nodes:
            if node.node_id not in node_ids:
                combined_nodes.append(node)
                node_ids.add(node.node_id)

        return combined_nodes
class PDFQueryEngine:
    """
    A class to handle the process of setting up a query engine and performing queries on PDF documents.

    This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
    indexes, hybrid retrievers, and response synthesizers, and of executing queries on the resulting engine.

    Attributes:
        documents (List): A list of documents to be indexed.
        llm (Language Model): The language model to be used for embeddings and queries.
        embed_model: The embedding model used for indexing and retrieval.
        qa_prompt_tmpl (str): Template for creating query prompts.

    Methods:
        setup_query_engine(): Sets up the query engine with all necessary components.
        evaluate_with_llm(): Scores a document against the evaluation queries using the LLM.
    """

    def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):

        self.documents = documents
        self.llm = llm
        self.embed_model = embed_model
        self.qa_prompt_tmpl = qa_prompt_tmpl
        self.base_utils = base_utils()

        logger.info("PDFQueryEngine initialized.")

    def setup_query_engine(self):
        """
        Sets up the query engine by initializing and configuring the embedding model, service context, index,
        hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.

        Returns:
            Any: The configured query engine.
        """

        try:
            logger.info("Initializing the service context for query engine setup.")
            service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)

            logger.info("Creating an index from documents.")
            index = VectorStoreIndex.from_documents(documents=self.documents, service_context=service_context)
            nodes = service_context.node_parser.get_nodes_from_documents(self.documents)

            logger.info("Setting up vector and BM25 retrievers.")
            vector_retriever = index.as_retriever(similarity_top_k=5)
            bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
            hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

            logger.info("Configuring the response synthesizer with the prompt template.")
            qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
            response_synthesizer = get_response_synthesizer(
                service_context=service_context,
                text_qa_template=qa_prompt,
                response_mode="compact",
            )

            logger.info("Assembling the query engine with reranker and synthesizer.")
            reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")
            query_engine = RetrieverQueryEngine.from_args(
                retriever=hybrid_retriever,
                node_postprocessors=[reranker],
                response_synthesizer=response_synthesizer,
            )

            logger.info("Query engine setup complete.")
            return query_engine
        except Exception as e:
            logger.error(f"Error during query engine setup: {e}")
            raise

    def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any,
                          queries: List[str]) -> Tuple[int, int, float, List[str]]:
        """
        Evaluate documents using a language model based on various criteria.

        Args:
            reg_result (Any): Result related to registration.
            peer_result (Any): Result related to peer review.
            guidelines_result (Any): Result related to following guidelines.
            queries (List[str]): A list of queries to be processed.

        Returns:
            Tuple[int, int, float, List[str]]: The total score, the number of criteria met,
            the score as a percentage, and the reasoning per query.
        """

        logger.info("Starting evaluation with LLM.")
        query_engine = self.setup_query_engine()

        total_score = 0
        criteria_met = 0
        reasoning = []

        for j, query in enumerate(queries):
            # Predefine extracted_data to handle the default case
            extracted_data = None

            # Handle special cases based on the value of j and other conditions
            if j == 1 and reg_result:
                extracted_data = {"score": 1, "reasoning": reg_result[0]}
            elif j == 2 and guidelines_result:
                extracted_data = {"score": 1,
                                  "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines."}
            elif j == 8 and (guidelines_result or peer_result):
                extracted_data = {"score": 1, "reasoning": "The article is published in a peer-reviewed journal."}

            # Handle the default case if none of the special conditions were met
            if extracted_data is None:
                result = query_engine.query(query).response
                extracted_data = self.base_utils.extract_score_reasoning(result)

            if extracted_data['score'] and int(extracted_data["score"]) > 0:
                criteria_met += 1
                total_score += int(extracted_data["score"])
            reasoning.append(extracted_data["reasoning"])

        score_percentage = (float(total_score) / len(queries)) * 100
        logger.info("Evaluation completed.")
        return total_score, criteria_met, score_percentage, reasoning
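To make the scoring arithmetic concrete, a hypothetical tally where 7 of 9 criteria score 1:

total_score, num_criteria = 7, 9
score_percentage = (float(total_score) / num_criteria) * 100  # 77.77...
print(f"{round(score_percentage)}/100")  # '78/100', the format the Flask apps return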
class MixtralLLM(CustomLLM):
    """
    A custom language model class for interfacing with the Hugging Face Inference API, specifically using the Mixtral model.

    Attributes:
        context_window (int): Number of tokens used for context during inference.
        num_output (int): Number of tokens to generate as output.
        temperature (float): Sampling temperature for token generation.
        model_name (str): Name of the model on Hugging Face's model hub.
        api_key (str): API key for authenticating with the Hugging Face API.

    Methods:
        metadata: Retrieves metadata about the model.
        do_hf_call: Makes an API call to the Hugging Face model.
        complete: Generates a complete response for a given prompt.
        stream_complete: Streams a series of token completions for a given prompt.
    """

    def __init__(self, context_window: int, num_output: int, temperature: float, model_name: str, api_key: str):
        """
        Initialize the MixtralLLM class with specific configuration values.

        Args:
            context_window (int): The number of tokens to consider for context during LLM inference.
            num_output (int): The number of tokens to generate in the output.
            temperature (float): The sampling temperature to use for generating tokens.
            model_name (str): The name of the model to be used from Hugging Face's model hub.
            api_key (str): The API key for authentication with Hugging Face's inference API.
        """
        super().__init__()
        self.context_window = context_window
        self.num_output = num_output
        self.temperature = temperature
        self.model_name = model_name
        self.api_key = api_key

    @property
    def metadata(self) -> LLMMetadata:
        """
        Retrieves metadata for the Mixtral LLM.

        Returns:
            LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
        """
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def do_hf_call(self, prompt: str) -> str:
        """
        Makes an API call to the Hugging Face model and retrieves the generated response.

        Args:
            prompt (str): The input prompt for the model.

        Returns:
            str: The text generated by the model in response to the prompt.

        Raises:
            Exception: If the API call fails or returns an error.
        """
        data = {
            "inputs": prompt,
            "parameters": {"temperature": self.temperature}
        }

        # Makes a POST request to the Hugging Face API to get the model's response
        response = requests.post(
            f'https://api-inference.huggingface.co/models/{self.model_name}',
            headers={
                'authorization': f'Bearer {self.api_key}',
                'content-type': 'application/json',
            },
            json=data,
            stream=True
        )

        # Checks for a successful response and parses the generated text
        if response.status_code != 200 or not response.json() or 'error' in response.json():
            logger.error(f"Error from Hugging Face API: {response}")
            return "Unable to answer for technical reasons."
        full_txt = response.json()[0]['generated_text']
        # Finds the section of the text following the context separator
        offset = full_txt.find("---------------------")
        ss = full_txt[offset:]
        # Extracts the actual answer from the response
        offset = ss.find("Answer:")
        return ss[offset + 7:].strip()

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """
        Generates a complete response for a given prompt using the Hugging Face API.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments for the completion.

        Returns:
            CompletionResponse: The complete response from the model.
        """
        response = self.do_hf_call(prompt)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        """
        Streams a series of token completions as a response for the given prompt.

        This method is useful for streaming responses where each token is generated sequentially.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments for the streaming completion.

        Yields:
            CompletionResponseGen: A generator yielding each token in the completion response.
        """
        # Yields a stream of characters as the completion response for the given prompt
        response = ""
        for token in self.do_hf_call(prompt):
            response += token
            yield CompletionResponse(text=response, delta=token)
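A hypothetical instantiation, mirroring how the Flask apps construct this class from the model config (the token is a placeholder):

llm = MixtralLLM(context_window=5000, num_output=512, temperature=0.2,
                 model_name="mistralai/Mixtral-8x7B-Instruct-v0.1", api_key="hf_...")
print(llm.metadata)  # context_window=5000, num_output=512, model_name=...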
class KeywordSearch():
    def __init__(self, chunks):
        self.chunks = chunks

    def find_journal_name(self, response: str, journal_list: list) -> bool:
        """
        Searches for a journal name in a given response string.

        This function iterates through a list of known journal names and checks whether any of them
        appears in the response string (case-insensitively).

        Args:
            response (str): The response string to search for a journal name.
            journal_list (list): A list of journal names to search within the response.

        Returns:
            bool: True if any journal name from the list is found in the response, False otherwise.
        """
        response_lower = response.lower()
        for journal in journal_list:
            journal_lower = journal.lower()

            if journal_lower in response_lower:
                return True

        return False

    def check_registration(self):
        """
        Check chunks of text for various registration numbers or URLs of registries.
        Returns the sentence containing a registration number, or if not found,
        returns chunks containing registry URLs.

        Returns:
            list of str: List of matching sentences or chunks, or an empty list if no matches are found.
        """

        # Patterns for different registration types
        patterns = {
            "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
            "ISRCTN": r"(ISRCTN\d{8})",
            "EudraCT": r"(\d{4}-\d{6}-\d{2})",
            "UMIN-CTR": r"(UMIN\d{9})",
            "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
        }

        # Registry URLs
        registry_urls = [
            "www.anzctr.org.au",
            "anzctr.org.au",
            "www.clinicaltrials.gov",
            "clinicaltrials.gov",
            "www.ISRCTN.org",
            "ISRCTN.org",
            "www.umin.ac.jp/ctr/index/htm",
            "umin.ac.jp/ctr/index/htm",
            "www.onderzoekmetmensen.nl/en",
            "onderzoekmetmensen.nl/en",
            "eudract.ema.europa.eu",
            "www.eudract.ema.europa.eu"
        ]

        # Check each chunk for registration numbers
        for chunk in self.chunks:
            # Split chunk into sentences
            sentences = re.split(r'(?<=[.!?]) +', chunk)

            # Check each sentence for any registration number
            for sentence in sentences:
                for pattern in patterns.values():
                    if re.search(pattern, sentence):
                        return [sentence]  # Return immediately if a registration number is found

        # If no registration number found, check for URLs in chunks
        matching_chunks = []
        for chunk in self.chunks:
            if any(url in chunk for url in registry_urls):
                matching_chunks.append(chunk)

        return matching_chunks
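A small, hypothetical example of the registration check in action:

search = KeywordSearch(chunks=[
    "The trial was prospectively registered (NCT01234567) before enrolment.",
    "See clinicaltrials.gov for the full protocol.",
])
print(search.check_registration())
# ['The trial was prospectively registered (NCT01234567) before enrolment.']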
class StringExtraction():
    """
    A class to handle extraction of query strings from complete LLM responses.

    This class encapsulates the functionality of extracting the original ground truth from a labelled-data
    CSV and query strings from responses. Note that LLMs may format answers differently depending on the
    model or the prompting technique; in such cases extract_original_prompt may not give satisfactory
    results, and it is then best to write your own string extraction method.

    Methods:
        extract_original_prompt(): Splits an LLM response into a binary answer and its reasoning.
        extraction_ground_truth(): Reads the binary and textual ground truth for a paper from labelled data.
    """

    def extract_original_prompt(self, result):
        r1 = result.response.strip().split("\n")
        binary_response = ""
        explanation_response = ""
        for r in r1:
            if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
                binary_response = r
            elif r.find("Reasoning:") >= 0:
                cut = r.find(":")
                explanation_response += r[cut + 1:].strip()

        return binary_response, explanation_response

    def extraction_ground_truth(self, paper_name, labelled_data):
        paper_id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
        id_row = labelled_data[labelled_data["id"] == paper_id]
        ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
        binary_ground_truth = []
        explanation_ground_truth = []
        for g in ground_truth:
            if len(g) > 0:
                binary_ground_truth.append("Yes")
                explanation_ground_truth.append(g)
            else:
                binary_ground_truth.append("No")
                explanation_ground_truth.append("The article does not provide any relevant information.")
        return binary_ground_truth, explanation_ground_truth


class EvaluationMetrics():
    """
    This class encapsulates the evaluation methods that have been used in the project.

    Attributes:
        explanation_response: a list of detailed responses from the LLM corresponding to each query.
        explanation_ground_truth: the list of ground truths corresponding to each query.

    Methods:
        metric_cosine_similarity(): Computes the cosine similarity between response and ground-truth embeddings.
        metric_rouge(): Computes ROUGE scores between responses and ground truths.
        binary_accuracy(): Computes the fraction of binary answers that match the ground truth.
    """

    def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
        self.explanation_response = explanation_response
        self.explanation_ground_truth = explanation_ground_truth
        self.embedding_model = embedding_model

    def metric_cosine_similarity(self):
        ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
        explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
        return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))

    def metric_rouge(self):
        rouge = evaluate.load("rouge")
        results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
        return results

    def binary_accuracy(self, binary_response, binary_ground_truth):
        count = 0
        if len(binary_response) != len(binary_ground_truth):
            return "Arrays to be compared have different lengths."
        else:
            for i in range(len(binary_response)):
                if binary_response[i] == binary_ground_truth[i]:
                    count += 1
            return np.round(count / len(binary_response), 2)
librarymed/local/__init__.py ADDED
File without changes

librarymed/local/app_local.py ADDED
@@ -0,0 +1,160 @@
import os

import openai
from flask import Flask, flash, request, render_template, redirect
from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.llms import OpenAI

from librarymed.local.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, \
    base_utils, ConfigManager

app = Flask(__name__)
app.config['SECRET_KEY'] = 'librarymed super secret key'

os.environ["TOKENIZERS_PARALLELISM"] = "false"
config_manager = ConfigManager()
config_manager.load_config("api", "Config/api_config.json")
config_manager.load_config("model", "Config/model_config.json")
app.config['user_config'] = config_manager


def allowed_file(filename, allowed_extensions):
    """Helper function to check whether the file extension is allowed."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions


@app.route('/', methods=['GET'])
def index():
    # Defaults shown before any file has been uploaded
    score = 0
    score_percentage = 0
    criteria_met = 0
    title = ""
    author_info = ""
    reasoning = ""

    return render_template('index.html',
                           title=title,
                           author=author_info,
                           score=score,
                           score_percentage=score_percentage,
                           criteria_met=criteria_met,
                           reasoning=reasoning,
                           )


@app.route('/upload', methods=['POST'])
def upload():
    config = app.config['user_config']
    openai.api_key = config.get_config_value("api", "OPENAI_API_KEY")
    hf_token = config.get_config_value("api", "HF_TOKEN")
    embed = config.get_config_value("model", "embeddings")
    embed_model_name = config.get_config_value("model", "embeddings_model")
    llm_model = config.get_config_value("model", "llm_model")
    model_temperature = config.get_config_value("model", "model_temp")
    output_token_size = config.get_config_value("model", "max_tokens")
    model_context_window = config.get_config_value("model", "context_window")
    gpt_prompt_path = config.get_config_value("model", "GPT_PROMPT_PATH")
    mistral_prompt_path = config.get_config_value("model", "MISTRAL_PROMPT_PATH")
    info_prompt_path = config.get_config_value("model", "INFO_PROMPT_PATH")
    peer_review_journals_path = config.get_config_value("model", "peer_review_journals_path")
    eq_network_journals_path = config.get_config_value("model", "eq_network_journals_path")
    queries = config.get_config_value("model", "queries")
    num_criteria = len(config.get_config_value("model", "criteria"))
    author_query = config.get_config_value("model", "author_query")
    journal_query = config.get_config_value("model", "journal_query")

    # Check that the POST request has the file part
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.url)
    file = request.files['file']
    # If the user does not select a file, the browser submits an empty part without a filename
    if file.filename == '':
        flash('No selected file')
        return redirect(request.url)
    if not (file and allowed_file(file.filename, config.get_config_value("model", "allowed_extensions"))):
        flash('File type not allowed; please upload a PDF.')
        return redirect(request.url)

    try:
        # Process the PDF file
        pdf_processor = PDFProcessor_Unstructured(config.get_config_value("model", "pdf_processing"))
        merged_chunks, tables, title = pdf_processor.process_pdf_file(file)
        documents = [Document(text=t) for t in merged_chunks]

        utils = base_utils()

        # LLM model choice
        if 'gpt' in llm_model.lower():  # TODO: tested with "gpt-4" and "gpt-3.5-turbo"
            llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)
            prompt_template = utils.read_from_file(gpt_prompt_path)

        elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
            if any(param is None for param in
                   [model_context_window, output_token_size, model_temperature, hf_token]):
                raise ValueError("Context window, output size, temperature, and HF token "
                                 "are all required for the Mixtral LLM.")

            llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
                             temperature=model_temperature, model_name=llm_model, api_key=hf_token)
            prompt_template = utils.read_from_file(mistral_prompt_path)

        else:
            raise NotImplementedError(f"Unsupported language model '{llm_model}'")

        # Embedding model choice for RAG
        if embed == "openai":
            embed_model = OpenAIEmbedding()
        elif embed == "huggingface":
            # Fall back to the default model when no name is provided
            embed_model_name = embed_model_name or "BAAI/bge-small-en-v1.5"
            embed_model = HuggingFaceEmbedding(embed_model_name)
        else:
            raise NotImplementedError(f"Unsupported embedding model: {embed}")

        # Prompts and queries
        info_prompt = utils.read_from_file(info_prompt_path)

        peer_review_journals = utils.read_from_file(peer_review_journals_path)
        eq_network_journals = utils.read_from_file(eq_network_journals_path)

        peer_review_journals_list = peer_review_journals.split('\n')
        eq_network_journals_list = eq_network_journals.split('\n')

        modified_journal_query = "Is the given research paper published in any of the following journals: " + \
            ", ".join(peer_review_journals_list) + "?"

        pdf_info_query = PDFQueryEngine(documents, llm, embed_model, info_prompt)
        info_query_engine = pdf_info_query.setup_query_engine()
        journal_result = info_query_engine.query(modified_journal_query).response
        author_info = info_query_engine.query(author_query).response

        pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, prompt_template)

        # Keyword-based checks: journal matching and prior registration
        nlp_methods = KeywordSearch(merged_chunks)
        eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
        peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
        registration_result = nlp_methods.check_registration()

        # Evaluate the paper against the criteria with the chosen LLM
        total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
            registration_result, peer_journal_result, eq_journal_result, queries)
        score = f"{round((total_score / num_criteria) * 100)}/100"

    except Exception as e:
        flash('An error occurred while processing the file. Error: ' + str(e))
        return redirect(request.url)

    # e.g. score: 56/100 - criteria_met: 5 - author_info: Direct
    return render_template('index.html',
                           title=title,
                           author=author_info,
                           score=score,
                           score_percentage=score_percentage,
                           criteria_met=criteria_met,
                           reasoning=reasoning,
                           )
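A quick way to exercise this route end to end is Flask's built-in test client (a sketch: "sample_paper.pdf" is a placeholder, and valid keys in Config/api_config.json are assumed):

# Sketch: posting a PDF to /upload with the test client.
from librarymed.local.app_local import app

with app.test_client() as client:
    with open("sample_paper.pdf", "rb") as f:
        response = client.post("/upload",
                               data={"file": (f, "sample_paper.pdf")},
                               content_type="multipart/form-data")
    print(response.status_code)  # 200 on success, 302 redirect on error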
librarymed/local/templates/index.html
ADDED
@@ -0,0 +1,187 @@
<!doctype html>
<html>
<head>
    <title>Upload and Results</title>
    <!-- Include Google Fonts -->
    <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Roboto', sans-serif;
            background-color: #f4f4f4;
            overflow: auto;
            width: 100%;
            margin: 0;
            padding: 0;
            display: flex;
            flex-direction: column; /* Stack flex items vertically */
            align-items: center; /* Center items horizontally */
            justify-content: flex-start; /* Align items to the start of the container vertically */
            min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
        }

        table {
            width: 100%;
            border-collapse: collapse; /* Collapse borders for a tighter look */
        }

        th, td {
            border: 1px solid #ddd;
            text-align: left;
            padding: 5px; /* Reduce padding to decrease cell spacing */
            height: 30px;
        }
        .parent-element {
            overflow: visible; /* Ensures content is not cut off */
        }
        .container {
            background-color: white;
            overflow: auto;
            border-radius: 8px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            padding: 40px;
            width: 100%;
            max-width: 700px;
        }
        .score-bar-container {
            position: relative;
            margin-top: 20px; /* Space above the score bar */
            max-width: 100%; /* Ensures the container does not exceed the parent width */
        }
        .score-very-good-fill {
            background-color: #4CAF50; /* Green */
        }

        .score-good-fill {
            background-color: #FFEB3B; /* Yellow */
        }

        .score-ok-fill {
            background-color: #FF9800; /* Orange */
        }

        .score-bad-fill {
            background-color: #f44336; /* Red */
        }

        .score-very-bad-fill {
            background-color: #9E9E9E; /* Grey */
        }
        .score-very-good-text {
            color: #4CAF50; /* Green */
        }

        .score-good-text {
            color: #FFEB3B; /* Yellow */
        }

        .score-ok-text {
            color: #FF9800; /* Orange */
        }

        .score-bad-text {
            color: #f44336; /* Red */
        }

        .score-very-bad-text {
            color: #9E9E9E; /* Grey */
        }

        .score-bar {
            background-color: #ddd;
            border-radius: 10px;
            height: 20px;
            width: 100%;
            display: inline-block; /* Allows the score text to sit next to the score bar */
            vertical-align: middle; /* Aligns score bar and text vertically */
        }

        .score-fill {
            height: 100%;
            border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
            display: inline-block;
            vertical-align: middle;
        }

        .score-text {
            display: inline-block;
            vertical-align: middle; /* Align with the score bar */
            font-weight: bold;
            margin-left: 10px; /* Space between the score bar and score text */
        }

        .score-title {
            font-size: 20px;
            font-weight: bold;
            margin: 20px 0;
            color: #333;
        }
        .major-issues {
            text-align: left; /* Aligns the major issues to the left */
            padding-left: 20px; /* Padding for the bullet list */
            list-style: inside disc; /* Bullet style */
        }
        form {
            margin-bottom: 20px;
        }
        input[type="file"] {
            margin-bottom: 10px;
        }
        input[type="submit"] {
            cursor: pointer;
            margin-top: 10px;
            padding: 10px 20px;
            border: none;
            background-color: #4CAF50;
            color: white;
            border-radius: 5px;
            font-size: 16px;
            font-weight: bold;
        }
        input[type="submit"]:hover {
            background-color: #45a049;
        }
    </style>
</head>
<body>
<div class="container">
    <h2>Upload PDF and View Results</h2>

    <!-- Upload Form -->
    <form action="/upload" method="post" enctype="multipart/form-data">
        <input type="file" name="file" required>
        <input type="submit" value="Upload">
    </form>

    <!-- Results Section -->
    {% if score %}
    <!-- Score Bar -->
    <div class="score-title">Score:</div>
    <div class="score-bar-container">
        <div class="score-bar">
            <div class="score-fill {{
                'score-very-good-fill' if criteria_met == 9 else
                'score-good-fill' if criteria_met >= 7 else
                'score-ok-fill' if criteria_met >= 5 else
                'score-bad-fill' if criteria_met >= 3 else
                'score-very-bad-fill' }}" style="width: {{ score_percentage }}%;"></div>
        </div>
        <div class="score-text">{{ score }}</div>
    </div>

    <h3>Title:</h3>
    <p>{{ title }}</p>

    <h3>Author Information:</h3>
    <p>{{ author }}</p>

    <h3>Reasoning:</h3>
    <ul class="major-issues">
        {% for issue in reasoning %}
        <li>{{ issue }}</li>
        {% endfor %}
    </ul>

    {% endif %}
</div>
</body>
</html>
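The template can be previewed with dummy data, without running the full RAG pipeline, by rendering it inside an application context (a sketch; all context values below are made up for illustration):

# Sketch: rendering index.html with dummy values for a visual check.
from flask import Flask, render_template

preview_app = Flask(__name__, template_folder="librarymed/local/templates")

with preview_app.app_context():
    html = render_template("index.html",
                           title="Example study",
                           author="Direct",
                           score="67/100",
                           score_percentage=67,
                           criteria_met=6,
                           reasoning=["Data and code are shared.",
                                      "No prior registration was found."])
print(html[:120])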
librarymed/local/templates/upload_and_results.html
ADDED
@@ -0,0 +1,227 @@
<!doctype html>
<html>
<head>
    <title>Upload and Results</title>
    <!-- Include Google Fonts -->
    <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Roboto', sans-serif;
            background-color: #f4f4f4;
            overflow: auto;
            width: 100%;
            margin: 0;
            padding: 0;
            display: flex;
            flex-direction: column; /* Stack flex items vertically */
            align-items: center; /* Center items horizontally */
            justify-content: flex-start; /* Align items to the start of the container vertically */
            min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
        }

        table {
            width: 100%;
            border-collapse: collapse; /* Collapse borders for a tighter look */
        }

        th, td {
            border: 1px solid #ddd;
            text-align: left;
            padding: 5px; /* Reduce padding to decrease cell spacing */
            height: 30px;
        }
        .parent-element {
            overflow: visible; /* Ensures content is not cut off */
        }
        .container {
            background-color: white;
            overflow: auto;
            border-radius: 8px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            padding: 40px;
            width: 100%;
            max-width: 700px;
        }
        .score-bar-container {
            position: relative;
            margin-top: 20px; /* Space above the score bar */
            max-width: 100%; /* Ensures the container does not exceed the parent width */
        }
        .score-very-good-fill {
            background-color: #4CAF50; /* Green */
        }

        .score-good-fill {
            background-color: #FFEB3B; /* Yellow */
        }

        .score-ok-fill {
            background-color: #FF9800; /* Orange */
        }

        .score-bad-fill {
            background-color: #f44336; /* Red */
        }

        .score-very-bad-fill {
            background-color: #9E9E9E; /* Grey */
        }
        .score-very-good-text {
            color: #4CAF50; /* Green */
        }

        .score-good-text {
            color: #FFEB3B; /* Yellow */
        }

        .score-ok-text {
            color: #FF9800; /* Orange */
        }

        .score-bad-text {
            color: #f44336; /* Red */
        }

        .score-very-bad-text {
            color: #9E9E9E; /* Grey */
        }

        .score-bar {
            background-color: #ddd;
            border-radius: 10px;
            height: 20px;
            width: 100%;
            display: inline-block; /* Allows the score text to sit next to the score bar */
            vertical-align: middle; /* Aligns score bar and text vertically */
        }

        .score-fill {
            height: 100%;
            border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
            display: inline-block;
            vertical-align: middle;
        }

        .score-text {
            display: inline-block;
            vertical-align: middle; /* Align with the score bar */
            font-weight: bold;
            margin-left: 10px; /* Space between the score bar and score text */
        }

        .score-title {
            font-size: 20px;
            font-weight: bold;
            margin: 20px 0;
            color: #333;
        }
        .major-issues {
            text-align: left; /* Aligns the major issues to the left */
            padding-left: 20px; /* Padding for the bullet list */
            list-style: inside disc; /* Bullet style */
        }
        form {
            margin-bottom: 20px;
        }
        input[type="file"] {
            margin-bottom: 10px;
        }
        input[type="submit"] {
            cursor: pointer;
            margin-top: 10px;
            padding: 10px 20px;
            border: none;
            background-color: #4CAF50;
            color: white;
            border-radius: 5px;
            font-size: 16px;
            font-weight: bold;
        }
        input[type="submit"]:hover {
            background-color: #45a049;
        }
    </style>
</head>
<body>
<div class="container">
    <h2>Upload PDF and View Results</h2>

    <!-- Upload Form -->
    <form action="/upload" method="post" enctype="multipart/form-data">
        <input type="file" name="file" required>
        <input type="submit" value="Upload">
    </form>

    <!-- Results Section -->
    {% if total_score_gpt4 is not none or total_score_mistral is not none %}
    <!-- GPT-4 Score Bar -->
    <div class="score-title">Score for GPT-4:</div>
    <div class="score-bar-container">
        <div class="score-bar">
            <div class="score-fill {{
                'score-very-good-fill' if criteria_met_gpt4 == 9 else
                'score-good-fill' if criteria_met_gpt4 >= 7 else
                'score-ok-fill' if criteria_met_gpt4 >= 5 else
                'score-bad-fill' if criteria_met_gpt4 >= 3 else
                'score-very-bad-fill' }}" style="width: {{ score_percentage_gpt4 }}%;"></div>
        </div>
        <div class="score-text">{{ total_score_gpt4 }}/9</div>
    </div>

    <!-- Mistral Score Bar -->
    <div class="score-title">Score for Mistral:</div>
    <div class="score-bar-container">
        <div class="score-bar">
            <div class="score-fill {{
                'score-very-good-fill' if criteria_met_mistral == 9 else
                'score-good-fill' if criteria_met_mistral >= 7 else
                'score-ok-fill' if criteria_met_mistral >= 5 else
                'score-bad-fill' if criteria_met_mistral >= 3 else
                'score-very-bad-fill' }}" style="width: {{ score_percentage_mistral }}%;"></div>
        </div>
        <div class="score-text">{{ total_score_mistral }}/9</div>
    </div>

    <!-- Reasoning for GPT-4 -->
    <h3>Reasoning from GPT-4:</h3>
    <ul class="major-issues">
        {% for issue in reasoning_gpt4 %}
        <li>{{ issue }}</li>
        {% endfor %}
    </ul>

    <!-- Reasoning for Mistral -->
    <h3>Reasoning from Mistral:</h3>
    <ul class="major-issues">
        {% for issue in reasoning_mistral %}
        <li>{{ issue }}</li>
        {% endfor %}
    </ul>
    <!-- Criteria Table -->
    {% if combined_criteria_table %}
    <h3>Criteria Evaluation</h3>
    <table>
        <thead>
        <tr>
            <th>Criteria Number</th>
            <th>GPT-4 output</th>
            <th>Mistral output</th>
            <th>Ground truth</th>
        </tr>
        </thead>
        <tbody>
        {% for row in combined_criteria_table %}
        <tr>
            <td>{{ row['Criteria Number'] }}</td>
            <td>{{ 'Yes' if row['Score GPT-4'] == 1 else 'No' }}</td>
            <td>{{ 'Yes' if row['Score Mistral'] == 1 else 'No' }}</td>
            <td>{{ 'Yes' if row['ground truth'] else 'No' }}</td>
        </tr>
        {% endfor %}
        </tbody>
    </table>
    {% endif %}
    {% endif %}
</div>
</body>
</html>
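This comparison template expects per-model scores plus an optional combined_criteria_table. Inferring from the variable names above (an assumption, not a documented contract), each table row would look like:

# Assumed shape of one combined_criteria_table row, inferred from the
# template; the key names must match exactly.
row = {
    "Criteria Number": 1,
    "Score GPT-4": 1,      # 1 renders as "Yes"
    "Score Mistral": 0,    # 0 renders as "No"
    "ground truth": True,
}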
librarymed/main.py
ADDED
@@ -0,0 +1,22 @@
import argparse
import logging
import os

from dotenv import load_dotenv

load_dotenv()
logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    args_parse = argparse.ArgumentParser(description="LibraryMed")
    args_parse.add_argument("--local", help="Run interface v0.1.0 by the fellows", action="store_true")
    args = args_parse.parse_args()
    port = int(os.getenv("PORT") or 80)

    if args.local:
        from .local.app_local import app

        logging.info("Running LibraryMed interface v0.1.0 developed by the fellows")
        app.run(debug=True, host="0.0.0.0", port=port)

    else:
        from .kromin.app_librarymed import app

        logging.info("Running LibraryMed interface v0.2.0 developed by Kromin")
        app.run(debug=True, host="0.0.0.0", port=port)
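Because the entrypoint uses relative imports (`from .local.app_local import app`), it has to be launched as a module from the repository root, e.g. `python -m librarymed.main --local` for the fellows' v0.1.0 interface, or without the flag for the Kromin v0.2.0 interface. The `PORT` environment variable (loaded via python-dotenv) overrides the default port 80.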
requirements.txt
ADDED
@@ -0,0 +1,41 @@
beautifulsoup4
chromadb
cohere
faiss-cpu
Flask
langchain
langchainhub
gradio
llama-index==0.9.35
llmsherpa
lxml
unstructured
evaluate
numpy
openai
Pillow==10.0.1
PyPDF2
pydantic
rank-bm25
requests
rapidocr-onnxruntime
rouge-score
scikit-learn
sentence-transformers
tiktoken
transformers
tesseract
pdf2image
pdfminer.six
opencv-python
pikepdf
pypdf
unstructured-inference
pytesseract
pillow-heif
unstructured-pytesseract
fpdf
qdrant_client
python-dotenv
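Note that the `llama-index==0.9.35` pin is load-bearing: the apps import from the pre-0.10 namespace (`from llama_index import Document`, `llama_index.embeddings`, `llama_index.llms`), and llama-index 0.10+ moved these modules under `llama_index.core`, so raising the pin would break those imports.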