Spaces:

frasan
/

test-flask-app

Sleeping

App Files Files Community

frasan committed on Mar 7, 2024

Commit

7f8175c

1 Parent(s): 2684c41

changed files

Browse files

Files changed (16) hide show

Dockerfile +1 -1
librarymed/{kromin/RAG_utils.py → RAG_utils.py} +0 -0
librarymed/app.py +22 -0
librarymed/{kromin/app_librarymed.py → app_librarymed.py} +2 -2
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Bold.ttf +0 -0
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Oblique.ttf +0 -0
librarymed/huggingface/DejaVu/DejaVuSansCondensed.ttf +0 -0
librarymed/huggingface/DejaVu/readme.txt +0 -40
librarymed/huggingface/RAG_utils_huggingface.py +0 -995
librarymed/huggingface/app_huggingface.py +0 -304
librarymed/local/RAG_utils.py +0 -979
librarymed/local/__init__.py +0 -0
librarymed/local/app_local.py +0 -160
librarymed/local/templates/index.html +0 -187
librarymed/local/templates/upload_and_results.html +0 -227
librarymed/main.py +6 -16

Dockerfile CHANGED Viewed

@@ -41,4 +41,4 @@ EXPOSE 80
 ENV NAME World
 # Command to run on container start
-CMD ["uvicorn", "librarymed/main:app", "--host", "0.0.0.0", "--port", "7860"]

 ENV NAME World
 # Command to run on container start
+CMD ["uvicorn", "librarymed.main:app", "--host", "0.0.0.0", "--port", "80"]

librarymed/{kromin/RAG_utils.py → RAG_utils.py} RENAMED Viewed

File without changes

librarymed/app.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import argparse
+import logging
+import os
+from dotenv import load_dotenv
+load_dotenv()
if __name__ == '__main__':
    # Command-line entry point: choose which LibraryMed web interface to serve.
    args_parse = argparse.ArgumentParser(description="LibraryMed")
    args_parse.add_argument("--local", help="Run interface v0.1.0 by the fellows", action="store_true")
    args = args_parse.parse_args()

    # PORT comes from the environment as a string; convert it once so the
    # server always receives an integer, and fail with a readable message
    # instead of a traceback when it is not numeric.
    raw_port = os.getenv("PORT") or 80
    try:
        port = int(raw_port)
    except ValueError:
        args_parse.error(f"PORT must be an integer, got {raw_port!r}")

    # The server binds to every interface (0.0.0.0). Running the interactive
    # debugger there lets any client execute code, so debug mode is opt-in
    # through FLASK_DEBUG rather than always on.
    debug = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}

    if args.local:
        # NOTE(review): the `local` package is imported lazily and may be
        # missing from the deployed tree - report that as a CLI error.
        try:
            from local.app_local import app
        except ImportError as exc:
            args_parse.error(f"The --local interface is not available: {exc}")
        banner = "Run LibraryMed interface v0.1.0 developed by the fellows"
    else:
        from librarymed.app_librarymed import app
        banner = "Run LibraryMed interface v0.2.0 developed by Kromin"

    logging.info(banner)
    app.run(debug=debug, host="0.0.0.0", port=port)

librarymed/{kromin/app_librarymed.py → app_librarymed.py} RENAMED Viewed

@@ -7,8 +7,8 @@ from llama_index import Document
 from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
 from llama_index.llms import OpenAI
-from kromin.RAG_utils import ConfigManager
-from kromin.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils
 from dotenv import load_dotenv
 load_dotenv()

 from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
 from llama_index.llms import OpenAI
+from librarymed.RAG_utils import ConfigManager
+from librarymed.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils
 from dotenv import load_dotenv
 load_dotenv()

librarymed/huggingface/DejaVu/DejaVuSansCondensed-Bold.ttf DELETED Viewed

Binary file (632 kB)

librarymed/huggingface/DejaVu/DejaVuSansCondensed-Oblique.ttf DELETED Viewed

Binary file (576 kB)

librarymed/huggingface/DejaVu/DejaVuSansCondensed.ttf DELETED Viewed

Binary file (644 kB)

librarymed/huggingface/DejaVu/readme.txt DELETED Viewed

@@ -1,40 +0,0 @@
-Congratulations, you have successfully downloaded font file!
-This font is provided to you by Fonts2u.com – the largest online
-repository of free fonts for Windows and Mac.
-How to install this font on your computer?
-For Windows 7 / Vista users:
-- Right-click the font file(s) and choose "Install".
-For users of the previous Windows versions:
-- Copy the included file(s) into a default Windows font folder
-  (usually C:\WINDOWS\FONTS or C:\WINNT\FONTS)
-For Mac users:
-Mac OS X 10.3 or above (including the FontBook)
-- Double-click the font file and hit "Install font" button at
-  the bottom of the preview.
-Mac OS X
-- Either copy the font file(s) to /Library/Fonts (for all users),
-  or to /Users/Your_username/Library/Fonts (for you only).
-Mac OS 9 or earlier
-- You have to convert the font file(s) you have downloaded.
-  Drag the font suitcases into the System folder. The system
-  will propose you to add them to the Fonts folder.
-For Linux users:
-- Copy the font file(s) to /USR/SHARE/FONTS

librarymed/huggingface/RAG_utils_huggingface.py DELETED Viewed

@@ -1,995 +0,0 @@
-import os
-import re
-import json
-import torch
-import openai
-import logging
-import asyncio
-import aiohttp
-import pandas as pd
-import numpy as np
-import evaluate
-import qdrant_client
-from pypdf import PdfReader
-from pydantic import BaseModel, Field
-from typing import Any, List, Tuple, Set, Dict, Optional, Union
-from sklearn.metrics.pairwise import cosine_similarity
-from unstructured.partition.pdf import partition_pdf
-import llama_index
-from llama_index import PromptTemplate
-from llama_index.retrievers import VectorIndexRetriever, BaseRetriever, BM25Retriever
-from llama_index.query_engine import RetrieverQueryEngine
-from llama_index import get_response_synthesizer
-from llama_index.schema import NodeWithScore
-from llama_index.query_engine import RetrieverQueryEngine
-from llama_index import VectorStoreIndex, ServiceContext
-from llama_index.embeddings import OpenAIEmbedding
-from llama_index.llms import HuggingFaceLLM
-import requests
-from llama_index.llms import (
-    CustomLLM,
-    CompletionResponse,
-    CompletionResponseGen,
-    LLMMetadata,
-)
-from llama_index.query_engine import RetrieverQueryEngine
-from llama_index.llms.base import llm_completion_callback
-from llama_index.vector_stores.qdrant import QdrantVectorStore
-from llama_index.storage.storage_context import StorageContext
-from llama_index.postprocessor import SentenceTransformerRerank, LLMRerank
-from tempfile import NamedTemporaryFile
-# Configure basic logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-# Create a logger object
-logger = logging.getLogger(__name__)
class ConfigManager:
    """
    Loads named JSON configuration files and serves individual values from them.

    Attributes:
        configs (dict): Maps a configuration name to the parsed content of its JSON file.

    Methods:
        load_config(config_name: str, config_path: str): Loads a JSON file under the given name.
        get_config_value(config_name: str, key: str): Retrieves one value from a named configuration.
    """

    def __init__(self):
        self.configs = {}

    def load_config(self, config_name: str, config_path: str) -> None:
        """
        Loads configuration settings from a JSON file and stores them under ``config_name``.

        Loading the same name again replaces the previously stored settings.

        Args:
            config_name (str): The name to assign to this set of configurations.
            config_path (str): The path to the JSON configuration file.

        Raises:
            FileNotFoundError: If the config file is not found.
            json.JSONDecodeError: If there is an error parsing the config file.
        """
        try:
            # JSON text is UTF-8; do not rely on the platform's default encoding.
            with open(config_path, 'r', encoding='utf-8') as f:
                self.configs[config_name] = json.load(f)
        except FileNotFoundError:
            logging.error(f"Config file not found at {config_path}")
            raise
        except json.JSONDecodeError as e:
            logging.error(f"Error decoding config file: {e}")
            raise

    def get_config_value(self, config_name: str, key: str) -> Any:
        """
        Retrieves a specific value from a previously loaded configuration.

        Args:
            config_name (str): The name the configuration was loaded under.
            key (str): The key for the configuration setting.

        Returns:
            Any: The stored value, exactly as parsed from the JSON file.

        Raises:
            ValueError: If the configuration or key is missing, the value is null,
                or the value is still the "ENTER_YOUR_TOKEN_HERE" placeholder.
        """
        # An unknown config_name behaves like an empty configuration, so it
        # surfaces as the same ValueError as a missing key.
        value = self.configs.get(config_name, {}).get(key)
        if value is None or value == "ENTER_YOUR_TOKEN_HERE":
            raise ValueError(f"Please set your '{key}' in the config.json file.")
        return value
class base_utils:
    """
    A stateless collection of static helpers for text and filename processing.

    Methods:
        read_from_file(file_path: str) -> str:
            Reads the content of a file and returns it as a string.
        extract_id_from_filename(filename: str) -> Optional[int]:
            Extracts the numeric ID from a filename containing 'Id_<digits>.pdf'.
        extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
            Extracts a score and reasoning from a given text using regular expressions.
        find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.DataFrame, str]:
            Selects the dataframe rows whose first column equals the ID in a PDF filename.
    """

    @staticmethod
    def read_from_file(file_path: str) -> str:
        """
        Reads the content of a file and returns it as a string.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The content of the file.
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as prompt_file:
            return prompt_file.read()

    @staticmethod
    def extract_id_from_filename(filename: str) -> Optional[int]:
        """
        Extracts an ID from a filename containing 'Id_{I}.pdf', where {I} is a run of digits.

        Args:
            filename (str): The filename from which to extract the ID.

        Returns:
            Optional[int]: The extracted ID as an integer, or None if the pattern is not found.
        """
        # The dot is escaped so that only a literal ".pdf" matches
        # (an unescaped dot would also accept e.g. "Id_7xpdf").
        match = re.search(r'Id_(\d+)\.pdf', filename)
        return int(match.group(1)) if match else None

    @staticmethod
    def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
        """
        Extracts the score and the longest reasoning from a given text using regular expressions.

        Args:
            text (str): The text from which to extract the score and reasoning.

        Returns:
            dict: 'score' holds the digits following "Score: " as a string and
                'reasoning' holds the stripped text following "Reasoning: ";
                either is None when not found.
        """
        score_pattern = r"Score: (\d+)"
        reasoning_pattern = r"Reasoning: (\S.+)"

        score_match = re.search(score_pattern, text)
        # With re.DOTALL the greedy ".+" runs to the end of the text, so this
        # normally yields a single match; max() keeps the documented
        # "longest" choice should the pattern ever be made non-greedy.
        reasoning_matches = re.findall(reasoning_pattern, text, re.DOTALL)
        longest_reasoning = max(reasoning_matches, key=len) if reasoning_matches else None

        return {
            "score": score_match.group(1) if score_match else None,
            "reasoning": longest_reasoning.strip() if longest_reasoning else None,
        }

    @staticmethod
    def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.DataFrame, str]:
        """
        Finds the rows in a dataframe corresponding to the ID extracted from a given PDF filename.

        Args:
            pdf_filename (str): The filename of the PDF.
            dataframe (pandas.DataFrame): The dataframe to search; its first column is compared to the ID.

        Returns:
            pandas.DataFrame or str: The matching rows, or a message indicating
                that no matching row or an invalid filename was found.
        """
        # Call the sibling helper through this class (the previous code
        # referenced an undefined name and raised NameError).
        pdf_id = base_utils.extract_id_from_filename(pdf_filename)
        if pdf_id is None:
            return "Invalid file name."

        # Assuming the first column contains the ID
        matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
        if matched_row.empty:
            return "No matching row found."
        return matched_row
class PDFProcessor_Unstructured:
    """
    A class to process PDF files, providing functionalities for extracting, categorizing,
    and merging elements from a PDF file.

    One instance can be reused for several files: each call to ``process_pdf_file``
    points the instance at the new file and starts from empty result lists.

    Attributes:
        file_path (str): The full path to the PDF file currently being processed.
        folder_path (str): The directory path where the PDF file is located.
        file_name (str): The base name of the PDF file.
        texts (List[str]): Text chunks extracted from the current PDF.
        tables (List[str]): Tables extracted from the current PDF.
        config (Dict[str, Any]): Processing options supplied by the caller, or the defaults.

    Methods:
        extract_pdf_elements() -> List:
            Extracts tables and text chunks from the PDF at ``file_path``.
        categorize_elements(raw_pdf_elements: List) -> None:
            Categorizes extracted elements from a PDF into tables and texts.
        merge_chunks() -> List[str]:
            Merges text chunks based on punctuation and character case criteria.
        should_skip_chunk(chunk: str) -> bool:
            Determines if a chunk should be skipped based on its content.
        should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
            Determines if the current chunk should be merged with the next one.
        extract_title_from_pdf(uploaded_file) -> str:
            Reads the title from the PDF metadata.
        process_pdf() -> Tuple[List[str], List[str]]:
            Processes the PDF by extracting, categorizing, and merging elements.
        process_pdf_file(uploaded_file) -> Tuple[List[str], List[str], str]:
            Processes an uploaded PDF file and returns text chunks, tables and title.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.file_path = None
        self.folder_path = None
        self.file_name = None
        self.texts = []
        self.tables = []
        self.config = config if config is not None else self.default_config()
        # No file is attached yet at construction time, so none is reported.
        logger.info("Initialized PDFProcessor_Unstructured.")

    @staticmethod
    def default_config() -> Dict[str, Any]:
        """
        Returns the default configuration for PDF processing.

        Returns:
            Dict[str, Any]: Default configuration options.
        """
        return {
            "extract_images": False,
            "infer_table_structure": True,
            "chunking_strategy": "by_title",
            "max_characters": 10000,
            "combine_text_under_n_chars": 100,
            "strategy": "fast",
            "model_name": "yolox"
        }

    def extract_pdf_elements(self) -> List:
        """
        Extracts tables and text chunks from the PDF at ``self.file_path``.

        Returns:
            List: A list of extracted elements from the PDF.

        Raises:
            Exception: Any error raised by the partitioner is logged and re-raised.
        """
        logger.info("Starting extraction of PDF elements.")
        try:
            # NOTE(review): these options are fixed here and mirror
            # default_config(); self.config is not consulted - confirm
            # whether caller-supplied options are meant to take effect.
            extracted_elements = partition_pdf(
                filename=self.file_path,
                extract_images_in_pdf=False,
                infer_table_structure=True,
                chunking_strategy="by_title",
                strategy="fast",
                max_characters=10000,
                combine_text_under_n_chars=100,
                image_output_dir_path=self.folder_path,
            )
            logger.info("Extraction of PDF elements completed successfully.")
            return extracted_elements
        except Exception as e:
            logger.error(f"Error extracting PDF elements: {e}", exc_info=True)
            raise

    def categorize_elements(self, raw_pdf_elements: List) -> None:
        """
        Categorizes extracted elements from a PDF into tables and texts.

        Elements are appended to ``self.tables`` / ``self.texts`` as strings;
        elements of any other type are ignored.

        Args:
            raw_pdf_elements (List): A list of elements extracted from the PDF.
        """
        logger.debug("Starting categorization of PDF elements.")
        for element in raw_pdf_elements:
            # Matching on the type's printed name avoids importing the element classes.
            element_type = str(type(element))
            if "unstructured.documents.elements.Table" in element_type:
                self.tables.append(str(element))
            elif "unstructured.documents.elements.CompositeElement" in element_type:
                self.texts.append(str(element))
        logger.debug("Categorization of PDF elements completed.")

    def merge_chunks(self) -> List[str]:
        """
        Merges text chunks based on punctuation and character case criteria.

        Returns:
            List[str]: A list of merged text chunks (empty when there are no texts).
        """
        logger.debug("Starting merging of text chunks.")
        merged_chunks = self._merge_texts(self.texts)
        logger.debug("Merging of text chunks completed.")
        return merged_chunks

    @classmethod
    def _merge_texts(cls, texts: List[str]) -> List[str]:
        """
        Drops skippable chunks and joins each chunk with its continuation(s).

        Every chunk appears at most once in the output: a chunk that was
        appended to its predecessor is not emitted again on its own, and the
        final chunk is always flushed.

        Args:
            texts (List[str]): Text chunks in document order.

        Returns:
            List[str]: The surviving chunks, with continuations joined by a single space.
        """
        merged = []
        pending = None  # chunk still open for a possible continuation
        for chunk in texts:
            if cls.should_skip_chunk(chunk):
                continue
            if pending is not None and cls.should_merge_with_next(pending, chunk):
                pending = pending + " " + chunk
                continue
            if pending is not None:
                merged.append(pending)
            pending = chunk
        if pending is not None:
            merged.append(pending)
        return merged

    @staticmethod
    def should_skip_chunk(chunk: str) -> bool:
        """
        Determines if a chunk should be skipped based on its content.

        A chunk is skipped when it is empty, starts with "figure"/"fig"/"table"
        (case-insensitive), starts with a non-alphanumeric character, or starts
        with digits followed by a period.

        Args:
            chunk (str): The text chunk to be evaluated.

        Returns:
            bool: True if the chunk should be skipped, False otherwise.
        """
        if not chunk:
            return True
        return bool(chunk.lower().startswith(("figure", "fig", "table")) or
                    not chunk[0].isalnum() or
                    re.match(r'^\d+\.', chunk))

    @staticmethod
    def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
        """
        Determines if the current chunk should be merged with the next one.

        Chunks are merged when the current one ends with a comma, or when it
        ends with a lowercase character and the next one starts with one.

        Args:
            current_chunk (str): The current text chunk.
            next_chunk (str): The next text chunk.

        Returns:
            bool: True if the chunks should be merged, False otherwise.
        """
        if not current_chunk or not next_chunk:
            return False
        return (current_chunk.endswith(",") or
                (current_chunk[-1].islower() and next_chunk[0].islower()))

    def extract_title_from_pdf(self, uploaded_file) -> str:
        """
        Extracts the title from a PDF file's metadata.

        The metadata is read with pypdf's ``PdfReader``.

        Args:
            uploaded_file: A path or binary file object accepted by ``PdfReader``.

        Returns:
            str: The title of the PDF file. If no title is found, returns
                'Title not found'.
        """
        pdf_reader = PdfReader(uploaded_file)
        meta = pdf_reader.metadata
        return meta.title if meta and meta.title else 'Title not found'

    def process_pdf(self) -> Tuple[List[str], List[str]]:
        """
        Processes the PDF by extracting, categorizing, and merging elements.

        Returns:
            Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.

        Raises:
            Exception: Any processing error is logged and re-raised.
        """
        logger.info("Starting processing of the PDF.")
        try:
            raw_pdf_elements = self.extract_pdf_elements()
            # Start from empty lists so results of a previously processed
            # file do not leak into this one when the instance is reused.
            self.texts = []
            self.tables = []
            self.categorize_elements(raw_pdf_elements)
            merged_chunks = self.merge_chunks()
            return merged_chunks, self.tables
        except Exception as e:
            logger.error(f"Error processing PDF: {e}", exc_info=True)
            raise

    def process_pdf_file(self, uploaded_file) -> Tuple[List[str], List[str], str]:
        """
        Process an uploaded PDF file.

        If a different file was processed before, that earlier file is deleted
        from disk. The method then updates the file path, processes the PDF,
        and returns the results.

        Args:
            uploaded_file: Path (or path-like object) of the new PDF file.

        Returns:
            Tuple[List[str], List[str], str]: Merged text chunks, tables, and the PDF title.

        Raises:
            Exception: Any processing error is logged and re-raised.
        """
        new_path = str(uploaded_file)
        previous_path = self.file_path

        # Remove the previous upload, but never the file we are about to read
        # (the same path may be submitted twice in a row).
        if previous_path and previous_path != new_path and os.path.exists(previous_path):
            try:
                os.remove(previous_path)
                logger.debug(f"Previous file {previous_path} deleted.")
            except OSError as e:
                # Best effort: a stale file must not block the new upload.
                logger.warning(f"Error deleting previous file: {e}", exc_info=True)

        self.file_path = new_path
        self.folder_path = os.path.dirname(self.file_path)
        self.file_name = os.path.basename(self.file_path)
        logger.info(f"Starting to process the PDF file: {self.file_path}")

        try:
            logger.debug(f"Processing PDF at {self.file_path}")
            results = self.process_pdf()
            title = self.extract_title_from_pdf(self.file_path)
            logger.info("PDF processing completed successfully.")
            return (*results, title)
        except Exception as e:
            logger.error(f"Error processing PDF file: {e}", exc_info=True)
            raise
class HybridRetriever(BaseRetriever):
    """
    Retriever that queries a BM25 retriever and a vector retriever and returns
    the union of their results without duplicates.

    Attributes:
        vector_retriever: The vector-based retriever to query.
        bm25_retriever: The BM25 retriever to query.

    Methods:
        _retrieve(query, **kwargs): Queries both retrievers and merges their results.
        _combine_results(bm25_nodes, vector_nodes): De-duplicates the merged results by node id.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        super().__init__()
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        logger.info("HybridRetriever initialized with vector and BM25 retrievers.")

    def _retrieve(self, query: str, **kwargs) -> List:
        """
        Runs the query against both retrievers and merges the results.

        Args:
            query: The query passed on to both retrievers.
            **kwargs: Extra keyword arguments forwarded to both ``retrieve`` calls.

        Returns:
            List: Unique nodes, BM25 results first, then vector results.

        Raises:
            Exception: Any retrieval error is logged and re-raised.
        """
        logger.info(f"Retrieving documents for query: {query}")
        try:
            keyword_hits = self.bm25_retriever.retrieve(query, **kwargs)
            semantic_hits = self.vector_retriever.retrieve(query, **kwargs)
            merged = self._combine_results(keyword_hits, semantic_hits)
            logger.info(f"Retrieved {len(merged)} unique nodes combining vector and BM25 retrievers.")
        except Exception as e:
            logger.error(f"Error in retrieval: {e}")
            raise
        return merged

    @staticmethod
    def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
        """
        Concatenates both result lists and keeps the first node seen for each node id.

        Args:
            bm25_nodes: Nodes retrieved from BM25 retriever.
            vector_nodes: Nodes retrieved from vector retriever.

        Returns:
            List: Unique nodes in first-seen order.
        """
        # A dict keeps insertion order, and setdefault never overwrites,
        # so the earliest node for every id wins.
        first_by_id = {}
        for candidate in bm25_nodes + vector_nodes:
            first_by_id.setdefault(candidate.node_id, candidate)
        return list(first_by_id.values())
class PDFQueryEngine:
    """
    A class to handle the process of setting up a query engine and scoring PDF documents with it.

    Attributes:
        documents (List): A list of documents to be indexed.
        llm: The language model used to answer queries.
        embed_model: The embedding model used to index the documents.
        qa_prompt_tmpl: Template text for the question-answering prompt.
        base_utils: Helper used to parse scores and reasoning out of LLM responses.
        config_manager: Loader for JSON configuration files.

    Methods:
        format_example(example): Formats a few-shot example into a string.
        setup_query_engine(): Builds the query engine over ``documents``.
        evaluate_with_llm(...): Runs the queries and aggregates the scores.
    """

    def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
        self.documents = documents
        self.llm = llm
        self.embed_model = embed_model
        self.qa_prompt_tmpl = qa_prompt_tmpl
        self.base_utils = base_utils()
        self.config_manager = ConfigManager()
        logger.info("PDFQueryEngine initialized.")

    def format_example(self, example):
        """
        Formats a few-shot example into a string.

        Args:
            example (dict): A dictionary containing 'query', 'score', and 'reasoning' for the few-shot example.

        Returns:
            str: Formatted few-shot example text.
        """
        return "Example:\nQuery: {}\nScore: {}\nReasoning: {}\n".format(
            example['query'], example['score'], example['reasoning']
        )

    def setup_query_engine(self):
        """
        Builds a query engine over ``self.documents``.

        The documents are indexed into an in-memory Qdrant collection, retrieved
        through a hybrid (vector + BM25) retriever, reranked, and answered with a
        response synthesizer that uses ``self.qa_prompt_tmpl``.

        Returns:
            Any: The configured query engine.

        Raises:
            Exception: Any setup error is logged and re-raised.
        """
        client = qdrant_client.QdrantClient(
            # you can use :memory: mode for fast and light-weight experiments,
            # it does not require to have Qdrant deployed anywhere
            # but requires qdrant-client >= 1.1.1
            location=":memory:"
            # otherwise set Qdrant instance address with:
            # uri="http://<host>:<port>"
            # set API KEY for Qdrant Cloud
            # api_key="<qdrant-api-key>",
        )
        try:
            logger.info("Initializing the service context for query engine setup.")
            service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
            vector_store = QdrantVectorStore(client=client, collection_name="med_library")
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            logger.info("Creating an index from documents.")
            index = VectorStoreIndex.from_documents(documents=self.documents, storage_context=storage_context, service_context=service_context)
            # The BM25 retriever works on nodes, so parse the documents once more for it.
            nodes = service_context.node_parser.get_nodes_from_documents(self.documents)

            logger.info("Setting up vector and BM25 retrievers.")
            vector_retriever = index.as_retriever(similarity_top_k=3)
            bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)
            hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

            logger.info("Configuring the response synthesizer with the prompt template.")
            qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
            response_synthesizer = get_response_synthesizer(
                service_context=service_context,
                text_qa_template=qa_prompt,
                response_mode="compact",
            )

            logger.info("Assembling the query engine with reranker and synthesizer.")
            reranker = SentenceTransformerRerank(top_n=3, model="BAAI/bge-reranker-base")
            query_engine = RetrieverQueryEngine.from_args(
                retriever=hybrid_retriever,
                node_postprocessors=[reranker],
                response_synthesizer=response_synthesizer,
            )
            logger.info("Query engine setup complete.")
            return query_engine
        except Exception as e:
            logger.error(f"Error during query engine setup: {e}")
            raise

    @staticmethod
    def _score_percentage(total_score: int, query_count: int) -> float:
        """Returns total_score / query_count as a percentage; 0.0 when there are no queries."""
        if query_count == 0:
            return 0.0
        return (float(total_score) / query_count) * 100

    def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any, queries: List[str]) -> Tuple[int, int, float, List[Optional[str]]]:
        """
        Evaluate documents using a language model based on various criteria.

        Queries at positions 1, 2 and 8 are answered directly (score 1) when the
        corresponding pre-computed result is truthy; every other query is sent to
        the query engine and its response is parsed for a score and reasoning.

        Args:
            reg_result (Any): Result related to registration; its first item is used as reasoning.
            peer_result (Any): Result related to peer review.
            guidelines_result (Any): Result related to following guidelines.
            queries (List[str]): A list of queries to be processed.

        Returns:
            Tuple[int, int, float, List[Optional[str]]]: The total score, the number of
                criteria with a positive score, the score as a percentage of the number
                of queries (0.0 for an empty list), and one reasoning entry per query.
        """
        logger.info("Starting evaluation with LLM.")
        # NOTE(review): the few-shot configuration is loaded but not read
        # anywhere in this method - confirm whether it is still needed.
        self.config_manager.load_config("few_shot", "few_shot.json")
        query_engine = self.setup_query_engine()

        total_score = 0
        criteria_met = 0
        reasoning = []

        for j, query in enumerate(queries):
            # Handle special cases based on the value of j and other conditions
            if j == 1 and reg_result:
                extracted_data = {"score": 1, "reasoning": reg_result[0]}
            elif j == 2 and guidelines_result:
                extracted_data = {"score": 1, "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
            elif j == 8 and (guidelines_result or peer_result):
                extracted_data = {"score": 1, "reasoning": "The article is published in a peer-reviewed journal."}
            else:
                # Execute the query
                result = query_engine.query(query).response
                extracted_data = self.base_utils.extract_score_reasoning(result)

            # A response without a parsable score counts as 0.
            raw_score = extracted_data.get("score")
            extracted_data_score = 0 if raw_score is None else int(raw_score)
            if extracted_data_score > 0:
                criteria_met += 1
            reasoning.append(extracted_data["reasoning"])
            total_score += extracted_data_score

        score_percentage = self._score_percentage(total_score, len(queries))
        logger.info("Evaluation completed.")
        return total_score, criteria_met, score_percentage, reasoning
class MixtralLLM(CustomLLM):
    """
    A custom LLM that sends prompts to a model hosted on the Hugging Face Inference API.

    Attributes:
        context_window (int): Number of tokens used for context during inference.
        num_output (int): Number of tokens to generate as output.
        temperature (float): Sampling temperature for token generation.
        model_name (str): Name of the model on Hugging Face's model hub.
        api_key (str): API key for authenticating with the Hugging Face API.

    Methods:
        metadata: Retrieves metadata about the model.
        do_hf_call: Makes an API call to the Hugging Face model.
        complete: Generates a complete response for a given prompt.
        stream_complete: Replays a completed response one character at a time.
    """
    context_window: int = Field(..., description="Number of tokens used for context during inference.")
    num_output: int = Field(..., description="Number of tokens to generate as output.")
    temperature: float = Field(..., description="Sampling temperature for token generation.")
    model_name: str = Field(..., description="Name of the model on Hugging Face's model hub.")
    api_key: str = Field(..., description="API key for authenticating with the Hugging Face API.")

    @property
    def metadata(self) -> LLMMetadata:
        """
        Retrieves metadata for the Mixtral LLM.

        Returns:
            LLMMetadata: Context window, number of output tokens and model name.
        """
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def do_hf_call(self, prompt: str) -> str:
        """
        Calls the Hugging Face Inference API and returns the answer part of the generation.

        Any transport failure, non-200 status, API-reported error or unexpected
        payload shape is logged and turned into a fixed fallback sentence, so
        callers always receive a string.

        Args:
            prompt (str): The input prompt for the model.

        Returns:
            str: The text following the "Answer:" marker, or the fallback sentence.
        """
        fallback = "Unable to answer for technical reasons."
        payload = {
            "inputs": prompt,
            # The Inference API documents generation parameters in lowercase.
            "parameters": {"temperature": self.temperature},
        }
        try:
            # The whole body is parsed as JSON, so streaming is not requested;
            # the timeout keeps a stalled endpoint from blocking forever.
            response = requests.post(
                f'https://api-inference.huggingface.co/models/{self.model_name}',
                headers={
                    'authorization': f'Bearer {self.api_key}',
                    'content-type': 'application/json',
                },
                json=payload,
                timeout=300,
            )
            body = response.json() if response.status_code == 200 else None
        except (requests.RequestException, ValueError) as exc:
            logger.error("Hugging Face request failed: %s", exc)
            return fallback

        if not body or 'error' in body:
            logger.error("Hugging Face API error: %s", response)
            return fallback

        try:
            full_txt = body[0]['generated_text']
        except (KeyError, IndexError, TypeError):
            logger.error("Unexpected Hugging Face response shape: %r", body)
            return fallback

        # The generated text echoes the prompt; the answer follows the context
        # separator and the "Answer:" marker. Fall back to the widest available
        # text when either marker is missing instead of slicing at find() == -1.
        _, separator, tail = full_txt.partition("---------------------")
        if not separator:
            tail = full_txt
        _, marker, answer = tail.partition("Answer:")
        return (answer if marker else tail).strip()

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """
        Generates a complete response for a given prompt using the Hugging Face API.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            CompletionResponse: The complete response from the model.
        """
        response = self.do_hf_call(prompt)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
            self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """
        Yields the completion incrementally for the given prompt.

        The API call itself is not streamed: the full answer is fetched first
        and then replayed one character at a time.

        Args:
            prompt (str): The input prompt for the model.
            **kwargs: Additional keyword arguments (unused).

        Yields:
            CompletionResponse: The text accumulated so far plus the latest delta.
        """
        response = ""
        for token in self.do_hf_call(prompt):
            response += token
            yield CompletionResponse(text=response, delta=token)
class KeywordSearch():
    """Keyword and regex based checks over the text chunks of one document."""

    def __init__(self, chunks):
        self.chunks = chunks

    def find_journal_name(self, response: str, journal_list: list) -> bool:
        """
        Reports whether any known journal name occurs in a response string.

        The comparison is case-insensitive. Blank entries in ``journal_list``
        (for example from a trailing newline in the source file) are ignored,
        because an empty string is a substring of every response.

        Args:
            response (str): The response string to search for a journal name.
            journal_list (list): Journal names to look for.

        Returns:
            bool: True if at least one non-blank journal name is found, else False.
        """
        response_lower = response.lower()
        for journal in journal_list:
            journal_lower = journal.strip().lower()
            if journal_lower and journal_lower in response_lower:
                return True
        return False

    def check_registration(self):
        """
        Checks the chunks for trial registration numbers or registry URLs.

        Returns:
            list of str: A one-element list with the first sentence containing a
            registration number; otherwise every chunk mentioning a registry
            URL; otherwise an empty list.
        """
        # Patterns for different registration types
        patterns = {
            "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
            "ISRCTN": r"(ISRCTN\d{8})",
            "EudraCT": r"(\d{4}-\d{6}-\d{2})",
            "UMIN-CTR": r"(UMIN\d{9})",
            "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
        }
        # Registry hosts, lowercase and without "www." (a bare host is a
        # substring of its "www." form). The UMIN entry stops at "/ctr" so it
        # matches the real ".../ctr/index.htm" address.
        registry_urls = [
            "anzctr.org.au",
            "clinicaltrials.gov",
            "isrctn.org",
            "umin.ac.jp/ctr",
            "onderzoekmetmensen.nl/en",
            "eudract.ema.europa.eu",
        ]

        # A registration number is the strongest signal: return on first hit.
        for chunk in self.chunks:
            for sentence in re.split(r'(?<=[.!?]) +', chunk):
                if any(re.search(pattern, sentence) for pattern in patterns.values()):
                    return [sentence]

        # Otherwise collect chunks that mention a registry URL in any letter case.
        matching_chunks = []
        for chunk in self.chunks:
            chunk_lower = chunk.lower()
            if any(url in chunk_lower for url in registry_urls):
                matching_chunks.append(chunk)
        return matching_chunks
class StringExtraction():
    """
    Extracts answers from raw LLM responses and ground truth from labelled data.

    LLMs may format answers differently depending on the model or prompting
    technique; extract_original_prompt only handles "Yes"/"No" lines and
    "Reasoning:" lines, so other formats need their own extraction method.

    Methods:
        extract_original_prompt(): Splits a response into a binary line and an explanation.
        extraction_ground_truth(): Builds Yes/No labels and explanations for one paper.
    """

    def extract_original_prompt(self,result):
        """
        Pulls the binary answer line and the reasoning text out of a response.

        Args:
            result: Object whose ``response`` attribute holds the LLM text.

        Returns:
            tuple: (first line containing "Yes" or "No", concatenated text that
            follows the first ":" on each later line containing "Reasoning:").
        """
        binary_response = ""
        explanation_response = ""
        for line in result.response.strip().split("\n"):
            if binary_response == "" and ("Yes" in line or "No" in line):
                binary_response = line
            elif "Reasoning:" in line:
                explanation_response += line[line.find(":") + 1:].strip()
        return binary_response,explanation_response

    def extraction_ground_truth(self,paper_name,labelled_data):
        """
        Builds the ground-truth labels for one paper from a labelled DataFrame.

        Args:
            paper_name: File name holding the paper id between the first "_" and ".pdf".
            labelled_data: DataFrame with an "id" column; columns 2..10 hold the
                per-criterion ground-truth text.

        Returns:
            tuple: (list of "Yes"/"No", list of explanation strings).
        """
        paper_id = int(paper_name[paper_name.find("_")+1:paper_name.find(".pdf")])
        id_row = labelled_data[labelled_data["id"] == paper_id]
        ground_truth = id_row.iloc[:,2:11].values.tolist()[0]
        binary_ground_truth = []
        explanation_ground_truth = []
        for g in ground_truth:
            # Empty cells surface as None or NaN (NaN != NaN), not only as "".
            is_missing = g is None or (isinstance(g, float) and g != g) or (isinstance(g, str) and len(g) == 0)
            if is_missing:
                binary_ground_truth.append("No")
                explanation_ground_truth.append("The article does not provide any relevant information.")
            else:
                binary_ground_truth.append("Yes")
                explanation_ground_truth.append(g if isinstance(g, str) else str(g))
        return binary_ground_truth,explanation_ground_truth
class EvaluationMetrics():
    """
    Evaluation methods used to compare LLM explanations with ground truth.

    Attributes:
        explanation_response: List of detailed LLM responses, one per query.
        explanation_ground_truth: List of ground-truth texts, one per query.
        embedding_model: Object with an ``encode`` method used for cosine similarity.

    Methods:
        metric_cosine_similarity(): Pairwise cosine similarity of response and ground-truth embeddings.
        metric_rouge(): ROUGE scores of the responses against the ground truth.
        binary_accuracy(): Fraction of matching Yes/No answers.
    """

    def __init__(self,explanation_response,explanation_ground_truth,embedding_model):
        self.explanation_response = explanation_response
        self.explanation_ground_truth = explanation_ground_truth
        self.embedding_model = embedding_model

    def metric_cosine_similarity(self):
        """Returns the cosine similarity of each ground-truth/response pair (matrix diagonal)."""
        ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
        explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
        return np.diag(cosine_similarity(ground_truth_embedding,explanation_response_embedding))

    def metric_rouge(self):
        """Returns the result of the ``evaluate`` "rouge" metric for responses vs. ground truth."""
        rouge = evaluate.load("rouge")
        results = rouge.compute(predictions = self.explanation_response,references = self.explanation_ground_truth)
        return results

    def binary_accuracy(self,binary_response,binary_ground_truth):
        """
        Returns the share of positions where response and ground truth agree.

        Args:
            binary_response: Sequence of predicted labels.
            binary_ground_truth: Sequence of expected labels.

        Returns:
            The accuracy rounded to two decimals; 0.0 for two empty sequences;
            an explanatory string if the sequences differ in length.
        """
        if len(binary_response) != len(binary_ground_truth):
            return "Arrays which are to be compared has different lengths."
        if len(binary_response) == 0:
            # Nothing to compare; avoid dividing by zero.
            return 0.0
        matches = sum(1 for got, want in zip(binary_response, binary_ground_truth) if got == want)
        return np.round(matches/len(binary_response),2)

librarymed/huggingface/app_huggingface.py DELETED Viewed

@@ -1,304 +0,0 @@
-import logging
-import os
-import gradio as gr
-import openai
-from fpdf import FPDF
-from llama_index import Document
-from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
-from llama_index.llms import OpenAI
-from RAG_utils_huggingface import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils, \
-    ConfigManager
# Root logging configuration and the logger used throughout this module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# All tunables come from the "model" section loaded from model_config.json.
config_manager = ConfigManager()
config_manager.load_config("model", "model_config.json")


def _model_setting(key):
    """Return the value stored under ``key`` in the "model" configuration."""
    return config_manager.get_config_value("model", key)


# Secrets are read from the environment; a missing variable raises KeyError.
openai.api_key = os.environ['OPENAI_API_KEY']
hf_token = os.environ['HF_TOKEN']

# PDF rendering and chunking parameters
pdf_processing_config = _model_setting("pdf_processing")
ALLOWED_EXTENSIONS = _model_setting("allowed_extensions")

# Embedding and generation settings
embed = _model_setting("embeddings")
embed_model_name = _model_setting("embeddings_model")
model_temperature = _model_setting("model_temp")
output_token_size = _model_setting("max_tokens")
model_context_window = _model_setting("context_window")

# Prompt templates and journal lists (file paths)
gpt_prompt_path = _model_setting("GPT_PROMPT_PATH")
mistral_prompt_path = _model_setting("MISTRAL_PROMPT_PATH")
info_prompt_path = _model_setting("INFO_PROMPT_PATH")
peer_review_journals_path = _model_setting("peer_review_journals_path")
eq_network_journals_path = _model_setting("eq_network_journals_path")

# Evaluation queries and the criteria headings shown in the report
queries = _model_setting("queries")
criteria = _model_setting("criteria")
num_criteria = len(queries)
author_query = _model_setting("author_query")
journal_query = _model_setting("journal_query")
def allowed_file(filename):
    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
def generate_score_bar(score, num_criteria):
    """
    Build the HTML snippet for a coloured progress bar with a verdict label.

    Args:
        score: Number of criteria met; the colour bands are keyed on this raw value.
        num_criteria: Total number of criteria, used to scale the bar width to a percentage.

    Returns:
        str: HTML containing the bar and the verdict text.
    """
    # Bar width: the raw score rescaled to 0-100 and rounded.
    score_out_of_100 = round((score / num_criteria) * 100)

    # Colour and verdict for each raw score; anything not listed is "Very bad".
    bands = {
        9: ("#4CAF50", "Very good"),  # green
        8: ("#FFEB3B", "Good"),  # yellow
        7: ("#FFEB3B", "Good"),
        6: ("#FF9800", "Ok"),  # orange
        5: ("#FF9800", "Ok"),
        4: ("#F44336", "Bad"),  # red
        3: ("#F44336", "Bad"),
    }
    color, text = bands.get(score, ("#800000", "Very bad"))  # maroon

    return f"""
        <div style="background-color: #ddd; border-radius: 10px; position: relative; height: 20px; width: 100%;">
            <div style="background-color: {color}; height: 100%; border-radius: 10px; width: {score_out_of_100}%;"></div>
        </div>
        <p style="color: {color};">{text}</p>  <!-- Display the text -->
    """
class PDF(FPDF):
    """FPDF document with the DejaVu fonts registered and a fixed header and footer."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Register regular, bold and italic DejaVu faces (Unicode enabled).
        font_files = (
            ('', 'DejaVuSansCondensed.ttf'),
            ('B', 'DejaVuSansCondensed-Bold.ttf'),
            ('I', 'DejaVuSansCondensed-Oblique.ttf'),
        )
        for style, font_file in font_files:
            self.add_font('DejaVu', style, font_file, uni=True)

    def header(self):
        """Draw the centred report title at the top of the page."""
        self.set_font('DejaVu', 'B', 12)
        self.cell(0, 10, 'Paper Analysis Report', 0, 1, 'C')

    def footer(self):
        """Draw the centred page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font('DejaVu', 'I', 8)
        page_label = f'Page {self.page_no()}'
        self.cell(0, 10, page_label, 0, 0, 'C')
-import os
def create_pdf_report(title, author_info, score, criteria, reasoning_list, output_path):
    """
    Write the analysis of one paper to a PDF file.

    Args:
        title: Paper title.
        author_info: Author information text.
        score: Score text, e.g. "78/100".
        criteria: Headings, one per evaluated criterion.
        reasoning_list: Reasoning texts, paired with ``criteria`` by position.
        output_path: Where the PDF is written.

    Returns:
        The ``output_path`` that was written.
    """
    def _text(value):
        # A reasoning entry may be None when no reasoning could be extracted;
        # render a placeholder instead of passing None to the PDF writer.
        return "Not available." if value is None else str(value)

    pdf = PDF()
    pdf.add_page()
    # Set margins
    pdf.set_left_margin(10)
    pdf.set_right_margin(10)

    # Fixed sections: bold label followed by wrapped body text.
    for label, value in (("Title:", title), ("Author Information:", author_info), ("Score:", score)):
        pdf.set_font("DejaVu", 'B', 14)
        pdf.cell(0, 10, label, 0, 1)
        pdf.set_font("DejaVu", '', 12)
        pdf.multi_cell(0, 10, _text(value), 0, 1)

    # One section per criterion: green bold heading, then the reasoning in black.
    for heading, reasoning in zip(criteria, reasoning_list):
        logger.debug("Writing report section %r", heading)
        pdf.set_font("DejaVu", 'B', 14)
        pdf.set_text_color(0, 128, 0)  # Green color
        pdf.multi_cell(0, 10, heading, 0, 1)
        pdf.set_text_color(0, 0, 0)  # Reset to black color
        pdf.set_font("DejaVu", '', 12)
        pdf.multi_cell(0, 10, _text(reasoning), 0, 1)

    # Save the PDF to the specified output path
    pdf.output(output_path)
    return output_path
def check_title_for_review(uploaded_files):
    """
    Produce the warning text shown after files are chosen in the UI.

    Args:
        uploaded_files: Uploaded PDF files, or a falsy value when nothing was uploaded.

    Returns:
        str: A notice that nothing was uploaded, a warning when any title
        contains "review", or a confirmation that all articles are valid.
    """
    if not uploaded_files:
        return "No files uploaded or upload canceled."

    verdict = "All articles are valid for review."
    for pdf_file in uploaded_files:
        # A fresh processor per file, and every file is inspected even after a hit.
        extractor = PDFProcessor_Unstructured(pdf_processing_config)
        extracted_title = extractor.extract_title_from_pdf(pdf_file)
        if 'review' in extracted_title.lower():
            verdict = "One or more files are review papers. Hence the evaluation may not be accurate."
    return verdict
def _load_journal_list(utils, path):
    """Read a newline-separated journal list from ``path``, dropping blank lines."""
    return [name.strip() for name in utils.read_from_file(path).splitlines() if name.strip()]


def _build_llm(llm_model, utils):
    """Return ``(llm, general_prompt)`` for the model label chosen in the UI."""
    try:
        if llm_model == "Model 1":
            llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=output_token_size)
            return llm, utils.read_from_file(gpt_prompt_path)
        if llm_model == "Model 2":
            if any(param is None for param in
                   [model_context_window, output_token_size, model_temperature, hf_token]):
                raise ValueError("All parameters are required for Mistral LLM.")
            llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
                             temperature=model_temperature, model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
                             api_key=hf_token)
            return llm, utils.read_from_file(mistral_prompt_path)
        raise ValueError(f"Unsupported language model: {llm_model}")
    except Exception as e:
        logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
        raise


def _build_embed_model():
    """Return the embedding model selected by the ``embed`` configuration value."""
    try:
        if embed == "openai":
            return OpenAIEmbedding(model="text-embedding-3-large")
        if embed == "huggingface":
            # Use the specified model name
            return HuggingFaceEmbedding(embed_model_name)
        # Report the configured value; no embedding model exists at this point.
        raise ValueError(f"Unsupported embedding model: {embed}")
    except Exception as e:
        logger.error(f"Error initializing embedding model: {e}", exc_info=True)
        raise


def process_pdf(uploaded_files, llm_model, n_criteria=num_criteria):
    """
    Evaluate each uploaded paper and write one PDF report per paper.

    Args:
        uploaded_files: Paths of the uploaded PDF files.
        llm_model: Model label from the UI ("Model 1" or "Model 2").
        n_criteria: Number of criteria used to scale the score to 100.

    Returns:
        tuple: (status message, list of generated report paths).

    Raises:
        ValueError: If the model label or the configured embedding is unsupported.
    """
    if not uploaded_files:
        return "No files uploaded or upload canceled.", []

    # Everything below is identical for all files, so it is built once.
    utils = base_utils()
    info_prompt = utils.read_from_file(info_prompt_path)
    llm, general_prompt = _build_llm(llm_model, utils)
    embed_model = _build_embed_model()
    peer_review_journals_list = _load_journal_list(utils, peer_review_journals_path)
    eq_network_journals_list = _load_journal_list(utils, eq_network_journals_path)
    modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
        peer_review_journals_list) + "?"
    info_llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=100)

    output_dir = "/tmp"
    output_files = []
    for uploaded_file in uploaded_files:
        # Process the PDF file
        file_name_without_extension = os.path.splitext(os.path.basename(uploaded_file))[0]
        pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
        merged_chunks, _tables, title = pdf_processor.process_pdf_file(uploaded_file)
        documents = [Document(text=t) for t in merged_chunks]

        # Journal and author lookups use the short-answer info model.
        info_query_engine = PDFQueryEngine(documents, info_llm, embed_model, info_prompt).setup_query_engine()
        journal_result = info_query_engine.query(modified_journal_query).response
        author_result = info_query_engine.query(author_query).response

        # Keyword checks: journal membership and prior registration.
        nlp_methods = KeywordSearch(merged_chunks)
        eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
        peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
        registration_result = nlp_methods.check_registration()

        # Score the paper against every criterion with the selected model.
        pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, general_prompt)
        total_score, _criteria_met, _score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
            registration_result, peer_journal_result, eq_journal_result, queries)
        scaled_total_score = str(round((total_score / n_criteria) * 100)) + "/100"

        # Join with the bare file name: an absolute upload path would make
        # os.path.join discard output_dir.
        output_path = os.path.join(output_dir, f"{file_name_without_extension}_report.pdf")
        create_pdf_report(title, author_result, scaled_total_score, criteria, reasoning, output_path)
        output_files.append(output_path)

    processing_message = f"Processing complete. {len(output_files)} reports generated. Please download your reports below."
    return processing_message, output_files
# Visual theme for the whole interface.
glass_theme = gr.themes.Glass(
    text_size="sm",
    font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"],
    primary_hue="neutral",
    secondary_hue="gray",
)

with gr.Blocks(theme=glass_theme) as demo:
    gr.Markdown("## Med Library")

    # Upload row; every change re-runs the review-paper title check.
    with gr.Row():
        file_upload = gr.File(label="Choose papers", file_types=['.pdf'], file_count="multiple")
    title_check_output = gr.Textbox(label="Warnings", interactive=False)
    file_upload.change(fn=check_title_for_review, inputs=file_upload, outputs=title_check_output)

    # Model selection and the button that starts the evaluation.
    with gr.Row():
        model_choice = gr.Dropdown(["Model 1", "Model 2"], label="Choose a model", value="Model 1")
        submit_button = gr.Button("Evaluate")

    # Outputs: status text and the generated report files.
    processing_message_output = gr.Textbox(label="Processing Status", interactive=False)
    report_download_links = gr.File(label="Download Reports", type="filepath", file_count="multiple")
    submit_button.click(
        fn=process_pdf,
        inputs=[file_upload, model_choice],
        outputs=[processing_message_output, report_download_links],
    )

demo.launch(share=True, server_name="0.0.0.0", server_port=7860)

librarymed/local/RAG_utils.py DELETED Viewed

@@ -1,979 +0,0 @@
-"""Utility functions for working with the RAG model"""
-import json
-import logging
-import os
-import re
-import time
-from tempfile import NamedTemporaryFile
-from typing import Any, List, Tuple, Set, Dict, Optional, Union
-import evaluate
-import numpy as np
-import pandas as pd
-import requests
-from llama_index import PromptTemplate
-from llama_index import VectorStoreIndex, ServiceContext
-from llama_index import get_response_synthesizer
-from llama_index.llms import (
-    CustomLLM,
-    CompletionResponse,
-    CompletionResponseGen,
-    LLMMetadata,
-)
-from llama_index.llms.base import llm_completion_callback
-from llama_index.postprocessor import SentenceTransformerRerank
-from llama_index.query_engine import RetrieverQueryEngine
-from llama_index.retrievers import BaseRetriever, BM25Retriever
-from sklearn.metrics.pairwise import cosine_similarity
-from unstructured.partition.pdf import partition_pdf
-from pypdf import PdfReader
-# Configure basic logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-# Create a logger object
-logger = logging.getLogger(__name__)
class ConfigManager:
    """
    Loads named sets of configuration settings from JSON files and serves their values.

    Attributes:
        configs (dict): Maps a configuration name to the dictionary loaded for it.

    Methods:
        load_config(config_name: str, config_path: str): Loads a JSON file under a name.
        get_config_value(config_name: str, key: str): Retrieves one value from a named configuration.
    """

    def __init__(self):
        self.configs = {}

    def load_config(self, config_name: str, config_path: str) -> None:
        """
        Loads configuration settings from a JSON file into a named configuration.

        Args:
            config_name (str): The name to assign to this set of configurations.
            config_path (str): The path to the configuration file.

        Raises:
            FileNotFoundError: If the config file is not found.
            json.JSONDecodeError: If there is an error parsing the config file.
        """
        try:
            # JSON is read as UTF-8 regardless of the platform default encoding.
            with open(config_path, 'r', encoding='utf-8') as f:
                self.configs[config_name] = json.load(f)
        except FileNotFoundError:
            logger.error(f"Config file not found at {config_path}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Error decoding config file: {e}")
            raise

    def get_config_value(self, config_name: str, key: str) -> Any:
        """
        Retrieves a specific configuration value.

        Args:
            config_name (str): The name given to the configuration in load_config.
            key (str): The key for the configuration setting.

        Returns:
            Any: The value stored under ``key`` (whatever type the JSON held).

        Raises:
            ValueError: If the configuration or key is missing, the value is
                null, or it is still the placeholder "ENTER_YOUR_TOKEN_HERE".
        """
        value = self.configs.get(config_name, {}).get(key)
        if value is None or value == "ENTER_YOUR_TOKEN_HERE":
            raise ValueError(f"Please set your '{key}' in the config.json file.")
        return value
class base_utils:
    """
    Stateless helpers for reading files, parsing LLM output and matching PDFs to labelled rows.

    Attributes:
        None (all methods are static).

    Methods:
        read_from_file(file_path: str) -> str:
            Reads the content of a file and returns it as a string.
        extract_id_from_filename(filename: str) -> Optional[int]:
            Extracts the numeric ID from a name of the form 'Id_{I}.pdf'.
        extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
            Extracts a score and reasoning from a given text using regular expressions.
        find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.DataFrame, str]:
            Selects the rows whose first column equals the ID in a PDF filename.
    """

    @staticmethod
    def read_from_file(file_path: str) -> str:
        """
        Reads the content of a file and returns it as a string.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The content of the file, decoded as UTF-8.
        """
        with open(file_path, 'r', encoding='utf-8') as prompt_file:
            return prompt_file.read()

    @staticmethod
    def extract_id_from_filename(filename: str) -> Optional[int]:
        """
        Extracts an ID from a filename of the form 'Id_{I}.pdf', where {I} is the ID.

        Args:
            filename (str): The filename from which to extract the ID.

        Returns:
            int: The extracted ID, or None if the pattern is not found.
        """
        # The dot is escaped so that only a literal ".pdf" suffix matches.
        match = re.search(r'Id_(\d+)\.pdf', filename)
        if match:
            return int(match.group(1))
        return None

    @staticmethod
    def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
        """
        Extracts score and reasoning from a given text using regular expressions.

        Args:
            text (str): The text from which to extract the score and reasoning.

        Returns:
            dict: 'score' (digits as a string) and 'reasoning' (stripped text
            after "Reasoning: "); each is None when its marker is absent.
        """
        score_match = re.search(r"Score: (\d+)", text)
        # re.DOTALL lets the reasoning run over several lines.
        reasoning_match = re.search(r"Reasoning: (.+)", text, re.DOTALL)
        return {
            "score": score_match.group(1) if score_match else None,
            "reasoning": reasoning_match.group(1).strip() if reasoning_match else None
        }

    @staticmethod
    def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.DataFrame, str]:
        """
        Finds the rows in a dataframe whose first column equals the ID in a PDF filename.

        Args:
            pdf_filename (str): The filename of the PDF.
            dataframe (pandas.DataFrame): The dataframe to search; its first column holds the ID.

        Returns:
            pandas.DataFrame or str: The matching rows, or a message saying
            that no row matched or that the filename is invalid.
        """
        # Call the helper on this class; the previously referenced name
        # "Utility" is not defined anywhere in this module.
        pdf_id = base_utils.extract_id_from_filename(pdf_filename)
        if pdf_id is None:
            return "Invalid file name."
        matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
        if matched_row.empty:
            return "No matching row found."
        return matched_row
-class PDFProcessor_Unstructured:
-    """
-    A class to process PDF files, providing functionalities for extracting, categorizing,
-    and merging elements from a PDF file.
-    This class is designed to handle unstructured PDF documents, particularly useful for
-    tasks involving text extraction, categorization, and data processing within PDFs.
-    Attributes:
-        file_path (str): The full path to the PDF file.
-        folder_path (str): The directory path where the PDF file is located.
-        file_name (str): The name of the PDF file.
-        texts (List[str]): A list to store extracted text chunks.
-        tables (List[str]): A list to store extracted tables.
-    Methods:
-        extract_pdf_elements() -> List:
-            Extracts images, tables, and text chunks from a PDF file.
-        categorize_elements(raw_pdf_elements: List) -> None:
-            Categorizes extracted elements from a PDF into tables and texts.
-        merge_chunks() -> List[str]:
-            Merges text chunks based on punctuation and character case criteria.
-        should_skip_chunk(chunk: str) -> bool:
-            Determines if a chunk should be skipped based on its content.
-        should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
-            Determines if the current chunk should be merged with the next one.
-        process_pdf() -> Tuple[List[str], List[str]]:
-            Processes the PDF by extracting, categorizing, and merging elements.
-        process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
-            Processes an uploaded PDF file to extract and categorize text and tables.
-    """
-    def __init__(self, config: Dict[str, any]):
-        self.file_path = None
-        self.folder_path = None
-        self.file_name = None
-        self.texts = []
-        self.tables = []
-        self.config = config if config is not None else self.default_config()
-        logger.info(f"Initialized PdfProcessor_Unstructured for file: {self.file_name}")
-    @staticmethod
-    def default_config() -> Dict[str, any]:
-        """
-        Returns the default configuration for PDF processing.
-        Returns:
-            Dict[str, any]: Default configuration options.
-        """
-        return {
-            "extract_images": False,
-            "infer_table_structure": True,
-            "chunking_strategy": "by_title",
-            "max_characters": 10000,
-            "combine_text_under_n_chars": 100,
-            "strategy": "auto",
-            "model_name": "yolox"
-        }
-    def extract_pdf_elements(self) -> List:
-        """
-        Extracts images, tables, and text chunks from a PDF file.
-        Returns:
-            List: A list of extracted elements from the PDF.
-        """
-        logger.info("Starting extraction of PDF elements.")
-        try:
-            extracted_elements = partition_pdf(
-                filename=self.file_path,
-                extract_images_in_pdf=False,
-                infer_table_structure=True,
-                chunking_strategy="by_title",
-                max_characters=10000,
-                combine_text_under_n_chars=100,
-                image_output_dir_path=self.folder_path,
-                # strategy="fast",
-            )
-            logger.info("Extraction of PDF elements completed successfully.")
-            return extracted_elements
-        except Exception as e:
-            raise NotImplementedError(f"Error extracting PDF elements: {e}")
-    def categorize_elements(self, raw_pdf_elements: List) -> None:
-        """
-        Categorizes extracted elements from a PDF into tables and texts.
-        Args:
-            raw_pdf_elements (List): A list of elements extracted from the PDF.
-        """
-        logger.debug("Starting categorization of PDF elements.")
-        for element in raw_pdf_elements:
-            element_type = str(type(element))
-            if "unstructured.documents.elements.Table" in element_type:
-                self.tables.append(str(element))
-            elif "unstructured.documents.elements.CompositeElement" in element_type:
-                self.texts.append(str(element))
-        logger.debug("Categorization of PDF elements completed.")
-    def merge_chunks(self) -> List[str]:
-        """
-        Merges text chunks based on punctuation and character case criteria.
-        Returns:
-            List[str]: A list of merged text chunks.
-        """
-        logger.debug("Starting merging of text chunks.")
-        merged_chunks = []
-        skip_next = False
-        for i, current_chunk in enumerate(self.texts[:-1]):
-            next_chunk = self.texts[i + 1]
-            if self.should_skip_chunk(current_chunk):
-                continue
-            if self.should_merge_with_next(current_chunk, next_chunk):
-                merged_chunks.append(current_chunk + " " + next_chunk)
-                skip_next = True
-            else:
-                merged_chunks.append(current_chunk)
-        if not skip_next:
-            merged_chunks.append(self.texts[-1])
-        logger.debug("Merging of text chunks completed.")
-        return merged_chunks
-    @staticmethod
-    def should_skip_chunk(chunk: str) -> bool:
-        """
-        Determines if a chunk should be skipped based on its content.
-        Args:
-            chunk (str): The text chunk to be evaluated.
-        Returns:
-            bool: True if the chunk should be skipped, False otherwise.
-        """
-        return (chunk.lower().startswith(("figure", "fig", "table")) or
-                not chunk[0].isalnum() or
-                re.match(r'^\d+\.', chunk))
-    @staticmethod
-    def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
-        """
-        Determines if the current chunk should be merged with the next one.
-        Args:
-            current_chunk (str): The current text chunk.
-            next_chunk (str): The next text chunk.
-        Returns:
-            bool: True if the chunks should be merged, False otherwise.
-        """
-        return (current_chunk.endswith(",") or
-                (current_chunk[-1].islower() and next_chunk[0].islower()))
-    def process_pdf(self) -> Tuple[List[str], List[str]]:
-        """
-        Processes the PDF by extracting, categorizing, and merging elements.
-        Returns:
-            Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
-            is_research_paper: A boolean indicating if the paper is a research paper or not.
-        """
-        is_review_paper = False
-        logger.info("Starting processing of the PDF.")
-        try:
-            time_extract = time.time()
-            raw_pdf_elements = self.extract_pdf_elements()
-            logger.info(
-                f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements extracted in {time.time() - time_extract:.2f} seconds.")
-            time_review = time.time()
-            for element in raw_pdf_elements:
-                text = element.text.split()
-                for word in text:
-                    if word.lower() == 'review':
-                        logger.warning("!!! this seems to be a review paper and not a research paper. this demo "
-                                       "analyses only research papers.")
-                        is_review_paper = True
-            logging.info(
-                f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF review check completed in {time.time() - time_review:.2f} seconds.")
-            time_categorize = time.time()
-            self.categorize_elements(raw_pdf_elements)
-            logger.info(
-                f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements categorized in {time.time() - time_categorize:.2f} seconds.")
-            time_merge = time.time()
-            merged_chunks = self.merge_chunks()
-            logger.info(
-                f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF text chunks merged in {time.time() - time_merge:.2f} seconds.")
-            return merged_chunks, self.tables
-        except Exception as e:
-            raise NotImplementedError(f"Error processing PDF: {e}")
-    def process_pdf_file(self, uploaded_file):
-        """
-        Process an uploaded PDF file.
-        If a new file is uploaded, the previously stored file is deleted.
-        The method updates the file path, processes the PDF, and returns the results.
-        Parameters:
-        uploaded_file: The new PDF file uploaded for processing.
-        Returns:
-        The results of processing the PDF file.
-        """
-        logger.info(f"Starting to process the PDF file: {uploaded_file.filename}")
-        with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
-            uploaded_file.save(temp_file.name)
-            self.file_path = temp_file.name
-            self.folder_path = os.path.dirname(self.file_path)
-        try:
-            logger.debug(f"Processing PDF at {self.file_path}")
-            results = self.process_pdf()
-            title = self.extract_title_from_pdf(self.file_path)
-            logger.info("PDF processing completed successfully.")
-            return (*results, title)
-        except Exception as e:
-            logger.error(f"Error processing PDF file: {e}", exc_info=True)
-            raise
-        finally:
-            try:
-                os.remove(self.file_path)
-                logger.debug(f"Temporary file {self.file_path} deleted.")
-            except Exception as e:
-                logger.warning(f"Error deleting temporary file: {e}", exc_info=True)
-    def extract_title_from_pdf(self, uploaded_file):
-        """
-        Extracts the title from a PDF file's metadata.
-        This function reads the metadata of a PDF file using PyPDF2 and attempts to
-        extract the title. If the title is present in the metadata, it is returned.
-        Otherwise, a default message indicating that the title was not found is returned.
-        Parameters:
-        uploaded_file (file): A file object or a path to the PDF file from which
-                          to extract the title. The file must be opened in binary mode.
-        Returns:
-        str: The title of the PDF file as a string. If no title is found, returns
-             'Title not found'.
-        """
-        # Initialize PDF reader
-        pdf_reader = PdfReader(uploaded_file)
-        # Extract document information
-        meta = pdf_reader.metadata
-        # Retrieve title from document information
-        title = meta.title if meta and meta.title else 'Title not found'
-        return title
-class HybridRetriever(BaseRetriever):
-    """
-    A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
-    Inherits from BaseRetriever.
-    This class uses two different retrieval methods and merges their results to provide a
-    comprehensive set of documents in response to a query. It ensures diversity in the
-    retrieved documents by leveraging the strengths of both retrieval methods.
-    Attributes:
-        vector_retriever: An instance of a vector-based retriever.
-        bm25_retriever: An instance of a BM25 retriever.
-    Methods:
-        __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
-        _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
-        _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
-    """
-    def __init__(self, vector_retriever, bm25_retriever):
-        super().__init__()
-        self.vector_retriever = vector_retriever
-        self.bm25_retriever = bm25_retriever
-        logger.info("HybridRetriever initialized with vector and BM25 retrievers.")
-    def _retrieve(self, query: str, **kwargs) -> List:
-        """
-        Retrieves and combines results from both vector and BM25 retrievers.
-        Args:
-            query: The query string for document retrieval.
-            **kwargs: Additional keyword arguments for retrieval.
-        Returns:
-            List: Combined list of unique nodes retrieved from both methods.
-        """
-        logger.info(f"Retrieving documents for query: {query}")
-        try:
-            bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
-            vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
-            combined_nodes = self._combine_results(bm25_nodes, vector_nodes)
-            logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
-            return combined_nodes
-        except Exception as e:
-            logger.error(f"Error in retrieval: {e}")
-            raise
-    @staticmethod
-    def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
-        """
-        Combines and de-duplicates results from BM25 and vector retrievers.
-        Args:
-            bm25_nodes: Nodes retrieved from BM25 retriever.
-            vector_nodes: Nodes retrieved from vector retriever.
-        Returns:
-            List: Combined list of unique nodes.
-        """
-        node_ids: Set = set()
-        combined_nodes = []
-        for node in bm25_nodes + vector_nodes:
-            if node.node_id not in node_ids:
-                combined_nodes.append(node)
-                node_ids.add(node.node_id)
-        return combined_nodes
-class PDFQueryEngine:
-    """
-    A class to handle the process of setting up a query engine and performing queries on PDF documents.
-    This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
-    indexes, hybrid retrievers, response synthesizers, and executing queries on the set up engine.
-    Attributes:
-        documents (List): A list of documents to be indexed.
-        llm (Language Model): The language model to be used for embeddings and queries.
-        qa_prompt_tmpl (str): Template for creating query prompts.
-        queries (List[str]): List of queries to be executed.
-    Methods:
-        setup_query_engine(): Sets up the query engine with all necessary components.
-        execute_queries(): Executes the predefined queries and prints the results.
-    """
-    def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
-        self.documents = documents
-        self.llm = llm
-        self.embed_model = embed_model
-        self.qa_prompt_tmpl = qa_prompt_tmpl
-        self.base_utils = base_utils()
-        logger.info("PDFQueryEngine initialized.")
-    def setup_query_engine(self):
-        """
-        Sets up the query engine by initializing and configuring the embedding model, service context, index,
-        hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.
-        Args:
-            embed_model: The embedding model to be used.
-            service_context: The context for providing services to the query engine.
-            index: The index used for storing and retrieving documents.
-            hybrid_retriever: The retriever that combines vector and BM25 retrieval methods.
-            response_synthesizer: The synthesizer for generating responses to queries.
-        Returns:
-            Any: The configured query engine.
-        """
-        try:
-            logger.info("Initializing the service context for query engine setup.")
-            service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
-            logger.info("Creating an index from documents.")
-            index = VectorStoreIndex.from_documents(documents=self.documents, service_context=service_context)
-            nodes = service_context.node_parser.get_nodes_from_documents(self.documents)
-            logger.info("Setting up vector and BM25 retrievers.")
-            vector_retriever = index.as_retriever(similarity_top_k=5)
-            bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
-            hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)
-            logger.info("Configuring the response synthesizer with the prompt template.")
-            qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
-            response_synthesizer = get_response_synthesizer(
-                service_context=service_context,
-                text_qa_template=qa_prompt,
-                response_mode="compact",
-            )
-            logger.info("Assembling the query engine with reranker and synthesizer.")
-            reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")
-            query_engine = RetrieverQueryEngine.from_args(
-                retriever=hybrid_retriever,
-                node_postprocessors=[reranker],
-                response_synthesizer=response_synthesizer,
-            )
-            logger.info("Query engine setup complete.")
-            return query_engine
-        except Exception as e:
-            logger.error(f"Error during query engine setup: {e}")
-            raise
-    def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any, queries: List[str]) -> Tuple[
-        int, List[int], int, float, List[str]]:
-        """
-        Evaluate documents using a language model based on various criteria.
-        Args:
-            reg_result (Any): Result related to registration.
-            peer_result (Any): Result related to peer review.
-            guidelines_result (Any): Result related to following guidelines.
-            queries (List[str]): A list of queries to be processed.
-        Returns:
-            Tuple[int, List[int], int, float, List[str]]: A tuple containing the total score, a list of scores per criteria,
-        """
-        logger.info("Starting evaluation with LLM.")
-        query_engine = self.setup_query_engine()
-        total_score = 0
-        criteria_met = 0
-        reasoning = []
-        for j, query in enumerate(queries):
-            # Predefine extracted_data to handle the default case
-            extracted_data = None
-            # Handle special cases based on the value of j and other conditions
-            if j == 1 and reg_result:
-                extracted_data = {"score": 1, "reasoning": reg_result[0]}
-            elif j == 2 and guidelines_result:
-                extracted_data = {"score": 1,
-                                  "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
-            elif j == 8 and (guidelines_result or peer_result):
-                extracted_data = {"score": 1, "reasoning": "The article is published in a peer reviewed journal."}
-            # Handle the default case if none of the special conditions were met
-            if extracted_data is None:
-                result = query_engine.query(query).response
-                extracted_data = self.base_utils.extract_score_reasoning(result)
-            if extracted_data['score'] and int(extracted_data["score"]) > 0:
-                criteria_met += 1
-                total_score += int(extracted_data["score"])
-            reasoning.append(extracted_data["reasoning"])
-        score_percentage = (float(total_score) / len(queries)) * 100
-        logger.info("Evaluation completed.")
-        return total_score, criteria_met, score_percentage, reasoning
-class MixtralLLM(CustomLLM):
-    """
-    A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.
-    Attributes:
-        context_window (int): Number of tokens used for context during inference.
-        num_output (int): Number of tokens to generate as output.
-        temperature (float): Sampling temperature for token generation.
-        model_name (str): Name of the model on Hugging Face's model hub.
-        api_key (str): API key for authenticating with the Hugging Face API.
-    Methods:
-        metadata: Retrieves metadata about the model.
-        do_hf_call: Makes an API call to the Hugging Face model.
-        complete: Generates a complete response for a given prompt.
-        stream_complete: Streams a series of token completions for a given prompt.
-    """
-    def __init__(self, context_window: int, num_output: int, temperature: float, model_name: str, api_key: str):
-        """
-        Initialize the MixtralLLM class with specific configuration values.
-        Args:
-            context_window (int): The number of tokens to consider for context during LLM inference.
-            num_output (int): The number of tokens to generate in the output.
-            temperature (float): The sampling temperature to use for generating tokens.
-            model_name (str): The name of the model to be used from Hugging Face's model hub.
-            api_key (str): The API key for authentication with Hugging Face's inference API.
-        """
-        super().__init__()
-        self.context_window = context_window
-        self.num_output = num_output
-        self.temperature = temperature
-        self.model_name = model_name
-        self.api_key = api_key
-    @property
-    def metadata(self) -> LLMMetadata:
-        """
-        Retrieves metadata for the Mixtral LLM.
-        Returns:
-            LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
-        """
-        return LLMMetadata(
-            context_window=self.context_window,
-            num_output=self.num_output,
-            model_name=self.model_name,
-        )
-    def do_hf_call(self, prompt: str) -> str:
-        """
-        Makes an API call to the Hugging Face model and retrieves the generated response.
-        Args:
-            prompt (str): The input prompt for the model.
-        Returns:
-            str: The text generated by the model in response to the prompt.
-        Raises:
-            Exception: If the API call fails or returns an error.
-        """
-        data = {
-            "inputs": prompt,
-            "parameters": {"Temperature": self.temperature}
-        }
-        # Makes a POST request to the Hugging Face API to get the model's response
-        response = requests.post(
-            f'https://api-inference.huggingface.co/models/{self.model_name}',
-            headers={
-                'authorization': f'Bearer {self.api_key}',
-                'content-type': 'application/json',
-            },
-            json=data,
-            stream=True
-        )
-        # Checks for a successful response and parses the generated text
-        if response.status_code != 200 or not response.json() or 'error' in response.json():
-            print(f"Error: {response}")
-            return "Unable to answer for technical reasons."
-        full_txt = response.json()[0]['generated_text']
-        # Finds the section of the text following the context separator
-        offset = full_txt.find("---------------------")
-        ss = full_txt[offset:]
-        # Extracts the actual answer from the response
-        offset = ss.find("Answer:")
-        return ss[offset + 7:].strip()
-    @llm_completion_callback()
-    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
-        """
-        Generates a complete response for a given prompt using the Hugging Face API.
-        Args:
-            prompt (str): The input prompt for the model.
-            **kwargs: Additional keyword arguments for the completion.
-        Returns:
-            CompletionResponse: The complete response from the model.
-        """
-        response = self.do_hf_call(prompt)
-        return CompletionResponse(text=response)
-    @llm_completion_callback()
-    def stream_complete(
-            self, prompt: str, **kwargs: Any
-    ) -> CompletionResponseGen:
-        """
-        Streams a series of token completions as a response for the given prompt.
-        This method is useful for streaming responses where each token is generated sequentially.
-        Args:
-            prompt (str): The input prompt for the model.
-            **kwargs: Additional keyword arguments for the streaming completion.
-        Yields:
-            CompletionResponseGen: A generator yielding each token in the completion response.
-        """
-        # Yields a stream of tokens as the completion response for the given prompt
-        response = ""
-        for token in self.do_hf_call(prompt):
-            response += token
-            yield CompletionResponse(text=response, delta=token)
-class KeywordSearch():
-    def __init__(self, chunks):
-        self.chunks = chunks
-    def find_journal_name(self, response: str, journal_list: list) -> str:
-        """
-        Searches for a journal name in a given response string.
-        This function iterates through a list of known journal names and checks if any of these
-        names are present in the response string. It returns the first journal name found in the
-        response. If no journal names from the list are found in the response, a default message
-        indicating that the journal name was not found is returned.
-        Args:
-            response (str): The response string to search for a journal name.
-            journal_list (list): A list of journal names to search within the response.
-        Returns:
-            str: The first journal name found in the response, or a default message if no journal name is found.
-        """
-        response_lower = response.lower()
-        for journal in journal_list:
-            journal_lower = journal.lower()
-            if journal_lower in response_lower:
-                return True
-        return False
-    def check_registration(self):
-        """
-        Check chunks of text for various registration numbers or URLs of registries.
-        Returns the sentence containing a registration number, or if not found,
-        returns chunks containing registry URLs.
-        Args:
-        chunks (list of str): List of text chunks to search.
-        Returns:
-        list of str: List of matching sentences or chunks, or an empty list if no matches are found.
-        """
-        # Patterns for different registration types
-        patterns = {
-            "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
-            "ISRCTN": r"(ISRCTN\d{8})",
-            "EudraCT": r"(\d{4}-\d{6}-\d{2})",
-            "UMIN-CTR": r"(UMIN\d{9})",
-            "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
-        }
-        # Registry URLs
-        registry_urls = [
-            "www.anzctr.org.au",
-            "anzctr.org.au",
-            "www.clinicaltrials.gov",
-            "clinicaltrials.gov",
-            "www.ISRCTN.org",
-            "ISRCTN.org",
-            "www.umin.ac.jp/ctr/index/htm",
-            "umin.ac.jp/ctr/index/htm",
-            "www.onderzoekmetmensen.nl/en",
-            "onderzoekmetmensen.nl/en",
-            "eudract.ema.europa.eu",
-            "www.eudract.ema.europa.eu"
-        ]
-        # Check each chunk for registration numbers
-        for chunk in self.chunks:
-            # Split chunk into sentences
-            sentences = re.split(r'(?<=[.!?]) +', chunk)
-            # Check each sentence for any registration number
-            for sentence in sentences:
-                for pattern in patterns.values():
-                    if re.search(pattern, sentence):
-                        return [sentence]  # Return immediately if a registration number is found
-        # If no registration number found, check for URLs in chunks
-        matching_chunks = []
-        for chunk in self.chunks:
-            if any(url in chunk for url in registry_urls):
-                matching_chunks.append(chunk)
-        return matching_chunks
-class StringExtraction():
-    """
-    A class to handle the the process of extraction of query string from complete LLM responses.
-    This class encapsulates the functionality of extracting original ground truth from a labelled data csv and query strings from responses. Please note that
-    LLMs may generate different formatted answers based on different models or different prompting technique. In such cases, extract_original_prompt may not give
-    satisfactory results. Best case scenario will be write your own string extraction method in such cases.
-    Methods:
-        extract_original_prompt():
-        extraction_ground_truth():
-    """
-    def extract_original_prompt(self, result):
-        r1 = result.response.strip().split("\n")
-        binary_response = ""
-        explanation_response = ""
-        for r in r1:
-            if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
-                binary_response = r
-            elif r.find("Reasoning:") >= 0:
-                cut = r.find(":")
-                explanation_response += r[cut + 1:].strip()
-        return binary_response, explanation_response
-    def extraction_ground_truth(self, paper_name, labelled_data):
-        id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
-        id_row = labelled_data[labelled_data["id"] == id]
-        ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
-        binary_ground_truth = []
-        explanation_ground_truth = []
-        for g in ground_truth:
-            if len(g) > 0:
-                binary_ground_truth.append("Yes")
-                explanation_ground_truth.append(g)
-            else:
-                binary_ground_truth.append("No")
-                explanation_ground_truth.append("The article does not provide any relevant information.")
-        return binary_ground_truth, explanation_ground_truth
-class EvaluationMetrics():
-    """
-    This class encapsulates the evaluation methods that have been used in the project.
-    Attributes:
-        explanation_response = a list of detailed response from the LLM model corresponding to each query
-        explanation_ground_truth = the list of ground truth corresponding to each query
-    Methods:
-        metric_cosine_similairty(): Sets up the query engine with all necessary components.
-        metric_rouge(): Executes the predefined queries and prints the results.
-        metric_binary_accuracy():
-    """
-    def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
-        self.explanation_response = explanation_response
-        self.explanation_ground_truth = explanation_ground_truth
-        self.embedding_model = embedding_model
-    def metric_cosine_similarity(self):
-        ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
-        explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
-        return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))
-    def metric_rouge(self):
-        rouge = evaluate.load("rouge")
-        results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
-        return results
-    def binary_accuracy(self, binary_response, binary_ground_truth):
-        count = 0
-        if len(binary_response) != len(binary_ground_truth):
-            return "Arrays which are to be compared has different lengths."
-        else:
-            for i in range(len(binary_response)):
-                if binary_response[i] == binary_ground_truth[i]:
-                    count += 1
-            return np.round(count / len(binary_response), 2)

librarymed/local/__init__.py DELETED Viewed

File without changes

librarymed/local/app_local.py DELETED Viewed

@@ -1,160 +0,0 @@
-import time
-import argparse
-import logging
-import os
-import openai
-from flask import Flask, flash, request, render_template, redirect
-from llama_index import Document
-from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
-from llama_index.llms import OpenAI
-from librarymed.local.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils, \
-    ConfigManager
-# Flask application for the local LibraryMed upload-and-score interface.
-app = Flask(__name__)
-# Signs the session cookie that flash() depends on. A SECRET_KEY taken from the environment
-# keeps deployments from sharing the value committed to the repository; the literal below
-# is only a fallback so existing setups keep working.
-app.config['SECRET_KEY'] = os.environ.get("SECRET_KEY", 'librarymed super secret key')
-# NOTE(review): presumably silences the Hugging Face tokenizers parallelism warning - confirm.
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# Load the API and model settings once at import time; request handlers read them
-# back through app.config['user_config'].
-config_manager = ConfigManager()
-config_manager.load_config("api", "Config/api_config.json")
-config_manager.load_config("model", "Config/model_config.json")
-app.config['user_config'] = config_manager
-def allowed_file(filename, allowed_extensions):
-    """Tell whether ``filename`` ends in one of ``allowed_extensions``.
-    The extension is the text after the last dot, compared in lower case;
-    a name containing no dot is rejected.
-    """
-    _, dot, extension = filename.rpartition('.')
-    if not dot:
-        return False
-    return extension.lower() in allowed_extensions
-@app.route('/', methods=['GET'])
-def __get__():
-    """Render the upload page with empty placeholder results."""
-    # No document has been analysed yet, so every result field is blank or zero.
-    placeholders = {
-        'title': "",
-        'author': "",
-        'score': 0,
-        'criteria_met': 0,
-        'reasoning': "",
-    }
-    return render_template('index.html', **placeholders)
-@app.route('/upload', methods=['POST'])
-def upload():
-    """Score an uploaded PDF against the configured criteria and render the results.
-    Reads model and prompt settings from the ConfigManager stored in
-    ``app.config['user_config']``, builds the LLM and embedding model they name, queries the
-    document for journal and author information, and evaluates it with
-    ``PDFQueryEngine.evaluate_with_llm``.
-    Returns:
-        The rendered ``index.html`` with title, author, score, criteria_met and reasoning,
-        or a redirect to the upload form (with a flashed message) when the request carries
-        no usable file or processing fails.
-    """
-    config = app.config['user_config']
-    # This route is POST-only, so redirecting back to request.url would answer 405;
-    # rejected uploads are sent to the upload form at '/' instead.
-    # Check if the post request has the file part
-    if 'file' not in request.files:
-        flash('No file part')
-        return redirect('/')
-    file = request.files['file']
-    # If user does not select file, browser also submits an empty part without filename
-    if file.filename == '':
-        flash('No selected file')
-        return redirect('/')
-    if not file or not allowed_file(file.filename, config.get_config_value("model", "allowed_extensions")):
-        # Without this guard the function reached render_template with title, score, etc. unbound.
-        flash('File type not allowed')
-        return redirect('/')
-    openai.api_key = config.get_config_value("api", "OPENAI_API_KEY")
-    hf_token = config.get_config_value("api", "HF_TOKEN")
-    embed = config.get_config_value("model", "embeddings")
-    embed_model_name = config.get_config_value("model", "embeddings_model")
-    llm_model = config.get_config_value("model", "llm_model")
-    model_temperature = config.get_config_value("model", "model_temp")
-    output_token_size = config.get_config_value("model", "max_tokens")
-    model_context_window = config.get_config_value("model", "context_window")
-    gpt_prompt_path = config.get_config_value("model", "GPT_PROMPT_PATH")
-    mistral_prompt_path = config.get_config_value("model", "MISTRAL_PROMPT_PATH")
-    info_prompt_path = config.get_config_value("model", "INFO_PROMPT_PATH")
-    peer_review_journals_path = config.get_config_value("model", "peer_review_journals_path")
-    eq_network_journals_path = config.get_config_value("model", "eq_network_journals_path")
-    queries = config.get_config_value("model", "queries")
-    num_criteria = len(config.get_config_value("model", "criteria"))
-    author_query = config.get_config_value("model", "author_query")
-    journal_query = config.get_config_value("model", "journal_query")
-    try:
-        # Process the PDF file
-        pdf_processor = PDFProcessor_Unstructured(config.get_config_value("model", "pdf_processing"))
-        merged_chunks, _tables, title = pdf_processor.process_pdf_file(file)
-        documents = [Document(text=t) for t in merged_chunks]
-        utils = base_utils()
-        # LLM Model choice
-        if 'gpt' in llm_model.lower():  # TODO tested "gpt-4" and  "gpt-3.5-turbo":
-            llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)
-            prompt_template = utils.read_from_file(gpt_prompt_path)
-        elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
-            if any(param is None for param in
-                   [model_context_window, output_token_size, model_temperature, hf_token]):
-                raise ValueError("All parameters are required for Mistral LLM.")
-            llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
-                             temperature=model_temperature, model_name=llm_model, api_key=hf_token)
-            prompt_template = utils.read_from_file(mistral_prompt_path)
-        else:
-            raise NotImplementedError(f"Error initializing language model '{llm_model}'")
-        # Embedding model choice for RAG
-        if embed == "openai":
-            embed_model = OpenAIEmbedding()
-        elif embed == "huggingface":
-            if embed_model_name is None:
-                # Set to default model if name not provided
-                embed_model_name = "BAAI/bge-small-en-v1.5"
-            embed_model = HuggingFaceEmbedding(embed_model_name)
-        else:
-            raise NotImplementedError(f"Error initializing embedding model: {embed}")
-        # Prompts and Queries
-        info_prompt = utils.read_from_file(info_prompt_path)
-        peer_review_journals = utils.read_from_file(peer_review_journals_path)
-        eq_network_journals = utils.read_from_file(eq_network_journals_path)
-        peer_review_journals_list = peer_review_journals.split('\n')
-        eq_network_journals_list = eq_network_journals.split('\n')
-        # The configured journal_query is read above but the query actually sent lists the
-        # peer-reviewed journals explicitly.
-        modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
-            peer_review_journals_list) + "?"
-        pdf_info_query = PDFQueryEngine(documents, llm, embed_model, info_prompt)
-        info_query_engine = pdf_info_query.setup_query_engine()
-        journal_result = info_query_engine.query(modified_journal_query).response
-        author_info = info_query_engine.query(author_query).response
-        pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, prompt_template)
-        # Check for prior registration
-        nlp_methods = KeywordSearch(merged_chunks)
-        eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
-        peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
-        registration_result = nlp_methods.check_registration()
-        # Evaluate with the configured LLM
-        total_score, criteria_met, _score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
-            registration_result, peer_journal_result, eq_journal_result, queries)
-        score = f"{round((total_score / num_criteria) * 100)}/100"
-    except Exception as e:
-        # Route boundary: keep the traceback in the log, show the user a short message.
-        logging.exception("Failed to process uploaded file")
-        flash('An error occurred while processing the file. Error: ' + str(e))
-        return redirect('/')
-    # e.g. score: 56 / 100 -  criteria_met: 5 - author_info: Direct
-    return render_template('index.html',
-                           title=title,
-                           author=author_info,
-                           score=score,
-                           criteria_met=criteria_met,
-                           reasoning=reasoning,
-                           )

librarymed/local/templates/index.html DELETED Viewed

@@ -1,187 +0,0 @@
-<!doctype html>
-<html>
-<head>
-  <title>Upload and Results</title>
-  <!-- Include Google Fonts -->
-  <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
-  <style>
-    body {
-        font-family: 'Roboto', sans-serif;
-        background-color: #f4f4f4;
-        overflow: auto;
-        width: 100%;
-        margin: 0;
-        padding: 0;
-        display: flex;
-        flex-direction: column; /* Stack flex items vertically */
-        align-items: center; /* Center items horizontally */
-        justify-content: flex-start; /* Align items to the start of the container vertically */
-        min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
-    }
-    table {
-        width: 100%; /* Adjust the width as needed */
-        border-collapse: collapse; /* Collapse borders for a tighter look */
-    }
-    th, td {
-        border: 1px solid #ddd; /* Adjust the border size as needed */
-        text-align: left;
-        padding: 5px; /* Reduce padding to decrease cell spacing */
-        height: 30px; /* Optionally reduce the height of the cells */
-    }
-    .parent-element {
-        overflow: visible; /* Ensures content is not cut off */
-    }
-    .container {
-        background-color: white;
-        overflow: auto;
-        border-radius: 8px;
-        box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
-        padding: 40px;
-        width: 100%; /* Set width to 100% of the viewport */
-        max-width: 700px;
-    }
-    .score-bar-container {
-      position: relative;
-      margin-top: 20px; /* Space above the score bar */
-      max-width: 100%; /* Ensures the container does not exceed the parent width */
-    }
-    .score-very-good-fill {
-        background-color: #4CAF50; /* Green */
-    }
-    .score-good-fill {
-        background-color: #FFEB3B; /* Yellow */
-    }
-    .score-ok-fill {
-        background-color: #FF9800; /* Orange */
-    }
-    .score-bad-fill {
-        background-color: #f44336; /* Red */
-    }
-    .score-very-bad-fill {
-        background-color: #9E9E9E; /* Grey */
-    }
-    .score-very-good-text {
-            color: #4CAF50; /* Green */
-      }
-    .score-good-text {
-            color: #FFEB3B; /* Yellow */
-      }
-    .score-ok-text {
-            color: #FF9800; /* Orange */
-      }
-    .score-bad-text {
-            color: #f44336; /* Red */
-      }
-    .score-very-bad-text {
-            color: #9E9E9E; /* Grey */
-      }
-    .score-bar {
-      background-color: #ddd;
-      border-radius: 10px;
-      height: 20px;
-      width: 100%; /* Adjusted to take the full width */
-      display: inline-block; /* Allows the score text to sit next to the score bar */
-      vertical-align: middle; /* Aligns score bar and text vertically */
-    }
-    .score-fill {
-      height: 100%;
-      border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
-      display: inline-block;
-      vertical-align: middle;
-    }
-    .score-text {
-      display: inline-block;
-      vertical-align: middle; /* Align with the score bar */
-      font-weight: bold; /* Make the score text bold */
-      margin-left: 10px; /* Space between the score bar and score text */
-    }
-    .score-title {
-      font-size: 20px;
-      font-weight: bold;
-      margin: 20px 0;
-      color: #333;
-    }
-    .major-issues {
-      text-align: left; /* Aligns the major issues to the left */
-      padding-left: 20px; /* Padding for the bullet list */
-      list-style: inside disc; /* Bullet style */
-    }
-    form {
-      margin-bottom: 20px;
-    }
-    input[type="file"] {
-      margin-bottom: 10px;
-    }
-    input[type="submit"] {
-      cursor: pointer;
-      margin-top: 10px;
-      padding: 10px 20px;
-      border: none;
-      background-color: #4CAF50;
-      color: white;
-      border-radius: 5px;
-      font-size: 16px;
-      font-weight: bold;
-    }
-    input[type="submit"]:hover {
-      background-color: #45a049;
-    }
-  </style>
-</head>
-<body>
-  <div class="container">
-    <h2>Upload PDF and View Results</h2>
-    <!-- Upload Form -->
-    <form action="/upload" method="post" enctype="multipart/form-data">
-      <input type="file" name="file" required>
-      <input type="submit" value="Upload">
-    </form>
-    <!-- Results Section -->
-    {# Results are shown only once a score exists: the GET route passes score=0, the upload route a "NN/100" string. #}
-    {% if score %}
-      <!-- Score Bar -->
-      <div class="score-title">Score:</div>
-      <div class="score-bar-container">
-        <div class="score-bar">
-          {# The fill width is the number before the slash in the "NN/100" score string. #}
-          <div class="score-fill {{
-            'score-very-good-fill' if criteria_met == 9 else
-            'score-good-fill' if criteria_met >= 7 else
-            'score-ok-fill' if criteria_met >= 5 else
-            'score-bad-fill' if criteria_met >= 3 else
-            'score-very-bad-fill' }}" style="width: {{ (score|string).split('/')[0] }}%;"></div>
-        </div>
-        <div class="score-text">{{ score }}</div>
-      </div>
-      <h3>Title:</h3>
-        <p> {{title}}</p>
-      <h3>Author Information:</h3>
-          <p> {{author}}</p>
-      <h3>Reasoning:</h3>
-          <ul class="major-issues">
-            {% for issue in reasoning %}
-              <li>{{ issue }}</li>
-            {% endfor %}
-          </ul>
-    {% endif %}
-  </div>
-</body>
-</html>

librarymed/local/templates/upload_and_results.html DELETED Viewed

@@ -1,227 +0,0 @@
-<!doctype html>
-<html>
-<head>
-  <title>Upload and Results</title>
-  <!-- Include Google Fonts -->
-  <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
-  <style>
-    body {
-        font-family: 'Roboto', sans-serif;
-        background-color: #f4f4f4;
-        overflow: auto;
-        width: 100%;
-        margin: 0;
-        padding: 0;
-        display: flex;
-        flex-direction: column; /* Stack flex items vertically */
-        align-items: center; /* Center items horizontally */
-        justify-content: flex-start; /* Align items to the start of the container vertically */
-        min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
-    }
-    table {
-        width: 100%; /* Adjust the width as needed */
-        border-collapse: collapse; /* Collapse borders for a tighter look */
-    }
-    th, td {
-        border: 1px solid #ddd; /* Adjust the border size as needed */
-        text-align: left;
-        padding: 5px; /* Reduce padding to decrease cell spacing */
-        height: 30px; /* Optionally reduce the height of the cells */
-    }
-    .parent-element {
-        overflow: visible; /* Ensures content is not cut off */
-    }
-    .container {
-        background-color: white;
-        overflow: auto;
-        border-radius: 8px;
-        box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
-        padding: 40px;
-        width: 100%; /* Set width to 100% of the viewport */
-        max-width: 700px;
-    }
-    .score-bar-container {
-      position: relative;
-      margin-top: 20px; /* Space above the score bar */
-      max-width: 100%; /* Ensures the container does not exceed the parent width */
-    }
-    .score-very-good-fill {
-        background-color: #4CAF50; /* Green */
-    }
-    .score-good-fill {
-        background-color: #FFEB3B; /* Yellow */
-    }
-    .score-ok-fill {
-        background-color: #FF9800; /* Orange */
-    }
-    .score-bad-fill {
-        background-color: #f44336; /* Red */
-    }
-    .score-very-bad-fill {
-        background-color: #9E9E9E; /* Grey */
-    }
-    .score-very-good-text {
-            color: #4CAF50; /* Green */
-      }
-    .score-good-text {
-            color: #FFEB3B; /* Yellow */
-      }
-    .score-ok-text {
-            color: #FF9800; /* Orange */
-      }
-    .score-bad-text {
-            color: #f44336; /* Red */
-      }
-    .score-very-bad-text {
-            color: #9E9E9E; /* Grey */
-      }
-    .score-bar {
-      background-color: #ddd;
-      border-radius: 10px;
-      height: 20px;
-      width: 100%; /* Adjusted to take the full width */
-      display: inline-block; /* Allows the score text to sit next to the score bar */
-      vertical-align: middle; /* Aligns score bar and text vertically */
-    }
-    .score-fill {
-      height: 100%;
-      border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
-      display: inline-block;
-      vertical-align: middle;
-    }
-    .score-text {
-      display: inline-block;
-      vertical-align: middle; /* Align with the score bar */
-      font-weight: bold; /* Make the score text bold */
-      margin-left: 10px; /* Space between the score bar and score text */
-    }
-    .score-title {
-      font-size: 20px;
-      font-weight: bold;
-      margin: 20px 0;
-      color: #333;
-    }
-    .major-issues {
-      text-align: left; /* Aligns the major issues to the left */
-      padding-left: 20px; /* Padding for the bullet list */
-      list-style: inside disc; /* Bullet style */
-    }
-    form {
-      margin-bottom: 20px;
-    }
-    input[type="file"] {
-      margin-bottom: 10px;
-    }
-    input[type="submit"] {
-      cursor: pointer;
-      margin-top: 10px;
-      padding: 10px 20px;
-      border: none;
-      background-color: #4CAF50;
-      color: white;
-      border-radius: 5px;
-      font-size: 16px;
-      font-weight: bold;
-    }
-    input[type="submit"]:hover {
-      background-color: #45a049;
-    }
-  </style>
-</head>
-<body>
-  <div class="container">
-    <h2>Upload PDF and View Results</h2>
-    <!-- Upload Form -->
-    <form action="/upload" method="post" enctype="multipart/form-data">
-      <input type="file" name="file" required>
-      <input type="submit" value="Upload">
-    </form>
-    <!-- Results Section -->
-    {% if gpt4_score is not none or mistral_score is not none %}
-      <!-- GPT-4 Score Bar -->
-      <div class="score-title">Score for GPT-4:</div>
-      <div class="score-bar-container">
-        <div class="score-bar">
-          <div class="score-fill {{
-            'score-very-good-fill' if criteria_met_gpt4 == 9 else
-            'score-good-fill' if criteria_met_gpt4 >= 7 else
-            'score-ok-fill' if criteria_met_gpt4 >= 5 else
-            'score-bad-fill' if criteria_met_gpt4 >= 3 else
-            'score-very-bad-fill' }}" style="width: {{ score_percentage_gpt4 }}%;"></div>
-        </div>
-        <div class="score-text">{{ total_score_gpt4 }}/9</div>
-      </div>
-      <!-- Mistral Score Bar -->
-      <div class="score-title">Score for Mistral:</div>
-      <div class="score-bar-container">
-        <div class="score-bar">
-          <div class="score-fill {{
-            'score-very-good-fill' if criteria_met_mistral == 9 else
-            'score-good-fill' if criteria_met_mistral >= 7 else
-            'score-ok-fill' if criteria_met_mistral >= 5 else
-            'score-bad-fill' if criteria_met_mistral >= 3 else
-            'score-very-bad-fill' }}" style="width: {{ score_percentage_mistral }}%;"></div>
-        </div>
-        <div class="score-text">{{ total_score_mistral }}/9</div>
-      </div>
-      <!-- Reasoning for GPT-4 -->
-      <h3>Reasoning from GPT-4:</h3>
-      <ul class="major-issues">
-        {% for issue in reasoning_gpt4 %}
-          <li>{{ issue }}</li>
-        {% endfor %}
-      </ul>
-      <!-- Reasoning for Mistral -->
-      <h3>Reasoning from Mistral:</h3>
-      <ul class="major-issues">
-        {% for issue in reasoning_mistral %}
-          <li>{{ issue }}</li>
-        {% endfor %}
-      </ul>
-      <!-- Insert the Criteria Table Section Here -->
-        {% if combined_criteria_table %}
-        <h3>Criteria Evaluation</h3>
-        <table>
-            <thead>
-                <tr>
-                    <th>Criteria Number</th>
-                    <th>GPT-4 output</th>
-                    <th>Mistral output</th>
-                    <th>Ground truth</th>
-                </tr>
-            </thead>
-            <tbody>
-                {% for row in combined_criteria_table %}
-                  <tr>
-                    <td>{{ row['Criteria Number'] }}</td>
-                    <td>{{ 'Yes' if row['Score GPT-4'] == 1 else 'No' }}</td>
-                    <td>{{ 'Yes' if row['Score Mistral'] == 1 else 'No' }}</td>
-                    <td>{{ 'Yes' if row['ground truth'] else 'No' }}</td>
-                  </tr>
-                {% endfor %}
-              </tbody>
-            </table>
-          {% endif %}
-    {% endif %}
-  </div>
-</body>
-</html>

librarymed/main.py CHANGED Viewed

@@ -1,22 +1,12 @@
-import argparse
-import logging
 import os
 from dotenv import load_dotenv
 load_dotenv()
 if __name__ == '__main__':
-    # --local selects the v0.1.0 interface; without it the v0.2.0 (Kromin) app is started.
-    args_parse = argparse.ArgumentParser(description="LibraryMed")
-    args_parse.add_argument("--local", help="Run inferface v0.1.0 by the fellows", action="store_true")
-    args = args_parse.parse_args()
     port = os.getenv("PORT") or 80
-    if args.local:
-        from .local.app_local import app
-        logging.info("Run LibraryMed interface v0.1.0 developed by the fellows")
-        app.run(debug=True, host="0.0.0.0", port=port)
-    else:
-        from kromin.app_librarymed import app
-        logging.info("Run LibraryMed interface v0.2.0 developed by Kromin")
-        app.run(debug=True, host="0.0.0.0", port=port)

 import os
+import logging
 from dotenv import load_dotenv
 load_dotenv()
+# Bound at module level so a server can load the app as librarymed.main:app.
+from .app_librarymed import app
 if __name__ == '__main__':
     port = os.getenv("PORT") or 80
+    # The root logger defaults to WARNING, which would drop the info message below.
+    logging.basicConfig(level=logging.INFO)
+    logging.info("Run LibraryMed interface v0.2.0 developed by Kromin")
+    # NOTE(review): debug=True on 0.0.0.0 exposes the Werkzeug debugger - confirm this path is never used in production.
+    app.run(debug=True, host="0.0.0.0", port=port)