frasan committed on
Commit
8ef5a0f
1 Parent(s): a5afede

first commit

Config/model_config.json ADDED
@@ -0,0 +1,46 @@
+ {
+     "pdf_processing": {
+         "extract_images": false,
+         "infer_table_structure": true,
+         "strategy": "fast",
+         "chunking_strategy": "by_title",
+         "model_name": "yolox",
+         "max_characters": 10000,
+         "combine_text_under_n_chars": 100
+     },
+     "allowed_extensions": "pdf",
+     "embeddings": "huggingface",
+     "embeddings_model": "BAAI/bge-small-en-v1.5",
+     "llm_model": "gpt-4",
+     "model_temp": 0.2,
+     "max_tokens": 512,
+     "context_window": 5000,
+     "UPLOAD_FOLDER": "../path/to/upload/folder",
+     "GPT_PROMPT_PATH": "data/prompts/prompt_gpt.txt",
+     "MISTRAL_PROMPT_PATH": "data/prompts/prompt_mistral.txt",
+     "INFO_PROMPT_PATH": "data/prompts/prompt_info.txt",
+     "peer_review_journals_path": "data/prompts/peer_review_journals.txt",
+     "eq_network_journals_path": "data/prompts/eq_network_journals.txt",
+     "queries": [
+         "Does the article share any data or code? Look for terms related to supplementary materials or reproducibility.",
+         "Has the study or any data in the article been registered in advance?",
+         "Does the article adhere to specific reporting guidelines such as ISRCTN, CONSORT, PRISMA, MOOSE, STARD, ARRIVE, STROBE, SPIRIT, CARE, AGREE, SRQR, SQUIRE, MDAR, REMARK?",
+         "Is the article's methodology described in detail, including where, when, how, what, and who?",
+         "Are the data collection processes described in detail, including where, when, how, what, and who?",
+         "Does the article provide a detailed description of the sample, including size, demographics, recruitment, and criteria?",
+         "Does the article describe the data analysis process in detail?",
+         "Does the article discuss measures taken to avoid or minimize systematic bias?",
+         "Has the article been published in a journal?"
+     ],
+     "criteria": [
+         "Data and code sharing",
+         "Registration in advance",
+         "Adherence to reporting guidelines",
+         "Description of methodology",
+         "Data collection processes",
+         "Sample description, e.g. size, demographics, recruitment, in-/exclusion criteria",
+         "Data analysis process",
+         "Measures to minimize systematic bias",
+         "Peer review"
+     ],
+     "journal_query": "Is the given research paper published in any of the following journals: {}?",
+     "author_query": "Give me details about the institutions (like university or hospital) and contact details (eg. email) of the corresponding author.",
+     "title_query": "Output title of the paper."
+ }
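
A quick sketch of reading this config with the standard library (illustrative; the path assumes the repository root as the working directory):

    import json

    with open("Config/model_config.json") as f:
        cfg = json.load(f)

    print(cfg["pdf_processing"]["chunking_strategy"])  # "by_title"
    print(len(cfg["queries"]))                         # 9 evaluation queries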
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ # Use the official Python image as the base image
+ FROM python:3.9
+
+ # Install system dependencies (OCR, PDF tooling, and OpenGL libraries)
+ RUN apt-get update && apt-get install -y \
+     tesseract-ocr \
+     libtesseract-dev \
+     libgl1-mesa-glx \
+     poppler-utils \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the dependencies file to the working directory
+ COPY requirements.txt .
+
+ # Install dependencies
+ RUN pip install --trusted-host pypi.python.org -r requirements.txt
+
+ # Copy the content of the local src directory to the working directory
+ COPY . .
+
+ # Create a user to run the application
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory in the user's home directory
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+
+ # Expose the port number on which the app will run
+ EXPOSE 80
+
+ # Define environment variable
+ ENV NAME World
+
+ # Command to run on container start
+ CMD ["python", "librarymed/main.py"]
docker-compose.yml ADDED
@@ -0,0 +1,11 @@
+ version: '3.8'
+
+ services:
+   flask-app:
+     build:
+       context: .
+       dockerfile: Dockerfile
+     volumes:
+       - .:/app
+     ports:
+       - "80:80"
librarymed/.DS_Store ADDED
Binary file (6.15 kB).
 
librarymed/.gitkeep ADDED
@@ -0,0 +1 @@
+
librarymed/__init__.py ADDED
File without changes
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Bold.ttf ADDED
Binary file (632 kB).
 
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Oblique.ttf ADDED
Binary file (576 kB).
 
librarymed/huggingface/DejaVu/DejaVuSansCondensed.ttf ADDED
Binary file (644 kB).
 
librarymed/huggingface/DejaVu/readme.txt ADDED
@@ -0,0 +1,40 @@
+ Congratulations, you have successfully downloaded the font file!
+
+ This font is provided to you by Fonts2u.com – the largest online
+ repository of free fonts for Windows and Mac.
+
+ How to install this font on your computer?
+
+ For Windows 7 / Vista users:
+
+ - Right-click the font file(s) and choose "Install".
+
+ For users of previous Windows versions:
+
+ - Copy the included file(s) into a default Windows font folder
+   (usually C:\WINDOWS\FONTS or C:\WINNT\FONTS)
+
+ For Mac users:
+
+ Mac OS X 10.3 or above (including the FontBook)
+
+ - Double-click the font file and hit the "Install font" button at
+   the bottom of the preview.
+
+ Mac OS X
+
+ - Either copy the font file(s) to /Library/Fonts (for all users),
+   or to /Users/Your_username/Library/Fonts (for you only).
+
+ Mac OS 9 or earlier
+
+ - You have to convert the font file(s) you have downloaded.
+   Drag the font suitcases into the System folder. The system
+   will offer to add them to the Fonts folder.
+
+ For Linux users:
+
+ - Copy the font file(s) to /usr/share/fonts
+
librarymed/huggingface/RAG_utils_huggingface.py ADDED
@@ -0,0 +1,995 @@
+ import os
+ import re
+ import json
+ import torch
+
+ import openai
+ import logging
+ import asyncio
+ import aiohttp
+ import requests
+ import pandas as pd
+ import numpy as np
+ import evaluate
+ import qdrant_client
+ from pypdf import PdfReader
+ from pydantic import BaseModel, Field
+ from typing import Any, List, Tuple, Set, Dict, Optional, Union
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from unstructured.partition.pdf import partition_pdf
+
+ import llama_index
+ from llama_index import PromptTemplate, VectorStoreIndex, ServiceContext, get_response_synthesizer
+ from llama_index.retrievers import VectorIndexRetriever, BaseRetriever, BM25Retriever
+ from llama_index.query_engine import RetrieverQueryEngine
+ from llama_index.schema import NodeWithScore
+ from llama_index.embeddings import OpenAIEmbedding
+ from llama_index.llms import HuggingFaceLLM
+ from llama_index.llms import (
+     CustomLLM,
+     CompletionResponse,
+     CompletionResponseGen,
+     LLMMetadata,
+ )
+ from llama_index.llms.base import llm_completion_callback
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
+ from llama_index.storage.storage_context import StorageContext
+ from llama_index.postprocessor import SentenceTransformerRerank, LLMRerank
+
+ from tempfile import NamedTemporaryFile
+
+ # Configure basic logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+ # Create a logger object
+ logger = logging.getLogger(__name__)
+
+
+ class ConfigManager:
+     """
+     A class to manage loading and accessing configuration settings.
+
+     Attributes:
+         configs (dict): Dictionary holding one or more named configuration sets.
+
+     Methods:
+         load_config(config_name: str, config_path: str): Loads a configuration from a JSON file under a given name.
+         get_config_value(config_name: str, key: str): Retrieves a specific configuration value.
+     """
+
+     def __init__(self):
+         self.configs = {}
+
+     def load_config(self, config_name: str, config_path: str) -> None:
+         """
+         Loads configuration settings from a specified JSON file into a named configuration.
+
+         Args:
+             config_name (str): The name to assign to this set of configurations.
+             config_path (str): The path to the configuration file.
+
+         Raises:
+             FileNotFoundError: If the config file is not found.
+             json.JSONDecodeError: If there is an error parsing the config file.
+         """
+         try:
+             with open(config_path, 'r') as f:
+                 self.configs[config_name] = json.load(f)
+         except FileNotFoundError:
+             logging.error(f"Config file not found at {config_path}")
+             raise
+         except json.JSONDecodeError as e:
+             logging.error(f"Error decoding config file: {e}")
+             raise
+
+     def get_config_value(self, config_name: str, key: str) -> str:
+         """
+         Retrieves a specific configuration value.
+
+         Args:
+             config_name (str): The name of the configuration set to read from.
+             key (str): The key for the configuration setting.
+
+         Returns:
+             str: The value of the configuration setting.
+
+         Raises:
+             ValueError: If the key is not found or is set to a placeholder value.
+         """
+         value = self.configs.get(config_name, {}).get(key)
+         if value is None or value == "ENTER_YOUR_TOKEN_HERE":
+             raise ValueError(f"Please set your '{key}' in the config.json file.")
+         return value
+
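A minimal usage sketch for ConfigManager (illustrative; the path follows the config file committed above):

    config_manager = ConfigManager()
    config_manager.load_config("model", "Config/model_config.json")
    queries = config_manager.get_config_value("model", "queries")
    print(len(queries))  # 9 evaluation queries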
+
+ class base_utils:
+     """
+     A utility class providing miscellaneous static methods for processing and analyzing text data,
+     particularly from PDF documents and filenames. This class also includes methods for file operations.
+
+     It encapsulates the functionality of extracting key information from text, such as scores,
+     reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
+     and reading content from files.
+
+     Attributes:
+         None (this class contains only static methods and does not maintain any state)
+
+     Methods:
+         read_from_file(file_path: str) -> str:
+             Reads the content of a file and returns it as a string.
+
+         extract_id_from_filename(filename: str) -> Optional[int]:
+             Extracts an ID from a given filename based on a specified pattern.
+
+         extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
+             Extracts a score and reasoning from a given text using regular expressions.
+
+         find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
+             Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.
+     """
+
+     @staticmethod
+     def read_from_file(file_path: str) -> str:
+         """
+         Reads the content of a file and returns it as a string.
+
+         Args:
+             file_path (str): The path to the file to be read.
+
+         Returns:
+             str: The content of the file.
+         """
+         with open(file_path, 'r') as prompt_file:
+             prompt = prompt_file.read()
+         return prompt
+
+     @staticmethod
+     def extract_id_from_filename(filename: str) -> Optional[int]:
+         """
+         Extracts an ID from a filename, assuming the format 'Id_{I}.pdf', where {I} is the ID.
+
+         Args:
+             filename (str): The filename from which to extract the ID.
+
+         Returns:
+             int: The extracted ID as an integer, or None if the pattern is not found.
+         """
+         match = re.search(r'Id_(\d+)\.pdf', filename)
+         if match:
+             return int(match.group(1))  # Convert to integer if ID is numeric
+         else:
+             return None
+
+     @staticmethod
+     def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
+         """
+         Extracts the score and the longest reasoning from a given text using regular expressions.
+
+         Args:
+             text (str): The text from which to extract the score and reasoning.
+
+         Returns:
+             dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
+         """
+         # Define regular expression patterns for score and reasoning
+         score_pattern = r"Score: (\d+)"
+         reasoning_pattern = r"Reasoning: (\S.+)"
+
+         # Extract score using regular expressions
+         score_match = re.search(score_pattern, text)
+
+         # Extract all reasoning matches
+         reasoning_matches = re.findall(reasoning_pattern, text, re.DOTALL)
+
+         # Find the longest reasoning match
+         longest_reasoning = max(reasoning_matches, key=len) if reasoning_matches else None
+
+         # Extract and return the results
+         extracted_data = {
+             "score": score_match.group(1) if score_match else None,
+             "reasoning": longest_reasoning.strip() if longest_reasoning else None
+         }
+
+         return extracted_data
+
+     @staticmethod
+     def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
+         """
+         Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.
+
+         Args:
+             pdf_filename (str): The filename of the PDF.
+             dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.
+
+         Returns:
+             pandas.Series or str: The matched row from the dataframe, or a message indicating
+                                   that no matching row or an invalid filename was found.
+         """
+         pdf_id = base_utils.extract_id_from_filename(pdf_filename)
+         if pdf_id is not None:
+             # Assuming the first column contains the ID
+             matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
+             if not matched_row.empty:
+                 return matched_row
+             else:
+                 return "No matching row found."
+         else:
+             return "Invalid file name."
+
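A quick sketch of these helpers on a toy LLM answer (values illustrative):

    text = "Score: 1\nReasoning: The authors link a public repository with analysis code."
    print(base_utils.extract_score_reasoning(text))
    # {'score': '1', 'reasoning': 'The authors link a public repository with analysis code.'}
    print(base_utils.extract_id_from_filename("Id_42.pdf"))  # 42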
+
+ class PDFProcessor_Unstructured:
+     """
+     A class to process PDF files, providing functionalities for extracting, categorizing,
+     and merging elements from a PDF file.
+
+     This class is designed to handle unstructured PDF documents and is particularly useful for
+     tasks involving text extraction, categorization, and data processing within PDFs.
+
+     Attributes:
+         file_path (str): The full path to the PDF file.
+         folder_path (str): The directory path where the PDF file is located.
+         file_name (str): The name of the PDF file.
+         texts (List[str]): A list to store extracted text chunks.
+         tables (List[str]): A list to store extracted tables.
+
+     Methods:
+         extract_pdf_elements() -> List:
+             Extracts images, tables, and text chunks from a PDF file.
+
+         categorize_elements(raw_pdf_elements: List) -> None:
+             Categorizes extracted elements from a PDF into tables and texts.
+
+         merge_chunks() -> List[str]:
+             Merges text chunks based on punctuation and character case criteria.
+
+         should_skip_chunk(chunk: str) -> bool:
+             Determines if a chunk should be skipped based on its content.
+
+         should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
+             Determines if the current chunk should be merged with the next one.
+
+         process_pdf() -> Tuple[List[str], List[str]]:
+             Processes the PDF by extracting, categorizing, and merging elements.
+
+         process_pdf_file(uploaded_file) -> Tuple[List[str], List[str], str]:
+             Processes an uploaded PDF file to extract and categorize text and tables.
+     """
+
+     def __init__(self, config: Dict[str, any]):
+         self.file_path = None
+         self.folder_path = None
+         self.file_name = None
+         self.texts = []
+         self.tables = []
+         self.config = config if config is not None else self.default_config()
+         logger.info("Initialized PDFProcessor_Unstructured (no file set yet).")
+
+     @staticmethod
+     def default_config() -> Dict[str, any]:
+         """
+         Returns the default configuration for PDF processing.
+
+         Returns:
+             Dict[str, any]: Default configuration options.
+         """
+         return {
+             "extract_images": False,
+             "infer_table_structure": True,
+             "chunking_strategy": "by_title",
+             "max_characters": 10000,
+             "combine_text_under_n_chars": 100,
+             "strategy": "fast",
+             "model_name": "yolox"
+         }
+
+     def extract_pdf_elements(self) -> List:
+         """
+         Extracts images, tables, and text chunks from a PDF file.
+
+         Returns:
+             List: A list of extracted elements from the PDF.
+         """
+         logger.info("Starting extraction of PDF elements.")
+         try:
+             extracted_elements = partition_pdf(
+                 filename=self.file_path,
+                 extract_images_in_pdf=self.config["extract_images"],
+                 infer_table_structure=self.config["infer_table_structure"],
+                 chunking_strategy=self.config["chunking_strategy"],
+                 strategy=self.config["strategy"],
+                 max_characters=self.config["max_characters"],
+                 combine_text_under_n_chars=self.config["combine_text_under_n_chars"],
+                 image_output_dir_path=self.folder_path,
+             )
+             logger.info("Extraction of PDF elements completed successfully.")
+             return extracted_elements
+         except Exception as e:
+             logger.error(f"Error extracting PDF elements: {e}", exc_info=True)
+             raise
+
+     def categorize_elements(self, raw_pdf_elements: List) -> None:
+         """
+         Categorizes extracted elements from a PDF into tables and texts.
+
+         Args:
+             raw_pdf_elements (List): A list of elements extracted from the PDF.
+         """
+         logger.debug("Starting categorization of PDF elements.")
+         for element in raw_pdf_elements:
+             element_type = str(type(element))
+             if "unstructured.documents.elements.Table" in element_type:
+                 self.tables.append(str(element))
+             elif "unstructured.documents.elements.CompositeElement" in element_type:
+                 self.texts.append(str(element))
+
+         logger.debug("Categorization of PDF elements completed.")
+
+     def merge_chunks(self) -> List[str]:
+         """
+         Merges text chunks based on punctuation and character case criteria.
+
+         Returns:
+             List[str]: A list of merged text chunks.
+         """
+         logger.debug("Starting merging of text chunks.")
+
+         merged_chunks = []
+         skip_next = False
+
+         for i, current_chunk in enumerate(self.texts[:-1]):
+             # Skip this chunk if it was already merged into the previous one
+             if skip_next:
+                 skip_next = False
+                 continue
+
+             next_chunk = self.texts[i + 1]
+
+             if self.should_skip_chunk(current_chunk):
+                 continue
+
+             if self.should_merge_with_next(current_chunk, next_chunk):
+                 merged_chunks.append(current_chunk + " " + next_chunk)
+                 skip_next = True
+             else:
+                 merged_chunks.append(current_chunk)
+
+         if self.texts and not skip_next:
+             merged_chunks.append(self.texts[-1])
+
+         logger.debug("Merging of text chunks completed.")
+
+         return merged_chunks
+
+     @staticmethod
+     def should_skip_chunk(chunk: str) -> bool:
+         """
+         Determines if a chunk should be skipped based on its content.
+
+         Args:
+             chunk (str): The text chunk to be evaluated.
+
+         Returns:
+             bool: True if the chunk should be skipped, False otherwise.
+         """
+         return (not chunk or
+                 chunk.lower().startswith(("figure", "fig", "table")) or
+                 not chunk[0].isalnum() or
+                 re.match(r'^\d+\.', chunk) is not None)
+
+     @staticmethod
+     def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
+         """
+         Determines if the current chunk should be merged with the next one.
+
+         Args:
+             current_chunk (str): The current text chunk.
+             next_chunk (str): The next text chunk.
+
+         Returns:
+             bool: True if the chunks should be merged, False otherwise.
+         """
+         return (current_chunk.endswith(",") or
+                 (current_chunk[-1].islower() and next_chunk[0].islower()))
+
+     def extract_title_from_pdf(self, uploaded_file):
+         """
+         Extracts the title from a PDF file's metadata.
+
+         This function reads the metadata of a PDF file using pypdf and attempts to
+         extract the title. If the title is present in the metadata, it is returned.
+         Otherwise, a default message indicating that the title was not found is returned.
+
+         Parameters:
+             uploaded_file (file): A file object or a path to the PDF file from which
+                                   to extract the title. The file must be opened in binary mode.
+
+         Returns:
+             str: The title of the PDF file as a string. If no title is found, returns
+                  'Title not found'.
+         """
+         # Initialize PDF reader
+         pdf_reader = PdfReader(uploaded_file)
+
+         # Extract document information
+         meta = pdf_reader.metadata
+
+         # Retrieve title from document information
+         title = meta.title if meta and meta.title else 'Title not found'
+         return title
+
+     def process_pdf(self) -> Tuple[List[str], List[str]]:
+         """
+         Processes the PDF by extracting, categorizing, and merging elements.
+
+         Returns:
+             Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
+         """
+         logger.info("Starting processing of the PDF.")
+         try:
+             raw_pdf_elements = self.extract_pdf_elements()
+             self.categorize_elements(raw_pdf_elements)
+             merged_chunks = self.merge_chunks()
+             return merged_chunks, self.tables
+         except Exception as e:
+             logger.error(f"Error processing PDF: {e}", exc_info=True)
+             raise
+
+     def process_pdf_file(self, uploaded_file):
+         """
+         Process an uploaded PDF file.
+
+         If a new file is uploaded, the previously stored file is deleted.
+         The method updates the file path, processes the PDF, and returns the results.
+
+         Parameters:
+             uploaded_file: The new PDF file uploaded for processing.
+
+         Returns:
+             The merged text chunks, the extracted tables, and the document title.
+         """
+         # Delete the previous file if it exists
+         if self.file_path and os.path.exists(self.file_path):
+             try:
+                 os.remove(self.file_path)
+                 logging.debug(f"Previous file {self.file_path} deleted.")
+             except Exception as e:
+                 logging.warning(f"Error deleting previous file: {e}", exc_info=True)
+
+         # Process the new file
+         self.file_path = str(uploaded_file)
+         self.folder_path = os.path.dirname(self.file_path)
+         logging.info(f"Starting to process the PDF file: {self.file_path}")
+
+         try:
+             logging.debug(f"Processing PDF at {self.file_path}")
+             results = self.process_pdf()
+             title = self.extract_title_from_pdf(self.file_path)
+             logging.info("PDF processing completed successfully.")
+             return (*results, title)
+         except Exception as e:
+             logging.error(f"Error processing PDF file: {e}", exc_info=True)
+             raise
+
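A minimal end-to-end sketch for the processor (the sample path is illustrative; unstructured and its system dependencies must be installed):

    processor = PDFProcessor_Unstructured(PDFProcessor_Unstructured.default_config())
    chunks, tables, title = processor.process_pdf_file("Id_42.pdf")
    print(title, len(chunks), len(tables))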
+
+ class HybridRetriever(BaseRetriever):
+     """
+     A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
+     Inherits from BaseRetriever.
+
+     This class uses two different retrieval methods and merges their results to provide a
+     comprehensive set of documents in response to a query. It ensures diversity in the
+     retrieved documents by leveraging the strengths of both retrieval methods.
+
+     Attributes:
+         vector_retriever: An instance of a vector-based retriever.
+         bm25_retriever: An instance of a BM25 retriever.
+
+     Methods:
+         __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
+         _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
+         _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
+     """
+
+     def __init__(self, vector_retriever, bm25_retriever):
+         super().__init__()
+         self.vector_retriever = vector_retriever
+         self.bm25_retriever = bm25_retriever
+         logger.info("HybridRetriever initialized with vector and BM25 retrievers.")
+
+     def _retrieve(self, query: str, **kwargs) -> List:
+         """
+         Retrieves and combines results from both vector and BM25 retrievers.
+
+         Args:
+             query: The query string for document retrieval.
+             **kwargs: Additional keyword arguments for retrieval.
+
+         Returns:
+             List: Combined list of unique nodes retrieved from both methods.
+         """
+         logger.info(f"Retrieving documents for query: {query}")
+         try:
+             bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
+             vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
+             combined_nodes = self._combine_results(bm25_nodes, vector_nodes)
+
+             logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
+             return combined_nodes
+         except Exception as e:
+             logger.error(f"Error in retrieval: {e}")
+             raise
+
+     @staticmethod
+     def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
+         """
+         Combines and de-duplicates results from BM25 and vector retrievers.
+
+         Args:
+             bm25_nodes: Nodes retrieved from the BM25 retriever.
+             vector_nodes: Nodes retrieved from the vector retriever.
+
+         Returns:
+             List: Combined list of unique nodes.
+         """
+         node_ids: Set = set()
+         combined_nodes = []
+
+         for node in bm25_nodes + vector_nodes:
+             if node.node_id not in node_ids:
+                 combined_nodes.append(node)
+                 node_ids.add(node.node_id)
+
+         return combined_nodes
+
+
+ class PDFQueryEngine:
+     """
+     A class to handle the process of setting up a query engine and performing queries on PDF documents.
+
+     This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
+     indexes, hybrid retrievers, and response synthesizers, and of executing queries on the resulting engine.
+
+     Attributes:
+         documents (List): A list of documents to be indexed.
+         llm (Language Model): The language model to be used for queries.
+         embed_model: The embedding model used for indexing and retrieval.
+         qa_prompt_tmpl (str): Template for creating query prompts.
+
+     Methods:
+         setup_query_engine(): Sets up the query engine with all necessary components.
+         evaluate_with_llm(): Scores a document against the configured criteria queries.
+     """
+
+     def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
+         self.documents = documents
+         self.llm = llm
+         self.embed_model = embed_model
+         self.qa_prompt_tmpl = qa_prompt_tmpl
+         self.base_utils = base_utils()
+         self.config_manager = ConfigManager()
+
+         logger.info("PDFQueryEngine initialized.")
+
+     def format_example(self, example):
+         """
+         Formats a few-shot example into a string.
+
+         Args:
+             example (dict): A dictionary containing 'query', 'score', and 'reasoning' for the few-shot example.
+
+         Returns:
+             str: Formatted few-shot example text.
+         """
+         return "Example:\nQuery: {}\nScore: {}\nReasoning: {}\n".format(
+             example['query'], example['score'], example['reasoning']
+         )
+
+     def setup_query_engine(self):
+         """
+         Sets up the query engine by initializing and configuring the embedding model, service context, index,
+         hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.
+
+         Returns:
+             Any: The configured query engine.
+         """
+         client = qdrant_client.QdrantClient(
+             # you can use :memory: mode for fast and lightweight experiments;
+             # it does not require Qdrant to be deployed anywhere,
+             # but it requires qdrant-client >= 1.1.1
+             location=":memory:"
+             # otherwise set the Qdrant instance address with:
+             # uri="http://<host>:<port>"
+             # and set the API key for Qdrant Cloud:
+             # api_key="<qdrant-api-key>",
+         )
+         try:
+             logger.info("Initializing the service context for query engine setup.")
+             service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
+             vector_store = QdrantVectorStore(client=client, collection_name="med_library")
+             storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+             logger.info("Creating an index from documents.")
+             index = VectorStoreIndex.from_documents(documents=self.documents, storage_context=storage_context, service_context=service_context)
+             nodes = service_context.node_parser.get_nodes_from_documents(self.documents)
+
+             logger.info("Setting up vector and BM25 retrievers.")
+             vector_retriever = index.as_retriever(similarity_top_k=3)
+             bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)
+             hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)
+
+             logger.info("Configuring the response synthesizer with the prompt template.")
+             qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
+             response_synthesizer = get_response_synthesizer(
+                 service_context=service_context,
+                 text_qa_template=qa_prompt,
+                 response_mode="compact",
+             )
+
+             logger.info("Assembling the query engine with reranker and synthesizer.")
+             reranker = SentenceTransformerRerank(top_n=3, model="BAAI/bge-reranker-base")
+             query_engine = RetrieverQueryEngine.from_args(
+                 retriever=hybrid_retriever,
+                 node_postprocessors=[reranker],
+                 response_synthesizer=response_synthesizer,
+             )
+
+             logger.info("Query engine setup complete.")
+             return query_engine
+         except Exception as e:
+             logger.error(f"Error during query engine setup: {e}")
+             raise
+
+     def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any, queries: List[str]) -> Tuple[int, int, float, List[str]]:
+         """
+         Evaluate documents using a language model based on various criteria.
+
+         Args:
+             reg_result (Any): Result related to registration.
+             peer_result (Any): Result related to peer review.
+             guidelines_result (Any): Result related to following reporting guidelines.
+             queries (List[str]): A list of queries to be processed.
+
+         Returns:
+             Tuple[int, int, float, List[str]]: The total score, the number of criteria met,
+             the score as a percentage, and the list of reasoning strings (one per query).
+         """
+         logger.info("Starting evaluation with LLM.")
+         self.config_manager.load_config("few_shot", "few_shot.json")
+         query_engine = self.setup_query_engine()
+
+         total_score = 0
+         criteria_met = 0
+         reasoning = []
+
+         for j, query in enumerate(queries):
+             # Handle special cases based on the value of j and other conditions
+             if j == 1 and reg_result:
+                 extracted_data = {"score": 1, "reasoning": reg_result[0]}
+             elif j == 2 and guidelines_result:
+                 extracted_data = {"score": 1, "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
+             elif j == 8 and (guidelines_result or peer_result):
+                 extracted_data = {"score": 1, "reasoning": "The article is published in a peer-reviewed journal."}
+             else:
+                 # Execute the query
+                 result = query_engine.query(query).response
+                 extracted_data = self.base_utils.extract_score_reasoning(result)
+
+             # Validate and accumulate the scores
+             extracted_data_score = 0 if extracted_data.get("score") is None else int(extracted_data.get("score"))
+             if extracted_data_score > 0:
+                 criteria_met += 1
+             reasoning.append(extracted_data["reasoning"])
+             total_score += extracted_data_score
+
+         score_percentage = (float(total_score) / len(queries)) * 100
+         logger.info("Evaluation completed.")
+         return total_score, criteria_met, score_percentage, reasoning
+
+
+ class MixtralLLM(CustomLLM):
+     """
+     A custom language model class for interfacing with the Hugging Face Inference API, specifically the Mixtral model.
+
+     Attributes:
+         context_window (int): Number of tokens used for context during inference.
+         num_output (int): Number of tokens to generate as output.
+         temperature (float): Sampling temperature for token generation.
+         model_name (str): Name of the model on Hugging Face's model hub.
+         api_key (str): API key for authenticating with the Hugging Face API.
+
+     Methods:
+         metadata: Retrieves metadata about the model.
+         do_hf_call: Makes an API call to the Hugging Face model.
+         complete: Generates a complete response for a given prompt.
+         stream_complete: Streams a series of token completions for a given prompt.
+     """
+     context_window: int = Field(..., description="Number of tokens used for context during inference.")
+     num_output: int = Field(..., description="Number of tokens to generate as output.")
+     temperature: float = Field(..., description="Sampling temperature for token generation.")
+     model_name: str = Field(..., description="Name of the model on Hugging Face's model hub.")
+     api_key: str = Field(..., description="API key for authenticating with the Hugging Face API.")
+
+     @property
+     def metadata(self) -> LLMMetadata:
+         """
+         Retrieves metadata for the Mixtral LLM.
+
+         Returns:
+             LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
+         """
+         return LLMMetadata(
+             context_window=self.context_window,
+             num_output=self.num_output,
+             model_name=self.model_name,
+         )
+
+     def do_hf_call(self, prompt: str) -> str:
+         """
+         Makes an API call to the Hugging Face model and retrieves the generated response.
+
+         Args:
+             prompt (str): The input prompt for the model.
+
+         Returns:
+             str: The text generated by the model in response to the prompt, or a fallback
+                  message if the API call fails.
+         """
+         data = {
+             "inputs": prompt,
+             "parameters": {"temperature": self.temperature}
+         }
+
+         # Makes a POST request to the Hugging Face API to get the model's response
+         response = requests.post(
+             f'https://api-inference.huggingface.co/models/{self.model_name}',
+             headers={
+                 'authorization': f'Bearer {self.api_key}',
+                 'content-type': 'application/json',
+             },
+             json=data,
+             stream=True
+         )
+
+         # Checks for a successful response and parses the generated text
+         if response.status_code != 200 or not response.json() or 'error' in response.json():
+             logger.error(f"Error from Hugging Face API: {response}")
+             return "Unable to answer for technical reasons."
+         full_txt = response.json()[0]['generated_text']
+         # Finds the section of the text following the context separator
+         offset = full_txt.find("---------------------")
+         ss = full_txt[offset:]
+         # Extracts the actual answer from the response
+         offset = ss.find("Answer:")
+         return ss[offset + 7:].strip()
+
+     @llm_completion_callback()
+     def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
+         """
+         Generates a complete response for a given prompt using the Hugging Face API.
+
+         Args:
+             prompt (str): The input prompt for the model.
+             **kwargs: Additional keyword arguments for the completion.
+
+         Returns:
+             CompletionResponse: The complete response from the model.
+         """
+         response = self.do_hf_call(prompt)
+         return CompletionResponse(text=response)
+
+     @llm_completion_callback()
+     def stream_complete(
+         self, prompt: str, **kwargs: Any
+     ) -> CompletionResponseGen:
+         """
+         Streams a series of token completions as a response for the given prompt.
+
+         This method is useful for streaming responses where each token is generated sequentially.
+
+         Args:
+             prompt (str): The input prompt for the model.
+             **kwargs: Additional keyword arguments for the streaming completion.
+
+         Yields:
+             CompletionResponseGen: A generator yielding each token in the completion response.
+         """
+         # Yields the completion character by character (the underlying HF call is not streamed)
+         response = ""
+         for token in self.do_hf_call(prompt):
+             response += token
+             yield CompletionResponse(text=response, delta=token)
+
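A minimal instantiation sketch, with parameter values taken from model_config.json and the token assumed to be in the environment:

    llm = MixtralLLM(
        context_window=5000,
        num_output=512,
        temperature=0.2,
        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
        api_key=os.environ["HF_TOKEN"],  # assumed to be set
    )
    print(llm.complete("What is a randomized controlled trial?").text)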
+
+ class KeywordSearch():
+     def __init__(self, chunks):
+         self.chunks = chunks
+
+     def find_journal_name(self, response: str, journal_list: list) -> bool:
+         """
+         Searches for a journal name in a given response string.
+
+         This function iterates through a list of known journal names and checks whether any of
+         them appear in the response string (case-insensitively).
+
+         Args:
+             response (str): The response string to search for a journal name.
+             journal_list (list): A list of journal names to search within the response.
+
+         Returns:
+             bool: True if any journal name from the list is found in the response, False otherwise.
+         """
+         response_lower = response.lower()
+         for journal in journal_list:
+             journal_lower = journal.lower()
+
+             if journal_lower in response_lower:
+                 return True
+
+         return False
+
+     def check_registration(self):
+         """
+         Check the stored chunks of text for various registration numbers or registry URLs.
+         Returns the sentence containing a registration number or, if none is found,
+         the chunks containing registry URLs.
+
+         Returns:
+             list of str: List of matching sentences or chunks, or an empty list if no matches are found.
+         """
+         # Patterns for different registration types
+         patterns = {
+             "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
+             "ISRCTN": r"(ISRCTN\d{8})",
+             "EudraCT": r"(\d{4}-\d{6}-\d{2})",
+             "UMIN-CTR": r"(UMIN\d{9})",
+             "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
+         }
+
+         # Registry URLs
+         registry_urls = [
+             "www.anzctr.org.au",
+             "anzctr.org.au",
+             "www.clinicaltrials.gov",
+             "clinicaltrials.gov",
+             "www.ISRCTN.org",
+             "ISRCTN.org",
+             "www.umin.ac.jp/ctr/index/htm",
+             "umin.ac.jp/ctr/index/htm",
+             "www.onderzoekmetmensen.nl/en",
+             "onderzoekmetmensen.nl/en",
+             "eudract.ema.europa.eu",
+             "www.eudract.ema.europa.eu"
+         ]
+
+         # Check each chunk for registration numbers
+         for chunk in self.chunks:
+             # Split chunk into sentences
+             sentences = re.split(r'(?<=[.!?]) +', chunk)
+
+             # Check each sentence for any registration number
+             for sentence in sentences:
+                 for pattern in patterns.values():
+                     if re.search(pattern, sentence):
+                         return [sentence]  # Return immediately if a registration number is found
+
+         # If no registration number is found, check for URLs in chunks
+         matching_chunks = []
+         for chunk in self.chunks:
+             if any(url in chunk for url in registry_urls):
+                 matching_chunks.append(chunk)
+
+         return matching_chunks
+
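A short sketch of the registration check on toy chunks:

    search = KeywordSearch([
        "The trial was registered (NCT 12345678) before enrolment began.",
        "Protocols are listed on clinicaltrials.gov for reference.",
    ])
    print(search.check_registration())
    # ['The trial was registered (NCT 12345678) before enrolment began.']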
+
+ class StringExtraction():
+     """
+     A class to handle the process of extracting query strings from complete LLM responses.
+
+     It encapsulates the functionality of extracting the original ground truth from a labelled-data CSV
+     and query strings from responses. Note that LLMs may format their answers differently depending on
+     the model or the prompting technique; in such cases extract_original_prompt may not give satisfactory
+     results, and the best option is to write your own string extraction method.
+
+     Methods:
+         extract_original_prompt(result): Splits a response into its binary answer and its reasoning.
+         extraction_ground_truth(paper_name, labelled_data): Recovers the binary and textual ground truth for a paper.
+     """
+
+     def extract_original_prompt(self, result):
+         r1 = result.response.strip().split("\n")
+         binary_response = ""
+         explanation_response = ""
+         for r in r1:
+             if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
+                 binary_response = r
+             elif r.find("Reasoning:") >= 0:
+                 cut = r.find(":")
+                 explanation_response += r[cut + 1:].strip()
+
+         return binary_response, explanation_response
+
+     def extraction_ground_truth(self, paper_name, labelled_data):
+         id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
+         id_row = labelled_data[labelled_data["id"] == id]
+         ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
+         binary_ground_truth = []
+         explanation_ground_truth = []
+         for g in ground_truth:
+             if len(g) > 0:
+                 binary_ground_truth.append("Yes")
+                 explanation_ground_truth.append(g)
+             else:
+                 binary_ground_truth.append("No")
+                 explanation_ground_truth.append("The article does not provide any relevant information.")
+         return binary_ground_truth, explanation_ground_truth
+
+
+ class EvaluationMetrics():
+     """
+     This class encapsulates the evaluation methods that have been used in the project.
+
+     Attributes:
+         explanation_response: a list of detailed responses from the LLM, one per query.
+         explanation_ground_truth: the list of ground-truth explanations, one per query.
+         embedding_model: the model used to embed responses and ground truth for comparison.
+
+     Methods:
+         metric_cosine_similarity(): Computes the cosine similarity between each response and its ground truth.
+         metric_rouge(): Computes ROUGE scores for the responses against the ground truth.
+         binary_accuracy(): Computes the fraction of binary answers that match the ground truth.
+     """
+
+     def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
+         self.explanation_response = explanation_response
+         self.explanation_ground_truth = explanation_ground_truth
+         self.embedding_model = embedding_model
+
+     def metric_cosine_similarity(self):
+         ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
+         explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
+         return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))
+
+     def metric_rouge(self):
+         rouge = evaluate.load("rouge")
+         results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
+         return results
+
+     def binary_accuracy(self, binary_response, binary_ground_truth):
+         count = 0
+         if len(binary_response) != len(binary_ground_truth):
+             return "The arrays to be compared have different lengths."
+         else:
+             for i in range(len(binary_response)):
+                 if binary_response[i] == binary_ground_truth[i]:
+                     count += 1
+             return np.round(count / len(binary_response), 2)
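
A compact sketch of the evaluation flow, assuming sentence-transformers is installed and the lists are aligned:

    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
    metrics = EvaluationMetrics(
        explanation_response=["The authors share code on GitHub."],
        explanation_ground_truth=["Code is available in a public GitHub repository."],
        embedding_model=embedder,
    )
    print(metrics.metric_cosine_similarity())          # e.g. array([0.87])
    print(metrics.binary_accuracy(["Yes"], ["Yes"]))   # 1.0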
librarymed/huggingface/app_huggingface.py ADDED
@@ -0,0 +1,304 @@
1
+ import logging
2
+ import os
3
+
4
+ import gradio as gr
5
+ import openai
6
+ from fpdf import FPDF
7
+ from llama_index import Document
8
+ from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
9
+ from llama_index.llms import OpenAI
10
+
11
+ from RAG_utils_huggingface import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils, \
12
+ ConfigManager
13
+
14
+ # Configure basic logging
15
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
16
+
17
+ # Create a logger object
18
+ logger = logging.getLogger(__name__)
19
+
20
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
21
+
22
+ config_manager = ConfigManager()
23
+ # config_manager.load_config("api", "Config/api_config.json")
24
+ config_manager.load_config("model", "model_config.json")
25
+
26
+ openai.api_key = os.environ['OPENAI_API_KEY'] # config_manager.get_config_value("api", "OPENAI_API_KEY")
27
+ hf_token = os.environ['HF_TOKEN'] # config_manager.get_config_value("api", "HF_TOKEN")
28
+
29
+ # PDF rendering and chunking parameters
30
+ pdf_processing_config = config_manager.get_config_value("model", "pdf_processing")
31
+
32
+ ALLOWED_EXTENSIONS = config_manager.get_config_value("model", "allowed_extensions")
33
+ embed = config_manager.get_config_value("model", "embeddings")
34
+ embed_model_name = config_manager.get_config_value("model", "embeddings_model")
35
+
36
+ # llm_model = config_manager.get_config_value("model", "llm_model")
37
+ model_temperature = config_manager.get_config_value("model", "model_temp")
38
+ output_token_size = config_manager.get_config_value("model", "max_tokens")
39
+ model_context_window = config_manager.get_config_value("model", "context_window")
40
+
41
+ gpt_prompt_path = config_manager.get_config_value("model", "GPT_PROMPT_PATH")
42
+ mistral_prompt_path = config_manager.get_config_value("model", "MISTRAL_PROMPT_PATH")
43
+ info_prompt_path = config_manager.get_config_value("model", "INFO_PROMPT_PATH")
44
+
45
+ peer_review_journals_path = config_manager.get_config_value("model", "peer_review_journals_path")
46
+ eq_network_journals_path = config_manager.get_config_value("model", "eq_network_journals_path")
47
+
48
+ queries = config_manager.get_config_value("model", "queries")
49
+ criteria = config_manager.get_config_value("model", "criteria")
50
+ num_criteria = len(queries)
51
+
52
+ author_query = config_manager.get_config_value("model", "author_query")
53
+ journal_query = config_manager.get_config_value("model", "journal_query")
54
+
55
+
56
+ # Helper function to check if the file extension is allowed
57
+ def allowed_file(filename):
58
+ return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
59
+
60
+
61
+ def generate_score_bar(score, num_criteria):
62
+ # Convert and round the score from a 9-point scale to a 100-point scale
63
+ score_out_of_100 = round((score / num_criteria) * 100)
64
+
65
+ # Determine the color and text based on the original score
66
+ if score == 9:
67
+ color = "#4CAF50" # green
68
+ text = "Very good"
69
+ elif score in [7, 8]:
70
+ color = "#FFEB3B" # yellow
71
+ text = "Good"
72
+ elif score in [5, 6]:
73
+ color = "#FF9800" # orange
74
+ text = "Ok"
75
+ elif score in [3, 4]:
76
+ color = "#F44336" # red
77
+ text = "Bad"
78
+ else: # score < 3
79
+ color = "#800000" # maroon
80
+ text = "Very bad"
81
+
82
+ # Create the HTML for the score bar
83
+ score_bar_html = f"""
84
+ <div style="background-color: #ddd; border-radius: 10px; position: relative; height: 20px; width: 100%;">
85
+ <div style="background-color: {color}; height: 100%; border-radius: 10px; width: {score_out_of_100}%;"></div>
86
+ </div>
87
+ <p style="color: {color};">{text}</p> <!-- Display the text -->
88
+ """
89
+ return score_bar_html
90
+
91
+
92
+ class PDF(FPDF):
93
+ def __init__(self, *args, **kwargs):
94
+ super().__init__(*args, **kwargs)
95
+ # Load the DejaVu font files
96
+ self.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
97
+ self.add_font('DejaVu', 'B', 'DejaVuSansCondensed-Bold.ttf', uni=True)
98
+ self.add_font('DejaVu', 'I', 'DejaVuSansCondensed-Oblique.ttf', uni=True)
99
+
100
+ def header(self):
101
+ self.set_font('DejaVu', 'B', 12)
102
+ self.cell(0, 10, 'Paper Analysis Report', 0, 1, 'C')
103
+
104
+ def footer(self):
105
+ self.set_y(-15)
106
+ self.set_font('DejaVu', 'I', 8)
107
+ self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
108
+
109
+
110
+ import os
111
+
112
+
113
+ def create_pdf_report(title, author_info, score, criteria, reasoning_list, output_path):
114
+ pdf = PDF()
115
+ pdf.add_page()
116
+
117
+ # Set margins
118
+ pdf.set_left_margin(10)
119
+ pdf.set_right_margin(10)
120
+
121
+ # Title
122
+ pdf.set_font("DejaVu", 'B', 14)
123
+ pdf.cell(0, 10, "Title:", 0, 1)
124
+ pdf.set_font("DejaVu", '', 12)
125
+ pdf.multi_cell(0, 10, title, 0, 1)
126
+
127
+ # Author Information
128
+ pdf.set_font("DejaVu", 'B', 14)
129
+ pdf.cell(0, 10, "Author Information:", 0, 1)
130
+ pdf.set_font("DejaVu", '', 12)
131
+ pdf.multi_cell(0, 10, author_info, 0, 1)
132
+
133
+ # Score
134
+ pdf.set_font("DejaVu", 'B', 14)
135
+ pdf.cell(0, 10, "Score:", 0, 1)
136
+ pdf.set_font("DejaVu", '', 12)
137
+ pdf.multi_cell(0, 10, score, 0, 1)
138
+
139
+ # Reasoning - each reasoning with a green heading in bold
140
+ for heading, reasoning in zip(criteria, reasoning_list):
141
+ print(reasoning)
142
+ pdf.set_font("DejaVu", 'B', 14)
143
+ pdf.set_text_color(0, 128, 0) # Green color
144
+ pdf.multi_cell(0, 10, heading, 0, 1)
145
+ pdf.set_text_color(0, 0, 0) # Reset to black color
146
+ pdf.set_font("DejaVu", '', 12)
147
+ pdf.multi_cell(0, 10, reasoning, 0, 1)
148
+
149
+ # Save the PDF to the specified output path
150
+ pdf.output(output_path)
151
+
152
+ return output_path # Return the path to the generated report
153
+
154
+
155
+ def check_title_for_review(uploaded_files):
156
+ title_message = "All articles are valid for review."
157
+ if not uploaded_files:
158
+ title_message = "No files uploaded or upload canceled."
159
+ else:
160
+ for uploaded_file in uploaded_files:
161
+ pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
162
+ title = pdf_processor.extract_title_from_pdf(uploaded_file)
163
+ if 'review' in title.lower():
164
+ title_message = "One or more files are review papers. Hence the evaluation may not be accurate."
165
+
166
+ return title_message
167
+
168
+
169
+ def process_pdf(uploaded_files, llm_model, n_criteria=num_criteria):
170
+ # Initialize aggregation variables
171
+ final_score = 0
172
+ final_reasoning = []
173
+ final_score_bar_html = ""
174
+ final_author_info_html = ""
175
+ final_title_info_html = ""
176
+ output_files = []
177
+ for i, uploaded_file in enumerate(uploaded_files):
178
+ # Process the PDF file
179
+ file_name_without_extension = os.path.splitext(os.path.basename(uploaded_file))[0]
180
+ file_name_without_extension
181
+
182
+ pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
183
+ merged_chunks, tables, title = pdf_processor.process_pdf_file(uploaded_file)
184
+ documents = [Document(text=t) for t in merged_chunks]
185
+
186
+ # Prompts and Queries
187
+ utils = base_utils()
188
+
189
+ info_prompt = utils.read_from_file(info_prompt_path)
190
+
191
+ # LLM Model choice
192
+ try:
193
+ if llm_model == "Model 1":
194
+ llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=output_token_size)
195
+ general_prompt = utils.read_from_file(gpt_prompt_path)
196
+
197
+ elif llm_model == "Model 2":
198
+ if any(param is None for param in
199
+ [model_context_window, output_token_size, model_temperature, hf_token]):
200
+ raise ValueError("All parameters are required for Mistral LLM.")
201
+
202
+ llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
203
+ temperature=model_temperature, model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
204
+ api_key=hf_token)
205
+ general_prompt = utils.read_from_file(mistral_prompt_path)
206
+ else:
207
+ raise ValueError(f"Unsupported language model: {llm_model}")
208
+
209
+ except Exception as e:
210
+ logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
211
+ raise # Or handle the exception as needed
212
+
213
+ # Embedding model choice for RAG
214
+ try:
215
+ if embed == "openai":
216
+ embed_model = OpenAIEmbedding(model="text-embedding-3-large")
217
+
218
+ elif embed == "huggingface":
219
+ # Use the specified model name
220
+ embed_model = HuggingFaceEmbedding(embed_model_name)
221
+
222
+ else:
223
+ raise ValueError(f"Unsupported embedding model: {embed_model}")
224
+
225
+ except Exception as e:
226
+ logger.error(f"Error initializing embedding model: {e}", exc_info=True)
227
+ raise
228
+
229
+ peer_review_journals = utils.read_from_file(peer_review_journals_path)
230
+ eq_network_journals = utils.read_from_file(eq_network_journals_path)
231
+
232
+ peer_review_journals_list = peer_review_journals.split('\n')
233
+ eq_network_journals_list = eq_network_journals.split('\n')
234
+
235
+ modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
236
+ peer_review_journals_list) + "?"
237
+
238
+ info_llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=100)
239
+ pdf_info_query = PDFQueryEngine(documents, info_llm, embed_model, (info_prompt))
240
+ info_query_engine = pdf_info_query.setup_query_engine()
241
+ journal_result = info_query_engine.query(modified_journal_query).response
242
+ author_result = info_query_engine.query(author_query).response
243
+
244
+ pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, (general_prompt))
245
+
246
+ # Check for prior registration
247
+ nlp_methods = KeywordSearch(merged_chunks)
248
+ eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
249
+ peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
250
+ registration_result = nlp_methods.check_registration()
251
+
252
+ # Evaluate with OpenAI model
253
+ total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
254
+ registration_result, peer_journal_result, eq_journal_result, queries)
255
+
256
+ # Convert reasoning list to plain text
257
+ # reasoning_text = "\n".join([f"{idx + 1}. {reason}" for idx, reason in enumerate(reasoning)])
258
+
259
+ # Generate the score bar HTML
260
+ score_bar_html = generate_score_bar(total_score, n_criteria)
261
+ scaled_total_score = str(round((total_score / n_criteria) * 100)) + "/100"
262
+
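+ # Worked example of the scaling above (illustrative): with the 9 criteria from
+ # model_config.json and a total score of 7, round((7 / 9) * 100) == 78, so the
+ # report shows "78/100".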
263
+ output_dir = "/tmp"
264
+ base_name = os.path.splitext(os.path.basename(uploaded_file))[0]
265
+ output_path = os.path.join(output_dir, f"{base_name}_report.pdf")
266
+
267
+ create_pdf_report(title, author_result, scaled_total_score, criteria, reasoning, output_path)
268
+ output_files.append(output_path)
269
+
270
+ # Construct the processing message
271
+ processing_message = f"Processing complete. {len(uploaded_files)} reports generated. Please download your reports below."
272
+
273
+ return processing_message, output_files
274
+ # Return the score as a string and the reasoning as HTML
275
+ # return str(round((total_score / n_criteria) * 100)) + "/100", score_bar_html, reasoning_html, author_info_html, title_info_html
276
+
277
+
278
+ with gr.Blocks(theme=gr.themes.Glass(
279
+ text_size="sm",
280
+ font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"],
281
+ primary_hue="neutral",
282
+ secondary_hue="gray")) as demo:
283
+ gr.Markdown("## Med Library")
284
+ with gr.Row():
285
+ file_upload = gr.File(label="Choose papers", file_types=['.pdf'], file_count="multiple")
286
+
287
+ title_check_output = gr.Textbox(label="Warnings", interactive=False)
288
+ file_upload.change(fn=check_title_for_review, inputs=file_upload, outputs=title_check_output)
289
+
290
+ with gr.Row():
291
+ model_choice = gr.Dropdown(["Model 1", "Model 2"], label="Choose a model", value="Model 1")
292
+ submit_button = gr.Button("Evaluate")
293
+
294
+ processing_message_output = gr.Textbox(label="Processing Status", interactive=False)
295
+ report_download_links = gr.File(label="Download Reports", type="filepath", file_count="multiple")
296
+
297
+ submit_button.click(
298
+ fn=process_pdf,
299
+ inputs=[file_upload, model_choice],
300
+ outputs=[processing_message_output, report_download_links]
301
+ )
302
+
303
+
304
+ demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
librarymed/kromin/RAG_utils.py ADDED
@@ -0,0 +1,983 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import time
6
+ from tempfile import NamedTemporaryFile
7
+ from typing import Any, List, Tuple, Set, Dict, Optional, Union
8
+
9
+ import evaluate
10
+ import numpy as np
11
+ import pandas as pd
12
+ import requests
13
+ from llama_index import PromptTemplate
14
+ from llama_index import VectorStoreIndex, ServiceContext
15
+ from llama_index import get_response_synthesizer
16
+ from llama_index.llms import (
17
+ CustomLLM,
18
+ CompletionResponse,
19
+ CompletionResponseGen,
20
+ LLMMetadata,
21
+ )
22
+ from llama_index.llms.base import llm_completion_callback
23
+ from llama_index.postprocessor import SentenceTransformerRerank
24
+ from llama_index.query_engine import RetrieverQueryEngine
25
+ from llama_index.retrievers import BaseRetriever, BM25Retriever
26
+ from sklearn.metrics.pairwise import cosine_similarity
27
+ from unstructured.partition.pdf import partition_pdf
28
+ from pypdf import PdfReader
29
+
30
+
31
+ # Configure basic logging
32
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
33
+
34
+ # Create a logger object
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class ConfigManager:
39
+ """
40
+ A class to manage loading and accessing configuration settings.
41
+
42
+ Attributes:
43
+ configs (dict): Dictionary mapping configuration names to their settings.
44
+
45
+ Methods:
46
+ load_config(config_path: str): Loads the configuration from a given JSON file.
47
+ get_config_value(key: str): Retrieves a specific configuration value.
48
+ """
49
+
50
+ def __init__(self):
51
+ self.configs = {}
52
+
53
+ def load_config(self, config_name: str, config_path: str) -> None:
54
+ """
55
+ Loads configuration settings from a specified JSON file into a named configuration.
56
+
57
+ Args:
58
+ config_name (str): The name to assign to this set of configurations.
59
+ config_path (str): The path to the configuration file.
60
+
61
+ Raises:
62
+ FileNotFoundError: If the config file is not found.
63
+ json.JSONDecodeError: If there is an error parsing the config file.
64
+ """
65
+ try:
66
+ with open(config_path, 'r') as f:
67
+ self.configs[config_name] = json.load(f)
68
+ except FileNotFoundError:
69
+ logging.error(f"Config file not found at {config_path}")
70
+ raise
71
+ except json.JSONDecodeError as e:
72
+ logging.error(f"Error decoding config file: {e}")
73
+ raise
74
+
75
+ def get_config_value(self, config_name: str, key: str) -> str:
76
+ """
77
+ Retrieves a specific configuration value.
78
+
79
+ Args:
80
+ config_name (str): The name of the loaded configuration set.
+ key (str): The key for the configuration setting.
81
+
82
+ Returns:
83
+ str: The value of the configuration setting.
84
+
85
+ Raises:
86
+ ValueError: If the key is not found or is set to a placeholder value.
87
+ """
88
+ value = self.configs.get(config_name, {}).get(key)
89
+ if value is None or value == "ENTER_YOUR_TOKEN_HERE":
90
+ raise ValueError(f"Please set your '{key}' in the config.json file.")
91
+ return value
92
+
93
+
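+ # Minimal usage sketch for ConfigManager (illustrative; the config name and path
+ # mirror how app_librarymed.py loads its settings):
+ # config_manager = ConfigManager()
+ # config_manager.load_config("model", "Config/model_config.json")
+ # llm_model = config_manager.get_config_value("model", "llm_model")  # e.g. "gpt-4"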
94
+ class base_utils:
95
+ """
96
+ A utility class providing miscellaneous static methods for processing and analyzing text data,
97
+ particularly from PDF documents and filenames. This class also includes methods for file operations.
98
+
99
+ This class encapsulates the functionality of extracting key information from text, such as scores,
100
+ reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
101
+ and reading content from files.
102
+
103
+ Attributes:
104
+ None (This class contains only static methods and does not maintain any state)
105
+
106
+ Methods:
107
+ extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
108
+ Extracts a score and reasoning from a given text using regular expressions.
109
+
110
+ extract_id_from_filename(filename: str) -> Optional[int]:
111
+ Extracts an ID from a given filename based on a specified pattern.
112
+
113
+ find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
114
+ Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.
115
+
116
+ read_from_file(file_path: str) -> str:
117
+ Reads the content of a file and returns it as a string.
118
+ """
119
+
120
+ @staticmethod
121
+ def read_from_file(file_path: str) -> str:
122
+ """
123
+ Reads the content of a file and returns it as a string.
124
+
125
+ Args:
126
+ file_path (str): The path to the file to be read.
127
+
128
+ Returns:
129
+ str: The content of the file.
130
+ """
131
+ with open(file_path, 'r') as prompt_file:
132
+ prompt = prompt_file.read()
133
+ return prompt
134
+
135
+ @staticmethod
136
+ def extract_id_from_filename(filename: str) -> Optional[int]:
137
+ """
138
+ Extracts an ID from a filename, assuming a specific format ('Id_{I}.pdf', where {I} is the ID).
139
+
140
+ Args:
141
+ filename (str): The filename from which to extract the ID.
142
+
143
+ Returns:
144
+ int: The extracted ID as an integer, or None if the pattern is not found.
145
+ """
146
+ # Assuming the file name is in the format 'Id_{I}.pdf', where {I} is the ID
147
+ match = re.search(r'Id_(\d+)\.pdf', filename)
148
+ if match:
149
+ return int(match.group(1)) # Convert to integer if ID is numeric
150
+ else:
151
+ return None
152
+
153
+ @staticmethod
154
+ def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
155
+ """
156
+ Extracts score and reasoning from a given text using regular expressions.
157
+
158
+ Args:
159
+ text (str): The text from which to extract the score and reasoning.
160
+
161
+ Returns:
162
+ dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
163
+ """
164
+ # Define regular expression patterns for score and reasoning
165
+ score_pattern = r"Score: (\d+)"
166
+ reasoning_pattern = r"Reasoning: (.+)"
167
+
168
+ # Extract data using regular expressions
169
+ score_match = re.search(score_pattern, text)
170
+ reasoning_match = re.search(reasoning_pattern, text, re.DOTALL) # re.DOTALL allows '.' to match newlines
171
+
172
+ # Extract and return the results
173
+ extracted_data = {
174
+ "score": score_match.group(1) if score_match else None,
175
+ "reasoning": reasoning_match.group(1).strip() if reasoning_match else None
176
+ }
177
+
178
+ return extracted_data
179
+
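+ # Illustrative example of the response format extract_score_reasoning expects:
+ # extract_score_reasoning("Score: 1\nReasoning: Data are shared on OSF.")
+ # -> {"score": "1", "reasoning": "Data are shared on OSF."}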
180
+ @staticmethod
181
+ def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
182
+ """
183
+ Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.
184
+
185
+ Args:
186
+ pdf_filename (str): The filename of the PDF.
187
+ dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.
188
+
189
+ Returns:
190
+ pandas.Series or str: The matched row from the dataframe or a message indicating
191
+ that no matching row or invalid filename was found.
192
+ """
193
+ pdf_id = base_utils.extract_id_from_filename(pdf_filename)
194
+ if pdf_id is not None:
195
+ # Assuming the first column contains the ID
196
+ matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
197
+ if not matched_row.empty:
198
+ return matched_row
199
+ else:
200
+ return "No matching row found."
201
+ else:
202
+ return "Invalid file name."
203
+
204
+
205
+ class PDFProcessor_Unstructured:
206
+ """
207
+ A class to process PDF files, providing functionalities for extracting, categorizing,
208
+ and merging elements from a PDF file.
209
+
210
+ This class is designed to handle unstructured PDF documents, particularly useful for
211
+ tasks involving text extraction, categorization, and data processing within PDFs.
212
+
213
+ Attributes:
214
+ file_path (str): The full path to the PDF file.
215
+ folder_path (str): The directory path where the PDF file is located.
216
+ file_name (str): The name of the PDF file.
217
+ texts (List[str]): A list to store extracted text chunks.
218
+ tables (List[str]): A list to store extracted tables.
219
+
220
+
221
+ Methods:
222
+ extract_pdf_elements() -> List:
223
+ Extracts images, tables, and text chunks from a PDF file.
224
+
225
+ categorize_elements(raw_pdf_elements: List) -> None:
226
+ Categorizes extracted elements from a PDF into tables and texts.
227
+
228
+ merge_chunks() -> List[str]:
229
+ Merges text chunks based on punctuation and character case criteria.
230
+
231
+ should_skip_chunk(chunk: str) -> bool:
232
+ Determines if a chunk should be skipped based on its content.
233
+
234
+ should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
235
+ Determines if the current chunk should be merged with the next one.
236
+
237
+ process_pdf() -> Tuple[List[str], List[str]]:
238
+ Processes the PDF by extracting, categorizing, and merging elements.
239
+
240
+ process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
241
+ Processes an uploaded PDF file to extract and categorize text and tables.
242
+ """
243
+
244
+ def __init__(self, config: Dict[str, Any]):
245
+ self.file_path = None
246
+ self.folder_path = None
247
+ self.file_name = None
248
+ self.texts = []
249
+ self.tables = []
250
+ self.config = config if config is not None else self.default_config()
251
+ logger.info(f"Initialized PdfProcessor_Unstructured for file: {self.file_name}")
252
+
253
+ @staticmethod
254
+ def default_config() -> Dict[str, Any]:
255
+ """
256
+ Returns the default configuration for PDF processing.
257
+
258
+ Returns:
259
+ Dict[str, Any]: Default configuration options.
260
+ """
261
+ return {
262
+ "extract_images": False,
263
+ "infer_table_structure": True,
264
+ "chunking_strategy": "by_title",
265
+ "max_characters": 10000,
266
+ "combine_text_under_n_chars": 100,
267
+ "strategy": "auto",
268
+ "model_name": "yolox"
269
+ }
270
+
271
+ def extract_pdf_elements(self) -> List:
272
+ """
273
+ Extracts images, tables, and text chunks from a PDF file.
274
+
275
+ Returns:
276
+ List: A list of extracted elements from the PDF.
277
+ """
278
+ logger.info("Starting extraction of PDF elements.")
279
+ try:
280
+ extracted_elements = partition_pdf(
281
+ filename=self.file_path,
282
+ extract_images_in_pdf=False,
283
+ infer_table_structure=True,
284
+ chunking_strategy="by_title",
285
+ max_characters=10000,
286
+ combine_text_under_n_chars=100,
287
+ image_output_dir_path=self.folder_path,
288
+ # strategy="fast",
289
+ )
290
+ logger.info("Extraction of PDF elements completed successfully.")
291
+ return extracted_elements
292
+ except Exception as e:
293
+ raise NotImplementedError(f"Error extracting PDF elements: {e}")
294
+
295
+ def categorize_elements(self, raw_pdf_elements: List) -> None:
296
+ """
297
+ Categorizes extracted elements from a PDF into tables and texts.
298
+
299
+ Args:
300
+ raw_pdf_elements (List): A list of elements extracted from the PDF.
301
+ """
302
+ logger.debug("Starting categorization of PDF elements.")
303
+ for element in raw_pdf_elements:
304
+ element_type = str(type(element))
305
+ if "unstructured.documents.elements.Table" in element_type:
306
+ self.tables.append(str(element))
307
+ elif "unstructured.documents.elements.CompositeElement" in element_type:
308
+ self.texts.append(str(element))
309
+
310
+ logger.debug("Categorization of PDF elements completed.")
311
+
312
+ def merge_chunks(self) -> List[str]:
313
+ """
314
+ Merges text chunks based on punctuation and character case criteria.
315
+
316
+ Returns:
317
+ List[str]: A list of merged text chunks.
318
+ """
319
+ logger.debug("Starting merging of text chunks.")
320
+
321
+ merged_chunks = []
322
+ skip_next = False
323
+
324
+ if not self.texts:
+ return merged_chunks
+
+ for i, current_chunk in enumerate(self.texts[:-1]):
325
+ next_chunk = self.texts[i + 1]
326
+
327
+ # Skip a chunk that was already merged into its predecessor
+ if skip_next:
+ skip_next = False
+ continue
+
+ if self.should_skip_chunk(current_chunk):
328
+ continue
329
+
330
+ if self.should_merge_with_next(current_chunk, next_chunk):
331
+ merged_chunks.append(current_chunk + " " + next_chunk)
332
+ skip_next = True
333
+ else:
334
+ merged_chunks.append(current_chunk)
335
+
336
+ if not skip_next:
337
+ merged_chunks.append(self.texts[-1])
338
+
339
+ logger.debug("Merging of text chunks completed.")
340
+
341
+ return merged_chunks
342
+
343
+ @staticmethod
344
+ def should_skip_chunk(chunk: str) -> bool:
345
+ """
346
+ Determines if a chunk should be skipped based on its content.
347
+
348
+ Args:
349
+ chunk (str): The text chunk to be evaluated.
350
+
351
+ Returns:
352
+ bool: True if the chunk should be skipped, False otherwise.
353
+ """
354
+ return (not chunk or
355
+ chunk.lower().startswith(("figure", "fig", "table")) or
356
+ not chunk[0].isalnum() or
+ bool(re.match(r'^\d+\.', chunk)))
357
+
358
+ @staticmethod
359
+ def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
360
+ """
361
+ Determines if the current chunk should be merged with the next one.
362
+
363
+ Args:
364
+ current_chunk (str): The current text chunk.
365
+ next_chunk (str): The next text chunk.
366
+
367
+ Returns:
368
+ bool: True if the chunks should be merged, False otherwise.
369
+ """
370
+ return (current_chunk.endswith(",") or
371
+ (current_chunk[-1].islower() and next_chunk[0].islower()))
372
+
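+ # Illustrative example: a chunk ending mid-sentence in a lowercase word is merged
+ # with a following chunk that also starts lowercase, e.g.
+ # should_merge_with_next("enrolled a cohort of", "patients aged 18 to 65") -> True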
373
+ def process_pdf(self) -> Tuple[List[str], List[str]]:
374
+ """
375
+ Processes the PDF by extracting, categorizing, and merging elements.
376
+
377
+ Returns:
378
+ Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
379
+ (A warning is logged if the document appears to be a review paper; no flag is returned.)
380
+ """
381
+ is_review_paper = False
382
+ logger.info("Starting processing of the PDF.")
383
+ try:
384
+ time_extract = time.time()
385
+ raw_pdf_elements = self.extract_pdf_elements()
386
+ logger.info(f"PDF elements extracted in {time.time() - time_extract:.2f} seconds.")
388
+
389
+ time_review = time.time()
390
+ for element in raw_pdf_elements:
391
+ words = element.text.split()
392
+ if any(word.lower() == 'review' for word in words):
393
+ logger.warning("This appears to be a review paper rather than a research paper; "
394
+ "this demo analyses only research papers.")
395
+ is_review_paper = True
396
+ break
397
+ logger.info(f"PDF review check completed in {time.time() - time_review:.2f} seconds.")
399
+
400
+ time_categorize = time.time()
401
+ self.categorize_elements(raw_pdf_elements)
402
+ logger.info(f"PDF elements categorized in {time.time() - time_categorize:.2f} seconds.")
404
+
405
+ time_merge = time.time()
406
+ merged_chunks = self.merge_chunks()
407
+ logger.info(f"PDF text chunks merged in {time.time() - time_merge:.2f} seconds.")
409
+ return merged_chunks, self.tables
410
+ except Exception as e:
411
+ raise NotImplementedError(f"Error processing PDF: {e}")
412
+
413
+ def process_pdf_file(self, uploaded_file):
414
+ """
415
+ Process an uploaded PDF file.
416
+
417
+ If a new file is uploaded, the previously stored file is deleted.
418
+ The method updates the file path, processes the PDF, and returns the results.
419
+
420
+ Parameters:
421
+ uploaded_file: The new PDF file uploaded for processing.
422
+
423
+ Returns:
424
+ The results of processing the PDF file.
425
+ """
426
+
427
+ logger.info(f"Starting to process the PDF file: {uploaded_file.filename}")
428
+
429
+ with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
430
+ uploaded_file.save(temp_file.name)
431
+ self.file_path = temp_file.name
432
+ self.folder_path = os.path.dirname(self.file_path)
433
+
434
+ try:
435
+ logger.debug(f"Processing PDF at {self.file_path}")
436
+ results = self.process_pdf()
437
+ title = self.extract_title_from_pdf(self.file_path)
438
+ logger.info("PDF processing completed successfully.")
439
+ return (*results, title)
440
+
441
+ except Exception as e:
442
+ logger.error(f"Error processing PDF file: {e}", exc_info=True)
443
+ raise
444
+ finally:
445
+ try:
446
+ os.remove(self.file_path)
447
+ logger.debug(f"Temporary file {self.file_path} deleted.")
448
+ except Exception as e:
449
+ logger.warning(f"Error deleting temporary file: {e}", exc_info=True)
450
+
451
+ def extract_title_from_pdf(self, uploaded_file):
452
+ """
453
+ Extracts the title from a PDF file's metadata.
454
+
455
+ This function reads the metadata of a PDF file using pypdf and attempts to
456
+ extract the title. If the title is present in the metadata, it is returned.
457
+ Otherwise, a default message indicating that the title was not found is returned.
458
+
459
+ Parameters:
460
+ uploaded_file (file): A file object or a path to the PDF file from which
461
+ to extract the title. The file must be opened in binary mode.
462
+
463
+ Returns:
464
+ str: The title of the PDF file as a string. If no title is found, returns
465
+ 'Title not found'.
466
+ """
467
+ # Initialize PDF reader
468
+ pdf_reader = PdfReader(uploaded_file)
469
+
470
+ # Extract document information
471
+ meta = pdf_reader.metadata
472
+
473
+ # Retrieve title from document information
474
+ title = meta.title if meta and meta.title else 'Title not found'
475
+ return title
476
+
477
+
478
+
479
+
480
+ class HybridRetriever(BaseRetriever):
481
+ """
482
+ A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
483
+ Inherits from BaseRetriever.
484
+
485
+ This class uses two different retrieval methods and merges their results to provide a
486
+ comprehensive set of documents in response to a query. It ensures diversity in the
487
+ retrieved documents by leveraging the strengths of both retrieval methods.
488
+
489
+ Attributes:
490
+ vector_retriever: An instance of a vector-based retriever.
491
+ bm25_retriever: An instance of a BM25 retriever.
492
+
493
+ Methods:
494
+ __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
495
+ _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
496
+ _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
497
+ """
498
+
499
+ def __init__(self, vector_retriever, bm25_retriever):
500
+ super().__init__()
501
+ self.vector_retriever = vector_retriever
502
+ self.bm25_retriever = bm25_retriever
503
+ logger.info("HybridRetriever initialized with vector and BM25 retrievers.")
504
+
505
+ def _retrieve(self, query: str, **kwargs) -> List:
506
+ """
507
+ Retrieves and combines results from both vector and BM25 retrievers.
508
+
509
+ Args:
510
+ query: The query string for document retrieval.
511
+ **kwargs: Additional keyword arguments for retrieval.
512
+
513
+ Returns:
514
+ List: Combined list of unique nodes retrieved from both methods.
515
+ """
516
+ logger.info(f"Retrieving documents for query: {query}")
517
+ try:
518
+ bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
519
+ vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
520
+ combined_nodes = self._combine_results(bm25_nodes, vector_nodes)
521
+
522
+ logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
523
+ return combined_nodes
524
+ except Exception as e:
525
+ logger.error(f"Error in retrieval: {e}")
526
+ raise
527
+
528
+ @staticmethod
529
+ def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
530
+ """
531
+ Combines and de-duplicates results from BM25 and vector retrievers.
532
+
533
+ Args:
534
+ bm25_nodes: Nodes retrieved from BM25 retriever.
535
+ vector_nodes: Nodes retrieved from vector retriever.
536
+
537
+ Returns:
538
+ List: Combined list of unique nodes.
539
+ """
540
+ node_ids: Set = set()
541
+ combined_nodes = []
542
+
543
+ for node in bm25_nodes + vector_nodes:
544
+ if node.node_id not in node_ids:
545
+ combined_nodes.append(node)
546
+ node_ids.add(node.node_id)
547
+
548
+ return combined_nodes
549
+
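+ # Sketch of how this retriever is wired up elsewhere in this module (see
+ # PDFQueryEngine.setup_query_engine below; `index` and `nodes` come from there):
+ # vector_retriever = index.as_retriever(similarity_top_k=5)
+ # bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
+ # hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)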
550
+
551
+ class PDFQueryEngine:
552
+ """
553
+ A class to handle the process of setting up a query engine and performing queries on PDF documents.
554
+
555
+ This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
556
+ indexes, hybrid retrievers, response synthesizers, and executing queries on the set up engine.
557
+
558
+ Attributes:
559
+ documents (List): A list of documents to be indexed.
560
+ llm (Language Model): The language model to be used for embeddings and queries.
561
+ embed_model: The embedding model used for indexing and retrieval.
562
+ qa_prompt_tmpl (str): Template for creating query prompts.
563
+
564
+ Methods:
565
+ setup_query_engine(): Sets up the query engine with all necessary components.
566
+ evaluate_with_llm(): Runs the supplied queries through the engine and scores the results.
567
+ """
568
+
569
+ def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
570
+
571
+ self.documents = documents
572
+ self.llm = llm
573
+ self.embed_model = embed_model
574
+ self.qa_prompt_tmpl = qa_prompt_tmpl
575
+ self.base_utils = base_utils()
576
+
577
+ logger.info("PDFQueryEngine initialized.")
578
+
579
+ def setup_query_engine(self):
580
+ """
581
+ Sets up the query engine by initializing and configuring the embedding model, service context, index,
582
+ hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.
583
+
584
+ The embedding model, service context, index, hybrid retriever (vector + BM25), and
585
+ response synthesizer are all constructed internally from the instance attributes;
586
+ the method takes no arguments.
590
+
591
+ Returns:
592
+ Any: The configured query engine.
593
+ """
594
+
595
+ try:
596
+ logger.info("Initializing the service context for query engine setup.")
597
+ service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
598
+
599
+ logger.info("Creating an index from documents.")
600
+ index = VectorStoreIndex.from_documents(documents=self.documents, service_context=service_context)
601
+ nodes = service_context.node_parser.get_nodes_from_documents(self.documents)
602
+
603
+ logger.info("Setting up vector and BM25 retrievers.")
604
+ vector_retriever = index.as_retriever(similarity_top_k=5)
605
+ bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
606
+ hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)
607
+
608
+ logger.info("Configuring the response synthesizer with the prompt template.")
609
+ qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
610
+ response_synthesizer = get_response_synthesizer(
611
+ service_context=service_context,
612
+ text_qa_template=qa_prompt,
613
+ response_mode="compact",
614
+ )
615
+
616
+ logger.info("Assembling the query engine with reranker and synthesizer.")
617
+ reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")
618
+ query_engine = RetrieverQueryEngine.from_args(
619
+ retriever=hybrid_retriever,
620
+ node_postprocessors=[reranker],
621
+ response_synthesizer=response_synthesizer,
622
+ )
623
+
624
+ logger.info("Query engine setup complete.")
625
+ return query_engine
626
+ except Exception as e:
627
+ logger.error(f"Error during query engine setup: {e}")
628
+ raise
629
+
630
+ def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any,
631
+ queries: List[str]) -> Tuple[int, int, float, List[str], Dict[int, Dict[str, Any]]]:
632
+ """
633
+ Evaluate documents using a language model based on various criteria.
634
+
635
+ Args:
636
+ reg_result (Any): Result related to registration.
637
+ peer_result (Any): Result related to peer review.
638
+ guidelines_result (Any): Result related to following guidelines.
639
+ queries (List[str]): A list of queries to be processed.
640
+
641
+ Returns:
642
+ Tuple[int, int, float, List[str], Dict[int, Dict[str, Any]]]: The total score, the number of
+ criteria met, the score as a percentage, the reasoning per query, and a per-query dict
+ of scores and reasoning.
643
+ """
644
+
645
+ logger.info("Starting evaluation with LLM.")
646
+ query_engine = self.setup_query_engine()
647
+
648
+ total_score = 0
649
+ criteria_met = 0
650
+ reasoning = []
651
+ results = {}
652
+
653
+ for j, query in enumerate(queries):
654
+ # Predefine extracted_data to handle the default case
655
+ extracted_data = None
656
+
657
+ # Handle special cases based on the value of j and other conditions
658
+ if j == 1 and reg_result:
659
+ extracted_data = {"score": 1, "reasoning": reg_result[0]}
660
+ elif j == 2 and guidelines_result:
661
+ extracted_data = {"score": 1,
662
+ "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
663
+ elif j == 8 and (guidelines_result or peer_result):
664
+ extracted_data = {"score": 1, "reasoning": "The article is published in a peer reviewed journal."}
665
+
666
+ # Handle the default case if none of the special conditions were met
667
+ if extracted_data is None:
668
+ result = query_engine.query(query).response
669
+ extracted_data = self.base_utils.extract_score_reasoning(result)
670
+
671
+ if extracted_data['score'] and int(extracted_data["score"]) > 0:
672
+ criteria_met += 1
673
+ total_score += int(extracted_data["score"])
674
+
675
+ reasoning.append(extracted_data["reasoning"])
676
+ results[j] = {
677
+ "reasoning": extracted_data["reasoning"],
678
+ "score": int(extracted_data["score"]) if extracted_data['score'] else 0
679
+ }
680
+
681
+ score_percentage = (float(total_score) / len(queries)) * 100
682
+ logger.info("Evaluation completed.")
683
+ return total_score, criteria_met, score_percentage, reasoning, results
684
+
685
+
686
+ class MixtralLLM(CustomLLM):
687
+ """
688
+ A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.
689
+
690
+ Attributes:
691
+ context_window (int): Number of tokens used for context during inference.
692
+ num_output (int): Number of tokens to generate as output.
693
+ temperature (float): Sampling temperature for token generation.
694
+ model_name (str): Name of the model on Hugging Face's model hub.
695
+ api_key (str): API key for authenticating with the Hugging Face API.
696
+
697
+ Methods:
698
+ metadata: Retrieves metadata about the model.
699
+ do_hf_call: Makes an API call to the Hugging Face model.
700
+ complete: Generates a complete response for a given prompt.
701
+ stream_complete: Streams a series of token completions for a given prompt.
702
+ """
703
+
704
+ def __init__(self, context_window: int, num_output: int, temperature: float, model_name: str, api_key: str):
705
+ """
706
+ Initialize the MixtralLLM class with specific configuration values.
707
+
708
+ Args:
709
+ context_window (int): The number of tokens to consider for context during LLM inference.
710
+ num_output (int): The number of tokens to generate in the output.
711
+ temperature (float): The sampling temperature to use for generating tokens.
712
+ model_name (str): The name of the model to be used from Hugging Face's model hub.
713
+ api_key (str): The API key for authentication with Hugging Face's inference API.
714
+ """
715
+ super().__init__()
716
+ self.context_window = context_window
717
+ self.num_output = num_output
718
+ self.temperature = temperature
719
+ self.model_name = model_name
720
+ self.api_key = api_key
721
+
722
+ @property
723
+ def metadata(self) -> LLMMetadata:
724
+ """
725
+ Retrieves metadata for the Mixtral LLM.
726
+
727
+ Returns:
728
+ LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
729
+ """
730
+ return LLMMetadata(
731
+ context_window=self.context_window,
732
+ num_output=self.num_output,
733
+ model_name=self.model_name,
734
+ )
735
+
736
+ def do_hf_call(self, prompt: str) -> str:
737
+ """
738
+ Makes an API call to the Hugging Face model and retrieves the generated response.
739
+
740
+ Args:
741
+ prompt (str): The input prompt for the model.
742
+
743
+ Returns:
744
+ str: The text generated by the model in response to the prompt.
745
+
746
+ Raises:
747
+ Exception: If the API call fails or returns an error.
748
+ """
749
+ data = {
750
+ "inputs": prompt,
751
+ "parameters": {"Temperature": self.temperature}
752
+ }
753
+
754
+ # Makes a POST request to the Hugging Face API to get the model's response
755
+ response = requests.post(
756
+ f'https://api-inference.huggingface.co/models/{self.model_name}',
757
+ headers={
758
+ 'authorization': f'Bearer {self.api_key}',
759
+ 'content-type': 'application/json',
760
+ },
761
+ json=data,
762
+ stream=True
763
+ )
764
+
765
+ # Checks for a successful response and parses the generated text
766
+ if response.status_code != 200 or not response.json() or 'error' in response.json():
767
+ print(f"Error: {response}")
768
+ return "Unable to answer for technical reasons."
769
+ full_txt = response.json()[0]['generated_text']
770
+ # Finds the section of the text following the context separator
771
+ offset = full_txt.find("---------------------")
772
+ ss = full_txt[offset:]
773
+ # Extracts the actual answer from the response
774
+ offset = ss.find("Answer:")
775
+ return ss[offset + 7:].strip()
776
+
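+ # Illustrative example of the parsing above: if the API response ends with
+ # "...---------------------\nAnswer: Yes, the data are shared."
+ # then do_hf_call returns "Yes, the data are shared."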
777
+ @llm_completion_callback()
778
+ def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
779
+ """
780
+ Generates a complete response for a given prompt using the Hugging Face API.
781
+
782
+ Args:
783
+ prompt (str): The input prompt for the model.
784
+ **kwargs: Additional keyword arguments for the completion.
785
+
786
+ Returns:
787
+ CompletionResponse: The complete response from the model.
788
+ """
789
+ response = self.do_hf_call(prompt)
790
+ return CompletionResponse(text=response)
791
+
792
+ @llm_completion_callback()
793
+ def stream_complete(
794
+ self, prompt: str, **kwargs: Any
795
+ ) -> CompletionResponseGen:
796
+ """
797
+ Streams a series of token completions as a response for the given prompt.
798
+
799
+ This method is useful for streaming responses where each token is generated sequentially.
800
+
801
+ Args:
802
+ prompt (str): The input prompt for the model.
803
+ **kwargs: Additional keyword arguments for the streaming completion.
804
+
805
+ Yields:
806
+ CompletionResponseGen: A generator yielding each token in the completion response.
807
+ """
808
+ # Yields a stream of tokens as the completion response for the given prompt
809
+ response = ""
810
+ for token in self.do_hf_call(prompt):
811
+ response += token
812
+ yield CompletionResponse(text=response, delta=token)
813
+
814
+
815
+ class KeywordSearch():
816
+ def __init__(self, chunks):
817
+ self.chunks = chunks
818
+
819
+ def find_journal_name(self, response: str, journal_list: list) -> bool:
820
+ """
821
+ Searches for a journal name in a given response string.
822
+
823
+ This function iterates through a list of known journal names and checks whether any of
824
+ them appears in the response string (case-insensitively). It returns True on the first
825
+ match and False if none of the journal names is found.
827
+
828
+ Args:
829
+ response (str): The response string to search for a journal name.
830
+ journal_list (list): A list of journal names to search within the response.
831
+
832
+ Returns:
833
+ bool: True if any journal name from the list occurs in the response, False otherwise.
834
+ """
835
+ response_lower = response.lower()
836
+ for journal in journal_list:
837
+ journal_lower = journal.lower()
838
+
839
+ if journal_lower in response_lower:
840
+ return True
841
+
842
+ return False
843
+
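+ # Illustrative example (case-insensitive substring match):
+ # find_journal_name("Published in The Lancet, 2021.", ["The Lancet", "BMJ"]) -> True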
844
+ def check_registration(self):
845
+ """
846
+ Check chunks of text for various registration numbers or URLs of registries.
847
+ Returns the sentence containing a registration number, or if not found,
848
+ returns chunks containing registry URLs.
849
+
850
+ The chunks supplied at construction time (self.chunks) are searched; no arguments are required.
852
+
853
+ Returns:
854
+ list of str: List of matching sentences or chunks, or an empty list if no matches are found.
855
+ """
856
+
857
+ # Patterns for different registration types
858
+ patterns = {
859
+ "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
860
+ "ISRCTN": r"(ISRCTN\d{8})",
861
+ "EudraCT": r"(\d{4}-\d{6}-\d{2})",
862
+ "UMIN-CTR": r"(UMIN\d{9})",
863
+ "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
864
+ }
865
+
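+ # Illustrative example: the NCT pattern above matches a sentence such as
+ # "The trial was registered (NCT 01234567).", so that sentence is returned.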
866
+ # Registry URLs
867
+ registry_urls = [
868
+ "www.anzctr.org.au",
869
+ "anzctr.org.au",
870
+ "www.clinicaltrials.gov",
871
+ "clinicaltrials.gov",
872
+ "www.ISRCTN.org",
873
+ "ISRCTN.org",
874
+ "www.umin.ac.jp/ctr/index/htm",
875
+ "umin.ac.jp/ctr/index/htm",
876
+ "www.onderzoekmetmensen.nl/en",
877
+ "onderzoekmetmensen.nl/en",
878
+ "eudract.ema.europa.eu",
879
+ "www.eudract.ema.europa.eu"
880
+ ]
881
+
882
+ # Check each chunk for registration numbers
883
+ for chunk in self.chunks:
884
+ # Split chunk into sentences
885
+ sentences = re.split(r'(?<=[.!?]) +', chunk)
886
+
887
+ # Check each sentence for any registration number
888
+ for sentence in sentences:
889
+ for pattern in patterns.values():
890
+ if re.search(pattern, sentence):
891
+ return [sentence] # Return immediately if a registration number is found
892
+
893
+ # If no registration number found, check for URLs in chunks
894
+ matching_chunks = []
895
+ for chunk in self.chunks:
896
+ if any(url in chunk for url in registry_urls):
897
+ matching_chunks.append(chunk)
898
+
899
+ return matching_chunks
900
+
901
+
902
+ class StringExtraction():
903
+ """
904
+ A class to handle the process of extracting answer strings from complete LLM responses.
905
+
906
+ This class encapsulates the functionality of extracting the original ground truth from a labelled-data
907
+ CSV and answer strings from LLM responses. Note that LLMs may format their answers differently depending
908
+ on the model and prompting technique; in such cases extract_original_prompt may not give satisfactory
+ results, and the best option is to write your own extraction method.
909
+
910
+
911
+ Methods:
912
+ extract_original_prompt():
913
+ extraction_ground_truth():
914
+ """
915
+
916
+ def extract_original_prompt(self, result):
917
+ r1 = result.response.strip().split("\n")
918
+ binary_response = ""
919
+ explanation_response = ""
920
+ for r in r1:
921
+ if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
922
+ binary_response = r
923
+ elif r.find("Reasoning:") >= 0:
924
+ cut = r.find(":")
925
+ explanation_response += r[cut + 1:].strip()
926
+
927
+ return binary_response, explanation_response
928
+
929
+ def extraction_ground_truth(self, paper_name, labelled_data):
930
+ id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
931
+ id_row = labelled_data[labelled_data["id"] == id]
932
+ ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
933
+ binary_ground_truth = []
934
+ explanation_ground_truth = []
935
+ for g in ground_truth:
936
+ if len(g) > 0:
937
+ binary_ground_truth.append("Yes")
938
+ explanation_ground_truth.append(g)
939
+ else:
940
+ binary_ground_truth.append("No")
941
+ explanation_ground_truth.append("The article does not provide any relevant information.")
942
+ return binary_ground_truth, explanation_ground_truth
943
+
944
+
945
+ class EvaluationMetrics():
946
+ """
947
+
948
+ This class encapsulates the evaluation methods that have been used in the project.
949
+
950
+ Attributes:
951
+ explanation_response: a list of detailed responses from the LLM, one per query
952
+ explanation_ground_truth: the list of ground-truth answers, one per query
953
+
954
+ Methods:
955
+ metric_cosine_similarity(): Computes the cosine similarity between each response and its ground truth.
956
+ metric_rouge(): Computes ROUGE scores of the responses against the ground truth.
957
+ binary_accuracy(): Computes the fraction of binary (Yes/No) answers that match the ground truth.
958
+ """
959
+
960
+ def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
961
+ self.explanation_response = explanation_response
962
+ self.explanation_ground_truth = explanation_ground_truth
963
+ self.embedding_model = embedding_model
964
+
965
+ def metric_cosine_similarity(self):
966
+ ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
967
+ explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
968
+ return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))
969
+
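+ # np.diag keeps only the (i, i) entries of the similarity matrix, i.e. the
+ # similarity of each response to its own ground truth rather than to all pairs.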
970
+ def metric_rouge(self):
971
+ rouge = evaluate.load("rouge")
972
+ results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
973
+ return results
974
+
975
+ def binary_accuracy(self, binary_response, binary_ground_truth):
976
+ count = 0
977
+ if len(binary_response) != len(binary_ground_truth):
978
+ return "Arrays which are to be compared has different lengths."
979
+ else:
980
+ for i in range(len(binary_response)):
981
+ if binary_response[i] == binary_ground_truth[i]:
982
+ count += 1
983
+ return np.round(count / len(binary_response), 2)
librarymed/kromin/__init__.py ADDED
File without changes
librarymed/kromin/app_librarymed.py ADDED
@@ -0,0 +1,169 @@
1
+ import logging
2
+ import os
3
+
4
+ import openai
5
+ from flask import Flask, flash, request, redirect, jsonify
6
+ from llama_index import Document
7
+ from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
8
+ from llama_index.llms import OpenAI
9
+
10
+ from kromin.RAG_utils import ConfigManager
11
+ from kromin.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
+
16
+ app = Flask(__name__)
17
+
18
+ app.config['SECRET_KEY'] = 'librarymed super secret key'
19
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
20
+ config_manager = ConfigManager()
21
+ config_manager.load_config("model", "Config/model_config.json")
22
+ app.config['user_config'] = config_manager
23
+
24
+
25
+ def allowed_file(filename, allowed_extensions):
26
+ """ Helper function to check if the file extension is allowed """
27
+ return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
28
+
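+ # Illustrative example: with allowed_extensions set to "pdf" in model_config.json,
+ # allowed_file("paper.pdf", "pdf") -> True and allowed_file("paper.docx", "pdf") -> False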
29
+
30
+ @app.route('/', methods=['GET'])
31
+ def __get__():
32
+ score = 0
33
+ criteria_met = 0
34
+ title = ""
35
+ author_info = ""
36
+ reasoning = ""
37
+
38
+ return jsonify({
39
+ 'title': title,
40
+ 'author': author_info,
41
+ 'score': score,
42
+ 'num_criteria_met': criteria_met,
43
+ 'reasoning': reasoning
44
+ })
45
+
46
+
47
+ @app.route('/upload', methods=['POST'])
48
+ def __post__():
49
+
50
+ config = app.config['user_config']
51
+ openai.api_key = os.getenv('OPENAI_API_KEY')
52
+ hf_token = os.getenv('HF_TOKEN')
53
+ embed = config.get_config_value("model", "embeddings")
54
+ embed_model_name = config.get_config_value("model", "embeddings_model")
55
+ llm_model = config.get_config_value("model", "llm_model")
56
+ model_temperature = config.get_config_value("model", "model_temp")
57
+ output_token_size = config.get_config_value("model", "max_tokens")
58
+ model_context_window = config.get_config_value("model", "context_window")
59
+ gpt_prompt_path = config.get_config_value("model", "GPT_PROMPT_PATH")
60
+ mistral_prompt_path = config.get_config_value("model", "MISTRAL_PROMPT_PATH")
61
+ info_prompt_path = config.get_config_value("model", "INFO_PROMPT_PATH")
62
+ peer_review_journals_path = config.get_config_value("model", "peer_review_journals_path")
63
+ eq_network_journals_path = config.get_config_value("model", "eq_network_journals_path")
64
+ queries = config.get_config_value("model", "queries")
65
+ num_criteria = len(config.get_config_value("model", "criteria"))
66
+ author_query = config.get_config_value("model", "author_query")
67
+ journal_query = config.get_config_value("model", "journal_query")
68
+
69
+ prompt_path = gpt_prompt_path if gpt_prompt_path else mistral_prompt_path
70
+
71
+ utils = base_utils()
72
+
73
+ # Check if the post request has the file part
74
+ if 'file' not in request.files:
75
+ flash('No file part')
76
+ return jsonify({'error': 'No file part given in the request'}), 400
77
+ file = request.files['file']
78
+ # If user does not select file, browser also submits an empty part without filename
79
+ if file.filename == '':
80
+ flash('No selected file')
81
+ return jsonify({'error': 'Empty filename given'}), 400
82
+ if file and allowed_file(file.filename, config.get_config_value("model", "allowed_extensions")):
83
+ try:
84
+ # Process the PDF file
85
+ pdf_processor = PDFProcessor_Unstructured(config.get_config_value("model", "pdf_processing"))
86
+ merged_chunks, tables, title = pdf_processor.process_pdf_file(file)
87
+ documents = [Document(text=t) for t in merged_chunks]
88
+
89
+ # LLM Model choice
90
+ if 'gpt' in llm_model.lower(): # TODO tested "gpt-4" and "gpt-3.5-turbo":
91
+ llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)
92
+ prompt_template = utils.read_from_file(gpt_prompt_path)
93
+
94
+ elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
95
+ if any(param is None for param in
96
+ [model_context_window, output_token_size, model_temperature, hf_token]):
97
+ raise ValueError("All parameters are required for Mistral LLM.")
98
+
99
+ llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
100
+ temperature=model_temperature, model_name=llm_model, api_key=hf_token)
101
+ prompt_template = utils.read_from_file(mistral_prompt_path)
102
+
103
+ else:
104
+ raise NotImplementedError(f"Error initializing language model '{llm_model}'")
105
+
106
+ # Embedding model choice for RAG
107
+ try:
108
+ if embed == "openai":
109
+ embed_model = OpenAIEmbedding()
110
+
111
+ elif embed == "huggingface":
112
+ if embed_model_name is None:
113
+ # Fall back to the default model if no name is provided
114
+ embed_model_name = "BAAI/bge-small-en-v1.5"
115
+ embed_model = HuggingFaceEmbedding(embed_model_name)
119
+ else:
120
+ raise ValueError(f"Unsupported embedding model: {embed}")
121
+
122
+ except Exception as e:
123
+ raise NotImplementedError(f"Error initializing embedding model: {e}")
124
+
125
+ # Prompts and queries (prompt_template was already selected per model above)
126
+ info_prompt = utils.read_from_file(info_prompt_path)
128
+
129
+ peer_review_journals = utils.read_from_file(peer_review_journals_path)
130
+ eq_network_journals = utils.read_from_file(eq_network_journals_path)
131
+
132
+ peer_review_journals_list = peer_review_journals.split('\n')
133
+ eq_network_journals_list = eq_network_journals.split('\n')
134
+
135
+ modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
136
+ peer_review_journals_list) + "?"
137
+
138
+ pdf_info_query = PDFQueryEngine(documents, llm, embed_model, (info_prompt))
139
+ info_query_engine = pdf_info_query.setup_query_engine()
140
+ journal_result = info_query_engine.query(modified_journal_query).response
141
+ author_info = info_query_engine.query(author_query).response
142
+
143
+ pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, (prompt_template))
144
+
145
+ # Check for prior registration
146
+ nlp_methods = KeywordSearch(merged_chunks)
147
+ eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
148
+ peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
149
+ registration_result = nlp_methods.check_registration()
150
+
151
+ # Evaluate with OpenAI model
152
+ total_score, criteria_met, score_percentage, reasoning, results = pdf_criteria_query.evaluate_with_llm(
153
+ registration_result, peer_journal_result, eq_journal_result, queries)
154
+ score = f"{round((total_score / num_criteria) * 100)}/100"
155
+
156
+ except Exception as e:
157
+ logging.exception("An error occurred while processing the file.")
158
+ # Consider adding a user-friendly message or redirect
159
+ flash('An error occurred while processing the file.')
160
+ return jsonify({'error': str(e)}), 500
161
+
162
+ return jsonify({
163
+ 'title': title,
164
+ 'author': author_info,
165
+ 'score': score,
166
+ 'num_criteria_met': criteria_met,
167
+ 'reasoning': reasoning,
168
+ 'results': results
169
+ })
librarymed/local/RAG_utils.py ADDED
@@ -0,0 +1,979 @@
1
+ """Utility functions for working with the RAG model"""
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ import time
8
+ from tempfile import NamedTemporaryFile
9
+ from typing import Any, List, Tuple, Set, Dict, Optional, Union
10
+
11
+ import evaluate
12
+ import numpy as np
13
+ import pandas as pd
14
+ import requests
15
+ from llama_index import PromptTemplate
16
+ from llama_index import VectorStoreIndex, ServiceContext
17
+ from llama_index import get_response_synthesizer
18
+ from llama_index.llms import (
19
+ CustomLLM,
20
+ CompletionResponse,
21
+ CompletionResponseGen,
22
+ LLMMetadata,
23
+ )
24
+ from llama_index.llms.base import llm_completion_callback
25
+ from llama_index.postprocessor import SentenceTransformerRerank
26
+ from llama_index.query_engine import RetrieverQueryEngine
27
+ from llama_index.retrievers import BaseRetriever, BM25Retriever
28
+ from sklearn.metrics.pairwise import cosine_similarity
29
+ from unstructured.partition.pdf import partition_pdf
30
+ from pypdf import PdfReader
31
+
32
+
33
+ # Configure basic logging
34
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
35
+
36
+ # Create a logger object
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ class ConfigManager:
41
+ """
42
+ A class to manage loading and accessing configuration settings.
43
+
44
+ Attributes:
45
+ configs (dict): Dictionary mapping configuration names to their settings.
46
+
47
+ Methods:
48
+ load_config(config_path: str): Loads the configuration from a given JSON file.
49
+ get_config_value(key: str): Retrieves a specific configuration value.
50
+ """
51
+
52
+ def __init__(self):
53
+ self.configs = {}
54
+
55
+ def load_config(self, config_name: str, config_path: str) -> None:
56
+ """
57
+ Loads configuration settings from a specified JSON file into a named configuration.
58
+
59
+ Args:
60
+ config_name (str): The name to assign to this set of configurations.
61
+ config_path (str): The path to the configuration file.
62
+
63
+ Raises:
64
+ FileNotFoundError: If the config file is not found.
65
+ json.JSONDecodeError: If there is an error parsing the config file.
66
+ """
67
+ try:
68
+ with open(config_path, 'r') as f:
69
+ self.configs[config_name] = json.load(f)
70
+ except FileNotFoundError:
71
+ logging.error(f"Config file not found at {config_path}")
72
+ raise
73
+ except json.JSONDecodeError as e:
74
+ logging.error(f"Error decoding config file: {e}")
75
+ raise
76
+
77
+ def get_config_value(self, config_name: str, key: str) -> str:
78
+ """
79
+ Retrieves a specific configuration value.
80
+
81
+ Args:
82
+ config_name (str): The name of the loaded configuration set.
+ key (str): The key for the configuration setting.
83
+
84
+ Returns:
85
+ str: The value of the configuration setting.
86
+
87
+ Raises:
88
+ ValueError: If the key is not found or is set to a placeholder value.
89
+ """
90
+ value = self.configs.get(config_name, {}).get(key)
91
+ if value is None or value == "ENTER_YOUR_TOKEN_HERE":
92
+ raise ValueError(f"Please set your '{key}' in the config.json file.")
93
+ return value
94
+
95
+
96
+ class base_utils:
97
+ """
98
+ A utility class providing miscellaneous static methods for processing and analyzing text data,
99
+ particularly from PDF documents and filenames. This class also includes methods for file operations.
100
+
101
+ This class encapsulates the functionality of extracting key information from text, such as scores,
102
+ reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
103
+ and reading content from files.
104
+
105
+ Attributes:
106
+ None (This class contains only static methods and does not maintain any state)
107
+
108
+ Methods:
109
+ extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
110
+ Extracts a score and reasoning from a given text using regular expressions.
111
+
112
+ extract_id_from_filename(filename: str) -> Optional[int]:
113
+ Extracts an ID from a given filename based on a specified pattern.
114
+
115
+ find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
116
+ Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.
117
+
118
+ read_from_file(file_path: str) -> str:
119
+ Reads the content of a file and returns it as a string.
120
+ """
121
+
122
+ @staticmethod
123
+ def read_from_file(file_path: str) -> str:
124
+ """
125
+ Reads the content of a file and returns it as a string.
126
+
127
+ Args:
128
+ file_path (str): The path to the file to be read.
129
+
130
+ Returns:
131
+ str: The content of the file.
132
+ """
133
+ with open(file_path, 'r') as prompt_file:
134
+ prompt = prompt_file.read()
135
+ return prompt
136
+
137
+ @staticmethod
138
+ def extract_id_from_filename(filename: str) -> Optional[int]:
139
+ """
140
+ Extracts an ID from a filename, assuming a specific format ('Id_{I}.pdf', where {I} is the ID).
141
+
142
+ Args:
143
+ filename (str): The filename from which to extract the ID.
144
+
145
+ Returns:
146
+ int: The extracted ID as an integer, or None if the pattern is not found.
147
+ """
148
+ # Assuming the file name is in the format 'Id_{I}.pdf', where {I} is the ID
149
+ match = re.search(r'Id_(\d+)\.pdf', filename)
150
+ if match:
151
+ return int(match.group(1)) # Convert to integer if ID is numeric
152
+ else:
153
+ return None
154
+
155
+ @staticmethod
156
+ def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
157
+ """
158
+ Extracts score and reasoning from a given text using regular expressions.
159
+
160
+ Args:
161
+ text (str): The text from which to extract the score and reasoning.
162
+
163
+ Returns:
164
+ dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
165
+ """
166
+ # Define regular expression patterns for score and reasoning
167
+ score_pattern = r"Score: (\d+)"
168
+ reasoning_pattern = r"Reasoning: (.+)"
169
+
170
+ # Extract data using regular expressions
171
+ score_match = re.search(score_pattern, text)
172
+ reasoning_match = re.search(reasoning_pattern, text, re.DOTALL) # re.DOTALL allows '.' to match newlines
173
+
174
+ # Extract and return the results
175
+ extracted_data = {
176
+ "score": score_match.group(1) if score_match else None,
177
+ "reasoning": reasoning_match.group(1).strip() if reasoning_match else None
178
+ }
179
+
180
+ return extracted_data
181
+
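
A quick illustration of the two patterns (the sample response text is invented; `re.DOTALL` lets the reasoning span several lines):

```python
# Invented sample of an LLM response in the expected "Score/Reasoning" format.
sample = ("Score: 1\n"
          "Reasoning: The article links a public code repository\n"
          "and a dataset DOI in its supplementary materials.")
print(base_utils.extract_score_reasoning(sample))
# {'score': '1', 'reasoning': 'The article links a public code repository\nand a dataset DOI in its supplementary materials.'}
```
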
182
+ @staticmethod
183
+ def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
184
+ """
185
+ Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.
186
+
187
+ Args:
188
+ pdf_filename (str): The filename of the PDF.
189
+ dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.
190
+
191
+ Returns:
192
+ pandas.Series or str: The matched row from the dataframe or a message indicating
193
+ that no matching row or invalid filename was found.
194
+ """
195
+ pdf_id = base_utils.extract_id_from_filename(pdf_filename)
196
+ if pdf_id is not None:
197
+ # Assuming the first column contains the ID
198
+ matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
199
+ if not matched_row.empty:
200
+ return matched_row
201
+ else:
202
+ return "No matching row found."
203
+ else:
204
+ return "Invalid file name."
205
+
206
+
207
+ class PDFProcessor_Unstructured:
208
+ """
209
+ A class to process PDF files, providing functionalities for extracting, categorizing,
210
+ and merging elements from a PDF file.
211
+
212
+ This class is designed to handle unstructured PDF documents, particularly useful for
213
+ tasks involving text extraction, categorization, and data processing within PDFs.
214
+
215
+ Attributes:
216
+ file_path (str): The full path to the PDF file.
217
+ folder_path (str): The directory path where the PDF file is located.
218
+ file_name (str): The name of the PDF file.
219
+ texts (List[str]): A list to store extracted text chunks.
220
+ tables (List[str]): A list to store extracted tables.
221
+
222
+
223
+ Methods:
224
+ extract_pdf_elements() -> List:
225
+ Extracts images, tables, and text chunks from a PDF file.
226
+
227
+ categorize_elements(raw_pdf_elements: List) -> None:
228
+ Categorizes extracted elements from a PDF into tables and texts.
229
+
230
+ merge_chunks() -> List[str]:
231
+ Merges text chunks based on punctuation and character case criteria.
232
+
233
+ should_skip_chunk(chunk: str) -> bool:
234
+ Determines if a chunk should be skipped based on its content.
235
+
236
+ should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
237
+ Determines if the current chunk should be merged with the next one.
238
+
239
+ process_pdf() -> Tuple[List[str], List[str]]:
240
+ Processes the PDF by extracting, categorizing, and merging elements.
241
+
242
+ process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
243
+ Processes an uploaded PDF file to extract and categorize text and tables.
244
+ """
245
+
246
+ def __init__(self, config: Dict[str, any]):
247
+ self.file_path = None
248
+ self.folder_path = None
249
+ self.file_name = None
250
+ self.texts = []
251
+ self.tables = []
252
+ self.config = config if config is not None else self.default_config()
253
+ logger.info("Initialized PDFProcessor_Unstructured; file path is set when a PDF is uploaded.")
254
+
255
+ @staticmethod
256
+ def default_config() -> Dict[str, any]:
257
+ """
258
+ Returns the default configuration for PDF processing.
259
+
260
+ Returns:
261
+ Dict[str, any]: Default configuration options.
262
+ """
263
+ return {
264
+ "extract_images": False,
265
+ "infer_table_structure": True,
266
+ "chunking_strategy": "by_title",
267
+ "max_characters": 10000,
268
+ "combine_text_under_n_chars": 100,
269
+ "strategy": "auto",
270
+ "model_name": "yolox"
271
+ }
272
+
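
A small construction sketch: in this repo the processor is built from the `pdf_processing` section of `Config/model_config.json` (as `app_local.py` does below); the literal here just restates those values:

```python
# Mirrors config.get_config_value("model", "pdf_processing") from app_local.py.
pdf_config = {
    "extract_images": False,
    "infer_table_structure": True,
    "chunking_strategy": "by_title",
    "max_characters": 10000,
    "combine_text_under_n_chars": 100,
    "strategy": "fast",
    "model_name": "yolox",
}
processor = PDFProcessor_Unstructured(pdf_config)
# Passing config=None falls back to default_config() instead.
```
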
273
+ def extract_pdf_elements(self) -> List:
274
+ """
275
+ Extracts images, tables, and text chunks from a PDF file.
276
+
277
+ Returns:
278
+ List: A list of extracted elements from the PDF.
279
+ """
280
+ logger.info("Starting extraction of PDF elements.")
281
+ try:
282
+ # Read the options from self.config so the values in model_config.json
+ # (or default_config) actually take effect instead of being hard-coded.
+ extracted_elements = partition_pdf(
+ filename=self.file_path,
+ extract_images_in_pdf=self.config["extract_images"],
+ infer_table_structure=self.config["infer_table_structure"],
+ chunking_strategy=self.config["chunking_strategy"],
+ max_characters=self.config["max_characters"],
+ combine_text_under_n_chars=self.config["combine_text_under_n_chars"],
+ image_output_dir_path=self.folder_path,
+ )
292
+ logger.info("Extraction of PDF elements completed successfully.")
293
+ return extracted_elements
294
+ except Exception as e:
+ # Re-raise as RuntimeError: this is a processing failure, not a missing feature.
+ raise RuntimeError(f"Error extracting PDF elements: {e}")
296
+
297
+ def categorize_elements(self, raw_pdf_elements: List) -> None:
298
+ """
299
+ Categorizes extracted elements from a PDF into tables and texts.
300
+
301
+ Args:
302
+ raw_pdf_elements (List): A list of elements extracted from the PDF.
303
+ """
304
+ logger.debug("Starting categorization of PDF elements.")
305
+ for element in raw_pdf_elements:
306
+ element_type = str(type(element))
307
+ if "unstructured.documents.elements.Table" in element_type:
308
+ self.tables.append(str(element))
309
+ elif "unstructured.documents.elements.CompositeElement" in element_type:
310
+ self.texts.append(str(element))
311
+
312
+ logger.debug("Categorization of PDF elements completed.")
313
+
314
+ def merge_chunks(self) -> List[str]:
315
+ """
316
+ Merges text chunks based on punctuation and character case criteria.
317
+
318
+ Returns:
319
+ List[str]: A list of merged text chunks.
320
+ """
321
+ logger.debug("Starting merging of text chunks.")
322
+
323
+ merged_chunks = []
324
+ skip_next = False
325
+
326
+ for i, current_chunk in enumerate(self.texts[:-1]):
+ # If this chunk was already merged into the previous one, skip it
+ # so it is not emitted twice.
+ if skip_next:
+ skip_next = False
+ continue
+
+ next_chunk = self.texts[i + 1]
+
+ if self.should_skip_chunk(current_chunk):
+ continue
+
+ if self.should_merge_with_next(current_chunk, next_chunk):
+ merged_chunks.append(current_chunk + " " + next_chunk)
+ skip_next = True
+ else:
+ merged_chunks.append(current_chunk)
337
+
338
+ if not skip_next:
339
+ merged_chunks.append(self.texts[-1])
340
+
341
+ logger.debug("Merging of text chunks completed.")
342
+
343
+ return merged_chunks
344
+
345
+ @staticmethod
346
+ def should_skip_chunk(chunk: str) -> bool:
347
+ """
348
+ Determines if a chunk should be skipped based on its content.
349
+
350
+ Args:
351
+ chunk (str): The text chunk to be evaluated.
352
+
353
+ Returns:
354
+ bool: True if the chunk should be skipped, False otherwise.
355
+ """
356
+ return (not chunk or
+ chunk.lower().startswith(("figure", "fig", "table")) or
+ not chunk[0].isalnum() or
+ re.match(r'^\d+\.', chunk) is not None)
359
+
360
+ @staticmethod
361
+ def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
362
+ """
363
+ Determines if the current chunk should be merged with the next one.
364
+
365
+ Args:
366
+ current_chunk (str): The current text chunk.
367
+ next_chunk (str): The next text chunk.
368
+
369
+ Returns:
370
+ bool: True if the chunks should be merged, False otherwise.
371
+ """
372
+ return bool(current_chunk and next_chunk) and (current_chunk.endswith(",") or
+ (current_chunk[-1].islower() and next_chunk[0].islower()))
374
+
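
To make the two heuristics concrete, here is a toy run (all strings invented): a trailing comma, or a lowercase character on both sides of a chunk boundary, signals a sentence split across chunks, while caption-like chunks are dropped:

```python
texts = [
    "The cohort was recruited between 2015 and 2018,",  # trailing comma -> merge
    "with follow-up visits every six months.",
    "Figure 2: Flow diagram of inclusion.",             # caption residue -> skipped
    "Analyses were performed in R 4.1.",
]
# should_merge_with_next(texts[0], texts[1]) -> True  (ends with ",")
# should_skip_chunk(texts[2])                -> True  (starts with "figure")
# should_skip_chunk(texts[3])                -> False
```
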
375
+ def process_pdf(self) -> Tuple[List[str], List[str]]:
376
+ """
377
+ Processes the PDF by extracting, categorizing, and merging elements.
378
+
379
+ Returns:
380
+ Tuple[List[str], List[str]]: A tuple of merged text chunks and tables. If the
+ document looks like a review paper, a warning is logged but processing continues.
382
+ """
383
+ is_review_paper = False
384
+ logger.info("Starting processing of the PDF.")
385
+ try:
386
+ time_extract = time.time()
387
+ raw_pdf_elements = self.extract_pdf_elements()
388
+ logger.info(
389
+ f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements extracted in {time.time() - time_extract:.2f} seconds.")
390
+
391
+ time_review = time.time()
392
+ for element in raw_pdf_elements:
393
+ text = element.text.split()
394
+ for word in text:
395
+ if word.lower() == 'review':
396
+ logger.warning("This seems to be a review paper rather than a research paper; "
+ "this demo analyses only research papers.")
398
+ is_review_paper = True
399
+ logger.info(
400
+ f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF review check completed in {time.time() - time_review:.2f} seconds.")
401
+
402
+ time_categorize = time.time()
403
+ self.categorize_elements(raw_pdf_elements)
404
+ logger.info(
405
+ f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements categorized in {time.time() - time_categorize:.2f} seconds.")
406
+
407
+ time_merge = time.time()
408
+ merged_chunks = self.merge_chunks()
409
+ logger.info(
410
+ f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF text chunks merged in {time.time() - time_merge:.2f} seconds.")
411
+ return merged_chunks, self.tables
412
+ except Exception as e:
+ raise RuntimeError(f"Error processing PDF: {e}")
414
+
415
+ def process_pdf_file(self, uploaded_file):
416
+ """
417
+ Process an uploaded PDF file.
418
+
419
+ If a new file is uploaded, the previously stored file is deleted.
420
+ The method updates the file path, processes the PDF, and returns the results.
421
+
422
+ Parameters:
423
+ uploaded_file: The new PDF file uploaded for processing.
424
+
425
+ Returns:
426
+ The results of processing the PDF file.
427
+ """
428
+
429
+ logger.info(f"Starting to process the PDF file: {uploaded_file.filename}")
430
+
431
+ with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
432
+ uploaded_file.save(temp_file.name)
433
+ self.file_path = temp_file.name
434
+ self.folder_path = os.path.dirname(self.file_path)
435
+
436
+ try:
437
+ logger.debug(f"Processing PDF at {self.file_path}")
438
+ results = self.process_pdf()
439
+ title = self.extract_title_from_pdf(self.file_path)
440
+ logger.info("PDF processing completed successfully.")
441
+ return (*results, title)
442
+
443
+ except Exception as e:
444
+ logger.error(f"Error processing PDF file: {e}", exc_info=True)
445
+ raise
446
+ finally:
447
+ try:
448
+ os.remove(self.file_path)
449
+ logger.debug(f"Temporary file {self.file_path} deleted.")
450
+ except Exception as e:
451
+ logger.warning(f"Error deleting temporary file: {e}", exc_info=True)
452
+
453
+ def extract_title_from_pdf(self, uploaded_file):
454
+ """
455
+ Extracts the title from a PDF file's metadata.
456
+
457
+ This function reads the metadata of a PDF file using PyPDF2 and attempts to
458
+ extract the title. If the title is present in the metadata, it is returned.
459
+ Otherwise, a default message indicating that the title was not found is returned.
460
+
461
+ Parameters:
462
+ uploaded_file (file): A file object or a path to the PDF file from which
463
+ to extract the title. The file must be opened in binary mode.
464
+
465
+ Returns:
466
+ str: The title of the PDF file as a string. If no title is found, returns
467
+ 'Title not found'.
468
+ """
469
+ # Initialize PDF reader
470
+ pdf_reader = PdfReader(uploaded_file)
471
+
472
+ # Extract document information
473
+ meta = pdf_reader.metadata
474
+
475
+ # Retrieve title from document information
476
+ title = meta.title if meta and meta.title else 'Title not found'
477
+ return title
478
+
479
+
480
+
481
+
482
+ class HybridRetriever(BaseRetriever):
483
+ """
484
+ A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
485
+ Inherits from BaseRetriever.
486
+
487
+ This class uses two different retrieval methods and merges their results to provide a
488
+ comprehensive set of documents in response to a query. It ensures diversity in the
489
+ retrieved documents by leveraging the strengths of both retrieval methods.
490
+
491
+ Attributes:
492
+ vector_retriever: An instance of a vector-based retriever.
493
+ bm25_retriever: An instance of a BM25 retriever.
494
+
495
+ Methods:
496
+ __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
497
+ _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
498
+ _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
499
+ """
500
+
501
+ def __init__(self, vector_retriever, bm25_retriever):
502
+ super().__init__()
503
+ self.vector_retriever = vector_retriever
504
+ self.bm25_retriever = bm25_retriever
505
+ logger.info("HybridRetriever initialized with vector and BM25 retrievers.")
506
+
507
+ def _retrieve(self, query: str, **kwargs) -> List:
508
+ """
509
+ Retrieves and combines results from both vector and BM25 retrievers.
510
+
511
+ Args:
512
+ query: The query string for document retrieval.
513
+ **kwargs: Additional keyword arguments for retrieval.
514
+
515
+ Returns:
516
+ List: Combined list of unique nodes retrieved from both methods.
517
+ """
518
+ logger.info(f"Retrieving documents for query: {query}")
519
+ try:
520
+ bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
521
+ vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
522
+ combined_nodes = self._combine_results(bm25_nodes, vector_nodes)
523
+
524
+ logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
525
+ return combined_nodes
526
+ except Exception as e:
527
+ logger.error(f"Error in retrieval: {e}")
528
+ raise
529
+
530
+ @staticmethod
531
+ def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
532
+ """
533
+ Combines and de-duplicates results from BM25 and vector retrievers.
534
+
535
+ Args:
536
+ bm25_nodes: Nodes retrieved from BM25 retriever.
537
+ vector_nodes: Nodes retrieved from vector retriever.
538
+
539
+ Returns:
540
+ List: Combined list of unique nodes.
541
+ """
542
+ node_ids: Set = set()
543
+ combined_nodes = []
544
+
545
+ for node in bm25_nodes + vector_nodes:
546
+ if node.node_id not in node_ids:
547
+ combined_nodes.append(node)
548
+ node_ids.add(node.node_id)
549
+
550
+ return combined_nodes
551
+
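
The de-duplication is purely by `node_id`, with BM25 hits keeping precedence in order; a minimal sketch with stand-in objects (real callers pass llama-index nodes, as `PDFQueryEngine` does below):

```python
from types import SimpleNamespace

# Stand-in nodes for illustration only.
bm25_nodes = [SimpleNamespace(node_id="a"), SimpleNamespace(node_id="b")]
vector_nodes = [SimpleNamespace(node_id="b"), SimpleNamespace(node_id="c")]

combined = HybridRetriever._combine_results(bm25_nodes, vector_nodes)
print([n.node_id for n in combined])  # ['a', 'b', 'c'] -- duplicate 'b' kept once
```
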
552
+
553
+ class PDFQueryEngine:
554
+ """
555
+ A class to handle the process of setting up a query engine and performing queries on PDF documents.
556
+
557
+ This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
558
+ indexes, hybrid retrievers, response synthesizers, and executing queries on the set up engine.
559
+
560
+ Attributes:
561
+ documents (List): A list of documents to be indexed.
562
+ llm (Language Model): The language model to be used for embeddings and queries.
563
+ qa_prompt_tmpl (str): Template for creating query prompts.
564
+ queries (List[str]): List of queries to be executed.
565
+
566
+ Methods:
567
+ setup_query_engine(): Sets up the query engine with all necessary components.
568
+ execute_queries(): Executes the predefined queries and prints the results.
569
+ """
570
+
571
+ def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
572
+
573
+ self.documents = documents
574
+ self.llm = llm
575
+ self.embed_model = embed_model
576
+ self.qa_prompt_tmpl = qa_prompt_tmpl
577
+ self.base_utils = base_utils()
578
+
579
+ logger.info("PDFQueryEngine initialized.")
580
+
581
+ def setup_query_engine(self):
582
+ """
583
+ Sets up the query engine by initializing and configuring the embedding model, service context, index,
584
+ hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.
585
+
586
+ The method takes no arguments; the embedding model, service context, index, hybrid
+ retriever, and response synthesizer are all built from the instance attributes.
592
+
593
+ Returns:
594
+ Any: The configured query engine.
595
+ """
596
+
597
+ try:
598
+ logger.info("Initializing the service context for query engine setup.")
599
+ service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
600
+
601
+ logger.info("Creating an index from documents.")
602
+ index = VectorStoreIndex.from_documents(documents=self.documents, service_context=service_context)
603
+ nodes = service_context.node_parser.get_nodes_from_documents(self.documents)
604
+
605
+ logger.info("Setting up vector and BM25 retrievers.")
606
+ vector_retriever = index.as_retriever(similarity_top_k=5)
607
+ bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
608
+ hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)
609
+
610
+ logger.info("Configuring the response synthesizer with the prompt template.")
611
+ qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
612
+ response_synthesizer = get_response_synthesizer(
613
+ service_context=service_context,
614
+ text_qa_template=qa_prompt,
615
+ response_mode="compact",
616
+ )
617
+
618
+ logger.info("Assembling the query engine with reranker and synthesizer.")
619
+ reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")
620
+ query_engine = RetrieverQueryEngine.from_args(
621
+ retriever=hybrid_retriever,
622
+ node_postprocessors=[reranker],
623
+ response_synthesizer=response_synthesizer,
624
+ )
625
+
626
+ logger.info("Query engine setup complete.")
627
+ return query_engine
628
+ except Exception as e:
629
+ logger.error(f"Error during query engine setup: {e}")
630
+ raise
631
+
632
+ def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any,
+ queries: List[str]) -> Tuple[int, int, float, List[str]]:
634
+ """
635
+ Evaluate documents using a language model based on various criteria.
636
+
637
+ Args:
638
+ reg_result (Any): Result related to registration.
639
+ peer_result (Any): Result related to peer review.
640
+ guidelines_result (Any): Result related to following guidelines.
641
+ queries (List[str]): A list of queries to be processed.
642
+
643
+ Returns:
644
+ Tuple[int, int, float, List[str]]: The total score, the number of criteria met,
+ the score as a percentage, and the reasoning extracted for each query.
645
+ """
646
+
647
+ logger.info("Starting evaluation with LLM.")
648
+ query_engine = self.setup_query_engine()
649
+
650
+ total_score = 0
651
+ criteria_met = 0
652
+ reasoning = []
653
+
654
+ for j, query in enumerate(queries):
655
+ # Predefine extracted_data to handle the default case
656
+ extracted_data = None
657
+
658
+ # Handle special cases based on the value of j and other conditions
659
+ if j == 1 and reg_result:
660
+ extracted_data = {"score": 1, "reasoning": reg_result[0]}
661
+ elif j == 2 and guidelines_result:
662
+ extracted_data = {"score": 1,
663
+ "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
664
+ elif j == 8 and (guidelines_result or peer_result):
665
+ extracted_data = {"score": 1, "reasoning": "The article is published in a peer reviewed journal."}
666
+
667
+ # Handle the default case if none of the special conditions were met
668
+ if extracted_data is None:
669
+ result = query_engine.query(query).response
670
+ extracted_data = self.base_utils.extract_score_reasoning(result)
671
+
672
+ if extracted_data['score'] and int(extracted_data["score"]) > 0:
673
+ criteria_met += 1
674
+ total_score += int(extracted_data["score"])
675
+ reasoning.append(extracted_data["reasoning"])
676
+
677
+ score_percentage = (float(total_score) / len(queries)) * 100
678
+ logger.info("Evaluation completed.")
679
+ return total_score, criteria_met, score_percentage, reasoning
680
+
681
+
682
+ class MixtralLLM(CustomLLM):
683
+ """
684
+ A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.
685
+
686
+ Attributes:
687
+ context_window (int): Number of tokens used for context during inference.
688
+ num_output (int): Number of tokens to generate as output.
689
+ temperature (float): Sampling temperature for token generation.
690
+ model_name (str): Name of the model on Hugging Face's model hub.
691
+ api_key (str): API key for authenticating with the Hugging Face API.
692
+
693
+ Methods:
694
+ metadata: Retrieves metadata about the model.
695
+ do_hf_call: Makes an API call to the Hugging Face model.
696
+ complete: Generates a complete response for a given prompt.
697
+ stream_complete: Streams a series of token completions for a given prompt.
698
+ """
699
+
700
+ def __init__(self, context_window: int, num_output: int, temperature: float, model_name: str, api_key: str):
701
+ """
702
+ Initialize the MixtralLLM class with specific configuration values.
703
+
704
+ Args:
705
+ context_window (int): The number of tokens to consider for context during LLM inference.
706
+ num_output (int): The number of tokens to generate in the output.
707
+ temperature (float): The sampling temperature to use for generating tokens.
708
+ model_name (str): The name of the model to be used from Hugging Face's model hub.
709
+ api_key (str): The API key for authentication with Hugging Face's inference API.
710
+ """
711
+ super().__init__()
712
+ self.context_window = context_window
713
+ self.num_output = num_output
714
+ self.temperature = temperature
715
+ self.model_name = model_name
716
+ self.api_key = api_key
717
+
718
+ @property
719
+ def metadata(self) -> LLMMetadata:
720
+ """
721
+ Retrieves metadata for the Mixtral LLM.
722
+
723
+ Returns:
724
+ LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
725
+ """
726
+ return LLMMetadata(
727
+ context_window=self.context_window,
728
+ num_output=self.num_output,
729
+ model_name=self.model_name,
730
+ )
731
+
732
+ def do_hf_call(self, prompt: str) -> str:
733
+ """
734
+ Makes an API call to the Hugging Face model and retrieves the generated response.
735
+
736
+ Args:
737
+ prompt (str): The input prompt for the model.
738
+
739
+ Returns:
740
+ str: The text generated by the model in response to the prompt.
741
+
742
+ Raises:
743
+ Exception: If the API call fails or returns an error.
744
+ """
745
+ data = {
746
+ "inputs": prompt,
747
+ "parameters": {"temperature": self.temperature}
748
+ }
749
+
750
+ # Makes a POST request to the Hugging Face API to get the model's response
751
+ response = requests.post(
752
+ f'https://api-inference.huggingface.co/models/{self.model_name}',
753
+ headers={
754
+ 'authorization': f'Bearer {self.api_key}',
755
+ 'content-type': 'application/json',
756
+ },
757
+ json=data,
758
+ stream=True
759
+ )
760
+
761
+ # Checks for a successful response and parses the generated text
762
+ if response.status_code != 200 or not response.json() or 'error' in response.json():
763
+ logger.error(f"HF inference API error: {response}")
764
+ return "Unable to answer for technical reasons."
765
+ full_txt = response.json()[0]['generated_text']
766
+ # Finds the section of the text following the context separator
767
+ offset = full_txt.find("---------------------")
768
+ ss = full_txt[offset:]
769
+ # Extracts the actual answer from the response
770
+ offset = ss.find("Answer:")
771
+ return ss[offset + 7:].strip()
772
+
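
The answer extraction assumes the prompt template contains a `---------------------` separator and an `Answer:` marker, as the Mistral prompt in this repo apparently does; a sketch of the slicing on an invented `generated_text`:

```python
# Invented example of the API's generated_text field.
full_txt = ("Context information is below.\n"
            "---------------------\n"
            "...retrieved chunks...\n"
            "---------------------\n"
            "Answer: Yes, the study was registered in advance.")
ss = full_txt[full_txt.find("---------------------"):]
print(ss[ss.find("Answer:") + 7:].strip())
# -> "Yes, the study was registered in advance."
```
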
773
+ @llm_completion_callback()
774
+ def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
775
+ """
776
+ Generates a complete response for a given prompt using the Hugging Face API.
777
+
778
+ Args:
779
+ prompt (str): The input prompt for the model.
780
+ **kwargs: Additional keyword arguments for the completion.
781
+
782
+ Returns:
783
+ CompletionResponse: The complete response from the model.
784
+ """
785
+ response = self.do_hf_call(prompt)
786
+ return CompletionResponse(text=response)
787
+
788
+ @llm_completion_callback()
789
+ def stream_complete(
790
+ self, prompt: str, **kwargs: Any
791
+ ) -> CompletionResponseGen:
792
+ """
793
+ Streams a series of token completions as a response for the given prompt.
794
+
795
+ This method is useful for streaming responses where each token is generated sequentially.
796
+
797
+ Args:
798
+ prompt (str): The input prompt for the model.
799
+ **kwargs: Additional keyword arguments for the streaming completion.
800
+
801
+ Yields:
802
+ CompletionResponseGen: A generator yielding each token in the completion response.
803
+ """
804
+ # do_hf_call returns the full generated text, so this yields it
+ # character by character to emulate token streaming
805
+ response = ""
806
+ for token in self.do_hf_call(prompt):
807
+ response += token
808
+ yield CompletionResponse(text=response, delta=token)
809
+
810
+
811
+ class KeywordSearch():
812
+ def __init__(self, chunks):
813
+ self.chunks = chunks
814
+
815
+ def find_journal_name(self, response: str, journal_list: list) -> bool:
+ """
+ Checks whether any known journal name appears in a given response string.
+
+ This function iterates through a list of known journal names and performs a
+ case-insensitive substring search for each one in the response.
+
+ Args:
+ response (str): The response string to search for a journal name.
+ journal_list (list): A list of journal names to search within the response.
+
+ Returns:
+ bool: True if any journal name from the list occurs in the response, False otherwise.
+ """
831
+ response_lower = response.lower()
832
+ for journal in journal_list:
833
+ journal_lower = journal.lower()
834
+
835
+ if journal_lower in response_lower:
836
+ return True
837
+
838
+ return False
839
+
840
+ def check_registration(self):
841
+ """
842
+ Check chunks of text for various registration numbers or URLs of registries.
843
+ Returns the sentence containing a registration number, or if not found,
844
+ returns chunks containing registry URLs.
845
+
846
+ Searches the chunks stored on the instance (self.chunks); the method takes no arguments.
848
+
849
+ Returns:
850
+ list of str: List of matching sentences or chunks, or an empty list if no matches are found.
851
+ """
852
+
853
+ # Patterns for different registration types
854
+ patterns = {
855
+ "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
856
+ "ISRCTN": r"(ISRCTN\d{8})",
857
+ "EudraCT": r"(\d{4}-\d{6}-\d{2})",
858
+ "UMIN-CTR": r"(UMIN\d{9})",
859
+ "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
860
+ }
861
+
862
+ # Registry URLs
863
+ registry_urls = [
864
+ "www.anzctr.org.au",
865
+ "anzctr.org.au",
866
+ "www.clinicaltrials.gov",
867
+ "clinicaltrials.gov",
868
+ "www.ISRCTN.org",
869
+ "ISRCTN.org",
870
+ "www.umin.ac.jp/ctr/index/htm",
871
+ "umin.ac.jp/ctr/index/htm",
872
+ "www.onderzoekmetmensen.nl/en",
873
+ "onderzoekmetmensen.nl/en",
874
+ "eudract.ema.europa.eu",
875
+ "www.eudract.ema.europa.eu"
876
+ ]
877
+
878
+ # Check each chunk for registration numbers
879
+ for chunk in self.chunks:
880
+ # Split chunk into sentences
881
+ sentences = re.split(r'(?<=[.!?]) +', chunk)
882
+
883
+ # Check each sentence for any registration number
884
+ for sentence in sentences:
885
+ for pattern in patterns.values():
886
+ if re.search(pattern, sentence):
887
+ return [sentence] # Return immediately if a registration number is found
888
+
889
+ # If no registration number found, check for URLs in chunks
890
+ matching_chunks = []
891
+ for chunk in self.chunks:
892
+ if any(url in chunk for url in registry_urls):
893
+ matching_chunks.append(chunk)
894
+
895
+ return matching_chunks
896
+
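
As an illustration, the NCT pattern tolerates an optional `#`, an optional `No`, and surrounding parentheses around the eight-digit identifier (sample sentences invented):

```python
import re

nct = r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?"
for sentence in [
    "The trial was registered (NCT 01234567) before enrolment.",
    "Registration: NCT#01234567.",
    "No registration statement was found.",
]:
    print(bool(re.search(nct, sentence)))
# True, True, False
```
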
897
+
898
+ class StringExtraction():
899
+ """
900
+ A class to handle the extraction of query strings from complete LLM responses.
+
+ This class encapsulates the functionality of extracting the original ground truth from a labelled-data CSV
+ and query strings from responses. Note that LLMs may format their answers differently depending on the model
+ or prompting technique; in such cases extract_original_prompt may not give satisfactory results, and writing
+ your own string-extraction method is the best option.
905
+
906
+
907
+ Methods:
908
+ extract_original_prompt():
909
+ extraction_ground_truth():
910
+ """
911
+
912
+ def extract_original_prompt(self, result):
913
+ r1 = result.response.strip().split("\n")
914
+ binary_response = ""
915
+ explanation_response = ""
916
+ for r in r1:
917
+ if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
918
+ binary_response = r
919
+ elif r.find("Reasoning:") >= 0:
920
+ cut = r.find(":")
921
+ explanation_response += r[cut + 1:].strip()
922
+
923
+ return binary_response, explanation_response
924
+
925
+ def extraction_ground_truth(self, paper_name, labelled_data):
926
+ id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
927
+ id_row = labelled_data[labelled_data["id"] == id]
928
+ ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
929
+ binary_ground_truth = []
930
+ explanation_ground_truth = []
931
+ for g in ground_truth:
932
+ if len(g) > 0:
933
+ binary_ground_truth.append("Yes")
934
+ explanation_ground_truth.append(g)
935
+ else:
936
+ binary_ground_truth.append("No")
937
+ explanation_ground_truth.append("The article does not provide any relevant information.")
938
+ return binary_ground_truth, explanation_ground_truth
939
+
940
+
941
+ class EvaluationMetrics():
942
+ """
943
+
944
+ This class encapsulates the evaluation methods that have been used in the project.
945
+
946
+ Attributes:
947
+ explanation_response: a list of detailed responses from the LLM, one per query.
+ explanation_ground_truth: the list of ground truths, one per query.
+
+ Methods:
+ metric_cosine_similarity(): Computes cosine similarity between response and ground-truth embeddings.
+ metric_rouge(): Computes ROUGE scores of responses against ground truths.
+ binary_accuracy(): Computes the fraction of binary responses matching the binary ground truth.
954
+ """
955
+
956
+ def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
957
+ self.explanation_response = explanation_response
958
+ self.explanation_ground_truth = explanation_ground_truth
959
+ self.embedding_model = embedding_model
960
+
961
+ def metric_cosine_similarity(self):
962
+ ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
963
+ explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
964
+ return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))
965
+
966
+ def metric_rouge(self):
967
+ rouge = evaluate.load("rouge")
968
+ results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
969
+ return results
970
+
971
+ def binary_accuracy(self, binary_response, binary_ground_truth):
972
+ count = 0
973
+ if len(binary_response) != len(binary_ground_truth):
974
+ return "Arrays to be compared have different lengths."
975
+ else:
976
+ for i in range(len(binary_response)):
977
+ if binary_response[i] == binary_ground_truth[i]:
978
+ count += 1
979
+ return np.round(count / len(binary_response), 2)
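
A usage sketch for the class (toy strings; the embedding model is an assumption — anything exposing `.encode()`, such as a sentence-transformers model, fits, and `cosine_similarity`/`numpy` are already imported by this module):

```python
from sentence_transformers import SentenceTransformer

# Toy responses and ground truths, invented for illustration.
responses = ["Data are shared on GitHub.", "No registration was reported."]
truths = ["Code and data are available on GitHub.", "The study was not registered."]

embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")  # assumed model choice
metrics = EvaluationMetrics(responses, truths, embedder)

print(metrics.metric_cosine_similarity())                     # per-pair similarities
print(metrics.metric_rouge())                                 # ROUGE-1/2/L scores
print(metrics.binary_accuracy(["Yes", "No"], ["Yes", "No"]))  # -> 1.0
```
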
librarymed/local/__init__.py ADDED
File without changes
librarymed/local/app_local.py ADDED
@@ -0,0 +1,160 @@
1
+ import time
2
+ import argparse
3
+ import logging
4
+ import os
5
+
6
+ import openai
7
+ from flask import Flask, flash, request, render_template, redirect
8
+ from llama_index import Document
9
+ from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
10
+ from llama_index.llms import OpenAI
11
+
12
+ from librarymed.local.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils, \
13
+ ConfigManager
14
+
15
+ app = Flask(__name__)
16
+ app.config['SECRET_KEY'] = 'librarymed super secret key'
17
+
18
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
19
+ config_manager = ConfigManager()
20
+ config_manager.load_config("api", "Config/api_config.json")
21
+ config_manager.load_config("model", "Config/model_config.json")
22
+ app.config['user_config'] = config_manager
23
+
24
+
25
+ def allowed_file(filename, allowed_extensions):
26
+ """ Helper function to check if the file extension is allowed """
27
+ return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
28
+
29
+
30
+ @app.route('/', methods=['GET'])
31
+ def home():
32
+ score = 0
33
+ criteria_met = 0
34
+ title = ""
35
+ author_info = ""
36
+ reasoning = ""
37
+
38
+ return render_template('index.html',
39
+ title=title,
40
+ author=author_info,
41
+ score=score,
42
+ criteria_met=criteria_met,
43
+ reasoning=reasoning,
44
+ )
45
+
46
+
47
+ @app.route('/upload', methods=['POST'])
48
+ def upload():
49
+ config = app.config['user_config']
50
+ openai.api_key = config.get_config_value("api", "OPENAI_API_KEY")
51
+ hf_token = config.get_config_value("api", "HF_TOKEN")
52
+ embed = config.get_config_value("model", "embeddings")
53
+ embed_model_name = config.get_config_value("model", "embeddings_model")
54
+ llm_model = config.get_config_value("model", "llm_model")
55
+ model_temperature = config.get_config_value("model", "model_temp")
56
+ output_token_size = config.get_config_value("model", "max_tokens")
57
+ model_context_window = config.get_config_value("model", "context_window")
58
+ gpt_prompt_path = config.get_config_value("model", "GPT_PROMPT_PATH")
+ mistral_prompt_path = config.get_config_value("model", "MISTRAL_PROMPT_PATH")
60
+ info_prompt_path = config.get_config_value("model", "INFO_PROMPT_PATH")
61
+ peer_review_journals_path = config.get_config_value("model", "peer_review_journals_path")
62
+ eq_network_journals_path = config.get_config_value("model", "eq_network_journals_path")
63
+ queries = config.get_config_value("model", "queries")
64
+ num_criteria = len(config.get_config_value("model", "criteria"))
65
+ author_query = config.get_config_value("model", "author_query")
66
+ journal_query = config.get_config_value("model", "journal_query")
67
+
68
+ # Check if the post request has the file part
69
+ if 'file' not in request.files:
70
+ flash('No file part')
71
+ return redirect(request.url)
72
+ file = request.files['file']
73
+ # If user does not select file, browser also submits an empty part without filename
74
+ if file.filename == '':
75
+ flash('No selected file')
76
+ return redirect(request.url)
77
+ if file and allowed_file(file.filename, config.get_config_value("model", "allowed_extensions")):
78
+ try:
79
+ # Process the PDF file
80
+ pdf_processor = PDFProcessor_Unstructured(config.get_config_value("model", "pdf_processing"))
81
+ merged_chunks, tables, title = pdf_processor.process_pdf_file(file)
82
+ documents = [Document(text=t) for t in merged_chunks]
83
+
84
+ utils = base_utils()
85
+
86
+ # LLM Model choice
87
+ if 'gpt' in llm_model.lower(): # tested with "gpt-4" and "gpt-3.5-turbo"
88
+ llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)
89
+ prompt_template = utils.read_from_file(gpt_prompt_path)
90
+
91
+ elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
92
+ if any(param is None for param in
93
+ [model_context_window, output_token_size, model_temperature, hf_token]):
94
+ raise ValueError("All parameters are required for Mistral LLM.")
95
+
96
+ llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
97
+ temperature=model_temperature, model_name=llm_model, api_key=hf_token)
98
+ prompt_template = utils.read_from_file(mistral_prompt_path)
99
+
100
+ else:
101
+ raise NotImplementedError(f"Error initializing language model '{llm_model}'")
102
+
103
+ # Embedding model choice for RAG
104
+ if embed == "openai":
105
+ embed_model = OpenAIEmbedding()
106
+
107
+ elif embed == "huggingface":
+ # Fall back to the default model if no name is provided
+ embed_model_name = embed_model_name or "BAAI/bge-small-en-v1.5"
+ embed_model = HuggingFaceEmbedding(embed_model_name)
115
+
116
+ else:
117
+ raise NotImplementedError(f"Error initializing embedding model: {embed}")
118
+
119
+ # Prompts and Queries
120
+ info_prompt = utils.read_from_file(info_prompt_path)
121
+
122
+ peer_review_journals = utils.read_from_file(peer_review_journals_path)
123
+ eq_network_journals = utils.read_from_file(eq_network_journals_path)
124
+
125
+ peer_review_journals_list = peer_review_journals.split('\n')
126
+ eq_network_journals_list = eq_network_journals.split('\n')
127
+
128
+ # Use the journal_query template from the config ("... journals: {}?")
+ modified_journal_query = journal_query.format(", ".join(peer_review_journals_list))
130
+
131
+ pdf_info_query = PDFQueryEngine(documents, llm, embed_model, (info_prompt))
132
+ info_query_engine = pdf_info_query.setup_query_engine()
133
+ journal_result = info_query_engine.query(modified_journal_query).response
134
+ author_info = info_query_engine.query(author_query).response
135
+
136
+ pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, (prompt_template))
137
+
138
+ # Check for prior registration
139
+ nlp_methods = KeywordSearch(merged_chunks)
140
+ eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
141
+ peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
142
+ registration_result = nlp_methods.check_registration()
143
+
144
+ # Evaluate with OpenAI model
145
+ total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
146
+ registration_result, peer_journal_result, eq_journal_result, queries)
147
+ score = f"{round((total_score / num_criteria) * 100)}/100"
148
+
149
+ except Exception as e:
150
+ flash('An error occurred while processing the file. Error: ' + str(e))
151
+ return redirect(request.url)
152
+
153
+ # e.g. score: 56 / 100 - criteria_met: 5 - author_info: Direct
154
+ return render_template('index.html',
155
+ title=title,
156
+ author=author_info,
157
+ score=score,
158
+ criteria_met=criteria_met,
159
+ reasoning=reasoning,
160
+ )
librarymed/local/templates/index.html ADDED
@@ -0,0 +1,187 @@
1
+ <!doctype html>
2
+ <html>
3
+ <head>
4
+ <title>Upload and Results</title>
5
+ <!-- Include Google Fonts -->
6
+ <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
7
+ <style>
8
+ body {
9
+ font-family: 'Roboto', sans-serif;
10
+ background-color: #f4f4f4;
11
+ overflow: auto;
12
+ width: 100%;
13
+ margin: 0;
14
+ padding: 0;
15
+ display: flex;
16
+ flex-direction: column; /* Stack flex items vertically */
17
+ align-items: center; /* Center items horizontally */
18
+ justify-content: flex-start; /* Align items to the start of the container vertically */
19
+ min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
20
+ }
21
+
22
+ table {
23
+ width: 100%; /* Adjust the width as needed */
24
+ border-collapse: collapse; /* Collapse borders for a tighter look */
25
+ }
26
+
27
+ th, td {
28
+ border: 1px solid #ddd; /* Adjust the border size as needed */
29
+ text-align: left;
30
+ padding: 5px; /* Reduce padding to decrease cell spacing */
31
+ height: 30px; /* Optionally reduce the height of the cells */
32
+ }
33
+ .parent-element {
34
+ overflow: visible; /* Ensures content is not cut off */
35
+ }
36
+ .container {
37
+ background-color: white;
38
+ overflow: auto;
39
+ border-radius: 8px;
40
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
41
+ padding: 40px;
42
+ width: 100%; /* Set width to 100% of the viewport */
43
+ max-width: 700px;
44
+ }
45
+ .score-bar-container {
46
+ position: relative;
47
+ margin-top: 20px; /* Space above the score bar */
48
+ max-width: 100%; /* Ensures the container does not exceed the parent width */
49
+ }
50
+ .score-very-good-fill {
51
+ background-color: #4CAF50; /* Green */
52
+ }
53
+
54
+ .score-good-fill {
55
+ background-color: #FFEB3B; /* Yellow */
56
+ }
57
+
58
+ .score-ok-fill {
59
+ background-color: #FF9800; /* Orange */
60
+ }
61
+
62
+ .score-bad-fill {
63
+ background-color: #f44336; /* Red */
64
+ }
65
+
66
+ .score-very-bad-fill {
67
+ background-color: #9E9E9E; /* Grey */
68
+ }
69
+ .score-very-good-text {
70
+ color: #4CAF50; /* Green */
71
+ }
72
+
73
+ .score-good-text {
74
+ color: #FFEB3B; /* Yellow */
75
+ }
76
+
77
+ .score-ok-text {
78
+ color: #FF9800; /* Orange */
79
+ }
80
+
81
+ .score-bad-text {
82
+ color: #f44336; /* Red */
83
+ }
84
+
85
+ .score-very-bad-text {
86
+ color: #9E9E9E; /* Grey */
87
+ }
88
+
89
+ .score-bar {
90
+ background-color: #ddd;
91
+ border-radius: 10px;
92
+ height: 20px;
93
+ width: 100%; /* Adjusted to take the full width */
94
+ display: inline-block; /* Allows the score text to sit next to the score bar */
95
+ vertical-align: middle; /* Aligns score bar and text vertically */
96
+ }
97
+
98
+ .score-fill {
99
+ height: 100%;
100
+ border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
101
+ display: inline-block;
102
+ vertical-align: middle;
103
+ }
104
+
105
+ .score-text {
106
+ display: inline-block;
107
+ vertical-align: middle; /* Align with the score bar */
108
+ font-weight: bold; /* Make the score text bold */
109
+ margin-left: 10px; /* Space between the score bar and score text */
110
+ }
111
+
112
+ .score-title {
113
+ font-size: 20px;
114
+ font-weight: bold;
115
+ margin: 20px 0;
116
+ color: #333;
117
+ }
118
+ .major-issues {
119
+ text-align: left; /* Aligns the major issues to the left */
120
+ padding-left: 20px; /* Padding for the bullet list */
121
+ list-style: inside disc; /* Bullet style */
122
+ }
123
+ form {
124
+ margin-bottom: 20px;
125
+ }
126
+ input[type="file"] {
127
+ margin-bottom: 10px;
128
+ }
129
+ input[type="submit"] {
130
+ cursor: pointer;
131
+ margin-top: 10px;
132
+ padding: 10px 20px;
133
+ border: none;
134
+ background-color: #4CAF50;
135
+ color: white;
136
+ border-radius: 5px;
137
+ font-size: 16px;
138
+ font-weight: bold;
139
+ }
140
+ input[type="submit"]:hover {
141
+ background-color: #45a049;
142
+ }
143
+ </style>
144
+ </head>
145
+ <body>
146
+ <div class="container">
147
+ <h2>Upload PDF and View Results</h2>
148
+
149
+ <!-- Upload Form -->
150
+ <form action="/upload" method="post" enctype="multipart/form-data">
151
+ <input type="file" name="file" required>
152
+ <input type="submit" value="Upload">
153
+ </form>
154
+
155
+ <!-- Results Section -->
156
+ {% if score %}
157
+ <!-- GPT-4 Score Bar -->
158
+ <div class="score-title">Score:</div>
159
+ <div class="score-bar-container">
160
+ <div class="score-bar">
161
+ <div class="score-fill {{
162
+ 'score-very-good-fill' if criteria_met == 9 else
163
+ 'score-good-fill' if criteria_met >= 7 else
164
+ 'score-ok-fill' if criteria_met >= 5 else
165
+ 'score-bad-fill' if criteria_met >= 3 else
166
+ 'score-very-bad-fill' }}" style="width: {{ ((criteria_met / 9) * 100) | round | int }}%;"></div>
167
+ </div>
168
+ <div class="score-text">{{ score }}</div>
169
+ </div>
170
+
171
+ <h3>Title:</h3>
172
+ <p> {{title}}</p>
173
+
174
+ <h3>Author Information:</h3>
175
+ <p> {{author}}</p>
176
+
177
+ <h3>Reasoning:</h3>
178
+ <ul class="major-issues">
179
+ {% for issue in reasoning %}
180
+ <li>{{ issue }}</li>
181
+ {% endfor %}
182
+ </ul>
183
+
184
+ {% endif %}
185
+ </div>
186
+ </body>
187
+ </html>
librarymed/local/templates/upload_and_results.html ADDED
@@ -0,0 +1,227 @@
1
+ <!doctype html>
2
+ <html>
3
+ <head>
4
+ <title>Upload and Results</title>
5
+ <!-- Include Google Fonts -->
6
+ <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
7
+ <style>
8
+ body {
9
+ font-family: 'Roboto', sans-serif;
10
+ background-color: #f4f4f4;
11
+ overflow: auto;
12
+ width: 100%;
13
+ margin: 0;
14
+ padding: 0;
15
+ display: flex;
16
+ flex-direction: column; /* Stack flex items vertically */
17
+ align-items: center; /* Center items horizontally */
18
+ justify-content: flex-start; /* Align items to the start of the container vertically */
19
+ min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
20
+ }
21
+
22
+ table {
23
+ width: 100%; /* Adjust the width as needed */
24
+ border-collapse: collapse; /* Collapse borders for a tighter look */
25
+ }
26
+
27
+ th, td {
28
+ border: 1px solid #ddd; /* Adjust the border size as needed */
29
+ text-align: left;
30
+ padding: 5px; /* Reduce padding to decrease cell spacing */
31
+ height: 30px; /* Optionally reduce the height of the cells */
32
+ }
33
+ .parent-element {
34
+ overflow: visible; /* Ensures content is not cut off */
35
+ }
36
+ .container {
37
+ background-color: white;
38
+ overflow: auto;
39
+ border-radius: 8px;
40
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
41
+ padding: 40px;
42
+ width: 100%; /* Set width to 100% of the viewport */
43
+ max-width: 700px;
44
+ }
45
+ .score-bar-container {
46
+ position: relative;
47
+ margin-top: 20px; /* Space above the score bar */
48
+ max-width: 100%; /* Ensures the container does not exceed the parent width */
49
+ }
50
+ .score-very-good-fill {
51
+ background-color: #4CAF50; /* Green */
52
+ }
53
+
54
+ .score-good-fill {
55
+ background-color: #FFEB3B; /* Yellow */
56
+ }
57
+
58
+ .score-ok-fill {
59
+ background-color: #FF9800; /* Orange */
60
+ }
61
+
62
+ .score-bad-fill {
63
+ background-color: #f44336; /* Red */
64
+ }
65
+
66
+ .score-very-bad-fill {
67
+ background-color: #9E9E9E; /* Grey */
68
+ }
69
+ .score-very-good-text {
70
+ color: #4CAF50; /* Green */
71
+ }
72
+
73
+ .score-good-text {
74
+ color: #FFEB3B; /* Yellow */
75
+ }
76
+
77
+ .score-ok-text {
78
+ color: #FF9800; /* Orange */
79
+ }
80
+
81
+ .score-bad-text {
82
+ color: #f44336; /* Red */
83
+ }
84
+
85
+ .score-very-bad-text {
86
+ color: #9E9E9E; /* Grey */
87
+ }
88
+
89
+ .score-bar {
90
+ background-color: #ddd;
91
+ border-radius: 10px;
92
+ height: 20px;
93
+ width: 100%; /* Adjusted to take the full width */
94
+ display: inline-block; /* Allows the score text to sit next to the score bar */
95
+ vertical-align: middle; /* Aligns score bar and text vertically */
96
+ }
97
+
98
+ .score-fill {
99
+ height: 100%;
100
+ border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
101
+ display: inline-block;
102
+ vertical-align: middle;
103
+ }
104
+
105
+ .score-text {
106
+ display: inline-block;
107
+ vertical-align: middle; /* Align with the score bar */
108
+ font-weight: bold; /* Make the score text bold */
109
+ margin-left: 10px; /* Space between the score bar and score text */
110
+ }
111
+
112
+ .score-title {
113
+ font-size: 20px;
114
+ font-weight: bold;
115
+ margin: 20px 0;
116
+ color: #333;
117
+ }
118
+ .major-issues {
119
+ text-align: left; /* Aligns the major issues to the left */
120
+ padding-left: 20px; /* Padding for the bullet list */
121
+ list-style: inside disc; /* Bullet style */
122
+ }
123
+ form {
124
+ margin-bottom: 20px;
125
+ }
126
+ input[type="file"] {
127
+ margin-bottom: 10px;
128
+ }
129
+ input[type="submit"] {
130
+ cursor: pointer;
131
+ margin-top: 10px;
132
+ padding: 10px 20px;
133
+ border: none;
134
+ background-color: #4CAF50;
135
+ color: white;
136
+ border-radius: 5px;
137
+ font-size: 16px;
138
+ font-weight: bold;
139
+ }
140
+ input[type="submit"]:hover {
141
+ background-color: #45a049;
142
+ }
143
+ </style>
144
+ </head>
145
+ <body>
146
+ <div class="container">
147
+ <h2>Upload PDF and View Results</h2>
148
+
149
+ <!-- Upload Form -->
150
+ <form action="/upload" method="post" enctype="multipart/form-data">
151
+ <input type="file" name="file" required>
152
+ <input type="submit" value="Upload">
153
+ </form>
154
+
155
+ <!-- Results Section -->
156
+ {% if gpt4_score is not none or mistral_score is not none %}
157
+ <!-- GPT-4 Score Bar -->
158
+ <div class="score-title">Score for GPT-4:</div>
159
+ <div class="score-bar-container">
160
+ <div class="score-bar">
161
+ <div class="score-fill {{
162
+ 'score-very-good-fill' if criteria_met_gpt4 == 9 else
163
+ 'score-good-fill' if criteria_met_gpt4 >= 7 else
164
+ 'score-ok-fill' if criteria_met_gpt4 >= 5 else
165
+ 'score-bad-fill' if criteria_met_gpt4 >= 3 else
166
+ 'score-very-bad-fill' }}" style="width: {{ score_percentage_gpt4 }}%;"></div>
167
+ </div>
168
+ <div class="score-text">{{ total_score_gpt4 }}/9</div>
169
+ </div>
170
+
171
+ <!-- Mistral Score Bar -->
172
+ <div class="score-title">Score for Mistral:</div>
173
+ <div class="score-bar-container">
174
+ <div class="score-bar">
175
+ <div class="score-fill {{
176
+ 'score-very-good-fill' if criteria_met_mistral == 9 else
177
+ 'score-good-fill' if criteria_met_mistral >= 7 else
178
+ 'score-ok-fill' if criteria_met_mistral >= 5 else
179
+ 'score-bad-fill' if criteria_met_mistral >= 3 else
180
+ 'score-very-bad-fill' }}" style="width: {{ score_percentage_mistral }}%;"></div>
181
+ </div>
182
+ <div class="score-text">{{ total_score_mistral }}/9</div>
183
+ </div>
184
+
185
+ <!-- Reasoning for GPT-4 -->
186
+ <h3>Reasoning from GPT-4:</h3>
187
+ <ul class="major-issues">
188
+ {% for issue in reasoning_gpt4 %}
189
+ <li>{{ issue }}</li>
190
+ {% endfor %}
191
+ </ul>
192
+
193
+ <!-- Reasoning for Mistral -->
194
+ <h3>Reasoning from Mistral:</h3>
195
+ <ul class="major-issues">
196
+ {% for issue in reasoning_mistral %}
197
+ <li>{{ issue }}</li>
198
+ {% endfor %}
199
+ </ul>
200
+ <!-- Insert the Criteria Table Section Here -->
201
+ {% if combined_criteria_table %}
202
+ <h3>Criteria Evaluation</h3>
203
+ <table>
204
+ <thead>
205
+ <tr>
206
+ <th>Criteria Number</th>
207
+ <th>GPT-4 output</th>
208
+ <th>Mistral output</th>
209
+ <th>Ground truth</th>
210
+ </tr>
211
+ </thead>
212
+ <tbody>
213
+ {% for row in combined_criteria_table %}
214
+ <tr>
215
+ <td>{{ row['Criteria Number'] }}</td>
216
+ <td>{{ 'Yes' if row['Score GPT-4'] == 1 else 'No' }}</td>
217
+ <td>{{ 'Yes' if row['Score Mistral'] == 1 else 'No' }}</td>
218
+ <td>{{ 'Yes' if row['ground truth'] else 'No' }}</td>
219
+ </tr>
220
+ {% endfor %}
221
+ </tbody>
222
+ </table>
223
+ {% endif %}
224
+ {% endif %}
225
+ </div>
226
+ </body>
227
+ </html>
librarymed/main.py ADDED
@@ -0,0 +1,22 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+ if __name__ == '__main__':
9
+ args_parse = argparse.ArgumentParser(description="LibraryMed")
10
+ args_parse.add_argument("--local", help="Run interface v0.1.0 by the fellows", action="store_true")
11
+ args = args_parse.parse_args()
12
+ port = int(os.getenv("PORT") or 80)
13
+
14
+ if args.local:
15
+ from .local.app_local import app
16
+ logging.info("Run LibraryMed interface v0.1.0 developed by the fellows")
17
+ app.run(debug=True, host="0.0.0.0", port=port)
18
+
19
+ else:
20
+ from kromin.app_librarymed import app
21
+ logging.info("Run LibraryMed interface v0.2.0 developed by Kromin")
22
+ app.run(debug=True, host="0.0.0.0", port=port)
requirements.txt ADDED
@@ -0,0 +1,41 @@
1
+ beautifulsoup4
2
+ chromadb
3
+ cohere
4
+ faiss-cpu
5
+ Flask
6
+ langchain
7
+ langchainhub
8
+ gradio
9
+ llama-index == 0.9.35
10
+ llmsherpa
11
+ lxml
12
+ unstructured
13
+ bs4
14
+ evaluate
16
+ numpy
17
+ openai
18
+ Pillow == 10.0.1
19
+ PyPDF2
20
+ pydantic
21
+ rank-bm25
22
+ requests
23
+ rapidocr-onnxruntime
24
+ rouge-score
25
+ scikit-learn
26
+ sentence-transformers
27
+ tiktoken
28
+ transformers
29
+ tesseract
30
+ pdf2image
31
+ pdfminer.six
32
+ opencv-python
33
+ pikepdf
34
+ pypdf
35
+ unstructured-inference
36
+ pytesseract
37
+ pillow-heif
38
+ unstructured-pytesseract
39
+ fpdf
40
+ qdrant_client
41
+ python-dotenv