kpal002 committed on
Commit
b98ea00
1 Parent(s): 851ca26

Upload 16 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Tag of the nvidia/cuda base image; override with --build-arg CUDA_IMAGE=<tag>.
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}

# Install Python 3.10, pip and the native packages the application needs
# (Tesseract OCR, libGL and Poppler). The apt lists are removed in the same
# layer so they do not end up in the image.
RUN apt-get update && apt-get install -y \
    python3.10 python3-pip \
    tesseract-ocr \
    libtesseract-dev \
    libgl1-mesa-glx \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Install the Python dependencies before copying the application code so this
# layer is only rebuilt when requirements.txt changes.
WORKDIR /app
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Create an unprivileged user to run the application
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# The application runs from the user's home directory. The code is copied once,
# owned by that user (the earlier root-owned copy into /app was never used).
WORKDIR $HOME/app
COPY --chown=user . $HOME/app

# Port served by uvicorn (see CMD below)
EXPOSE 8000

# Define environment variable
ENV NAME=World

# Run the application when the container launches
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
RAG_utils.py ADDED
@@ -0,0 +1,1001 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import torch
5
+
6
+ import openai
7
+ import logging
8
+ import asyncio
9
+ import aiohttp
10
+ import pandas as pd
11
+ import numpy as np
12
+ import evaluate
13
+ import qdrant_client
14
+ from pydantic import BaseModel, Field
15
+ from typing import Any, List, Tuple, Set, Dict, Optional, Union
16
+ from sklearn.metrics.pairwise import cosine_similarity
17
+
18
+ from unstructured.partition.pdf import partition_pdf
19
+
20
+ import llama_index
21
+ from llama_index import PromptTemplate
22
+ from llama_index.retrievers import VectorIndexRetriever, BaseRetriever, BM25Retriever
23
+ from llama_index.query_engine import RetrieverQueryEngine
24
+ from llama_index import get_response_synthesizer
25
+ from llama_index.schema import NodeWithScore
26
+ from llama_index.query_engine import RetrieverQueryEngine
27
+ from llama_index import VectorStoreIndex, ServiceContext
28
+ from llama_index.embeddings import OpenAIEmbedding
29
+ from llama_index.llms import HuggingFaceLLM
30
+ import requests
31
+ from llama_index.llms import (
32
+ CustomLLM,
33
+ CompletionResponse,
34
+ CompletionResponseGen,
35
+ LLMMetadata,
36
+ )
37
+ from llama_index.query_engine import RetrieverQueryEngine
38
+ from llama_index.llms.base import llm_completion_callback
39
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
40
+ from llama_index.storage.storage_context import StorageContext
41
+ from llama_index.postprocessor import SentenceTransformerRerank, LLMRerank
42
+
43
+ from tempfile import NamedTemporaryFile
44
# Root logging configuration: INFO and above, with timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used by the classes defined in this module.
logger = logging.getLogger(__name__)
49
+
50
class ConfigManager:
    """
    A class to manage loading and accessing configuration settings.

    Several configurations can be held at once; each one is stored under the
    name passed to load_config.

    Attributes:
        configs (dict): Maps a configuration name to the data parsed from its JSON file.

    Methods:
        load_config(config_name: str, config_path: str): Loads a JSON file under the given name.
        get_config_value(config_name: str, key: str): Retrieves one value from a named configuration.
    """

    def __init__(self):
        self.configs = {}

    def load_config(self, config_name: str, config_path: str) -> None:
        """
        Loads configuration settings from a specified JSON file into a named configuration.

        Args:
            config_name (str): The name to assign to this set of configurations.
            config_path (str): The path to the configuration file.

        Raises:
            FileNotFoundError: If the config file is not found.
            json.JSONDecodeError: If there is an error parsing the config file.
        """
        try:
            # JSON is UTF-8 by specification, so do not depend on the locale's default encoding.
            with open(config_path, 'r', encoding='utf-8') as f:
                self.configs[config_name] = json.load(f)
        except FileNotFoundError:
            logger.error(f"Config file not found at {config_path}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Error decoding config file: {e}")
            raise

    def get_config_value(self, config_name: str, key: str) -> Any:
        """
        Retrieves a specific configuration value.

        Args:
            config_name (str): The name the configuration was loaded under.
            key (str): The key for the configuration setting.

        Returns:
            Any: The value of the configuration setting, as parsed from JSON.

        Raises:
            ValueError: If the configuration or the key is not found, or the value
                is still the "ENTER_YOUR_TOKEN_HERE" placeholder.
        """
        # An unknown config_name falls back to an empty dict so it is reported
        # through the same ValueError as a missing key.
        value = self.configs.get(config_name, {}).get(key)
        if value is None or value == "ENTER_YOUR_TOKEN_HERE":
            raise ValueError(f"Please set your '{key}' in the config.json file.")
        return value
105
+
106
class base_utils:
    """
    A utility class providing miscellaneous static methods for processing and analyzing text data,
    particularly from PDF documents and filenames. This class also includes methods for file operations.

    Attributes:
        None (This class contains only static methods and does not maintain any state)

    Methods:
        read_from_file(file_path: str) -> str:
            Reads the content of a file and returns it as a string.

        extract_id_from_filename(filename: str) -> Optional[int]:
            Extracts an ID from a given filename of the form 'Id_{I}.pdf'.

        extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
            Extracts a score and reasoning from a given text using regular expressions.

        find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.DataFrame, str]:
            Searches a DataFrame for the rows matching the ID extracted from a PDF filename.
    """

    @staticmethod
    def read_from_file(file_path: str) -> str:
        """
        Reads the content of a file and returns it as a string.

        Args:
            file_path (str): The path to the file to be read.

        Returns:
            str: The content of the file.
        """
        # Explicit encoding so the result does not depend on the host locale.
        with open(file_path, 'r', encoding='utf-8') as prompt_file:
            return prompt_file.read()

    @staticmethod
    def extract_id_from_filename(filename: str) -> Optional[int]:
        """
        Extracts an ID from a filename, assuming a specific format ('Id_{I}.pdf', where {I} is the ID).

        Args:
            filename (str): The filename from which to extract the ID.

        Returns:
            int: The extracted ID as an integer, or None if the pattern is not found.
        """
        # The dot is escaped so that only a literal '.pdf' suffix matches
        # (an unescaped '.' would also accept e.g. 'Id_12xpdf').
        match = re.search(r'Id_(\d+)\.pdf', filename)
        if match:
            return int(match.group(1))
        return None

    @staticmethod
    def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
        """
        Extracts score and reasoning from a given text using regular expressions.

        Args:
            text (str): The text from which to extract the score and reasoning.

        Returns:
            dict: A dictionary with keys 'score' (the digits following "Score: ", as a
            string) and 'reasoning' (everything following "Reasoning: ", stripped).
            A value is None when its marker is not present in the text.
        """
        score_pattern = r"Score: (\d+)"
        reasoning_pattern = r"Reasoning: (.+)"

        score_match = re.search(score_pattern, text)
        # re.DOTALL lets '.' match newlines, so multi-line reasoning is captured to the end of the text.
        reasoning_match = re.search(reasoning_pattern, text, re.DOTALL)

        return {
            "score": score_match.group(1) if score_match else None,
            "reasoning": reasoning_match.group(1).strip() if reasoning_match else None,
        }

    @staticmethod
    def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.DataFrame, str]:
        """
        Finds the rows in a dataframe corresponding to the ID extracted from a given PDF filename.

        Args:
            pdf_filename (str): The filename of the PDF.
            dataframe (pandas.DataFrame): The dataframe to search; its first column is
                compared against the extracted ID.

        Returns:
            pandas.DataFrame or str: The matching row(s) as a DataFrame, or the message
            "No matching row found." / "Invalid file name." when nothing matches or the
            filename does not contain an ID.
        """
        # The helper lives on this class (the previous reference to `Utility` was an undefined name).
        pdf_id = base_utils.extract_id_from_filename(pdf_filename)
        if pdf_id is None:
            return "Invalid file name."

        # Assuming the first column contains the ID
        matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
        if matched_row.empty:
            return "No matching row found."
        return matched_row
216
+
217
+
218
class PDFProcessor_Unstructured:
    """
    A class to process PDF files, providing functionalities for extracting, categorizing,
    and merging elements from a PDF file.

    Attributes:
        file_path (str): The full path to the PDF file currently being processed.
        folder_path (str): The directory path where the PDF file is located.
        file_name (str): The name of the PDF file.
        texts (List[str]): Text chunks extracted from the current PDF.
        tables (List[str]): Tables extracted from the current PDF.
        config (Dict[str, Any]): Extraction options; see default_config() for the keys.

    Methods:
        extract_pdf_elements() -> List:
            Extracts elements from the PDF file with unstructured's partition_pdf.

        categorize_elements(raw_pdf_elements: List) -> None:
            Categorizes extracted elements from a PDF into tables and texts.

        merge_chunks() -> List[str]:
            Merges text chunks based on punctuation and character case criteria.

        should_skip_chunk(chunk: str) -> bool:
            Determines if a chunk should be skipped based on its content.

        should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
            Determines if the current chunk should be merged with the next one.

        process_pdf() -> Tuple[List[str], List[str]]:
            Processes the PDF by extracting, categorizing, and merging elements.

        process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
            Processes an uploaded PDF file to extract and categorize text and tables.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.file_path = None
        self.folder_path = None
        self.file_name = None
        self.texts = []
        self.tables = []
        self.config = config if config is not None else self.default_config()
        # No file is known yet at construction time, so none is logged here.
        logger.info("Initialized PDFProcessor_Unstructured.")

    @staticmethod
    def default_config() -> Dict[str, Any]:
        """
        Returns the default configuration for PDF processing.

        Returns:
            Dict[str, Any]: Default configuration options.
        """
        return {
            "extract_images": False,
            "infer_table_structure": True,
            "chunking_strategy": "by_title",
            "max_characters": 10000,
            "combine_text_under_n_chars": 100,
            "strategy": "fast",
            "model_name": "yolox"
        }

    def extract_pdf_elements(self) -> List:
        """
        Extracts elements from the PDF at self.file_path using partition_pdf.

        Options are taken from self.config; any key missing there falls back to
        the value in default_config(), which matches the previously hard-coded
        arguments.

        Returns:
            List: A list of extracted elements from the PDF.

        Raises:
            Exception: Any error raised by partition_pdf is logged and re-raised.
        """
        logger.info("Starting extraction of PDF elements.")
        options = {**self.default_config(), **(self.config or {})}
        try:
            extracted_elements = partition_pdf(
                filename=self.file_path,
                extract_images_in_pdf=options["extract_images"],
                infer_table_structure=options["infer_table_structure"],
                chunking_strategy=options["chunking_strategy"],
                strategy=options["strategy"],
                max_characters=options["max_characters"],
                combine_text_under_n_chars=options["combine_text_under_n_chars"],
                image_output_dir_path=self.folder_path,
            )
            logger.info("Extraction of PDF elements completed successfully.")
            return extracted_elements
        except Exception as e:
            logger.error(f"Error extracting PDF elements: {e}", exc_info=True)
            raise

    def categorize_elements(self, raw_pdf_elements: List) -> None:
        """
        Categorizes extracted elements from a PDF into tables and texts.

        Elements whose type is neither Table nor CompositeElement are ignored.

        Args:
            raw_pdf_elements (List): A list of elements extracted from the PDF.
        """
        logger.debug("Starting categorization of PDF elements.")
        for element in raw_pdf_elements:
            # Matched on the type's string form, as in the original implementation.
            element_type = str(type(element))
            if "unstructured.documents.elements.Table" in element_type:
                self.tables.append(str(element))
            elif "unstructured.documents.elements.CompositeElement" in element_type:
                self.texts.append(str(element))

        logger.debug("Categorization of PDF elements completed.")

    def merge_chunks(self) -> List[str]:
        """
        Merges text chunks based on punctuation and character case criteria.

        Returns:
            List[str]: A list of merged text chunks (empty when there are no texts).
        """
        logger.debug("Starting merging of text chunks.")
        merged_chunks = self._merge_text_chunks(self.texts)
        logger.debug("Merging of text chunks completed.")
        return merged_chunks

    @staticmethod
    def _merge_text_chunks(texts: List[str]) -> List[str]:
        """
        Filters and pairwise-merges a list of text chunks.

        Each chunk is dropped if should_skip_chunk says so. Otherwise it is joined
        with the following chunk (separated by one space) when
        should_merge_with_next says so, and the following chunk is then consumed
        so it is not emitted a second time.

        Args:
            texts (List[str]): Chunks in document order.

        Returns:
            List[str]: The kept and merged chunks, in order.
        """
        merged = []
        index = 0
        total = len(texts)
        while index < total:
            current = texts[index]
            if PDFProcessor_Unstructured.should_skip_chunk(current):
                index += 1
                continue

            has_next = index + 1 < total
            if has_next and PDFProcessor_Unstructured.should_merge_with_next(current, texts[index + 1]):
                merged.append(current + " " + texts[index + 1])
                index += 2  # the next chunk is now part of the merged entry
            else:
                merged.append(current)
                index += 1
        return merged

    @staticmethod
    def should_skip_chunk(chunk: str) -> bool:
        """
        Determines if a chunk should be skipped based on its content.

        A chunk is skipped when it is empty, starts with "figure", "fig" or
        "table" (case-insensitive), starts with a non-alphanumeric character, or
        starts with digits followed by a dot.

        Args:
            chunk (str): The text chunk to be evaluated.

        Returns:
            bool: True if the chunk should be skipped, False otherwise.
        """
        if not chunk:
            return True
        return bool(
            chunk.lower().startswith(("figure", "fig", "table"))
            or not chunk[0].isalnum()
            or re.match(r'^\d+\.', chunk)
        )

    @staticmethod
    def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
        """
        Determines if the current chunk should be merged with the next one.

        Chunks are merged when the current one ends with a comma, or when it ends
        with a lowercase letter and the next one starts with a lowercase letter.

        Args:
            current_chunk (str): The current text chunk.
            next_chunk (str): The next text chunk.

        Returns:
            bool: True if the chunks should be merged, False otherwise
            (always False when either chunk is empty).
        """
        if not current_chunk or not next_chunk:
            return False
        return (current_chunk.endswith(",") or
                (current_chunk[-1].islower() and next_chunk[0].islower()))

    def process_pdf(self) -> Tuple[List[str], List[str]]:
        """
        Processes the PDF by extracting, categorizing, and merging elements.

        Returns:
            Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.

        Raises:
            Exception: Any error from the extraction steps is logged and re-raised.
        """
        logger.info("Starting processing of the PDF.")
        # Start from empty lists so results of a previously processed file do not
        # leak into this one when the processor instance is reused.
        self.texts = []
        self.tables = []
        try:
            raw_pdf_elements = self.extract_pdf_elements()
            self.categorize_elements(raw_pdf_elements)
            merged_chunks = self.merge_chunks()
            return merged_chunks, self.tables
        except Exception as e:
            logger.error(f"Error processing PDF: {e}", exc_info=True)
            raise

    def process_pdf_file(self, uploaded_file) -> Tuple[List[str], List[str]]:
        """
        Process an uploaded PDF file.

        If a previously processed file still exists on disk it is deleted first
        (a failure to delete is logged and otherwise ignored). The method then
        updates the file path, processes the PDF, and returns the results.

        Parameters:
            uploaded_file: Path (or path-like object) of the new PDF file; it is
                converted with str().

        Returns:
            Tuple[List[str], List[str]]: Merged text chunks and tables of the new file.

        Raises:
            Exception: Any error raised while processing is logged and re-raised.
        """
        # Delete the previous file if it exists
        if self.file_path and os.path.exists(self.file_path):
            try:
                os.remove(self.file_path)
                logger.debug(f"Previous file {self.file_path} deleted.")
            except Exception as e:
                logger.warning(f"Error deleting previous file: {e}", exc_info=True)

        # Process the new file
        self.file_path = str(uploaded_file)
        self.folder_path = os.path.dirname(self.file_path)
        self.file_name = os.path.basename(self.file_path)
        logger.info(f"Starting to process the PDF file: {self.file_path}")

        try:
            logger.debug(f"Processing PDF at {self.file_path}")
            results = self.process_pdf()
            logger.info("PDF processing completed successfully.")
            return results
        except Exception as e:
            logger.error(f"Error processing PDF file: {e}", exc_info=True)
            raise
439
+
440
+
441
class HybridRetriever(BaseRetriever):
    """
    Retriever that queries a BM25 retriever and a vector retriever and returns
    the union of their results without duplicates.

    Attributes:
        vector_retriever: The vector-based retriever.
        bm25_retriever: The BM25 retriever.

    Methods:
        _retrieve(query, **kwargs): Runs both retrievers and merges their results.
        _combine_results(bm25_nodes, vector_nodes): Merges two result lists, keeping
            the first occurrence of every node id.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        super().__init__()
        self.bm25_retriever = bm25_retriever
        self.vector_retriever = vector_retriever
        logger.info("HybridRetriever initialized with vector and BM25 retrievers.")

    def _retrieve(self, query: str, **kwargs) -> List:
        """
        Run the query through both retrievers and merge the results.

        Args:
            query: The query passed on to both retrievers.
            **kwargs: Extra keyword arguments forwarded to both `retrieve` calls.

        Returns:
            List: BM25 results followed by vector results, without duplicate node ids.

        Raises:
            Exception: Any error from the underlying retrievers is logged and re-raised.
        """
        logger.info(f"Retrieving documents for query: {query}")
        try:
            # BM25 is queried first; its hits take precedence when ids collide.
            keyword_hits = self.bm25_retriever.retrieve(query, **kwargs)
            semantic_hits = self.vector_retriever.retrieve(query, **kwargs)
            combined_nodes = self._combine_results(keyword_hits, semantic_hits)
        except Exception as e:
            logger.error(f"Error in retrieval: {e}")
            raise
        logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
        return combined_nodes

    @staticmethod
    def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
        """
        Merge two result lists, dropping nodes whose id was already seen.

        Args:
            bm25_nodes: Nodes retrieved from the BM25 retriever.
            vector_nodes: Nodes retrieved from the vector retriever.

        Returns:
            List: Unique nodes in first-seen order (BM25 results first).
        """
        # A dict keeps insertion order, so setdefault retains the first node per id.
        first_seen = {}
        for batch in (bm25_nodes, vector_nodes):
            for node in batch:
                first_seen.setdefault(node.node_id, node)
        return list(first_seen.values())
510
+
511
+
512
+
513
class PDFQueryEngine:
    """
    A class to handle the process of setting up a query engine and performing queries on PDF documents.

    Attributes:
        documents (List): A list of documents to be indexed.
        llm (Language Model): The language model used to answer queries.
        embed_model: The embedding model used to build the vector index.
        qa_prompt_tmpl (str): Template text for the question-answering prompt.
        base_utils (base_utils): Helper used to parse scores and reasoning from answers.
        config_manager (ConfigManager): Loader for JSON configuration files.

    Methods:
        format_example(example): Formats a few-shot example into a string.
        setup_query_engine(): Sets up the query engine with all necessary components.
        evaluate_with_llm_async(...): Runs all queries and aggregates their scores.
    """

    def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
        self.documents = documents
        self.llm = llm
        self.embed_model = embed_model
        self.qa_prompt_tmpl = qa_prompt_tmpl
        self.base_utils = base_utils()
        self.config_manager = ConfigManager()

        logger.info("PDFQueryEngine initialized.")

    def format_example(self, example):
        """
        Formats a few-shot example into a string.

        Args:
            example (dict): A dictionary containing 'query', 'score', and 'reasoning' for the few-shot example.

        Returns:
            str: Formatted few-shot example text.
        """
        return "Example:\nQuery: {}\nScore: {}\nReasoning: {}\n".format(
            example['query'], example['score'], example['reasoning']
        )

    def setup_query_engine(self):
        """
        Builds a query engine over self.documents.

        The engine combines an in-memory Qdrant vector index and a BM25 retriever
        (through HybridRetriever), reranks the retrieved nodes with a
        SentenceTransformerRerank post-processor, and synthesizes answers with
        self.llm using self.qa_prompt_tmpl as the QA prompt.

        Returns:
            Any: The configured query engine.

        Raises:
            Exception: Any error during setup is logged and re-raised.
        """
        client = qdrant_client.QdrantClient(
            # you can use :memory: mode for fast and light-weight experiments,
            # it does not require to have Qdrant deployed anywhere
            # but requires qdrant-client >= 1.1.1
            location=":memory:"
            # otherwise point the client at a deployed Qdrant instance and,
            # for Qdrant Cloud, pass its API key
        )
        try:
            logger.info("Initializing the service context for query engine setup.")
            service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
            vector_store = QdrantVectorStore(client=client, collection_name="med_library")
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            logger.info("Creating an index from documents.")
            index = VectorStoreIndex.from_documents(documents=self.documents, storage_context=storage_context, service_context=service_context)
            # BM25 works on parsed nodes rather than on the vector index.
            nodes = service_context.node_parser.get_nodes_from_documents(self.documents)

            logger.info("Setting up vector and BM25 retrievers.")
            vector_retriever = index.as_retriever(similarity_top_k=3)
            bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)
            hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

            logger.info("Configuring the response synthesizer with the prompt template.")
            qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
            response_synthesizer = get_response_synthesizer(
                service_context=service_context,
                text_qa_template=qa_prompt,
                response_mode="compact",
            )

            logger.info("Assembling the query engine with reranker and synthesizer.")
            reranker = SentenceTransformerRerank(top_n=3, model="BAAI/bge-reranker-base")
            query_engine = RetrieverQueryEngine.from_args(
                retriever=hybrid_retriever,
                node_postprocessors=[reranker],
                response_synthesizer=response_synthesizer,
            )

            logger.info("Query engine setup complete.")
            return query_engine
        except Exception as e:
            logger.error(f"Error during query engine setup: {e}")
            raise

    async def evaluate_with_llm_async(self, reg_result: Any, peer_result: Any, guidelines_result: Any, queries: List[str]) -> Tuple[int, int, int, float, List[Optional[str]]]:
        """
        Evaluates the indexed documents against a list of queries and aggregates the scores.

        Three queries are answered without calling the model when supporting
        evidence was supplied by the caller (indexes are zero-based positions in
        `queries`):
            - index 1 when `reg_result` is truthy (its first item is the reasoning),
            - index 2 when `guidelines_result` is truthy,
            - index 8 when `guidelines_result` or `peer_result` is truthy.
        Every other query is sent to the query engine and the "Score:" /
        "Reasoning:" fields are parsed from its answer.

        Args:
            reg_result (Any): Result related to registration.
            peer_result (Any): Result related to peer review.
            guidelines_result (Any): Result related to following guidelines.
            queries (List[str]): The queries to evaluate.

        Returns:
            Tuple[int, int, int, float, List[Optional[str]]]: total score, number of
            queries with a score above zero, number of queries, score as a
            percentage of the number of queries, and the reasoning per query.

        Raises:
            FileNotFoundError / json.JSONDecodeError: If "few_shot.json" cannot be loaded.
            Exception: Errors from query engine setup or querying propagate.
        """
        logger.info("Starting evaluation with LLM.")
        self.config_manager.load_config("few_shot", "few_shot.json")

        if not queries:
            # Nothing to evaluate; avoid dividing by zero below.
            logger.info("Evaluation completed.")
            return 0, 0, 0, 0.0, []

        # Build the engine once and share it between all queries.
        query_engine = self.setup_query_engine()

        async def handle_query(j, query):
            if j == 1 and reg_result:
                return {"score": 1, "reasoning": reg_result[0]}
            if j == 2 and guidelines_result:
                return {"score": 1, "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
            if j == 8 and (guidelines_result or peer_result):
                return {"score": 1, "reasoning": "The article is published in a peer-reviewed journal."}
            # Ask the query engine; the parser needs the answer text, not the response object.
            response = await query_engine.aquery(query)
            return self.base_utils.extract_score_reasoning(str(response.response or ""))

        # gather() returns results in the order of `queries`.
        results = await asyncio.gather(
            *(handle_query(j, query) for j, query in enumerate(queries))
        )

        total_score = 0
        criteria_met = 0
        reasoning = []
        for extracted_data in results:
            raw_score = extracted_data.get("score")
            extracted_data_score = 0 if raw_score is None else int(raw_score)
            if extracted_data_score > 0:
                criteria_met += 1
            # NOTE(review): reasoning is collected for every query so it lines up
            # with `queries` -- confirm this matches what callers expect.
            reasoning.append(extracted_data["reasoning"])
            total_score += extracted_data_score

        score_percentage = (float(total_score) / len(queries)) * 100
        logger.info("Evaluation completed.")
        return total_score, criteria_met, len(queries), score_percentage, reasoning
704
+
705
+
706
+
707
+ class MixtralLLM(CustomLLM):
708
+ """
709
+ A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.
710
+
711
+ Attributes:
712
+ context_window (int): Number of tokens used for context during inference.
713
+ num_output (int): Number of tokens to generate as output.
714
+ temperature (float): Sampling temperature for token generation.
715
+ model_name (str): Name of the model on Hugging Face's model hub.
716
+ api_key (str): API key for authenticating with the Hugging Face API.
717
+
718
+ Methods:
719
+ metadata: Retrieves metadata about the model.
720
+ do_hf_call: Makes an API call to the Hugging Face model.
721
+ complete: Generates a complete response for a given prompt.
722
+ stream_complete: Streams a series of token completions for a given prompt.
723
+ """
724
+ context_window: int = Field(..., description="Number of tokens used for context during inference.")
725
+ num_output: int = Field(..., description="Number of tokens to generate as output.")
726
+ temperature: float = Field(..., description="Sampling temperature for token generation.")
727
+ model_name: str = Field(..., description="Name of the model on Hugging Face's model hub.")
728
+ api_key: str = Field(..., description="API key for authenticating with the Hugging Face API.")
729
+
730
+
731
@property
def metadata(self) -> LLMMetadata:
    """
    Describe this model for the framework.

    Returns:
        LLMMetadata: Built from this instance's context_window, num_output and model_name.
    """
    details = {
        "context_window": self.context_window,
        "num_output": self.num_output,
        "model_name": self.model_name,
    }
    return LLMMetadata(**details)
744
+
745
+ def do_hf_call(self, prompt: str) -> str:
746
+ """
747
+ Makes an API call to the Hugging Face model and retrieves the generated response.
748
+
749
+ Args:
750
+ prompt (str): The input prompt for the model.
751
+
752
+ Returns:
753
+ str: The text generated by the model in response to the prompt.
754
+
755
+ Raises:
756
+ Exception: If the API call fails or returns an error.
757
+ """
758
+ data = {
759
+ "inputs": prompt,
760
+ "parameters": {"Temperature": self.temperature}
761
+ }
762
+
763
+ # Makes a POST request to the Hugging Face API to get the model's response
764
+ response = requests.post(
765
+ f'https://api-inference.huggingface.co/models/{self.model_name}',
766
+ headers={
767
+ 'authorization': f'Bearer {self.api_key}',
768
+ 'content-type': 'application/json',
769
+ },
770
+ json=data,
771
+ stream=True
772
+ )
773
+
774
+ # Checks for a successful response and parses the generated text
775
+ if response.status_code != 200 or not response.json() or 'error' in response.json():
776
+ print(f"Error: {response}")
777
+ return "Unable to answer for technical reasons."
778
+ full_txt = response.json()[0]['generated_text']
779
+ # Finds the section of the text following the context separator
780
+ offset = full_txt.find("---------------------")
781
+ ss = full_txt[offset:]
782
+ # Extracts the actual answer from the response
783
+ offset = ss.find("Answer:")
784
+ return ss[offset+7:].strip()
785
+
786
+
787
+ @llm_completion_callback()
788
+ def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
789
+ """
790
+ Generates a complete response for a given prompt using the Hugging Face API.
791
+
792
+ Args:
793
+ prompt (str): The input prompt for the model.
794
+ **kwargs: Additional keyword arguments for the completion.
795
+
796
+ Returns:
797
+ CompletionResponse: The complete response from the model.
798
+ """
799
+ response = self.do_hf_call(prompt)
800
+ return CompletionResponse(text=response)
801
+
802
+
803
+ @llm_completion_callback()
804
+ def stream_complete(
805
+ self, prompt: str, **kwargs: Any
806
+ ) -> CompletionResponseGen:
807
+ """
808
+ Streams a series of token completions as a response for the given prompt.
809
+
810
+ This method is useful for streaming responses where each token is generated sequentially.
811
+
812
+ Args:
813
+ prompt (str): The input prompt for the model.
814
+ **kwargs: Additional keyword arguments for the streaming completion.
815
+
816
+ Yields:
817
+ CompletionResponseGen: A generator yielding each token in the completion response.
818
+ """
819
+ # Yields a stream of tokens as the completion response for the given prompt
820
+ response = ""
821
+ for token in self.do_hf_call(prompt):
822
+ response += token
823
+ yield CompletionResponse(text=response, delta=token)
824
+
825
+
826
+
827
class KeywordSearch():
    """
    Keyword- and regex-based checks over the text chunks of a paper.

    Attributes:
        chunks (list of str): Text chunks searched by ``check_registration``.
    """

    # Patterns for the supported trial-registration identifiers, compiled once.
    _REGISTRATION_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",   # NCT
        r"(ISRCTN\d{8})",                      # ISRCTN
        r"(\d{4}-\d{6}-\d{2})",                # EudraCT
        r"(UMIN\d{9})",                        # UMIN-CTR
        r"(CTRI/\d{4}/\d{2}/\d{6})",           # CTRI
    ))

    # Registry URLs, stored lower-case because matching is case-insensitive.
    _REGISTRY_URLS = (
        "www.anzctr.org.au",
        "anzctr.org.au",
        "www.clinicaltrials.gov",
        "clinicaltrials.gov",
        "www.isrctn.org",
        "isrctn.org",
        "www.umin.ac.jp/ctr/index/htm",
        "umin.ac.jp/ctr/index/htm",
        # NOTE(review): the two entries above look like a typo for "index.htm";
        # the corrected form is matched as well - confirm and drop the old one.
        "umin.ac.jp/ctr/index.htm",
        "www.onderzoekmetmensen.nl/en",
        "onderzoekmetmensen.nl/en",
        "eudract.ema.europa.eu",
        "www.eudract.ema.europa.eu",
    )

    def __init__(self, chunks):
        self.chunks = chunks

    def find_journal_name(self, response: str, journal_list: list) -> bool:
        """
        Reports whether any known journal name occurs in a response string.

        The comparison is case-insensitive. Blank entries in ``journal_list``
        (for example the empty string left by a trailing newline when a file is
        split on "\\n") are skipped, because an empty string is a substring of
        every response and would make the check always succeed.

        Args:
            response (str): The response string to search for a journal name.
            journal_list (list): Journal names to look for in the response.

        Returns:
            bool: True if at least one non-blank journal name is found, False otherwise.
        """
        response_lower = response.lower()
        for journal in journal_list:
            journal_lower = journal.strip().lower()
            if journal_lower and journal_lower in response_lower:
                return True
        return False

    def check_registration(self):
        """
        Check ``self.chunks`` for trial registration numbers or registry URLs.

        Each chunk is split into sentences; the first sentence containing a
        registration number is returned immediately. If no number is found in
        any chunk, every chunk mentioning a registry URL (case-insensitively)
        is returned instead.

        Returns:
            list of str: A one-element list with the matching sentence, or the
            chunks containing registry URLs, or an empty list if nothing matches.
        """
        for chunk in self.chunks:
            # Split chunk into sentences at ., ! or ? followed by spaces
            for sentence in re.split(r'(?<=[.!?]) +', chunk):
                if any(pattern.search(sentence) for pattern in self._REGISTRATION_PATTERNS):
                    return [sentence]  # Return immediately if a registration number is found

        # No registration number found: fall back to chunks mentioning a registry URL
        matching_chunks = []
        for chunk in self.chunks:
            chunk_lower = chunk.lower()
            if any(url in chunk_lower for url in self._REGISTRY_URLS):
                matching_chunks.append(chunk)

        return matching_chunks
914
+
915
+
916
+
917
class StringExtraction():
    """
    Extracts comparable strings from LLM responses and from the labelled ground-truth data.

    LLMs may format their answers differently depending on the model or the
    prompting technique. In such cases extract_original_prompt may not give
    satisfactory results and a dedicated extraction method should be written.

    Methods:
        extract_original_prompt(): Splits a response into its Yes/No line and its reasoning text.
        extraction_ground_truth(): Builds Yes/No labels and explanations from a labelled data row.
    """

    def extract_original_prompt(self,result):
        """
        Split an LLM response into its binary answer line and its reasoning.

        Args:
            result: Object whose ``response`` attribute holds the response text.

        Returns:
            tuple: ``(binary_response, explanation_response)``. The first is the
            first line containing "Yes" or "No" as a whole word ("" if none);
            the second is the text following "Reasoning:" on the remaining
            lines, joined with single spaces.
        """
        binary_response = ""
        explanations = []
        marker = "Reasoning:"
        for line in result.response.strip().split("\n"):
            # Whole-word match so that "Not", "Note", "None", ... are not read as "No".
            if binary_response == "" and re.search(r"\b(?:Yes|No)\b", line):
                binary_response = line
            elif marker in line:
                # Cut after the marker itself rather than after the first colon on the line.
                explanations.append(line[line.find(marker) + len(marker):].strip())

        return binary_response, " ".join(explanations)

    def extraction_ground_truth(self,paper_name,labelled_data):
        """
        Build the ground-truth labels for one paper from the labelled data.

        Args:
            paper_name (str): File name containing the paper id between the
                first "_" and ".pdf" (for example "paper_12.pdf").
            labelled_data: pandas DataFrame with an "id" column; the ground
                truth is read from column positions 2 through 10.

        Returns:
            tuple: ``(binary_ground_truth, explanation_ground_truth)`` - "Yes"
            plus the cell text for every non-empty cell, otherwise "No" plus a
            default explanation.

        Raises:
            ValueError: If no integer id can be parsed from ``paper_name``.
            IndexError: If no row of ``labelled_data`` has the parsed id.
        """
        paper_id = int(paper_name[paper_name.find("_")+1:paper_name.find(".pdf")])
        id_row = labelled_data[labelled_data["id"] == paper_id]
        if len(id_row) == 0:
            raise IndexError(f"No labelled row found for paper id {paper_id}")
        ground_truth = id_row.iloc[:,2:11].values.tolist()[0]

        binary_ground_truth = []
        explanation_ground_truth = []
        for cell in ground_truth:
            # Empty CSV cells are read as NaN (a float), which has no len();
            # NaN is the only value that differs from itself.
            is_missing = cell is None or cell != cell
            text = "" if is_missing else str(cell).strip()
            if text:
                binary_ground_truth.append("Yes")
                explanation_ground_truth.append(cell)
            else:
                binary_ground_truth.append("No")
                explanation_ground_truth.append("The article does not provide any relevant information.")
        return binary_ground_truth,explanation_ground_truth
959
+
960
+
961
+
962
class EvaluationMetrics():
    """
    This class encapsulates the evaluation methods that have been used in the project.

    Attributes:
        explanation_response: list of detailed responses from the LLM, one per query
        explanation_ground_truth: list of ground-truth explanations, one per query
        embedding_model: object exposing ``encode()``, used to embed both lists

    Methods:
        metric_cosine_similarity(): Cosine similarity between each response and its ground truth.
        metric_rouge(): ROUGE scores of the responses against the ground truth.
        binary_accuracy(): Fraction of positions where two label lists agree.
    """

    def __init__(self,explanation_response,explanation_ground_truth,embedding_model):
        self.explanation_response = explanation_response
        self.explanation_ground_truth = explanation_ground_truth
        self.embedding_model = embedding_model

    def metric_cosine_similarity(self):
        """
        Embed both lists and compare them pairwise.

        Returns:
            The diagonal of the similarity matrix between ground-truth and
            response embeddings, i.e. one value per (ground truth, response) pair.
        """
        ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
        explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
        return np.diag(cosine_similarity(ground_truth_embedding,explanation_response_embedding))

    def metric_rouge(self):
        """
        Compute ROUGE with the responses as predictions and the ground truth as references.

        Returns:
            The result of the "rouge" metric's ``compute`` call.
        """
        rouge = evaluate.load("rouge")
        results = rouge.compute(predictions = self.explanation_response,references = self.explanation_ground_truth)
        return results

    def binary_accuracy(self,binary_response,binary_ground_truth):
        """
        Compute the share of positions at which the two label sequences are equal.

        Args:
            binary_response: Predicted labels.
            binary_ground_truth: Reference labels.

        Returns:
            The accuracy rounded to two decimals; 0.0 when both sequences are
            empty; or, for sequences of different lengths, the message string
            "Arrays which are to be compared has different lengths." (kept for
            backward compatibility with existing callers).
        """
        if len(binary_response) != len(binary_ground_truth):
            return "Arrays which are to be compared has different lengths."
        if len(binary_response) == 0:
            # Nothing to compare; avoid dividing by zero.
            return 0.0
        count = sum(
            1
            for predicted, expected in zip(binary_response, binary_ground_truth)
            if predicted == expected
        )
        return np.round(count/len(binary_response),2)
README.md CHANGED
@@ -1,11 +1,12 @@
1
  ---
2
  title: Epitomea Demo
3
- emoji: 😻
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Epitomea Demo
3
+ emoji: 🐨
4
+ colorFrom: purple
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
9
+ startup_duration_timeout: 1h
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import getpass
5
+ import logging
6
+ import openai
7
+ import asyncio
8
+ from typing import Any, List, Tuple, Dict
9
+ import gradio as gr
10
+ import llama_index
11
+ from llama_index import Document
12
+ from llama_index.llms import OpenAI
13
+ from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
14
+ from llama_index.llms import HuggingFaceLLM
15
+ import requests
16
+
17
+ from RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, HybridRetriever, MixtralLLM, KeywordSearch, base_utils, ConfigManager
18
+
19
+
20
# Logging: INFO and above, timestamped, with a module-level logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Model settings come from model_config.json; credentials come from the environment
config_manager = ConfigManager()
config_manager.load_config("model", "model_config.json")


def _model_setting(key):
    """Return the value stored under *key* in the "model" configuration section."""
    return config_manager.get_config_value("model", key)


openai.api_key = os.environ['OPENAI_API_KEY']
hf_token = os.environ['HF_TOKEN']

# PDF rendering and chunking parameters
pdf_processing_config = _model_setting("pdf_processing")

# Upload filter and embedding backend
ALLOWED_EXTENSIONS = _model_setting("allowed_extensions")
embed = _model_setting("embeddings")
embed_model_name = _model_setting("embeddings_model")

# Generation parameters shared by the language models
model_temperature = _model_setting("model_temp")
output_token_size = _model_setting("max_tokens")
model_context_window = _model_setting("context_window")

# Prompt template locations
gpt_prompt_path = _model_setting("GPT_PROMPT_PATH")
mistral_prompt_path = _model_setting("MISTRAL_PROMPT_PATH")
info_prompt_path = _model_setting("INFO_PROMPT_PATH")

# Journal list locations
peer_review_journals_path = _model_setting("peer_review_journals_path")
eq_network_journals_path = _model_setting("eq_network_journals_path")

# Evaluation queries and the criteria labels shown next to the reasoning
queries = _model_setting("queries")
criteria = _model_setting("criteria")
num_criteria = len(queries)

# Queries used for information extraction
author_query = _model_setting("author_query")
journal_query = _model_setting("journal_query")
62
+
63
+
64
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS (case-insensitive)."""
    _, dot, extension = filename.rpartition('.')
    # No dot at all means there is no extension to check
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
67
+
68
def generate_score_bar(score, num_criteria):
    """
    Build the HTML for a coloured score bar with a verbal rating.

    The rating bands were originally defined on a 9-point scale (9 = "Very good",
    7-8 = "Good", 5-6 = "Ok", 3-4 = "Bad", below 3 = "Very bad"). They are now
    applied to the score in proportion to ``num_criteria``, so integer scores out
    of 9 are rated exactly as before, while other scales and non-integer scores
    no longer fall through to "Very bad".

    Args:
        score: Number of points achieved.
        num_criteria: Maximum achievable number of points.

    Returns:
        str: HTML snippet with the bar (width = score as a percentage, clamped
        to 0-100) and the rating text.
    """
    if num_criteria > 0:
        score_out_of_100 = round((score / num_criteria) * 100)
    else:
        # No criteria to score against: show an empty bar instead of dividing by zero
        score_out_of_100 = 0
    bar_width = max(0, min(100, score_out_of_100))

    # (minimum points on the 9-point scale, colour, text), best band first
    bands = (
        (9, "#4CAF50", "Very good"),  # green
        (7, "#FFEB3B", "Good"),       # yellow
        (5, "#FF9800", "Ok"),         # orange
        (3, "#F44336", "Bad"),        # red
    )
    color, text = "#800000", "Very bad"  # maroon
    if num_criteria > 0:
        for minimum, band_color, band_text in bands:
            # score / num_criteria >= minimum / 9, written without division
            if score * 9 >= minimum * num_criteria:
                color, text = band_color, band_text
                break

    # Create the HTML for the score bar
    score_bar_html = f"""
    <div style="background-color: #ddd; border-radius: 10px; position: relative; height: 20px; width: 100%;">
        <div style="background-color: {color}; height: 100%; border-radius: 10px; width: {bar_width}%;"></div>
    </div>
    <p style="color: {color};">{text}</p> <!-- Display the text -->
    """
    return score_bar_html
97
def format_example(example):
    """
    Formats a few-shot example into a string.

    Args:
        example (dict): A dictionary containing the keys 'query' and 'Answer'
            for the few-shot example.

    Returns:
        str: Formatted few-shot example text.

    Raises:
        KeyError: If 'query' or 'Answer' is missing from ``example``.
    """
    return f"Example:\nQuery: {example['query']}\n Direct Answer: {example['Answer']}\n"
107
+
108
def process_pdf(uploaded_file, llm_model, n_criteria = num_criteria):
    """
    Evaluate an uploaded PDF against the configured criteria and build the UI outputs.

    The PDF is chunked, the journal and author are extracted with an information
    prompt, registration identifiers and journal names are checked by keyword
    search, and the criteria queries are then scored asynchronously.

    Args:
        uploaded_file: The uploaded file, passed to ``PDFProcessor_Unstructured.process_pdf_file``.
        llm_model (str): "Model 1" (GPT prompt) or "Model 2" (Mistral prompt).
        n_criteria (int): Number of criteria used to scale the score to 100.

    Returns:
        tuple: ``(score, score_bar_html, reasoning_html, author_result)`` where
        ``score`` is a string such as "78/100".

    Raises:
        ValueError: For an unsupported model or embedding choice, or when a
            required Mixtral setting is missing.
    """
    import html  # local import: only needed here to escape model output placed into HTML

    # Process the PDF file
    pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
    merged_chunks, _tables = pdf_processor.process_pdf_file(uploaded_file)
    documents = [Document(text=t) for t in merged_chunks]

    # Prompts and Queries
    utils = base_utils()
    info_prompt = utils.read_from_file(info_prompt_path)

    # LLM Model choice
    # NOTE(review): `llm` is constructed (which validates the settings) but the
    # criteria engine below is still built with `info_llm`, so the dropdown only
    # selects the prompt. Confirm whether `llm` was meant to be used instead.
    try:
        if llm_model == "Model 1":
            llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=output_token_size)
            general_prompt = utils.read_from_file(gpt_prompt_path)

        elif llm_model == "Model 2":
            if any(param is None for param in [model_context_window, output_token_size, model_temperature, hf_token]):
                raise ValueError("All parameters are required for Mistral LLM.")

            llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
                             temperature=model_temperature, model_name="mistralai/Mixtral-8x7B-Instruct-v0.1", api_key=hf_token)
            general_prompt = utils.read_from_file(mistral_prompt_path)
        else:
            raise ValueError(f"Unsupported language model: {llm_model}")

    except Exception as e:
        logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
        raise

    # Embedding model choice for RAG
    try:
        if embed == "openai":
            embed_model = OpenAIEmbedding(model="text-embedding-3-large")

        elif embed == "huggingface":
            # Use the specified model name
            embed_model = HuggingFaceEmbedding(embed_model_name)

        else:
            # Report the configured value; `embed_model` is not bound on this path.
            raise ValueError(f"Unsupported embedding model: {embed}")

    except Exception as e:
        logger.error(f"Error initializing embedding model: {e}", exc_info=True)
        raise

    peer_review_journals_list = utils.read_from_file(peer_review_journals_path).split('\n')
    eq_network_journals_list = utils.read_from_file(eq_network_journals_path).split('\n')

    modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(peer_review_journals_list) + "?"

    # Information extraction (journal and author) always uses the OpenAI model
    info_llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=output_token_size)
    pdf_info_query = PDFQueryEngine(documents, info_llm, embed_model, (info_prompt))
    info_query_engine = pdf_info_query.setup_query_engine()
    journal_result = info_query_engine.query(modified_journal_query).response
    author_result = info_query_engine.query(author_query).response

    pdf_criteria_query = PDFQueryEngine(documents, info_llm, embed_model, (general_prompt))

    # Check for prior registration and known journals
    nlp_methods = KeywordSearch(merged_chunks)
    eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
    peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
    registration_result = nlp_methods.check_registration()

    # Run the asynchronous evaluation from this synchronous handler
    evaluation = asyncio.run(
        pdf_criteria_query.evaluate_with_llm_async(registration_result, peer_journal_result, eq_journal_result, queries)
    )
    # The result tuple starts with the total score and ends with the reasoning
    # list. Indexing both ends, rather than unpacking a fixed number of names,
    # avoids the "too many values to unpack" error raised by the previous
    # four-name unpacking when the evaluator returns five values.
    total_score, reasoning = evaluation[0], evaluation[-1]

    reasoning_items = []
    for query, reason in zip(criteria, reasoning):
        # The reasoning is model output derived from the uploaded PDF: escape it before embedding in HTML
        safe_reason = html.escape(str(reason))
        reasoning_items.append(
            f"<li style='font-size: 18px;'><strong style='color: forestgreen;'>{query}</strong> <br> Reasoning: {safe_reason}</li>"
        )
    reasoning_html = "<ul>" + "".join(reasoning_items) + "</ul>"

    # Generate the score bar HTML
    score_bar_html = generate_score_bar(total_score, n_criteria)

    # Return the score as a string and the reasoning as HTML
    return str(round((total_score / n_criteria) * 100)) + "/100", score_bar_html, reasoning_html, author_result
213
+
214
+
215
# Gradio UI: upload a paper, pick a model, and show the score, the score bar,
# the per-criterion reasoning and the extracted author information.
# NOTE(review): the nesting of the components inside the Rows is reconstructed
# from a rendering without indentation - confirm against the original layout.
with gr.Blocks(theme=gr.themes.Glass(
    text_size="sm",
    font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"],
    primary_hue="neutral",
    secondary_hue="gray")) as demo:

    gr.Markdown("## Med Library")

    with gr.Row():
        file_upload = gr.File(label="Choose a paper", file_types=['.pdf'])

    with gr.Row():
        # The labels must match the names checked in process_pdf
        models = ["Model 1", "Model 2"]
        model_choice = gr.Dropdown(models, label="Choose a model", value="Model 1")
        submit_button = gr.Button("Evaluate")

    score_output = gr.Textbox(label="Final Score:", interactive=False)
    score_bar_output = gr.HTML()
    reasoning_output = gr.HTML()

    # Heading for Author Information
    gr.Markdown("## Author Information")

    # Output for dynamically generated author information
    author_info_output = gr.Markdown()

    # Set the click event for the button: the four outputs receive, in order,
    # the four values returned by process_pdf
    submit_button.click(
        fn=process_pdf,
        inputs=[file_upload, model_choice],
        outputs=[score_output, score_bar_output, reasoning_output, author_info_output]
    )


# Launch the app on all interfaces, port 7860, with a public share link.
# This runs at import time; there is no __main__ guard.
demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
251
+
252
+ # Main route for file upload and display results
253
+ # @app.route('/', methods=['GET', 'POST'])
254
+ # def upload_and_display_results():
255
+ # total_score = 0
256
+ # score_percentage = 0
257
+ # reasoning = []
258
+ # criteria_met = 0
259
+
260
+ # if request.method == 'POST':
261
+ # # Check if the post request has the file part
262
+ # if 'file' not in request.files:
263
+ # flash('No file part')
264
+ # return redirect(request.url)
265
+ # file = request.files['file']
266
+ # # If user does not select file, browser also submits an empty part without filename
267
+ # if file.filename == '':
268
+ # flash('No selected file')
269
+ # return redirect(request.url)
270
+ # if file and allowed_file(file.filename):
271
+ # try:
272
+ # # Process the PDF file
273
+ # pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
274
+ # merged_chunks, tables = pdf_processor.process_pdf_file(file)
275
+ # documents = [Document(text=t) for t in merged_chunks]
276
+
277
+ # # LLM Model choice
278
+ # try:
279
+ # if llm_model == "gpt-4" or llm_model == "gpt-3.5-turbo":
280
+ # llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)
281
+
282
+ # elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
283
+ # if any(param is None for param in [model_context_window, output_token_size, model_temperature, hf_token]):
284
+ # raise ValueError("All parameters are required for Mistral LLM.")
285
+
286
+ # llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
287
+ # temperature=model_temperature, model_name=llm_model, api_key=hf_token)
288
+ # else:
289
+ # raise ValueError(f"Unsupported language model: {llm_model}")
290
+
291
+ # except Exception as e:
292
+ # logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
293
+ # raise # Or handle the exception as needed
294
+
295
+ # # Embedding model choice for RAG
296
+ # try:
297
+ # if embed == "openai":
298
+ # embed_model = OpenAIEmbedding()
299
+
300
+ # elif embed == "huggingface":
301
+ # if embed_model_name is None:
302
+ # # Set to default model if name not provided
303
+ # embed_model_name = "BAAI/bge-small-en-v1.5"
304
+ # embed_model = HuggingFaceEmbedding(embed_model_name)
305
+ # else:
306
+ # # Use the specified model name
307
+ # embed_model = HuggingFaceEmbedding(embed_model_name)
308
+ # else:
309
+ # raise ValueError(f"Unsupported embedding model: {embed_model}")
310
+
311
+
312
+ # except Exception as e:
313
+ # logger.error(f"Error initializing embedding model: {e}", exc_info=True)
314
+ # raise
315
+
316
+
317
+
318
+ # # Prompts and Queries
319
+ # utils = base_utils()
320
+ # general_prompt = utils.read_from_file(general_prompt_path)
321
+ # info_prompt = utils.read_from_file(info_prompt_path)
322
+
323
+ # peer_review_journals = utils.read_from_file(peer_review_journals_path)
324
+ # eq_network_journals = utils.read_from_file(eq_network_journals_path)
325
+
326
+ # peer_review_journals_list = peer_review_journals.split('\n')
327
+ # eq_network_journals_list = eq_network_journals.split('\n')
328
+
329
+
330
+ # modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(peer_review_journals_list) + "?"
331
+
332
+ # pdf_info_query = PDFQueryEngine(documents, llm, embed_model, (info_prompt))
333
+ # info_query_engine = pdf_info_query.setup_query_engine()
334
+ # journal_result = info_query_engine.query(modified_journal_query).response
335
+
336
+
337
+ # pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, (general_prompt))
338
+
339
+ # # Check for prior registration
340
+ # nlp_methods = KeywordSearch(merged_chunks)
341
+ # eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
342
+ # peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
343
+ # registration_result = nlp_methods.check_registration()
344
+
345
+
346
+ # # Evaluate with OpenAI model
347
+ # total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(registration_result, peer_journal_result, eq_journal_result, queries)
348
+
349
+
350
+ # except Exception as e:
351
+ # logging.exception("An error occurred while processing the file.")
352
+ # # Consider adding a user-friendly message or redirect
353
+ # flash('An error occurred while processing the file.')
354
+ # return redirect(request.url)
355
+
356
+ # return render_template('index.html',
357
+ # total_score = total_score,
358
+ # score_percentage = score_percentage,
359
+ # criteria_met = criteria_met,
360
+ # reasoning = reasoning)
361
+
362
+
363
+ # if __name__ == '__main__':
364
+ # app.run(debug=True)
data/data1/git.keep ADDED
@@ -0,0 +1 @@
 
 
1
+
data/data1/labelled_dataset_1.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/git.keep ADDED
@@ -0,0 +1 @@
 
 
1
+
data/prompts/eq_network_journals.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ The New England Journal of Medicine (NEJM)
2
+ The Lancet
3
+ Journal of the American Medical Association (JAMA)
4
+ British Medical Journal (BMJ)
5
+ Annals of Internal Medicine
6
+ Nature Medicine
7
+ Journal of Clinical Oncology
8
+ Journal of Clinical Investigation
9
+ Pediatrics
data/prompts/peer_review_journals.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The New England Journal of Medicine (NEJM)
2
+ The Lancet
3
+ Journal of the American Medical Association (JAMA)
4
+ British Medical Journal (BMJ)
5
+ Annals of Internal Medicine
6
+ Nature Medicine
7
+ Journal of Clinical Oncology
8
+ Journal of Clinical Investigation
9
+ Pediatrics
10
+ Cell
11
+ Journal of Experimental Medicine
12
+ Circulation
13
+ Science Translational Medicine
14
+ Archives of Internal Medicine
15
+ Journal of Immunology
16
+ Proceedings of the National Academy of Sciences (PNAS)
17
+ Anesthesia & Analgesia
data/prompts/prompt_gpt.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Provided below is context information essential for the evaluation of a medical research paper.
3
+
4
+ -------------------------------------
5
+ {context_str}
6
+ -------------------------------------
7
+
8
+ As an expert in medical research and an experienced peer reviewer, your task is to thoroughly evaluate the provided medical research paper. The evaluation should be specifically tailored to the context information given above.
9
+ Your assessment will be guided by the specific query: {query_str}. You are required to assign a binary score of either 0 or 1. This score should be a direct reflection of the quality, relevance, and accuracy of the research paper in relation to the query.
10
+ In addition to the score, it is crucial that you provide concise and clear reasoning for your evaluation. Your justification should directly reference specific sections or findings within the paper, demonstrating how they influenced your scoring decision.
11
+ Keep your reasoning concise without missing any critical information.
12
+
13
+ In cases where a query cannot be satisfactorily answered due to a lack of information or clarity in the research paper, please indicate this explicitly in your response. Such instances should be factored into your scoring, reflecting the incompleteness or inadequacy of the paper in addressing the query at hand.
14
+
15
+ Please adhere to the following format when presenting your answers:
16
+ Score: [Insert score here, either 0 or 1]
17
+ Reasoning: [Provide a brief and clear justification for your score, citing specific parts of the paper]
18
+ This structured approach ensures a comprehensive and fair assessment of the research paper, based on the specific aspects highlighted in the queries.
19
+ """
data/prompts/prompt_info.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Context information is provided below.
3
+ ---------------------
4
+ {context_str}
5
+ ---------------------
6
+ As an information extraction specialist and based on the context information, answer the query: {query_str}. Focus on brevity and clarity. Your aim is to extract key information and provide direct, to-the-point answers to specific questions about a research article. Answers should be concise, avoiding unnecessary details.
7
+ Format the answer in a readable way, omitting any irrelevant special characters.
8
+ In cases where a query cannot be adequately answered due to insufficient or unclear information in the article, state this explicitly.
9
+ """
data/prompts/prompt_mistral.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "Context information is below.\n"
2
+ "---------------------\n"
3
+ "{context_str}\n"
4
+ "---------------------\n"
5
+ "As a medical research expert and peer reviewer, evaluate the medical research paper based on the relevant sections provided as context information.\n"
6
+ "Your evaluation should be based on the specific query: {query_str} \n"
7
+ "For each query, provide a binary score of either 0 or 1. \n"
8
+ "Justify your score with concise reasoning, citing specific parts of the paper.\n"
9
+ "If a query cannot be answered due to insufficient information, state this clearly and score accordingly.\n"
10
+ "Answer format is given below. \n"
11
+ "Score: \n"
12
+ "Reasoning: "
few_shot.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1": {
3
+ "query": "Is anything in the article shared (data, code)? Look for the words like Supporting, Additional, supplementary information/code/material/datashar -ing/-ed/-e, available, reproducibility and similar + links, appendix",
4
+ "score": 1,
5
+ "reasoning": "Supplementary materials\n Supplementary material associated with this article can be found in the online version at doi:10.1016/j. ebiom.2022.103945.\n \n Data sharing\n All relevant data are within the paper and its supplementary files. The raw data used and/or analysed during the study are available in the Genome Sequence Archive for Human repository [HRA001933 in https://bigd.big.ac.cn/gsa-human/]."
6
+ },
7
+ "2": {
8
+ "query": "Has anything in the article been registered (in advance)?",
9
+ "score": 1,
10
+ "reasoning": "This study was registered and the inclusion criteria for patients were presented on ClinicalTrials.gov with the number NCT02533271, STELLAR. The primary endpoint was 3-year relapse-free survival, defined as the time from the date of randomization to the first occurrence of local-regional failure or distant metastasis. The secondary objectives were 3-year local relapse-free survival, distant metastasis-free survival, and overall survival."
11
+ },
12
+ "3": {
13
+ "query": "Does the article follow any reporting guidelines? To answer this question, follow the 3 steps sequentially. If any of the steps is true, assign a score of 1 and if all the steps are false, give a score of 0. STEP 1. Look for ISRCTN registry. STEP 2. Look if it is published in either The Lancet, The New England Journal of Medicine (NEJM), Journal of the American Medical Association (JAMA), British Medical Journal (BMJ), Annals of Internal Medicine, Nature Medicine, or Journal of Clinical Oncology. STEP 3. Look for one of the following guidelines, CONSORT for randomized controlled trials, PRISMA for meta\u2010analyses or systematic reviews, MOOSE for Meta-analyses of observational studies, STARD for diagnostic/prognostic studies, ARRIVE for animal pre-clinical studies, STROBE observational studies,SPIRIT for study protocols, CARE for case reports, AGREE for clinical practice guidelines, SRQR for qualitative researches,SQUIRE for quality improvement studies, SPIRIT Statement: Standard Protocol Items: Recommendations for Interventional Trials, PRIMER Collaboration: PRESENTATION AND INTERPRETATION OF MEDICAL RESEARCH, MIBBI: Minimum Information for Biological and Biomedical Investigations, COREQ: Consolidated Criteria for Reporting Qualitative Research, MDAR (Materials Design Analysis Reporting) reproducibility checklist is not a traditional reporting guideline like CONSORT or PRISMA. Instead, it's a tool designed to enhance the reproducibility and transparency of scientific research, REMARK (Reporting Recommendations for Tumor Marker Prognostic Studies).",
14
+ "score": 1,
15
+ "reasoning": "The Lancet"
16
+ },
17
+ "4": {
18
+ "query": "Is the methodology described in detail (where, when, how, what, who)?",
19
+ "score": 1,
20
+ "reasoning": "Methods Sixty patients with LARC from a multicentre, phase II/III randomized trial were included, with tissue and blood samples collected. For each cfDNA sample, we profiled MRD using 3 approaches: personalized assay targeting tumour-informed mutations, universal panel of genes frequently mutated in colorectal cancer (CRC), and low depth sequencing for copy number alterations (CNAs).\n \n Patients enrolled were randomly assigned in a 1:1 ratio\n to short-course preoperative radiotherapy (SCPRT, 5 Gy\n x 5 alone) with neoadjuvant chemotherapy (NCT) (4\n cycles of capecitabine plus oxaliplatin regimen) and preoperative long-course chemoradiotherapy (2 Gy x 25\n with capecitabine). The treatment strategies in these\n two groups were described in detail in STELLAR registration file. \n \n For each patient, we selected up to 22 somatic mutations from the tumour tissue. We designed customized\n primers targeting the mutations and used the primers to profile the matched cfDNA with Mutation Capsule\n technology as previously described. Briefly, the cfDNA was ligated to a customized adaptor and amplified to\n produce a whole genome library that was subsequently used as a template and amplified with customized primers. Multiplex PCR primer pairs for the two rounds of nested amplification were designed using Oligo software (v7.53) and their uniqueness were verified in the human genome (http://genome.ucsc.edu/) to ensure amplification efficiency. In the first round of amplification, the whole genome library was amplified in 9 cycles of PCR using a target-specific primer and a primer matching the adapter sequence. A second round of 14 cycles of amplification was performed with one pair of nested primers matching the adapter and the target region to further enrich the target region and add the Illumina adapter sequences to the construct. 
The final libraries were sequenced using the Illumina NovaSeq 6000 platform at a median depth of 6835\u00d7 after removing duplicate molecules. The median on-target ratio of reads mapped to the target region was 80%. The clean reads were mapped to the human reference hg19 genome using 'BWA (v0.7.15) mem' with the default parameters. Samtools mpileup was used to identify somatic mutations, including SNVs and INDELs, across the targeted regions of interest. Each uniquely labelled template was amplified, resulting in a certain number of daughter molecules with the same sequence (defined as a UID family). If a mutation is pre-existing in the template molecule (original cfDNA) used for amplification, the mutation should be present in each daughter molecule containing the UID (barring any subsequent replication or sequencing errors). A UID family in which at least 80% of the family members have the same mutation is called the EUID family, indicating that it harbours a mutation that should be true instead of a false-positive mutation due to amplification or sequencing error. The mutant allelic fraction was calculated by dividing the number of alternative EUID families by the sum of alternative and reference families. Tissue-specific mutations with at least one distinct paired duplex EUID family or four distinct EUID families were subsequently manually checked in IGV and verified using a cross-validation method. The candidate mutations were annotated with Ensemble Variant Effect Predictor (VEP)."
21
+ },
22
+ "5": {
23
+ "query": "Is the data collection processes described in detail (where, when, how, what, who)?",
24
+ "score": 1,
25
+ "reasoning": "The tumour tissues were collected at the diagnostic stage by biopsy sampling, and peripheral blood was collected in EDTA Vacutainer tubes (BD Diagnostics; Franklin Lakes, NJ, USA) and centrifuged within 2 h of collection at 4000 \u00d7 g for 10 min to separate plasma and blood cells. Plasma was centrifuged a second time at 12,000 \u00d7 g for 10 min at 4\u00b0C to remove any remaining cellular debris and stored at -80\u00b0C.\n \n Clinical serum levels of the biomarkers carcinoembryonic antigen (CEA) and carbohydrate antigen 19-9 (CA 19-9) were monitored at baseline, before surgery and after surgery. CEA and CA19-9 levels were measured with immunoelectrochemiluminescence, with CEA concentrations of < 5.0 ng/mL and CA19-9 concentrations of < 27.0 U/mL considered within the reference range. Chest/abdominal/pelvic CT scans were performed every 3 months during the first two years and then every 6 months for a total of 5 years. Clinicians were blinded to the ctDNA results during the courses of neoadjuvant therapy.\n \n Genomic DNA (gDNA) was extracted from fresh frozen tumour biopsies and WBCs with the QIAamp DNA Mini Kit (Qiagen; Germantown, MD, USA), and cfDNA was extracted from 1.5-4.5 mL of plasma with the Apostle MiniMax cfDNA isolation kit (C40605, Apostle; San Jose, CA, USA). Targeted sequencing of a panel of 509 genes or exomes was performed using genomic DNA obtained from tumour tissue and WBCs as previously described.\n \n Briefly, the raw data (FASTQ file) were aligned to the UCSC human reference genome hg19 using Burrows-Wheeler aligner software (BWA, v0.7.15). Basic processing, marking duplicates, local realignments and score recalibration were performed using The Genome Analysis Toolkit (GATK, v3.6), Picard (v2.7.1) and Samtools (v1.3.1). Candidate somatic mutations were detected by comparing sequencing data from tumour tissue samples with MuTect1 and Strelka. 
All selected mutations were further validated by performing a manual inspection using Integrated Genome Viewer (IGV).\n \n The raw sequencing data were treated as described above, and the next segmentation analysis was performed using QDNASeq (v1.14.0). The resulting output files were summarized using R software (v4.0.3). Overlap analysis was performed using bedtools (v2.17.0) and plotted with UpSetR (v1.4.0) within the R package (v4.0.3). Chromosome arm-level alterations show cancer-specific patterns. For example, a hierarchical clustering analysis of mean arm-level calls performed across 3,000 TCGA samples revealed that gastrointestinal tumours clustered with gains of chromosomes 8q, 13q, and 20. Some of these CNAs, including gains of chromosomes 1q, 8q, 7,12q, 13q, and 20q and loss of chromosomes 1p, 20p, and 22q, were also recurrently identified in our cohort as hot CNAs (34 baseline plasma samples from patients with LARC compared with 70 plasma samples from healthy controls). Therefore, we defined the CNA number as the sum of hot chromosome arms altered (|Z| > 2) to represent the level of copy number variation."
26
+ },
27
+ "6": {
28
+ "query": "Is there any sample description? eg. size, demographics, recruitment, in-/exclusion criteria",
29
+ "score": 1,
30
+ "reasoning": "Patient characteristics and tissue mutation identification\n Patients with locally advanced rectal cancer (n = 82; cT3- 4N0 or cTanyN1-2) were enrolled in the trial from December 30, 2016, to October 8, 2018. Twenty-two patients were excluded due to the lack of plasma samples obtained after NAT (Figure 1a). Thirty-one patients were treated with long-course neoadjuvant chemoradiotherapy (LCRT), and 29 patients were treated with short-course neoadjuvant radiotherapy (SCPRT) with neoadjuvant chemotherapy (Table 1). The median follow-up period was 33.25 months (range, 9.6342.43 months). Seventeen (28.33%) patients were diagnosed with local relapse or metastasis during follow-up, including 5/17 (29.41%) with local relapse, 6/17 (35.29%) with liver metastasis and 6/17 (35.29%) with lung metastasis (Table S1).\n One hundred ninety-six blood samples were available during the treatment process, including baseline (collected before NAT, n = 42), in-process (collected during NAT, n = 35), post-NAT (collected 2 weeks after SCPRT or LCRT, n = 60) and pre-TME (collected before surgery, n = 59) samples (Figure 1a). We performed targeted sequencing with a panel of 509 genes or exome sequencing on the genomic DNA isolated from the tumour tissue and matched WBCs, and then identified a median of 51 (range, 3-177) somatic mutations in each tumour (Table S2). The mutational landscape of the top 15 most significantly mutated genes in the cohort was shown in Figure 1b. Customized primers were designed to profile up to 22 somatic mutations in the matched cfDNA with Mutation Capsule technology (Table S3) as previously described.\n \n Thirty-five patients with a positive ctDNA fraction at baseline were analysed (35/42 patients) to explore the performance of the ctDNA fraction in monitoring the NAT response. 
With ctDNA clearance defined as ratio of post-NAT ctDNA fraction to baseline ctDNA fraction below 2% (median value of the ratio), 19 (54.29%) patients showed no clearance at the post-NAT time point relative to baseline ctDNA fraction values (Figures 5, S3b). For patients with or without ctDNA clearance, there were 9/16 (56.25%) and 18/19 (94.74%) exhibited nonpCR/cCR (clinical complete response), respectively."
31
+ },
32
+ "7": {
33
+ "query": "Does the article describe the data analysis process?",
34
+ "score": 1,
35
+ "reasoning": "Statistics\n In this clinical cohort-based investigative study, the primary aim was to test the hypothesis that changes in the ctDNA fraction during treatment dynamically reflect minimal residual disease. Correlation analysis between input and estimated ctDNA in ctDNA fraction model and analysis of variance for the assessment of longitudinal plasma samples were the exploratory studies. Method for hypothesis testing and survival analysis was commonly used by previous researchers. Specifically, correlation analysis used Spearman\u2019s correlation analysis. For continuous variables, differences in ctDNA fractions between recurrence and non-recurrence groups were assessed with MannWhitney (rank sum) test, ctDNA fractions across treatment courses of NAT were assessed by Kruskal-Wallis test and post hoc using Dunn's multiple comparisons test, and the ctDNA fraction was assessed for patients with paired baseline and post-NAT data using Wilcoxon matched-pairs signed rank test. Differences in clinical characteristics between patients with positive and negative ctDNA fractions were evaluated with Fisher\u2019s exact test for categorical variables. These statistical analyses were performed with Prism 8 software (v8.4.3). Relapse-free survival (RFS) was measured from the date of randomization to the first occurrence of local-regional failure or distant metastasis. The univariate analysis was conducted using the KaplanMeier method with the log-rank test. HR values were calculated using univariate Cox proportional hazard models. The multivariate analysis was based on the Cox proportional hazard model in which the common important factors, such as age, sex, and clinical risk (according to the ESMO guidelines) were included. The survival model was evaluated with the C-index. 
The KaplanMeier curves were verified by performing a time-dependent receiver operating characteristic (ROC) curve analysis, and the area under the curve (AUC) was calculated to evaluate the prognostic performance. These analyses were performed using R software (v4.0.3). P values < 0.05 from a 2-sided test were considered statistically significant in all analyses. A sample of fifty patients was needed to achieve the power of 0.8 in this study as previously described.\n \n We next checked longitudinal status of the ctDNA fraction and its possible association with the disease course, therapeutic effect and survival status of all 60 patients (Figure 4a). Compared with baseline and in-process samples, a clear trend of a reduced post-NAT ctDNA fraction was observed in both the recurrence and nonrecurrence groups (Figure 4b), which highlighted the significant therapeutic effect of NAT. We noticed a more substantial reduction in the ctDNA fraction during baseline, in-process and post-NAT stages within the nonrecurrence group (Dunn\u2019s multiple comparison test, baseline vs. in-process: P = 0.0130; baseline vs. postNAT: P < 0.0001; in-process vs. post-NAT: P = 0.0009) compared to the recurrence group (Dunn\u2019s multiple comparison test, baseline vs. in-process: P > 0.9999; baseline vs. post-NAT: P = 0.1819; in-process vs. post-NAT: P = 0.4114) (KruskalWallis test, nonrecurrence group, P < 0.0001; recurrence group, P = 0.113) (Figure 4b). Moreover, the post-NAT ctDNA fraction status exhibited the strongest association with RFS, followed by the status at the in-process (HR = 3.61; 95% CI, 0.73-17.91; log-rank P = 0.093) and baseline stages (HR = 1.58; 95% CI, 0.20-12.67; log-rank P = 0.66). For the 17 patients experiencing recurrence, the median lead time between the detection of positive post-NAT ctDNA fraction and finding of radiological recurrence was 10.2 months (range, 0.1-33.2 months) (Wilcoxon matched-pairs signed rank test, P = 0.0001) (Figure S3a). 
We explored whether ctDNA fraction dynamics were linked to RFS by specifically focusing on the 42 patients with both baseline and post-NAT samples and observed a decreased ctDNA fraction in most patients (85.71%, 36/42). For the 9 patients experiencing recurrence, the ctDNA fraction after NAT increased in 4 (44.44%) patients and decreased but was still positive in 4 (44.44%) patients. In the nonrecurrence group (n = 33), the ctDNA fraction decreased to undetectable levels in 30 patients (90.90%) (Figure 4c). These data showed better predictive value of the post-NAT ctDNA fraction status than the ctDNA fraction dynamics (HR = 7.40; 95% CI: 1.97-27.82; log-rank P = 0.00053; sensitivity of 44.44% and specificity of 93.94%) for RFS estimation. The ctDNA fraction (post-NAT) in MRD-positive samples varied significantly from 0.05% to 12.74%. We divided the post-NAT samples into two groups to test if the ctDNA fraction values were correlated with the recurrence status: highly positive ctDNA fraction ( 1%) and moderately positive ctDNA fraction (0.05%-1%). The RFS of the 3 patients with highly positive post-NAT ctDNA fractions was shorter (< 200 days) than that of the moderately positive group (Figure 4d). In patient FL126 with two post-NAT plasma samples, the ctDNA fraction in plasma was moderately positive (0.16%) at 20 days after NAT and highly positive (3.50%) at 141 days, and lung metastases appeared in this patient only 43 days after the second time point (Figure 4e). In patient FL199 with a moderately positive ctDNA fraction (0.23%), local relapse occurred 306 days later (Figure 4e). The dynamic ctDNA fraction in the remaining samples was shown in Figure S4.\n \n The association between ctDNA fraction clearance and response to neoadjuvant therapy was significant (Fisher's exact test, P = 0.013)."
36
+ },
37
+ "8": {
38
+ "query": "Were measures taken to avoid or minimize systematic bias?",
39
+ "score": 1,
40
+ "reasoning": "This study had several limitations. First, the sample size was modest, and a limited number of patients were included in each subgroup, such as longitudinal plasma samples or patients who accepted LCRT/SCPRT. Second, intervention studies are required to explore the potential clinical utility of ctDNA to guide therapeutic decision-making and to determine whether the administration of neoadjuvant chemotherapy under ctDNA guidance may exert a positive effect on survival.\n \n Declaration of interests\n YCJ is one of the cofounders, has owner interest in Genetron Holdings, and receives royalties from Genetron. The other authors have declared that no competing interest exists.\n \n Role of the funding source\n The sponsors did not have any role in the study design, data collection, data analyses, interpretation, or writing of the manuscript.\n \n Funding\n The National Key R&D Program of China, Beijing Municipal Science & Technology Commission, National Natural Science Foundation of China, and CAMS Innovation Fund for Medical Sciences.\n \n Acknowledgements\n The authors would like to thank Ying Zhang for the assistance with sample collection and Pei Wang for the primary technical assistance. This work was supported by financial support were as follows: the National Key R&D Program of China [2021YFC2500900], Beijing Municipal Science & Technology Commission [Z181100001718136], National Natural Science Foundation of China [82073352], and CAMS Innovation Fund for Medical Sciences [2017-I2M-1-006 and 2021-I2M-1- 067]. The sponsors had no role in study design, data collection, data analyses, interpretation, and writing of the manuscript.\n \n Funding The Beijing Municipal Science & Technology Commission, National Natural Science Foundation of China, and CAMS Innovation Fund for Medical Sciences"
41
+ },
42
+ "9": {
43
+ "query": "Has the article been published in a journal?",
44
+ "score": 1,
45
+ "reasoning": "The Lancet"
46
+ }
47
+ }
model_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pdf_processing": {
3
+ "extract_images": false,
4
+ "infer_table_structure": true,
5
+ "strategy": "fast",
6
+ "chunking_strategy": "by_title",
7
+ "model_name": "yolox",
8
+ "max_characters": 10000,
9
+ "combine_text_under_n_chars": 100
10
+ },
11
+ "allowed_extensions": "pdf",
12
+ "embeddings": "huggingface",
13
+ "embeddings_model": "BAAI/bge-small-en-v1.5",
14
+ "llm_model": "gpt-4",
15
+ "model_temp": 0.2,
16
+ "max_tokens": 512,
17
+ "context_window": 5000,
18
+ "UPLOAD_FOLDER": "../path/to/upload/folder",
19
+ "GPT_PROMPT_PATH": "data/prompts/prompt_gpt.txt",
20
+ "MISTRAL_PROMPT_PATH": "data/prompts/prompt_mistral.txt",
21
+ "INFO_PROMPT_PATH": "data/prompts/prompt_info.txt",
22
+ "peer_review_journals_path": "data/prompts/peer_review_journals.txt",
23
+ "eq_network_journals_path": "data/prompts/eq_network_journals.txt",
24
+ "queries": [
25
+ "Is anything in the article shared (data, code)? Look for the words like Supporting, Additional, supplementary information/code/material/datashar -ing/-ed/-e, available, reproducibility and similar + links, appendix",
26
+ "Has anything in the article been registered (in advance)?",
27
+ "Does the article follow any reporting guidelines? To answer this question, follow the 3 steps sequentially. If any of the steps is true, assign a score of 1 and if all the steps are false, give a score of 0. STEP 1. Look for ISRCTN registry. STEP 2. Look if it is published in either The Lancet, The New England Journal of Medicine (NEJM), Journal of the American Medical Association (JAMA), British Medical Journal (BMJ), Annals of Internal Medicine, Nature Medicine, or Journal of Clinical Oncology. STEP 3. Look for one of the following guidelines, CONSORT for randomized controlled trials, PRISMA for meta‐analyses or systematic reviews, MOOSE for Meta-analyses of observational studies, STARD for diagnostic/prognostic studies, ARRIVE for animal pre-clinical studies, STROBE observational studies,SPIRIT for study protocols, CARE for case reports, AGREE for clinical practice guidelines, SRQR for qualitative researches,SQUIRE for quality improvement studies, SPIRIT Statement: Standard Protocol Items: Recommendations for Interventional Trials, PRIMER Collaboration: PRESENTATION AND INTERPRETATION OF MEDICAL RESEARCH, MIBBI: Minimum Information for Biological and Biomedical Investigations, COREQ: Consolidated Criteria for Reporting Qualitative Research, MDAR (Materials Design Analysis Reporting) reproducibility checklist is not a traditional reporting guideline like CONSORT or PRISMA. Instead, it's a tool designed to enhance the reproducibility and transparency of scientific research, REMARK (Reporting Recommendations for Tumor Marker Prognostic Studies).",
28
+ "Is the methodology described in detail (where, when, how, what, who)?",
29
+ "Is the data collection processes described in detail (where, when, how, what, who)?",
30
+ "Is there any sample description? eg. size, demographics, recruitment, in-/exclusion criteria",
31
+ "Does the article describe the data analysis process?",
32
+ "Were measures taken to avoid or minimize systematic bias?",
33
+ "Has the article been published in a journal?"],
34
+ "criteria": [
35
+ "Data and code sharing.",
36
+ "Has anything in the article been registered (in advance)?",
37
+ "Does the article follow any reporting guidelines?",
38
+ "Description of methodology",
39
+ "Data collection processes",
40
+ "Sample description. eg. size, demographics, recruitment, in-/exclusion criteria",
41
+ "Data analysis process",
42
+ "Measures to minimize systematic bias",
43
+ "Peer Review"],
44
+ "journal_query": "Is the given research paper published in any of the following journals: {}?",
45
+ "author_query": "Give me details about the institutions (like university or hospital) and contact details (eg. email) of the corresponding author."
46
+ }
requirements.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ shiny
2
+ beautifulsoup4
3
+ chromadb
4
+ gradio
5
+ cohere
6
+ faiss-cpu
7
+ Flask
8
+ langchain
9
+ langchainhub
10
+ llama-index == 0.9.35
11
+ llmsherpa
12
+ lxml
13
+ onnxruntime
14
+ unstructured
15
+ bs4
16
+ evaluate
17
+ faiss-cpu
18
+ numpy
19
+ openai
20
+ Pillow
21
+ PyPDF2
22
+ pydantic
23
+ rank-bm25
24
+ requests
25
+ rapidocr-onnxruntime
26
+ rouge-score
27
+ scikit-learn
28
+ sentence-transformers
29
+ tiktoken
30
+ transformers
31
+ tesseract
32
+ pdf2image
33
+ pdfminer.six
34
+ opencv-python
35
+ pikepdf
36
+ pypdf
37
+ qdrant-client
38
+ unstructured-inference
39
+ unstructured-pytesseract
templates/index.html ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html>
3
+ <head>
4
+ <title>Upload and Results</title>
5
+ <!-- Include Google Fonts -->
6
+ <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
7
+ <style>
8
+ body {
9
+ font-family: 'Roboto', sans-serif;
10
+ background-color: #f4f4f4;
11
+ overflow: auto;
12
+ width: 100%;
13
+ margin: 0;
14
+ padding: 0;
15
+ display: flex;
16
+ flex-direction: column; /* Stack flex items vertically */
17
+ align-items: center; /* Center items horizontally */
18
+ justify-content: flex-start; /* Align items to the start of the container vertically */
19
+ min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
20
+ }
21
+
22
+ table {
23
+ width: 100%; /* Adjust the width as needed */
24
+ border-collapse: collapse; /* Collapse borders for a tighter look */
25
+ }
26
+
27
+ th, td {
28
+ border: 1px solid #ddd; /* Adjust the border size as needed */
29
+ text-align: left;
30
+ padding: 5px; /* Reduce padding to decrease cell spacing */
31
+ height: 30px; /* Optionally reduce the height of the cells */
32
+ }
33
+ .parent-element {
34
+ overflow: visible; /* Ensures content is not cut off */
35
+ }
36
+ .container {
37
+ background-color: white;
38
+ overflow: auto;
39
+ border-radius: 8px;
40
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
41
+ padding: 40px;
42
+ width: 100%; /* Fill the width of the parent element (capped by max-width below) */
43
+ max-width: 700px;
44
+ }
45
+ .score-bar-container {
46
+ position: relative;
47
+ margin-top: 20px; /* Space above the score bar */
48
+ max-width: 100%; /* Ensures the container does not exceed the parent width */
49
+ }
50
+ .score-very-good-fill {
51
+ background-color: #4CAF50; /* Green */
52
+ }
53
+
54
+ .score-good-fill {
55
+ background-color: #FFEB3B; /* Yellow */
56
+ }
57
+
58
+ .score-ok-fill {
59
+ background-color: #FF9800; /* Orange */
60
+ }
61
+
62
+ .score-bad-fill {
63
+ background-color: #f44336; /* Red */
64
+ }
65
+
66
+ .score-very-bad-fill {
67
+ background-color: #9E9E9E; /* Grey */
68
+ }
69
+ .score-very-good-text {
70
+ color: #4CAF50; /* Green */
71
+ }
72
+
73
+ .score-good-text {
74
+ color: #FFEB3B; /* Yellow */
75
+ }
76
+
77
+ .score-ok-text {
78
+ color: #FF9800; /* Orange */
79
+ }
80
+
81
+ .score-bad-text {
82
+ color: #f44336; /* Red */
83
+ }
84
+
85
+ .score-very-bad-text {
86
+ color: #9E9E9E; /* Grey */
87
+ }
88
+
89
+ .score-bar {
90
+ background-color: #ddd;
91
+ border-radius: 10px;
92
+ height: 20px;
93
+ width: 100%; /* Adjusted to take the full width */
94
+ display: inline-block; /* Allows the score text to sit next to the score bar */
95
+ vertical-align: middle; /* Aligns score bar and text vertically */
96
+ }
97
+
98
+ .score-fill {
99
+ height: 100%;
100
+ border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
101
+ display: inline-block;
102
+ vertical-align: middle;
103
+ }
104
+
105
+ .score-text {
106
+ display: inline-block;
107
+ vertical-align: middle; /* Align with the score bar */
108
+ font-weight: bold; /* Make the score text bold */
109
+ margin-left: 10px; /* Space between the score bar and score text */
110
+ }
111
+
112
+ .score-title {
113
+ font-size: 20px;
114
+ font-weight: bold;
115
+ margin: 20px 0;
116
+ color: #333;
117
+ }
118
+ .major-issues {
119
+ text-align: left; /* Aligns the major issues to the left */
120
+ padding-left: 20px; /* Padding for the bullet list */
121
+ list-style: inside disc; /* Bullet style */
122
+ }
123
+ form {
124
+ margin-bottom: 20px;
125
+ }
126
+ input[type="file"] {
127
+ margin-bottom: 10px;
128
+ }
129
+ input[type="submit"] {
130
+ cursor: pointer;
131
+ margin-top: 10px;
132
+ padding: 10px 20px;
133
+ border: none;
134
+ background-color: #4CAF50;
135
+ color: white;
136
+ border-radius: 5px;
137
+ font-size: 16px;
138
+ font-weight: bold;
139
+ }
140
+ input[type="submit"]:hover {
141
+ background-color: #45a049;
142
+ }
143
+ </style>
144
+ </head>
145
+ <body>
146
+ <div class="container">
147
+ <h2>Upload PDF and View Results</h2>
148
+
149
+ <!-- Upload Form -->
150
+ <form method="post" enctype="multipart/form-data">
151
+ <input type="file" name="file" required>
152
+ <input type="submit" value="Upload">
153
+ </form>
154
+
155
+ <!-- Results Section -->
156
+ {% if total_score is not none %}
157
+ <!-- GPT-4 Score Bar -->
158
+ <div class="score-title">Score for GPT-4:</div>
159
+ <div class="score-bar-container">
160
+ <div class="score-bar">
161
+ <div class="score-fill {{
162
+ 'score-very-good-fill' if criteria_met == 9 else
163
+ 'score-good-fill' if criteria_met >= 7 else
164
+ 'score-ok-fill' if criteria_met >= 5 else
165
+ 'score-bad-fill' if criteria_met >= 3 else
166
+ 'score-very-bad-fill' }}" style="width: {{ score_percentage_gpt4 }}%;"></div>
167
+ </div>
168
+ <div class="score-text">{{ total_score }}/9</div>
169
+ </div>
170
+
171
+ <!-- Reasoning for GPT-4 -->
172
+ <h3>Reasoning:</h3>
173
+ <ul class="major-issues">
174
+ {% for issue in reasoning %}
175
+ <li>{{ issue }}</li>
176
+ {% endfor %}
177
+ </ul>
178
+ <h3>Contact Details:</h3>
179
+ <p> {{result}}</p>
180
+ {% endif %}
181
+ </div>
182
+ </body>
183
+ </html>