frasan committed
Commit 7f8175c • 1 Parent(s): 2684c41

changed files

Dockerfile CHANGED
@@ -41,4 +41,4 @@ EXPOSE 80
ENV NAME World

# Command to run on container start
- CMD ["uvicorn", "librarymed/main:app", "--host", "0.0.0.0", "--port", "7860"]
+ CMD ["uvicorn", "librarymed.main:app", "--host", "0.0.0.0", "--port", "80"]
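The corrected CMD switches to uvicorn's dotted module syntax (`librarymed.main:app` rather than a file path) and aligns the port with the `EXPOSE 80` directive in the hunk header. For context, a minimal sketch of the equivalent programmatic invocation, assuming (as the CMD does) that an ASGI instance named `app` lives in `librarymed/main.py`:

```python
# Sketch: programmatic equivalent of the corrected CMD, run from the image's
# working directory so that the `librarymed` package is importable.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("librarymed.main:app", host="0.0.0.0", port=80)
```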
librarymed/{kromin/RAG_utils.py → RAG_utils.py} RENAMED
File without changes
librarymed/app.py ADDED
@@ -0,0 +1,22 @@
+ import argparse
+ import logging
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ if __name__ == '__main__':
+     args_parse = argparse.ArgumentParser(description="LibraryMed")
+     args_parse.add_argument("--local", help="Run interface v0.1.0 by the fellows", action="store_true")
+     args = args_parse.parse_args()
+     port = os.getenv("PORT") or 80
+
+     if args.local:
+         from local.app_local import app
+         logging.info("Run LibraryMed interface v0.1.0 developed by the fellows")
+         app.run(debug=True, host="0.0.0.0", port=port)
+
+     else:
+         from librarymed.app_librarymed import app
+         logging.info("Run LibraryMed interface v0.2.0 developed by Kromin")
+         app.run(debug=True, host="0.0.0.0", port=port)
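One hedged aside on the entrypoint above: `os.getenv("PORT")` returns a string whenever the variable is set, so `port` may be a `str` by the time it reaches `app.run(...)`. Flask coerces it, but an explicit conversion is safer if the app object ever changes. A minimal sketch (the helper name is hypothetical, not part of this repository):

```python
import os

def resolve_port(default: int = 80) -> int:
    # os.getenv returns a str (or None); normalize to int, falling back on errors.
    raw = os.getenv("PORT")
    try:
        return int(raw) if raw else default
    except ValueError:
        return default
```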
librarymed/{kromin/app_librarymed.py → app_librarymed.py} RENAMED
@@ -7,8 +7,8 @@ from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.llms import OpenAI

- from kromin.RAG_utils import ConfigManager
- from kromin.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils
+ from librarymed.RAG_utils import ConfigManager
+ from librarymed.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils
from dotenv import load_dotenv

load_dotenv()
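The import fix above mirrors the `kromin/ → librarymed/` flattening recorded in the RENAMED headers. A quick, hedged way to verify the new layout resolves (assumes the repository root is on `sys.path`):

```python
# Sketch: confirm the renamed module exposes the symbols the app imports.
import importlib

mod = importlib.import_module("librarymed.RAG_utils")
for name in ("ConfigManager", "PDFProcessor_Unstructured", "PDFQueryEngine",
             "MixtralLLM", "KeywordSearch", "base_utils"):
    assert hasattr(mod, name), f"missing symbol: {name}"
```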
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Bold.ttf DELETED
Binary file (632 kB)
 
librarymed/huggingface/DejaVu/DejaVuSansCondensed-Oblique.ttf DELETED
Binary file (576 kB)
 
librarymed/huggingface/DejaVu/DejaVuSansCondensed.ttf DELETED
Binary file (644 kB)
 
librarymed/huggingface/DejaVu/readme.txt DELETED
@@ -1,40 +0,0 @@
- Congratulations, you have successfully downloaded font file!
-
- This font is provided to you by Fonts2u.com – the largest online
- repository of free fonts for Windows and Mac.
-
-
-
- How to install this font on your computer?
-
- For Windows 7 / Vista users:
-
- Right-click the font file(s) and choose "Install".
-
- For users of the previous Windows versions:
-
- Copy the included file(s) into a default Windows font folder
- (usually C:\WINDOWS\FONTS or C:\WINNT\FONTS)
-
- For Mac users:
-
- Mac OS X 10.3 or above (including the FontBook)
-
- Double-click the font file and hit "Install font" button at
- the bottom of the preview.
-
- Mac OS X
-
- Either copy the font file(s) to /Library/Fonts (for all users),
- or to /Users/Your_username/Library/Fonts (for you only).
-
- Mac OS 9 or earlier
-
- You have to convert the font file(s) you have downloaded.
- Drag the font suitcases into the System folder. The system
- will propose you to add them to the Fonts folder.
-
- For Linux users:
-
- Copy the font file(s) to /USR/SHARE/FONTS
-
librarymed/huggingface/RAG_utils_huggingface.py DELETED
@@ -1,995 +0,0 @@
- import os
- import re
- import json
- import torch
-
- import openai
- import logging
- import asyncio
- import aiohttp
- import pandas as pd
- import numpy as np
- import evaluate
- import qdrant_client
- from pypdf import PdfReader
- from pydantic import BaseModel, Field
- from typing import Any, List, Tuple, Set, Dict, Optional, Union
- from sklearn.metrics.pairwise import cosine_similarity
-
- from unstructured.partition.pdf import partition_pdf
-
- import llama_index
- from llama_index import PromptTemplate
- from llama_index.retrievers import VectorIndexRetriever, BaseRetriever, BM25Retriever
- from llama_index.query_engine import RetrieverQueryEngine
- from llama_index import get_response_synthesizer
- from llama_index.schema import NodeWithScore
- from llama_index.query_engine import RetrieverQueryEngine
- from llama_index import VectorStoreIndex, ServiceContext
- from llama_index.embeddings import OpenAIEmbedding
- from llama_index.llms import HuggingFaceLLM
- import requests
- from llama_index.llms import (
-     CustomLLM,
-     CompletionResponse,
-     CompletionResponseGen,
-     LLMMetadata,
- )
- from llama_index.query_engine import RetrieverQueryEngine
- from llama_index.llms.base import llm_completion_callback
- from llama_index.vector_stores.qdrant import QdrantVectorStore
- from llama_index.storage.storage_context import StorageContext
- from llama_index.postprocessor import SentenceTransformerRerank, LLMRerank
-
- from tempfile import NamedTemporaryFile
- # Configure basic logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
- # Create a logger object
- logger = logging.getLogger(__name__)
-
- class ConfigManager:
-     """
-     A class to manage loading and accessing configuration settings.
-
-     Attributes:
-         configs (dict): Dictionary to hold named sets of configuration settings.
-
-     Methods:
-         load_config(config_name: str, config_path: str): Loads a configuration from a given JSON file.
-         get_config_value(config_name: str, key: str): Retrieves a specific configuration value.
-     """
-
-     def __init__(self):
-         self.configs = {}
-
-     def load_config(self, config_name: str, config_path: str) -> None:
-         """
-         Loads configuration settings from a specified JSON file into a named configuration.
-
-         Args:
-             config_name (str): The name to assign to this set of configurations.
-             config_path (str): The path to the configuration file.
-
-         Raises:
-             FileNotFoundError: If the config file is not found.
-             json.JSONDecodeError: If there is an error parsing the config file.
-         """
-         try:
-             with open(config_path, 'r') as f:
-                 self.configs[config_name] = json.load(f)
-         except FileNotFoundError:
-             logging.error(f"Config file not found at {config_path}")
-             raise
-         except json.JSONDecodeError as e:
-             logging.error(f"Error decoding config file: {e}")
-             raise
-
-     def get_config_value(self, config_name: str, key: str) -> str:
-         """
-         Retrieves a specific configuration value.
-
-         Args:
-             config_name (str): The named configuration to read from.
-             key (str): The key for the configuration setting.
-
-         Returns:
-             str: The value of the configuration setting.
-
-         Raises:
-             ValueError: If the key is not found or is set to a placeholder value.
-         """
-         value = self.configs.get(config_name, {}).get(key)
-         if value is None or value == "ENTER_YOUR_TOKEN_HERE":
-             raise ValueError(f"Please set your '{key}' in the config.json file.")
-         return value
-
- class base_utils:
-     """
-     A utility class providing miscellaneous static methods for processing and analyzing text data,
-     particularly from PDF documents and filenames. This class also includes methods for file operations.
-
-     This class encapsulates the functionality of extracting key information from text, such as scores,
-     reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
-     and reading content from files.
-
-     Attributes:
-         None (This class contains only static methods and does not maintain any state)
-
-     Methods:
-         extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
-             Extracts a score and reasoning from a given text using regular expressions.
-
-         extract_id_from_filename(filename: str) -> Optional[int]:
-             Extracts an ID from a given filename based on a specified pattern.
-
-         find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
-             Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.
-
-         read_from_file(file_path: str) -> str:
-             Reads the content of a file and returns it as a string.
-     """
-
-     @staticmethod
-     def read_from_file(file_path: str) -> str:
-         """
-         Reads the content of a file and returns it as a string.
-
-         Args:
-             file_path (str): The path to the file to be read.
-
-         Returns:
-             str: The content of the file.
-         """
-         with open(file_path, 'r') as prompt_file:
-             prompt = prompt_file.read()
-         return prompt
-
-     @staticmethod
-     def extract_id_from_filename(filename: str) -> Optional[int]:
-         """
-         Extracts an ID from a filename, assuming a specific format ('Id_{I}.pdf', where {I} is the ID).
-
-         Args:
-             filename (str): The filename from which to extract the ID.
-
-         Returns:
-             int: The extracted ID as an integer, or None if the pattern is not found.
-         """
-         # Assuming the file name is in the format 'Id_{I}.pdf', where {I} is the ID
-         match = re.search(r'Id_(\d+)\.pdf', filename)
-         if match:
-             return int(match.group(1))  # Convert to integer if ID is numeric
-         else:
-             return None
-
-     @staticmethod
-     def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
-         """
-         Extracts the score and the longest reasoning from a given text using regular expressions.
-
-         Args:
-             text (str): The text from which to extract the score and reasoning.
-
-         Returns:
-             dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
-         """
-         # Define regular expression patterns for score and reasoning
-         score_pattern = r"Score: (\d+)"
-         reasoning_pattern = r"Reasoning: (\S.+)"
-
-         # Extract score using regular expressions
-         score_match = re.search(score_pattern, text)
-
-         # Extract all reasoning matches
-         reasoning_matches = re.findall(reasoning_pattern, text, re.DOTALL)
-
-         # Find the longest reasoning match
-         longest_reasoning = max(reasoning_matches, key=len) if reasoning_matches else None
-
-         # Extract and return the results
-         extracted_data = {
-             "score": score_match.group(1) if score_match else None,
-             "reasoning": longest_reasoning.strip() if longest_reasoning else None
-         }
-
-         return extracted_data
-
-     @staticmethod
-     def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
-         """
-         Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.
-
-         Args:
-             pdf_filename (str): The filename of the PDF.
-             dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.
-
-         Returns:
-             pandas.Series or str: The matched row from the dataframe or a message indicating
-                                   that no matching row or invalid filename was found.
-         """
-         pdf_id = base_utils.extract_id_from_filename(pdf_filename)
-         if pdf_id is not None:
-             # Assuming the first column contains the ID
-             matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
-             if not matched_row.empty:
-                 return matched_row
-             else:
-                 return "No matching row found."
-         else:
-             return "Invalid file name."
-
- class PDFProcessor_Unstructured:
-     """
-     A class to process PDF files, providing functionalities for extracting, categorizing,
-     and merging elements from a PDF file.
-
-     This class is designed to handle unstructured PDF documents, particularly useful for
-     tasks involving text extraction, categorization, and data processing within PDFs.
-
-     Attributes:
-         file_path (str): The full path to the PDF file.
-         folder_path (str): The directory path where the PDF file is located.
-         file_name (str): The name of the PDF file.
-         texts (List[str]): A list to store extracted text chunks.
-         tables (List[str]): A list to store extracted tables.
-
-     Methods:
-         extract_pdf_elements() -> List:
-             Extracts images, tables, and text chunks from a PDF file.
-
-         categorize_elements(raw_pdf_elements: List) -> None:
-             Categorizes extracted elements from a PDF into tables and texts.
-
-         merge_chunks() -> List[str]:
-             Merges text chunks based on punctuation and character case criteria.
-
-         should_skip_chunk(chunk: str) -> bool:
-             Determines if a chunk should be skipped based on its content.
-
-         should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
-             Determines if the current chunk should be merged with the next one.
-
-         process_pdf() -> Tuple[List[str], List[str]]:
-             Processes the PDF by extracting, categorizing, and merging elements.
-
-         process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
-             Processes an uploaded PDF file to extract and categorize text and tables.
-     """
-
-     def __init__(self, config: Dict[str, Any]):
-         self.file_path = None
-         self.folder_path = None
-         self.file_name = None
-         self.texts = []
-         self.tables = []
-         self.config = config if config is not None else self.default_config()
-         logger.info(f"Initialized PDFProcessor_Unstructured for file: {self.file_name}")
-
-     @staticmethod
-     def default_config() -> Dict[str, Any]:
-         """
-         Returns the default configuration for PDF processing.
-
-         Returns:
-             Dict[str, Any]: Default configuration options.
-         """
-         return {
-             "extract_images": False,
-             "infer_table_structure": True,
-             "chunking_strategy": "by_title",
-             "max_characters": 10000,
-             "combine_text_under_n_chars": 100,
-             "strategy": "fast",
-             "model_name": "yolox"
-         }
-
-     def extract_pdf_elements(self) -> List:
-         """
-         Extracts images, tables, and text chunks from a PDF file.
-
-         Returns:
-             List: A list of extracted elements from the PDF.
-         """
-         logger.info("Starting extraction of PDF elements.")
-         try:
-             extracted_elements = partition_pdf(
-                 filename=self.file_path,
-                 extract_images_in_pdf=False,
-                 infer_table_structure=True,
-                 chunking_strategy="by_title",
-                 strategy="fast",
-                 max_characters=10000,
-                 combine_text_under_n_chars=100,
-                 image_output_dir_path=self.folder_path,
-             )
-             logger.info("Extraction of PDF elements completed successfully.")
-             return extracted_elements
-         except Exception as e:
-             logger.error(f"Error extracting PDF elements: {e}", exc_info=True)
-             raise
-
-     def categorize_elements(self, raw_pdf_elements: List) -> None:
-         """
-         Categorizes extracted elements from a PDF into tables and texts.
-
-         Args:
-             raw_pdf_elements (List): A list of elements extracted from the PDF.
-         """
-         logger.debug("Starting categorization of PDF elements.")
-         for element in raw_pdf_elements:
-             element_type = str(type(element))
-             if "unstructured.documents.elements.Table" in element_type:
-                 self.tables.append(str(element))
-             elif "unstructured.documents.elements.CompositeElement" in element_type:
-                 self.texts.append(str(element))
-
-         logger.debug("Categorization of PDF elements completed.")
-
-     def merge_chunks(self) -> List[str]:
-         """
-         Merges text chunks based on punctuation and character case criteria.
-
-         Returns:
-             List[str]: A list of merged text chunks.
-         """
-         logger.debug("Starting merging of text chunks.")
-
-         merged_chunks = []
-         skip_next = False
-
-         for i, current_chunk in enumerate(self.texts[:-1]):
-             # Consume the skip flag set when the previous chunk absorbed this one
-             if skip_next:
-                 skip_next = False
-                 continue
-
-             next_chunk = self.texts[i + 1]
-
-             if self.should_skip_chunk(current_chunk):
-                 continue
-
-             if self.should_merge_with_next(current_chunk, next_chunk):
-                 merged_chunks.append(current_chunk + " " + next_chunk)
-                 skip_next = True
-             else:
-                 merged_chunks.append(current_chunk)
-
-         if not skip_next:
-             merged_chunks.append(self.texts[-1])
-
-         logger.debug("Merging of text chunks completed.")
-
-         return merged_chunks
-
-     @staticmethod
-     def should_skip_chunk(chunk: str) -> bool:
-         """
-         Determines if a chunk should be skipped based on its content.
-
-         Args:
-             chunk (str): The text chunk to be evaluated.
-
-         Returns:
-             bool: True if the chunk should be skipped, False otherwise.
-         """
-         return (chunk.lower().startswith(("figure", "fig", "table")) or
-                 not chunk[0].isalnum() or
-                 re.match(r'^\d+\.', chunk))
-
-     @staticmethod
-     def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
-         """
-         Determines if the current chunk should be merged with the next one.
-
-         Args:
-             current_chunk (str): The current text chunk.
-             next_chunk (str): The next text chunk.
-
-         Returns:
-             bool: True if the chunks should be merged, False otherwise.
-         """
-         return (current_chunk.endswith(",") or
-                 (current_chunk[-1].islower() and next_chunk[0].islower()))
-
-     def extract_title_from_pdf(self, uploaded_file):
-         """
-         Extracts the title from a PDF file's metadata.
-
-         This function reads the metadata of a PDF file using pypdf and attempts to
-         extract the title. If the title is present in the metadata, it is returned.
-         Otherwise, a default message indicating that the title was not found is returned.
-
-         Parameters:
-             uploaded_file (file): A file object or a path to the PDF file from which
-                                   to extract the title. The file must be opened in binary mode.
-
-         Returns:
-             str: The title of the PDF file as a string. If no title is found, returns
-                  'Title not found'.
-         """
-         # Initialize PDF reader
-         pdf_reader = PdfReader(uploaded_file)
-
-         # Extract document information
-         meta = pdf_reader.metadata
-
-         # Retrieve title from document information
-         title = meta.title if meta and meta.title else 'Title not found'
-         return title
-
-     def process_pdf(self) -> Tuple[List[str], List[str]]:
-         """
-         Processes the PDF by extracting, categorizing, and merging elements.
-
-         Returns:
-             Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
-         """
-         logger.info("Starting processing of the PDF.")
-         try:
-             raw_pdf_elements = self.extract_pdf_elements()
-             self.categorize_elements(raw_pdf_elements)
-             merged_chunks = self.merge_chunks()
-             return merged_chunks, self.tables
-         except Exception as e:
-             logger.error(f"Error processing PDF: {e}", exc_info=True)
-             raise
-
-     def process_pdf_file(self, uploaded_file):
-         """
-         Process an uploaded PDF file.
-
-         If a new file is uploaded, the previously stored file is deleted.
-         The method updates the file path, processes the PDF, and returns the results.
-
-         Parameters:
-             uploaded_file: The new PDF file uploaded for processing.
-
-         Returns:
-             The results of processing the PDF file.
-         """
-         # Delete the previous file if it exists
-         if self.file_path and os.path.exists(self.file_path):
-             try:
-                 os.remove(self.file_path)
-                 logging.debug(f"Previous file {self.file_path} deleted.")
-             except Exception as e:
-                 logging.warning(f"Error deleting previous file: {e}", exc_info=True)
-
-         # Process the new file
-         self.file_path = str(uploaded_file)
-         self.folder_path = os.path.dirname(self.file_path)
-         logging.info(f"Starting to process the PDF file: {self.file_path}")
-
-         try:
-             logging.debug(f"Processing PDF at {self.file_path}")
-             results = self.process_pdf()
-             title = self.extract_title_from_pdf(self.file_path)
-             logging.info("PDF processing completed successfully.")
-             return (*results, title)
-         except Exception as e:
-             logging.error(f"Error processing PDF file: {e}", exc_info=True)
-             raise
-
- class HybridRetriever(BaseRetriever):
-     """
-     A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
-     Inherits from BaseRetriever.
-
-     This class uses two different retrieval methods and merges their results to provide a
-     comprehensive set of documents in response to a query. It ensures diversity in the
-     retrieved documents by leveraging the strengths of both retrieval methods.
-
-     Attributes:
-         vector_retriever: An instance of a vector-based retriever.
-         bm25_retriever: An instance of a BM25 retriever.
-
-     Methods:
-         __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
-         _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
-         _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
-     """
-
-     def __init__(self, vector_retriever, bm25_retriever):
-         super().__init__()
-         self.vector_retriever = vector_retriever
-         self.bm25_retriever = bm25_retriever
-         logger.info("HybridRetriever initialized with vector and BM25 retrievers.")
-
-     def _retrieve(self, query: str, **kwargs) -> List:
-         """
-         Retrieves and combines results from both vector and BM25 retrievers.
-
-         Args:
-             query: The query string for document retrieval.
-             **kwargs: Additional keyword arguments for retrieval.
-
-         Returns:
-             List: Combined list of unique nodes retrieved from both methods.
-         """
-         logger.info(f"Retrieving documents for query: {query}")
-         try:
-             bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
-             vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
-             combined_nodes = self._combine_results(bm25_nodes, vector_nodes)
-
-             logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
-             return combined_nodes
-         except Exception as e:
-             logger.error(f"Error in retrieval: {e}")
-             raise
-
-     @staticmethod
-     def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
-         """
-         Combines and de-duplicates results from BM25 and vector retrievers.
-
-         Args:
-             bm25_nodes: Nodes retrieved from BM25 retriever.
-             vector_nodes: Nodes retrieved from vector retriever.
-
-         Returns:
-             List: Combined list of unique nodes.
-         """
-         node_ids: Set = set()
-         combined_nodes = []
-
-         for node in bm25_nodes + vector_nodes:
-             if node.node_id not in node_ids:
-                 combined_nodes.append(node)
-                 node_ids.add(node.node_id)
-
-         return combined_nodes
-
- class PDFQueryEngine:
-     """
-     A class to handle the process of setting up a query engine and performing queries on PDF documents.
-
-     This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
-     indexes, hybrid retrievers, response synthesizers, and executing queries on the set-up engine.
-
-     Attributes:
-         documents (List): A list of documents to be indexed.
-         llm (Language Model): The language model to be used for embeddings and queries.
-         qa_prompt_tmpl (str): Template for creating query prompts.
-         queries (List[str]): List of queries to be executed.
-
-     Methods:
-         setup_query_engine(): Sets up the query engine with all necessary components.
-         evaluate_with_llm(): Executes the predefined queries and accumulates the results.
-     """
-
-     def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
-         self.documents = documents
-         self.llm = llm
-         self.embed_model = embed_model
-         self.qa_prompt_tmpl = qa_prompt_tmpl
-         self.base_utils = base_utils()
-         self.config_manager = ConfigManager()
-
-         logger.info("PDFQueryEngine initialized.")
-
-     def format_example(self, example):
-         """
-         Formats a few-shot example into a string.
-
-         Args:
-             example (dict): A dictionary containing 'query', 'score', and 'reasoning' for the few-shot example.
-
-         Returns:
-             str: Formatted few-shot example text.
-         """
-         return "Example:\nQuery: {}\nScore: {}\nReasoning: {}\n".format(
-             example['query'], example['score'], example['reasoning']
-         )
-
-     def setup_query_engine(self):
-         """
-         Sets up the query engine by initializing and configuring the embedding model, service context, index,
-         hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.
-
-         Returns:
-             Any: The configured query engine.
-         """
-         client = qdrant_client.QdrantClient(
-             # you can use :memory: mode for fast and light-weight experiments,
-             # it does not require to have Qdrant deployed anywhere
-             # but requires qdrant-client >= 1.1.1
-             location=":memory:"
-             # otherwise set Qdrant instance address with:
-             # uri="http://<host>:<port>"
-             # set API KEY for Qdrant Cloud
-             # api_key="<qdrant-api-key>",
-         )
-         try:
-             logger.info("Initializing the service context for query engine setup.")
-             service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
-             vector_store = QdrantVectorStore(client=client, collection_name="med_library")
-             storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
-             logger.info("Creating an index from documents.")
-             index = VectorStoreIndex.from_documents(documents=self.documents, storage_context=storage_context, service_context=service_context)
-             nodes = service_context.node_parser.get_nodes_from_documents(self.documents)
-
-             logger.info("Setting up vector and BM25 retrievers.")
-             vector_retriever = index.as_retriever(similarity_top_k=3)
-             bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)
-             hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)
-
-             logger.info("Configuring the response synthesizer with the prompt template.")
-             qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
-             response_synthesizer = get_response_synthesizer(
-                 service_context=service_context,
-                 text_qa_template=qa_prompt,
-                 response_mode="compact",
-             )
-
-             logger.info("Assembling the query engine with reranker and synthesizer.")
-             reranker = SentenceTransformerRerank(top_n=3, model="BAAI/bge-reranker-base")
-             query_engine = RetrieverQueryEngine.from_args(
-                 retriever=hybrid_retriever,
-                 node_postprocessors=[reranker],
-                 response_synthesizer=response_synthesizer,
-             )
-
-             logger.info("Query engine setup complete.")
-             return query_engine
-         except Exception as e:
-             logger.error(f"Error during query engine setup: {e}")
-             raise
-
-     def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any, queries: List[str]) -> Tuple[int, int, float, List[str]]:
-         """
-         Evaluate documents using a language model based on various criteria.
-
-         Args:
-             reg_result (Any): Result related to registration.
-             peer_result (Any): Result related to peer review.
-             guidelines_result (Any): Result related to following guidelines.
-             queries (List[str]): A list of queries to be processed.
-
-         Returns:
-             Tuple[int, int, float, List[str]]: The total score, the number of criteria met,
-             the score as a percentage, and the reasoning for each criterion.
-         """
-         logger.info("Starting evaluation with LLM.")
-         self.config_manager.load_config("few_shot", "few_shot.json")
-         query_engine = self.setup_query_engine()
-
-         total_score = 0
-         criteria_met = 0
-         reasoning = []
-
-         for j, query in enumerate(queries):
-             # Handle special cases based on the value of j and other conditions
-             if j == 1 and reg_result:
-                 extracted_data = {"score": 1, "reasoning": reg_result[0]}
-             elif j == 2 and guidelines_result:
-                 extracted_data = {"score": 1, "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
-             elif j == 8 and (guidelines_result or peer_result):
-                 extracted_data = {"score": 1, "reasoning": "The article is published in a peer-reviewed journal."}
-             else:
-                 # Execute the query
-                 result = query_engine.query(query).response
-                 extracted_data = self.base_utils.extract_score_reasoning(result)
-
-             # Validate and accumulate the scores
-             extracted_data_score = 0 if extracted_data.get("score") is None else int(extracted_data.get("score"))
-             if extracted_data_score > 0:
-                 criteria_met += 1
-             reasoning.append(extracted_data["reasoning"])
-             total_score += extracted_data_score
-
-         score_percentage = (float(total_score) / len(queries)) * 100
-         logger.info("Evaluation completed.")
-         return total_score, criteria_met, score_percentage, reasoning
-
- class MixtralLLM(CustomLLM):
-     """
-     A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.
-
-     Attributes:
-         context_window (int): Number of tokens used for context during inference.
-         num_output (int): Number of tokens to generate as output.
-         temperature (float): Sampling temperature for token generation.
-         model_name (str): Name of the model on Hugging Face's model hub.
-         api_key (str): API key for authenticating with the Hugging Face API.
-
-     Methods:
-         metadata: Retrieves metadata about the model.
-         do_hf_call: Makes an API call to the Hugging Face model.
-         complete: Generates a complete response for a given prompt.
-         stream_complete: Streams a series of token completions for a given prompt.
-     """
-
-     context_window: int = Field(..., description="Number of tokens used for context during inference.")
-     num_output: int = Field(..., description="Number of tokens to generate as output.")
-     temperature: float = Field(..., description="Sampling temperature for token generation.")
-     model_name: str = Field(..., description="Name of the model on Hugging Face's model hub.")
-     api_key: str = Field(..., description="API key for authenticating with the Hugging Face API.")
-
-     @property
-     def metadata(self) -> LLMMetadata:
-         """
-         Retrieves metadata for the Mixtral LLM.
-
-         Returns:
-             LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
-         """
-         return LLMMetadata(
-             context_window=self.context_window,
-             num_output=self.num_output,
-             model_name=self.model_name,
-         )
-
-     def do_hf_call(self, prompt: str) -> str:
-         """
-         Makes an API call to the Hugging Face model and retrieves the generated response.
-
-         Args:
-             prompt (str): The input prompt for the model.
-
-         Returns:
-             str: The text generated by the model in response to the prompt.
-
-         Raises:
-             Exception: If the API call fails or returns an error.
-         """
-         data = {
-             "inputs": prompt,
-             "parameters": {"temperature": self.temperature}
-         }
-
-         # Makes a POST request to the Hugging Face API to get the model's response
-         response = requests.post(
-             f'https://api-inference.huggingface.co/models/{self.model_name}',
-             headers={
-                 'authorization': f'Bearer {self.api_key}',
-                 'content-type': 'application/json',
-             },
-             json=data,
-             stream=True
-         )
-
-         # Checks for a successful response and parses the generated text
-         if response.status_code != 200 or not response.json() or 'error' in response.json():
-             logger.error(f"Error: {response}")
-             return "Unable to answer for technical reasons."
-         full_txt = response.json()[0]['generated_text']
-         # Finds the section of the text following the context separator
-         offset = full_txt.find("---------------------")
-         ss = full_txt[offset:]
-         # Extracts the actual answer from the response
-         offset = ss.find("Answer:")
-         return ss[offset + 7:].strip()
-
-     @llm_completion_callback()
-     def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
-         """
-         Generates a complete response for a given prompt using the Hugging Face API.
-
-         Args:
-             prompt (str): The input prompt for the model.
-             **kwargs: Additional keyword arguments for the completion.
-
-         Returns:
-             CompletionResponse: The complete response from the model.
-         """
-         response = self.do_hf_call(prompt)
-         return CompletionResponse(text=response)
-
-     @llm_completion_callback()
-     def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
-         """
-         Streams a series of token completions as a response for the given prompt.
-
-         This method is useful for streaming responses where each token is generated sequentially.
-
-         Args:
-             prompt (str): The input prompt for the model.
-             **kwargs: Additional keyword arguments for the streaming completion.
-
-         Yields:
-             CompletionResponseGen: A generator yielding each token in the completion response.
-         """
-         # Yields a stream of tokens as the completion response for the given prompt
-         response = ""
-         for token in self.do_hf_call(prompt):
-             response += token
-             yield CompletionResponse(text=response, delta=token)
-
- class KeywordSearch():
-     def __init__(self, chunks):
-         self.chunks = chunks
-
-     def find_journal_name(self, response: str, journal_list: list) -> bool:
-         """
-         Searches for a journal name in a given response string.
-
-         This function iterates through a list of known journal names and checks whether any of
-         them appears in the response string, comparing case-insensitively.
-
-         Args:
-             response (str): The response string to search for a journal name.
-             journal_list (list): A list of journal names to search within the response.
-
-         Returns:
-             bool: True if any journal name from the list is found in the response, False otherwise.
-         """
-         response_lower = response.lower()
-         for journal in journal_list:
-             journal_lower = journal.lower()
-
-             if journal_lower in response_lower:
-                 return True
-
-         return False
-
-     def check_registration(self):
-         """
-         Check chunks of text for various registration numbers or URLs of registries.
-         Returns the sentence containing a registration number, or if not found,
-         returns chunks containing registry URLs.
-
-         Returns:
-             list of str: List of matching sentences or chunks, or an empty list if no matches are found.
-         """
-         # Patterns for different registration types
-         patterns = {
-             "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
-             "ISRCTN": r"(ISRCTN\d{8})",
-             "EudraCT": r"(\d{4}-\d{6}-\d{2})",
-             "UMIN-CTR": r"(UMIN\d{9})",
-             "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
-         }
-
-         # Registry URLs
-         registry_urls = [
-             "www.anzctr.org.au",
-             "anzctr.org.au",
-             "www.clinicaltrials.gov",
-             "clinicaltrials.gov",
-             "www.ISRCTN.org",
-             "ISRCTN.org",
-             "www.umin.ac.jp/ctr/index/htm",
-             "umin.ac.jp/ctr/index/htm",
-             "www.onderzoekmetmensen.nl/en",
-             "onderzoekmetmensen.nl/en",
-             "eudract.ema.europa.eu",
-             "www.eudract.ema.europa.eu"
-         ]
-
-         # Check each chunk for registration numbers
-         for chunk in self.chunks:
-             # Split chunk into sentences
-             sentences = re.split(r'(?<=[.!?]) +', chunk)
-
-             # Check each sentence for any registration number
-             for sentence in sentences:
-                 for pattern in patterns.values():
-                     if re.search(pattern, sentence):
-                         return [sentence]  # Return immediately if a registration number is found
-
-         # If no registration number found, check for URLs in chunks
-         matching_chunks = []
-         for chunk in self.chunks:
-             if any(url in chunk for url in registry_urls):
-                 matching_chunks.append(chunk)
-
-         return matching_chunks
-
- class StringExtraction():
-     """
-     A class to handle the process of extracting query strings from complete LLM responses.
-
-     This class encapsulates the functionality of extracting the original ground truth from a labelled data
-     CSV and query strings from responses. Please note that LLMs may generate differently formatted answers
-     depending on the model or the prompting technique. In such cases, extract_original_prompt may not give
-     satisfactory results, and the best option is to write your own string extraction method.
-
-     Methods:
-         extract_original_prompt(result): Splits a response into its binary answer and its reasoning.
-         extraction_ground_truth(paper_name, labelled_data): Recovers the binary and textual ground truth for a paper.
-     """
-
-     def extract_original_prompt(self, result):
-         r1 = result.response.strip().split("\n")
-         binary_response = ""
-         explanation_response = ""
-         for r in r1:
-             if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
-                 binary_response = r
-             elif r.find("Reasoning:") >= 0:
-                 cut = r.find(":")
-                 explanation_response += r[cut + 1:].strip()
-
-         return binary_response, explanation_response
-
-     def extraction_ground_truth(self, paper_name, labelled_data):
-         paper_id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
-         id_row = labelled_data[labelled_data["id"] == paper_id]
-         ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
-         binary_ground_truth = []
-         explanation_ground_truth = []
-         for g in ground_truth:
-             if len(g) > 0:
-                 binary_ground_truth.append("Yes")
-                 explanation_ground_truth.append(g)
-             else:
-                 binary_ground_truth.append("No")
-                 explanation_ground_truth.append("The article does not provide any relevant information.")
-         return binary_ground_truth, explanation_ground_truth
-
- class EvaluationMetrics():
-     """
-     This class encapsulates the evaluation methods that have been used in the project.
-
-     Attributes:
-         explanation_response: A list of detailed responses from the LLM model corresponding to each query.
-         explanation_ground_truth: The list of ground truths corresponding to each query.
-
-     Methods:
-         metric_cosine_similarity(): Computes the cosine similarity between response and ground-truth embeddings.
-         metric_rouge(): Computes ROUGE scores between responses and ground truths.
-         binary_accuracy(): Computes the fraction of binary answers that match the ground truth.
-     """
-
-     def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
-         self.explanation_response = explanation_response
-         self.explanation_ground_truth = explanation_ground_truth
-         self.embedding_model = embedding_model
-
-     def metric_cosine_similarity(self):
-         ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
-         explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
-         return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))
-
-     def metric_rouge(self):
-         rouge = evaluate.load("rouge")
-         results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
-         return results
-
-     def binary_accuracy(self, binary_response, binary_ground_truth):
-         count = 0
-         if len(binary_response) != len(binary_ground_truth):
-             return "The arrays to be compared have different lengths."
-         else:
-             for i in range(len(binary_response)):
-                 if binary_response[i] == binary_ground_truth[i]:
-                     count += 1
-             return np.round(count / len(binary_response), 2)
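A worked example (inputs assumed for illustration) of the NCT pattern that `KeywordSearch.check_registration` matches first:

```python
# Sketch: the NCT regex from the patterns dict against a sample sentence.
import re

pattern = r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?"
sentence = "The trial was registered (NCT01234567) before enrolment."
match = re.search(pattern, sentence)
print(match.group(3) if match else None)  # -> 01234567
```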
librarymed/huggingface/app_huggingface.py DELETED
@@ -1,304 +0,0 @@
- import logging
- import os
-
- import gradio as gr
- import openai
- from fpdf import FPDF
- from llama_index import Document
- from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
- from llama_index.llms import OpenAI
-
- from RAG_utils_huggingface import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils, \
-     ConfigManager
-
- # Configure basic logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
- # Create a logger object
- logger = logging.getLogger(__name__)
-
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- config_manager = ConfigManager()
- # config_manager.load_config("api", "Config/api_config.json")
- config_manager.load_config("model", "model_config.json")
-
- openai.api_key = os.environ['OPENAI_API_KEY']  # config_manager.get_config_value("api", "OPENAI_API_KEY")
- hf_token = os.environ['HF_TOKEN']  # config_manager.get_config_value("api", "HF_TOKEN")
-
- # PDF rendering and chunking parameters
- pdf_processing_config = config_manager.get_config_value("model", "pdf_processing")
-
- ALLOWED_EXTENSIONS = config_manager.get_config_value("model", "allowed_extensions")
- embed = config_manager.get_config_value("model", "embeddings")
- embed_model_name = config_manager.get_config_value("model", "embeddings_model")
-
- # llm_model = config_manager.get_config_value("model", "llm_model")
- model_temperature = config_manager.get_config_value("model", "model_temp")
- output_token_size = config_manager.get_config_value("model", "max_tokens")
- model_context_window = config_manager.get_config_value("model", "context_window")
-
- gpt_prompt_path = config_manager.get_config_value("model", "GPT_PROMPT_PATH")
- mistral_prompt_path = config_manager.get_config_value("model", "MISTRAL_PROMPT_PATH")
- info_prompt_path = config_manager.get_config_value("model", "INFO_PROMPT_PATH")
-
- peer_review_journals_path = config_manager.get_config_value("model", "peer_review_journals_path")
- eq_network_journals_path = config_manager.get_config_value("model", "eq_network_journals_path")
-
- queries = config_manager.get_config_value("model", "queries")
- criteria = config_manager.get_config_value("model", "criteria")
- num_criteria = len(queries)
-
- author_query = config_manager.get_config_value("model", "author_query")
- journal_query = config_manager.get_config_value("model", "journal_query")
-
-
- # Helper function to check if the file extension is allowed
- def allowed_file(filename):
-     return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
-
-
- def generate_score_bar(score, num_criteria):
-     # Convert and round the score from a 9-point scale to a 100-point scale
-     score_out_of_100 = round((score / num_criteria) * 100)
-
-     # Determine the color and text based on the original score
-     if score == 9:
-         color = "#4CAF50"  # green
-         text = "Very good"
-     elif score in [7, 8]:
-         color = "#FFEB3B"  # yellow
-         text = "Good"
-     elif score in [5, 6]:
-         color = "#FF9800"  # orange
-         text = "Ok"
-     elif score in [3, 4]:
-         color = "#F44336"  # red
-         text = "Bad"
-     else:  # score < 3
-         color = "#800000"  # maroon
-         text = "Very bad"
-
-     # Create the HTML for the score bar
-     score_bar_html = f"""
-     <div style="background-color: #ddd; border-radius: 10px; position: relative; height: 20px; width: 100%;">
-         <div style="background-color: {color}; height: 100%; border-radius: 10px; width: {score_out_of_100}%;"></div>
-     </div>
-     <p style="color: {color};">{text}</p> <!-- Display the text -->
-     """
-     return score_bar_html
-
-
- class PDF(FPDF):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         # Load the DejaVu font files
-         self.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
-         self.add_font('DejaVu', 'B', 'DejaVuSansCondensed-Bold.ttf', uni=True)
-         self.add_font('DejaVu', 'I', 'DejaVuSansCondensed-Oblique.ttf', uni=True)
-
-     def header(self):
-         self.set_font('DejaVu', 'B', 12)
-         self.cell(0, 10, 'Paper Analysis Report', 0, 1, 'C')
-
-     def footer(self):
-         self.set_y(-15)
-         self.set_font('DejaVu', 'I', 8)
-         self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
-
-
- def create_pdf_report(title, author_info, score, criteria, reasoning_list, output_path):
-     pdf = PDF()
-     pdf.add_page()
-
-     # Set margins
-     pdf.set_left_margin(10)
-     pdf.set_right_margin(10)
-
-     # Title
-     pdf.set_font("DejaVu", 'B', 14)
-     pdf.cell(0, 10, "Title:", 0, 1)
-     pdf.set_font("DejaVu", '', 12)
-     pdf.multi_cell(0, 10, title, 0, 1)
-
-     # Author Information
-     pdf.set_font("DejaVu", 'B', 14)
-     pdf.cell(0, 10, "Author Information:", 0, 1)
-     pdf.set_font("DejaVu", '', 12)
-     pdf.multi_cell(0, 10, author_info, 0, 1)
-
-     # Score
-     pdf.set_font("DejaVu", 'B', 14)
-     pdf.cell(0, 10, "Score:", 0, 1)
-     pdf.set_font("DejaVu", '', 12)
-     pdf.multi_cell(0, 10, score, 0, 1)
-
-     # Reasoning - each reasoning with a green heading in bold
-     for heading, reasoning in zip(criteria, reasoning_list):
-         pdf.set_font("DejaVu", 'B', 14)
-         pdf.set_text_color(0, 128, 0)  # Green color
-         pdf.multi_cell(0, 10, heading, 0, 1)
-         pdf.set_text_color(0, 0, 0)  # Reset to black color
-         pdf.set_font("DejaVu", '', 12)
-         pdf.multi_cell(0, 10, reasoning, 0, 1)
-
-     # Save the PDF to the specified output path
-     pdf.output(output_path)
-
-     return output_path  # Return the path to the generated report
-
-
- def check_title_for_review(uploaded_files):
-     title_message = "All articles are valid for review."
-     if not uploaded_files:
-         title_message = "No files uploaded or upload canceled."
-     else:
-         for uploaded_file in uploaded_files:
-             pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
-             title = pdf_processor.extract_title_from_pdf(uploaded_file)
-             if 'review' in title.lower():
-                 title_message = "One or more files are review papers. Hence the evaluation may not be accurate."
-
-     return title_message
-
-
- def process_pdf(uploaded_files, llm_model, n_criteria=num_criteria):
-     # Initialize aggregation variables
-     final_score = 0
-     final_reasoning = []
-     final_score_bar_html = ""
-     final_author_info_html = ""
-     final_title_info_html = ""
-     output_files = []
-     for i, uploaded_file in enumerate(uploaded_files):
-         # Process the PDF file
-         file_name_without_extension = os.path.splitext(os.path.basename(uploaded_file))[0]
-
-         pdf_processor = PDFProcessor_Unstructured(pdf_processing_config)
-         merged_chunks, tables, title = pdf_processor.process_pdf_file(uploaded_file)
-         documents = [Document(text=t) for t in merged_chunks]
-
-         # Prompts and Queries
-         utils = base_utils()
-
-         info_prompt = utils.read_from_file(info_prompt_path)
-
-         # LLM Model choice
-         try:
-             if llm_model == "Model 1":
-                 llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=output_token_size)
-                 general_prompt = utils.read_from_file(gpt_prompt_path)
-
-             elif llm_model == "Model 2":
-                 if any(param is None for param in
-                        [model_context_window, output_token_size, model_temperature, hf_token]):
-                     raise ValueError("All parameters are required for Mistral LLM.")
-
-                 llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
-                                  temperature=model_temperature, model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
-                                  api_key=hf_token)
-                 general_prompt = utils.read_from_file(mistral_prompt_path)
-             else:
-                 raise ValueError(f"Unsupported language model: {llm_model}")
-
-         except Exception as e:
-             logger.error(f"Error initializing language model '{llm_model}': {e}", exc_info=True)
-             raise  # Or handle the exception as needed
-
-         # Embedding model choice for RAG
-         try:
-             if embed == "openai":
-                 embed_model = OpenAIEmbedding(model="text-embedding-3-large")
-
-             elif embed == "huggingface":
-                 # Use the specified model name
-                 embed_model = HuggingFaceEmbedding(embed_model_name)
-
-             else:
-                 raise ValueError(f"Unsupported embedding model: {embed}")
-
-         except Exception as e:
-             logger.error(f"Error initializing embedding model: {e}", exc_info=True)
-             raise
-
-         peer_review_journals = utils.read_from_file(peer_review_journals_path)
-         eq_network_journals = utils.read_from_file(eq_network_journals_path)
-
-         peer_review_journals_list = peer_review_journals.split('\n')
-         eq_network_journals_list = eq_network_journals.split('\n')
-
-         modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
-             peer_review_journals_list) + "?"
-
-         info_llm = OpenAI(model="gpt-4-1106-preview", temperature=model_temperature, max_tokens=100)
-         pdf_info_query = PDFQueryEngine(documents, info_llm, embed_model, info_prompt)
-         info_query_engine = pdf_info_query.setup_query_engine()
-         journal_result = info_query_engine.query(modified_journal_query).response
-         author_result = info_query_engine.query(author_query).response
-
-         pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, general_prompt)
-
-         # Check for prior registration
-         nlp_methods = KeywordSearch(merged_chunks)
-         eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
-         peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
-         registration_result = nlp_methods.check_registration()
-
-         # Evaluate with OpenAI model
-         total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
-             registration_result, peer_journal_result, eq_journal_result, queries)
-
-         # Convert reasoning list to plain text
-         # reasoning_text = "\n".join([f"{idx + 1}. {reason}" for idx, reason in enumerate(reasoning)])
-
-         # Generate the score bar HTML
-         score_bar_html = generate_score_bar(total_score, n_criteria)
-         scaled_total_score = str(round((total_score / n_criteria) * 100)) + "/100"
-
-         output_dir = "/tmp"
-         base_name = os.path.splitext(os.path.basename(uploaded_file))[0]
-         output_path = os.path.join(output_dir, f"{base_name}_report.pdf")
-
-         create_pdf_report(title, author_result, scaled_total_score, criteria, reasoning, output_path)
-         output_files.append(output_path)
-
-     # Construct the processing message
-     processing_message = f"Processing complete. {len(uploaded_files)} reports generated. Please download your reports below."
-
-     return processing_message, output_files
-     # Return the score as a string and the reasoning as HTML
-     # return str(round((total_score / n_criteria) * 100)) + "/100", score_bar_html, reasoning_html, author_info_html, title_info_html
-
-
- with gr.Blocks(theme=gr.themes.Glass(
-         text_size="sm",
-         font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"],
-         primary_hue="neutral",
-         secondary_hue="gray")) as demo:
-     gr.Markdown("## Med Library")
-     with gr.Row():
-         file_upload = gr.File(label="Choose papers", file_types=['.pdf'], file_count="multiple")
-
-     title_check_output = gr.Textbox(label="Warnings", interactive=False)
-     file_upload.change(fn=check_title_for_review, inputs=file_upload, outputs=title_check_output)
-
-     with gr.Row():
-         model_choice = gr.Dropdown(["Model 1", "Model 2"], label="Choose a model", value="Model 1")
-         submit_button = gr.Button("Evaluate")
-
-     processing_message_output = gr.Textbox(label="Processing Status", interactive=False)
-     report_download_links = gr.File(label="Download Reports", type="filepath", file_count="multiple")
-
-     submit_button.click(
-         fn=process_pdf,
-         inputs=[file_upload, model_choice],
-         outputs=[processing_message_output, report_download_links]
-     )
-
-
- demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
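For reference, the scaling that `generate_score_bar` applies maps the 9-criteria score onto a 100-point scale; a worked example with an assumed score:

```python
# Sketch: 7 of 9 criteria met scales to round(7 / 9 * 100) == 78.
score, num_criteria = 7, 9
score_out_of_100 = round((score / num_criteria) * 100)
print(score_out_of_100)  # -> 78
```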
librarymed/local/RAG_utils.py DELETED
@@ -1,979 +0,0 @@
1
- """Utility functions for working with the RAG model"""
2
-
3
- import json
4
- import logging
5
- import os
6
- import re
7
- import time
8
- from tempfile import NamedTemporaryFile
9
- from typing import Any, List, Tuple, Set, Dict, Optional, Union
10
-
11
- import evaluate
12
- import numpy as np
13
- import pandas as pd
14
- import requests
15
- from llama_index import PromptTemplate
16
- from llama_index import VectorStoreIndex, ServiceContext
17
- from llama_index import get_response_synthesizer
18
- from llama_index.llms import (
19
- CustomLLM,
20
- CompletionResponse,
21
- CompletionResponseGen,
22
- LLMMetadata,
23
- )
24
- from llama_index.llms.base import llm_completion_callback
25
- from llama_index.postprocessor import SentenceTransformerRerank
26
- from llama_index.query_engine import RetrieverQueryEngine
27
- from llama_index.retrievers import BaseRetriever, BM25Retriever
28
- from sklearn.metrics.pairwise import cosine_similarity
29
- from unstructured.partition.pdf import partition_pdf
30
- from pypdf import PdfReader
31
-
32
-
33
- # Configure basic logging
34
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
35
-
36
- # Create a logger object
37
- logger = logging.getLogger(__name__)
38
-
39
-
40
- class ConfigManager:
41
- """
42
- A class to manage loading and accessing configuration settings.
43
-
44
- Attributes:
45
- configs (dict): Dictionary mapping configuration names to their settings.
46
-
47
- Methods:
48
- load_config(config_name: str, config_path: str): Loads a named configuration from a JSON file.
49
- get_config_value(config_name: str, key: str): Retrieves a specific configuration value.
50
- """
51
-
52
- def __init__(self):
53
- self.configs = {}
54
-
55
- def load_config(self, config_name: str, config_path: str) -> None:
56
- """
57
- Loads configuration settings from a specified JSON file into a named configuration.
58
-
59
- Args:
60
- config_name (str): The name to assign to this set of configurations.
61
- config_path (str): The path to the configuration file.
62
-
63
- Raises:
64
- FileNotFoundError: If the config file is not found.
65
- json.JSONDecodeError: If there is an error parsing the config file.
66
- """
67
- try:
68
- with open(config_path, 'r') as f:
69
- self.configs[config_name] = json.load(f)
70
- except FileNotFoundError:
71
- logging.error(f"Config file not found at {config_path}")
72
- raise
73
- except json.JSONDecodeError as e:
74
- logging.error(f"Error decoding config file: {e}")
75
- raise
76
-
77
- def get_config_value(self, config_name: str, key: str) -> str:
78
- """
79
- Retrieves a specific configuration value.
80
-
81
- Args:
82
- config_name (str): The named configuration to read; key (str): The key for the setting.
83
-
84
- Returns:
85
- str: The value of the configuration setting.
86
-
87
- Raises:
88
- ValueError: If the key is not found or is set to a placeholder value.
89
- """
90
- value = self.configs.get(config_name, {}).get(key)
91
- if value is None or value == "ENTER_YOUR_TOKEN_HERE":
92
- raise ValueError(f"Please set your '{key}' in the config.json file.")
93
- return value
94
-
95
-
96
- class base_utils:
97
- """
98
- A utility class providing miscellaneous static methods for processing and analyzing text data,
99
- particularly from PDF documents and filenames. This class also includes methods for file operations.
100
-
101
- This class encapsulates the functionality of extracting key information from text, such as scores,
102
- reasoning, and IDs, locating specific data within a DataFrame based on an ID extracted from a filename,
103
- and reading content from files.
104
-
105
- Attributes:
106
- None (This class contains only static methods and does not maintain any state)
107
-
108
- Methods:
109
- extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
110
- Extracts a score and reasoning from a given text using regular expressions.
111
-
112
- extract_id_from_filename(filename: str) -> Optional[int]:
113
- Extracts an ID from a given filename based on a specified pattern.
114
-
115
- find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
116
- Searches for a row in a DataFrame that matches an ID extracted from a PDF filename.
117
-
118
- read_from_file(file_path: str) -> str:
119
- Reads the content of a file and returns it as a string.
120
- """
121
-
122
- @staticmethod
123
- def read_from_file(file_path: str) -> str:
124
- """
125
- Reads the content of a file and returns it as a string.
126
-
127
- Args:
128
- file_path (str): The path to the file to be read.
129
-
130
- Returns:
131
- str: The content of the file.
132
- """
133
- with open(file_path, 'r') as prompt_file:
134
- prompt = prompt_file.read()
135
- return prompt
136
-
137
- @staticmethod
138
- def extract_id_from_filename(filename: str) -> Optional[int]:
139
- """
140
- Extracts an ID from a filename, assuming a specific format ('Id_{I}.pdf', where {I} is the ID).
141
-
142
- Args:
143
- filename (str): The filename from which to extract the ID.
144
-
145
- Returns:
146
- int: The extracted ID as an integer, or None if the pattern is not found.
147
- """
148
- # Assuming the file name is in the format 'Id_{I}.pdf', where {I} is the ID
149
- match = re.search(r'Id_(\d+).pdf', filename)
150
- if match:
151
- return int(match.group(1)) # Convert to integer if ID is numeric
152
- else:
153
- return None
154
-
155
- @staticmethod
156
- def extract_score_reasoning(text: str) -> Dict[str, Optional[str]]:
157
- """
158
- Extracts score and reasoning from a given text using regular expressions.
159
-
160
- Args:
161
- text (str): The text from which to extract the score and reasoning.
162
-
163
- Returns:
164
- dict: A dictionary containing 'score' and 'reasoning', extracted from the text.
165
- """
166
- # Define regular expression patterns for score and reasoning
167
- score_pattern = r"Score: (\d+)"
168
- reasoning_pattern = r"Reasoning: (.+)"
169
-
170
- # Extract data using regular expressions
171
- score_match = re.search(score_pattern, text)
172
- reasoning_match = re.search(reasoning_pattern, text, re.DOTALL) # re.DOTALL allows '.' to match newlines
173
-
174
- # Extract and return the results
175
- extracted_data = {
176
- "score": score_match.group(1) if score_match else None,
177
- "reasoning": reasoning_match.group(1).strip() if reasoning_match else None
178
- }
179
-
180
- return extracted_data
181
-
182
- @staticmethod
183
- def find_row_for_pdf(pdf_filename: str, dataframe: pd.DataFrame) -> Union[pd.Series, str]:
184
- """
185
- Finds the row in a dataframe corresponding to the ID extracted from a given PDF filename.
186
-
187
- Args:
188
- pdf_filename (str): The filename of the PDF.
189
- dataframe (pandas.DataFrame): The dataframe in which to find the corresponding row.
190
-
191
- Returns:
192
- pandas.Series or str: The matched row from the dataframe or a message indicating
193
- that no matching row or invalid filename was found.
194
- """
195
- pdf_id = base_utils.extract_id_from_filename(pdf_filename)
196
- if pdf_id is not None:
197
- # Assuming the first column contains the ID
198
- matched_row = dataframe[dataframe.iloc[:, 0] == pdf_id]
199
- if not matched_row.empty:
200
- return matched_row
201
- else:
202
- return "No matching row found."
203
- else:
204
- return "Invalid file name."
205
-
206
-
207
- class PDFProcessor_Unstructured:
208
- """
209
- A class to process PDF files, providing functionalities for extracting, categorizing,
210
- and merging elements from a PDF file.
211
-
212
- This class is designed to handle unstructured PDF documents, particularly useful for
213
- tasks involving text extraction, categorization, and data processing within PDFs.
214
-
215
- Attributes:
216
- file_path (str): The full path to the PDF file.
217
- folder_path (str): The directory path where the PDF file is located.
218
- file_name (str): The name of the PDF file.
219
- texts (List[str]): A list to store extracted text chunks.
220
- tables (List[str]): A list to store extracted tables.
221
-
222
-
223
- Methods:
224
- extract_pdf_elements() -> List:
225
- Extracts images, tables, and text chunks from a PDF file.
226
-
227
- categorize_elements(raw_pdf_elements: List) -> None:
228
- Categorizes extracted elements from a PDF into tables and texts.
229
-
230
- merge_chunks() -> List[str]:
231
- Merges text chunks based on punctuation and character case criteria.
232
-
233
- should_skip_chunk(chunk: str) -> bool:
234
- Determines if a chunk should be skipped based on its content.
235
-
236
- should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
237
- Determines if the current chunk should be merged with the next one.
238
-
239
- process_pdf() -> Tuple[List[str], List[str]]:
240
- Processes the PDF by extracting, categorizing, and merging elements.
241
-
242
- process_pdf_file(uploaded_file) -> Tuple[List[str], List[str]]:
243
- Processes an uploaded PDF file to extract and categorize text and tables.
244
- """
245
-
246
- def __init__(self, config: Dict[str, any]):
247
- self.file_path = None
248
- self.folder_path = None
249
- self.file_name = None
250
- self.texts = []
251
- self.tables = []
252
- self.config = config if config is not None else self.default_config()
253
- logger.info(f"Initialized PdfProcessor_Unstructured for file: {self.file_name}")
254
-
255
- @staticmethod
256
- def default_config() -> Dict[str, any]:
257
- """
258
- Returns the default configuration for PDF processing.
259
-
260
- Returns:
261
- Dict[str, any]: Default configuration options.
262
- """
263
- return {
264
- "extract_images": False,
265
- "infer_table_structure": True,
266
- "chunking_strategy": "by_title",
267
- "max_characters": 10000,
268
- "combine_text_under_n_chars": 100,
269
- "strategy": "auto",
270
- "model_name": "yolox"
271
- }
272
-
273
- def extract_pdf_elements(self) -> List:
274
- """
275
- Extracts images, tables, and text chunks from a PDF file.
276
-
277
- Returns:
278
- List: A list of extracted elements from the PDF.
279
- """
280
- logger.info("Starting extraction of PDF elements.")
281
- try:
282
- extracted_elements = partition_pdf(
283
- filename=self.file_path,
284
- extract_images_in_pdf=False,
285
- infer_table_structure=True,
286
- chunking_strategy="by_title",
287
- max_characters=10000,
288
- combine_text_under_n_chars=100,
289
- image_output_dir_path=self.folder_path,
290
- # strategy="fast",
291
- )
292
- logger.info("Extraction of PDF elements completed successfully.")
293
- return extracted_elements
294
- except Exception as e:
295
- raise NotImplementedError(f"Error extracting PDF elements: {e}")
296
-
297
- def categorize_elements(self, raw_pdf_elements: List) -> None:
298
- """
299
- Categorizes extracted elements from a PDF into tables and texts.
300
-
301
- Args:
302
- raw_pdf_elements (List): A list of elements extracted from the PDF.
303
- """
304
- logger.debug("Starting categorization of PDF elements.")
305
- for element in raw_pdf_elements:
306
- element_type = str(type(element))
307
- if "unstructured.documents.elements.Table" in element_type:
308
- self.tables.append(str(element))
309
- elif "unstructured.documents.elements.CompositeElement" in element_type:
310
- self.texts.append(str(element))
311
-
312
- logger.debug("Categorization of PDF elements completed.")
313
-
314
- def merge_chunks(self) -> List[str]:
315
- """
316
- Merges text chunks based on punctuation and character case criteria.
317
-
318
- Returns:
319
- List[str]: A list of merged text chunks.
320
- """
321
- logger.debug("Starting merging of text chunks.")
322
-
323
- merged_chunks = []
324
- skip_next = False
325
-
326
- for i, current_chunk in enumerate(self.texts[:-1]):
327
- next_chunk = self.texts[i + 1]
328
-
329
- if skip_next or self.should_skip_chunk(current_chunk):
330
- skip_next = False; continue  # skip chunks already merged above or filtered out
331
-
332
- if self.should_merge_with_next(current_chunk, next_chunk):
333
- merged_chunks.append(current_chunk + " " + next_chunk)
334
- skip_next = True
335
- else:
336
- merged_chunks.append(current_chunk)
337
-
338
- if not skip_next:
339
- merged_chunks.append(self.texts[-1])
340
-
341
- logger.debug("Merging of text chunks completed.")
342
-
343
- return merged_chunks
344
-
345
- @staticmethod
346
- def should_skip_chunk(chunk: str) -> bool:
347
- """
348
- Determines if a chunk should be skipped based on its content.
349
-
350
- Args:
351
- chunk (str): The text chunk to be evaluated.
352
-
353
- Returns:
354
- bool: True if the chunk should be skipped, False otherwise.
355
- """
356
- return (chunk.lower().startswith(("figure", "fig", "table")) or
357
- not chunk[0].isalnum() or
358
- re.match(r'^\d+\.', chunk))
359
-
360
- @staticmethod
361
- def should_merge_with_next(current_chunk: str, next_chunk: str) -> bool:
362
- """
363
- Determines if the current chunk should be merged with the next one.
364
-
365
- Args:
366
- current_chunk (str): The current text chunk.
367
- next_chunk (str): The next text chunk.
368
-
369
- Returns:
370
- bool: True if the chunks should be merged, False otherwise.
371
- """
372
- return (current_chunk.endswith(",") or
373
- (current_chunk[-1].islower() and next_chunk[0].islower()))
374
-
375
- def process_pdf(self) -> Tuple[List[str], List[str]]:
376
- """
377
- Processes the PDF by extracting, categorizing, and merging elements.
378
-
379
- Returns:
380
- Tuple[List[str], List[str]]: A tuple of merged text chunks and tables.
381
- Note: a warning is logged (nothing extra is returned) when the paper looks like a review rather than a research paper.
382
- """
383
- is_review_paper = False
384
- logger.info("Starting processing of the PDF.")
385
- try:
386
- time_extract = time.time()
387
- raw_pdf_elements = self.extract_pdf_elements()
388
- logger.info(
389
- f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements extracted in {time.time() - time_extract:.2f} seconds.")
390
-
391
- time_review = time.time()
392
- for element in raw_pdf_elements:
393
- text = element.text.split()
394
- for word in text:
395
- if word.lower() == 'review':
396
- logger.warning("!!! this seems to be a review paper and not a research paper. this demo "
397
- "analyses only research papers.")
398
- is_review_paper = True
399
- logging.info(
400
- f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF review check completed in {time.time() - time_review:.2f} seconds.")
401
-
402
- time_categorize = time.time()
403
- self.categorize_elements(raw_pdf_elements)
404
- logger.info(
405
- f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF elements categorized in {time.time() - time_categorize:.2f} seconds.")
406
-
407
- time_merge = time.time()
408
- merged_chunks = self.merge_chunks()
409
- logger.info(
410
- f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PDF text chunks merged in {time.time() - time_merge:.2f} seconds.")
411
- return merged_chunks, self.tables
412
- except Exception as e:
413
- raise NotImplementedError(f"Error processing PDF: {e}")
414
-
415
- def process_pdf_file(self, uploaded_file):
416
- """
417
- Process an uploaded PDF file.
418
-
419
- If a new file is uploaded, the previously stored file is deleted.
420
- The method updates the file path, processes the PDF, and returns the results.
421
-
422
- Parameters:
423
- uploaded_file: The new PDF file uploaded for processing.
424
-
425
- Returns:
426
- The results of processing the PDF file.
427
- """
428
-
429
- logger.info(f"Starting to process the PDF file: {uploaded_file.filename}")
430
-
431
- with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
432
- uploaded_file.save(temp_file.name)
433
- self.file_path = temp_file.name
434
- self.folder_path = os.path.dirname(self.file_path)
435
-
436
- try:
437
- logger.debug(f"Processing PDF at {self.file_path}")
438
- results = self.process_pdf()
439
- title = self.extract_title_from_pdf(self.file_path)
440
- logger.info("PDF processing completed successfully.")
441
- return (*results, title)
442
-
443
- except Exception as e:
444
- logger.error(f"Error processing PDF file: {e}", exc_info=True)
445
- raise
446
- finally:
447
- try:
448
- os.remove(self.file_path)
449
- logger.debug(f"Temporary file {self.file_path} deleted.")
450
- except Exception as e:
451
- logger.warning(f"Error deleting temporary file: {e}", exc_info=True)
452
-
453
- def extract_title_from_pdf(self, uploaded_file):
454
- """
455
- Extracts the title from a PDF file's metadata.
456
-
457
- This function reads the metadata of a PDF file using PyPDF2 and attempts to
458
- extract the title. If the title is present in the metadata, it is returned.
459
- Otherwise, a default message indicating that the title was not found is returned.
460
-
461
- Parameters:
462
- uploaded_file (file): A file object or a path to the PDF file from which
463
- to extract the title. The file must be opened in binary mode.
464
-
465
- Returns:
466
- str: The title of the PDF file as a string. If no title is found, returns
467
- 'Title not found'.
468
- """
469
- # Initialize PDF reader
470
- pdf_reader = PdfReader(uploaded_file)
471
-
472
- # Extract document information
473
- meta = pdf_reader.metadata
474
-
475
- # Retrieve title from document information
476
- title = meta.title if meta and meta.title else 'Title not found'
477
- return title
478
-
479
-
480
-
481
-
482
- class HybridRetriever(BaseRetriever):
483
- """
484
- A hybrid retriever that combines results from vector-based and BM25 retrieval methods.
485
- Inherits from BaseRetriever.
486
-
487
- This class uses two different retrieval methods and merges their results to provide a
488
- comprehensive set of documents in response to a query. It ensures diversity in the
489
- retrieved documents by leveraging the strengths of both retrieval methods.
490
-
491
- Attributes:
492
- vector_retriever: An instance of a vector-based retriever.
493
- bm25_retriever: An instance of a BM25 retriever.
494
-
495
- Methods:
496
- __init__(vector_retriever, bm25_retriever): Initializes the HybridRetriever with vector and BM25 retrievers.
497
- _retrieve(query, **kwargs): Performs the retrieval operation by combining results from both retrievers.
498
- _combine_results(bm25_nodes, vector_nodes): Combines and de-duplicates the results from both retrievers.
499
- """
500
-
501
- def __init__(self, vector_retriever, bm25_retriever):
502
- super().__init__()
503
- self.vector_retriever = vector_retriever
504
- self.bm25_retriever = bm25_retriever
505
- logger.info("HybridRetriever initialized with vector and BM25 retrievers.")
506
-
507
- def _retrieve(self, query: str, **kwargs) -> List:
508
- """
509
- Retrieves and combines results from both vector and BM25 retrievers.
510
-
511
- Args:
512
- query: The query string for document retrieval.
513
- **kwargs: Additional keyword arguments for retrieval.
514
-
515
- Returns:
516
- List: Combined list of unique nodes retrieved from both methods.
517
- """
518
- logger.info(f"Retrieving documents for query: {query}")
519
- try:
520
- bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
521
- vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
522
- combined_nodes = self._combine_results(bm25_nodes, vector_nodes)
523
-
524
- logger.info(f"Retrieved {len(combined_nodes)} unique nodes combining vector and BM25 retrievers.")
525
- return combined_nodes
526
- except Exception as e:
527
- logger.error(f"Error in retrieval: {e}")
528
- raise
529
-
530
- @staticmethod
531
- def _combine_results(bm25_nodes: List, vector_nodes: List) -> List:
532
- """
533
- Combines and de-duplicates results from BM25 and vector retrievers.
534
-
535
- Args:
536
- bm25_nodes: Nodes retrieved from BM25 retriever.
537
- vector_nodes: Nodes retrieved from vector retriever.
538
-
539
- Returns:
540
- List: Combined list of unique nodes.
541
- """
542
- node_ids: Set = set()
543
- combined_nodes = []
544
-
545
- for node in bm25_nodes + vector_nodes:
546
- if node.node_id not in node_ids:
547
- combined_nodes.append(node)
548
- node_ids.add(node.node_id)
549
-
550
- return combined_nodes
551
-
552
-
553
- class PDFQueryEngine:
554
- """
555
- A class to handle the process of setting up a query engine and performing queries on PDF documents.
556
-
557
- This class encapsulates the functionality of creating prompt templates, embedding models, service contexts,
558
- indexes, hybrid retrievers, response synthesizers, and executing queries on the set up engine.
559
-
560
- Attributes:
561
- documents (List): A list of documents to be indexed.
562
- llm (Language Model): The language model to be used for embeddings and queries.
563
- qa_prompt_tmpl (str): Template for creating query prompts.
564
- queries (List[str]): List of queries to be executed.
565
-
566
- Methods:
567
- setup_query_engine(): Sets up the query engine with all necessary components.
568
- execute_queries(): Executes the predefined queries and prints the results.
569
- """
570
-
571
- def __init__(self, documents: List[Any], llm: Any, embed_model: Any, qa_prompt_tmpl: Any):
572
-
573
- self.documents = documents
574
- self.llm = llm
575
- self.embed_model = embed_model
576
- self.qa_prompt_tmpl = qa_prompt_tmpl
577
- self.base_utils = base_utils()
578
-
579
- logger.info("PDFQueryEngine initialized.")
580
-
581
- def setup_query_engine(self):
582
- """
583
- Sets up the query engine by initializing and configuring the embedding model, service context, index,
584
- hybrid retriever (combining vector and BM25 retrievers), and the response synthesizer.
585
-
586
- Components assembled internally (this method takes no arguments):
587
- embed_model: The embedding model to be used.
588
- service_context: The context for providing services to the query engine.
589
- index: The index used for storing and retrieving documents.
590
- hybrid_retriever: The retriever that combines vector and BM25 retrieval methods.
591
- response_synthesizer: The synthesizer for generating responses to queries.
592
-
593
- Returns:
594
- Any: The configured query engine.
595
- """
596
-
597
- try:
598
- logger.info("Initializing the service context for query engine setup.")
599
- service_context = ServiceContext.from_defaults(llm=self.llm, embed_model=self.embed_model)
600
-
601
- logger.info("Creating an index from documents.")
602
- index = VectorStoreIndex.from_documents(documents=self.documents, service_context=service_context)
603
- nodes = service_context.node_parser.get_nodes_from_documents(self.documents)
604
-
605
- logger.info("Setting up vector and BM25 retrievers.")
606
- vector_retriever = index.as_retriever(similarity_top_k=5)
607
- bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
608
- hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)
609
-
610
- logger.info("Configuring the response synthesizer with the prompt template.")
611
- qa_prompt = PromptTemplate(self.qa_prompt_tmpl)
612
- response_synthesizer = get_response_synthesizer(
613
- service_context=service_context,
614
- text_qa_template=qa_prompt,
615
- response_mode="compact",
616
- )
617
-
618
- logger.info("Assembling the query engine with reranker and synthesizer.")
619
- reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")
620
- query_engine = RetrieverQueryEngine.from_args(
621
- retriever=hybrid_retriever,
622
- node_postprocessors=[reranker],
623
- response_synthesizer=response_synthesizer,
624
- )
625
-
626
- logger.info("Query engine setup complete.")
627
- return query_engine
628
- except Exception as e:
629
- logger.error(f"Error during query engine setup: {e}")
630
- raise
631
-
632
- def evaluate_with_llm(self, reg_result: Any, peer_result: Any, guidelines_result: Any, queries: List[str]) -> Tuple[
633
- int, int, float, List[str]]:
634
- """
635
- Evaluate documents using a language model based on various criteria.
636
-
637
- Args:
638
- reg_result (Any): Result related to registration.
639
- peer_result (Any): Result related to peer review.
640
- guidelines_result (Any): Result related to following guidelines.
641
- queries (List[str]): A list of queries to be processed.
642
-
643
- Returns:
644
- Tuple[int, int, float, List[str]]: the total score, the number of criteria met, the score percentage, and the per-query reasoning.
645
- """
646
-
647
- logger.info("Starting evaluation with LLM.")
648
- query_engine = self.setup_query_engine()
649
-
650
- total_score = 0
651
- criteria_met = 0
652
- reasoning = []
653
-
654
- for j, query in enumerate(queries):
655
- # Predefine extracted_data to handle the default case
656
- extracted_data = None
657
-
658
- # Handle special cases based on the value of j and other conditions
659
- if j == 1 and reg_result:
660
- extracted_data = {"score": 1, "reasoning": reg_result[0]}
661
- elif j == 2 and guidelines_result:
662
- extracted_data = {"score": 1,
663
- "reasoning": "The article is published in a journal following EQUATOR-NETWORK reporting guidelines"}
664
- elif j == 8 and (guidelines_result or peer_result):
665
- extracted_data = {"score": 1, "reasoning": "The article is published in a peer reviewed journal."}
666
-
667
- # Handle the default case if none of the special conditions were met
668
- if extracted_data is None:
669
- result = query_engine.query(query).response
670
- extracted_data = self.base_utils.extract_score_reasoning(result)
671
-
672
- if extracted_data['score'] and int(extracted_data["score"]) > 0:
673
- criteria_met += 1
674
- total_score += int(extracted_data["score"])
675
- reasoning.append(extracted_data["reasoning"])
676
-
677
- score_percentage = (float(total_score) / len(queries)) * 100
678
- logger.info("Evaluation completed.")
679
- return total_score, criteria_met, score_percentage, reasoning
680
-
681
-
682
- class MixtralLLM(CustomLLM):
683
- """
684
- A custom language model class for interfacing with the Hugging Face API, specifically using the Mixtral model.
685
-
686
- Attributes:
687
- context_window (int): Number of tokens used for context during inference.
688
- num_output (int): Number of tokens to generate as output.
689
- temperature (float): Sampling temperature for token generation.
690
- model_name (str): Name of the model on Hugging Face's model hub.
691
- api_key (str): API key for authenticating with the Hugging Face API.
692
-
693
- Methods:
694
- metadata: Retrieves metadata about the model.
695
- do_hf_call: Makes an API call to the Hugging Face model.
696
- complete: Generates a complete response for a given prompt.
697
- stream_complete: Streams a series of token completions for a given prompt.
698
- """
699
-
700
- def __init__(self, context_window: int, num_output: int, temperature: float, model_name: str, api_key: str):
701
- """
702
- Initialize the MixtralLLM class with specific configuration values.
703
-
704
- Args:
705
- context_window (int): The number of tokens to consider for context during LLM inference.
706
- num_output (int): The number of tokens to generate in the output.
707
- temperature (float): The sampling temperature to use for generating tokens.
708
- model_name (str): The name of the model to be used from Hugging Face's model hub.
709
- api_key (str): The API key for authentication with Hugging Face's inference API.
710
- """
711
- super().__init__()
712
- self.context_window = context_window
713
- self.num_output = num_output
714
- self.temperature = temperature
715
- self.model_name = model_name
716
- self.api_key = api_key
717
-
718
- @property
719
- def metadata(self) -> LLMMetadata:
720
- """
721
- Retrieves metadata for the Mixtral LLM.
722
-
723
- Returns:
724
- LLMMetadata: An object containing metadata such as context window, number of outputs, and model name.
725
- """
726
- return LLMMetadata(
727
- context_window=self.context_window,
728
- num_output=self.num_output,
729
- model_name=self.model_name,
730
- )
731
-
732
- def do_hf_call(self, prompt: str) -> str:
733
- """
734
- Makes an API call to the Hugging Face model and retrieves the generated response.
735
-
736
- Args:
737
- prompt (str): The input prompt for the model.
738
-
739
- Returns:
740
- str: The text generated by the model in response to the prompt.
741
-
742
- Raises:
743
- Exception: If the API call fails or returns an error.
744
- """
745
- data = {
746
- "inputs": prompt,
747
- "parameters": {"Temperature": self.temperature}
748
- }
749
-
750
- # Makes a POST request to the Hugging Face API to get the model's response
751
- response = requests.post(
752
- f'https://api-inference.huggingface.co/models/{self.model_name}',
753
- headers={
754
- 'authorization': f'Bearer {self.api_key}',
755
- 'content-type': 'application/json',
756
- },
757
- json=data,
758
- stream=True
759
- )
760
-
761
- # Checks for a successful response and parses the generated text
762
- if response.status_code != 200 or not response.json() or 'error' in response.json():
763
- print(f"Error: {response}")
764
- return "Unable to answer for technical reasons."
765
- full_txt = response.json()[0]['generated_text']
766
- # Finds the section of the text following the context separator
767
- offset = full_txt.find("---------------------")
768
- ss = full_txt[offset:]
769
- # Extracts the actual answer from the response
770
- offset = ss.find("Answer:")
771
- return ss[offset + 7:].strip()
772
-
773
- @llm_completion_callback()
774
- def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
775
- """
776
- Generates a complete response for a given prompt using the Hugging Face API.
777
-
778
- Args:
779
- prompt (str): The input prompt for the model.
780
- **kwargs: Additional keyword arguments for the completion.
781
-
782
- Returns:
783
- CompletionResponse: The complete response from the model.
784
- """
785
- response = self.do_hf_call(prompt)
786
- return CompletionResponse(text=response)
787
-
788
- @llm_completion_callback()
789
- def stream_complete(
790
- self, prompt: str, **kwargs: Any
791
- ) -> CompletionResponseGen:
792
- """
793
- Streams a series of token completions as a response for the given prompt.
794
-
795
- This method is useful for streaming responses where each token is generated sequentially.
796
-
797
- Args:
798
- prompt (str): The input prompt for the model.
799
- **kwargs: Additional keyword arguments for the streaming completion.
800
-
801
- Yields:
802
- CompletionResponseGen: A generator yielding each token in the completion response.
803
- """
804
- # Yields the completed response character by character; the underlying HF call is not streamed
805
- response = ""
806
- for token in self.do_hf_call(prompt):
807
- response += token
808
- yield CompletionResponse(text=response, delta=token)
809
-
810
-
811
- class KeywordSearch():
812
- def __init__(self, chunks):
813
- self.chunks = chunks
814
-
815
- def find_journal_name(self, response: str, journal_list: list) -> bool:
816
- """
817
- Searches for a journal name in a given response string.
818
-
819
- This function iterates through a list of known journal names and checks whether any of
820
- these names are present in the response string. It returns True as soon as a journal
821
- name from the list is found in the response, and False if none of the listed journal
822
- names appear in it.
823
-
824
- Args:
825
- response (str): The response string to search for a journal name.
826
- journal_list (list): A list of journal names to search within the response.
827
-
828
- Returns:
829
- bool: True if a journal name from the list occurs in the response, False otherwise.
830
- """
831
- response_lower = response.lower()
832
- for journal in journal_list:
833
- journal_lower = journal.lower()
834
-
835
- if journal_lower in response_lower:
836
- return True
837
-
838
- return False
839
-
840
- def check_registration(self):
841
- """
842
- Check chunks of text for various registration numbers or URLs of registries.
843
- Returns the sentence containing a registration number, or if not found,
844
- returns chunks containing registry URLs.
845
-
846
- Args:
847
- None; the method searches the text chunks supplied at construction (self.chunks).
848
-
849
- Returns:
850
- list of str: List of matching sentences or chunks, or an empty list if no matches are found.
851
- """
852
-
853
- # Patterns for different registration types
854
- patterns = {
855
- "NCT": r"\(?(NCT#?\s*(No\s*)?)(\d{8})\)?",
856
- "ISRCTN": r"(ISRCTN\d{8})",
857
- "EudraCT": r"(\d{4}-\d{6}-\d{2})",
858
- "UMIN-CTR": r"(UMIN\d{9})",
859
- "CTRI": r"(CTRI/\d{4}/\d{2}/\d{6})"
860
- }
861
-
862
- # Registry URLs
863
- registry_urls = [
864
- "www.anzctr.org.au",
865
- "anzctr.org.au",
866
- "www.clinicaltrials.gov",
867
- "clinicaltrials.gov",
868
- "www.ISRCTN.org",
869
- "ISRCTN.org",
870
- "www.umin.ac.jp/ctr/index/htm",
871
- "umin.ac.jp/ctr/index/htm",
872
- "www.onderzoekmetmensen.nl/en",
873
- "onderzoekmetmensen.nl/en",
874
- "eudract.ema.europa.eu",
875
- "www.eudract.ema.europa.eu"
876
- ]
877
-
878
- # Check each chunk for registration numbers
879
- for chunk in self.chunks:
880
- # Split chunk into sentences
881
- sentences = re.split(r'(?<=[.!?]) +', chunk)
882
-
883
- # Check each sentence for any registration number
884
- for sentence in sentences:
885
- for pattern in patterns.values():
886
- if re.search(pattern, sentence):
887
- return [sentence] # Return immediately if a registration number is found
888
-
889
- # If no registration number found, check for URLs in chunks
890
- matching_chunks = []
891
- for chunk in self.chunks:
892
- if any(url in chunk for url in registry_urls):
893
- matching_chunks.append(chunk)
894
-
895
- return matching_chunks
896
-
897
-
898
- class StringExtraction():
899
- """
900
- A class to handle the process of extracting query strings from complete LLM responses.
901
-
902
- This class encapsulates the functionality of extracting the original ground truth from a labelled-data CSV and query strings from responses. Note that
903
- LLMs may format their answers differently depending on the model or the prompting technique, in which case extract_original_prompt may not give
904
- satisfactory results; it is then best to write your own string extraction method.
905
-
906
-
907
- Methods:
908
- extract_original_prompt():
909
- extraction_ground_truth():
910
- """
911
-
912
- def extract_original_prompt(self, result):
913
- r1 = result.response.strip().split("\n")
914
- binary_response = ""
915
- explanation_response = ""
916
- for r in r1:
917
- if binary_response == "" and (r.find("Yes") >= 0 or r.find("No") >= 0):
918
- binary_response = r
919
- elif r.find("Reasoning:") >= 0:
920
- cut = r.find(":")
921
- explanation_response += r[cut + 1:].strip()
922
-
923
- return binary_response, explanation_response
924
-
925
- def extraction_ground_truth(self, paper_name, labelled_data):
926
- id = int(paper_name[paper_name.find("_") + 1:paper_name.find(".pdf")])
927
- id_row = labelled_data[labelled_data["id"] == id]
928
- ground_truth = id_row.iloc[:, 2:11].values.tolist()[0]
929
- binary_ground_truth = []
930
- explanation_ground_truth = []
931
- for g in ground_truth:
932
- if len(g) > 0:
933
- binary_ground_truth.append("Yes")
934
- explanation_ground_truth.append(g)
935
- else:
936
- binary_ground_truth.append("No")
937
- explanation_ground_truth.append("The article does not provide any relevant information.")
938
- return binary_ground_truth, explanation_ground_truth
939
-
940
-
941
- class EvaluationMetrics():
942
- """
943
-
944
- This class encapsulates the evaluation methods that have been used in the project.
945
-
946
- Attributes:
947
- explanation_response = a list of detailed responses from the LLM, one per query
948
- explanation_ground_truth = the list of ground-truth explanations, one per query
949
-
950
- Methods:
951
- metric_cosine_similarity(): Computes cosine similarity between response and ground-truth embeddings.
952
- metric_rouge(): Computes ROUGE scores of the responses against the ground truth.
953
- binary_accuracy(): Computes the fraction of binary answers that match the ground truth.
954
- """
955
-
956
- def __init__(self, explanation_response, explanation_ground_truth, embedding_model):
957
- self.explanation_response = explanation_response
958
- self.explanation_ground_truth = explanation_ground_truth
959
- self.embedding_model = embedding_model
960
-
961
- def metric_cosine_similarity(self):
962
- ground_truth_embedding = self.embedding_model.encode(self.explanation_ground_truth)
963
- explanation_response_embedding = self.embedding_model.encode(self.explanation_response)
964
- return np.diag(cosine_similarity(ground_truth_embedding, explanation_response_embedding))
965
-
966
- def metric_rouge(self):
967
- rouge = evaluate.load("rouge")
968
- results = rouge.compute(predictions=self.explanation_response, references=self.explanation_ground_truth)
969
- return results
970
-
971
- def binary_accuracy(self, binary_response, binary_ground_truth):
972
- count = 0
973
- if len(binary_response) != len(binary_ground_truth):
974
- return "Arrays which are to be compared has different lengths."
975
- else:
976
- for i in range(len(binary_response)):
977
- if binary_response[i] == binary_ground_truth[i]:
978
- count += 1
979
- return np.round(count / len(binary_response), 2)
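
The deleted RAG_utils.py merged BM25 and vector retrieval through HybridRetriever._combine_results. A self-contained sketch of that de-duplication step, using a stand-in Node type instead of llama_index's node objects:

from dataclasses import dataclass

@dataclass
class Node:  # stand-in for llama_index's node objects
    node_id: str
    text: str

def combine_results(bm25_nodes, vector_nodes):
    # Keep the first occurrence of each node_id, BM25 hits first.
    seen, combined = set(), []
    for node in bm25_nodes + vector_nodes:
        if node.node_id not in seen:
            combined.append(node)
            seen.add(node.node_id)
    return combined

bm25 = [Node("a", "chunk A"), Node("b", "chunk B")]
vector = [Node("b", "chunk B"), Node("c", "chunk C")]
print([n.node_id for n in combine_results(bm25, vector)])  # ['a', 'b', 'c']
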
librarymed/local/__init__.py DELETED
File without changes
librarymed/local/app_local.py DELETED
@@ -1,160 +0,0 @@
1
- import time
2
- import argparse
3
- import logging
4
- import os
5
-
6
- import openai
7
- from flask import Flask, flash, request, render_template, redirect
8
- from llama_index import Document
9
- from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
10
- from llama_index.llms import OpenAI
11
-
12
- from librarymed.local.RAG_utils import PDFProcessor_Unstructured, PDFQueryEngine, MixtralLLM, KeywordSearch, base_utils, \
13
- ConfigManager
14
-
15
- app = Flask(__name__)
16
- app.config['SECRET_KEY'] = 'librarymed super secret key'
17
-
18
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
19
- config_manager = ConfigManager()
20
- config_manager.load_config("api", "Config/api_config.json")
21
- config_manager.load_config("model", "Config/model_config.json")
22
- app.config['user_config'] = config_manager
23
-
24
-
25
- def allowed_file(filename, allowed_extensions):
26
- """ Helper function to check if the file extension is allowed """
27
- return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
28
-
29
-
30
- @app.route('/', methods=['GET'])
31
- def __get__():
32
- score = 0
33
- criteria_met = 0
34
- title = ""
35
- author_info = ""
36
- reasoning = ""
37
-
38
- return render_template('index.html',
39
- title=title,
40
- author=author_info,
41
- score=score,
42
- criteria_met=criteria_met,
43
- reasoning=reasoning,
44
- )
45
-
46
-
47
- @app.route('/upload', methods=['POST'])
48
- def upload():
49
- config = app.config['user_config']
50
- openai.api_key = config.get_config_value("api", "OPENAI_API_KEY")
51
- hf_token = config.get_config_value("api", "HF_TOKEN")
52
- embed = config.get_config_value("model", "embeddings")
53
- embed_model_name = config.get_config_value("model", "embeddings_model")
54
- llm_model = config.get_config_value("model", "llm_model")
55
- model_temperature = config.get_config_value("model", "model_temp")
56
- output_token_size = config.get_config_value("model", "max_tokens")
57
- model_context_window = config.get_config_value("model", "context_window")
58
- gpt_prompt_path = config.get_config_value("model", "GPT_PROMPT_PATH")
59
- mistral_prompt_path = config.get_config_value("model", "MISTRAL_PROMPT_PATH")
60
- info_prompt_path = config.get_config_value("model", "INFO_PROMPT_PATH")
61
- peer_review_journals_path = config.get_config_value("model", "peer_review_journals_path")
62
- eq_network_journals_path = config.get_config_value("model", "eq_network_journals_path")
63
- queries = config.get_config_value("model", "queries")
64
- num_criteria = len(config.get_config_value("model", "criteria"))
65
- author_query = config.get_config_value("model", "author_query")
66
- journal_query = config.get_config_value("model", "journal_query")
67
-
68
- # Check if the post request has the file part
69
- if 'file' not in request.files:
70
- flash('No file part')
71
- return redirect(request.url)
72
- file = request.files['file']
73
- # If user does not select file, browser also submits an empty part without filename
74
- if file.filename == '':
75
- flash('No selected file')
76
- return redirect(request.url)
77
- if file and allowed_file(file.filename, config.get_config_value("model", "allowed_extensions")):
78
- try:
79
- # Process the PDF file
80
- pdf_processor = PDFProcessor_Unstructured(config.get_config_value("model", "pdf_processing"))
81
- merged_chunks, tables, title = pdf_processor.process_pdf_file(file)
82
- documents = [Document(text=t) for t in merged_chunks]
83
-
84
- utils = base_utils()
85
-
86
- # LLM Model choice
87
- if 'gpt' in llm_model.lower(): # TODO tested "gpt-4" and "gpt-3.5-turbo":
88
- llm = OpenAI(model=llm_model, temperature=model_temperature, max_tokens=output_token_size)
89
- prompt_template = utils.read_from_file(gpt_prompt_path)
90
-
91
- elif llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
92
- if any(param is None for param in
93
- [model_context_window, output_token_size, model_temperature, hf_token]):
94
- raise ValueError("All parameters are required for Mistral LLM.")
95
-
96
- llm = MixtralLLM(context_window=model_context_window, num_output=output_token_size,
97
- temperature=model_temperature, model_name=llm_model, api_key=hf_token)
98
- prompt_template = utils.read_from_file(mistral_prompt_path)
99
-
100
- else:
101
- raise NotImplementedError(f"Error initializing language model '{llm_model}'")
102
-
103
- # Embedding model choice for RAG
104
- if embed == "openai":
105
- embed_model = OpenAIEmbedding()
106
-
107
- elif embed == "huggingface":
108
- if embed_model_name is None:
109
- # Set to default model if name not provided
110
- embed_model_name = "BAAI/bge-small-en-v1.5"
111
- embed_model = HuggingFaceEmbedding(embed_model_name)
112
- else:
113
- # Use the specified model name
114
- embed_model = HuggingFaceEmbedding(embed_model_name)
115
-
116
- else:
117
- raise NotImplementedError(f"Error initializing embedding model: {embed}")
118
-
119
- # Prompts and Queries
120
- info_prompt = utils.read_from_file(info_prompt_path)
121
-
122
- peer_review_journals = utils.read_from_file(peer_review_journals_path)
123
- eq_network_journals = utils.read_from_file(eq_network_journals_path)
124
-
125
- peer_review_journals_list = peer_review_journals.split('\n')
126
- eq_network_journals_list = eq_network_journals.split('\n')
127
-
128
- modified_journal_query = "Is the given research paper published in any of the following journals: " + ", ".join(
129
- peer_review_journals_list) + "?"
130
-
131
- pdf_info_query = PDFQueryEngine(documents, llm, embed_model, (info_prompt))
132
- info_query_engine = pdf_info_query.setup_query_engine()
133
- journal_result = info_query_engine.query(modified_journal_query).response
134
- author_info = info_query_engine.query(author_query).response
135
-
136
- pdf_criteria_query = PDFQueryEngine(documents, llm, embed_model, (prompt_template))
137
-
138
- # Check for prior registration
139
- nlp_methods = KeywordSearch(merged_chunks)
140
- eq_journal_result = nlp_methods.find_journal_name(journal_result, eq_network_journals_list)
141
- peer_journal_result = nlp_methods.find_journal_name(journal_result, peer_review_journals_list)
142
- registration_result = nlp_methods.check_registration()
143
-
144
- # Evaluate with the selected LLM
145
- total_score, criteria_met, score_percentage, reasoning = pdf_criteria_query.evaluate_with_llm(
146
- registration_result, peer_journal_result, eq_journal_result, queries)
147
- score = f"{round((total_score / num_criteria) * 100)}/100"
148
-
149
- except Exception as e:
150
- flash('An error occurred while processing the file. Error: ' + str(e))
151
- return redirect(request.url)
152
-
153
- # e.g. score: 56 / 100 - criteria_met: 5 - author_info: Direct
154
- return render_template('index.html',
155
- title=title,
156
- author=author_info,
157
- score=score,
158
- criteria_met=criteria_met,
159
- reasoning=reasoning,
160
- )
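
The deleted Flask app gated uploads on file extension before handing the PDF to the processor. The same check, isolated as a runnable snippet:

def allowed_file(filename: str, allowed_extensions) -> bool:
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions

assert allowed_file("paper.PDF", {"pdf"})
assert not allowed_file("notes.txt", {"pdf"})
assert not allowed_file("README", {"pdf"})  # no extension at all
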
librarymed/local/templates/index.html DELETED
@@ -1,187 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <title>Upload and Results</title>
5
- <!-- Include Google Fonts -->
6
- <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
7
- <style>
8
- body {
9
- font-family: 'Roboto', sans-serif;
10
- background-color: #f4f4f4;
11
- overflow: auto;
12
- width: 100%;
13
- margin: 0;
14
- padding: 0;
15
- display: flex;
16
- flex-direction: column; /* Stack flex items vertically */
17
- align-items: center; /* Center items horizontally */
18
- justify-content: flex-start; /* Align items to the start of the container vertically */
19
- min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
20
- }
21
-
22
- table {
23
- width: 100%; /* Adjust the width as needed */
24
- border-collapse: collapse; /* Collapse borders for a tighter look */
25
- }
26
-
27
- th, td {
28
- border: 1px solid #ddd; /* Adjust the border size as needed */
29
- text-align: left;
30
- padding: 5px; /* Reduce padding to decrease cell spacing */
31
- height: 30px; /* Optionally reduce the height of the cells */
32
- }
33
- .parent-element {
34
- overflow: visible; /* Ensures content is not cut off */
35
- }
36
- .container {
37
- background-color: white;
38
- overflow: auto;
39
- border-radius: 8px;
40
- box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
41
- padding: 40px;
42
- width: 100%; /* Set width to 100% of the viewport */
43
- max-width: 700px;
44
- }
45
- .score-bar-container {
46
- position: relative;
47
- margin-top: 20px; /* Space above the score bar */
48
- max-width: 100%; /* Ensures the container does not exceed the parent width */
49
- }
50
- .score-very-good-fill {
51
- background-color: #4CAF50; /* Green */
52
- }
53
-
54
- .score-good-fill {
55
- background-color: #FFEB3B; /* Yellow */
56
- }
57
-
58
- .score-ok-fill {
59
- background-color: #FF9800; /* Orange */
60
- }
61
-
62
- .score-bad-fill {
63
- background-color: #f44336; /* Red */
64
- }
65
-
66
- .score-very-bad-fill {
67
- background-color: #9E9E9E; /* Grey */
68
- }
69
- .score-very-good-text {
70
- color: #4CAF50; /* Green */
71
- }
72
-
73
- .score-good-text {
74
- color: #FFEB3B; /* Yellow */
75
- }
76
-
77
- .score-ok-text {
78
- color: #FF9800; /* Orange */
79
- }
80
-
81
- .score-bad-text {
82
- color: #f44336; /* Red */
83
- }
84
-
85
- .score-very-bad-text {
86
- color: #9E9E9E; /* Grey */
87
- }
88
-
89
- .score-bar {
90
- background-color: #ddd;
91
- border-radius: 10px;
92
- height: 20px;
93
- width: 100%; /* Adjusted to take the full width */
94
- display: inline-block; /* Allows the score text to sit next to the score bar */
95
- vertical-align: middle; /* Aligns score bar and text vertically */
96
- }
97
-
98
- .score-fill {
99
- height: 100%;
100
- border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
101
- display: inline-block;
102
- vertical-align: middle;
103
- }
104
-
105
- .score-text {
106
- display: inline-block;
107
- vertical-align: middle; /* Align with the score bar */
108
- font-weight: bold; /* Make the score text bold */
109
- margin-left: 10px; /* Space between the score bar and score text */
110
- }
111
-
112
- .score-title {
113
- font-size: 20px;
114
- font-weight: bold;
115
- margin: 20px 0;
116
- color: #333;
117
- }
118
- .major-issues {
119
- text-align: left; /* Aligns the major issues to the left */
120
- padding-left: 20px; /* Padding for the bullet list */
121
- list-style: inside disc; /* Bullet style */
122
- }
123
- form {
124
- margin-bottom: 20px;
125
- }
126
- input[type="file"] {
127
- margin-bottom: 10px;
128
- }
129
- input[type="submit"] {
130
- cursor: pointer;
131
- margin-top: 10px;
132
- padding: 10px 20px;
133
- border: none;
134
- background-color: #4CAF50;
135
- color: white;
136
- border-radius: 5px;
137
- font-size: 16px;
138
- font-weight: bold;
139
- }
140
- input[type="submit"]:hover {
141
- background-color: #45a049;
142
- }
143
- </style>
144
- </head>
145
- <body>
146
- <div class="container">
147
- <h2>Upload PDF and View Results</h2>
148
-
149
- <!-- Upload Form -->
150
- <form action="/upload" method="post" enctype="multipart/form-data">
151
- <input type="file" name="file" required>
152
- <input type="submit" value="Upload">
153
- </form>
154
-
155
- <!-- Results Section -->
156
- {% if score %}
157
- <!-- GPT-4 Score Bar -->
158
- <div class="score-title">Score:</div>
159
- <div class="score-bar-container">
160
- <div class="score-bar">
161
- <div class="score-fill {{
162
- 'score-very-good-fill' if criteria_met == 9 else
163
- 'score-good-fill' if criteria_met >= 7 else
164
- 'score-ok-fill' if criteria_met >= 5 else
165
- 'score-bad-fill' if criteria_met >= 3 else
166
- 'score-very-bad-fill' }}" style="width: {{ score_percentage_gpt4 }}%;"></div>
167
- </div>
168
- <div class="score-text">{{ score }}</div>
169
- </div>
170
-
171
- <h3>Title:</h3>
172
- <p> {{title}}</p>
173
-
174
- <h3>Author Information:</h3>
175
- <p> {{author}}</p>
176
-
177
- <h3>Reasoning:</h3>
178
- <ul class="major-issues">
179
- {% for issue in reasoning %}
180
- <li>{{ issue }}</li>
181
- {% endfor %}
182
- </ul>
183
-
184
- {% endif %}
185
- </div>
186
- </body>
187
- </html>
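
The deleted template colors the score bar by how many of the nine criteria were met. The same thresholds, expressed in Python for reference:

def score_fill_class(criteria_met: int) -> str:
    # Thresholds copied from the Jinja conditional in the template above.
    if criteria_met == 9:
        return "score-very-good-fill"  # green
    if criteria_met >= 7:
        return "score-good-fill"       # yellow
    if criteria_met >= 5:
        return "score-ok-fill"         # orange
    if criteria_met >= 3:
        return "score-bad-fill"        # red
    return "score-very-bad-fill"       # grey

print(score_fill_class(5))  # -> score-ok-fill
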
librarymed/local/templates/upload_and_results.html DELETED
@@ -1,227 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <title>Upload and Results</title>
5
- <!-- Include Google Fonts -->
6
- <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap" rel="stylesheet">
7
- <style>
8
- body {
9
- font-family: 'Roboto', sans-serif;
10
- background-color: #f4f4f4;
11
- overflow: auto;
12
- width: 100%;
13
- margin: 0;
14
- padding: 0;
15
- display: flex;
16
- flex-direction: column; /* Stack flex items vertically */
17
- align-items: center; /* Center items horizontally */
18
- justify-content: flex-start; /* Align items to the start of the container vertically */
19
- min-height: 100vh; /* Use min-height instead of height to accommodate content taller than the viewport */
20
- }
21
-
22
- table {
23
- width: 100%; /* Adjust the width as needed */
24
- border-collapse: collapse; /* Collapse borders for a tighter look */
25
- }
26
-
27
- th, td {
28
- border: 1px solid #ddd; /* Adjust the border size as needed */
29
- text-align: left;
30
- padding: 5px; /* Reduce padding to decrease cell spacing */
31
- height: 30px; /* Optionally reduce the height of the cells */
32
- }
33
- .parent-element {
34
- overflow: visible; /* Ensures content is not cut off */
35
- }
36
- .container {
37
- background-color: white;
38
- overflow: auto;
39
- border-radius: 8px;
40
- box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
41
- padding: 40px;
42
- width: 100%; /* Set width to 100% of the viewport */
43
- max-width: 700px;
44
- }
45
- .score-bar-container {
46
- position: relative;
47
- margin-top: 20px; /* Space above the score bar */
48
- max-width: 100%; /* Ensures the container does not exceed the parent width */
49
- }
50
- .score-very-good-fill {
51
- background-color: #4CAF50; /* Green */
52
- }
53
-
54
- .score-good-fill {
55
- background-color: #FFEB3B; /* Yellow */
56
- }
57
-
58
- .score-ok-fill {
59
- background-color: #FF9800; /* Orange */
60
- }
61
-
62
- .score-bad-fill {
63
- background-color: #f44336; /* Red */
64
- }
65
-
66
- .score-very-bad-fill {
67
- background-color: #9E9E9E; /* Grey */
68
- }
69
- .score-very-good-text {
70
- color: #4CAF50; /* Green */
71
- }
72
-
73
- .score-good-text {
74
- color: #FFEB3B; /* Yellow */
75
- }
76
-
77
- .score-ok-text {
78
- color: #FF9800; /* Orange */
79
- }
80
-
81
- .score-bad-text {
82
- color: #f44336; /* Red */
83
- }
84
-
85
- .score-very-bad-text {
86
- color: #9E9E9E; /* Grey */
87
- }
88
-
89
- .score-bar {
90
- background-color: #ddd;
91
- border-radius: 10px;
92
- height: 20px;
93
- width: 100%; /* Adjusted to take the full width */
94
- display: inline-block; /* Allows the score text to sit next to the score bar */
95
- vertical-align: middle; /* Aligns score bar and text vertically */
96
- }
97
-
98
- .score-fill {
99
- height: 100%;
100
- border-radius: 10px 0 0 10px; /* Rounded corners on the left side */
101
- display: inline-block;
102
- vertical-align: middle;
103
- }
104
-
105
- .score-text {
106
- display: inline-block;
107
- vertical-align: middle; /* Align with the score bar */
108
- font-weight: bold; /* Make the score text bold */
109
- margin-left: 10px; /* Space between the score bar and score text */
110
- }
111
-
112
- .score-title {
113
- font-size: 20px;
114
- font-weight: bold;
115
- margin: 20px 0;
116
- color: #333;
117
- }
118
- .major-issues {
119
- text-align: left; /* Aligns the major issues to the left */
120
- padding-left: 20px; /* Padding for the bullet list */
121
- list-style: inside disc; /* Bullet style */
122
- }
123
- form {
124
- margin-bottom: 20px;
125
- }
126
- input[type="file"] {
127
- margin-bottom: 10px;
128
- }
129
- input[type="submit"] {
130
- cursor: pointer;
131
- margin-top: 10px;
132
- padding: 10px 20px;
133
- border: none;
134
- background-color: #4CAF50;
135
- color: white;
136
- border-radius: 5px;
137
- font-size: 16px;
138
- font-weight: bold;
139
- }
140
- input[type="submit"]:hover {
141
- background-color: #45a049;
142
- }
143
- </style>
144
- </head>
145
- <body>
146
- <div class="container">
147
- <h2>Upload PDF and View Results</h2>
148
-
149
- <!-- Upload Form -->
150
- <form action="/upload" method="post" enctype="multipart/form-data">
151
- <input type="file" name="file" required>
152
- <input type="submit" value="Upload">
153
- </form>
154
-
155
- <!-- Results Section -->
156
- {% if gpt4_score is not none or mistral_score is not none %}
157
- <!-- GPT-4 Score Bar -->
158
- <div class="score-title">Score for GPT-4:</div>
159
- <div class="score-bar-container">
160
- <div class="score-bar">
161
- <div class="score-fill {{
162
- 'score-very-good-fill' if criteria_met_gpt4 == 9 else
163
- 'score-good-fill' if criteria_met_gpt4 >= 7 else
164
- 'score-ok-fill' if criteria_met_gpt4 >= 5 else
165
- 'score-bad-fill' if criteria_met_gpt4 >= 3 else
166
- 'score-very-bad-fill' }}" style="width: {{ score_percentage_gpt4 }}%;"></div>
167
- </div>
168
- <div class="score-text">{{ total_score_gpt4 }}/9</div>
169
- </div>
170
-
171
- <!-- Mistral Score Bar -->
172
- <div class="score-title">Score for Mistral:</div>
173
- <div class="score-bar-container">
174
- <div class="score-bar">
175
- <div class="score-fill {{
176
- 'score-very-good-fill' if criteria_met_mistral == 9 else
177
- 'score-good-fill' if criteria_met_mistral >= 7 else
178
- 'score-ok-fill' if criteria_met_mistral >= 5 else
179
- 'score-bad-fill' if criteria_met_mistral >= 3 else
180
- 'score-very-bad-fill' }}" style="width: {{ score_percentage_mistral }}%;"></div>
181
- </div>
182
- <div class="score-text">{{ total_score_mistral }}/9</div>
183
- </div>
184
-
185
- <!-- Reasoning for GPT-4 -->
186
- <h3>Reasoning from GPT-4:</h3>
187
- <ul class="major-issues">
188
- {% for issue in reasoning_gpt4 %}
189
- <li>{{ issue }}</li>
190
- {% endfor %}
191
- </ul>
192
-
193
- <!-- Reasoning for Mistral -->
194
- <h3>Reasoning from Mistral:</h3>
195
- <ul class="major-issues">
196
- {% for issue in reasoning_mistral %}
197
- <li>{{ issue }}</li>
198
- {% endfor %}
199
- </ul>
200
- <!-- Insert the Criteria Table Section Here -->
201
- {% if combined_criteria_table %}
202
- <h3>Criteria Evaluation</h3>
203
- <table>
204
- <thead>
205
- <tr>
206
- <th>Criteria Number</th>
207
- <th>GPT-4 output</th>
208
- <th>Mistral output</th>
209
- <th>Ground truth</th>
210
- </tr>
211
- </thead>
212
- <tbody>
213
- {% for row in combined_criteria_table %}
214
- <tr>
215
- <td>{{ row['Criteria Number'] }}</td>
216
- <td>{{ 'Yes' if row['Score GPT-4'] == 1 else 'No' }}</td>
217
- <td>{{ 'Yes' if row['Score Mistral'] == 1 else 'No' }}</td>
218
- <td>{{ 'Yes' if row['ground truth'] else 'No' }}</td>
219
- </tr>
220
- {% endfor %}
221
- </tbody>
222
- </table>
223
- {% endif %}
224
- {% endif %}
225
- </div>
226
- </body>
227
- </html>
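
upload_and_results.html expected a combined_criteria_table of per-criterion rows. How those rows were built is not part of this commit, so the constructor below is an assumption; only the dict keys are copied from the template:

def build_criteria_row(number, gpt4_score, mistral_score, ground_truth):
    # Keys match the template's row lookups; 1/0 scores render as Yes/No.
    return {
        "Criteria Number": number,
        "Score GPT-4": gpt4_score,
        "Score Mistral": mistral_score,
        "ground truth": ground_truth,
    }

combined_criteria_table = [
    build_criteria_row(1, 1, 0, True),
    build_criteria_row(2, 1, 1, True),
]
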
librarymed/main.py CHANGED
@@ -1,22 +1,12 @@
1
- import argparse
2
- import logging
3
  import os
 
4
  from dotenv import load_dotenv
5
-
6
  load_dotenv()
 
 
 
7
 
8
  if __name__ == '__main__':
9
- args_parse = argparse.ArgumentParser(description="LibraryMed")
10
- args_parse.add_argument("--local", help="Run inferface v0.1.0 by the fellows", action="store_true")
11
- args = args_parse.parse_args()
12
  port = os.getenv("PORT") or 80
13
-
14
- if args.local:
15
- from .local.app_local import app
16
- logging.info("Run LibraryMed interface v0.1.0 developed by the fellows")
17
- app.run(debug=True, host="0.0.0.0", port=port)
18
-
19
- else:
20
- from kromin.app_librarymed import app
21
- logging.info("Run LibraryMed interface v0.2.0 developed by Kromin")
22
- app.run(debug=True, host="0.0.0.0", port=port)
 
 
 
1
  import os
2
+ import logging
3
  from dotenv import load_dotenv
 
4
  load_dotenv()
5
+ from .app_librarymed import app
6
+
7
+ app = app
8
 
9
  if __name__ == '__main__':
 
 
 
10
  port = os.getenv("PORT") or 80
11
+ logging.info("Run LibraryMed interface v0.2.0 developed by Kromin")
12
+ app.run(debug=True, host="0.0.0.0", port=port)
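
With app re-bound at module level, main.py now works both as a script and as an import target. A minimal usage sketch (the port value is an example; any WSGI launcher that resolves a dotted path such as librarymed.main:app can serve the exported Flask app):

import os

# Importing the module triggers load_dotenv() and exposes `app`.
from librarymed.main import app

if __name__ == "__main__":
    # int() coercion, since os.getenv returns a string when PORT is set.
    app.run(host="0.0.0.0", port=int(os.getenv("PORT") or 7860))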