agoyal496 committed on
Commit
24412da
·
1 Parent(s): d16857c

Documentation

Browse files
app.py CHANGED
@@ -16,15 +16,39 @@ llm_model_name = "gpt-4o-mini"
16
  llm_generator = None
17
 
18
 
19
- def set_api_key(api_key: str):
 
 
 
 
 
 
 
 
 
 
 
 
20
  if api_key.strip():
21
  os.environ["OPENAI_API_KEY"] = api_key
22
  else:
23
  raise gr.Error("Please provide a valid API key")
24
 
25
 
26
- def process_inputs(api_key: str, pdf_file, questions: str):
 
 
 
 
 
 
 
 
 
27
 
 
 
 
28
  # Setup Api KEY
29
  set_api_key(api_key)
30
 
 
16
  llm_generator = None
17
 
18
 
19
+ def set_api_key(api_key: str) -> None:
20
+ """
21
+ Sets the OpenAI API key as an environment variable.
22
+
23
+ Parameters:
24
+ api_key (str): The OpenAI API key to be set.
25
+
26
+ Returns:
27
+ None: This function does not return any value.
28
+
29
+ Raises:
30
+ gr.Error: If the provided API key is empty or consists only of whitespace characters.
31
+ """
32
  if api_key.strip():
33
  os.environ["OPENAI_API_KEY"] = api_key
34
  else:
35
  raise gr.Error("Please provide a valid API key")
36
 
37
 
38
+ def process_inputs(api_key: str, pdf_file, questions: str) -> str:
39
+ """
40
+ This function processes the inputs, sets up the API key, validates the PDF file, parses the PDF,
41
+ creates a vector store, initializes the LLM generator, validates the questions, retrieves top similar chunks,
42
+ generates answers, and returns the output in JSON format.
43
+
44
+ Parameters:
45
+ api_key (str): The OpenAI API key for accessing the LLM model.
46
+ pdf_file (File): The uploaded PDF file.
47
+ questions (str): The list of questions, one per line.
48
 
49
+ Returns:
50
+ str: The output in JSON format containing the answers to the questions.
51
+ """
52
  # Setup Api KEY
53
  set_api_key(api_key)
54
 
utils/document_parsing.py CHANGED
@@ -10,6 +10,17 @@ class DocParsing:
10
  chunk_overlap = 50
11
 
12
  def __init__(self, file_path, model_name, max_model_tokens=384):
 
 
 
 
 
 
 
 
 
 
 
13
  self.file_path = file_path
14
 
15
  # Initialize the tokenizer for all-MiniLM
@@ -18,16 +29,59 @@ class DocParsing:
18
  self.max_model_tokens = max_model_tokens
19
 
20
  def process_pdf(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  self.load_pdf()
22
  self.create_chunks()
23
  return self.chunks
24
 
25
  def load_pdf(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  loader = PyPDFLoader(self.file_path)
27
  self.documents = loader.load()
28
 
29
  def create_chunks(self):
30
- # Split documents into chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  self.chunks = []
32
  for doc in self.documents:
33
  self.chunks.extend(
@@ -37,10 +91,37 @@ class DocParsing:
37
  )
38
 
39
  def tokenize(self, text):
 
 
 
 
 
 
 
 
 
 
 
 
40
  return self.tokenizer.encode(text, add_special_tokens=False)
41
 
42
  def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
43
- """Split a single Document into multiple Documents based on token length."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  tokens = self.tokenize(doc.page_content)
45
  chunks = []
46
  start = 0
 
10
  chunk_overlap = 50
11
 
12
  def __init__(self, file_path, model_name, max_model_tokens=384):
13
+ """
14
+ Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.
15
+
16
+ Parameters:
17
+ file_path (str): The path to the PDF file to be processed.
18
+ model_name (str): The name of the transformer model to be used for tokenization.
19
+ max_model_tokens (int, optional): The maximum number of tokens allowed for each chunk. Defaults to 384.
20
+
21
+ Returns:
22
+ None
23
+ """
24
  self.file_path = file_path
25
 
26
  # Initialize the tokenizer for all-MiniLM
 
29
  self.max_model_tokens = max_model_tokens
30
 
31
  def process_pdf(self):
32
+ """
33
+ Process the PDF file by loading it, splitting it into chunks, and returning the chunks.
34
+
35
+ This function first calls the `load_pdf` method to load the PDF file into a list of Document objects.
36
+ Then, it calls the `create_chunks` method to split each Document into smaller chunks based on the specified
37
+ chunk size and overlap. Finally, it returns the list of chunks.
38
+
39
+ Parameters:
40
+ None
41
+
42
+ Returns:
43
+ list: A list of Document objects, where each Document represents a chunk of the PDF file.
44
+ """
45
  self.load_pdf()
46
  self.create_chunks()
47
  return self.chunks
48
 
49
  def load_pdf(self):
50
+ """
51
+ Load the PDF file specified by the file_path attribute into a list of Document objects.
52
+
53
+ This function uses the PyPDFLoader class from the langchain library to load the PDF file.
54
+ The loaded Document objects are stored in the self.documents attribute.
55
+
56
+ Parameters:
57
+ None
58
+
59
+ Returns:
60
+ None
61
+
62
+ Raises:
63
+ ValueError: Presumably raised by PyPDFLoader when file_path is neither an existing file nor a valid URL (TODO: confirm against the installed langchain version).
64
+ """
65
  loader = PyPDFLoader(self.file_path)
66
  self.documents = loader.load()
67
 
68
  def create_chunks(self):
69
+ """
70
+ Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap.
71
+
72
+ This function iterates through each Document object in the self.documents list and calls the
73
+ token_split_document method to split the Document into smaller chunks. The resulting chunks are
74
+ then appended to the self.chunks list.
75
+
76
+ Parameters:
77
+ None
78
+
79
+ Returns:
80
+ None
81
+
82
+ Attributes:
83
+ self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file.
84
+ """
85
  self.chunks = []
86
  for doc in self.documents:
87
  self.chunks.extend(
 
91
  )
92
 
93
  def tokenize(self, text):
94
+ """
95
+ Tokenize the input text using the transformer model's tokenizer.
96
+
97
+ This method uses the tokenizer provided by the transformer model to encode the input text.
98
+ The special tokens are not added to the encoded tokens.
99
+
100
+ Parameters:
101
+ text (str): The input text to be tokenized.
102
+
103
+ Returns:
104
+ list: A list of integers representing the tokenized input text.
105
+ """
106
  return self.tokenizer.encode(text, add_special_tokens=False)
107
 
108
  def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
109
+ """
110
+ Split a single Document into multiple chunks based on token length.
111
+
112
+ This function tokenizes the input Document's page content, then splits the tokens into smaller chunks
113
+ of specified size. Overlapping chunks are created by moving the start index forward by the difference
114
+ between chunk size and overlap. Each chunk is then decoded back into text and a new Document is created
115
+ with the same metadata but truncated text.
116
+
117
+ Parameters:
118
+ doc (Document): The input Document to be split into chunks.
119
+ chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
120
+ chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.
121
+
122
+ Returns:
123
+ list: A list of Document objects, where each Document represents a chunk of the input Document.
124
+ """
125
  tokens = self.tokenize(doc.page_content)
126
  chunks = []
127
  start = 0
utils/llm_generation.py CHANGED
@@ -26,6 +26,15 @@ json_schema = {
26
 
27
  class LLMGeneration:
28
  def __init__(self, llm_model_name="gpt-4o-mini"):
 
 
 
 
 
 
 
 
 
29
  self.llm_model_name = llm_model_name
30
  self.llm = ChatOpenAI(
31
  model_name=self.llm_model_name,
@@ -41,6 +50,18 @@ class LLMGeneration:
41
  self.create_initial_prompt()
42
 
43
  def create_initial_prompt(self):
 
 
 
 
 
 
 
 
 
 
 
 
44
  # System message for the chain
45
  system_message = SystemMessage(
46
  content=(
@@ -61,8 +82,21 @@ class LLMGeneration:
61
 
62
  self.initial_prompt_messages = [system_message] + few_shots
63
 
64
- def create_human_message_prompt(self, query: str, docs: List[Document]):
 
 
 
 
 
 
65
 
 
 
 
 
 
 
 
66
  # Prepare the context from the retrieved chunks
67
  context = "\n\n".join(
68
  [f"<context>{doc.page_content}</context>" for doc in docs]
@@ -76,15 +110,24 @@ class LLMGeneration:
76
 
77
  return HumanMessagePromptTemplate.from_template(human_message)
78
 
79
- def generate_answer(self, query: str, docs: List[Document]):
 
 
 
 
 
 
80
 
 
 
 
81
  # Create the prompt template
82
  prompt = ChatPromptTemplate.from_messages(
83
  self.initial_prompt_messages
84
  + [self.create_human_message_prompt(query, docs)]
85
  )
86
 
87
- # Create and run the chain with the hypothetical gpt-40-mini model
88
  chain = LLMChain(
89
  llm=self.llm,
90
  prompt=prompt,
 
26
 
27
  class LLMGeneration:
28
  def __init__(self, llm_model_name="gpt-4o-mini"):
29
+ """
30
+ Initialize the LLMGeneration class with a specified LLM model.
31
+
32
+ Parameters:
33
+ llm_model_name (str): The name of the LLM model to be used. Default is "gpt-4o-mini".
34
+
35
+ Returns:
36
+ None
37
+ """
38
  self.llm_model_name = llm_model_name
39
  self.llm = ChatOpenAI(
40
  model_name=self.llm_model_name,
 
50
  self.create_initial_prompt()
51
 
52
  def create_initial_prompt(self):
53
+ """
54
+ Prepares the initial prompt for the LLMChain.
55
+
56
+ This function creates a system message and few-shot examples for the LLMChain.
57
+ The system message instructs the assistant to use the provided context to answer the user's question,
58
+ and to follow a structured JSON format for the answer. It also specifies the conditions for providing an answer.
59
+
60
+ The few-shot examples include a context and a question, along with the expected answer in JSON format.
61
+
62
+ Returns:
63
+ None. The initial prompt messages are stored in the `initial_prompt_messages` attribute of the class instance.
64
+ """
65
  # System message for the chain
66
  system_message = SystemMessage(
67
  content=(
 
82
 
83
  self.initial_prompt_messages = [system_message] + few_shots
84
 
85
+ def create_human_message_prompt(self, query: str, docs: List[Document]) -> HumanMessagePromptTemplate:
86
+ """
87
+ Prepares a human message prompt for the LLMChain.
88
+
89
+ This function constructs a human message that includes the provided context and a question.
90
+ The context is extracted from the list of documents and formatted as per the required structure.
91
+ The question is included in the human message.
92
 
93
+ Parameters:
94
+ query (str): The user's question for which an answer needs to be generated.
95
+ docs (List[Document]): A list of documents retrieved from the search engine. Each document contains a 'page_content' attribute.
96
+
97
+ Returns:
98
+ HumanMessagePromptTemplate: A human message prompt template that can be used with the LLMChain.
99
+ """
100
  # Prepare the context from the retrieved chunks
101
  context = "\n\n".join(
102
  [f"<context>{doc.page_content}</context>" for doc in docs]
 
110
 
111
  return HumanMessagePromptTemplate.from_template(human_message)
112
 
113
+ def generate_answer(self, query: str, docs: List[Document]) -> str:
114
+ """
115
+ Generate an answer to the user's query using the provided documents and the LLM model.
116
+
117
+ Parameters:
118
+ query (str): The user's question for which an answer needs to be generated.
119
+ docs (List[Document]): A list of documents retrieved from the search engine. Each document contains a 'page_content' attribute.
120
 
121
+ Returns:
122
+ str: The answer to the user's query. If no answer is found, returns an empty string.
123
+ """
124
  # Create the prompt template
125
  prompt = ChatPromptTemplate.from_messages(
126
  self.initial_prompt_messages
127
  + [self.create_human_message_prompt(query, docs)]
128
  )
129
 
130
+ # Create and run the chain with the configured LLM (gpt-4o-mini by default)
131
  chain = LLMChain(
132
  llm=self.llm,
133
  prompt=prompt,
utils/retrieval.py CHANGED
@@ -6,6 +6,16 @@ from typing import List
6
 
7
  class Retrieval:
8
  def __init__(self, model_name, max_model_tokens=384):
 
 
 
 
 
 
 
 
 
 
9
  self.model_name = model_name
10
  self.embeddings = HuggingFaceEmbeddings(
11
  model_name=model_name,
@@ -13,12 +23,13 @@ class Retrieval:
13
  )
14
 
15
  def create_vector_store(self, chunks: List[Document]):
16
-
17
  self.chunks = chunks
18
  # Create FAISS vector store
19
  self.vectorstore = FAISS.from_documents(self.chunks, self.embeddings)
20
 
21
  def search(self, query, k=10) -> List[Document]:
 
22
  # Retrieve the top k similar chunks (k defaults to 10)
23
  similar_docs = self.vectorstore.similarity_search(query, k)
24
 
 
6
 
7
  class Retrieval:
8
  def __init__(self, model_name, max_model_tokens=384):
9
+ """
10
+ Initialize the Retrieval class with HuggingFace embeddings; the FAISS vector store is built later by create_vector_store.
11
+
12
+ Parameters:
13
+ model_name (str): The name of the HuggingFace model to use for embeddings.
14
+ max_model_tokens (int, optional): The maximum number of tokens to use for encoding. Defaults to 384.
15
+
16
+ Returns:
17
+ None
18
+ """
19
  self.model_name = model_name
20
  self.embeddings = HuggingFaceEmbeddings(
21
  model_name=model_name,
 
23
  )
24
 
25
  def create_vector_store(self, chunks: List[Document]):
26
+ """Creates a new vector store for similarity search"""
27
  self.chunks = chunks
28
  # Create FAISS vector store
29
  self.vectorstore = FAISS.from_documents(self.chunks, self.embeddings)
30
 
31
  def search(self, query, k=10) -> List[Document]:
32
+ """Search the vector store for the k documents most similar to the query."""
33
  # Retrieve the top k similar chunks (k defaults to 10)
34
  similar_docs = self.vectorstore.similarity_search(query, k)
35