naveenvenkatesh committed on
Commit
4ec3e55
1 Parent(s): 417a9e0

Upload 5 files

Browse files
ContractGenerator.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+
3
class ContractGenerator:
    """
    A class for generating contract forms based on user instructions using the OpenAI GPT-3.5 model.
    """

    def __init__(self, api_key: str):
        """
        Initialize the ContractGenerator.

        Args:
            api_key (str): Your OpenAI API key. It is assigned to the
                module-level ``openai.api_key`` setting, so it applies to
                every later call made through the ``openai`` module.
        """
        openai.api_key = api_key

    def generate_contract(self, instructions: str) -> "str | None":
        """
        Generate a contract form based on user instructions.

        Args:
            instructions (str): User-provided instructions for the contract form.

        Returns:
            str | None: The text of the first completion choice, or ``None``
            when the OpenAI request fails. The failure is printed, not raised.
        """
        # The user instructions are embedded verbatim between the *** markers.
        prompt = f"Your task is to generate a contract form based on user instructions. ***Instructions:{instructions}***"

        try:
            # Generate text using the GPT-3.5 model
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=500  # You can adjust the length of the generated text
            )

            # Return the generated text of the first choice
            return response.choices[0].text

        except openai.error.OpenAIError as e:
            print(f"Error generating the contract: {str(e)}")
            # Make the failure result explicit instead of falling off the end.
            return None
43
+
contract_missing_clausses.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import pdfplumber
3
+ import logging
4
+
5
# Configure logging: append records to a log file in the working directory.
logging.basicConfig(
    filename='contract_missing_clausses.log',  # You can adjust the log file name here
    filemode='a',
    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
    datefmt='%Y-%b-%d %H:%M:%S'
)
LOGGER = logging.getLogger(__name__)

log_level_env = 'INFO'  # You can adjust the log level here
log_level_dict = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARNING': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}
# Unknown level names fall back to INFO.
log_level = log_level_dict.get(log_level_env, log_level_dict['INFO'])
LOGGER.setLevel(log_level)
27
+
28
class ContractMissingClauses:
    """
    Class for identifying missing clauses, sub-clauses, and terms in a contract.
    """

    def __init__(self, open_api_key):
        """
        Initialize the ContractMissingClauses class and set up the OpenAI API client.

        Args:
            open_api_key (str): OpenAI API key. It is assigned to the
                module-level ``openai.api_key`` setting.
        """
        # Initialize the OpenAI API client
        openai.api_key = open_api_key

    def get_missing_clauses(self, contract: str):
        """
        Ask the model for the clauses, sub-clauses, and terms missing from a contract.

        Args:
            contract (str): The text of the contract.

        Returns:
            str | None: The text of the first completion choice, or ``None``
            when the request fails. The error is logged, not raised.
        """
        try:
            LOGGER.info("Analyzing contract and extracting missing clauses...")
            # Generate text using the OpenAI GPT-3 model
            response = openai.Completion.create(
                engine="text-davinci-003",  # You can specify different engines
                # Must be an f-string: as a plain string the literal text
                # "{contract}" was sent and the contract itself never reached the model.
                prompt=f"identify missing clauses,sub-clauses and terms from given contrct ***{contract}*** return only missing (clauses,sub-clauses and terms) seperately.",
                temperature=0,
                max_tokens=500,  # The maximum number of tokens in the generated text
            )

            # Return the generated text of the first choice
            return response.choices[0].text

        except Exception as e:
            # If an error occurs during the request, log it and report failure
            LOGGER.error(f"Error occurred while extracting missing clauses: {str(e)}")
            return None

    def iterate_each_page(self, pdf_file):
        """
        Iterate through each page of a PDF contract, extract text, and call get_missing_clauses for each page.

        Args:
            pdf_file: Either a path string or an object exposing the path as
                ``.name`` (presumably an uploaded-file wrapper; verify against caller).

        Returns:
            str | None: The per-page results joined with newlines (empty when
            no page produced a result), or ``None`` when the PDF could not be
            processed. The error is logged, not raised.
        """
        try:
            LOGGER.info("Analyzing contract and extracting pdf page...")

            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
            page_results = []

            # The context manager closes the PDF even if a page fails.
            with pdfplumber.open(pdf_path) as pdf:
                # Iterate through each page and extract text
                for page in pdf.pages:
                    contract = page.extract_text()
                    if not contract:
                        # Nothing extractable on this page; skip the API call.
                        continue

                    missing = self.get_missing_clauses(contract)
                    if missing:
                        page_results.append(missing)

            # Hand the collected results back instead of discarding them.
            return "\n".join(page_results)

        except Exception as e:
            # If an error occurs while reading the PDF, log it and report failure
            LOGGER.error(f"Error occurred while extracting pdf page: {str(e)}")
            return None
extract_date.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ import openai
3
+ import fitz # PyMuPDF
4
+ import logging
5
+
6
# Configure logging: append records to a log file in the working directory.
logging.basicConfig(
    filename='extract_date.log',  # You can adjust the log file name here
    filemode='a',
    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
    datefmt='%Y-%b-%d %H:%M:%S'
)
LOGGER = logging.getLogger(__name__)

log_level_env = 'INFO'  # You can adjust the log level here
log_level_dict = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARNING': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}
# Unknown level names fall back to INFO.
log_level = log_level_dict.get(log_level_env, log_level_dict['INFO'])
LOGGER.setLevel(log_level)
28
+
29
class ExtractDateAndDuration:
    """
    Extract dates and durations from contract text or from the pages of a PDF.
    """

    def __init__(self, api_key):
        """
        Initialize the ExtractDateAndDuration class.

        Args:
            api_key (str): OpenAI API key. It is assigned to the module-level
                ``openai.api_key`` setting.
        """
        openai.api_key = api_key

    def get_date_and_duration(self, contract_text: str) -> "str | None":
        """
        Extract dates and durations from the provided contract text.

        Args:
            contract_text (str): The text of the contract to analyze.

        Returns:
            str | None: The stripped completion text, or ``None`` when the
            request fails. The error is logged, not raised.
        """
        try:
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=f"""Your task is Identify Dates and Durations Mentioned in the contract and extract that date and duration in key-value pair.
```contract: {contract_text}```
""",
                max_tokens=300,
                temperature=0
            )
            extracted_date_duration = response.choices[0].text.strip()
            return extracted_date_duration

        except Exception as e:
            LOGGER.error(f"An error occurred during text analysis: {str(e)}")
            return None

    def itrate_each_page(self, pdf_file_path: str):
        """
        Extract text from each page of a PDF document and process it.

        Args:
            pdf_file_path: Either a path string or an object exposing the
                path as ``.name`` (presumably an uploaded-file wrapper;
                verify against caller).

        Returns:
            str | None: The per-page results concatenated in page order, or
            ``None`` when the PDF could not be processed. The error is
            logged, not raised.
        """
        try:
            path = pdf_file_path if isinstance(pdf_file_path, str) else pdf_file_path.name

            # Open the multi-page PDF using PdfReader
            pdf = PdfReader(path)

            page_results = []
            for page in pdf.pages:
                # Extract text from the page
                text = page.extract_text()
                if not text:
                    # Nothing extractable on this page; skip the API call.
                    continue

                # A failed request yields None. Skip it so one bad page does
                # not raise TypeError and throw away every other page's result.
                result = self.get_date_and_duration(text)
                if result:
                    page_results.append(result)

            return "".join(page_results)

        except Exception as e:
            LOGGER.error(f"An error occurred while processing the PDF document: {str(e)}")
            return None
invoice_extractor.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from PIL import Image, ImageDraw
4
+ import traceback
5
+ import torch
6
+ from docquery import pipeline
7
+ from docquery.document import load_bytes, load_document, ImageDocument
8
+ from docquery.ocr_reader import get_ocr_reader
9
+ from pdf2image import convert_from_path
10
+
11
# Set the tokenizers parallelism flag before any pipeline is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize the logger: root logger writes DEBUG and above to a log file.
logging.basicConfig(filename="invoice_extraction.log", level=logging.DEBUG)

# Display name -> model checkpoint identifier.
CHECKPOINTS = {
    "LayoutLMv1 for Invoices 🧾": "impira/layoutlm-invoices",
}
# Cache of constructed pipelines, keyed by display name (a key of CHECKPOINTS).
PIPELINES = {}
21
+
22
+
23
class InvoiceKeyValuePair:
    """
    This class provides a utility to extract key-value pairs from invoices using LayoutLM.
    """

    def __init__(self):
        """Set up the invoice fields to extract and select the model to use."""
        # Output field name -> question(s) asked of the model for that field.
        self.fields = {
            "Vendor Name": ["Vendor Name - Logo?", "Vendor Name - Address?"],
            "Vendor Address": ["Vendor Address?"],
            "Customer Name": ["Customer Name?"],
            "Customer Address": ["Customer Address?"],
            "Invoice Number": ["Invoice Number?"],
            "Invoice Date": ["Invoice Date?"],
            "Due Date": ["Due Date?"],
            "Subtotal": ["Subtotal?"],
            "Total Tax": ["Total Tax?"],
            "Invoice Total": ["Invoice Total?"],
            "Amount Due": ["Amount Due?"],
            "Payment Terms": ["Payment Terms?"],
            "Remit To Name": ["Remit To Name?"],
            "Remit To Address": ["Remit To Address?"],
        }
        # Display name (a key of CHECKPOINTS) of the first configured model.
        self.model = next(iter(CHECKPOINTS))

    def ensure_list(self, x):
        """Return ``x`` unchanged if it is a list, otherwise wrap it in a one-element list."""
        logging.info('Entering ensure_list with x=%s', x)
        return x if isinstance(x, list) else [x]

    def construct_pipeline(self, task, model):
        """
        Return the pipeline for ``model``, creating and caching it on first use.

        Args:
            task: Task name passed to ``pipeline``.
            model: Display name of the model; must be a key of ``CHECKPOINTS``.

        Returns:
            The cached or newly built pipeline, or ``None`` if construction
            failed. The error is logged, not raised.
        """
        try:
            logging.info('Entering construct_pipeline with task=%s and model=%s', task, model)

            # Reuse a pipeline already built for this model.
            if model in PIPELINES:
                return PIPELINES[model]

            # Determine the device to use for inference (GPU if available, else CPU)
            device = "cuda" if torch.cuda.is_available() else "cpu"

            # Create the pipeline and cache it for future use
            ret = pipeline(task=task, model=CHECKPOINTS[model], device=device)
            PIPELINES[model] = ret
            return ret
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return None

    def run_pipeline(self, model, question, document, top_k):
        """
        Ask ``question`` of ``document`` using the pipeline for ``model``.

        Returns:
            Whatever the pipeline returns, or ``None`` if the pipeline is
            unavailable or the call fails. The error is logged, not raised.
        """
        try:
            logging.info(
                'Entering run_pipeline with model=%s, question=%s, and document=%s',
                model, question, document,
            )

            qa_pipeline = self.construct_pipeline("document-question-answering", model)
            if qa_pipeline is None:
                # construct_pipeline already logged the cause.
                logging.error("No pipeline available for model=%s", model)
                return None

            # Pass the question, document context, and top_k as arguments to the pipeline
            return qa_pipeline(question=question, **document.context, top_k=top_k)
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return None

    def lift_word_boxes(self, document, page):
        """
        Return the word boxes of ``page`` from ``document.context["image"]``.

        Returns an empty list if they cannot be read.
        """
        try:
            logging.info('Entering lift_word_boxes with document=%s and page=%s', document, page)

            # Element 1 of each per-page entry holds that page's word boxes.
            return document.context["image"][page][1]
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return []

    def expand_bbox(self, word_boxes):
        """
        Compute the smallest box enclosing all ``word_boxes``.

        Args:
            word_boxes: Sequence of entries whose element 1 is a
                ``(min_x, min_y, max_x, max_y)`` box.

        Returns:
            list | None: ``[min_x, min_y, max_x, max_y]``, or ``None`` for
            empty input or on error.
        """
        try:
            logging.info('Entering expand_bbox with word_boxes=%s', word_boxes)

            if not word_boxes:
                return None

            # Transpose the boxes into four coordinate columns.
            min_xs, min_ys, max_xs, max_ys = zip(*[x[1] for x in word_boxes])

            return [min(min_xs), min(min_ys), max(max_xs), max(max_ys)]
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return None

    def normalize_bbox(self, box, width, height, padding=0.005):
        """
        Scale a box to pixel coordinates of a ``width`` x ``height`` image.

        Args:
            box: ``[min_x, min_y, max_x, max_y]``; each coordinate is divided
                by 1000 to obtain a fraction of the page (presumably the
                0-1000 layout scale; verify against the OCR output).
            width: Image width in pixels.
            height: Image height in pixels.
            padding: Margin added on every side, as a fraction of the image
                dimension; the result is clamped to the range [0, 1].

        Returns:
            list | None: ``[x1, y1, x2, y2]`` in pixels, or ``None`` on error.
        """
        try:
            logging.info(
                'Entering normalize_bbox with box=%s, width=%s, height=%s, and padding=%s',
                box, width, height, padding,
            )

            min_x, min_y, max_x, max_y = [c / 1000 for c in box]

            # Apply padding if specified (as a fraction of image dimensions)
            if padding != 0:
                min_x = max(0, min_x - padding)
                min_y = max(0, min_y - padding)
                max_x = min(max_x + padding, 1)
                max_y = min(max_y + padding, 1)

            # Scale the normalized coordinates to match the image dimensions
            return [min_x * width, min_y * height, max_x * width, max_y * height]
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return None

    def annotate_page(self, prediction, pages, document):
        """
        Highlight the predicted words on their page image, in place.

        Does nothing when ``prediction`` is ``None``, has no ``"word_ids"``,
        or no bounding box can be computed. Errors are logged, not raised.
        """
        try:
            logging.info(
                'Entering annotate_page with prediction=%s, pages=%s, and document=%s',
                prediction, pages, document,
            )

            if prediction is None or "word_ids" not in prediction:
                return

            # Get the image of the page where the prediction was made
            image = pages[prediction["page"]]
            draw = ImageDraw.Draw(image, "RGBA")

            # Extract word boxes for the page
            word_boxes = self.lift_word_boxes(document, prediction["page"])

            # Expand and normalize the bounding box of the predicted words
            bbox = self.expand_bbox([word_boxes[i] for i in prediction["word_ids"]])
            if bbox is None:
                # No words to highlight; avoid unpacking None below.
                return
            pixel_box = self.normalize_bbox(bbox, image.width, image.height)
            if pixel_box is None:
                return
            x1, y1, x2, y2 = pixel_box

            # Draw a semi-transparent green rectangle around the predicted words
            draw.rectangle(((x1, y1), (x2, y2)), fill=(0, 255, 0, int(0.4 * 255)))
        except Exception:
            logging.error("An error occurred:", exc_info=True)

    def process_fields(self, document, fields, model=None):
        """
        Answer every field's questions against ``document``.

        Args:
            document: Loaded document exposing ``preview`` and ``context``.
            fields: Mapping of field name -> list of questions.
            model: Display name of the model (a key of ``CHECKPOINTS``);
                defaults to the first configured checkpoint.

        Returns:
            list: ``[field_name, answer]`` rows, with ``None`` as the answer
            when no candidate scored above 0.5. Empty list on error.
        """
        try:
            if model is None:
                # Resolved per call so the default is not frozen at class-definition time.
                model = next(iter(CHECKPOINTS))

            logging.info(
                'Entering process_fields with document=%s, fields=%s, and model=%s',
                document, fields, model,
            )

            # Work on RGB copies so annotation never touches the document's own previews.
            # NOTE(review): the annotated copies are not returned or saved.
            pages = [x.copy().convert("RGB") for x in document.preview]

            table = []

            for field_name, questions in fields.items():
                # A failed pipeline call yields None. Drop it so one failed
                # question does not abort the extraction of every field.
                answers = [
                    a
                    for q in questions
                    for a in self.ensure_list(self.run_pipeline(model, q, document, top_k=1))
                    if a is not None and a.get("score", 1) > 0.5
                ]

                # Highest-scoring answer, first one winning ties (if any)
                top = max(answers, key=lambda a: a.get("score", 0), default=None)

                # Annotate the page with the top answer's bounding box
                self.annotate_page(top, pages, document)

                table.append([field_name, top.get("answer") if top is not None else None])

            return table
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return []

    def process_document(self, document, fields, model, error=None):
        """
        Extract the field table from ``document`` unless loading failed.

        Returns:
            list: The rows from ``process_fields``, or an empty list when
            ``document`` is ``None``, ``error`` is set, or processing fails.
        """
        try:
            logging.info(
                'Entering process_document with document=%s, fields=%s, model=%s, and error=%s',
                document, fields, model, error,
            )

            if document is not None and error is None:
                return self.process_fields(document, fields, model)

            # Nothing to process: return an empty table rather than None.
            return []
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return []

    def process_path(self, path, fields, model):
        """
        Load the document at ``path`` and extract its field table.

        Returns:
            list: ``[field_name, answer]`` rows; empty when ``path`` is falsy,
            loading fails, or processing fails.
        """
        try:
            logging.info('Entering process_path with path=%s, fields=%s, and model=%s', path, fields, model)

            error = None
            document = None

            if path:
                try:
                    # Load the document from the specified file path
                    document = load_document(path)
                except Exception as e:
                    logging.error("An error occurred:", exc_info=True)
                    error = str(e)

            return self.process_document(document, fields, model, error)
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return []

    def pdf_to_image(self, file_path):
        """
        Render every page of a PDF to ``page_<n>.png`` in the working directory.

        Returns:
            str | None: Path of the first page's image, or ``None`` when no
            page was rendered or conversion failed.
        """
        try:
            logging.info('Entering pdf_to_image with file_path=%s', file_path)

            # Convert PDF to a list of image objects (one for each page)
            images = convert_from_path(file_path)

            image_paths = []
            for i, image in enumerate(images):
                image_path = f'page_{i + 1}.png'
                # Write the page to disk; without this the returned path
                # would name a file that was never created.
                image.save(image_path)
                image_paths.append(image_path)

            if not image_paths:
                logging.error("No pages were rendered from %s", file_path)
                return None

            # Downstream processing handles a single image; use the first page.
            return image_paths[0]
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return None

    def process_upload(self, file):
        """
        Convert an invoice PDF to an image and extract its field table.

        Args:
            file: Path of the PDF file.

        Returns:
            list: ``[field_name, answer]`` rows; empty on failure.
        """
        try:
            logging.info('Entering process_upload with file=%s', file)

            # First page image of the PDF, or None if rendering failed.
            image_path = self.pdf_to_image(file)

            return self.process_path(image_path if image_path else None, self.fields, self.model)
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return []

    def extract_key_value_pair(self, invoice_file):
        """
        Extract all configured fields from an invoice PDF.

        Args:
            invoice_file: Either a path string or an object exposing the path
                as ``.name`` (presumably an uploaded-file wrapper; verify
                against caller).

        Returns:
            str | None: One ``key: value`` line per field, joined with
            newlines (empty string when nothing was extracted), or ``None``
            on error.
        """
        try:
            logging.info('Entering extract_key_value_pair with invoice_file=%s', invoice_file)

            file_path = invoice_file if isinstance(invoice_file, str) else invoice_file.name

            # Process the uploaded invoice PDF file and extract key-value pairs
            data = self.process_upload(file_path)

            # Report every pair; returning inside the loop would stop after the first.
            return "\n".join(f'{key}: {value}' for key, value in data)
        except Exception:
            logging.error("An error occurred:", exc_info=True)
            return None
341
+
pdftojson.py CHANGED
@@ -1,16 +1,40 @@
1
  import os
2
  import PyPDF2
 
3
  from langchain import PromptTemplate, LLMChain
4
  from langchain.llms import OpenAI
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  class PdftoJson:
7
 
8
- def __init__(self):
9
  """
10
  Initialize the PdftoJson class with OpenAI API key.
11
  """
12
- # OPENAI_API_KEY = ""
13
- # os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
14
 
15
  def _get_json(self, input_text: str) -> str:
16
  """
@@ -23,6 +47,7 @@ class PdftoJson:
23
  str: JSON result containing topics and content.
24
  """
25
  try:
 
26
 
27
  # Initialize the OpenAI language model with specified settings
28
  llm = OpenAI(temperature=0, max_tokens=1000)
@@ -42,10 +67,11 @@ class PdftoJson:
42
  text = input_text
43
  json_result = llm_chain.run(text)
44
 
 
45
  return json_result
46
 
47
  except Exception as e:
48
- print(f"Error occurred while generating JSON result: {str(e)}")
49
 
50
 
51
  def extract_text_from_pdf(self, pdf_path: str):
@@ -56,6 +82,7 @@ class PdftoJson:
56
  pdf_path (str): Path to the PDF file.
57
  """
58
  try:
 
59
 
60
  # Open the PDF file in binary read mode
61
  with open(pdf_path.name, "rb") as pdf_file:
@@ -71,13 +98,9 @@ class PdftoJson:
71
  # Generate JSON result for the extracted text
72
  json_result = self._get_json(text)
73
 
74
- # # Clear Extra Spaces
75
- # clear_json_result = self._remove_empty_lines(json_result)
76
-
77
- # # Save the JSON result to a file
78
- # self._save_json(clear_json_result)
79
  return json_result
80
 
 
81
 
82
  except Exception as e:
83
- print(f"Error occurred during extraction and processing: {str(e)}")
 
1
  import os
2
  import PyPDF2
3
+ import logging
4
  from langchain import PromptTemplate, LLMChain
5
  from langchain.llms import OpenAI
6
 
7
# Configure logging: append records to a log file in the working directory.
logging.basicConfig(
    filename='pdftojson.log',  # You can adjust the log file name here
    filemode='a',
    format='[%(asctime)s] [%(levelname)s] [%(filename)s] [%(lineno)s:%(funcName)s()] %(message)s',
    datefmt='%Y-%b-%d %H:%M:%S'
)
LOGGER = logging.getLogger(__name__)

log_level_env = 'INFO'  # You can adjust the log level here
log_level_dict = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARNING': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}
# Unknown level names fall back to INFO.
log_level = log_level_dict.get(log_level_env, log_level_dict['INFO'])
LOGGER.setLevel(log_level)
29
+
30
  class PdftoJson:
31
 
32
+ def __init__(self,openai_api_key: str):
33
  """
34
  Initialize the PdftoJson class with OpenAI API key.
35
  """
36
+ OPENAI_API_KEY = openai_api_key
37
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
38
 
39
  def _get_json(self, input_text: str) -> str:
40
  """
 
47
  str: JSON result containing topics and content.
48
  """
49
  try:
50
+ LOGGER.info("Generating JSON result by analyzing input text...")
51
 
52
  # Initialize the OpenAI language model with specified settings
53
  llm = OpenAI(temperature=0, max_tokens=1000)
 
67
  text = input_text
68
  json_result = llm_chain.run(text)
69
 
70
+ LOGGER.info("Generated JSON result successfully.")
71
  return json_result
72
 
73
  except Exception as e:
74
+ LOGGER.error(f"Error occurred while generating JSON result: {str(e)}")
75
 
76
 
77
  def extract_text_from_pdf(self, pdf_path: str):
 
82
  pdf_path (str): Path to the PDF file.
83
  """
84
  try:
85
+ LOGGER.info("Extracting text from PDF, generating JSON result, and saving to a file...")
86
 
87
  # Open the PDF file in binary read mode
88
  with open(pdf_path.name, "rb") as pdf_file:
 
98
  # Generate JSON result for the extracted text
99
  json_result = self._get_json(text)
100
 
 
 
 
 
 
101
  return json_result
102
 
103
+ LOGGER.info("Extraction, JSON generation, and saving completed.")
104
 
105
  except Exception as e:
106
+ LOGGER.error(f"Error occurred during extraction and processing: {str(e)}")