Spaces:

ADOPLE
/

Contract_Management

Running

App Files Files Community

Contract_Management / pdftojson.py

robertselvam

Update pdftojson.py

afcf7a1 about 1 year ago

raw

history blame

2.93 kB

	import os
	from typing import Optional

	import PyPDF2
	from langchain import PromptTemplate, LLMChain
	from langchain.llms import OpenAI

	class PdftoJson:
	    """Convert the text of a PDF into topic/content JSON via an OpenAI completion model."""

	    def __init__(self):
	        """
	        Create a converter.

	        No state is set up here and no API key is configured by this class:
	        the langchain ``OpenAI`` wrapper used in ``_get_json`` is expected to
	        pick up the ``OPENAI_API_KEY`` environment variable on its own.
	        """

	    def _get_json(self, input_text: str) -> Optional[str]:
	        """
	        Ask the language model to split ``input_text`` into topics and content.

	        Args:
	            input_text (str): Text to be analyzed.

	        Returns:
	            Optional[str]: The raw model output (requested to be JSON, but not
	            validated here), or None when the model call fails. Failures are
	            printed rather than raised so one bad request cannot crash the caller.
	        """
	        try:
	            # temperature=0 keeps the output as repeatable as the model allows;
	            # max_tokens caps the length of the generated answer.
	            llm = OpenAI(temperature=0, max_tokens=1000)

	            # The prompt body is kept flush-left on purpose: it is sent to the
	            # model verbatim, so re-indenting it would change the prompt text.
	            template = """
	Your task is Get the text and analyse and split it into Topics and Content in json format.Give Proper Name to Topic dont give any Numbers and Dont Give any empty Contents.The Output Format Should Be very good.

	{text}
	"""
	            prompt = PromptTemplate(template=template, input_variables=["text"])

	            # Chain the prompt and the model, then run it on the caller's text.
	            llm_chain = LLMChain(prompt=prompt, llm=llm)
	            return llm_chain.run(input_text)

	        except Exception as e:
	            # Best-effort boundary: report the problem and signal failure with None.
	            print(f"Error occurred while generating JSON result: {str(e)}")
	            return None

	    @staticmethod
	    def _resolve_path(pdf_path):
	        """Return the filesystem path for a plain path or for an upload object exposing ``.name``."""
	        # Real paths are checked first: pathlib.Path also has a ``.name``
	        # attribute, but it holds only the final component, not the full path.
	        if isinstance(pdf_path, (str, os.PathLike)):
	            return os.fspath(pdf_path)
	        return pdf_path.name

	    def _pages_to_json(self, page_texts) -> Optional[str]:
	        """Run ``_get_json`` on every page that has text and join the results with newlines."""
	        results = []
	        for page_text in page_texts:
	            # Pages without extractable text (e.g. scanned images) would only
	            # waste a model call, so they are skipped.
	            if not page_text or not page_text.strip():
	                continue
	            json_result = self._get_json(page_text)
	            # _get_json returns None on failure; keep whatever succeeded.
	            if json_result is not None:
	                results.append(json_result)
	        return "\n".join(results) if results else None

	    def extract_text_from_pdf(self, pdf_path: object) -> Optional[str]:
	        """
	        Extract the text of a PDF and generate a JSON result for it.

	        Every page that contains text is sent to the model separately and the
	        per-page results are joined with newlines, so a single-page document
	        yields exactly the model's answer for that page.

	        Args:
	            pdf_path: Path to the PDF file (``str`` or ``os.PathLike``), or an
	                object that exposes the path as its ``.name`` attribute.

	        Returns:
	            Optional[str]: The combined model output, or None when the file
	            cannot be read, contains no extractable text, or every model call
	            failed. Errors are printed rather than raised.
	        """
	        try:
	            # Open the PDF file in binary read mode
	            with open(self._resolve_path(pdf_path), "rb") as pdf_file:
	                pdf_reader = PyPDF2.PdfReader(pdf_file)
	                # Pull the text while the file is still open.
	                page_texts = [page.extract_text() for page in pdf_reader.pages]

	            return self._pages_to_json(page_texts)

	        except Exception as e:
	            # Best-effort boundary: report the problem and signal failure with None.
	            print(f"Error occurred during extraction and processing: {str(e)}")
	            return None