Spaces:

kausthubkannan17
/

Drake

Sleeping

App Files Files Community

Drake / utilis.py

kausthubkannan17

feat: OCR support

8163d1a 10 months ago

raw

history blame

7.14 kB

	import os
	import json
	from typing import List
	import assemblyai as aai
	from youtube_transcript_api import YouTubeTranscriptApi
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_core.documents.base import Document
	from langchain_community.vectorstores import DeepLake
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain.prompts.few_shot import FewShotPromptTemplate
	from langchain.prompts.prompt import PromptTemplate
	from typing import Dict
	import uuid
	from pdf2image import convert_from_bytes
	import pytesseract
	from pytesseract import Output


	class Processing:
	def __init__(self, dataset_path: str, embedding_model_name: str, chunk_size=500, chunk_overlap=5):
	"""
	Parameters:
	dataset_path (str): Path to the dataset in the Vector-DB
	embedding_model_name (str): Name of the HuggingFace model to be used for embeddings
	chunk_size (int): Size of each chunk to be processed
	chunk_overlap (int): Overlap between each chunk

	Initialize embedding model, text splitter, transcriber and Vector-DB
	"""
	self.dataset_path = dataset_path
	self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	self.transcriber = aai.Transcriber()

	self.embedding_model = HuggingFaceEmbeddings(
	model_name=embedding_model_name,
	encode_kwargs={'normalize_embeddings': False}
	)

	self.db = DeepLake(dataset_path=f"hub://{self.dataset_path}",
	embedding=self.embedding_model,
	exec_option="compute_engine"
	)

	def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
	course_tag="") -> (List[Document], Dict[str, str]):
	"""
	Parameters:
	documents (List[Document]): List of documents to add metadata to
	id (str): ID of the documents
	source (str): Source of the documents
	file_type (str): Type of the documents
	course_tag (str): Tag to identify the course the documents belongs to

	Returns:
	documents (List[Document], Dict[str, str): List of documents with metadata added along with the metadata

	Add metadata to the documents
	"""
	metadata = {
	"id": id,
	"source": source,
	"url": url,
	"file_type": file_type,
	"course_tag": course_tag
	}
	for doc in documents:
	doc.metadata = metadata
	return documents, metadata

	def load_pdf(self, text) -> (List[Document], Dict[str, str]):
	"""
	Returns:
	documents (List[Document], Dict[str, str): List of documents with metadata added along with the metadata

	Load PDF file, split into chunks and add metadata
	"""
	pdf_chunk = self.text_splitter.create_documents([text])
	print("Created document chunks")
	return self._add_metadata(pdf_chunk, url="NaN", id=str(uuid.uuid4()), source="document", file_type="pdf")

	def load_scanned_pdf(self, file) -> str:
	"""
	Parameters:
	file (File): Scanned PDF file to be processed

	Returns:
	str: Text extracted from the scanned PDF file

	Extract text from scanned PDF file
	"""
	images = convert_from_bytes(file)

	all_text = ""
	for image in images:
	# Perform OCR on the image
	text = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)

	# Extract text from the dictionary
	page_text = " ".join(text['text'])
	all_text += page_text + "\n"

	return all_text

	def load_transcript(self, url) -> (List[Document], Dict[str, str]):
	"""
	Returns:
	documents (List[Document], Dict[str, str): List of documents with metadata added along with the metadata

	Load transcript, split into chunks and add metadata
	"""
	transcript = self.transcriber.transcribe(url)
	print("Transcribed")
	transcript_chunk = self.text_splitter.create_documents([transcript.text])
	print("Created transcript chunks")
	return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video",
	file_type="transcript")

	def load_yt_transcript(self, url) -> (List[Document], Dict[str, str]):
	"""
	Returns:
	documents (List[Document], Dict[str, str): List of documents with metadata added along with the metadata

	Load YouTube transcript, split into chunks and add metadata
	"""
	if url.startswith("https://www.youtube.com/watch?v="):
	video_id = url.replace("https://www.youtube.com/watch?v=", "")
	else:
	video_id = url.replace("https://youtu.be/", "")

	transcript = YouTubeTranscriptApi.get_transcript(video_id)
	print("Downloaded transcript")
	transcript = [line['text'] for line in transcript]
	transcript_text = ' '.join(transcript)
	yt_transcript_chunk = self.text_splitter.create_documents([transcript_text])
	print("Created YouTube transcript chunks")
	return self._add_metadata(yt_transcript_chunk, url=url, id=video_id, source="youtube", file_type="transcript")

	def upload_to_db(self, documents: List[Document]):
	"""
	Parameters:
	documents (List[Document]): List of documents to upload to Vector-DB

	Upload documents to Vector-DB
	"""
	print("Embedding and uploading to Vector-DB...")
	self.db.add_documents(documents)
	print("Uploaded to Vector-DB")


	class PromptCreate:
	def __init__(self, example_path, save_path):
	self.examples = []
	self.example_prompt = None
	self.few_shot_prompt = None
	self.example_path = example_path
	self.file_name = "example_{i}.json"
	self.save_path = save_path

	def load_examples(self):
	for i in range(1, len(os.listdir(self.example_path)) + 1):
	filename = os.path.join(self.example_path, self.file_name.format(i=i))
	try:
	with open(filename, "r") as json_file:
	self.examples.append(json.load(json_file))
	except FileNotFoundError:
	print(f"File {filename} not found.")
	except json.JSONDecodeError:
	print(f"Error decoding JSON from file {filename}.")

	def create_prompt_template(self, input_variables, template_string):
	self.example_prompt = PromptTemplate(input_variables=input_variables, template=template_string)

	def create_few_shot_prompt(self, prefix, suffix):
	self.few_shot_prompt = FewShotPromptTemplate(
	examples=self.examples, example_prompt=self.example_prompt, prefix=prefix, suffix=suffix
	)

	def save_prompt(self):
	self.few_shot_prompt.save(self.save_path)