Spaces:

Schmitz005
/

huggingwhale.ai

Sleeping

huggingwhale.ai / whalecore /parser.py

Create whalecore/parser.py

f2a821b verified 2 months ago

1.37 kB

	import os
	import PyPDF2
	import whisper
	from pydub import AudioSegment
	from sentence_transformers import SentenceTransformer

	import warnings
	# Suppress only the FutureWarning whose message starts with the
	# `clean_up_tokenization_spaces` notice; every other warning still shows.
	warnings.filterwarnings(
	    action="ignore",
	    message="`clean_up_tokenization_spaces` was not set.*",
	    category=FutureWarning,
	)

	# Sentence-embedding model shared by chunk_and_embed(); it is constructed
	# once, when this module is imported.
	model = SentenceTransformer("all-MiniLM-L6-v2")

	def parse_pdf(filepath):
	    """Extract the text of every page of a PDF file.

	    Args:
	        filepath: Path of the PDF to read; the file is opened in binary mode.

	    Returns:
	        The extracted text of all pages in document order, with a newline
	        appended after each page's text.
	    """
	    with open(filepath, 'rb') as pdf_file:
	        pages = PyPDF2.PdfReader(pdf_file).pages
	        # Build the result inside the `with` so every page is extracted
	        # while the underlying file handle is still open.
	        return "".join(page.extract_text() + "\n" for page in pages)

	# Whisper models already loaded by this process, keyed by model name.
	_WHISPER_MODELS = {}


	def _load_whisper_model(model_name):
	    """Return the Whisper model *model_name*, loading it at most once."""
	    if model_name not in _WHISPER_MODELS:
	        _WHISPER_MODELS[model_name] = whisper.load_model(model_name)
	    return _WHISPER_MODELS[model_name]


	def parse_audio(filepath, model_name="base"):
	    """Transcribe an audio file to text with Whisper.

	    The model is loaded on first use and then reused, so repeated calls do
	    not pay the model-loading cost again.

	    Args:
	        filepath: Path of the audio file to transcribe.
	        model_name: Name passed to ``whisper.load_model``; defaults to
	            ``"base"``, the model this function has always used.

	    Returns:
	        The ``'text'`` entry of the transcription result.
	    """
	    # Named `transcriber` so it does not shadow the module-level embedding
	    # `model` used elsewhere in this file.
	    transcriber = _load_whisper_model(model_name)
	    return transcriber.transcribe(filepath)['text']

	def parse_text(filepath):
	    """Read a plain-text file and return its entire contents.

	    Args:
	        filepath: Path of the text file to read.

	    Returns:
	        The file contents as a single string.

	    Raises:
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    # Decode as UTF-8 explicitly: without `encoding`, open() falls back to
	    # the platform locale, so the same file could decode differently (or
	    # fail) depending on the machine.
	    with open(filepath, 'r', encoding='utf-8') as text_file:
	        return text_file.read()

	def parse_file(filepath):
	    """Parse a file into text, choosing the parser from its extension.

	    ``.pdf`` goes to parse_pdf, ``.mp3``/``.wav``/``.m4a`` go to
	    parse_audio, and ``.txt`` goes to parse_text. The extension is matched
	    case-insensitively, so ``REPORT.PDF`` is handled like ``report.pdf``.

	    Args:
	        filepath: Path of the file to parse (``str`` or ``os.PathLike``).

	    Returns:
	        The text produced by the selected parser.

	    Raises:
	        ValueError: If the extension is not one of the supported types.
	    """
	    # Lowercase only the copy used for matching; the parsers and the error
	    # message still receive the caller's original path.
	    lowered = os.fspath(filepath).lower()
	    if lowered.endswith('.pdf'):
	        return parse_pdf(filepath)
	    if lowered.endswith(('.mp3', '.wav', '.m4a')):
	        return parse_audio(filepath)
	    if lowered.endswith('.txt'):
	        return parse_text(filepath)
	    raise ValueError(f"Unsupported file type: {filepath}")

	def chunk_text(text, chunk_size=300):
	    """Split text into consecutive chunks of at most *chunk_size* words.

	    Words are whatever ``str.split()`` yields, so runs of whitespace
	    (including newlines) collapse and each chunk is re-joined with single
	    spaces.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of words per chunk; must be positive.

	    Returns:
	        A list of chunk strings in original order; empty when *text*
	        contains no words.

	    Raises:
	        ValueError: If *chunk_size* is not positive.
	    """
	    # A negative step would make range() empty and silently drop all the
	    # text, and a zero step fails with an unrelated-looking range() error,
	    # so reject both up front.
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    words = text.split()
	    return [
	        ' '.join(words[start:start + chunk_size])
	        for start in range(0, len(words), chunk_size)
	    ]

	def chunk_and_embed(text):
	    """Split *text* into chunks and pair each chunk with its embedding.

	    Args:
	        text: The text to chunk (using chunk_text's default chunk size).

	    Returns:
	        A list of ``(chunk, embedding)`` tuples, where each embedding is
	        the ``.tolist()`` form of the model's encoding of that chunk.
	    """
	    pieces = chunk_text(text)
	    # Encode all chunks in one call, then convert to plain Python lists.
	    vectors = model.encode(pieces)
	    return list(zip(pieces, vectors.tolist()))