Spaces:

BenjaminB
/

gistillery

Runtime error

gistillery / src /gistillery /preprocessing.py

Benjamin Bossan

Add pdf processor using pypdf

4c2b75c about 1 year ago

7.65 kB

	import abc
	import io
	import logging
	import re
	from typing import Optional

	import torch
	import trafilatura
	import urllib3
	from httpx import Client
	from PIL import Image
	from transformers import AutoProcessor, WhisperForConditionalGeneration

	from gistillery.base import JobInput
	from gistillery.config import get_config
	from gistillery.errors import ProcessingError
	from gistillery.media import download_yt_audio, load_audio
	from gistillery.tools import get_agent

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)
	# Set this logger's own threshold to DEBUG.
	logger.setLevel(logging.DEBUG)


	RE_URL = re.compile(r"(https?://[^\s]+)")


	def get_url(text: str) -> str \| None:
	urls: list[str] = list(RE_URL.findall(text))
	if len(urls) == 1:
	url = urls[0]
	return url
	return None


	class Processor(abc.ABC):
	    """Abstract base class for processors that turn a job into text.

	    Subclasses implement ``match`` and ``process``. Calling the instance
	    runs ``process`` and truncates the result to ``max_length`` characters,
	    where ``max_length`` is read from ``get_config().processing_max_length``.
	    """

	    # Class-level default so that ``__call__`` can detect a subclass that
	    # forgot to call ``super().__init__()`` and raise the intended
	    # RuntimeError instead of failing with an AttributeError.
	    _super_init_called: bool = False

	    def __init__(self) -> None:
	        self.max_length = get_config().processing_max_length
	        self._super_init_called = True

	    def get_name(self) -> str:
	        """Return the name of the concrete processor class."""
	        return self.__class__.__name__

	    def __call__(self, job: JobInput) -> str:
	        """Process ``job`` and return the (possibly truncated) result.

	        Raises:
	            RuntimeError: If the subclass did not call ``super().__init__()``.
	        """
	        if not self._super_init_called:
	            raise RuntimeError(
	                "super().__init__() was not called with class "
	                f"{self.__class__.__name__}"
	            )

	        _id = job.id
	        # The original message interpolated the builtin ``input`` function
	        # (the parameter here is named ``job``), which logged
	        # "<built-in function input>". Only the short id is logged now.
	        logger.info(
	            f"Processing input with {self.__class__.__name__} (id={_id[:8]})"
	        )
	        result = self.process(job)
	        if len(result) > self.max_length:
	            logger.warning(
	                f"Length of result ({len(result)}) exceeds max_length "
	                f"({self.max_length}), truncating"
	            )
	            result = result[: self.max_length]
	        logger.info(f"Finished processing input (id={_id[:8]})")
	        return result

	    @abc.abstractmethod
	    def process(self, input: JobInput) -> str:
	        """Convert ``input`` into text; must be implemented by subclasses."""
	        raise NotImplementedError

	    @abc.abstractmethod
	    def match(self, input: JobInput) -> bool:
	        """Return whether this processor handles ``input``; must be implemented by subclasses."""
	        raise NotImplementedError


	class RawTextProcessor(Processor):
	    """Processor that accepts every job and returns its content stripped."""

	    def match(self, input: JobInput) -> bool:
	        """Accept any job unconditionally."""
	        return True

	    def process(self, input: JobInput) -> str:
	        """Return the job content without leading/trailing whitespace."""
	        content = input.content
	        return content.strip()


	class DefaultUrlProcessor(Processor):
	    """Processor for jobs whose content contains exactly one URL.

	    ``match`` stores the URL on the instance; ``process`` then downloads it
	    and extracts the text with ``trafilatura``.
	    """

	    def __init__(self) -> None:
	        super().__init__()
	        self.client = Client()
	        # Set by ``match``. The original assigned the typing object
	        # ``Optional[str]`` here instead of annotating the attribute.
	        self.url: Optional[str] = None
	        self.template = "{url}\n\n{content}"

	    def match(self, input: JobInput) -> bool:
	        """Return True and remember the URL if the content has exactly one."""
	        url = get_url(input.content.strip())
	        if url is None:
	            return False

	        self.url = url
	        return True

	    def process(self, input: JobInput) -> str:
	        """Get content of website and return it as string

	        Raises:
	            TypeError: If no URL was stored by a previous ``match`` call.
	            ProcessingError: If no text could be extracted from the page.
	        """
	        if not isinstance(self.url, str):
	            raise TypeError("self.url must be a string")

	        text = self.client.get(self.url).text
	        assert isinstance(text, str)
	        extracted = trafilatura.extract(text)
	        # ``trafilatura.extract`` returns None when it cannot extract
	        # anything; formatting that would put the literal text "None" into
	        # the result, so fail explicitly as the PDF processor does.
	        if extracted is None:
	            raise ProcessingError("No text could be extracted from website")
	        return self.template.format(url=self.url, content=extracted)


	class PdfUrlProcessor(Processor):
	    """Processor for jobs whose single URL ends in ``.pdf``.

	    ``match`` stores the URL on the instance; ``process`` downloads the PDF
	    and extracts its text page by page with ``pypdf``.
	    """

	    def __init__(self) -> None:
	        super().__init__()
	        self.client = Client()
	        # Set by ``match``. The original assigned the typing object
	        # ``Optional[str]`` here instead of annotating the attribute.
	        self.url: Optional[str] = None
	        self.template = "{url}\n\n{content}"
	        self.stop_words = get_config().pdf_stop_words

	    def match(self, input: JobInput) -> bool:
	        """Return True and remember the URL if it has a ``pdf`` suffix."""
	        url = get_url(input.content.strip())
	        if url is None:
	            return False

	        # Text after the last "." in the URL, compared case-insensitively.
	        suffix = url.rsplit(".", 1)[-1].lower()
	        if suffix != "pdf":
	            return False

	        self.url = url
	        return True

	    def process(self, input: JobInput) -> str:
	        """Download the PDF and return its extracted text.

	        Pages are read in order; reading stops after the first page that
	        contains any of ``self.stop_words`` (that page is still included).

	        Raises:
	            TypeError: If no URL was stored by a previous ``match`` call.
	            ProcessingError: If the extracted text is empty.
	        """
	        if not isinstance(self.url, str):
	            raise TypeError("self.url must be a string")

	        response = self.client.get(self.url)
	        # Imported here rather than at module level, as in the original.
	        import pypdf

	        pdf = pypdf.PdfReader(io.BytesIO(response.content))
	        results = []
	        for page in pdf.pages:
	            page_text = page.extract_text()
	            results.append(page_text)
	            if any(word in page_text for word in self.stop_words):
	                break
	        text = "\n".join(results).strip()
	        if not text:
	            raise ProcessingError("No text could be extracted from PDF")
	        return self.template.format(url=self.url, content=text)


	class ImageUrlProcessor(Processor):
	    """Processor for jobs whose single URL ends in an image suffix.

	    ``match`` stores the URL on the instance; ``process`` downloads the
	    image and asks the agent returned by ``get_agent()`` for a caption.
	    """

	    def __init__(self) -> None:
	        super().__init__()
	        self.client = Client()
	        # Set by ``match``. The original assigned the typing object
	        # ``Optional[str]`` here instead of annotating the attribute.
	        self.url: Optional[str] = None
	        self.template = "{url}\n\n{content}"
	        self.image_suffixes = {'jpg', 'jpeg', 'png', 'gif'}

	    def match(self, input: JobInput) -> bool:
	        """Return True and remember the URL if its suffix is an image suffix."""
	        url = get_url(input.content.strip())
	        if url is None:
	            return False

	        # Text after the last "." in the URL, compared case-insensitively.
	        suffix = url.rsplit(".", 1)[-1].lower()
	        if suffix not in self.image_suffixes:
	            return False

	        self.url = url
	        return True

	    def process(self, input: JobInput) -> str:
	        """Download the image and return its caption below the URL.

	        Raises:
	            TypeError: If no URL was stored by a previous ``match`` call.
	        """
	        if not isinstance(self.url, str):
	            raise TypeError("self.url must be a string")

	        response = self.client.get(self.url)
	        image = Image.open(io.BytesIO(response.content)).convert('RGB')
	        caption = get_agent().run("Caption the following image", image=image)
	        return self.template.format(url=self.url, content=str(caption))


	class YoutubeUrlProcessor(Processor):
	    """Download yt audio, transcribe with whisper"""

	    def __init__(self) -> None:
	        super().__init__()
	        self.client = Client()
	        # Set by ``match``. The original assigned the typing object
	        # ``Optional[str]`` here instead of annotating the attribute.
	        self.url: Optional[str] = None
	        self.template = "{url}\n\n{content}"

	        self.processor = AutoProcessor.from_pretrained("openai/whisper-small.en")
	        self.model = WhisperForConditionalGeneration.from_pretrained(
	            "openai/whisper-small.en"
	        )

	        self.hosts = {"www.youtube.com", "youtube.com", "youtu.be"}

	    def match(self, input: JobInput) -> bool:
	        """Return True and remember the URL if its host is in ``self.hosts``."""
	        url = get_url(input.content.strip())
	        if url is None:
	            return False

	        parsed = urllib3.util.parse_url(url)
	        if parsed.host not in self.hosts:
	            return False

	        self.url = url
	        return True

	    @staticmethod
	    def make_batch(input_ids: torch.Tensor, max_len: int) -> torch.Tensor:
	        """Create batches from last dimension, pad last batch if necessary

	        The tensor is split along its last dimension into chunks of
	        ``max_len``; the final chunk is zero-padded to ``max_len`` and all
	        chunks are concatenated along the first dimension.

	        Examples
	        >>> import torch
	        >>> x = torch.zeros((1, 10, 213))
	        >>> YoutubeUrlProcessor.make_batch(x, max_len=100).shape
	        torch.Size([3, 10, 100])

	        """
	        # ugly workaround, transformers whisper implementation requires a
	        # specific shape of input length, probably there is a better way...
	        batches = input_ids.split(max_len, dim=-1)  # type: ignore
	        last = batches[-1]
	        n = last.shape[-1]
	        # NOTE(review): one zero is padded on the left and the remainder on
	        # the right. When the last chunk is already full (n == max_len) the
	        # right amount is -1, which presumably crops the final element --
	        # confirm whether the left padding is intentional.
	        last = torch.nn.functional.pad(last, (1, max_len - n - 1), value=0.0)
	        batches = batches[:-1] + (last,)
	        return torch.concat(batches)

	    def process(self, input: JobInput) -> str:
	        """Download the audio, transcribe it and return the text below the URL.

	        Raises:
	            TypeError: If no URL was stored by a previous ``match`` call.
	        """
	        if not isinstance(self.url, str):
	            raise TypeError("self.url must be a string")

	        config = get_config()
	        fname = download_yt_audio(self.url, max_length=config.max_yt_length)
	        audio = load_audio(fname, sampling_rate=config.sampling_rate)
	        inputs = self.processor(
	            audio,
	            return_tensors='pt',
	            sampling_rate=config.sampling_rate,
	            max_length=-1,
	        )
	        # Chunk length is twice the model's ``max_source_positions``.
	        batch = self.make_batch(
	            inputs['input_features'], max_len=2 * self.model.config.max_source_positions
	        )
	        generated_ids = self.model.generate(batch)
	        transcription = self.processor.batch_decode(
	            generated_ids, skip_special_tokens=True
	        )
	        # One decoded string per chunk, joined with single spaces.
	        return self.template.format(url=self.url, content=" ".join(transcription))