Spaces:
Sleeping
Sleeping
from io import BytesIO
from pathlib import Path
from typing import Union

from PIL import Image
from pydantic import BaseModel

import categories
import processing
import extract
def categorize_and_parse_text(text: str) -> BaseModel:
    """Categorize the text, then parse it with the chain for that category.

    Args:
        text(str): The text to categorize and parse information from.

    Returns: Whatever ``categories.run_category_chain`` produces for the
        detected category and the given text (annotated as a pydantic
        ``BaseModel``).
    """
    # The category picked by categorize_text selects which chain runs.
    return categories.run_category_chain(
        categories.categorize_text(text),
        text,
    )
def process_pdf(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given PDF file and extracts information from it.

    The embedded text layer is tried first; if it yields almost nothing,
    the pages are rendered to images and run through OCR instead.

    Args:
        filename(Path): The PDF file to process.
        extract_only(bool): If True, skip categorization and return the
            extracted text as-is.

    Returns: The extracted text (str) when ``extract_only`` is True,
        otherwise the result of ``categorize_and_parse_text`` on that text.
    """
    # Fewer meaningful characters than this means the text layer is
    # treated as unusable.
    min_embedded_chars = 20

    with open(filename, "rb") as f:
        pdf_bytes = f.read()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))

    # If the encoded text is too short, a pdf scanner probably added a
    # watermark. Surrounding whitespace is ignored so that a text layer made
    # of blank lines only still triggers the OCR fallback.
    if len(text.strip()) < min_embedded_chars:
        # Try to extract text from images
        images = processing.preprocess_pdf_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)

    if extract_only:
        return text

    return categorize_and_parse_text(text)
def process_image(filename: Path, extract_only=False) -> Union[BaseModel, str]:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.
        extract_only(bool): If True, skip categorization and return the
            OCR text as-is.

    Returns: The OCR text (str) when ``extract_only`` is True, otherwise
        the result of ``categorize_and_parse_text`` on that text.
    """
    # The context manager guarantees the opened file is released even when
    # preprocessing or OCR raises.
    with Image.open(filename) as source:
        prepared = processing.preprocess_image(source)
        try:
            text = extract.extract_text_from_image_pyocr_tesseract(prepared)
        finally:
            # preprocess_image may hand back a new image object or the same
            # one; close a distinct result here, the `with` closes `source`.
            if prepared is not source:
                prepared.close()

    if extract_only:
        return text

    return categorize_and_parse_text(text)
if __name__ == "__main__":
    # Demo run: process the bundled example PDF and print the parsed
    # result as JSON indented by four spaces.
    print(process_pdf(Path("examples/example1.pdf")).json(indent=4))