{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Document summarization\n",
    "\n",
    "Extract the text of a `.pdf`, `.txt`, `.doc` or `.docx` file, summarize it with `facebook/bart-large-cnn`, and detect the language of the resulting summary."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -U \"transformers[torch]\" gradio pypdf python-docx langdetect sentencepiece sacremoses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import docx\n",
    "import gradio as gr\n",
    "from langdetect import detect\n",
    "from pypdf import PdfReader\n",
    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Models and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "summarizer = pipeline(task=\"summarization\", model=\"facebook/bart-large-cnn\")\n",
    "translator_to_french = pipeline(\n",
    "    task=\"translation_en_to_fr\", model=\"Helsinki-NLP/opus-mt-en-fr\"\n",
    ")\n",
    "translator_to_english = pipeline(\n",
    "    task=\"translation_fr_to_en\", model=\"Helsinki-NLP/opus-mt-fr-en\"\n",
    ")\n",
    "\n",
    "# Upper bound on the number of extracted characters TextExtractor.get_text() accepts.\n",
    "MAX_FILE_SIZE = 10_000_000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TextExtractor:\n",
    "    \"\"\"Read a document from disk and expose its plain text.\n",
    "\n",
    "    Supported extensions are .pdf, .txt, .doc and .docx (matched\n",
    "    case-insensitively). The extracted text is cached on ``self.text``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, doc_location: str):\n",
    "        \"\"\"Store the path of the document to read.\n",
    "\n",
    "        Args:\n",
    "            doc_location: Path to the document.\n",
    "\n",
    "        Raises:\n",
    "            Exception: If ``doc_location`` is None.\n",
    "        \"\"\"\n",
    "        if doc_location is None:\n",
    "            raise Exception(\"Please select a PDF to summarize\")\n",
    "        self.doc_location = doc_location\n",
    "        # Populated by get_text(); None means nothing has been extracted yet.\n",
    "        self.text = None\n",
    "\n",
    "    def extract_text_from_pdf(self) -> str:\n",
    "        \"\"\"Return the text of every page of the PDF, separated by newlines.\"\"\"\n",
    "        reader = PdfReader(self.doc_location)\n",
    "        # `or \"\"` keeps a page with no extractable text from breaking the join;\n",
    "        # the newline separator stops words at page boundaries from fusing.\n",
    "        return \"\\n\".join(page.extract_text() or \"\" for page in reader.pages)\n",
    "\n",
    "    def extract_text_from_doc(self) -> str:\n",
    "        \"\"\"Return the document's paragraphs, each terminated by a newline.\"\"\"\n",
    "        doc = docx.Document(self.doc_location)\n",
    "        return \"\".join(paragraph.text + \"\\n\" for paragraph in doc.paragraphs)\n",
    "\n",
    "    def extract_text_from_txt(self) -> str:\n",
    "        \"\"\"Return the full contents of the file, decoded as UTF-8.\"\"\"\n",
    "        with open(self.doc_location, \"r\", encoding=\"utf-8\") as file:\n",
    "            return file.read()\n",
    "\n",
    "    def text_length(self) -> int:\n",
    "        \"\"\"Return the number of whitespace-separated words in the document.\n",
    "\n",
    "        Extracts the text first if get_text() has not been called yet.\n",
    "        \"\"\"\n",
    "        text = self.text if self.text is not None else self.get_text()\n",
    "        return len(text.split())\n",
    "\n",
    "    def get_text(self) -> str:\n",
    "        \"\"\"Extract, cache and return the document text.\n",
    "\n",
    "        Raises:\n",
    "            gr.Error: If the extension is unsupported, or the extracted text\n",
    "                is longer than MAX_FILE_SIZE characters.\n",
    "        \"\"\"\n",
    "        # Lower-case the extension so \"REPORT.PDF\" is treated like \"report.pdf\".\n",
    "        file_extension = os.path.splitext(self.doc_location)[1].lower()\n",
    "        if file_extension == \".pdf\":\n",
    "            self.text = self.extract_text_from_pdf()\n",
    "        elif file_extension == \".txt\":\n",
    "            self.text = self.extract_text_from_txt()\n",
    "        elif file_extension in (\".docx\", \".doc\"):\n",
    "            # NOTE(review): both extensions go through python-docx; confirm that\n",
    "            # legacy .doc files actually open with it.\n",
    "            self.text = self.extract_text_from_doc()\n",
    "        else:\n",
    "            raise gr.Error(\"We only support .pdf, .txt, .doc and .docx files\")\n",
    "\n",
    "        if len(self.text) > MAX_FILE_SIZE:\n",
    "            raise gr.Error(\n",
    "                f\"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters.\"\n",
    "            )\n",
    "\n",
    "        return self.text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summarize a sample document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_extractor = TextExtractor(\"data/doc-file-example.docx\")\n",
    "text = text_extractor.get_text()\n",
    "\n",
    "text_length = text_extractor.text_length()\n",
    "# Aim for about half the word count, but never request a zero-length summary.\n",
    "summary_length = max(1, text_length // 2)\n",
    "\n",
    "# truncation=True clips inputs longer than the model accepts instead of failing.\n",
    "summary = summarizer(\n",
    "    text, max_length=summary_length, do_sample=False, truncation=True\n",
    ")[0][\"summary_text\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "summary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Detect the summary language"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "detected_lang = detect(summary)\n",
    "detected_lang"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}