In [None]:
! pip install -U "transformers[torch]" pypdf python-docx langdetect sentencepiece sacremoses

In [None]:
import gradio as gr
from transformers import pipeline
from pypdf import PdfReader
import docx
import os
from langdetect import detect

In [None]:
# Hugging Face pipelines are created once at module level so that every later
# cell reuses the same loaded models.
summarizer = pipeline(model="facebook/bart-large-cnn", task="summarization")

# English -> French translation.
translator_to_french = pipeline(
    model="Helsinki-NLP/opus-mt-en-fr",
    task="translation_en_to_fr",
)

# French -> English translation.
translator_to_english = pipeline(
    model="Helsinki-NLP/opus-mt-fr-en",
    task="translation_fr_to_en",
)

# Largest amount of extracted text accepted for processing.
MAX_FILE_SIZE = 10_000_000

In [None]:
class TextExtractor:
    """Extract plain text from a PDF, Word or plain-text document.

    The reader is selected from the file extension of ``doc_location``
    (matched case-insensitively). The text returned by the last successful
    extraction in :meth:`get_text` is kept on ``self.text``.
    """

    def __init__(self, doc_location: str):
        """Store the path of the document to read.

        Args:
            doc_location: Path of the document on disk.

        Raises:
            ValueError: If ``doc_location`` is ``None``.
        """
        if doc_location is None:
            raise ValueError("Please select a PDF to summarize")
        self.doc_location = doc_location
        # Populated by get_text(); None means nothing has been extracted yet.
        self.text = None

    def extract_text_from_pdf(self):
        """Return the text of every page of the PDF, concatenated in order."""
        reader = PdfReader(self.doc_location)
        # `or ""` guards against a page whose extraction yields None/empty so
        # that a single unreadable page cannot abort the whole document.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def extract_text_from_doc(self):
        """Return the document's paragraphs, each terminated by a newline."""
        # NOTE(review): get_text() also routes legacy ".doc" files here;
        # confirm that python-docx can actually open that format.
        doc = docx.Document(self.doc_location)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    def extract_text_from_txt(self):
        """Return the full contents of the file, decoded as UTF-8."""
        with open(self.doc_location, "r", encoding="utf-8") as file:
            return file.read()

    def text_length(self):
        """Return the number of whitespace-separated words in the document.

        If no text has been extracted yet, :meth:`get_text` is called first,
        so this method can raise the same errors as :meth:`get_text`.
        """
        # getattr keeps this safe even on instances built without __init__.
        if getattr(self, "text", None) is None:
            self.get_text()
        return len(self.text.split())

    def get_text(self) -> str:
        """Extract, cache and return the text of the document.

        Returns:
            The extracted text, which is also stored on ``self.text``.

        Raises:
            gr.Error: If the extension is not .pdf, .txt, .doc or .docx, or
                if the extracted text is longer than ``MAX_FILE_SIZE``
                characters.
        """
        # Lower-case the extension so that e.g. "REPORT.PDF" is accepted.
        file_extension = os.path.splitext(self.doc_location)[1].lower()
        if file_extension == ".pdf":
            self.text = self.extract_text_from_pdf()
        elif file_extension == ".txt":
            self.text = self.extract_text_from_txt()
        elif file_extension in (".docx", ".doc"):
            self.text = self.extract_text_from_doc()
        else:
            raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")

        # The limit applies to the extracted character count, not to the
        # on-disk size of the file.
        if len(self.text) > MAX_FILE_SIZE:
            raise gr.Error(
                f"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters."
            )

        return self.text

In [None]:
# Demo run: extract the text of the sample document and summarize it.
text_extractor = TextExtractor("data/doc-file-example.docx")
text = text_extractor.get_text()

# Aim for a summary about half as long as the source, measured in words.
# NOTE(review): max_length is presumably counted in model tokens rather than
# words, so this is only an approximation -- confirm against the pipeline docs.
text_length = text_extractor.text_length()
summary_length = text_length // 2

# truncation=True clips inputs that exceed the model's maximum input length
# instead of letting the call fail on long documents.
summary = summarizer(
    text,
    max_length=summary_length,
    do_sample=False,
    truncation=True,
)[0]["summary_text"]

In [None]:
# Bare expression: shows the generated summary as this cell's output.
summary

In [None]:
# Run langdetect's detect() on the summary text and keep its result.
# NOTE(review): langdetect results may vary between runs unless its seed is
# fixed -- confirm against the langdetect documentation.
detected_lang = detect(summary)
# Bare expression: shows the detected value as this cell's output.
detected_lang