import copy
from typing import Dict, List, Optional, Tuple

import fitz
import torch
from mdutils.mdutils import MdUtils
from tqdm import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

from src.text_extractor import TextExtractor


class Summarizer:
    def __init__(self, model_name: str):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.preprocess = TextExtractor()

    def extract_text(self, document: str) -> Dict[str, List[Tuple[str, str]]]:
        """Extract tagged text from a slide PDF, grouped per slide."""
        doc = fitz.open(document)
        self.filename = doc.name.split('/')[-1].split('.')[0]
        font_counts, styles = self.preprocess.get_font_info(doc, granularity=False)
        size_tag = self.preprocess.get_font_tags(font_counts, styles)
        texts = self.preprocess.assign_tags(doc, size_tag)
        slide_content = self.preprocess.get_slides(texts)
        return slide_content

    def __call__(self, slides: Dict[str, List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, str]]]:
        """Summarize every paragraph-tagged block; header blocks are left untouched."""
        summarized_slides = copy.deepcopy(slides)
        for contents in tqdm(summarized_slides.values()):
            for idx, (tag, content) in enumerate(contents):
                if tag.startswith('p'):
                    try:
                        inputs = self.tokenizer(
                            content, truncation=True, padding="longest", return_tensors="pt"
                        ).to(self.device)
                        summary_ids = self.model.generate(**inputs)
                        summary = self.tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0]
                        contents[idx] = (tag, summary)
                    except Exception as e:
                        print(f"Summarization failed: {e}")
        return summarized_slides

    def convert2markdown(self, summarized_slides: Dict[str, List[Tuple[str, str]]],
                         target_path: Optional[str] = None) -> str:
        """Write the summarized slides to a Markdown file, separated by '---' rules."""
        filename = target_path if target_path else self.filename
        mdFile = MdUtils(file_name=filename)
        for sections in summarized_slides.values():
            mdFile.new_line('---\n')
            for tag, content in sections:
                if tag.startswith('h'):
                    try:
                        mdFile.new_header(level=int(tag[1]), title=content)
                    except Exception:
                        # Skip malformed header tags instead of aborting the conversion.
                        continue
                elif tag.startswith('p'):  # match the paragraph check used in __call__
                    # The original split on an empty separator, which is invalid in Python;
                    # splitting on line breaks is assumed here.
                    for line in content.splitlines():
                        mdFile.new_line(f"{line}\n")
        markdown = mdFile.create_md_file()
        return markdown

    def remove_leading_empty_lines(self, file_path: str) -> None:
        """Drop blank lines that precede the first non-empty line of the file."""
        with open(file_path, 'r') as file:
            lines = file.readlines()

        non_empty_lines = []
        found_first_word = False
        for line in lines:
            if not found_first_word and line.strip():
                found_first_word = True
            if found_first_word:
                non_empty_lines.append(line)

        with open(file_path, 'w') as file:
            file.writelines(non_empty_lines)
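

# Example usage (a minimal sketch, not part of the original module): the checkpoint
# "google/pegasus-xsum" and the input path "slides.pdf" are assumptions chosen for
# illustration; any Pegasus checkpoint and PDF slide deck should work.
if __name__ == "__main__":
    summarizer = Summarizer("google/pegasus-xsum")      # assumed checkpoint
    slides = summarizer.extract_text("slides.pdf")      # assumed input path
    summaries = summarizer(slides)
    summarizer.convert2markdown(summaries)
    # MdUtils writes to "<filename>.md" when no target_path is given.
    summarizer.remove_leading_empty_lines(f"{summarizer.filename}.md")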