Spaces:
Runtime error
Runtime error
File size: 3,485 Bytes
28ad2d2 9f2dd14 53eb8d1 9f2dd14 e2f7af9 9f2dd14 e0427a6 9f2dd14 e0427a6 9f2dd14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
from typing import Dict, List, Tuple, Optional
from tqdm import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from src.text_extractor import TextExtractor
from mdutils.mdutils import MdUtils
import torch
import fitz
import copy
class Summarizer():
def __init__(self, model_name: str):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device)
self.preprocess = TextExtractor()
def extract_text(self, document: object) -> Dict[str, List[Tuple[str, str]]]:
doc = fitz.open(document)
self.filename = doc.name.split('/')[-1].split('.')[0]
font_counts, styles = self.preprocess.get_font_info(doc, granularity=False)
size_tag = self.preprocess.get_font_tags(font_counts, styles)
texts = self.preprocess.assign_tags(doc, size_tag)
slide_content = self.preprocess.get_slides(texts)
return slide_content
def __call__(self, slides: Dict[str, List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, str]]]:
summarized_slides = copy.deepcopy(slides)
for page, contents in tqdm(summarized_slides.items()):
for idx, (tag, content) in enumerate(contents):
if tag.startswith('p'):
try:
input = self.tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(self.device)
tensor = self.model.generate(**input)
summary = self.tokenizer.batch_decode(tensor, skip_special_tokens=True)[0]
contents[idx] = (tag, summary)
except Exception as e:
print(f"Summarization Fails, Error: {e}")
return summarized_slides
def convert2markdown(self, summarized_slides: Dict[str, List[Tuple[str, str]]], target_path: Optional[str]=None) -> str:
filename = self.filename
if target_path:
filename = target_path
mdFile = MdUtils(file_name=filename)
for k, v in summarized_slides.items():
mdFile.new_line('---\n')
for section in v:
tag = section[0]
content = section[1]
if tag.startswith('h'):
try:
mdFile.new_header(level=int(tag[1]), title=content)
except:
continue
if tag == 'p':
contents = content.split('<n>')
for content in contents:
mdFile.new_line(f"{content}\n")
markdown = mdFile.create_md_file()
return markdown
def remove_leading_empty_lines(self, file_path) -> None:
with open(file_path, 'r') as file:
lines = file.readlines()
non_empty_lines = []
found_first_word = False
for line in lines:
stripped_line = line.strip()
if stripped_line and not found_first_word:
found_first_word = True
if found_first_word or stripped_line:
non_empty_lines.append(line)
with open(file_path, 'w') as file:
file.writelines(non_empty_lines)
return
|