from typing import Dict, List, Tuple, Optional
from tqdm import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from src.text_extractor import TextExtractor
from mdutils.mdutils import MdUtils

import torch
import fitz
import copy

class Summarizer:
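    """Summarize slide-style PDF documents with a Pegasus model and export the result to Markdown."""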
    def __init__(self, model_name: str):
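        """Load the Pegasus tokenizer and model named by `model_name`, placing the model on GPU when one is available."""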
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.preprocess = TextExtractor()
    
    def extract_text(self, document: str) -> Dict[str, List[Tuple[str, str]]]:
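        """Open the PDF at `document`, remember its base filename, and return the tag-annotated slide content per page."""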
        doc = fitz.open(document)
        self.filename = doc.name.split('/')[-1].split('.')[0]
        font_counts, styles = self.preprocess.get_font_info(doc, granularity=False)
        size_tag = self.preprocess.get_font_tags(font_counts, styles)
        texts = self.preprocess.assign_tags(doc, size_tag)
        slide_content = self.preprocess.get_slides(texts)
        return slide_content

    def __call__(self, slides: Dict[str, List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, str]]]:
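        """Summarize every paragraph-tagged block in `slides` and return a deep copy with the summaries substituted in."""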
        summarized_slides = copy.deepcopy(slides)
        for page, contents in tqdm(summarized_slides.items()):
            for idx, (tag, content) in enumerate(contents):
                if tag.startswith('p'):  # only paragraph-tagged text is summarized; headings stay verbatim
                    try:
                        inputs = self.tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(self.device)
                        tensor = self.model.generate(**inputs)
                        summary = self.tokenizer.batch_decode(tensor, skip_special_tokens=True)[0]
                        contents[idx] = (tag, summary)
                    except Exception as e:
                        print(f"Summarization failed, error: {e}")

        return summarized_slides
    
    def convert2markdown(self, summarized_slides: Dict[str, List[Tuple[str, str]]], target_path: Optional[str] = None) -> str:
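        """Write the summarized slides to a Markdown file, named after the source PDF unless `target_path` is given."""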
        filename = self.filename
        if target_path:
            filename = target_path
        mdFile = MdUtils(file_name=filename)
        for k, v in summarized_slides.items():
            mdFile.new_line('---\n')
            for section in v:
                tag = section[0]
                content = section[1]
                if tag.startswith('h'):
                    try:
                        mdFile.new_header(level=int(tag[1]), title=content)
                    except Exception:
                        continue
                elif tag == 'p':
                    # paragraphs use '<n>' as a line separator; emit each piece on its own line
                    for chunk in content.split('<n>'):
                        mdFile.new_line(f"{chunk}\n")
        markdown = mdFile.create_md_file()
        return markdown
    
    def remove_leading_empty_lines(self, file_path) -> None:
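        """Remove any blank lines that precede the first non-empty line of the file at `file_path`."""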
        with open(file_path, 'r') as file:
            lines = file.readlines()

        non_empty_lines = []
        found_first_word = False

        for line in lines:
            stripped_line = line.strip()
            if stripped_line and not found_first_word:
                found_first_word = True
            if found_first_word:
                non_empty_lines.append(line)

        with open(file_path, 'w') as file:
            file.writelines(non_empty_lines)
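
if __name__ == "__main__":
    # Usage sketch under stated assumptions: the checkpoint name and file paths below
    # are illustrative only, not values defined elsewhere in this repository.
    summarizer = Summarizer("google/pegasus-xsum")

    # Extract tag-annotated slide text from a PDF, summarize the paragraph blocks,
    # write the Markdown file, and strip any leading blank lines it starts with.
    slides = summarizer.extract_text("slides.pdf")
    summarized = summarizer(slides)
    summarizer.convert2markdown(summarized, target_path="slides_summary")
    summarizer.remove_leading_empty_lines("slides_summary.md")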