Spaces:

Davidsamuel101
/

PPTGenerator

Runtime error

App Files Files Community

Davidsamuel101 committed on Jul 23, 2023

Commit

57da257

•

1 Parent(s): a88643a

Added Description

Browse files

Files changed (2) hide show

src/summarizer.py +1 -1
src/text_extractor.py +34 -10

src/summarizer.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, List, Tuple, Optional
 from tqdm import tqdm
 from transformers import PegasusForConditionalGeneration, PegasusTokenizer
 from src.text_extractor import TextExtractor

+from typing import Dict, List, Tuple, Optional
 from tqdm import tqdm
 from transformers import PegasusForConditionalGeneration, PegasusTokenizer
 from src.text_extractor import TextExtractor

src/text_extractor.py CHANGED Viewed

@@ -9,9 +9,9 @@ class TextExtractor:
         pass
     @staticmethod
-    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
         """
-        This function return the fonts information inside the pdf such as size and type.
         Args:
             doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
@@ -21,7 +21,8 @@ class TextExtractor:
             ValueError: Raises Value Error if there are no font detected
         Returns:
-            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: _description_
         """
         styles = {}
         font_counts = {}
@@ -38,16 +39,17 @@ class TextExtractor:
         return font_counts, styles
     @staticmethod
-    def get_font_tags(font_counts, styles):
         """
-        _summary_
         Args:
-            font_counts (_type_): _description_
-            styles (_type_): _description_
         Returns:
-            _type_: _description_
         """
         p_size = styles[font_counts[0][0]]['size']
         # sorting the font sizes high to low, so that we can append the right integer to each tag
@@ -61,7 +63,7 @@ class TextExtractor:
         return size_tag
     @staticmethod
-    def assign_tags(doc, size_tag):
         """
         Scrapes headers & paragraphs from PDF and return texts with element tags.
@@ -70,6 +72,9 @@ class TextExtractor:
             size_tag (dict): Textual element tags for each size.
         Returns:
             list: Texts with prepended element tags
         """
         texts = []
         previous_s = {}
@@ -100,6 +105,24 @@ class TextExtractor:
     @staticmethod
     def get_slides(texts):
         slides = {}
         section = []
         page = 1
@@ -130,4 +153,5 @@ class TextExtractor:
                         page += 1
                 except:
                     continue
-        return slides

         pass
     @staticmethod
+    def get_font_info(doc: Iterator, granularity=False) -> List[Tuple[str, int]]:
         """
+        Return a list containing the font sizes and their count number.
         Args:
             doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
             ValueError: Raises Value Error if there are no font detected
         Returns:
+            List[Tuple[str, int]]:
+                Font Counts: [('12.0', 266), ('16.020000457763672', 18), ('13.979999542236328', 7), ('7.019999980926514', 2)]
         """
         styles = {}
         font_counts = {}
         return font_counts, styles
     @staticmethod
+    def get_font_tags(font_counts, styles) -> Dict[int, str]:
         """
+        Return a dictionary of font sizes and their corresponding tags.
         Args:
+            font_counts (List[Tuple[str, int]]): The font sizes as keys and their count as values
+            styles (Dict[int, Dict[str, str]]): A style description of every font size.
         Returns:
+            Dict[int, str]: Dictionary of the font sizes as keys and their tags as values.
+            Example: {12.0: '<p>', 16.020000457763672: '<h1>', 13.979999542236328: '<h2>', 7.019999980926514: '<s4>'}
         """
         p_size = styles[font_counts[0][0]]['size']
         # sorting the font sizes high to low, so that we can append the right integer to each tag
         return size_tag
     @staticmethod
+    def assign_tags(doc, size_tag) -> List[str]:
         """
         Scrapes headers & paragraphs from PDF and return texts with element tags.
             size_tag (dict): Textual element tags for each size.
         Returns:
             list: Texts with prepended element tags
+            Examples: ['<h1>Group Members: |', '<p>1. Stella Shania Mintara - 2301860596
+            | 2. David Samuel - 2301850304 | 3.   Egivenia - 2301850134 | 4. Aurelius Va
+            nnes Leander - 2301862102 | 5. Juanrico Alvaro - 2301847316 ||']
         """
         texts = []
         previous_s = {}
     @staticmethod
     def get_slides(texts):
+        """
+        Converts the tagged texts into a slide-format dictionary where the page is the
+        key and the value is a list containing the components of that page.
+        Args:
+            texts (List[str]): PDF text with element tags.
+        Returns:
+            Dict: The text of the PDF separated by the header 1 tags.
+            Examples: {'Page 1': [('h1', 'Group Members:'),
+                    ['p', '1. Stella Shania Mintara - 2301860596 2. David Samuel -
+                    2301850304 3. Egivenia - 2301850134 4. Aurelius Vannes Leander -
+                    2301862102 5.
+                    Juanrico Alvaro - 2301847316']],
+                    'Page 2': [('h1', 'Case Problem'),
+                    ['p', 'FreshMart is an established large-scale supermarket with branc
+                    hes in popular areas across Jakarta and big cities']]}
+        """
         slides = {}
         section = []
         page = 1
                         page += 1
                 except:
                     continue
+        return slides