Davidsamuel101 committed on
Commit
57da257
1 Parent(s): a88643a

Added Description

Browse files
Files changed (2) hide show
  1. src/summarizer.py +1 -1
  2. src/text_extractor.py +34 -10
src/summarizer.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Tuple, Optional
2
  from tqdm import tqdm
3
  from transformers import PegasusForConditionalGeneration, PegasusTokenizer
4
  from src.text_extractor import TextExtractor
 
1
+ 0from typing import Dict, List, Tuple, Optional
2
  from tqdm import tqdm
3
  from transformers import PegasusForConditionalGeneration, PegasusTokenizer
4
  from src.text_extractor import TextExtractor
src/text_extractor.py CHANGED
@@ -9,9 +9,9 @@ class TextExtractor:
9
  pass
10
 
11
  @staticmethod
12
- def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
13
  """
14
- This function return the fonts information inside the pdf such as size and type.
15
 
16
  Args:
17
  doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
@@ -21,7 +21,8 @@ class TextExtractor:
21
  ValueError: Raises Value Error if there are no font detected
22
 
23
  Returns:
24
- Tuple[List[Tuple[str, int]], Dict[str, Dict]]: _description_
 
25
  """
26
  styles = {}
27
  font_counts = {}
@@ -38,16 +39,17 @@ class TextExtractor:
38
  return font_counts, styles
39
 
40
  @staticmethod
41
- def get_font_tags(font_counts, styles):
42
  """
43
- _summary_
44
 
45
  Args:
46
- font_counts (_type_): _description_
47
- styles (_type_): _description_
48
 
49
  Returns:
50
- _type_: _description_
 
51
  """
52
  p_size = styles[font_counts[0][0]]['size']
53
  # sorting the font sizes high to low, so that we can append the right integer to each tag
@@ -61,7 +63,7 @@ class TextExtractor:
61
  return size_tag
62
 
63
  @staticmethod
64
- def assign_tags(doc, size_tag):
65
  """
66
  Scrapes headers & paragraphs from PDF and return texts with element tags.
67
 
@@ -70,6 +72,9 @@ class TextExtractor:
70
  size_tag (dict): Textual element tags for each size.
71
  Returns:
72
  list: Texts with prepended element tags
 
 
 
73
  """
74
  texts = []
75
  previous_s = {}
@@ -100,6 +105,24 @@ class TextExtractor:
100
 
101
  @staticmethod
102
  def get_slides(texts):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  slides = {}
104
  section = []
105
  page = 1
@@ -130,4 +153,5 @@ class TextExtractor:
130
  page += 1
131
  except:
132
  continue
133
- return slides
 
 
9
  pass
10
 
11
  @staticmethod
12
+ def get_font_info(doc: Iterator, granularity=False) -> List[Tuple[str, int]]:
13
  """
14
+ Return a list containing the font sizes and their count number.
15
 
16
  Args:
17
  doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
 
21
  ValueError: Raises Value Error if there are no font detected
22
 
23
  Returns:
24
+ List[Tuple[str, int]]:
25
+ Font Counts: [('12.0', 266), ('16.020000457763672', 18), ('13.979999542236328', 7), ('7.019999980926514', 2)]
26
  """
27
  styles = {}
28
  font_counts = {}
 
39
  return font_counts, styles
40
 
41
  @staticmethod
42
+ def get_font_tags(font_counts, styles) -> Dict[int, str]:
43
  """
44
+ Return a dictionary of font sizes and their corresponding tags.
45
 
46
  Args:
47
+ font_counts (List[Tuple[str, int]]): Pairs of a font size and its occurrence count.
48
+ styles (Dict[int, Dict[str, str]]): A style description of every font size.
49
 
50
  Returns:
51
+ Dict[int, str]: Dictionary of the font sizes as keys and their tags as values.
52
+ Example: {12.0: '<p>', 16.020000457763672: '<h1>', 13.979999542236328: '<h2>', 7.019999980926514: '<s4>'}
53
  """
54
  p_size = styles[font_counts[0][0]]['size']
55
  # sorting the font sizes high to low, so that we can append the right integer to each tag
 
63
  return size_tag
64
 
65
  @staticmethod
66
+ def assign_tags(doc, size_tag) -> List[str]:
67
  """
68
  Scrapes headers & paragraphs from PDF and return texts with element tags.
69
 
 
72
  size_tag (dict): Textual element tags for each size.
73
  Returns:
74
  list: Texts with prepended element tags
75
+ Examples: ['<h1>Group Members: |', '<p>1. Stella Shania Mintara - 2301860596
76
+ | 2. David Samuel - 2301850304 | 3. Egivenia - 2301850134 | 4. Aurelius Va
77
+ nnes Leander - 2301862102 | 5. Juanrico Alvaro - 2301847316 ||']
78
  """
79
  texts = []
80
  previous_s = {}
 
105
 
106
  @staticmethod
107
  def get_slides(texts):
108
+ """
109
+ Converts the tagged texts into a slide-format dictionary where the page is the
110
+ key and the value is a list containing the components of that page.
111
+
112
+ Args:
113
+ texts (List[str]): PDF text with element tags.
114
+
115
+ Returns:
116
+ Dict: The text of the PDF separated by the header 1 tags.
117
+ Examples: {'Page 1': [('h1', 'Group Members:'),
118
+ ['p', '1. Stella Shania Mintara - 2301860596 2. David Samuel -
119
+ 2301850304 3. Egivenia - 2301850134 4. Aurelius Vannes Leander -
120
+ 2301862102 5.
121
+ Juanrico Alvaro - 2301847316']],
122
+ 'Page 2': [('h1', 'Case Problem'),
123
+ ['p', 'FreshMart is an established large-scale supermarket with branc
124
+ hes in popular areas across Jakarta and big cities']]}
125
+ """
126
  slides = {}
127
  section = []
128
  page = 1
 
153
  page += 1
154
  except:
155
  continue
156
+ return slides
157
+