omkar334 committed on
Commit
df32b2c
1 Parent(s): 0e8d6fe

clean_text and activities

Browse files
Files changed (1) hide show
  1. preprocessing.py +43 -4
preprocessing.py CHANGED
@@ -1,4 +1,5 @@
1
- from collections import defaultdict
 
2
 
3
  import pymupdf
4
 
@@ -34,6 +35,15 @@ def majority_element(spans, param):
34
  return max(char_count, key=char_count.get, default=None)
35
 
36
 
 
 
 
 
 
 
 
 
 
37
  def get_chunks(doc):
38
  allchunks = []
39
 
@@ -63,7 +73,7 @@ def get_chunks(doc):
63
  if text.strip():
64
  chunks.append(
65
  {
66
- "text": text.strip(),
67
  "page": page_num,
68
  "x": block["bbox"][0],
69
  "y": block["bbox"][1],
@@ -77,7 +87,36 @@ def get_chunks(doc):
77
  return allchunks
78
 
79
 
80
- def embed_pdf(path):
81
- doc = pymupdf.open(path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  chunks = get_chunks(doc)
 
83
  return chunks
 
1
+ import re
2
+ from collections import OrderedDict, defaultdict
3
 
4
  import pymupdf
5
 
 
35
  return max(char_count, key=char_count.get, default=None)
36
 
37
 
38
def clean_text(text):
    """Normalise whitespace in ``text`` and drop repeated words.

    The text is split on whitespace, each distinct word is kept only at its
    first occurrence (original order preserved), and the remaining words are
    re-joined with single spaces.

    Note that the de-duplication is global, not limited to adjacent
    repeats: ``"the cat and the dog"`` becomes ``"the cat and dog"``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty when ``text`` is empty or whitespace-only.
    """
    # ``split()`` with no argument already discards leading/trailing
    # whitespace and collapses runs of it, so joining on a single space
    # needs no further regex normalisation or stripping.
    return " ".join(OrderedDict.fromkeys(text.split()))
45
+
46
+
47
  def get_chunks(doc):
48
  allchunks = []
49
 
 
73
  if text.strip():
74
  chunks.append(
75
  {
76
+ "text": clean_text(text.strip()),
77
  "page": page_num,
78
  "x": block["bbox"][0],
79
  "y": block["bbox"][1],
 
87
  return allchunks
88
 
89
 
90
def process_activities(chunks):
    """Fold each "Activity" chunk together with the chunks that follow it.

    Scans ``chunks`` from the start. Whenever a chunk's ``"text"`` contains
    the substring ``"Activity"``, the ``"size"`` of the chunk immediately
    after it is taken as the reference, and every consecutive following
    chunk with that same ``"size"`` is appended to the activity's text
    (separated by newlines). The activity chunk and the absorbed chunks are
    then replaced in the list by a single merged chunk, which is a shallow
    copy of the activity chunk with the combined text.

    Args:
        chunks: List of chunk dicts; each must have ``"text"`` and ``"size"``
            keys. The list is modified in place.

    Returns:
        The same ``chunks`` list object, after merging.
    """
    pos = 0
    while pos < len(chunks):
        current = chunks[pos]
        if "Activity" not in current["text"]:
            pos += 1
            continue

        end = pos + 1
        # The reference size comes from the chunk right after the activity
        # chunk; when there is none, nothing can be absorbed.
        reference_size = chunks[end]["size"] if end < len(chunks) else None

        pieces = [current["text"]]
        while end < len(chunks) and chunks[end]["size"] == reference_size:
            pieces.append(chunks[end]["text"])
            end += 1

        # Work on a copy so the original activity dict is left untouched.
        merged = current.copy()
        merged["text"] = "\n".join(pieces)

        # Collapse the activity chunk plus everything absorbed into one entry.
        chunks[pos:end] = [merged]
        pos += 1

    return chunks
113
+
114
+
115
def embed_pdf(path, buffer=False):
    """Open a PDF, extract its chunks, and merge activity chunks.

    Args:
        path: Handed to ``pymupdf.open``. When ``buffer`` is false it is
            passed positionally; when ``buffer`` is true it is passed as
            the ``stream=`` argument with ``filetype="pdf"``.
        buffer: Selects which of the two ``pymupdf.open`` call forms is used.

    Returns:
        The result of ``process_activities`` applied to the output of
        ``get_chunks`` for the opened document.
    """
    if buffer:
        source = pymupdf.open(stream=path, filetype="pdf")
    else:
        source = pymupdf.open(path)

    # Use the document as a context manager so it is closed even if
    # extraction raises; previously the handle was never released.
    # NOTE(review): assumes get_chunks returns plain data that does not
    # need the document to stay open — confirm against get_chunks.
    with source as doc:
        chunks = get_chunks(doc)

    return process_activities(chunks)