Omkar008 committed on
Commit
ba20ae6
·
verified ·
1 Parent(s): cff02ad

Update extract_and_store_supabase.py

Browse files
Files changed (1) hide show
  1. extract_and_store_supabase.py +238 -116
extract_and_store_supabase.py CHANGED
@@ -11,135 +11,257 @@ def extract_structure_store_message(user_id:str,message_id:str , attachment_id:s
11
  project_id = os.getenv('PROJECT_ID')
12
  processor_id = os.getenv('PROCESSOR_ID')
13
  document_entities = {}
14
-
15
  file_name = f"{message_id}_{attachment_id}.{attachment_extension}"
16
  print(f"file_name: {file_name}")
17
  supabase = Supabase_Client().instance
18
- try:
19
- response = supabase.storage.from_("receipt_radar").download(
20
- file_name
21
- )
22
- base64_data = urlsafe_b64encode(response).decode('utf-8')
23
-
24
- payload = {
25
- "skipHumanReview": True,
26
- "rawDocument": {
27
- "mimeType": f"application/{attachment_extension}",
28
- "content": base64_data
29
- }
30
- }
31
-
32
- access_token = get_access_token_v1()
33
- print(access_token)
34
-
35
- headers = {
36
- 'Authorization': f'Bearer {access_token}',
37
- 'Content-Type': 'application/json; charset=utf-8'
38
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- response = requests.post(
41
- f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
42
- headers=headers,
43
- json=payload
44
- )
45
- response_json = response.json()
46
- allowed_entities = [
47
- "due_date",
48
- "invoice_date",
49
- "total_amount",
50
- "total_tax_amount",
51
- "receiver_name",
52
- "invoice_id",
53
- "currency",
54
- "receiver_address",
55
- "invoice_type",
56
- "supplier_name",
57
- "payment_terms",
58
- "line_item",
59
- "line_item/description",
60
- "line_item/quantity",
61
- "line_item/amount",
62
- "line_item/unit_price"
63
- ]
64
- raw_text = response_json.get('document').get('text' , None)
65
- entities = response_json.get('document').get('entities' , None)
66
- document_entities['user_id'] = user_id
67
- insert_ocr_data_response = (
68
- supabase.table("receipt_ocr_data")
69
- .insert({'user_id':user_id , 'message_id':message_id,'receipt_text':raw_text ,'email':email,'file_type':attachment_extension})
70
- .execute()
71
- )
72
 
73
- print('Printing entities')
74
- print(entities)
75
- # if entities is not None:
76
- # for ent in entities:
77
- # if ent.get('type') is not None:
78
- # if ent.get('type') in allowed_entities:
79
- # mention_text = ent.get('mentionText')
80
- # normalised_values = ent.get('normalizedValue') if 'normalizedValue' in ent else None
81
- # document_entities[ent.get('type')] = {"mention_text":mention_text,"normalizedValue":normalised_values}
82
- if entities is not None:
83
- for ent in entities:
84
- if ent.get('type') is not None:
85
- entity_type = ent.get('type') or ""
86
 
87
- # Check if the entity type is in the allowed list
88
- if entity_type in allowed_entities:
89
- mention_text = ent.get('mentionText') or ""
90
- normalized_values = ent.get('normalizedValue') or ""
91
 
92
- # Initialize a list for the entity type if not already present
93
- if entity_type not in document_entities:
94
- document_entities[entity_type] = []
95
 
96
- # Append the entity data to the list
97
- document_entities[entity_type].append({
98
- "mention_text": mention_text,
99
- "normalizedValue": normalized_values
100
- })
101
 
102
- # Handling 'line_item' and its properties (line_item/description, line_item/quantity, etc.)
103
- if entity_type == 'line_item' and 'properties' in ent:
104
- for prop in ent['properties']:
105
- prop_type = prop.get('type') or ""
106
- if prop_type in allowed_entities:
107
- mention_text = prop.get('mentionText') or ""
108
- normalized_values = prop.get('normalizedValue') or ""
109
 
110
- # Initialize a list for the property type if not already present
111
- if prop_type not in document_entities:
112
- document_entities[prop_type] = []
113
 
114
- # Append the property data to the list
115
- document_entities[prop_type].append({
116
- "mention_text": mention_text,
117
- "normalizedValue": normalized_values
118
- })
119
- if 'line_item/description' in document_entities:
120
- document_entities['line_item_description'] = document_entities['line_item/description']
121
- document_entities.pop('line_item/description', None)
122
 
123
- if 'line_item/quantity' in document_entities:
124
- document_entities['line_item_quantity'] = document_entities['line_item/quantity']
125
- document_entities.pop('line_item/quantity', None)
126
 
127
- if 'line_item/amount' in document_entities:
128
- document_entities['line_item_amount'] = document_entities['line_item/amount']
129
- document_entities.pop('line_item/amount', None)
130
 
131
- if 'line_item/unit_price' in document_entities:
132
- document_entities['line_item_unit_price'] = document_entities['line_item/unit_price']
133
- document_entities.pop('line_item/unit_price', None)
134
- document_entities['email'] = email
135
- document_entities['message_id'] = message_id
136
- print(document_entities)
137
- insert_data_response = (
138
- supabase.table("document_ai_entities")
139
- .insert(document_entities)
140
- .execute()
141
- )
142
- print(insert_data_response)
143
 
144
- except Exception as e:
145
- print(f"Error downloading or encoding file: {e}")
 
11
  project_id = os.getenv('PROJECT_ID')
12
  processor_id = os.getenv('PROCESSOR_ID')
13
  document_entities = {}
14
+
15
  file_name = f"{message_id}_{attachment_id}.{attachment_extension}"
16
  print(f"file_name: {file_name}")
17
  supabase = Supabase_Client().instance
18
+
19
+ response = supabase.storage.from_("receipt_radar").download(file_name)
20
+ base64_data = urlsafe_b64encode(response).decode('utf-8')
21
+
22
+ payload = {
23
+ "skipHumanReview": True,
24
+ "rawDocument": {
25
+ "mimeType": f"application/{attachment_extension}",
26
+ "content": base64_data
 
 
 
 
 
 
 
 
 
 
 
27
  }
28
+ }
29
+
30
+ access_token = get_access_token_v1()
31
+ print(access_token)
32
+
33
+ headers = {
34
+ 'Authorization': f'Bearer {access_token}',
35
+ 'Content-Type': 'application/json; charset=utf-8'
36
+ }
37
+
38
+ response = requests.post(
39
+ f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
40
+ headers=headers,
41
+ json=payload
42
+ )
43
+ response_json = response.json()
44
+ allowed_entities = [
45
+ "due_date",
46
+ "invoice_date",
47
+ "total_amount",
48
+ "total_tax_amount",
49
+ "receiver_name",
50
+ "invoice_id",
51
+ "currency",
52
+ "receiver_address",
53
+ "invoice_type",
54
+ "supplier_name",
55
+ "payment_terms",
56
+ "line_item",
57
+ "line_item/description",
58
+ "line_item/quantity",
59
+ "line_item/amount",
60
+ "line_item/unit_price"
61
+ ]
62
+ raw_text = response_json.get('document').get('text', None)
63
+ entities = response_json.get('document').get('entities', None)
64
+ document_entities['user_id'] = user_id
65
+ insert_ocr_data_response = (
66
+ supabase.table("receipt_ocr_data")
67
+ .insert({'user_id': user_id, 'message_id': message_id, 'receipt_text': raw_text, 'email': email, 'file_type': attachment_extension})
68
+ .execute()
69
+ )
70
+
71
+ print('Printing entities')
72
+ print(entities)
73
+
74
+ if entities is not None:
75
+ for ent in entities:
76
+ if ent.get('type') is not None:
77
+ entity_type = ent.get('type') or ""
78
+
79
+ if entity_type in allowed_entities:
80
+ mention_text = ent.get('mentionText') or ""
81
+ normalized_values = ent.get('normalizedValue') or ""
82
+
83
+ if entity_type not in document_entities:
84
+ document_entities[entity_type] = []
85
+
86
+ document_entities[entity_type].append({
87
+ "mention_text": mention_text,
88
+ "normalizedValue": normalized_values
89
+ })
90
+
91
+ if entity_type == 'line_item' and 'properties' in ent:
92
+ for prop in ent['properties']:
93
+ prop_type = prop.get('type') or ""
94
+ if prop_type in allowed_entities:
95
+ mention_text = prop.get('mentionText') or ""
96
+ normalized_values = prop.get('normalizedValue') or ""
97
+
98
+ if prop_type not in document_entities:
99
+ document_entities[prop_type] = []
100
+
101
+ document_entities[prop_type].append({
102
+ "mention_text": mention_text,
103
+ "normalizedValue": normalized_values
104
+ })
105
+
106
+ if 'line_item/description' in document_entities:
107
+ document_entities['line_item_description'] = document_entities['line_item/description']
108
+ document_entities.pop('line_item/description', None)
109
+
110
+ if 'line_item/quantity' in document_entities:
111
+ document_entities['line_item_quantity'] = document_entities['line_item/quantity']
112
+ document_entities.pop('line_item/quantity', None)
113
+
114
+ if 'line_item/amount' in document_entities:
115
+ document_entities['line_item_amount'] = document_entities['line_item/amount']
116
+ document_entities.pop('line_item/amount', None)
117
+
118
+ if 'line_item/unit_price' in document_entities:
119
+ document_entities['line_item_unit_price'] = document_entities['line_item/unit_price']
120
+ document_entities.pop('line_item/unit_price', None)
121
+
122
+ document_entities['email'] = email
123
+ document_entities['message_id'] = message_id
124
+ print(document_entities)
125
+ insert_data_response = (
126
+ supabase.table("document_ai_entities")
127
+ .insert(document_entities)
128
+ .execute()
129
+ )
130
+ print(insert_data_response)
131
+
132
+ # if attachment_id and message_id:
133
+ # project_id = os.getenv('PROJECT_ID')
134
+ # processor_id = os.getenv('PROCESSOR_ID')
135
+ # document_entities = {}
136
+
137
+ # file_name = f"{message_id}_{attachment_id}.{attachment_extension}"
138
+ # print(f"file_name: {file_name}")
139
+ # supabase = Supabase_Client().instance
140
+ # try:
141
+ # response = supabase.storage.from_("receipt_radar").download(
142
+ # file_name
143
+ # )
144
+ # base64_data = urlsafe_b64encode(response).decode('utf-8')
145
+
146
+ # payload = {
147
+ # "skipHumanReview": True,
148
+ # "rawDocument": {
149
+ # "mimeType": f"application/{attachment_extension}",
150
+ # "content": base64_data
151
+ # }
152
+ # }
153
+
154
+ # access_token = get_access_token_v1()
155
+ # print(access_token)
156
+
157
+ # headers = {
158
+ # 'Authorization': f'Bearer {access_token}',
159
+ # 'Content-Type': 'application/json; charset=utf-8'
160
+ # }
161
 
162
+ # response = requests.post(
163
+ # f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
164
+ # headers=headers,
165
+ # json=payload
166
+ # )
167
+ # response_json = response.json()
168
+ # allowed_entities = [
169
+ # "due_date",
170
+ # "invoice_date",
171
+ # "total_amount",
172
+ # "total_tax_amount",
173
+ # "receiver_name",
174
+ # "invoice_id",
175
+ # "currency",
176
+ # "receiver_address",
177
+ # "invoice_type",
178
+ # "supplier_name",
179
+ # "payment_terms",
180
+ # "line_item",
181
+ # "line_item/description",
182
+ # "line_item/quantity",
183
+ # "line_item/amount",
184
+ # "line_item/unit_price"
185
+ # ]
186
+ # raw_text = response_json.get('document').get('text' , None)
187
+ # entities = response_json.get('document').get('entities' , None)
188
+ # document_entities['user_id'] = user_id
189
+ # insert_ocr_data_response = (
190
+ # supabase.table("receipt_ocr_data")
191
+ # .insert({'user_id':user_id , 'message_id':message_id,'receipt_text':raw_text ,'email':email,'file_type':attachment_extension})
192
+ # .execute()
193
+ # )
194
 
195
+ # print('Printing entities')
196
+ # print(entities)
197
+ # # if entities is not None:
198
+ # # for ent in entities:
199
+ # # if ent.get('type') is not None:
200
+ # # if ent.get('type') in allowed_entities:
201
+ # # mention_text = ent.get('mentionText')
202
+ # # normalised_values = ent.get('normalizedValue') if 'normalizedValue' in ent else None
203
+ # # document_entities[ent.get('type')] = {"mention_text":mention_text,"normalizedValue":normalised_values}
204
+ # if entities is not None:
205
+ # for ent in entities:
206
+ # if ent.get('type') is not None:
207
+ # entity_type = ent.get('type') or ""
208
 
209
+ # # Check if the entity type is in the allowed list
210
+ # if entity_type in allowed_entities:
211
+ # mention_text = ent.get('mentionText') or ""
212
+ # normalized_values = ent.get('normalizedValue') or ""
213
 
214
+ # # Initialize a list for the entity type if not already present
215
+ # if entity_type not in document_entities:
216
+ # document_entities[entity_type] = []
217
 
218
+ # # Append the entity data to the list
219
+ # document_entities[entity_type].append({
220
+ # "mention_text": mention_text,
221
+ # "normalizedValue": normalized_values
222
+ # })
223
 
224
+ # # Handling 'line_item' and its properties (line_item/description, line_item/quantity, etc.)
225
+ # if entity_type == 'line_item' and 'properties' in ent:
226
+ # for prop in ent['properties']:
227
+ # prop_type = prop.get('type') or ""
228
+ # if prop_type in allowed_entities:
229
+ # mention_text = prop.get('mentionText') or ""
230
+ # normalized_values = prop.get('normalizedValue') or ""
231
 
232
+ # # Initialize a list for the property type if not already present
233
+ # if prop_type not in document_entities:
234
+ # document_entities[prop_type] = []
235
 
236
+ # # Append the property data to the list
237
+ # document_entities[prop_type].append({
238
+ # "mention_text": mention_text,
239
+ # "normalizedValue": normalized_values
240
+ # })
241
+ # if 'line_item/description' in document_entities:
242
+ # document_entities['line_item_description'] = document_entities['line_item/description']
243
+ # document_entities.pop('line_item/description', None)
244
 
245
+ # if 'line_item/quantity' in document_entities:
246
+ # document_entities['line_item_quantity'] = document_entities['line_item/quantity']
247
+ # document_entities.pop('line_item/quantity', None)
248
 
249
+ # if 'line_item/amount' in document_entities:
250
+ # document_entities['line_item_amount'] = document_entities['line_item/amount']
251
+ # document_entities.pop('line_item/amount', None)
252
 
253
+ # if 'line_item/unit_price' in document_entities:
254
+ # document_entities['line_item_unit_price'] = document_entities['line_item/unit_price']
255
+ # document_entities.pop('line_item/unit_price', None)
256
+ # document_entities['email'] = email
257
+ # document_entities['message_id'] = message_id
258
+ # print(document_entities)
259
+ # insert_data_response = (
260
+ # supabase.table("document_ai_entities")
261
+ # .insert(document_entities)
262
+ # .execute()
263
+ # )
264
+ # print(insert_data_response)
265
 
266
+ # except Exception as e:
267
+ # print(f"Error downloading or encoding file: {e}")