Omkar008 commited on
Commit
b2e9bf4
1 Parent(s): b19687e

Upload 22 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11.5-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY . /app
6
+
7
+ RUN pip install -r requirements.txt
8
+
9
+ RUN useradd -m -u 1000 user
10
+
11
+ USER user
12
+
13
+ # Copy the rest of the application code into the container at /app
14
+ COPY --chown=user . /app/
15
+
16
+
17
+ # Command to run your application
18
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860" , "--workers" , "5"]
__init__.py ADDED
File without changes
controllers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
controllers/gmail_controller.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+ import jwt
6
+ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
7
+ from cryptography.hazmat.backends import default_backend
8
+ import os
9
+ import hashlib
10
+
11
+ class GmailDataExtractor:
12
+
13
+ def __init__(self,jwt:str , user_input: str = None) -> None:
14
+ if jwt is None :
15
+ self.error = "Error"
16
+ else:
17
+ self.__jwt = jwt
18
+ self.__user_input = user_input
19
+ self.error = None
20
+ self.__secret_key = 'nkldjlncbamjlklwjeklwu24898h*&#Ujnfjf34893U5HSJFBSKFSHFNSK*$*W_ 3OWU'
21
+
22
+ def __validate_jwt_token(self):
23
+ try:
24
+ payload = jwt.decode(self.jwt, self.secret_key, algorithms=["HS256"])
25
+ access_token = payload.get("access_token")
26
+ if access_token:
27
+ return access_token
28
+ else:
29
+ raise ValueError("Invalid JWT token: Missing access token")
30
+ except jwt.ExpiredSignatureError:
31
+ raise ValueError("Invalid JWT token: Expired token")
32
+ except jwt.InvalidTokenError:
33
+ raise ValueError("Invalid JWT token: Token verification failed")
34
+
35
+ def __fetch_messages(self) -> list:
36
+ """
37
+ Fetches messages from the Gmail API.
38
+ Args:
39
+ gmail_url (str): The URL for the Gmail API request.
40
+ access_token (str): The access token for authenticating with Gmail API.
41
+ Returns:
42
+ list: A list of message objects retrieved from the Gmail API.
43
+ Raises:
44
+ RuntimeError: If there is an issue while fetching messages from the Gmail API.
45
+
46
+ """
47
+
48
+ """currently not implementing jwt for testing purposes
49
+ replace every access_token with jwt function directly which returns the access token"""
50
+ access_token = self.__jwt
51
+ print("access token")
52
+ print(access_token)
53
+ receipt_query = f'(subject:"your order" OR subject:receipts OR subject:receipt OR subject: aankoopbon OR subject:reçu OR subject:invoice OR subject:invoice OR category:purchases)'
54
+ # if self.__user_input is not None:
55
+ # receipt_query = f'((subject:"your order" OR subject:receipts OR subject:receipt OR subject:invoice OR subject:invoice OR category:purchases) AND subject:{self.__user_input})&maxResults=15'
56
+ gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={receipt_query}&maxResults=10"
57
+ gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
58
+ gmail_data = gmail_response.json()
59
+ messages=[]
60
+ messages.extend(gmail_data.get("messages",[]))
61
+ # def __fetch_page(url):
62
+ # response = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
63
+ # response.raise_for_status() # Raise error if the request fails
64
+ # data = response.json()
65
+ # return data.get("messages", []), data.get("nextPageToken")
66
+
67
+ # messages = []
68
+ # page_token = None
69
+ # try:
70
+ # while True:
71
+ # url = f"{gmail_url}&pageToken={page_token}" if page_token else gmail_url
72
+ # page_messages, page_token = __fetch_page(url)
73
+ # messages.extend(page_messages)
74
+ # if not page_token:
75
+ # break
76
+ # except requests.RequestException as e:
77
+ # raise RuntimeError(f"Error fetching messages from Gmail API: {str(e)}")
78
+
79
+ print(len(messages))
80
+ return messages
81
+
82
+ def __fetch_message_data(self, message_id: str) -> dict:
83
+ """
84
+ Fetches message data from the Gmail API.
85
+ Args:
86
+ message_id (str): The ID of the message to fetch.
87
+ Returns:
88
+ dict: Message data retrieved from the Gmail API.
89
+ Raises:
90
+ RuntimeError: If there is an issue while fetching message data from the Gmail API.
91
+ """
92
+ print("fetch_message_data")
93
+ message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
94
+ try:
95
+ response = requests.get(message_url, headers={"Authorization": f"Bearer {self.__jwt}"})
96
+ response.raise_for_status() # Raise error if the request fails
97
+ return response.json()
98
+ except requests.RequestException as e:
99
+ raise RuntimeError(f"Error fetching message data from Gmail API: {str(e)}")
100
+
101
+ def __fetch_attachment_data(self, message_id: str, attachment_id: str) -> dict:
102
+ """
103
+ Fetches attachment data from the Gmail API.
104
+ Args:
105
+ message_id (str): The ID of the message containing the attachment.
106
+ attachment_id (str): The ID of the attachment to fetch.
107
+ Returns:
108
+ dict: Attachment data retrieved from the Gmail API.
109
+ Raises:
110
+ RuntimeError: If there is an issue while fetching attachment data from the Gmail API.
111
+ """
112
+ print("fetch_attachment_data")
113
+ attachment_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}/attachments/{attachment_id}"
114
+ try:
115
+ response = requests.get(attachment_url, headers={"Authorization": f"Bearer {self.__jwt}"})
116
+ response.raise_for_status() # Raise error if the request fails
117
+ return response.json()
118
+ except requests.RequestException as e:
119
+ raise RuntimeError(f"Error fetching attachment data from Gmail API: {str(e)}")
120
+
121
+ def __process_message(self, message: dict) -> tuple:
122
+ """
123
+ Processes a single message.
124
+ Args:
125
+ message (dict): The message to process.
126
+ Returns:
127
+ tuple: A tuple containing the subject (str), body (str), links (list of str),
128
+ and base64 data if it contains an document attachment in the form of pdf, docx, ppt or any file format indicating whether the message contains an attachment.
129
+ Raises:
130
+ RuntimeError: If there is an issue while fetching message data from the Gmail API.
131
+ """
132
+ print("process_messages")
133
+ message_id = message.get("id")
134
+ # encrypted_message_id = self.encrypt_message_id(message_id)
135
+ if not message_id:
136
+ return None, None, [], False
137
+ subject=''
138
+ message_data = self.__fetch_message_data(message_id)
139
+ if 'payload' in message_data and 'headers' in message_data['payload']:
140
+ headers = message_data['payload']['headers']
141
+ for header in headers:
142
+ if header['name'] == 'Subject':
143
+ subject = header['value']
144
+
145
+
146
+ body = ''
147
+ text=''
148
+ links = []
149
+ has_attachment = False
150
+ company_from_gmail = 'others'
151
+
152
+ if 'payload' in message_data and 'parts' in message_data['payload']:
153
+ parts = message_data['payload']['parts']
154
+ payload = message_data['payload']['headers']
155
+ print("printing headers response")
156
+ print(payload)
157
+
158
+ #Extracting the domain name from the senders email
159
+ for fromdata in payload:
160
+ if fromdata['name'] == 'From':
161
+ company_from_gmail = self.extract_domain_from_email(fromdata['value'])
162
+ break
163
+ if 'chanel' in subject.lower():
164
+ company_from_gmail = 'chanel'
165
+ if 'louis vuitton' in subject.lower():
166
+ company_from_gmail = 'Louis Vuitton'
167
+
168
+ for part in parts:
169
+ if 'mimeType' not in part:
170
+ continue
171
+
172
+ mime_type = part['mimeType']
173
+
174
+ if mime_type == 'text/plain' or mime_type == 'text/html':
175
+ body_data = part['body'].get('data', '')
176
+ body = base64.urlsafe_b64decode(body_data)
177
+ text= self.extract_text(body)
178
+
179
+ if 'body' in part and 'attachmentId' in part['body']:
180
+ attachment_id = part['body']['attachmentId']
181
+ attachment_data = self.__fetch_attachment_data(message_id, attachment_id)
182
+ data = attachment_data.get("data", "")
183
+ filename = part.get("filename", "untitled.txt")
184
+
185
+
186
+ if data:
187
+ # Save only the first 10 characters of the attachment data
188
+ return subject,text ,{"filename":filename , "data":data} , company_from_gmail , message_id
189
+
190
+ return subject, text,None , company_from_gmail , message_id
191
+
192
+ def encrypt_message_id(self,message_id:str):
193
+ key = os.getenv('AES_KEY').encode('utf-8')[:32]
194
+ message_id_bytes = message_id.encode('utf-8')
195
+ iv = os.urandom(16)
196
+ # Initialize AES cipher with the key and CBC mode
197
+ cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
198
+
199
+ # Create a encryptor object
200
+ encryptor = cipher.encryptor()
201
+
202
+ # Pad the message_id to be a multiple of 16 bytes (AES block size)
203
+ # This is necessary for AES encryption
204
+ message_id_padded = message_id_bytes.ljust(32, b'\0')
205
+
206
+ # Encrypt the padded message_id
207
+ ciphertext = encryptor.update(message_id_padded) + encryptor.finalize()
208
+ return ciphertext
209
+
210
+
211
+ def extract_domain_from_email(self,email_string):
212
+ # Extracting the email address using regex
213
+ email_address = re.search(r'[\w\.-]+@[\w\.-]+', email_string).group()
214
+
215
+ # Extracting the domain name from the email address
216
+ domain = email_address.split('@')[-1].split('.')[0]
217
+ if email_address and domain :
218
+ return domain
219
+ else:
220
+ return None
221
+
222
+
223
+ def extract_text(self,html_content:str):
224
+ """
225
+ Extracts text and links from HTML content.
226
+ Args:
227
+ html_content (str): The HTML content to process.
228
+ Returns:
229
+ tuple: A tuple containing the extracted text (str) and links (list of tuples).
230
+ Raises:
231
+ ValueError: If the input HTML content is empty or None.
232
+ """
233
+ if not html_content:
234
+ raise ValueError("HTML content is empty or None")
235
+
236
+ soup = BeautifulSoup(html_content, 'html.parser')
237
+
238
+ # Extract text
239
+ text = soup.get_text(separator=' ')
240
+ text = re.sub(r'\s+', ' ', text).strip()
241
+ print("Printing the extracted text from the html")
242
+ print(text)
243
+ print()
244
+ print()
245
+ # Extract links
246
+ links = [(link.text, link['href']) for link in soup.find_all('a', href=True)]
247
+
248
+ return text
249
+
250
+ def extract_messages(self) -> dict:
251
+ """
252
+ Extracts messages based on the provided brand name.
253
+ Args:
254
+ brand_name (str): The brand name to search for in email subjects.
255
+ jwt_token (str): The JWT token for authentication.
256
+ Returns:
257
+ dict: A dictionary containing the extracted messages with their subjects, bodies, links, and attachment statuses.
258
+ format:{"results":[{"subjec":"test subject" , "body":"it would be text" , "attachment_data":{"filename":base64URL format}},{second message with same content of subject , body , attachment_data}]}
259
+ """
260
+ print("entered the extract messages")
261
+ messages = self.__fetch_messages()
262
+ results = []
263
+ for message in messages:
264
+ subject, body, attachment_data , company_name , encrypt_mssg_id = self.__process_message(message)
265
+
266
+ """ Handling None values """
267
+ body = body if body is not None else ''
268
+ attachment_data = attachment_data if attachment_data is not None else {}
269
+ company_associated = company_name if company_name is not None else ''
270
+ en_msg_id = encrypt_mssg_id if encrypt_mssg_id is not None else None
271
+
272
+ results.append({"body": body, "attachment_data": [attachment_data] ,'company_associated':company_associated , "message_id":en_msg_id})
273
+
274
+ return {"results": results}
controllers/ner_ai_controller.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from services.chat_client_NER import ChatClient
2
+
3
+ def get_brand_from_query(query:str):
4
+ chat = ChatClient().create(conversation=[])
5
+ response = chat.send_message(content=f"{query}", stream=False)
6
+ return response.text
controllers/websocket_controller.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import base64
3
+ import requests
4
+ import asyncio
5
+ from fastapi import WebSocket
6
+ from services import utils as util
7
+ import re
8
+ from bs4 import BeautifulSoup
9
+
10
+ async def send_chunked_data(websocket: WebSocket, filename: str, data: str ,company_associated:str , message_id:str):
11
+ chunk_size = 2000 # Set an appropriate chunk size
12
+ for i in range(0, len(data), chunk_size):
13
+ await websocket.send_json({"filename": filename, "data_chunk": data[i:i + chunk_size]})
14
+ await asyncio.sleep(0.4)
15
+ await websocket.send_json({"company_associated":company_associated , "message_id":message_id})
16
+ await websocket.send_text("FinishedThisAttachment")
17
+
18
+ async def send_chunked_data_without_attch(websocket: WebSocket,body_text:str,message_id:str , company_associated:str):
19
+ chunk_size = 2000 # Set an appropriate chunk size
20
+ await websocket.send_text("This message does'nt contain an Attachment")
21
+ for i in range(0, len(body_text), chunk_size):
22
+ await websocket.send_json({"data_chunk": body_text[i:i + chunk_size]})
23
+ await asyncio.sleep(0.4)
24
+ await websocket.send_json({"company_associated":company_associated , "message_id":message_id})
25
+ await websocket.send_text("FinishedThisAttachmentnotContainingAttachment")
26
+
27
+ async def process_messages(access_token: str, websocket: WebSocket):
28
+ logging.info("Entered process_messages")
29
+ messages = get_messages(access_token)
30
+ await websocket.send_json({"total_messages": len(messages)})
31
+ await websocket.send_text("CompletedSendingTotalMessagesLength")
32
+
33
+ for message in messages:
34
+ message_id = message.get("id")
35
+ if message_id:
36
+ message_data = fetch_message_data(access_token, message_id)
37
+ await process_message_data(access_token,message_data, websocket,message_id)
38
+
39
+ await websocket.send_text("CompletedFetchingMessages")
40
+
41
+ async def websocket_main(code: str, websocket: WebSocket):
42
+ logging.info("Entered mwebsocket_main")
43
+ access_token = code
44
+ await process_messages(access_token, websocket)
45
+ logging.info("Completed Fetching all the messages")
46
+ websocket.close()
47
+
48
+ def get_messages(code: str):
49
+ logging.info("Entered get_messages")
50
+ access_token = code
51
+ page_token = None
52
+ messages = []
53
+ jobs_query = f'subject:"your order" OR subject:receipts OR subject:receipt OR subject: aankoopbon OR subject:reçu OR subject:invoice OR subject:invoice OR category:purchases'
54
+ max_results = 10
55
+ while True:
56
+ gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={jobs_query}&maxResults={max_results}"
57
+ if page_token:
58
+ gmail_url += f"&pageToken={page_token}"
59
+
60
+ gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
61
+ logging.info(f"{gmail_response}")
62
+ gmail_data = gmail_response.json()
63
+
64
+ if "messages" in gmail_data:
65
+ messages.extend(gmail_data["messages"])
66
+ # if len(messages) 10:
67
+ # break
68
+ if "nextPageToken" in gmail_data:
69
+ page_token = gmail_data["nextPageToken"]
70
+ else:
71
+ break
72
+ logging.info("Total Length:")
73
+
74
+ logging.info(len(messages))
75
+ return messages
76
+
77
+ def fetch_message_data(access_token: str, message_id: str):
78
+ logging.info(f"Entered fetch_message_data for message_id: {message_id}")
79
+ message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
80
+ message_response = requests.get(message_url, headers={"Authorization": f"Bearer {access_token}"})
81
+ return message_response.json()
82
+
83
+ async def process_message_data(access_token:str,message_data: dict, websocket: WebSocket,message_id:str):
84
+ logging.info("Entered process_message_data")
85
+ subject=''
86
+ body_base64 = ''
87
+ body_html=''
88
+ body_text = ''
89
+ compnay_from_mail = 'others'
90
+ #Extracting subject
91
+
92
+ subject = extract_subject_from_mail(message_data)
93
+
94
+ company_from_mail = extract_domain_name(message_data['payload']['headers'],subject)
95
+
96
+ if "payload" in message_data and "parts" in message_data["payload"]:
97
+ #Extracting the domain name from the senders email
98
+
99
+
100
+ for part in message_data["payload"]["parts"]:
101
+ if 'mimeType' not in part:
102
+ continue
103
+
104
+ mime_type = part['mimeType']
105
+
106
+ if mime_type == 'text/plain' or mime_type == 'text/html':
107
+ body_data = part['body'].get('data', '')
108
+ body_base64 = base64.urlsafe_b64decode(body_data)
109
+ body_text = extract_text(body_base64)
110
+
111
+ if "body" in part and "attachmentId" not in part["body"]:
112
+ await process_mail_body_data(websocket , body_text , message_id , company_from_mail)
113
+
114
+
115
+ if "body" in part and "attachmentId" in part["body"]:
116
+ attachment_id = part["body"]["attachmentId"]
117
+ attachment_data = fetch_attachment_data(access_token, message_data["id"], attachment_id)
118
+ body_text=''
119
+ await process_attachment_data(part, attachment_data, websocket,company_from_mail ,message_id)
120
+
121
+
122
+ async def process_attachment_data(part: dict, attachment_data: dict, websocket: WebSocket,company_associated:str,message_id:str):
123
+ logging.info("Entered process_attachment_data")
124
+ filename = part.get("filename", "untitled.txt")
125
+ data = attachment_data.get("data", {})
126
+ if data:
127
+ attachment_content = base64.urlsafe_b64decode(data)
128
+ extracted_text = await util.extract_text_from_attachment(filename, attachment_content)
129
+ logging.info(f"Extracted text from attachment {filename}: {extracted_text}")
130
+ await send_chunked_data(websocket, filename, data , company_associated ,message_id)
131
+
132
+ async def process_mail_body_data(websocket:WebSocket ,body_text : str, message_id:str,company_associated:str):
133
+ await send_chunked_data_without_attch(websocket,body_text,message_id,company_associated)
134
+
135
+ def fetch_attachment_data(access_token: str, message_id: str, attachment_id: str):
136
+ logging.info(f"Entered fetch_attachment_data for attachment_id: {attachment_id}")
137
+ attachment_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}/attachments/{attachment_id}"
138
+ attachment_response = requests.get(attachment_url, headers={"Authorization": f"Bearer {access_token}"})
139
+ return attachment_response.json()
140
+
141
+
142
+ def extract_subject_from_mail(message_data: dict):
143
+ if 'payload' in message_data and 'headers' in message_data['payload']:
144
+ headers = message_data['payload']['headers']
145
+ for header in headers:
146
+ if header['name'] == 'Subject':
147
+ return header['value']
148
+ # If 'Subject' header is not found, return a default value or handle it gracefully
149
+ return ""
150
+ else:
151
+ # If 'payload' or 'headers' are not present, return a default value or handle it gracefully
152
+ return ""
153
+
154
+
155
+ def extract_domain_name(payload:dict,subject:str):
156
+ domain_name = 'others'
157
+ for fromdata in payload:
158
+ if fromdata['name'] == 'From':
159
+ domain_name = extract_domain_from_email(fromdata['value'])
160
+ break
161
+ if 'chanel' in subject.lower():
162
+ return 'chanel'
163
+ if 'louis vuitton' in subject.lower():
164
+ return'Louis Vuitton'
165
+ return domain_name
166
+
167
+
168
+ def extract_domain_from_email(email_string:str):
169
+ # Extracting the email address using regex
170
+ email_address = re.search(r'[\w\.-]+@[\w\.-]+', email_string).group()
171
+
172
+ # Extracting the domain name from the email address
173
+ domain = email_address.split('@')[-1].split('.')[0]
174
+ if email_address and domain :
175
+ return domain
176
+ else:
177
+ return None
178
+
179
+ def extract_text(html_content:str):
180
+ """
181
+ Extracts text and links from HTML content.
182
+ Args:
183
+ html_content (str): The HTML content to process.
184
+ Returns:
185
+ tuple: A tuple containing the extracted text (str) and links (list of tuples).
186
+ Raises:
187
+ ValueError: If the input HTML content is empty or None.
188
+ """
189
+ if not html_content:
190
+ raise ValueError("HTML content is empty or None")
191
+
192
+ soup = BeautifulSoup(html_content, 'html.parser')
193
+
194
+ # Extract text
195
+ text = soup.get_text(separator=' ')
196
+ text = re.sub(r'\s+', ' ', text).strip()
197
+ print("Printing the extracted text from the html")
198
+ print(text)
199
+ print()
200
+ print()
201
+ # Extract links
202
+ links = [(link.text, link['href']) for link in soup.find_all('a', href=True)]
203
+
204
+ return text
205
+
controllers/ws_controller.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import logging
4
+ import re
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from typing import Optional, List, Dict
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from models.models import Message, Attachment
10
+ from fastapi import WebSocket
11
+ from services import utils as ut
12
+ import asyncio
13
+
14
+ def get_company_type(company_name:str)->str:
15
+ company_types_dict ={'ao yun': 'wines and spirit', 'ardbeg': 'wines and spirit', 'belvedere': 'wines and spirit', 'bodega numanthia': 'wines and spirit', 'chandon': 'wines and spirit', 'château cheval blanc': 'wines and spirit', "château d'yquem": 'wines and spirit', 'château galoupet': 'wines and spirit', 'cheval des andes': 'wines and spirit', 'clos19': 'wines and spirit', 'cloudy bay': 'wines and spirit', 'colgin cellars': 'wines and spirit', 'dom pérignon': 'wines and spirit', 'domaine des lambrays': 'wines and spirit', 'eminente': 'wines and spirit', 'glenmorangie': 'wines and spirit', 'hennessy': 'wines and spirit', 'joseph phelps': 'wines and spirit', 'krug': 'wines and spirit', 'mercier': 'wines and spirit', 'moët & chandon': 'wines and spirit', 'newton vineyard': 'wines and spirit', 'ruinart': 'wines and spirit', 'terrazas de los andes': 'wines and spirit', 'veuve clicquot': 'wines and spirit', 'volcan de mi tierra': 'wines and spirit', 'woodinville': 'wines and spirit' , 'berluti': 'Fashion & Leather Goods', 'celine': 'Fashion & Leather Goods', 'christian dior': 'Fashion & Leather Goods', 'emilio pucci': 'Fashion & Leather Goods', 'fendi': 'Fashion & Leather Goods', 'givenchy': 'Fashion & Leather Goods', 'kenzo': 'Fashion & Leather Goods', 'loewe': 'Fashion & Leather Goods', 'loro piana': 'Fashion & Leather Goods', 'louis vuitton': 'Fashion & Leather Goods', 'marc jacobs': 'Fashion & Leather Goods', 'moynat': 'Fashion & Leather Goods', 'patou': 'Fashion & Leather Goods', 'rimowa': 'Fashion & Leather Goods','acqua di parma': 'Perfumes & Cosmetics', 'benefit cosmetics': 'Perfumes & Cosmetics', 'cha ling': 'Perfumes & Cosmetics', 'fenty beauty by rihanna': 'Perfumes & Cosmetics', 'fresh': 'Perfumes & Cosmetics', 'givenchy parfums': 'Perfumes & Cosmetics', 'guerlain': 'Perfumes & Cosmetics', 'kenzo parfums': 'Perfumes & Cosmetics', 'kvd beauty': 'Perfumes & Cosmetics', 'loewe perfumes': 'Perfumes & Cosmetics', 'maison francis kurkdjian': 'Perfumes & Cosmetics', 'make up for ever': 'Perfumes & Cosmetics', 'officine universelle buly': 'Perfumes & Cosmetics', 'olehenriksen': 'Perfumes & Cosmetics', 'parfums christian dior': 'Perfumes & Cosmetics', 'stella by stella mccartney': 'Perfumes & Cosmetics','bulgari': 'Watches & Jewelry', 'chaumet': 'Watches & Jewelry', 'fred': 'Watches & Jewelry', 'hublot': 'Watches & Jewelry', 'repossi': 'Watches & Jewelry', 'tag heuer': 'Watches & Jewelry', 'tiffany & co.': 'Watches & Jewelry', 'zenith': 'Watches & Jewelry','24s': 'Selective retailing', 'dfs': 'Selective retailing', 'la grande epicerie de paris': 'Selective retailing', 'le bon marché rive gauche': 'Selective retailing', 'sephora': 'Selective retailing','belmond': 'Other activities', 'cheval blanc': 'Other activities', 'connaissance des arts': 'Other activities', 'cova': 'Other activities', 'investir': 'Other activities', "jardin d'acclimatation": 'Other activities', 'le parisien': 'Other activities', 'les echos': 'Other activities', 'radio classique': 'Other activities', 'royal van lent': 'Other activities'}
16
+ print(company_types_dict["louis vuitton"])
17
+ return company_types_dict.get(company_name.lower(), 'Others')
18
+
19
+ async def get_messages(code: str,websocket:WebSocket,brand_name: Optional[str] = None):
20
+ access_token = code
21
+ g_query = f'(subject:"your order" OR subject:receipts OR subject:receipt OR subject:aankoopbon OR subject:reçu OR subject:invoice OR subject:invoices OR category:purchases) has:attachment'
22
+ if brand_name is not None:
23
+ g_query = f'(subject:"your order" OR subject:receipts OR subject:receipt OR subject: aankoopbon OR subject:reçu OR subject:invoice OR subject:invoices OR category:purchases OR from:{brand_name}) AND subject:{brand_name} has:attachment'
24
+ page_token = None
25
+ messages = []
26
+ # max_results = 10
27
+ # gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={jobs_query}&maxResults={max_results}"
28
+ # gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
29
+ # gmail_data = gmail_response.json()
30
+ # messages.append(gmail_data['messages'])
31
+ def fetch_message_wrapper(message_data):
32
+ message_id = message_data.get("id")
33
+ if message_id:
34
+ return fetch_message_data(access_token, message_id)
35
+
36
+ return None
37
+
38
+ while True:
39
+ gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={g_query}"
40
+ if page_token:
41
+ gmail_url += f"&pageToken={page_token}"
42
+
43
+ gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
44
+ gmail_data = gmail_response.json()
45
+ print(len(gmail_data))
46
+ print(gmail_data)
47
+
48
+ if "messages" in gmail_data:
49
+ with ThreadPoolExecutor(max_workers=15) as executor:
50
+
51
+
52
+
53
+ futures=[executor.submit(fetch_message_wrapper, message_data) for message_data in
54
+ gmail_data["messages"]]
55
+ for future in futures:
56
+ message = future.result()
57
+ if message:
58
+ messages.append(message)
59
+ for message_data in messages:
60
+ await process_message(message_data,websocket,10000)
61
+
62
+ if "nextPageToken" in gmail_data:
63
+ page_token = gmail_data["nextPageToken"]
64
+ else:
65
+ break
66
+ print("printing messages")
67
+ print(messages)
68
+ return messages
69
+
70
+ async def process_message(message:Message, websocket:WebSocket, chunk_size:int):
71
+ logging.info("process_message")
72
+ if message:
73
+ message_json = message.to_json()
74
+ logging.info(f"{message_json}")
75
+ await send_message_in_chunks(websocket, message_json, chunk_size)
76
+ await websocket.send_text("NEXT_MESSAGE")
77
+
78
+
79
+ def fetch_message_data(access_token: str, message_id: str) -> Message:
80
+ message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
81
+ message_response = requests.get(message_url, headers={"Authorization": f"Bearer {access_token}"})
82
+ message_data = message_response.json()
83
+ # print(message_data)
84
+ subject = extract_subject_from_mail(message_data)
85
+ company_from_mail = extract_domain_name(message_data['payload']['headers'], subject)
86
+
87
+ body = extract_body_from_mail(message_data)
88
+
89
+ attachments,structed_attachment_data = extract_attachments_from_mail(access_token, message_data)
90
+ high_level_company_type = get_company_type(company_from_mail)
91
+ # structed_attachment_data = extract_json_from_attachments(access_token , message_data)
92
+
93
+
94
+ body_len = 0
95
+ if body is not None :
96
+ body_len = len(body)
97
+
98
+ # print("subject: ")
99
+ # print(subject)
100
+ # print("company name: ")
101
+ # print(company_from_mail)
102
+ # print("Printing the body of the mail: ")
103
+ # print(body)
104
+ # print("Printing attachment Data: ")
105
+ # print(attachments)
106
+ # print("Completed this mail.")
107
+ return Message(message_id=message_id, body_len=body_len,body=body, attachments=attachments, company=company_from_mail,high_level_company_type=high_level_company_type,structured_data = structed_attachment_data)
108
+
109
+
110
+
111
+ def extract_subject_from_mail(message_data: dict) -> str:
112
+ if 'payload' in message_data and 'headers' in message_data['payload']:
113
+ headers = message_data['payload']['headers']
114
+ for header in headers:
115
+ if header['name'] == 'Subject':
116
+ return header['value']
117
+ return ""
118
+ else:
119
+ return ""
120
+
121
+
122
+ def extract_domain_name(payload: dict, subject: str) -> str:
123
+ domain_name = 'others'
124
+ for fromdata in payload:
125
+ if fromdata['name'] == 'From':
126
+ domain_name = extract_domain_from_email(fromdata['value'])
127
+ break
128
+ if 'chanel' in subject.lower():
129
+ return 'chanel'
130
+ if 'louis vuitton' in subject.lower():
131
+ return 'Louis Vuitton'
132
+ return domain_name
133
+
134
+
135
+ def extract_domain_from_email(email_string: str) -> Optional[str]:
136
+ email_address = re.search(r'[\w\.-]+@[\w\.-]+', email_string).group()
137
+ domain = email_address.split('@')[-1].split('.')[0]
138
+ if email_address and domain:
139
+ return domain
140
+ else:
141
+ return None
142
+
143
+
144
+ # def extract_body_from_mail(message_data: dict) -> str:
145
+ # body = None
146
+ # if "payload" in message_data and "parts" in message_data["payload"]:
147
+ # for part in message_data["payload"]["parts"]:
148
+ # if 'mimeType' in part and (part['mimeType'] == 'text/plain' or part['mimeType'] == 'text/html'):
149
+ # body_data = part['body'].get('data', '')
150
+ # body_base64 = base64.urlsafe_b64decode(body_data)
151
+ # body = extract_text(body_base64)
152
+ # return body
153
+
154
+
155
+ def extract_body_from_mail(message_data: dict) -> str:
156
+ body = None
157
+ if "payload" in message_data:
158
+ payload = message_data["payload"]
159
+ if "parts" in payload:
160
+ for part in payload["parts"]:
161
+ if 'mimeType' in part and (part['mimeType'] == 'text/plain' or part['mimeType'] == 'text/html'):
162
+ body_data = part['body'].get('data', '')
163
+ if body_data:
164
+ body_base64 = base64.urlsafe_b64decode(body_data)
165
+ body = extract_text(body_base64)
166
+
167
+ elif 'body' in payload:
168
+ body_data = payload['body'].get('data', '')
169
+ if body_data:
170
+ body_base64 = base64.urlsafe_b64decode(body_data)
171
+ body = extract_text(body_base64)
172
+ elif 'parts' in payload['body']:
173
+ for part in payload['body']['parts']:
174
+ if 'mimeType' in part and (part['mimeType'] == 'text/plain' or part['mimeType'] == 'text/html'):
175
+ body_data = part['body'].get('data', '')
176
+ if body_data:
177
+ body_base64 = base64.urlsafe_b64decode(body_data)
178
+ body = extract_text(body_base64)
179
+
180
+ if not body:
181
+ body = message_data.get('snippet', '')
182
+ return body
183
+
184
+
185
+ def fetch_attachment_data(access_token: str, message_id: str, attachment_id: str) -> Dict:
186
+ attachment_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}/attachments/{attachment_id}"
187
+ attachment_response = requests.get(attachment_url, headers={"Authorization": f"Bearer {access_token}"})
188
+ return attachment_response.json()
189
+
190
+
191
+ def extract_attachments_from_mail(access_token: str, message_data: dict) -> List[Attachment]:
192
+ attachments = []
193
+ structured_data = []
194
+ if "payload" in message_data and "parts" in message_data["payload"]:
195
+ for part in message_data["payload"]["parts"]:
196
+ if "body" in part and "attachmentId" in part["body"]:
197
+ attachment_id = part["body"]["attachmentId"]
198
+ attachment_data = fetch_attachment_data(access_token, message_data["id"], attachment_id)
199
+ filename = part.get("filename", "untitled.txt")
200
+ data = attachment_data.get("data", "")
201
+ raw_text=ut.extract_text_from_attachment(filename , data)
202
+ struct_data = ut.strcuture_document_data(raw_text)
203
+ if struct_data:
204
+ structured_data.append(struct_data)
205
+
206
+ attachments.append(Attachment(attachment_len = len(attachment_data.get("data", "")),filename=filename, data=attachment_data.get("data", "")))
207
+ return attachments,structured_data
208
+
209
+
210
+ def extract_text(html_content: str) -> str:
211
+ if not html_content:
212
+ raise ValueError("HTML content is empty or None")
213
+
214
+ soup = BeautifulSoup(html_content, 'html.parser')
215
+ text = soup.get_text(separator=' ')
216
+ text = re.sub(r'\s+', ' ', text).strip()
217
+ return text
218
+
219
+
220
+ async def websocket_main(code: str, websocket: WebSocket,brand_name: Optional[str] = None):
221
+ access_token = code
222
+ # messages = get_messages(access_token,websocket,brand_name)
223
+ await get_messages(access_token,websocket,brand_name)
224
+ # print("websocket_main")
225
+ # print(messages)
226
+ # # logging.info(f"brand_name:{brand_name}")
227
+ # await websocket.send_json({"total_messages": len(messages)})
228
+ # print("Total Length of messages")
229
+ # print(len(messages))
230
+ # chunk_size = 100000
231
+ # i=0
232
+ # for message in messages:
233
+ # message_json = message.to_json()
234
+
235
+ # logging.info(f"{i} th message")
236
+ # i=i+1
237
+ # await send_message_in_chunks(websocket, message_json, chunk_size)
238
+ # await websocket.send_text("NEXT_MESSAGE")
239
+
240
+
241
+ await websocket.close()
242
+
243
+
244
+ async def send_message_in_chunks(websocket: WebSocket, message_json: dict, chunk_size: int):
245
+
246
+ # if message_json['attachments'] is not None :
247
+ # for attch in message_json['attachments']:
248
+ # attachment_len = attch['attachment_len']
249
+
250
+
251
+ # print(body_len)
252
+ # print(attachment_len)
253
+ # if attachment_len == 0:
254
+ # attachment_len = None
255
+ # await websocket.send_json({"body_len":body_len ,"attachment_len":attachment_len})
256
+
257
+ message_str = json.dumps(message_json)
258
+ # print("Printing message_str")
259
+ # print(message_str)
260
+ # logging.info(message_str)
261
+ # await websocket.send_json({"file_len":len(file)})
262
+ for i in range(0, len(message_str), chunk_size):
263
+ await websocket.send_text(message_str[i:i + chunk_size])
data_extraction_classes/get_gmail_data.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+ import jwt
6
+ class GmailDataExtractor:
7
+
8
+ def __init__(self,jwt:str , user_input: str = None) -> None:
9
+ if jwt is None :
10
+ self.error = "Error"
11
+ else:
12
+ self.__jwt = jwt
13
+ self.__user_input = user_input
14
+ self.error = None
15
+ self.__secret_key = 'nkldjlncbamjlklwjeklwu24898h*&#Ujnfjf34893U5HSJFBSKFSHFNSK*$*W_ 3OWU'
16
+
17
+ def __validate_jwt_token(self):
18
+ try:
19
+ payload = jwt.decode(self.jwt, self.secret_key, algorithms=["HS256"])
20
+ access_token = payload.get("access_token")
21
+ if access_token:
22
+ return access_token
23
+ else:
24
+ raise ValueError("Invalid JWT token: Missing access token")
25
+ except jwt.ExpiredSignatureError:
26
+ raise ValueError("Invalid JWT token: Expired token")
27
+ except jwt.InvalidTokenError:
28
+ raise ValueError("Invalid JWT token: Token verification failed")
29
+
30
+ def __fetch_messages(self) -> list:
31
+ """
32
+ Fetches messages from the Gmail API.
33
+
34
+ Args:
35
+ gmail_url (str): The URL for the Gmail API request.
36
+ access_token (str): The access token for authenticating with Gmail API.
37
+
38
+ Returns:
39
+ list: A list of message objects retrieved from the Gmail API.
40
+
41
+ Raises:
42
+ RuntimeError: If there is an issue while fetching messages from the Gmail API.
43
+
44
+ """
45
+
46
+ """currently not implementing jwt for testing purposes
47
+ replace every access_token with jwt function directly which returns the access token"""
48
+ access_token = self.__jwt
49
+ print("access token")
50
+ print(access_token)
51
+ receipt_query = f"(label:^smartlabel_receipt OR (subject:your AND subject:order) OR subject:receipts OR subject:receipt OR subject:invoice OR subject:invoice))"
52
+ if self.__user_input is not None:
53
+ receipt_query = f"(label:^smartlabel_receipt OR (subject:your AND subject:order) OR subject:receipts OR subject:receipt OR subject:invoice OR subject:invoice)) AND subject:{self.__user_input}"
54
+ gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={receipt_query}"
55
+ def __fetch_page(url):
56
+ response = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
57
+ response.raise_for_status() # Raise error if the request fails
58
+ data = response.json()
59
+ return data.get("messages", []), data.get("nextPageToken")
60
+
61
+ messages = []
62
+ page_token = None
63
+ try:
64
+ while True:
65
+ url = f"{gmail_url}&pageToken={page_token}" if page_token else gmail_url
66
+ page_messages, page_token = __fetch_page(url)
67
+ messages.extend(page_messages)
68
+ if not page_token:
69
+ break
70
+ except requests.RequestException as e:
71
+ raise RuntimeError(f"Error fetching messages from Gmail API: {str(e)}")
72
+
73
+ return messages
74
+
75
+ def __fetch_message_data(self, message_id: str) -> dict:
76
+ """
77
+ Fetches message data from the Gmail API.
78
+
79
+ Args:
80
+ message_id (str): The ID of the message to fetch.
81
+
82
+ Returns:
83
+ dict: Message data retrieved from the Gmail API.
84
+
85
+ Raises:
86
+ RuntimeError: If there is an issue while fetching message data from the Gmail API.
87
+ """
88
+ message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
89
+ try:
90
+ response = requests.get(message_url, headers={"Authorization": f"Bearer {self.__jwt}"})
91
+ response.raise_for_status() # Raise error if the request fails
92
+ return response.json()
93
+ except requests.RequestException as e:
94
+ raise RuntimeError(f"Error fetching message data from Gmail API: {str(e)}")
95
+
96
+ def __fetch_attachment_data(self, message_id: str, attachment_id: str) -> dict:
97
+ """
98
+ Fetches attachment data from the Gmail API.
99
+
100
+ Args:
101
+ message_id (str): The ID of the message containing the attachment.
102
+ attachment_id (str): The ID of the attachment to fetch.
103
+
104
+ Returns:
105
+ dict: Attachment data retrieved from the Gmail API.
106
+
107
+ Raises:
108
+ RuntimeError: If there is an issue while fetching attachment data from the Gmail API.
109
+ """
110
+ attachment_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}/attachments/{attachment_id}"
111
+ try:
112
+ response = requests.get(attachment_url, headers={"Authorization": f"Bearer {self.__jwt}"})
113
+ response.raise_for_status() # Raise error if the request fails
114
+ return response.json()
115
+ except requests.RequestException as e:
116
+ raise RuntimeError(f"Error fetching attachment data from Gmail API: {str(e)}")
117
+
118
+ def __process_message(self, message: dict) -> tuple:
119
+ """
120
+ Processes a single message.
121
+
122
+ Args:
123
+ message (dict): The message to process.
124
+
125
+ Returns:
126
+ tuple: A tuple containing the subject (str), body (str), links (list of str),
127
+ and base64 data if it contains an document attachment in the form of pdf, docx, ppt or any file format indicating whether the message contains an attachment.
128
+
129
+ Raises:
130
+ RuntimeError: If there is an issue while fetching message data from the Gmail API.
131
+ """
132
+ message_id = message.get("id")
133
+ if not message_id:
134
+ return None, None, [], False
135
+
136
+ message_data = self.__fetch_message_data(message_id, self.__jwt)
137
+ subject = message_data.get('payload', {}).get('headers', {}).get('value', '')
138
+
139
+ body = ''
140
+ links = []
141
+ has_attachment = False
142
+
143
+ if 'payload' in message_data and 'parts' in message_data['payload']:
144
+ parts = message_data['payload']['parts']
145
+ for part in parts:
146
+ if 'mimeType' not in part:
147
+ continue
148
+
149
+ mime_type = part['mimeType']
150
+ if mime_type == 'text/plain' or mime_type == 'text/html':
151
+ body_data = part['body'].get('data', '')
152
+ body = base64.urlsafe_b64decode(body_data).decode('utf-8')
153
+ text= self._extract_text_and_links(body)
154
+
155
+ if 'body' in part and 'attachmentId' in part['body']:
156
+ attachment_id = part['body']['attachmentId']
157
+ attachment_data = self.__fetch_attachment_data(message_id, attachment_id)
158
+ data = attachment_data.get("data", "")
159
+ filename = part.get("filename", "untitled.txt")
160
+
161
+ if data:
162
+ # Save only the first 10 characters of the attachment data
163
+ return subject,body , links , {filename:data}
164
+
165
+ return subject, body, links , None
166
+
167
+ def extract_text_and_links(html_content: str) -> tuple:
168
+ """
169
+ Extracts text and links from HTML content.
170
+
171
+ Args:
172
+ html_content (str): The HTML content to process.
173
+
174
+ Returns:
175
+ tuple: A tuple containing the extracted text (str) and links (list of tuples).
176
+
177
+ Raises:
178
+ ValueError: If the input HTML content is empty or None.
179
+ """
180
+ if not html_content:
181
+ raise ValueError("HTML content is empty or None")
182
+
183
+ soup = BeautifulSoup(html_content, 'html.parser')
184
+
185
+ # Extract text
186
+ text = soup.get_text(separator=' ')
187
+ text = re.sub(r'\s+', ' ', text).strip()
188
+
189
+ # Extract links
190
+ links = [(link.text, link['href']) for link in soup.find_all('a', href=True)]
191
+
192
+ return text, links
193
+
194
+ def extract_messages(self) -> dict:
195
+ """
196
+ Extracts messages based on the provided brand name.
197
+
198
+ Args:
199
+ brand_name (str): The brand name to search for in email subjects.
200
+ jwt_token (str): The JWT token for authentication.
201
+
202
+ Returns:
203
+ dict: A dictionary containing the extracted messages with their subjects, bodies, links, and attachment statuses.
204
+ format:{"results":[{"subjec":"test subject" , "body":"it would be text" , "attachment_data":{"filename":base64URL format}},{second message with same content of subject , body , attachment_data}]}
205
+
206
+ """
207
+ print("entered the extract messages")
208
+ messages = self.__fetch_messages()
209
+ results = []
210
+ for message in messages:
211
+ subject, body, attachment_data = self.__process_message(message)
212
+
213
+ """ Handling None values """
214
+ subject = subject if subject is not None else ""
215
+ body = body if body is not None else ""
216
+ attachment_data = attachment_data if attachment_data is not None else {}
217
+
218
+ results.append({"subject": subject, "body": body, "attachment_data": attachment_data})
219
+
220
+ return {"results": results}
221
+
222
+ # obj = GmailDataExtractor("abcd","user_input")
223
+ # print(obj.error)
main.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI , Request, APIRouter, Depends, HTTPException
2
+ from starlette.middleware.cors import CORSMiddleware
3
+ from routers import auth , gmail , websockets_new , queryfilter_router
4
+
5
+
6
+ app = FastAPI()
7
+
8
+ app.add_middleware(
9
+ CORSMiddleware,
10
+ allow_origins=["*"],
11
+ allow_credentials=True,
12
+ allow_methods=["DELETE", "GET", "POST", "PUT"],
13
+ allow_headers=["*"],
14
+ )
15
+
16
+ app.include_router(auth.router)
17
+ app.include_router(gmail.router)
18
+ app.include_router(websockets_new.router)
19
+ app.include_router(queryfilter_router.router)
20
+
21
+ @app.get("/")
22
+ async def test():
23
+ return {"Message":"Application is Working!"}
24
+
25
+
26
+
models/__init__.py ADDED
File without changes
models/models.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Optional, List, Dict
3
+
4
+ class Attachment:
5
+ def __init__(self, attachment_len:int,filename: str, data: str):
6
+ self.attachment_len = attachment_len
7
+ self.filename = filename
8
+ self.data = data
9
+
10
+
11
+
12
+
13
+ class Message:
14
+ #structured_data:Optional[List] add this in the below __init__
15
+
16
+ def __init__(self, message_id: str, body_len:int, body: Optional[str], attachments: Optional[List[Attachment]], company: str , high_level_company_type:str,structured_data:Optional[List]):
17
+ self.id = message_id
18
+ self.body_len = body_len
19
+ self.body = body
20
+ self.attachments = attachments
21
+ self.company = company
22
+ self.high_level_company_type = high_level_company_type
23
+ self.structured_data = structured_data
24
+
25
+
26
+ def to_json(self):
27
+ return {
28
+ "id": self.id,
29
+ "body_len" : self.body_len,
30
+ "body": self.body,
31
+ "attachments": [attachment.__dict__ for attachment in self.attachments] if self.attachments else None,
32
+ "company": self.company,
33
+ "high_level_company_type":self.high_level_company_type,
34
+ "structured_data": self.structured_data if self.structured_data else None
35
+ }
36
+
requirements.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.3
2
+ aiosignal==1.3.1
3
+ annotated-types==0.6.0
4
+ anyio==4.2.0
5
+ attrs==23.2.0
6
+ beautifulsoup4==4.12.3
7
+ certifi==2024.2.2
8
+ cffi==1.16.0
9
+ charset-normalizer==3.3.2
10
+ cryptography==42.0.2
11
+ Deprecated==1.2.14
12
+ fastapi==0.109.2
13
+ frozenlist==1.4.1
14
+ idna==3.6
15
+ jwcrypto==1.5.1
16
+ lxml==5.1.0
17
+ multidict==6.0.5
18
+ pycparser==2.21
19
+ pydantic==2.6.1
20
+ pydantic_core==2.16.2
21
+ PyJWT==2.8.0
22
+ PyPDF2==3.0.1
23
+ python-docx==1.1.0
24
+ python-jwt==4.1.0
25
+ requests==2.31.0
26
+ sniffio==1.3.0
27
+ soupsieve==2.5
28
+ starlette==0.36.3
29
+ typing_extensions==4.9.0
30
+ urllib3==2.2.0
31
+ wrapt==1.16.0
32
+ yarl==1.9.4
33
+ uvicorn==0.27.1
34
+ uvloop==0.19.0
35
+ websockets==12.0
36
+ google-generativeai==0.3.2
37
+ python-dotenv==1.0.1
38
+ langchain-community==0.0.27
39
+ langchain-core==0.1.30
40
+ langsmith==0.1.23
41
+ langchain==0.1.11
42
+ langchain-text-splitters==0.0.1
43
+ openai==1.13.3
44
+ langchain-openai==0.0.8
routers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+
routers/auth.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ router = APIRouter(prefix="/auth")
4
+
5
+ @router.get("/authenticate")
6
+ async def auth():
7
+ return {"Message":"Entered Auth"}
routers/gmail.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter , Request ,HTTPException
2
+ from controllers import gmail_controller
3
+
4
+ router = APIRouter(prefix="/process")
5
+
6
+ @router.post("/receipt_data/direct")
7
+ async def get_data(request:Request):
8
+ try:
9
+ body_data = await request.json()
10
+ token = body_data.get('data')
11
+ user_str = body_data.get('brand_name')
12
+ if token is None:
13
+ return HTTPException(status_code=400,detail="Token Invalid!")
14
+ if user_str is None:
15
+ user_str = None
16
+
17
+ fetch_data = gmail_controller.GmailDataExtractor(token , user_str).extract_messages()
18
+ return fetch_data
19
+ except Exception as e:
20
+ raise HTTPException(status_code=500, detail=str(e))
21
+
22
+
23
+
routers/queryfilter_router.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter , Request ,HTTPException
2
+ from controllers import ner_ai_controller as ai
3
+ import logging
4
+
5
+
6
+ router = APIRouter(prefix="/queryfilter")
7
+
8
+ @router.post("/gemini")
9
+ async def get_data(request:Request):
10
+ body = await request.json()
11
+ # user_query = body.get('query')
12
+
13
+ user_query = body.get('query', '') if body else ''
14
+
15
+ return {"brand_name": None} if not user_query.strip() else {"brand_name": ai.get_brand_from_query(user_query)}
16
+ # return {"brand_name":ai.get_brand_from_query(user_query)}
17
+
18
+
routers/websockets_new.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter , Request ,HTTPException , WebSocket
2
+ from controllers import websocket_controller as wc
3
+ from controllers import ws_controller as w
4
+ from controllers import ner_ai_controller as ai
5
+ from services.chat_client_NER import ChatClient
6
+
7
+ import logging
8
+ import aiohttp
9
+
10
+ router = APIRouter(prefix="/websockets")
11
+
12
+ @router.websocket("/ws")
13
+ async def get_data(websocket:WebSocket):
14
+ await websocket.accept()
15
+
16
+ json = await websocket.receive_json()
17
+ access_token = json['access_token']
18
+ logging.info(f"access_token:{access_token}")
19
+ user_query = json['brand_name'] if json.get('brand_name') is not None else None
20
+ logging.info(f"brand_name: {user_query}")
21
+ if access_token is None:
22
+ await websocket.send_text("Access Token Invalid OR NULL !!!")
23
+ websocket.close()
24
+ # access_token = await websocket.receive_text()
25
+ brand_name = ""
26
+ logging.info(f"brand_name: f{user_query}")
27
+ logging.info(f"access_token : {access_token}")
28
+ if user_query is not None:
29
+ chat = ChatClient().create(conversation=[])
30
+ response = chat.send_message(content=f"{user_query}", stream=False)
31
+ if response.text == 'others':
32
+ brand_name = None
33
+ else:
34
+ brand_name = response.text
35
+
36
+
37
+
38
+ await w.websocket_main(access_token ,websocket,brand_name)
39
+
services/__init__.py ADDED
File without changes
services/base_ai_client.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import google.generativeai as genai
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+
9
+ class BaseAIClient:
10
+ def __init__(self,system,model_response):
11
+ genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
12
+ self.system = system
13
+ self.model = model_response
14
+
15
+ def create(self, conversation):
16
+ model = genai.GenerativeModel('gemini-pro')
17
+ new_conversation = [
18
+ {"role": 'user', "parts": [self.system]},
19
+ {"role": 'model', "parts": [self.model]},
20
+ ]
21
+ new_conversation.extend(conversation)
22
+ return model.start_chat(history=new_conversation)
services/chat_client_NER.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .base_ai_client import BaseAIClient
2
+
3
+ class ChatClient(BaseAIClient):
4
+ def __init__(self):
5
+ super().__init__(
6
+ '''Your name is Hushh Bot. You will be acting as an NER, recognizing and identifying the Company name or brand name in the input text provided to you.
7
+ For example: If you are given an input text as -
8
+ input text: "get my chanel receipts"
9
+ output: chanel
10
+ You will provide the output with only the company name strictly.
11
+ Just reply with the Company name.
12
+ Above is just an example; you will not receive all the text in a similar format.
13
+ If you are unable to find the company name then strictly reply with only one word that is "others".
14
+ ''',
15
+ ''' '''
16
+ )
services/utils.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from docx import Document
3
+ import io
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from typing_extensions import Concatenate
7
+ from typing import List
8
+ # from langchain_community.llms import OpenAI
9
+ from langchain_community.callbacks import get_openai_callback
10
+ from langchain.output_parsers import PydanticOutputParser
11
+ from langchain.prompts import PromptTemplate
12
+ from langchain_core.pydantic_v1 import BaseModel, Field, validator
13
+ import os
14
+ import logging
15
+ import base64
16
+ from langchain_openai import OpenAI
17
+ import re
18
+ import json
19
+
20
+ #Setting the openai api key
21
+ api_key=os.getenv('OPENAI_API_KEY')
22
+
23
+ class Candidate(BaseModel):
24
+ brand: str = Field(description="Please identify and provide the primary brand name listed on the receipt. If multiple brand names are present, determine and specify the most prominent or relevant brand associated with the primary transaction on the receipt. If the brand name is not explicitly mentioned, include any contextual details or indirect indicators that might help in accurately identifying the brand. Defalut value will be 'null'.Try to return a brand name.Look for the brand name which would be mostly at the top of the document or text provided.")
25
+ total_cost: str = Field(description="Identify and provide the 'Total Order Value' listed on the receipt. Please specify the exact section where this value is noted, typically labeled as 'Total', 'Total Amount','total' , 'total amount' ,'total cost','Total Cost','Grand total','grand total'. Include any other labeling variations that might represent the total order value. If the total order value is not present or cannot be determined, explicitly state 'null' as the response.Rember total cost is always the highest value and it mostly cannot be a single digit value like 2.9 , 5.8 , 5 ,etc.But remember total cost is always the highest value which will be found somewhere in middle of provide text.Not at very end or at the very start.")
26
+ location: str = Field(description="Please provide the city and state where the purchase was made, as indicated on the receipt. For travel-related receipts, extract the location from which the booking was initiated, focusing on the booking origin or departure city/state, rather than the destination. Look for specific details such as the departure airport code, departure city, or the booking location mentioned in the itinerary or booking confirmation section. These details typically indicate the purchase's origin. If the purchase location is not explicitly stated or if the information is ambiguous, provide any relevant clues or context from the receipt that might assist in accurately identifying the location. If no such information is available, or if it remains unclear, clearly mark the response as 'null'")
27
+ no_of_items: str = Field(description="Specify the total number of items listed in the order as reflected in the receipt or document. If the total count of items is not explicitly mentioned or if it cannot be determined from the provided document, please assign and return the value 'null'.")
28
+ purchase_category: str = Field(description="Identify and specify the purchase category. Choose from the following predefined categories: fashion, home, travel, food, groceries, hotels, spa, insurance, or others. If the purchase category is not explicitly stated on the receipt or document, or if it cannot be accurately determined based on the available information, assign and return the value 'null'.")
29
+ brand_category: str = Field(description="""Based on the receipt information, use one of the following brand categories strictly:
30
+ 1. "Fashion, Dress, Personal"
31
+ 2. "Coffee - Personal"
32
+ 3. "Food - Personal"
33
+ 4. "Travel, Roam, Explore"
34
+ 5. "Shopping, Hunt, Obtain"
35
+
36
+ If you don't find any brand category then return 'null'.
37
+ """)
38
+ Date: str = Field(description="Specify the date of purchase in the format dd-MM-yyyy. If the date of purchase is not explicitly provided on the receipt or document, or if it cannot be accurately determined, assign the value 'null'. Ensure the date is formatted correctly as day, month, and year in two digits each.")
39
+
40
+
41
+ # async def initialize_openai():
42
+ # model_name = "gpt-3.5-turbo-instruct"
43
+ # # model_name = "text-davinci-003"
44
+ # temperature = 0.0
45
+ # model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
46
+
47
+ def strcuture_document_data(raw_text:str)->dict:
48
+ try:
49
+ model_name = "gpt-3.5-turbo-instruct"
50
+ # model_name = "text-davinci-003"
51
+ temperature = 0.0
52
+ model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
53
+ doc_query = (
54
+ "Extract and return strictly a JSON object containing only the following keys strictly : brand , total_cost , location , no_of_items , purchase_category,brand_category , Date ."+
55
+ "\nReceipt Data:\n" + raw_text + "\nRemember the response should only be in JSON format very Strictly and it should have these keys brand , total_cost , location , no_of_items , purchase_category,brand_category , Date , very Strictly.\n"+"Remeber that if one of the key is null then don't assume that other keys maybe null.Always get the values of all the keys mentioned."
56
+ )
57
+ print(raw_text)
58
+ parser = PydanticOutputParser(pydantic_object=Candidate)
59
+
60
+ prompt = PromptTemplate(
61
+ template="Answer the user query.\n{query}\n{format_instructions}\n",
62
+ input_variables=["query"],
63
+ partial_variables={"format_instructions": parser.get_format_instructions()},
64
+ )
65
+ input = prompt.format_prompt(query=doc_query)
66
+ with get_openai_callback() as cb:
67
+ result = model(input.to_string())
68
+
69
+
70
+ print(f"GPT Response {result}")
71
+ # result = extract_json_from_string(result)
72
+ # print(f"Formatted Response : {result}")
73
+
74
+ class_object= parser.parse(result)
75
+ dict_object=class_object.__dict__
76
+ print("printing structured json")
77
+ print(dict_object)
78
+ return dict_object
79
+ except Exception as e:
80
+ print(f"Error occurred: {e}")
81
+ return {}
82
+
83
+
84
+ def extract_json_from_string(input_string):
85
+ # Define a regular expression pattern to match JSON
86
+ pattern = r'\{.*?\}'
87
+
88
+ # Use re.findall() to find all matches of JSON in the input string
89
+ matches = re.findall(pattern, input_string)
90
+
91
+ # If there are matches, extract the JSON and parse it
92
+ if matches:
93
+ json_data_list = []
94
+ for match in matches:
95
+ json_data = json.loads(match)
96
+ json_data_list.append(json_data)
97
+ return json_data_list
98
+ else:
99
+ return None
100
+
101
+ def extract_text_from_pdf(pdf_data):
102
+ with io.BytesIO(pdf_data) as pdf_file:
103
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
104
+ text = ""
105
+ for page_num in range(len(pdf_reader.pages)):
106
+ page = pdf_reader.pages[page_num]
107
+ text += page.extract_text()
108
+ return text
109
+
110
+ def extract_text_from_docx(docx_data):
111
+ doc = Document(io.BytesIO(docx_data))
112
+ text = ""
113
+ for para in doc.paragraphs:
114
+ text += para.text + "\n"
115
+ return text
116
+
117
+ def extract_text_from_attachment(filename, data):
118
+ if filename.endswith('.pdf'):
119
+ return extract_text_from_pdf(base64.urlsafe_b64decode(data))
120
+ elif filename.endswith('.docx'):
121
+ return extract_text_from_docx(base64.urlsafe_b64decode(data))
122
+ else:
123
+ # Add handling for other document types if needed
124
+ return "Unsupported document type"
125
+
126
+