Spaces:

Omkar008
/

demo

Sleeping

App Files Files Community

Omkar008 committed on May 2

Commit

b2e9bf4

•

1 Parent(s): b19687e

Upload 22 files

Browse files

Files changed (21) hide show

Dockerfile +18 -0
__init__.py +0 -0
controllers/__init__.py +1 -0
controllers/gmail_controller.py +274 -0
controllers/ner_ai_controller.py +6 -0
controllers/websocket_controller.py +205 -0
controllers/ws_controller.py +263 -0
data_extraction_classes/get_gmail_data.py +223 -0
main.py +26 -0
models/__init__.py +0 -0
models/models.py +36 -0
requirements.txt +44 -0
routers/__init__.py +2 -0
routers/auth.py +7 -0
routers/gmail.py +23 -0
routers/queryfilter_router.py +18 -0
routers/websockets_new.py +39 -0
services/__init__.py +0 -0
services/base_ai_client.py +22 -0
services/chat_client_NER.py +16 -0
services/utils.py +126 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,18 @@

+FROM python:3.11.5-slim
+WORKDIR /app
+# Install dependencies from requirements.txt alone, before the source is
+# copied, so this layer is reused until the requirements themselves change.
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Run the application as an unprivileged user (UID 1000) rather than root.
+RUN useradd -m -u 1000 user
+USER user
+# Copy the application code into the container at /app, owned by that user.
+COPY --chown=user . /app/
+# Serve the ASGI app `main:app` with uvicorn on port 7860 using 5 workers.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860" , "--workers" , "5"]

__init__.py ADDED Viewed

File without changes

controllers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

controllers/gmail_controller.py ADDED Viewed

	@@ -0,0 +1,274 @@

+import requests
+import base64
+from bs4 import BeautifulSoup
+import re
+import jwt
+from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+from cryptography.hazmat.backends import default_backend
+import os
+import hashlib
+class GmailDataExtractor:
+    """Fetch receipt-like Gmail messages and extract their text and attachments.
+
+    The value passed as ``jwt`` is currently sent to the Gmail API as the
+    bearer access token as-is; ``__validate_jwt_token`` exists but is not
+    called by the extraction flow yet.
+    """
+
+    # Seconds to wait on a single Gmail API call before giving up.
+    _REQUEST_TIMEOUT = 30
+    _MESSAGES_URL = "https://www.googleapis.com/gmail/v1/users/me/messages"
+
+    def __init__(self, jwt: str, user_input: str = None) -> None:
+        """Store the token and the optional user filter.
+
+        Args:
+            jwt: Token used as the Gmail bearer token. When it is None,
+                ``self.error`` is set to "Error", otherwise to None.
+            user_input: Optional free-text filter; stored but not used by the
+                current search query.
+        """
+        # Every attribute is always defined so that a later call on an object
+        # built with jwt=None fails with an API error, not an AttributeError.
+        self.__jwt = jwt
+        self.__user_input = user_input
+        self.error = "Error" if jwt is None else None
+        # NOTE(review): this signing secret is committed to the repository; it
+        # should be rotated and loaded from configuration instead.
+        self.__secret_key = 'nkldjlncbamjlklwjeklwu24898h*&#Ujnfjf34893U5HSJFBSKFSHFNSK*$*W_ 3OWU'
+
+    def __validate_jwt_token(self):
+        """Decode the stored JWT (HS256) and return its ``access_token`` claim.
+
+        Raises:
+            ValueError: If the token is expired, fails verification, or does
+                not carry an access token.
+        """
+        try:
+            payload = jwt.decode(self.__jwt, self.__secret_key, algorithms=["HS256"])
+        except jwt.ExpiredSignatureError:
+            raise ValueError("Invalid JWT token: Expired token")
+        except jwt.InvalidTokenError:
+            raise ValueError("Invalid JWT token: Token verification failed")
+        access_token = payload.get("access_token")
+        if not access_token:
+            raise ValueError("Invalid JWT token: Missing access token")
+        return access_token
+
+    def __get_json(self, url: str, what: str, params: dict = None) -> dict:
+        """GET ``url`` with the bearer token and return the decoded JSON body.
+
+        Args:
+            url: Gmail API endpoint to call.
+            what: Short description of the resource, used in the error message.
+            params: Optional query parameters; ``requests`` URL-encodes them.
+        Raises:
+            RuntimeError: If the request fails or returns an HTTP error status.
+        """
+        try:
+            response = requests.get(
+                url,
+                params=params,
+                headers={"Authorization": f"Bearer {self.__jwt}"},
+                timeout=self._REQUEST_TIMEOUT,
+            )
+            response.raise_for_status()  # Raise error if the request fails
+            return response.json()
+        except requests.RequestException as e:
+            raise RuntimeError(f"Error fetching {what} from Gmail API: {str(e)}") from e
+
+    def __fetch_messages(self) -> list:
+        """
+        Fetches the first page (at most 10) of receipt-like messages.
+        Returns:
+            list: The message reference objects returned by the Gmail API
+                (empty when the response has no "messages" key).
+        Raises:
+            RuntimeError: If there is an issue while fetching messages from the Gmail API.
+        """
+        # Operators are written without a space after the colon, and the
+        # second invoice term is the plural, matching the query used by the
+        # websocket controller in this project.
+        receipt_query = (
+            '(subject:"your order" OR subject:receipts OR subject:receipt '
+            'OR subject:aankoopbon OR subject:reçu OR subject:invoice '
+            'OR subject:invoices OR category:purchases)'
+        )
+        gmail_data = self.__get_json(
+            self._MESSAGES_URL, "messages", params={"q": receipt_query, "maxResults": 10}
+        )
+        return list(gmail_data.get("messages", []))
+
+    def __fetch_message_data(self, message_id: str) -> dict:
+        """
+        Fetches message data from the Gmail API.
+        Args:
+            message_id (str): The ID of the message to fetch.
+        Returns:
+            dict: Message data retrieved from the Gmail API.
+        Raises:
+            RuntimeError: If there is an issue while fetching message data from the Gmail API.
+        """
+        return self.__get_json(f"{self._MESSAGES_URL}/{message_id}", "message data")
+
+    def __fetch_attachment_data(self, message_id: str, attachment_id: str) -> dict:
+        """
+        Fetches attachment data from the Gmail API.
+        Args:
+            message_id (str): The ID of the message containing the attachment.
+            attachment_id (str): The ID of the attachment to fetch.
+        Returns:
+            dict: Attachment data retrieved from the Gmail API.
+        Raises:
+            RuntimeError: If there is an issue while fetching attachment data from the Gmail API.
+        """
+        return self.__get_json(
+            f"{self._MESSAGES_URL}/{message_id}/attachments/{attachment_id}", "attachment data"
+        )
+
+    def __process_message(self, message: dict) -> tuple:
+        """
+        Processes a single message reference.
+        Args:
+            message (dict): The message reference (must carry an "id") to process.
+        Returns:
+            tuple: ``(subject, text, attachment, company, message_id)``.
+                ``text`` is the text extracted from the last non-empty
+                text/plain or text/html part; ``attachment`` is
+                ``{"filename": ..., "data": ...}`` for the first attachment
+                that has data, else None; ``company`` is the sender's domain
+                label, overridden for Chanel / Louis Vuitton subjects.
+        Raises:
+            RuntimeError: If there is an issue while fetching data from the Gmail API.
+        """
+        message_id = message.get("id")
+        if not message_id:
+            # Same 5-item shape as the normal return so callers can unpack it.
+            return None, None, None, 'others', None
+        message_data = self.__fetch_message_data(message_id)
+        payload = message_data.get('payload', {})
+        headers = payload.get('headers', [])
+        subject = ''
+        for header in headers:
+            if header['name'] == 'Subject':
+                subject = header['value']
+        # Default company label is the sender's domain; 'others' without a From header.
+        company_from_gmail = 'others'
+        for header in headers:
+            if header['name'] == 'From':
+                company_from_gmail = self.extract_domain_from_email(header['value'])
+                break
+        if 'chanel' in subject.lower():
+            company_from_gmail = 'chanel'
+        if 'louis vuitton' in subject.lower():
+            company_from_gmail = 'Louis Vuitton'
+        text = ''
+        for part in payload.get('parts', []):
+            if 'mimeType' not in part:
+                continue
+            part_body = part.get('body', {})
+            if part['mimeType'] in ('text/plain', 'text/html'):
+                body_data = part_body.get('data', '')
+                # extract_text rejects empty input, so parts without data are skipped.
+                if body_data:
+                    # Restore any '=' padding stripped from the base64url payload.
+                    padded = body_data + '=' * (-len(body_data) % 4)
+                    text = self.extract_text(base64.urlsafe_b64decode(padded))
+            if 'attachmentId' in part_body:
+                attachment_data = self.__fetch_attachment_data(message_id, part_body['attachmentId'])
+                data = attachment_data.get("data", "")
+                if data:
+                    # Only the first attachment that has data is returned.
+                    filename = part.get("filename", "untitled.txt")
+                    return subject, text, {"filename": filename, "data": data}, company_from_gmail, message_id
+        return subject, text, None, company_from_gmail, message_id
+
+    def encrypt_message_id(self,message_id:str):
+        """Encrypt a message id with AES-CBC using the key from ``AES_KEY``.
+
+        Args:
+            message_id (str): The Gmail message id to encrypt.
+        Returns:
+            bytes: The ciphertext of the NUL-padded message id.
+        Raises:
+            ValueError: If ``AES_KEY`` is not set (or has an invalid AES length).
+
+        NOTE(review): the random IV is neither returned nor stored, so the
+        ciphertext cannot be decrypted later; confirm whether that is intended.
+        """
+        raw_key = os.getenv('AES_KEY')
+        if not raw_key:
+            raise ValueError("AES_KEY environment variable is not set")
+        key = raw_key.encode('utf-8')[:32]
+        message_id_bytes = message_id.encode('utf-8')
+        iv = os.urandom(16)
+        cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
+        encryptor = cipher.encryptor()
+        # CBC needs whole 16-byte blocks: pad with NUL bytes to at least 32
+        # bytes (the historical size) and otherwise to the next block boundary,
+        # so ids longer than 32 bytes no longer fail in finalize().
+        padded_length = max(32, -(-len(message_id_bytes) // 16) * 16)
+        message_id_padded = message_id_bytes.ljust(padded_length, b'\0')
+        return encryptor.update(message_id_padded) + encryptor.finalize()
+
+    def extract_domain_from_email(self,email_string):
+        """Return the first label of the sender's domain, or None.
+
+        For "Shop <orders@shop.example.com>" this returns "shop". None is
+        returned when the string is empty/None or holds no e-mail address.
+        """
+        match = re.search(r'[\w\.-]+@[\w\.-]+', email_string or '')
+        if match is None:
+            return None
+        domain = match.group().split('@')[-1].split('.')[0]
+        return domain or None
+
+    def extract_text(self,html_content:str):
+        """
+        Extracts the visible text from HTML (or plain-text) content.
+        Args:
+            html_content (str): The content to process; decoded bytes are accepted too.
+        Returns:
+            str: The text with every run of whitespace collapsed to one space.
+        Raises:
+            ValueError: If the input content is empty or None.
+        """
+        if not html_content:
+            raise ValueError("HTML content is empty or None")
+        soup = BeautifulSoup(html_content, 'html.parser')
+        text = soup.get_text(separator=' ')
+        return re.sub(r'\s+', ' ', text).strip()
+
+    def extract_messages(self) -> dict:
+        """
+        Fetches the receipt-like messages and extracts their content.
+        Returns:
+            dict: ``{"results": [...]}`` with one entry per message of the form
+                ``{"body": str, "attachment_data": [dict], "company_associated": str, "message_id": str | None}``.
+                ``attachment_data`` holds ``{"filename": ..., "data": ...}`` or
+                an empty dict when the message has no attachment.
+        Raises:
+            RuntimeError: If a Gmail API call fails.
+        """
+        results = []
+        for message in self.__fetch_messages():
+            _subject, body, attachment_data, company_name, message_id = self.__process_message(message)
+            # None values are normalised to empty containers for the response.
+            results.append({
+                "body": body if body is not None else '',
+                "attachment_data": [attachment_data if attachment_data is not None else {}],
+                'company_associated': company_name if company_name is not None else '',
+                "message_id": message_id,
+            })
+        return {"results": results}

controllers/ner_ai_controller.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from services.chat_client_NER import ChatClient
+def get_brand_from_query(query:str):
+    """Send ``query`` to a fresh NER chat session and return the reply text."""
+    # Each call starts a new conversation with no prior history.
+    chat_session = ChatClient().create(conversation=[])
+    reply = chat_session.send_message(content=f"{query}", stream=False)
+    return reply.text

controllers/websocket_controller.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import logging
+import base64
+import requests
+import asyncio
+from fastapi import WebSocket
+from services import utils as util
+import re
+from bs4 import BeautifulSoup
+async def send_chunked_data(websocket: WebSocket, filename: str, data: str ,company_associated:str , message_id:str):
+    """Stream an attachment's data to the client in 2000-character slices.
+
+    Each slice is sent as ``{"filename", "data_chunk"}`` followed by a 0.4 s
+    pause; afterwards the company/message-id record and the
+    "FinishedThisAttachment" marker are sent.
+    """
+    chunk_size = 2000  # Set an appropriate chunk size
+    offset = 0
+    total = len(data)
+    while offset < total:
+        piece = data[offset:offset + chunk_size]
+        await websocket.send_json({"filename": filename, "data_chunk": piece})
+        await asyncio.sleep(0.4)
+        offset += chunk_size
+    trailer = {"company_associated": company_associated, "message_id": message_id}
+    await websocket.send_json(trailer)
+    await websocket.send_text("FinishedThisAttachment")
+async def send_chunked_data_without_attch(websocket: WebSocket,body_text:str,message_id:str , company_associated:str):
+    """Stream a mail body (no attachment) to the client in 2000-character slices.
+
+    A leading text notice is sent first, then each slice as ``{"data_chunk"}``
+    with a 0.4 s pause, then the company/message-id record and the closing
+    text marker.
+    """
+    chunk_size = 2000  # Set an appropriate chunk size
+    await websocket.send_text("This message does'nt contain an Attachment")
+    slice_starts = range(0, len(body_text), chunk_size)
+    for start in slice_starts:
+        payload = {"data_chunk": body_text[start:start + chunk_size]}
+        await websocket.send_json(payload)
+        await asyncio.sleep(0.4)
+    summary = {"company_associated": company_associated, "message_id": message_id}
+    await websocket.send_json(summary)
+    await websocket.send_text("FinishedThisAttachmentnotContainingAttachment")
+async def process_messages(access_token: str, websocket: WebSocket):
+    """List the matching messages, announce their count, then stream each one.
+
+    Message references without an "id" are skipped. Text markers are sent
+    after the count and after the last message.
+    """
+    logging.info("Entered process_messages")
+    listed = get_messages(access_token)
+    await websocket.send_json({"total_messages": len(listed)})
+    await websocket.send_text("CompletedSendingTotalMessagesLength")
+    for entry in listed:
+        entry_id = entry.get("id")
+        if not entry_id:
+            continue
+        details = fetch_message_data(access_token, entry_id)
+        await process_message_data(access_token, details, websocket, entry_id)
+    await websocket.send_text("CompletedFetchingMessages")
+async def websocket_main(code: str, websocket: WebSocket):
+    """Stream all matching Gmail messages over ``websocket``, then close it.
+
+    Args:
+        code: Value used directly as the Gmail API access token.
+        websocket: The accepted client connection to stream to.
+    """
+    logging.info("Entered websocket_main")
+    access_token = code
+    await process_messages(access_token, websocket)
+    logging.info("Completed Fetching all the messages")
+    # close() is a coroutine: without the await the close frame is never sent.
+    await websocket.close()
+def get_messages(code: str):
+    """Return every message reference matching the receipt query.
+
+    Pages of up to 10 references are requested until the response carries no
+    "nextPageToken".
+
+    Args:
+        code: Value used directly as the Gmail API access token.
+    Returns:
+        list: The accumulated entries of each response's "messages" list
+            (empty when no response contains that key, e.g. on an API error).
+    """
+    logging.info("Entered get_messages")
+    access_token = code
+    messages = []
+    # Operators are written without a space after the colon, and the second
+    # invoice term is the plural, matching the query used in ws_controller.
+    receipt_query = (
+        'subject:"your order" OR subject:receipts OR subject:receipt '
+        'OR subject:aankoopbon OR subject:reçu OR subject:invoice '
+        'OR subject:invoices OR category:purchases'
+    )
+    # Passing the query as params lets requests URL-encode the quotes, spaces
+    # and non-ASCII characters as well as the page token.
+    params = {"q": receipt_query, "maxResults": 10}
+    while True:
+        gmail_response = requests.get(
+            "https://www.googleapis.com/gmail/v1/users/me/messages",
+            params=params,
+            headers={"Authorization": f"Bearer {access_token}"},
+        )
+        logging.info(f"{gmail_response}")
+        gmail_data = gmail_response.json()
+        messages.extend(gmail_data.get("messages", []))
+        page_token = gmail_data.get("nextPageToken")
+        if not page_token:
+            break
+        params["pageToken"] = page_token
+    logging.info("Total Length:")
+    logging.info(len(messages))
+    return messages
+def fetch_message_data(access_token: str, message_id: str):
+    """Fetch one message from the Gmail API and return the decoded JSON body."""
+    logging.info(f"Entered fetch_message_data for message_id: {message_id}")
+    auth_header = {"Authorization": f"Bearer {access_token}"}
+    endpoint = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
+    return requests.get(endpoint, headers=auth_header).json()
+async def process_message_data(access_token:str,message_data: dict, websocket: WebSocket,message_id:str):
+    """Stream the body text and attachments of one message to the client.
+
+    Every part that has a "body" is handled in order: a part without an
+    attachment id sends the most recently extracted body text, a part with an
+    attachment id fetches and sends that attachment.
+
+    Args:
+        access_token: Gmail API access token used to fetch attachments.
+        message_data: Decoded JSON of the message as returned by the Gmail API.
+        websocket: Client connection to stream to.
+        message_id: Id of the message, echoed back to the client.
+    """
+    logging.info("Entered process_message_data")
+    # Responses without a "payload" (e.g. API errors) are treated as a message
+    # with no headers and no parts instead of raising KeyError.
+    payload = message_data.get("payload", {})
+    subject = extract_subject_from_mail(message_data)
+    company_from_mail = extract_domain_name(payload.get("headers", []), subject)
+    body_text = ''
+    for part in payload.get("parts", []):
+        if 'mimeType' not in part:
+            continue
+        part_body = part.get("body")
+        if part['mimeType'] in ('text/plain', 'text/html'):
+            body_data = (part_body or {}).get('data', '')
+            # extract_text rejects empty input, so a text part without data
+            # yields an empty body rather than an exception.
+            body_text = extract_text(base64.urlsafe_b64decode(body_data)) if body_data else ''
+        if part_body is None:
+            continue
+        if "attachmentId" not in part_body:
+            await process_mail_body_data(websocket, body_text, message_id, company_from_mail)
+        else:
+            attachment_data = fetch_attachment_data(access_token, message_id, part_body["attachmentId"])
+            # The body collected so far is reset once an attachment is seen.
+            body_text = ''
+            await process_attachment_data(part, attachment_data, websocket, company_from_mail, message_id)
+async def process_attachment_data(part: dict, attachment_data: dict, websocket: WebSocket,company_associated:str,message_id:str):
+    """Log the text extracted from an attachment and stream its raw data.
+
+    Nothing is sent when the attachment response carries no "data". The data
+    streamed to the client is the still-encoded value from the API response.
+    """
+    logging.info("Entered process_attachment_data")
+    encoded = attachment_data.get("data", {})
+    if not encoded:
+        return
+    filename = part.get("filename", "untitled.txt")
+    decoded_bytes = base64.urlsafe_b64decode(encoded)
+    # NOTE(review): awaited here, so the helper is presumably a coroutine function.
+    extracted_text = await util.extract_text_from_attachment(filename, decoded_bytes)
+    logging.info(f"Extracted text from attachment {filename}: {extracted_text}")
+    await send_chunked_data(websocket, filename, encoded, company_associated, message_id)
+async def process_mail_body_data(websocket:WebSocket  ,body_text : str, message_id:str,company_associated:str):
+    """Forward a mail body to the client via send_chunked_data_without_attch."""
+    await send_chunked_data_without_attch(
+        websocket=websocket,
+        body_text=body_text,
+        message_id=message_id,
+        company_associated=company_associated,
+    )
+def fetch_attachment_data(access_token: str, message_id: str, attachment_id: str):
+    """Fetch one attachment from the Gmail API and return the decoded JSON body."""
+    logging.info(f"Entered fetch_attachment_data for attachment_id: {attachment_id}")
+    auth_header = {"Authorization": f"Bearer {access_token}"}
+    endpoint = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}/attachments/{attachment_id}"
+    return requests.get(endpoint, headers=auth_header).json()
+def extract_subject_from_mail(message_data: dict):
+    """Return the value of the first "Subject" header, or "" when there is none.
+
+    "" is also returned when the message has no payload or no headers.
+    """
+    if 'payload' not in message_data or 'headers' not in message_data['payload']:
+        return ""
+    header_list = message_data['payload']['headers']
+    return next((entry['value'] for entry in header_list if entry['name'] == 'Subject'), "")
+def extract_domain_name(payload:dict,subject:str):
+    """Return the company label for a message.
+
+    ``payload`` is iterated as a sequence of header dicts with "name" and
+    "value" keys. The label is the domain of the first "From" header
+    ('others' without one), unless the subject mentions Chanel or Louis
+    Vuitton, in which case that brand name is returned.
+    """
+    not_found = object()
+    sender_value = next((entry['value'] for entry in payload if entry['name'] == 'From'), not_found)
+    # The sender is parsed before the subject is inspected, as the parse may raise.
+    domain_name = 'others' if sender_value is not_found else extract_domain_from_email(sender_value)
+    lowered_subject = subject.lower()
+    if 'chanel' in lowered_subject:
+        return 'chanel'
+    if 'louis vuitton' in lowered_subject:
+        return'Louis Vuitton'
+    return domain_name
+def extract_domain_from_email(email_string:str):
+    """Return the first label of the sender's domain, or None.
+
+    For "Shop <orders@shop.example.com>" this returns "shop". None is returned
+    when the string is empty/None or contains no e-mail address, instead of
+    raising on the failed regex match.
+    """
+    match = re.search(r'[\w\.-]+@[\w\.-]+', email_string or '')
+    if match is None:
+        return None
+    # Keep only the text between '@' and the first dot of the domain.
+    domain = match.group().split('@')[-1].split('.')[0]
+    return domain or None
+def extract_text(html_content:str):
+    """
+    Extracts the visible text from HTML (or plain-text) content.
+    Args:
+        html_content (str): The content to process; decoded bytes are accepted too.
+    Returns:
+        str: The text with every run of whitespace collapsed to one space.
+    Raises:
+        ValueError: If the input content is empty or None.
+    """
+    if not html_content:
+        raise ValueError("HTML content is empty or None")
+    soup = BeautifulSoup(html_content, 'html.parser')
+    text = soup.get_text(separator=' ')
+    # The mail body is no longer echoed to stdout: it is private user data.
+    return re.sub(r'\s+', ' ', text).strip()

controllers/ws_controller.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import base64
+import json
+import logging
+import re
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional, List, Dict
+import requests
+from bs4 import BeautifulSoup
+from models.models import Message, Attachment
+from fastapi import WebSocket
+from services import utils as ut
+import asyncio
+# Brand names (lower-case) grouped by their high-level business category.
+_COMPANY_NAMES_BY_TYPE = {
+    'wines and spirit': (
+        'ao yun', 'ardbeg', 'belvedere', 'bodega numanthia', 'chandon',
+        'château cheval blanc', "château d'yquem", 'château galoupet',
+        'cheval des andes', 'clos19', 'cloudy bay', 'colgin cellars',
+        'dom pérignon', 'domaine des lambrays', 'eminente', 'glenmorangie',
+        'hennessy', 'joseph phelps', 'krug', 'mercier', 'moët & chandon',
+        'newton vineyard', 'ruinart', 'terrazas de los andes',
+        'veuve clicquot', 'volcan de mi tierra', 'woodinville',
+    ),
+    'Fashion & Leather Goods': (
+        'berluti', 'celine', 'christian dior', 'emilio pucci', 'fendi',
+        'givenchy', 'kenzo', 'loewe', 'loro piana', 'louis vuitton',
+        'marc jacobs', 'moynat', 'patou', 'rimowa',
+    ),
+    'Perfumes & Cosmetics': (
+        'acqua di parma', 'benefit cosmetics', 'cha ling',
+        'fenty beauty by rihanna', 'fresh', 'givenchy parfums', 'guerlain',
+        'kenzo parfums', 'kvd beauty', 'loewe perfumes',
+        'maison francis kurkdjian', 'make up for ever',
+        'officine universelle buly', 'olehenriksen', 'parfums christian dior',
+        'stella by stella mccartney',
+    ),
+    'Watches & Jewelry': (
+        'bulgari', 'chaumet', 'fred', 'hublot', 'repossi', 'tag heuer',
+        'tiffany & co.', 'zenith',
+    ),
+    'Selective retailing': (
+        '24s', 'dfs', 'la grande epicerie de paris',
+        'le bon marché rive gauche', 'sephora',
+    ),
+    'Other activities': (
+        'belmond', 'cheval blanc', 'connaissance des arts', 'cova', 'investir',
+        "jardin d'acclimatation", 'le parisien', 'les echos', 'radio classique',
+        'royal van lent',
+    ),
+}
+# Flattened brand-name -> category lookup, built once at import time.
+_COMPANY_TYPES = {
+    name: company_type
+    for company_type, names in _COMPANY_NAMES_BY_TYPE.items()
+    for name in names
+}
+def get_company_type(company_name:str)->str:
+    """Return the business category of a brand, or 'Others' when unknown.
+
+    The lookup is case-insensitive. An empty or None name maps to 'Others'.
+    """
+    if not company_name:
+        return 'Others'
+    return _COMPANY_TYPES.get(company_name.lower(), 'Others')
+async def get_messages(code: str,websocket:WebSocket,brand_name: Optional[str] = None):
+    """Fetch every matching message with an attachment and stream each one.
+
+    Message references are listed page by page; the messages of a page are
+    fetched on a thread pool and then sent to the client in listing order.
+
+    Args:
+        code: Value used directly as the Gmail API access token.
+        websocket: Client connection the messages are streamed to.
+        brand_name: Optional brand; when given, the subject must mention it
+            and mail sent from it is matched as well.
+    Returns:
+        list: Every fetched message, across all pages.
+    """
+    access_token = code
+    # Operators are written without a space after the colon in both variants.
+    receipt_terms = (
+        'subject:"your order" OR subject:receipts OR subject:receipt '
+        'OR subject:aankoopbon OR subject:reçu OR subject:invoice '
+        'OR subject:invoices OR category:purchases'
+    )
+    if brand_name is not None:
+        g_query = f'({receipt_terms} OR from:{brand_name}) AND subject:{brand_name} has:attachment'
+    else:
+        g_query = f'({receipt_terms}) has:attachment'
+    messages = []
+    def fetch_message_wrapper(message_data):
+        message_id = message_data.get("id")
+        if message_id:
+            return fetch_message_data(access_token, message_id)
+        return None
+    # Passing the query as params lets requests URL-encode it and the page token.
+    params = {"q": g_query}
+    # NOTE(review): the requests calls and future.result() below block the
+    # event loop while they run; confirm that is acceptable for this endpoint.
+    while True:
+        gmail_response = requests.get(
+            "https://www.googleapis.com/gmail/v1/users/me/messages",
+            params=params,
+            headers={"Authorization": f"Bearer {access_token}"},
+        )
+        gmail_data = gmail_response.json()
+        if "messages" in gmail_data:
+            with ThreadPoolExecutor(max_workers=15) as executor:
+                futures = [executor.submit(fetch_message_wrapper, message_data)
+                           for message_data in gmail_data["messages"]]
+                fetched = [future.result() for future in futures]
+            page_messages = [message for message in fetched if message]
+            # Only this page is streamed; looping over the accumulated list
+            # would send the messages of earlier pages again on every page.
+            for message in page_messages:
+                await process_message(message, websocket, 10000)
+            messages.extend(page_messages)
+        page_token = gmail_data.get("nextPageToken")
+        if not page_token:
+            break
+        params["pageToken"] = page_token
+    return messages
+async def process_message(message:Message, websocket:WebSocket, chunk_size:int):
+    """Serialise one message, send it in chunks, then send "NEXT_MESSAGE".
+
+    Nothing is sent for a falsy ``message``.
+    """
+    logging.info("process_message")
+    if not message:
+        return
+    serialized = message.to_json()
+    logging.info(f"{serialized}")
+    await send_message_in_chunks(websocket, serialized, chunk_size)
+    await websocket.send_text("NEXT_MESSAGE")
+def fetch_message_data(access_token: str, message_id: str) -> Message:
+    """Fetch one Gmail message and convert it to a ``Message`` model.
+
+    Args:
+        access_token: Gmail API access token.
+        message_id: Id of the message to fetch.
+    Returns:
+        Message: Carries the body text and its length, the attachments with
+            their structured data, the company label and its category.
+    """
+    message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
+    message_response = requests.get(message_url, headers={"Authorization": f"Bearer {access_token}"})
+    message_data = message_response.json()
+    subject = extract_subject_from_mail(message_data)
+    # Responses without a "payload" (e.g. API errors) are treated as having no
+    # headers, as the other extractors already do, instead of raising KeyError.
+    headers = message_data.get('payload', {}).get('headers', [])
+    company_from_mail = extract_domain_name(headers, subject)
+    body = extract_body_from_mail(message_data)
+    attachments,structed_attachment_data = extract_attachments_from_mail(access_token, message_data)
+    high_level_company_type = get_company_type(company_from_mail)
+    body_len = len(body) if body is not None else 0
+    return Message(message_id=message_id, body_len=body_len,body=body, attachments=attachments, company=company_from_mail,high_level_company_type=high_level_company_type,structured_data = structed_attachment_data)
+def extract_subject_from_mail(message_data: dict) -> str:
+    """Return the value of the first "Subject" header, or "" when there is none."""
+    # Messages without a payload or without headers have no subject.
+    if 'payload' not in message_data or 'headers' not in message_data['payload']:
+        return ""
+    for entry in message_data['payload']['headers']:
+        if entry['name'] != 'Subject':
+            continue
+        return entry['value']
+    return ""
+def extract_domain_name(payload: dict, subject: str) -> str:
+    """Return the company label for a message.
+
+    ``payload`` is iterated as a sequence of header dicts with "name" and
+    "value" keys. The label is the domain of the first "From" header
+    ('others' without one), unless the subject mentions Chanel or Louis
+    Vuitton, in which case that brand name is returned.
+    """
+    for entry in payload:
+        if entry['name'] != 'From':
+            continue
+        sender_domain = extract_domain_from_email(entry['value'])
+        break
+    else:
+        sender_domain = 'others'
+    subject_lower = subject.lower()
+    # Brand mentions in the subject take precedence over the sender's domain.
+    for needle, label in (('chanel', 'chanel'), ('louis vuitton', 'Louis Vuitton')):
+        if needle in subject_lower:
+            return label
+    return sender_domain
+def extract_domain_from_email(email_string: str) -> Optional[str]:
+    """Return the first label of the sender's domain, or None.
+
+    For "Shop <orders@shop.example.com>" this returns "shop". None is returned
+    when the string is empty/None or contains no e-mail address, instead of
+    raising on the failed regex match.
+    """
+    match = re.search(r'[\w\.-]+@[\w\.-]+', email_string or '')
+    if match is None:
+        return None
+    # Keep only the text between '@' and the first dot of the domain.
+    domain = match.group().split('@')[-1].split('.')[0]
+    return domain or None
+# def extract_body_from_mail(message_data: dict) -> str:
+#     body = None
+#     if "payload" in message_data and "parts" in message_data["payload"]:
+#         for part in message_data["payload"]["parts"]:
+#             if 'mimeType' in part and (part['mimeType'] == 'text/plain' or part['mimeType'] == 'text/html'):
+#                 body_data = part['body'].get('data', '')
+#                 body_base64 = base64.urlsafe_b64decode(body_data)
+#                 body = extract_text(body_base64)
+#     return body
+def _iter_text_part_data(parts: list):
+    """Yield the body data of every text/plain or text/html part, depth first."""
+    for part in parts:
+        if part.get('mimeType') in ('text/plain', 'text/html'):
+            data = part.get('body', {}).get('data', '')
+            if data:
+                yield data
+        # Mails with attachments usually nest their text parts inside a
+        # multipart container, so child parts are searched as well.
+        yield from _iter_text_part_data(part.get('parts', []))
+def extract_body_from_mail(message_data: dict) -> str:
+    """Return the text of a message body, falling back to its snippet.
+
+    For a multipart message the last text/plain or text/html part that has
+    data is used, at any nesting depth. For a message without parts the
+    payload's own body data is used. When no text is found, the message's
+    "snippet" ('' if absent) is returned.
+    """
+    body = None
+    payload = message_data.get("payload") or {}
+    if "parts" in payload:
+        body_data = None
+        for body_data in _iter_text_part_data(payload["parts"]):
+            pass  # keep only the last text part, which is the one that wins
+    else:
+        body_data = payload.get('body', {}).get('data', '')
+    if body_data:
+        body = extract_text(base64.urlsafe_b64decode(body_data))
+    if not body:
+        body = message_data.get('snippet', '')
+    return body
def fetch_attachment_data(access_token: str, message_id: str, attachment_id: str) -> Dict:
    """Download one attachment resource of a message and return the parsed JSON.

    Args:
        access_token: Bearer token sent in the ``Authorization`` header.
        message_id: ID of the message that owns the attachment.
        attachment_id: ID of the attachment to fetch.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        checked here.
    """
    auth_header = {"Authorization": f"Bearer {access_token}"}
    endpoint = (
        "https://www.googleapis.com/gmail/v1/users/me/messages/"
        f"{message_id}/attachments/{attachment_id}"
    )
    return requests.get(endpoint, headers=auth_header).json()
def extract_attachments_from_mail(access_token: str, message_data: dict) -> "Tuple[List[Attachment], List[dict]]":
    """Fetch every attachment of a message and structure its text.

    Each payload part whose ``body`` has an ``attachmentId`` is downloaded,
    converted to text with ``ut.extract_text_from_attachment`` and passed to
    ``ut.strcuture_document_data``.

    Args:
        access_token: Bearer token used for the attachment download.
        message_data: Message dictionary with ``id`` and ``payload.parts``.

    Returns:
        A 2-tuple ``(attachments, structured_data)``: the ``Attachment``
        objects in part order, and the non-empty structured results. The
        annotation is a string so that it describes the tuple actually
        returned without needing an extra typing import at definition time.
    """
    attachments = []
    structured_data = []

    for part in message_data.get("payload", {}).get("parts", []):
        if "attachmentId" not in part.get("body", {}):
            continue
        attachment_id = part["body"]["attachmentId"]
        attachment_data = fetch_attachment_data(access_token, message_data["id"], attachment_id)
        filename = part.get("filename", "untitled.txt")
        # Read the payload once and reuse it for text extraction, the length
        # field and the stored data.
        data = attachment_data.get("data", "")

        raw_text = ut.extract_text_from_attachment(filename, data)
        struct_data = ut.strcuture_document_data(raw_text)
        if struct_data:
            structured_data.append(struct_data)

        attachments.append(Attachment(attachment_len=len(data), filename=filename, data=data))

    return attachments, structured_data
def extract_text(html_content: str) -> str:
    """Strip markup from HTML and collapse all whitespace runs to one space.

    Args:
        html_content: HTML (or plain text) to convert.

    Returns:
        The visible text with leading/trailing whitespace removed.

    Raises:
        ValueError: If ``html_content`` is empty or ``None``.
    """
    if not html_content:
        raise ValueError("HTML content is empty or None")
    visible_text = BeautifulSoup(html_content, 'html.parser').get_text(separator=' ')
    return re.sub(r'\s+', ' ', visible_text).strip()
async def websocket_main(code: str,  websocket: WebSocket,brand_name: Optional[str] = None):
    """Run the message retrieval for one websocket client, then close the socket.

    Args:
        code: Value forwarded to ``get_messages`` as the access token.
        websocket: The client connection; closed once ``get_messages`` returns.
        brand_name: Optional brand filter forwarded to ``get_messages``.

    NOTE(review): an earlier variant (previously kept here as commented-out
    code) collected the messages and sent them from this function in chunks;
    presumably ``get_messages`` now does the sending itself — confirm.
    """
    await get_messages(code, websocket, brand_name)
    await websocket.close()
async def send_message_in_chunks(websocket: WebSocket, message_json: dict, chunk_size: int):
    """Serialise a message to JSON text and send it in fixed-size pieces.

    Args:
        websocket: Connection the text frames are sent on.
        message_json: JSON-serialisable message.
        chunk_size: Maximum number of characters per text frame.
    """
    serialized = json.dumps(message_json)
    chunk_starts = range(0, len(serialized), chunk_size)
    for start in chunk_starts:
        piece = serialized[start:start + chunk_size]
        await websocket.send_text(piece)

data_extraction_classes/get_gmail_data.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import requests
+import base64
+from bs4 import BeautifulSoup
+import re
+import jwt
class GmailDataExtractor:
    """Collect receipt-like Gmail messages for one user.

    The instance is built from a bearer credential and an optional subject
    filter. ``extract_messages`` is the public entry point and returns the
    subject, body text and first attachment of every matching message.

    Attributes:
        error: ``"Error"`` when no credential was supplied, otherwise ``None``.
    """

    _MESSAGES_URL = "https://www.googleapis.com/gmail/v1/users/me/messages"
    # Search expression for receipt-like mail. The parentheses are balanced
    # (the previous literal ended with a stray ")") and the duplicated
    # "subject:invoice" term is listed once.
    _RECEIPT_QUERY = (
        "(label:^smartlabel_receipt OR (subject:your AND subject:order) "
        "OR subject:receipts OR subject:receipt OR subject:invoice)"
    )

    def __init__(self, jwt: str, user_input: str = None) -> None:
        """Store the credential and the optional subject filter.

        Args:
            jwt: Credential used as the bearer token for Gmail requests.
                (Inside this method the name shadows the ``jwt`` module.)
            user_input: Optional text that must also appear in the subject.
        """
        self.__jwt = jwt
        self.__user_input = user_input
        # NOTE(security): the signing secret is hard-coded in source control.
        # It should be loaded from configuration / a secret store and rotated.
        self.__secret_key = 'nkldjlncbamjlklwjeklwu24898h*&#Ujnfjf34893U5HSJFBSKFSHFNSK*$*W_ 3OWU'
        self.error = "Error" if jwt is None else None

    def __validate_jwt_token(self):
        """Decode the stored JWT (HS256) and return its ``access_token`` claim.

        Not called by the extraction flow at present: the stored credential
        is used directly as the bearer token.

        Raises:
            ValueError: If the token is expired, fails verification, or has
                no ``access_token`` claim.
        """
        try:
            payload = jwt.decode(self.__jwt, self.__secret_key, algorithms=["HS256"])
        except jwt.ExpiredSignatureError as exc:
            raise ValueError("Invalid JWT token: Expired token") from exc
        except jwt.InvalidTokenError as exc:
            raise ValueError("Invalid JWT token: Token verification failed") from exc

        access_token = payload.get("access_token")
        if not access_token:
            raise ValueError("Invalid JWT token: Missing access token")
        return access_token

    def __build_query(self) -> str:
        """Return the Gmail search expression, narrowed by the subject filter if set."""
        if self.__user_input is not None:
            return f"{self._RECEIPT_QUERY} AND subject:{self.__user_input}"
        return self._RECEIPT_QUERY

    def __get_json(self, url: str, description: str, params: dict = None) -> dict:
        """GET ``url`` with the bearer token and return the decoded JSON body.

        Raises:
            RuntimeError: If the request fails or returns an error status.
        """
        try:
            response = requests.get(
                url,
                headers={"Authorization": f"Bearer {self.__jwt}"},
                params=params,
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            raise RuntimeError(f"Error fetching {description} from Gmail API: {str(e)}") from e

    def __fetch_messages(self) -> list:
        """Return every message reference matching the receipt query.

        Follows ``nextPageToken`` until the listing is exhausted. The query
        is passed through ``params`` so that it is URL-encoded, and the
        access token is no longer printed.

        Returns:
            list: The ``messages`` entries of all result pages.

        Raises:
            RuntimeError: If a page cannot be fetched.
        """
        params = {"q": self.__build_query()}
        messages = []
        while True:
            data = self.__get_json(self._MESSAGES_URL, "messages", params)
            messages.extend(data.get("messages", []))
            page_token = data.get("nextPageToken")
            if not page_token:
                break
            params["pageToken"] = page_token
        return messages

    def __fetch_message_data(self, message_id: str) -> dict:
        """Return the full message resource for ``message_id``.

        Raises:
            RuntimeError: If the message cannot be fetched.
        """
        return self.__get_json(f"{self._MESSAGES_URL}/{message_id}", "message data")

    def __fetch_attachment_data(self, message_id: str, attachment_id: str) -> dict:
        """Return the attachment resource ``attachment_id`` of ``message_id``.

        Raises:
            RuntimeError: If the attachment cannot be fetched.
        """
        return self.__get_json(
            f"{self._MESSAGES_URL}/{message_id}/attachments/{attachment_id}",
            "attachment data",
        )

    @staticmethod
    def __extract_subject(message_data: dict) -> str:
        """Return the ``Subject`` header value, or ``''`` when there is none.

        ``payload.headers`` is a list of name/value entries, so it has to be
        searched rather than indexed like a dict.
        """
        for header in message_data.get('payload', {}).get('headers', []):
            if header.get('name') == 'Subject':
                return header.get('value', '')
        return ''

    def __process_message(self, message: dict) -> tuple:
        """Fetch one message and pull out its subject, text, links and attachment.

        Args:
            message (dict): Message reference containing an ``id``.

        Returns:
            tuple: ``(subject, body, links, attachment)`` where ``body`` is
            the text of the last text/plain or text/html part, ``links`` is a
            list of ``(text, href)`` tuples and ``attachment`` is
            ``{filename: data}`` for the first attachment that has data, or
            ``None``. A reference without an id yields
            ``(None, None, [], None)``.

        Raises:
            RuntimeError: If the message or an attachment cannot be fetched.
        """
        message_id = message.get("id")
        if not message_id:
            return None, None, [], None

        message_data = self.__fetch_message_data(message_id)
        subject = self.__extract_subject(message_data)
        body = ''
        links = []
        attachment = None

        payload = message_data.get('payload', {})
        # A single-part message carries its content on the payload itself.
        parts = payload.get('parts') or ([payload] if payload else [])
        for part in parts:
            part_body = part.get('body', {})

            if part.get('mimeType') in ('text/plain', 'text/html'):
                body_data = part_body.get('data', '')
                if body_data:
                    markup = base64.urlsafe_b64decode(body_data).decode('utf-8', errors='replace')
                    if markup:
                        body, part_links = self.extract_text_and_links(markup)
                        links.extend(part_links)

            # Keep only the first attachment, but keep scanning so that a
            # body part listed after it is not lost.
            attachment_id = part_body.get('attachmentId')
            if attachment_id and attachment is None:
                data = self.__fetch_attachment_data(message_id, attachment_id).get("data", "")
                if data:
                    attachment = {part.get("filename", "untitled.txt"): data}

        return subject, body, links, attachment

    @staticmethod
    def extract_text_and_links(html_content: str) -> tuple:
        """Extract visible text and hyperlinks from HTML content.

        Args:
            html_content (str): The HTML content to process.

        Returns:
            tuple: The whitespace-normalised text (str) and a list of
            ``(link text, href)`` tuples.

        Raises:
            ValueError: If the input HTML content is empty or None.
        """
        if not html_content:
            raise ValueError("HTML content is empty or None")
        soup = BeautifulSoup(html_content, 'html.parser')
        text = re.sub(r'\s+', ' ', soup.get_text(separator=' ')).strip()
        links = [(link.text, link['href']) for link in soup.find_all('a', href=True)]
        return text, links

    def extract_messages(self) -> dict:
        """Fetch and process every message matching the receipt query.

        Returns:
            dict: ``{"results": [{"subject": str, "body": str,
            "attachment_data": {filename: base64url data}}, ...]}``; missing
            values are replaced by ``""`` / ``{}``.

        Raises:
            ValueError: If the instance was created without a credential.
            RuntimeError: If a Gmail request fails.
        """
        if self.error:
            raise ValueError("Cannot extract messages: no access token was provided")

        results = []
        for message in self.__fetch_messages():
            subject, body, _links, attachment_data = self.__process_message(message)
            results.append({
                "subject": subject or "",
                "body": body or "",
                "attachment_data": attachment_data or {},
            })
        return {"results": results}
+# obj = GmailDataExtractor("abcd","user_input")
+# print(obj.error)

main.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from fastapi import FastAPI , Request, APIRouter, Depends, HTTPException
+from starlette.middleware.cors import CORSMiddleware
+from routers import auth , gmail , websockets_new , queryfilter_router
# Application object that the routers and middleware below are attached to.
app = FastAPI()
# CORS policy applied to every route of the application.
# NOTE(review): a wildcard origin together with allow_credentials=True is
# very permissive — confirm this is intended before exposing the service.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["DELETE", "GET", "POST", "PUT"],
    allow_headers=["*"],
)
# Mount the feature routers defined in the routers package.
app.include_router(auth.router)
app.include_router(gmail.router)
app.include_router(websockets_new.router)
app.include_router(queryfilter_router.router)
@app.get("/")
async def test():
    """Root endpoint returning a fixed message that shows the app is running."""
    status_payload = {"Message": "Application is Working!"}
    return status_payload

models/__init__.py ADDED Viewed

File without changes

models/models.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import json
+from typing import Optional, List, Dict
class Attachment:
    """Plain value object describing one e-mail attachment."""

    def __init__(self, attachment_len:int,filename: str, data: str):
        # attachment_len: length reported for the data
        # filename:       name of the attached file
        # data:           attachment content as received
        self.attachment_len, self.filename, self.data = attachment_len, filename, data
class Message:
    """One processed e-mail together with its attachments and derived data."""

    def __init__(self, message_id: str, body_len:int, body: Optional[str], attachments: Optional[List[Attachment]], company: str , high_level_company_type:str,structured_data:Optional[List]):
        self.id, self.body_len, self.body = message_id, body_len, body
        self.attachments = attachments
        self.company, self.high_level_company_type = company, high_level_company_type
        self.structured_data = structured_data

    def to_json(self):
        """Return the message as a dictionary of JSON-serialisable values.

        Empty or missing ``attachments`` / ``structured_data`` are reported
        as ``None``; each attachment is exported as its attribute dict.
        """
        exported_attachments = None
        if self.attachments:
            exported_attachments = [entry.__dict__ for entry in self.attachments]

        exported_structured = self.structured_data or None

        return {
            "id": self.id,
            "body_len" : self.body_len,
            "body": self.body,
            "attachments": exported_attachments,
            "company": self.company,
            "high_level_company_type":self.high_level_company_type,
            "structured_data": exported_structured,
        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,44 @@

+aiohttp==3.9.3
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.2.0
+attrs==23.2.0
+beautifulsoup4==4.12.3
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+cryptography==42.0.2
+Deprecated==1.2.14
+fastapi==0.109.2
+frozenlist==1.4.1
+idna==3.6
+jwcrypto==1.5.1
+lxml==5.1.0
+multidict==6.0.5
+pycparser==2.21
+pydantic==2.6.1
+pydantic_core==2.16.2
+PyJWT==2.8.0
+PyPDF2==3.0.1
+python-docx==1.1.0
+python-jwt==4.1.0
+requests==2.31.0
+sniffio==1.3.0
+soupsieve==2.5
+starlette==0.36.3
+typing_extensions==4.9.0
+urllib3==2.2.0
+wrapt==1.16.0
+yarl==1.9.4
+uvicorn==0.27.1
+uvloop==0.19.0
+websockets==12.0
+google-generativeai==0.3.2
+python-dotenv==1.0.1
+langchain-community==0.0.27
+langchain-core==0.1.30
+langsmith==0.1.23
+langchain==0.1.11
+langchain-text-splitters==0.0.1
+openai==1.13.3
+langchain-openai==0.0.8

routers/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+

routers/auth.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from fastapi import APIRouter
# Router for the endpoints served under the /auth prefix.
router = APIRouter(prefix="/auth")


@router.get("/authenticate")
async def auth():
    """Placeholder endpoint: answers with a fixed confirmation message."""
    confirmation = {"Message": "Entered Auth"}
    return confirmation

routers/gmail.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from fastapi import APIRouter , Request ,HTTPException
+from controllers import gmail_controller
# Router for the endpoints served under the /process prefix.
router = APIRouter(prefix="/process")


@router.post("/receipt_data/direct")
async def get_data(request:Request):
    """Return the extracted receipt messages for the token in the request body.

    The JSON body is read for two keys: ``data`` (the token handed to
    ``GmailDataExtractor``) and the optional ``brand_name`` filter.

    Raises:
        HTTPException: 400 when ``data`` is missing or null; 500 with the
            error text for any other failure.
    """
    try:
        body_data = await request.json()
        token = body_data.get('data')
        user_str = body_data.get('brand_name')
        if token is None:
            # Must be raised: a *returned* HTTPException is serialised as an
            # ordinary response body instead of producing a 400 status.
            raise HTTPException(status_code=400, detail="Token Invalid!")
        return gmail_controller.GmailDataExtractor(token, user_str).extract_messages()
    except HTTPException:
        # Let deliberate HTTP errors through instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

routers/queryfilter_router.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from fastapi import APIRouter , Request ,HTTPException
+from controllers import ner_ai_controller as ai
+import logging
# Router for the endpoints served under the /queryfilter prefix.
router = APIRouter(prefix="/queryfilter")


@router.post("/gemini")
async def get_data(request:Request):
    """Return the brand name recognised in the ``query`` field of the JSON body.

    Returns:
        ``{"brand_name": None}`` when the body is not an object or ``query``
        is missing, null, not a string, or blank; otherwise
        ``{"brand_name": ai.get_brand_from_query(query)}``.
    """
    body = await request.json()
    # Anything that is not a non-blank string (JSON null, numbers, a list
    # body, ...) is treated like an absent query instead of crashing on
    # ``.get`` / ``.strip()``.
    user_query = body.get('query') if isinstance(body, dict) else None
    if not isinstance(user_query, str) or not user_query.strip():
        return {"brand_name": None}
    return {"brand_name": ai.get_brand_from_query(user_query)}

routers/websockets_new.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from fastapi import APIRouter , Request ,HTTPException , WebSocket
+from controllers import websocket_controller as wc
+from controllers import ws_controller as w
+from controllers import ner_ai_controller as ai
+from services.chat_client_NER import ChatClient
+import logging
+import aiohttp
# Router for the endpoints served under the /websockets prefix.
router = APIRouter(prefix="/websockets")


@router.websocket("/ws")
async def get_data(websocket:WebSocket):
    """Serve one websocket session.

    The first frame must be a JSON object with ``access_token`` and an
    optional ``brand_name`` query. Without a token the client is told so and
    the socket is closed. Otherwise the query, if any, is reduced to a brand
    name by ``ChatClient`` and the session is handed to ``w.websocket_main``.
    """
    await websocket.accept()
    payload = await websocket.receive_json()
    access_token = payload.get('access_token')
    user_query = payload.get('brand_name')
    # The access token is a credential and is deliberately not logged.
    logging.info("brand_name: %s", user_query)

    if access_token is None:
        await websocket.send_text("Access Token Invalid OR NULL !!!")
        await websocket.close()
        # Stop here: without a token there is nothing to stream.
        return

    brand_name = ""
    if user_query is not None:
        chat = ChatClient().create(conversation=[])
        response = chat.send_message(content=f"{user_query}", stream=False)
        # Surrounding whitespace is removed so that a reply such as
        # "others\n" is still recognised as the "no brand found" sentinel.
        answer = response.text.strip()
        brand_name = None if answer == 'others' else answer

    await w.websocket_main(access_token ,websocket,brand_name)

services/__init__.py ADDED Viewed

File without changes

services/base_ai_client.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import os
+import google.generativeai as genai
+from dotenv import load_dotenv
+load_dotenv()
class BaseAIClient:
    """Base for chat clients that start every conversation with a fixed primer."""

    def __init__(self,system,model_response):
        """Configure the API key from ``GOOGLE_API_KEY`` and store the primer texts.

        Args:
            system: Text sent as the first ``user`` turn of every chat.
            model_response: Text used as the first ``model`` turn.
        """
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        self.system, self.model = system, model_response

    def create(self, conversation):
        """Start a ``gemini-pro`` chat whose history is the primer plus ``conversation``.

        Args:
            conversation: Iterable of further history entries appended after
                the two primer turns.

        Returns:
            The chat session returned by ``start_chat``.
        """
        chat_model = genai.GenerativeModel('gemini-pro')
        primer = [
            {"role": 'user', "parts": [self.system]},
            {"role": 'model', "parts": [self.model]},
        ]
        return chat_model.start_chat(history=[*primer, *conversation])

services/chat_client_NER.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from .base_ai_client import BaseAIClient
class ChatClient(BaseAIClient):
    """Chat client primed to answer with the company/brand name found in a text.

    The prompt instructs the model to reply with the name only, or with the
    single word "others" when no company name can be found.
    """
    def __init__(self):
        # First argument: the instruction prompt (sent as the opening user turn).
        # Second argument: the canned first model turn, a single space.
        super().__init__(
            '''Your name is Hushh Bot. You will be acting as an NER, recognizing and identifying the Company name or brand name in the input text provided to you.
            For example: If you are given an input text as -
            input text: "get my chanel receipts"
            output: chanel
            You will provide the output with only the company name strictly.
            Just reply with the Company name.
            Above is just an example; you will not receive all the text in a similar format.
            If you are unable to find the company name then strictly reply with only one word that is "others".
            ''',
            ''' '''
        )

services/utils.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import PyPDF2
+from docx import Document
+import io
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from typing_extensions import Concatenate
+from typing import List
+# from langchain_community.llms import OpenAI
+from langchain_community.callbacks import get_openai_callback
+from langchain.output_parsers import PydanticOutputParser
+from langchain.prompts import PromptTemplate
+from langchain_core.pydantic_v1 import BaseModel, Field, validator
+import os
+import logging
+import base64
+from langchain_openai import OpenAI
+import re
+import json
+#Setting the openai api key
+api_key=os.getenv('OPENAI_API_KEY')
# Output schema for receipt extraction: every field is a string and each
# description is instruction text for the language model (it is emitted
# through the parser's format instructions), so the wording is part of the
# prompt. Documented with comments rather than a class docstring so that
# the generated schema stays unchanged.
class Candidate(BaseModel):
    # Primary brand named on the receipt.
    brand: str = Field(description="Please identify and provide the primary brand name listed on the receipt. If multiple brand names are present, determine and specify the most prominent or relevant brand associated with the primary transaction on the receipt. If the brand name is not explicitly mentioned, include any contextual details or indirect indicators that might help in accurately identifying the brand. Defalut value will be 'null'.Try to return a brand name.Look for the brand name which would be mostly at the top of the document or text provided.")
    # Total order value.
    total_cost: str = Field(description="Identify and provide the 'Total Order Value' listed on the receipt. Please specify the exact section where this value is noted, typically labeled as 'Total', 'Total Amount','total' , 'total amount' ,'total cost','Total Cost','Grand total','grand total'. Include any other labeling variations that might represent the total order value. If the total order value is not present or cannot be determined, explicitly state 'null' as the response.Rember total cost is always the highest value and it mostly cannot be a single digit value like 2.9 , 5.8 , 5 ,etc.But remember total cost is always the highest value which will be found somewhere in middle of provide text.Not at very end or at the very start.")
    # City and state of the purchase (booking origin for travel receipts).
    location: str = Field(description="Please provide the city and state where the purchase was made, as indicated on the receipt. For travel-related receipts, extract the location from which the booking was initiated, focusing on the booking origin or departure city/state, rather than the destination. Look for specific details such as the departure airport code, departure city, or the booking location mentioned in the itinerary or booking confirmation section. These details typically indicate the purchase's origin. If the purchase location is not explicitly stated or if the information is ambiguous, provide any relevant clues or context from the receipt that might assist in accurately identifying the location. If no such information is available, or if it remains unclear, clearly mark the response as 'null'")
    # Number of items in the order.
    no_of_items: str = Field(description="Specify the total number of items listed in the order as reflected in the receipt or document. If the total count of items is not explicitly mentioned or if it cannot be determined from the provided document, please assign and return the value 'null'.")
    # One of the predefined purchase categories.
    purchase_category: str = Field(description="Identify and specify the purchase category. Choose from the following predefined categories: fashion, home, travel, food, groceries, hotels, spa, insurance, or others. If the purchase category is not explicitly stated on the receipt or document, or if it cannot be accurately determined based on the available information, assign and return the value 'null'.")
    # One of the five fixed brand categories listed in the description.
    brand_category: str = Field(description="""Based on the receipt information, use one of the following brand categories strictly:
1. "Fashion, Dress, Personal"
2. "Coffee - Personal"
3. "Food - Personal"
4. "Travel, Roam, Explore"
5. "Shopping, Hunt, Obtain"
If you don't find any brand category then return 'null'.
""")
    # Purchase date, requested in dd-MM-yyyy format.
    Date: str = Field(description="Specify the date of purchase in the format dd-MM-yyyy. If the date of purchase is not explicitly provided on the receipt or document, or if it cannot be accurately determined, assign the value 'null'. Ensure the date is formatted correctly as day, month, and year in two digits each.")
+# async def initialize_openai():
+#     model_name = "gpt-3.5-turbo-instruct"
+#     # model_name = "text-davinci-003"
+#     temperature = 0.0
+#     model = OpenAI(model_name=model_name, temperature=temperature, max_tokens=800)
def strcuture_document_data(raw_text:str)->dict:
    """Ask the completion model to turn receipt text into structured fields.

    The text is embedded in a prompt together with the format instructions
    derived from ``Candidate``; the model reply is parsed back into a
    ``Candidate`` and returned as its attribute dictionary.

    Args:
        raw_text: Text extracted from a receipt document.

    Returns:
        dict: The parsed fields, or an empty dict when anything fails (the
        error is printed, not raised).
    """
    try:
        llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.0, max_tokens=800)

        instructions = "Extract and return strictly a JSON object containing only the  following keys strictly : brand , total_cost , location , no_of_items , purchase_category,brand_category , Date ."
        reminder = (
            "\nRemember the response should only be in JSON format very Strictly and it should have these keys brand , total_cost , location , no_of_items , purchase_category,brand_category , Date , very Strictly.\n"
            + "Remeber that if one of the key is null then don't assume that other keys maybe null.Always get the values of all the keys mentioned."
        )
        doc_query = instructions + "\nReceipt Data:\n" + raw_text + reminder
        print(raw_text)

        parser = PydanticOutputParser(pydantic_object=Candidate)
        template = PromptTemplate(
            template="Answer the user query.\n{query}\n{format_instructions}\n",
            input_variables=["query"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )
        rendered_prompt = template.format_prompt(query=doc_query)

        with get_openai_callback():
            completion = llm(rendered_prompt.to_string())
        print(f"GPT Response {completion}")

        structured = parser.parse(completion).__dict__
        print("printing structured json")
        print(structured)
        return structured
    except Exception as e:
        # Best-effort by design: any failure yields an empty result.
        print(f"Error occurred: {e}")
        return {}
def extract_json_from_string(input_string):
    """Return every JSON object embedded in a piece of text.

    The text is scanned for ``{`` and a JSON decoder is run from each
    candidate position, so nested objects and objects spanning several lines
    are returned whole. A ``{`` that does not start valid JSON is skipped
    instead of raising.

    Args:
        input_string: Text that may contain JSON objects, e.g. a model reply.

    Returns:
        list | None: The decoded objects in order of appearance, or ``None``
        when the text contains no decodable object.
    """
    decoder = json.JSONDecoder()
    found = []
    position = input_string.find('{')
    while position != -1:
        try:
            parsed, end = decoder.raw_decode(input_string, position)
        except json.JSONDecodeError:
            # Not JSON at this brace; try the next one.
            position = input_string.find('{', position + 1)
            continue
        found.append(parsed)
        # Resume after the decoded object so nested braces are not revisited.
        position = input_string.find('{', end)
    return found or None
def extract_text_from_pdf(pdf_data):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        str: The page texts joined in page order without a separator.
    """
    with io.BytesIO(pdf_data) as buffer:
        reader = PyPDF2.PdfReader(buffer)
        return "".join(page.extract_text() for page in reader.pages)
def extract_text_from_docx(docx_data):
    """Return the paragraph texts of a .docx document, one per line.

    Args:
        docx_data: Raw bytes of the document.

    Returns:
        str: Each paragraph's text followed by a newline.
    """
    document = Document(io.BytesIO(docx_data))
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
def extract_text_from_attachment(filename, data):
    """Decode an attachment and return its text according to the file extension.

    The extension is matched case-insensitively, so ``INVOICE.PDF`` is
    handled like ``invoice.pdf``.

    Args:
        filename: Attachment file name; ``None`` or ``""`` counts as unsupported.
        data: URL-safe base64 encoded file content.

    Returns:
        str: The extracted text for ``.pdf`` and ``.docx`` files, otherwise
        the literal ``"Unsupported document type"``.
    """
    extension_source = (filename or "").lower()
    if extension_source.endswith('.pdf'):
        return extract_text_from_pdf(base64.urlsafe_b64decode(data))
    if extension_source.endswith('.docx'):
        return extract_text_from_docx(base64.urlsafe_b64decode(data))
    # Add handling for other document types if needed
    return "Unsupported document type"