File size: 8,051 Bytes
593f03b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ce4d52
593f03b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import gradio as gr
import json
import os
import io
import pdfplumber
import requests
import together
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import unicodedata
from dotenv import load_dotenv
from flask import jsonify

load_dotenv()

# Base URL of the storage backend (the host name indicates an ngrok tunnel).
# The scheme must be spelled out in full and the value must not carry any
# surrounding whitespace: the endpoint URLs below are built by plain string
# interpolation, so a stray space or a truncated scheme makes every request fail.
API_URL = "https://1611-223-233-35-112.ngrok-free.app"
API_URL_FILES = f"{API_URL}/file"  # download of a stored PDF
API_URL_EMBEDDINGS = f"{API_URL}/embeddings"  # upload/download of the FAISS index
API_URL_METADATA = f"{API_URL}/metadata"  # index-position -> filename mapping

# FAISS index setup
# NOTE(review): presumably this must equal the embedding model's output size;
# confirm whenever the model below is swapped.
DIM = 768

# The Together.AI key is read from the environment (load_dotenv() above also
# pulls it from a local .env file). Fail fast at start-up when it is missing.
# An explicit raise is used instead of `assert` because assertions are removed
# when Python runs with -O, which would silently skip the check.
if not os.getenv("TOGETHER_API_KEY"):
    raise RuntimeError("api key missing")

# Sentence-transformer used to embed both documents and queries.
# Alternative model (currently disabled): "BAAI/bge-base-en-v1.5".
embedding_model = SentenceTransformer(
    "togethercomputer/m2-bert-80M-8k-retrieval",
    # SECURITY: this lets code shipped with the model repository run locally;
    # only keep it enabled for model repositories you trust.
    trust_remote_code=True,
)

# Kept as a separate name for backward compatibility; tied to DIM so the two
# values cannot drift apart.
embedding_dim = DIM


def store_document_data(PDF_FILE):
    """Embed a PDF, build a one-vector FAISS index and upload it to the backend.

    Args:
        PDF_FILE: The uploaded PDF as handed over by the Gradio ``File`` input.
            It is passed unchanged to ``extract_text_from_pdf``.

    Returns:
        The position of the document inside the freshly built index (an int)
        on success; a human-readable message when no file was given or no text
        could be extracted; or ``{"error": <message>}`` when the upload fails
        (connection problem, timeout or non-2xx HTTP status).

    Raises:
        ValueError: If the FAISS index is still empty after adding the vector.
    """
    print(" Storing document...")

    if not PDF_FILE:
        return "No PDF file provided."

    text = extract_text_from_pdf(PDF_FILE)
    if not text:
        return "Could not extract any text from the PDF."

    # The whole document is embedded as a single vector.
    embedding = embedding_model.encode([text]).astype(np.float32)
    print("Embeddings generated")
    print("Embedding shape:", embedding.shape)

    index = faiss.IndexFlatL2(embedding.shape[1])
    index.add(embedding)
    print(index, index.ntotal)
    if index.ntotal == 0:
        raise ValueError("FAISS index is empty. No embeddings added.")

    # Position of the vector that was just added.
    doc_index = index.ntotal - 1

    index_file = "index.bin"
    faiss.write_index(index, index_file)

    print(f"sending to {API_URL_EMBEDDINGS}")
    timeout_seconds = 30  # never let a dead backend hang the UI request forever
    try:
        with open(index_file, "rb") as f:
            response = requests.post(
                API_URL_EMBEDDINGS,
                files={"file": ("index.bin", f, "application/octet-stream")},
                timeout=timeout_seconds,
            )
        # Without this check an HTTP 4xx/5xx answer would be reported as success.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}

    # Log the raw body: a successful upload must not fail just because the
    # server's answer is not valid JSON.
    print("sent", response.text)
    return doc_index

def retrieve_document(query):
    """Fetch the stored document closest to *query* and return its cleaned text.

    The function downloads the FAISS index and the index-position -> filename
    metadata from the backend, searches the index for the nearest vector to
    the embedded query, downloads the matching PDF and extracts its text.

    Args:
        query: The user's question; it is embedded with ``embedding_model``.

    Returns:
        The text extracted from the best-matching PDF, or ``None`` when any
        step fails (backend unreachable, bad status code, malformed metadata,
        unreadable index, no match, failed download or failed extraction).
    """
    print(f"Retrieving document based on:\n{query}")

    timeout_seconds = 30  # never let a dead backend hang the UI request forever

    try:
        embeddings_ = requests.get(API_URL_EMBEDDINGS, timeout=timeout_seconds)
        metadata_ = requests.get(API_URL_METADATA, timeout=timeout_seconds)
    except requests.exceptions.RequestException as e:
        print(f"Error contacting storage backend: {e}")
        return None

    # Check for errors before parsing the payloads.
    if embeddings_.status_code != 200:
        print(f"Error fetching embeddings: {embeddings_.status_code} - {embeddings_.text}")
        return None

    if metadata_.status_code != 200:
        print(f"Error fetching metadata: {metadata_.status_code} - {metadata_.text}")
        return None

    try:
        # ValueError covers every JSON decoding error requests can raise;
        # KeyError/TypeError cover a payload without the expected key.
        metadata = metadata_.json()['metadata_file']
        print(metadata)
    except (ValueError, KeyError, TypeError) as e:
        print(f"Error decoding metadata JSON: {e}")
        return None

    if not isinstance(metadata, dict):
        # The lookup below indexes by string key, so anything else is unusable.
        print(f"Error decoding metadata JSON: unexpected type {type(metadata).__name__}")
        return None

    try:
        print("Response content length:", len(embeddings_.content))  # Debugging

        if not embeddings_.content:
            raise ValueError("Received empty FAISS index file")

        # faiss.read_index works on a path, so persist the payload first.
        with open("downloaded_index.bin", "wb") as f:
            f.write(embeddings_.content)

        index = faiss.read_index("downloaded_index.bin")

        print(f"✅ Successfully loaded FAISS index with {index.ntotal} vectors.")
    except (OSError, ValueError, RuntimeError) as e:
        print(f"Error loading FAISS index: {e}")
        return None

    print(index, metadata)

    # Generate query embedding and look up the single nearest document.
    query_embedding = embedding_model.encode([query]).astype(np.float32)
    _, closest_idx = index.search(query_embedding, 1)

    # FAISS reports -1 when it has no neighbour to return.
    best = closest_idx[0][0]
    doc_key = str(best)
    if best == -1 or doc_key not in metadata:
        print("No relevant document found")
        return None

    filename = metadata[doc_key]
    print(filename)

    try:
        response = requests.get(
            API_URL_FILES, params={"file": filename}, timeout=timeout_seconds
        )
    except requests.exceptions.RequestException as e:
        print(f"Error fetching file: {e}")
        return None

    if response.status_code != 200:
        # Use .text: an error body is not guaranteed to be JSON.
        print(f"Error: {response.status_code}, {response.text}")
        return None

    # Log the size only; dumping the raw PDF bytes floods the console.
    print(f"Received {len(response.content)} bytes")

    received_file = "document.pdf"
    with open(received_file, "wb") as f:
        f.write(response.content)

    prompt_doc = extract_text_from_pdf(received_file)
    print(f"PDF received successfully: received_{filename}")
    return prompt_doc
    
def clean_text(text):
    """Clean extracted PDF text for better processing by the model.

    Steps, in order: Unicode NFKC normalisation; replacement of every
    character that is not an ASCII letter, digit or one of
    ``. , ! ? ; : \\ " ( ) -`` with a space; removal of "page <number>"
    markers (case-insensitive); and finally whitespace collapsing.

    Whitespace is collapsed last because the two preceding steps themselves
    introduce blanks; collapsing earlier leaves runs of spaces and stray
    leading/trailing blanks in the result.

    Args:
        text: Raw text of one PDF page.

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\\"()\-]', ' ', text)  # Keep basic punctuation
    text = re.sub(r'(?i)(page\s*\d+)', '', text)  # Remove page numbers
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines

def extract_text_from_pdf(pdf_file):
    """Return the cleaned text of every page of *pdf_file*, joined by spaces.

    Pages for which pdfplumber yields no text are skipped. Any failure while
    opening or reading the PDF is printed and reported as ``None``.
    """
    print("extracting")
    try:
        cleaned_pages = []
        with pdfplumber.open(pdf_file) as document:
            for page in document.pages:
                raw = page.extract_text()
                if raw:
                    cleaned_pages.append(clean_text(raw))
        return " ".join(cleaned_pages)
    except Exception as err:
        print(f"Error extracting text: {err}{pdf_file}")
        return None

def split_text(text, chunk_size=500):
    """Cut *text* into consecutive pieces of at most *chunk_size* characters.

    The last piece may be shorter; an empty *text* yields an empty list.
    """
    print("splitting")
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def chatbot(user_question):
    """Answer *user_question*, grounding the prompt in a stored document when possible.

    The document most relevant to the question is looked up via
    ``retrieve_document``. When one is found, its first chunk is placed in the
    prompt; otherwise the bare question is sent to the model.

    Args:
        user_question: The question typed into the Gradio text box.

    Returns:
        The model's completion text, or an "Error generating response: ..."
        message when the completion request fails.
    """
    print("chatbot start")

    # Retrieval is best-effort: a backend outage must not prevent the model
    # from being asked, so any failure falls back to the plain question.
    try:
        doc = retrieve_document(user_question)
    except Exception as e:
        print(f"Document retrieval failed: {e}")
        doc = None

    if doc:
        print(f"found doc:\n{doc}\n")
        # Split into smaller chunks
        chunks = split_text(doc)

        # Use only the first chunk (to optimize token usage)
        prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
        print(f"prompt:\n{prompt}")
    else:
        prompt = user_question

    # Send to Together.AI (Mistral-7B)
    try:
        print("asking")
        response = together.Completion.create(
            model="mistralai/Mistral-7B-Instruct-v0.1",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7,
        )

        # Return chatbot's response
        return response.choices[0].text
    except Exception as e:
        return f"Error generating response: {e}"

def helloWorld(text):
    """Return *text* followed by the fixed suffix `` : hello world``."""
    greeting = "{} : hello world".format(text)
    return greeting

# Gradio Interface: one tab per entry point, listed in display order.

# Tab 1: question answering over the stored documents.
_qa_tab = gr.Interface(
    fn=chatbot,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs=gr.Textbox(label="Answer"),
    title="PDF Q&A Chatbot (Powered by Together.AI)",
)

# Tab 2: trivial echo function.
_hello_tab = gr.Interface(fn=helloWorld, inputs="text", outputs="text")

# Tab 3: upload a PDF and push its FAISS index to the backend.
_store_tab = gr.Interface(
    fn=store_document_data,
    inputs=[gr.File(label="PDF_FILE")],
    outputs=gr.Textbox(label="Answer"),
    title="pdf file, metadata, index parsing and storing",
)

iface = gr.TabbedInterface([_qa_tab, _hello_tab, _store_tab])

# Launch Gradio app
iface.launch(show_error=True)