# File size: 2,311 Bytes
# 90cf652
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import fitz  # PyMuPDF for PDFs
import pytesseract
from PIL import Image
import io
from flask import Flask, request, jsonify
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Initialize Flask app
app = Flask(__name__)

# Configure the Mistral client from the environment. The placeholder is only
# a fallback for when MISTRAL_API_KEY is unset: setdefault() never overwrites
# a real key that the shell or deployment has already exported.
os.environ.setdefault("MISTRAL_API_KEY", "your_api_key_here")
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])

# Set Tesseract Path for Windows (if needed)
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, falling back to OCR for pages with no text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages in page order, each page followed by a newline.
    """
    page_texts = []

    # The context manager closes the document even if rendering or OCR of a
    # page raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text("text")

            # A page with only whitespace has no usable text layer (e.g. a
            # scan), so render it to an image and run OCR on that instead.
            if not page_text.strip():
                pix = page.get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes()))
                page_text = pytesseract.image_to_string(img)

            page_texts.append(page_text + "\n")

    return "".join(page_texts)

def query_mistral(pdf_text, user_query, max_chars=3000, model="mistral-7b"):
    """Send the document text and the user's question to the Mistral chat API.

    Args:
        pdf_text: Text extracted from the PDF.
        user_query: The question to ask about the document.
        max_chars: Maximum number of characters of ``pdf_text`` to include in
            the prompt.
        model: Model identifier passed to ``client.chat``.

    Returns:
        The content of the first choice in the chat response.
    """
    # NOTE(review): confirm "mistral-7b" is a model id the API accepts.
    excerpt = pdf_text[:max_chars]
    # Only tell the model the document was cut when it actually was.
    if len(pdf_text) > max_chars:
        excerpt += "... (truncated)"

    messages = [
        ChatMessage(role="system", content="You are an AI that answers questions based on PDFs."),
        ChatMessage(role="user", content=f"Document content: {excerpt}"),
        ChatMessage(role="user", content=f"User question: {user_query}"),
    ]

    response = client.chat(model=model, messages=messages)
    return response.choices[0].message.content

@app.route("/upload", methods=["POST"])
def upload_pdf():
    """Store the uploaded PDF and return a preview of its extracted text.

    Expects a multipart form with the PDF in the ``file`` field.

    Returns:
        A JSON response with a confirmation message and the first 500
        characters of the extracted text, or a 400 JSON error when no file
        was supplied.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    uploaded = request.files["file"]
    # A form submitted without choosing a file still carries a "file" part,
    # but with an empty filename; reject it instead of saving an empty file
    # and passing it to the PDF parser.
    if not uploaded.filename:
        return jsonify({"error": "No file uploaded"}), 400

    # Fixed path: the /chat route reads the document back from this location.
    pdf_path = "uploaded.pdf"
    uploaded.save(pdf_path)

    # Extract text
    pdf_text = extract_text_from_pdf(pdf_path)
    return jsonify({"message": "PDF uploaded and processed", "text": pdf_text[:500]})  # Preview

@app.route("/chat", methods=["POST"])
def chat():
    """Answer a question about the most recently uploaded PDF.

    Expects a JSON object body with an optional ``query`` string.

    Returns:
        A JSON response with the model's answer under ``response``, or a 400
        JSON error when the body is not a JSON object or no PDF is on disk.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so the failure can be reported as a JSON error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    user_query = data.get("query", "")

    # Same fixed path the /upload route writes to.
    pdf_path = "uploaded.pdf"
    if not os.path.exists(pdf_path):
        return jsonify({"error": "No PDF uploaded"}), 400

    pdf_text = extract_text_from_pdf(pdf_path)
    response = query_mistral(pdf_text, user_query)
    return jsonify({"response": response})

if __name__ == "__main__":
    # Debug mode enables the interactive debugger, which must not be exposed
    # outside local development, so it is opt-in via FLASK_DEBUG rather than
    # always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true"}
    app.run(debug=debug_enabled)