File size: 8,852 Bytes
7c331c3
a80e992
7c331c3
 
 
157ff13
 
7c331c3
 
 
 
 
9f7b11b
e904acc
a80e992
157ff13
a80e992
 
7c331c3
 
a80e992
7c331c3
 
 
 
157ff13
7c331c3
 
a80e992
7c331c3
 
 
 
 
 
 
 
 
 
 
 
 
 
a80e992
7c331c3
 
 
 
 
a80e992
157ff13
7c331c3
 
 
 
 
62d72e3
7c331c3
 
 
a80e992
157ff13
3be2608
 
7c331c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b5f8af
7c331c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157ff13
7c331c3
7b3d807
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243

import streamlit as st
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import textwrap
import os
import json
import tempfile
import os
import requests





# ------------------- Secure Credential Loading for Hugging Face ------------------- #
# Loads the Service Account JSON from a Hugging Face Space secret and wires it
# into Google's Application Default Credentials (ADC) discovery.

# 1. Load the Service Account JSON string from the environment variable (secret).
gcp_credentials_json_str = os.getenv("GCP_CREDENTIALS_JSON")
project_id = "wise-env-461717-t5"  # Fallback project id; normally overridden below.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# 2. Check if the secret is present.
if gcp_credentials_json_str:
    try:
        # Write to the /tmp/ directory, which is writable on Hugging Face Spaces.
        credentials_file_path = "/tmp/gcp_service_account.json"

        # 3. Persist the JSON string so the Google client libraries can read it.
        with open(credentials_file_path, "w") as f:
            f.write(gcp_credentials_json_str)

        # 4. Point ADC at the file we just wrote.
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file_path

        # Extract project_id from the credentials. Keep the hard-coded fallback
        # when the key is missing — the original `.get("project_id")` returned
        # None in that case, clobbering the fallback and forcing a hard stop
        # at the later project-id check.
        creds_dict = json.loads(gcp_credentials_json_str)
        project_id = creds_dict.get("project_id") or project_id

    except Exception as e:
        st.error(f"🚨 Failed to process GCP credentials: {e}")
        st.stop()
else:
    st.error("🚨 GCP_CREDENTIALS_JSON secret not found! Please add it to your Hugging Face Space settings.")
    st.stop()


# ------------------- Configuration ------------------- #
# project_id comes from the loaded service-account credentials; abort the app
# early if it could not be determined.
if not project_id:
    st.error("🚨 Project ID could not be found in the GCP credentials.")
    st.stop()

# Document AI processor settings — supply your own processor id and region.
location = "us" # e.g., "us" or "eu"
processor_id = "86a7eec52bbb9616" # <-- REPLACE WITH YOUR PROCESSOR ID




# ------------------- Google Document AI Client (Uses ADC) ------------------- #
# The client picks up the credentials exported via GOOGLE_APPLICATION_CREDENTIALS.
try:
    regional_endpoint = f"{location}-documentai.googleapis.com"
    docai_client = documentai_v1.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=regional_endpoint)
    )
    full_processor_name = docai_client.processor_path(project_id, location, processor_id)
except Exception as e:
    st.error(f"Error initializing Document AI client: {e}")
    st.stop()


@st.cache_resource
def load_embedding_model():
    """Load and cache (per Streamlit process) the sentence-transformer model.

    The cache lives under /tmp because that is the only writable location
    on Hugging Face Spaces.
    """
    hf_cache = "/tmp/hf_cache"
    os.makedirs(hf_cache, exist_ok=True)

    # Point the Hugging Face libraries at the writable cache directory.
    os.environ["HF_HOME"] = hf_cache
    os.environ["TRANSFORMERS_CACHE"] = hf_cache

    return SentenceTransformer("all-MiniLM-L6-v2", cache_folder=hf_cache)
embed_model = load_embedding_model()


# ------------------- Utility Functions ------------------- #
def chunk_text(text, max_chars=500):
    """Split *text* into word-wrapped chunks of at most *max_chars* characters."""
    return textwrap.wrap(text, width=max_chars)
def extract_text_with_documentai(file_path):
    """Run the PDF at *file_path* through Document AI and return its full text."""
    with open(file_path, "rb") as pdf:
        pdf_bytes = pdf.read()

    request = documentai_v1.ProcessRequest(
        name=full_processor_name,
        raw_document=documentai_v1.RawDocument(
            content=pdf_bytes, mime_type="application/pdf"
        ),
    )
    result = docai_client.process_document(request=request)
    return result.document.text

def build_index(text):
    """Chunk *text*, embed each chunk, and return (FAISS L2 index, chunk list)."""
    chunks = chunk_text(text)
    vectors = np.array(embed_model.encode(chunks))
    # Exact (flat) L2 index — fine for single-document scale.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks

def retrieve_context(query, index, text_chunks, top_k=5):
    """Return the *top_k* chunks nearest to *query* in embedding space."""
    query_vec = np.array(embed_model.encode([query]))
    _, neighbor_ids = index.search(query_vec, top_k)
    return [text_chunks[i] for i in neighbor_ids[0]]

# ------------------- Gemini API Functions ------------------- #
def ask_groq_agent(query, context):
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3
        }
    )
    return response.json()["choices"][0]["message"]["content"]
def get_summary(text):
    prompt = f"Please provide a concise summary of the following document:\n\n{text[:4000]}"
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3
        }
    )
    return response.json()["choices"][0]["message"]["content"]


def generate_flashcards(text_chunks):
    joined_text = "\n".join(text_chunks)
    prompt = (
        "Generate 5 helpful flashcards from the following content. "
        "Use the format exactly like this:\n\n"
        "Q: What is ...?\nA: ...\n\nQ: How does ...?\nA: ...\n\n"
        "Text:\n" + joined_text
    )
    
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.5
        }
    )
    content = response.json()["choices"][0]["message"]["content"]

    flashcards = []
    question = None
    for line in content.strip().splitlines():
        line = line.strip()
        if line.lower().startswith("q:"):
            question = line[2:].strip()
        elif line.lower().startswith("a:") and question:
            answer = line[2:].strip()
            flashcards.append({"question": question, "answer": answer})
            question = None
    return flashcards

st.title("πŸ“„ PDF AI Assistant (Groq + DocAI)")

# Per-session document state: FAISS index, chunk list, and full extracted text.
if "index" not in st.session_state:
    st.session_state.index = None
    st.session_state.text_chunks = []
    st.session_state.raw_text = ""

with st.sidebar:
    st.header("πŸ“€ Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    if uploaded_file is not None:
        # Defined before the try-block so the finally-clause is always safe.
        # (The original raised NameError in `finally` when NamedTemporaryFile
        # itself failed, because tmp_path was never bound.)
        tmp_path = None
        try:
            # Document AI needs an on-disk file, so spill the upload to /tmp.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                tmp_file.write(uploaded_file.read())
                tmp_file.flush()
                tmp_path = tmp_file.name

            # DEBUG: File info
            st.write("Saved file at:", tmp_path)
            st.write("File size:", os.path.getsize(tmp_path), "bytes")
            st.write("File exists:", os.path.exists(tmp_path))

            with st.spinner("Extracting text using Document AI..."):
                raw_text = extract_text_with_documentai(tmp_path)
                index, text_chunks = build_index(raw_text)
                st.session_state.index = index
                st.session_state.text_chunks = text_chunks
                st.session_state.raw_text = raw_text
                st.success("βœ… Document processed successfully.")
        except Exception as e:
            st.error(f"Error: {e}")
        finally:
            # Only unlink when the temp file was actually created.
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)


# ------------------- Q&A Interface ------------------- #
st.subheader("❓ Ask Questions")
# Explicit None-check: a SWIG-wrapped faiss index should not be relied on for
# truthiness, and None is the sentinel set at session init.
if st.session_state.index is not None:
    question = st.text_input("Enter your question")
    # Skip the LLM round-trip entirely when the question box is blank.
    if st.button("Ask") and question.strip():
        context = "\n\n".join(
            retrieve_context(question, st.session_state.index, st.session_state.text_chunks)
        )
        answer = ask_groq_agent(question, context)
        st.markdown(f"**Answer:** {answer}")
else:
    st.info("Upload a PDF to start asking questions.")

# ------------------- Summary Interface ------------------- #
st.subheader("πŸ“ Document Summary")
if st.session_state.text_chunks:
    if st.button("Generate Summary"):
        with st.spinner("Generating summary..."):
            summary = get_summary(" ".join(st.session_state.text_chunks))
            st.markdown(summary)
else:
    st.info("Upload a PDF to get a summary.")

# ------------------- Flashcards ------------------- #
st.subheader("🧠 Flashcards")
if st.session_state.text_chunks:
    if st.button("Generate Flashcards"):
        with st.spinner("Generating flashcards..."):
            for card in generate_flashcards(st.session_state.text_chunks):
                st.markdown(f"**Q: {card['question']}**\n\nA: {card['answer']}")
else:
    st.info("Upload a PDF to generate flashcards.")