decodingdatascience commited on
Commit
dd7a19d
·
verified ·
1 Parent(s): e07dbb2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import time
import logging
import sys
import gradio as gr

from pinecone import Pinecone, ServerlessSpec
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.readers.file import PDFReader

# Optional: only if you are using OpenAI as the default LLM / embeddings
# from llama_index.llms.openai import OpenAI
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.core import Settings

# --- Logging ---
# Log to stdout so messages appear in the Hugging Face Space console.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Load Secrets from Hugging Face Spaces ---
# Add these in: Space Settings -> Variables and secrets
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # optional, if needed

# Fail fast at startup: everything below needs a Pinecone client.
if not PINECONE_API_KEY:
    raise ValueError("Missing PINECONE_API_KEY in Hugging Face Space secrets.")

# If you use OpenAI in LlamaIndex, uncomment this check
# if not OPENAI_API_KEY:
#     raise ValueError("Missing OPENAI_API_KEY in Hugging Face Space secrets.")

# If you use OpenAI explicitly in LlamaIndex, uncomment this section
# Settings.llm = OpenAI(model="gpt-4.1-mini", api_key=OPENAI_API_KEY)
# Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", api_key=OPENAI_API_KEY)

# --- Pinecone Config ---
INDEX_NAME = "quickstart"
DIMENSION = 1536  # must match the embedding model's output dimension
# NOTE(review): OpenAI embedding models are normally paired with "cosine";
# "euclidean" still works but may rank results differently — confirm intent.
METRIC = "euclidean"
CLOUD = "aws"
REGION = "us-east-1"

# --- Initialize Pinecone ---
pc = Pinecone(api_key=PINECONE_API_KEY)
46
+
47
def get_existing_index_names(pc_client):
    """Safely extract index names across Pinecone SDK response shapes.

    ``list_indexes()`` has returned different shapes across SDK versions:
    a list of dicts, a list of model objects exposing a ``name`` attribute,
    or a response object carrying its entries in ``.indexes``. This
    normalizes all of them to a plain list of names.

    Args:
        pc_client: A Pinecone client exposing ``list_indexes()``.

    Returns:
        list[str]: Names of existing indexes; ``[]`` if none could be read.
    """
    raw = pc_client.list_indexes()

    # Unwrap a response object that keeps its entries in `.indexes`;
    # fall back to treating the response itself as the iterable.
    entries = getattr(raw, "indexes", raw)

    names = []
    try:
        for idx in entries:
            # Entries may be dicts (older SDKs) or model objects (newer SDKs).
            if isinstance(idx, dict):
                name = idx.get("name")
            else:
                name = getattr(idx, "name", None)
            if name is not None:
                names.append(name)
    except TypeError:
        # Response was not iterable at all — treat as "no indexes".
        return []
    return names
65
+
66
def ensure_index(pc_client, index_name: str, dimension: int):
    """Create the Pinecone index if missing, then return a handle to it.

    Args:
        pc_client: Initialized Pinecone client.
        index_name: Name of the index to create or reuse.
        dimension: Embedding dimension used when creating a new index.

    Returns:
        A Pinecone ``Index`` handle for ``index_name``.
    """
    existing_indexes = get_existing_index_names(pc_client)

    if index_name not in existing_indexes:
        # Lazy %-args: formatting is skipped when INFO logging is disabled.
        logger.info("Creating Pinecone index: %s", index_name)
        pc_client.create_index(
            name=index_name,
            dimension=dimension,
            metric=METRIC,
            spec=ServerlessSpec(cloud=CLOUD, region=REGION),
        )
        # Poll until the index reports ready instead of a fixed sleep, which
        # races on slow provisioning. Bounded so startup cannot hang forever.
        deadline = time.time() + 60
        while time.time() < deadline:
            try:
                if pc_client.describe_index(index_name).status.get("ready"):
                    break
            except Exception:
                # describe_index can transiently fail right after creation;
                # best-effort retry until the deadline.
                pass
            time.sleep(1)
    else:
        logger.info("Using existing Pinecone index: %s", index_name)

    return pc_client.Index(index_name)
83
+
84
# --- Load Documents ---
def load_documents():
    """Load all PDFs from the local ``data`` directory.

    Returns:
        list: LlamaIndex ``Document`` objects parsed from the PDFs.

    Raises:
        ValueError: If the folder yields no documents.
    """
    documents = SimpleDirectoryReader(
        input_dir="data",
        required_exts=[".pdf"],
        file_extractor={".pdf": PDFReader()},
    ).load_data()

    if not documents:
        raise ValueError("No PDF documents were loaded from the 'data' folder.")

    # Lazy %-args: formatting is skipped when INFO logging is disabled.
    logger.info("Loaded %d document chunks/items.", len(documents))
    return documents
97
+
98
# --- Build Query Engine Once at Startup ---
def build_query_engine():
    """Assemble the RAG pipeline and return a ready-to-use query engine.

    Ensures the Pinecone index exists, loads the local PDFs, embeds them
    into the Pinecone-backed vector store, and wraps the resulting index
    as a query engine.
    """
    backing_index = ensure_index(pc, INDEX_NAME, DIMENSION)
    docs = load_documents()

    store = PineconeVectorStore(pinecone_index=backing_index)
    ctx = StorageContext.from_defaults(vector_store=store)

    vec_index = VectorStoreIndex.from_documents(docs, storage_context=ctx)
    return vec_index.as_query_engine()
112

# Build once at import time so every Gradio request reuses the same engine.
# NOTE(review): this re-embeds and re-upserts the PDFs on every app restart —
# confirm that is acceptable for this Space.
query_engine = build_query_engine()
115
# --- Gradio Function ---
def query_doc(prompt):
    """Answer a user question against the indexed document.

    Args:
        prompt: Free-text question from the Gradio textbox.

    Returns:
        str: The engine's answer, a prompt-for-input message, or an
        ``Error: ...`` string if the query raised.
    """
    # Guard clause: blank / whitespace-only input never reaches the engine.
    if not (prompt and prompt.strip()):
        return "Please enter a question."

    try:
        answer = query_engine.query(prompt)
    except Exception as exc:
        # Surface the failure in the UI instead of crashing the app.
        logger.exception("Query failed")
        return f"Error: {str(exc)}"
    return str(answer)
126
+
127
# --- Gradio UI ---
# Single-turn Q&A: one question textbox in, one answer textbox out.
demo = gr.Interface(
    fn=query_doc,
    inputs=gr.Textbox(
        label="Ask a question about the document",
        placeholder="What does the policy say about social media conduct?"
    ),
    outputs=gr.Textbox(label="Answer"),
    title="DDS Enterprise Chatbot",
    description="Ask questions based on the indexed Social Media Regulation PDF. Powered by LlamaIndex & Pinecone."
)

if __name__ == "__main__":
    demo.launch()