# RAGResume / app.py
import hashlib
import os

import gradio as gr
import nest_asyncio
from llama_index.core import Settings, StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.core.workflow import Context, Event, StartEvent, StopEvent, Workflow, step
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.mistralai import MistralAI
from llama_parse import LlamaParse

# ✅ Allow nested event loops (needed where a loop is already running, e.g. Jupyter/Colab)
nest_asyncio.apply()

# ✅ Load secrets from the Hugging Face Spaces environment
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# ✅ Warn loudly at startup if either key is missing
if MISTRAL_API_KEY is None or LLAMA_CLOUD_API_KEY is None:
    print("🚨 ERROR: Missing API keys. Please set them in Hugging Face Secrets.")

# ✅ Global embedding model and LLM (Mistral for both; LlamaParse reads LLAMA_CLOUD_API_KEY from the environment)
Settings.embed_model = MistralAIEmbedding(model_name="mistral-embed", api_key=MISTRAL_API_KEY)
llm = MistralAI(model="mistral-large-latest", api_key=MISTRAL_API_KEY)
class QueryEvent(Event):
    """Carries the user's question from set_up to ask_question."""
    query: str
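# Event flow: StartEvent(resume_file, query) -> set_up -> QueryEvent -> ask_question -> StopEvent(result)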
class RAGWorkflow(Workflow):
    storage_dir = "./storage"
    hash_file = "./last_resume_hash.txt"
    llm: MistralAI
    query_engine = None  # query engine built from the vector index in set_up()
def compute_file_hash(self, file_path):
"""Compute SHA256 hash of a file from its path."""
hasher = hashlib.sha256()
with open(file_path, "rb") as f: # Read file in binary mode
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
def get_last_stored_hash(self):
"""Retrieve the last stored resume hash, if available."""
if os.path.exists(self.hash_file):
with open(self.hash_file, "r") as f:
return f.read().strip()
return None
def update_stored_hash(self, new_hash):
"""Update the stored resume hash after processing a new file."""
with open(self.hash_file, "w") as f:
f.write(new_hash)
@step
async def set_up(self, ctx: Context, ev: StartEvent) -> QueryEvent:
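        """Parse the uploaded resume (or reuse the cached index) and build the query engine."""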
        if not getattr(ev, "resume_file", None):
            raise ValueError("❌ No resume file provided")
        # ✅ Extract the correct file path
        if isinstance(ev.resume_file, gr.utils.NamedString):
            file_path = ev.resume_file.name
        elif isinstance(ev.resume_file, str) and os.path.exists(ev.resume_file):
            file_path = ev.resume_file
        else:
            raise ValueError("⚠️ Invalid file format received!")
        print(f"✅ Resume File Path: {file_path}")
        self.llm = llm  # reuse the module-level MistralAI client
        # ✅ Compute hash of the uploaded resume file
new_resume_hash = self.compute_file_hash(file_path)
last_stored_hash = self.get_last_stored_hash()
if os.path.exists(self.storage_dir) and last_stored_hash == new_resume_hash:
# Resume hasn't changed; load the existing index
storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
index = load_index_from_storage(storage_context)
else:
# Resume is new; process and update storage
documents = LlamaParse(
result_type="markdown",
content_guideline_instruction="Extract structured bullet points from the resume."
).load_data(file_path, extra_info={"file_name": os.path.basename(file_path)})
index = VectorStoreIndex.from_documents(
documents,
                embed_model=Settings.embed_model  # Mistral embeddings configured above via Settings
)
index.storage_context.persist(persist_dir=self.storage_dir)
            # ✅ Update stored hash
self.update_stored_hash(new_resume_hash)
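        # Retrieve the 5 most similar resume chunks for each question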
self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)
return QueryEvent(query=ev.query)
@step
async def ask_question(self, ctx: Context, ev: QueryEvent) -> StopEvent:
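        """Run the question against the query engine and return the answer text."""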
response = self.query_engine.query(f"This is a question about the resume: {ev.query}")
return StopEvent(result=response.response)
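# Example (standalone, without the Gradio UI): a minimal sketch, assuming a local
# "resume.pdf" exists and the API keys above are set:
#
#   import asyncio
#   wf = RAGWorkflow(timeout=120, verbose=False)
#   answer = asyncio.run(wf.run(resume_file="resume.pdf", query="Where does the applicant work?"))
#   print(answer)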
# ✅ Single workflow instance, reused across requests (120 s timeout covers parsing + indexing)
w = RAGWorkflow(timeout=120, verbose=False)
async def process_resume(file, query):
"""Handles Gradio file upload and query processing (Async)."""
if file is None:
return "❌ Please upload a resume."
if not query:
return "❌ Please enter a question."
    try:
        # ✅ Use the actual file path from Gradio (NamedString exposes .name; a plain str is already a path)
        file_path = file.name if hasattr(file, "name") else file
        # ✅ Debugging information
        print(f"✅ File uploaded: {file_path}")
        print(f"✅ File size: {os.path.getsize(file_path)} bytes")
        # ✅ Run the RAG workflow with the actual file path
        result = await w.run(
            resume_file=file_path,  # ✅ pass a file path, not BytesIO
            query=query,
        )
        print("✅ Result:", result)  # Debug output
        return result if result else "⚠️ No relevant information found."
    except Exception as e:
        print("🚨 Error:", str(e))
        return f"🚨 Error occurred: {str(e)}"
# ✅ Function to clear inputs
def clear_inputs():
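    """Reset the file, question, and response fields (order matches the clear_btn outputs list)."""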
return None, "", ""
# ✅ Create Gradio UI
with gr.Blocks() as demo:
gr.Markdown("# πŸ“„ RAGResume")
gr.Markdown("""
**Upload a resume and ask questions about it!**
""")
gr.Markdown("""
1. Upload a resume in PDF format.
2. Enter a question about the resume (example: where does the applicant currently work?).
3. Click on the "Submit" button to get the response.
4. Click on the "Clear" button to reset the inputs.
""")
with gr.Row():
        file_input = gr.File(label="📄 Upload Resume (PDF)")
        query_input = gr.Textbox(label="💬 Enter your question")
    output = gr.Textbox(label="📝 Response")
with gr.Row():
        submit_btn = gr.Button("🚀 Submit")
clear_btn = gr.Button("🧹 Clear")
submit_btn.click(process_resume, inputs=[file_input, query_input], outputs=output)
clear_btn.click(clear_inputs, outputs=[file_input, query_input, output])
# ✅ Launch Gradio; queue() enables async event handlers like process_resume
if __name__ == "__main__":
    demo.queue()
    demo.launch(share=True)  # public link; not needed when hosted on Hugging Face Spaces