Kurian Jose
committed on
Commit
·
15c8adb
1
Parent(s):
6b63a5c
Finalize backend for HF deployment
Browse files- Dockerfile +6 -0
- README.md +2 -9
- app.py +131 -0
- data/knowledge_base.json +92 -0
- requirements.txt +9 -0
Dockerfile
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9-slim

WORKDIR /app

# The embedding model is downloaded at startup; point the Hugging Face cache at a
# world-writable location so the download also works when the container is run
# as a non-root user (as Hugging Face Docker Spaces do).
ENV HF_HOME=/tmp/huggingface

# Install dependencies before copying the application source so that the
# (slow) pip layer is reused from cache when only the code or data changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

# One worker keeps a single copy of the embedding model and FAISS index in memory.
# The timeout is raised above gunicorn's 30 s default because app.py builds the
# RAG chain at import time, and LLM calls can be slow.
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--timeout", "120", "app:app"]
|
README.md
CHANGED
|
@@ -1,11 +1,4 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 🏃
|
| 4 |
-
colorFrom: green
|
| 5 |
-
colorTo: red
|
| 6 |
sdk: docker
|
| 7 |
-
|
| 8 |
-
short_description: The flask api backend needed for the chatbot on the portfoli
|
| 9 |
-
---
|
| 10 |
-
|
| 11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: KurianGPT Backend
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: docker
|
| 4 |
+
---
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from flask import Flask, request, jsonify
|
| 4 |
+
from flask_cors import CORS
|
| 5 |
+
from langchain_community.vectorstores import FAISS
|
| 6 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_groq import ChatGroq
|
| 8 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 9 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 10 |
+
from langchain.chains import create_retrieval_chain
|
| 11 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 12 |
+
from langchain_core.documents import Document
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
import re
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
app = Flask(__name__)
|
| 20 |
+
CORS(app)
|
| 21 |
+
|
| 22 |
+
# Shared retrieval chain read by the /api/chat handler. It is assigned by
# initialize_rag_chain() and stays None when that initialization fails.
rag_chain = None
|
| 23 |
+
|
| 24 |
+
def json_to_text(json_data):
    """Flatten the knowledge-base dictionary into plain text for embedding.

    Each top-level key becomes a title-cased heading (underscores replaced by
    spaces) and every top-level section is followed by a blank line.
    Dictionaries are written one entry per line, lists one ``-`` bullet per
    item. Containers nested one level deeper (for example a list of courses
    inside an education entry) are written inline as comma-separated text so
    that Python ``repr`` syntax such as ``['a', 'b']`` does not end up in the
    text that is embedded.

    Args:
        json_data: Mapping parsed from the knowledge-base JSON file.

    Returns:
        The rendered text; an empty string for an empty mapping.
    """

    def label(key):
        # "tech_stack" -> "Tech Stack"
        return str(key).replace('_', ' ').title()

    def inline(value):
        # Render nested containers as readable text instead of Python literals.
        if isinstance(value, dict):
            return ", ".join(f"{label(k)}: {inline(v)}" for k, v in value.items())
        if isinstance(value, (list, tuple)):
            return ", ".join(inline(v) for v in value)
        return str(value)

    lines = []
    for key, value in json_data.items():
        if isinstance(value, dict):
            lines.append(f"{label(key)}:")
            lines.extend(f" {label(k)}: {inline(v)}" for k, v in value.items())
        elif isinstance(value, list):
            lines.append(f"{label(key)}:")
            for item in value:
                if isinstance(item, dict):
                    lines.extend(
                        f" - {label(k)}: {inline(v)}" for k, v in item.items()
                    )
                else:
                    lines.append(f" - {inline(item)}")
        else:
            lines.append(f"{label(key)}: {value}")
        # Blank line separating top-level sections.
        lines.append("")
    # Join once instead of growing a string with repeated "+=".
    return "".join(f"{line}\n" for line in lines)
|
| 47 |
+
|
| 48 |
+
def initialize_rag_chain():
    """Build the retrieval-augmented generation chain and store it in ``rag_chain``.

    Steps: load ``data/knowledge_base.json``, flatten it to text, split the
    text into overlapping chunks, embed the chunks into an in-memory FAISS
    index, and combine the resulting retriever with the Groq chat model and
    the KurianGPT prompt.

    Any failure is caught and printed, and the global ``rag_chain`` is left
    unchanged (``None`` on first start), which the ``/api/chat`` handler
    reports as an error instead of crashing the server at import time.
    """
    global rag_chain
    try:
        # 1. Load data from JSON file
        print("Loading data from knowledge_base.json...")
        # Resolve the data directory relative to this module rather than the
        # current working directory, so the file is found however the app is started.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        json_path = os.path.join(base_dir, "data", 'knowledge_base.json')

        # JSON is UTF-8 by specification; do not depend on the container locale.
        with open(json_path, 'r', encoding='utf-8') as f:
            knowledge_base = json.load(f)

        # Convert the entire JSON to a single text string
        text_content = json_to_text(knowledge_base)
        # Wrap it in a LangChain Document object
        documents = [Document(page_content=text_content)]

        # 2. Chunk the documents
        print("Splitting document into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} text chunks.")

        # 3. Create Embeddings
        print("Initializing embedding model...")
        embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

        # 4. Create FAISS Vector Store
        print("Creating FAISS vector store...")
        vector_store = FAISS.from_documents(chunks, embedding_model)
        retriever = vector_store.as_retriever()

        # 5. Initialize LLM
        # NOTE(review): presumably ChatGroq picks up the Groq API key from the
        # environment populated by load_dotenv() - confirm against deployment secrets.
        print("Initializing Groq LLM...")
        llm = ChatGroq(temperature=0, model_name="qwen/qwen3-32b")

        # 6. Create RAG Chain
        print("Creating RAG chain...")
        prompt = ChatPromptTemplate.from_template("""
        You are "KurianGPT", an expert and very friendly AI assistant providing information about Kurian Jose based on his resume and project documents.
        Answer the user's question based only on the following context.
        Refer too yourself as "KurianGPT" and Kurian as "Kurian".
        Avoid showing internal reasoning. Do not output <think> tags. Respond directly and professionally.
        Keep responses **very brief** (1–2 sentences max) unless the user asks for more details or examples.
        If the answer is not in the context, politely say that you can only answer questions regarding kurian's professional background and projects.
        refer to the user as "you" and Kurian as "Kurian".
        <context>
        {context}
        </context>

        Question: {input}
        """)
        document_chain = create_stuff_documents_chain(llm, prompt)
        rag_chain = create_retrieval_chain(retriever, document_chain)
        print("--- RAG Chain Initialized Successfully! ---")
    except Exception as e:
        # Deliberate best-effort: keep the web app importable and let the
        # chat endpoint report that the chain is unavailable.
        print(f"Error during RAG initialization: {e}")
|
| 104 |
+
|
| 105 |
+
def strip_think_tags(text):
    """Remove ``<think>...</think>`` reasoning blocks from model output.

    Besides properly closed blocks, a ``<think>`` block that is never closed
    (for example because generation was cut off) is removed up to the end of
    the text, so partial reasoning is not shown to the user.

    Args:
        text: Raw model output; ``None`` or an empty string is accepted.

    Returns:
        The text without reasoning blocks, stripped of surrounding whitespace.
        An empty string when ``text`` is ``None`` or empty.
    """
    if not text:
        return ""
    # DOTALL lets "." span newlines; "\Z" also matches an unterminated block.
    return re.sub(r"<think>.*?(?:</think>|\Z)", "", text, flags=re.DOTALL).strip()
|
| 107 |
+
|
| 108 |
+
#App routes
|
| 109 |
+
@app.route('/api/chat', methods=['POST'])
def chat():
    """Answer a chat message with the RAG chain.

    Expects a JSON object body of the form ``{"message": "<question>"}``.

    Returns:
        200 with ``{"reply": <answer>}`` on success.
        400 with ``{"error": ...}`` when the body is not a JSON object or has
        no non-empty string ``message``.
        500 with ``{"error": ...}`` when the chain is not initialized or the
        chain invocation fails.
    """
    if not rag_chain:
        return jsonify({'error': 'RAG chain is not initialized. Check server logs.'}), 500

    # silent=True yields None for a missing/invalid JSON body or wrong content
    # type instead of aborting, so the client always gets a JSON error below.
    data = request.get_json(silent=True)
    # The body may legally be JSON but not an object (e.g. a list or null).
    user_message = data.get('message') if isinstance(data, dict) else None
    if not isinstance(user_message, str) or not user_message.strip():
        return jsonify({'error': 'No message provided'}), 400

    try:
        result = rag_chain.invoke({"input": user_message})
        raw_response = result.get('answer', "I couldn't generate a response.")
        clean_response = strip_think_tags(raw_response)
        return jsonify({'reply': clean_response})
    except Exception as e:
        # Top-level request boundary: log the cause, return a generic message.
        print(f"Error during chat processing: {e}")
        return jsonify({'error': 'An error occurred while processing your request.'}), 500
|
| 125 |
+
|
| 126 |
+
# Initialize the RAG chain when the application starts
|
| 127 |
+
initialize_rag_chain()
|
| 128 |
+
|
| 129 |
+
# This part is for local development only; in the Docker deployment gunicorn imports `app` directly, so this block never runs
|
| 130 |
+
if __name__ == '__main__':
|
| 131 |
+
app.run(debug=True, port=5001)
|
data/knowledge_base.json
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"personal_info": {
|
| 3 |
+
"name": "Kurian Jose",
|
| 4 |
+
"contact": {
|
| 5 |
+
"phone": "+91 9354502635",
|
| 6 |
+
"email": "kurianjoseoff@gmail.com",
|
| 7 |
+
"linkedin": "linkedin.com/in/Kurian",
|
| 8 |
+
"github": "github.com/Kurian"
|
| 9 |
+
},
|
| 10 |
+
"objective": "AI/ML developer with practical experience in LLMs, Retrieval-Augmented Generation (RAG), NLP, and prompt engineering. Specialized in building legal-tech tools and automation workflows using LangChain, FAISS, and OpenAI/Groq APIs. Eager to drive applied AI solutions in high-impact domains like law, finance, and enterprise automation."
|
| 11 |
+
},
|
| 12 |
+
"education": [
|
| 13 |
+
{
|
| 14 |
+
"institution": "Galgotias University",
|
| 15 |
+
"degree": "B.Tech in Computer Science",
|
| 16 |
+
"specialization": "AI and Machine Learning",
|
| 17 |
+
"dates": "Oct 2023 - Present",
|
| 18 |
+
"location": "Greater Noida, UP",
|
| 19 |
+
"courses": ["Machine Learning", "NLP", "Data Structures", "Algorithms"]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"institution": "Father Agnel School",
|
| 23 |
+
"degree": "Senior Secondary Education",
|
| 24 |
+
"dates": "March 2021 - May 2023",
|
| 25 |
+
"location": "New Delhi"
|
| 26 |
+
}
|
| 27 |
+
],
|
| 28 |
+
"experience": [
|
| 29 |
+
{
|
| 30 |
+
"company": "Astron Financial Solutions",
|
| 31 |
+
"role": "AI Intern",
|
| 32 |
+
"dates": "June 2025 - Present",
|
| 33 |
+
"responsibilities": [
|
| 34 |
+
"Built a legal case brief generator using Python, Streamlit, and LangChain with Groq API, reducing manual summarization time by 40%.",
|
| 35 |
+
"Developed a legal question generator (MCQ + Subjective) from Constitutional Law case briefs using LLM prompt engineering, improving content generation efficiency by 30%.",
|
| 36 |
+
"Automated legal case retrieval and summarization from Indian Kanoon using Selenium and OpenRouter API with plain-English search.",
|
| 37 |
+
"Created an AI Model Consolidator to compare and merge LLM outputs (OpenAI, Gemini, DeepSeek) using Streamlit and Meta LLAMA-3.",
|
| 38 |
+
"Implemented a GSTR vs Purchase Register reconciliation tool in Python using pandas, enhancing GST audit accuracy and compliance."
|
| 39 |
+
],
|
| 40 |
+
"tech_stack": ["Python", "Streamlit", "LangChain", "Groq API", "OpenRouter", "Gemini API", "Selenium", "FAISS", "pandas", "Git"]
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"projects": [
|
| 44 |
+
{
|
| 45 |
+
"title": "Legal Case Brief Generator using LLMs",
|
| 46 |
+
"description": "Developed a Streamlit-based web application that automates the generation of legal case briefs from PDF documents using a large language model (LLM). The tool allows users to upload legal case files, automatically extracts and chunks the content, and generates structured brief sections including Facts, Issue, Rule, Application, and Conclusion using the ChatGroq model (deepseek-r1-distill-llama-70b).",
|
| 47 |
+
"tech": ["Python", "Streamlit", "LangChain", "Groq API", "pdfplumber", "dotenv"],
|
| 48 |
+
"status": "Completed"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"title": "Legal Scenario-Based Question Generator (MCQ & Subjective)",
|
| 52 |
+
"description": "Built a web-based application using Streamlit that generates scenario-based legal questions (both MCQs and open-ended) from Indian constitutional law case briefs, specifically focused on Article 21 topics. The app allows users to upload a legal case PDF and generates pedagogically sound questions using a Groq-hosted LLM.",
|
| 53 |
+
"tech": ["Python", "Streamlit", "Groq API", "pdfplumber", "PyPDF2", "regex", "dotenv"],
|
| 54 |
+
"status": "Completed"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"title": "Automated Case Law Finder & PDF Downloader",
|
| 58 |
+
"description": "Designed a CLI-based automation pipeline that accepts a natural-language legal issue, uses an LLM (via OpenRouter API) to fetch relevant Supreme Court case titles, normalizes them, searches Indian Kanoon, and downloads the judgments as PDFs.",
|
| 59 |
+
"tech": ["Python", "OpenRouter (LLM API)", "Selenium", "Indian Kanoon", "ChromeDriver", "Regex", "dotenv"],
|
| 60 |
+
"status": "Completed"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"title": "AI Lawyer - Legal QA Chatbot (RAG)",
|
| 64 |
+
"description": "Developed a PDF-querying legal chatbot using Retrieval-Augmented Generation (RAG) architecture. Used FAISS and Ollama for document indexing and vector search; integrated DeepSeek via Groq API. Handled chunking, embedding, and LLM orchestration using LangChain.",
|
| 65 |
+
"tech": ["LangChain", "FAISS", "Ollama", "DeepSeek", "Groq API"]
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"title": "Surakshita - Real-Time Audio Safety Alert App",
|
| 69 |
+
"description": "Built a real-time audio-based distress detection app using BiLSTM that triggered automated alerts. Developed a gender classification model to identify female voices in potential threat scenarios and integrated Google Maps API for geolocation-based emergency response.",
|
| 70 |
+
"tech": ["TensorFlow", "BiLSTM", "Google Maps API"],
|
| 71 |
+
"date": "August 2024"
|
| 72 |
+
}
|
| 73 |
+
],
|
| 74 |
+
"skills": {
|
| 75 |
+
"languages": ["Python", "Java", "C", "SQL"],
|
| 76 |
+
"ai_ml": ["Transformers", "LLMs", "LangChain", "Retrieval-Augmented Generation (RAG)", "Prompt Engineering", "BiLSTM", "TensorFlow", "NLP (NLTK, Regex)"],
|
| 77 |
+
"frameworks_tools": ["Streamlit", "Flask", "Selenium", "Git", "VS Code", "PyCharm", "Google Colab", "GCP", "Figma"],
|
| 78 |
+
"apis_libraries": ["OpenAI", "Groq Cloud", "OpenRouter", "FAISS", "dotenv"],
|
| 79 |
+
"in_progress": ["Hugging Face Transformers", "Async LLM Orchestration", "Scalable RAG Pipelines"]
|
| 80 |
+
},
|
| 81 |
+
"certifications": [
|
| 82 |
+
{
|
| 83 |
+
"name": "Python for Data Science, AI & Development",
|
| 84 |
+
"issuer": "IBM"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "100 Days of Python Expert",
|
| 88 |
+
"issuer": "Guvi Education"
|
| 89 |
+
}
|
| 90 |
+
],
|
| 91 |
+
"soft_skills": "Strong leadership and teamwork from hackathon experiences, adept problem-solver, and highly adaptable with rapid learning of tools like TensorFlow and NLP libraries."
|
| 92 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask
flask_cors
langchain
langchain_community
langchain_groq
langchain-text-splitters
faiss-cpu
sentence-transformers
python-dotenv
# WSGI server invoked by the Dockerfile CMD
gunicorn
|