srikol committed on
Commit ee10b9c · verified · 1 Parent(s): 9cfd3ac

Update app.py

Files changed (1): app.py +79 -49
app.py CHANGED
@@ -1,78 +1,108 @@
- """
- CPU-fast RAG chatbot for my_resume.pdf
- – avoids the NLTK 3.8.2 punkt_tab issue by pinning nltk==3.8.1
- """
  import os, re, json, faiss, gradio as gr
- from typing import List
- from PyPDF2 import PdfReader
  from sentence_transformers import SentenceTransformer
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- import nltk
-
- # ── sentence splitter that works on nltk 3.8.1 (punkt) ─────────────
- nltk.download("punkt", quiet=True)
-
- def sent_split(text: str) -> List[str]:
-     return nltk.sent_tokenize(text)
-
- # ── load and chunk résumé ──────────────────────────────────────────
- def pdf_text(path="my_resume.pdf") -> str:
-     return " ".join(page.extract_text() or "" for page in PdfReader(path).pages)
-
- raw = pdf_text()
- text = re.sub(r"\s+", " ", raw).strip()
-
- def chunker(txt: str, max_tokens=200) -> List[str]:
-     out, buf, tok = [], [], 0
-     for s in sent_split(txt):
-         n = len(s.split())
-         if tok + n > max_tokens:
-             out.append(" ".join(buf)); buf, tok = [], 0
-         buf.append(s); tok += n
-     if buf: out.append(" ".join(buf))
-     return out
-
- CHUNKS = chunker(text)
-
- # ── build FAISS index ──────────────────────────────────────────────
  embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
- vecs = embedder.encode(CHUNKS, convert_to_numpy=True, show_progress_bar=False)
- faiss.normalize_L2(vecs)
- index = faiss.IndexFlatIP(vecs.shape[1])
- index.add(vecs)
-
- def retrieve(q: str, k=3) -> str:
-     qv = embedder.encode([q], convert_to_numpy=True)
      faiss.normalize_L2(qv)
      _, idx = index.search(qv, k)
-     return " ".join(CHUNKS[i] for i in idx[0])
-
- # ── small generation model ─────────────────────────────────────────
- gen_tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
- gen_mod = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
- gen = pipeline("text2text-generation", model=gen_mod, tokenizer=gen_tok)
-
- SYSTEM = ("You are a helpful assistant. Answer ONLY with facts in the context. "
-           "If the answer is not in the context, reply exactly: "
-           "\"I don't know based on the resume.\"")
-
  def guard(ctx: str, ans: str) -> bool:
      ctx_words = set(re.findall(r"\b\w+\b", ctx.lower()))
      ans_words = set(re.findall(r"\b\w+\b", ans.lower()))
      return not ctx_words.isdisjoint(ans_words)

- def respond(msg: str, hist=None):
-     ctx = retrieve(msg)
-     prompt = f"{SYSTEM}\n\nContext:\n{ctx}\n\nQuestion: {msg}\nAnswer:"
      raw = gen(prompt, max_length=256, do_sample=False)[0]["generated_text"]
      ans = raw.split("Answer:")[-1].strip()
      return ans if guard(ctx, ans) else "I don't know based on the resume."

- # ── Gradio chat UI ─────────────────────────────────────────────────
  demo = gr.ChatInterface(
-     fn=respond,
-     title="Resume Q&A (fast CPU)",
-     description="Ask any question – answers come strictly from my_resume.pdf.",
  )

  if __name__ == "__main__":

+ # RAG chatbot for Sri Kolagani résumé
+ # (c) 2025 – drop into a Hugging Face Space and run.
+
  import os, re, json, faiss, gradio as gr
+ from typing import List, Dict
+ from docx import Document                    # python-docx
  from sentence_transformers import SentenceTransformer
+ from transformers import (AutoTokenizer,
+                           AutoModelForSeq2SeqLM,
+                           pipeline)
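The commit itself pins no dependencies, so the Space presumably carries a requirements.txt along these lines (package list inferred from the imports above; the torch entry and the unpinned versions are assumptions, not shown in this commit):

gradio
faiss-cpu
python-docx
sentence-transformers
transformers
torch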
 
+ # ──────────────────────────────
+ # 1) LOAD + STRUCTURE THE RESUME
+ # ──────────────────────────────
+ DOCX = "Sri-Kolagani-Resume-3.docx"          # uploaded file
+
+ def read_docx(path: str) -> List[str]:
+     doc = Document(path)
+     return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
+
+ paras = read_docx(DOCX)
+
+ # simple heading-based splitter
+ sections: Dict[str, List[str]] = {}
+ current = "misc"
+ for p in paras:
+     if re.match(r"(?i)professional summary", p):
+         current = "summary"
+     elif re.match(r"(?i)professional experience", p):
+         current = "experience"
+     elif re.match(r"(?i)recent project", p):
+         current = "projects"
+     elif re.match(r"(?i)core technical skills", p):
+         current = "skills"
+     elif re.match(r"(?i)certifications", p):
+         current = "certifications"
+     elif re.match(r"(?i)education", p):
+         current = "education"
+     elif re.match(r"(?i)speaking engagements", p):
+         current = "speaking"
+     elif re.match(r"(?i)publications", p):
+         current = "publications"
+     sections.setdefault(current, []).append(p)
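Long if/elif ladders are hard to scan in a diff gutter; for reference, the same routing logic reads more compactly as an ordered lookup table. A hypothetical refactor, not part of this commit (HEADINGS and label_for do not exist in app.py):

import re

# First matching heading pattern wins; otherwise the paragraph
# stays in whatever section we are currently inside.
HEADINGS = [(r"(?i)professional summary",    "summary"),
            (r"(?i)professional experience", "experience"),
            (r"(?i)recent project",          "projects"),
            (r"(?i)core technical skills",   "skills"),
            (r"(?i)certifications",          "certifications"),
            (r"(?i)education",               "education"),
            (r"(?i)speaking engagements",    "speaking"),
            (r"(?i)publications",            "publications")]

def label_for(p: str, current: str) -> str:
    return next((lab for pat, lab in HEADINGS if re.match(pat, p)), current)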
 
+ # flatten into “facts” ≤ 200 tokens each
+ def chunkify(text: str, max_tokens: int = 200) -> List[str]:
+     words = text.split()
+     return [" ".join(words[i:i+max_tokens])
+             for i in range(0, len(words), max_tokens)]
+
+ facts, labels = [], []                       # parallel lists
+ for label, lines in sections.items():
+     for blk in lines:
+         for chunk in chunkify(blk):
+             facts.append(chunk)
+             labels.append(label)
 
+ # ──────────────────────────────
+ # 2) EMBEDDINGS + FAISS INDEX
+ # ──────────────────────────────
  embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ emb = embedder.encode(facts, convert_to_numpy=True, show_progress_bar=False)
+ faiss.normalize_L2(emb)
+ index = faiss.IndexFlatIP(emb.shape[1]); index.add(emb)
+
+ def retrieve(question: str, k: int = 3) -> List[str]:
+     qv = embedder.encode([question], convert_to_numpy=True)
      faiss.normalize_L2(qv)
      _, idx = index.search(qv, k)
+     return [facts[i] for i in idx[0]]
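The normalize-then-IndexFlatIP pairing is what makes the inner product behave as cosine similarity: on unit vectors the two coincide. A self-contained toy check (the array values and the demo_ix name are illustrative, not from the commit):

import numpy as np
import faiss

a = np.array([[3.0, 4.0]], dtype="float32")   # toy 2-d "embedding"
b = np.array([[4.0, 3.0]], dtype="float32")
faiss.normalize_L2(a)                         # in-place scaling to unit length
faiss.normalize_L2(b)
demo_ix = faiss.IndexFlatIP(2)
demo_ix.add(a)
scores, _ = demo_ix.search(b, 1)
print(scores[0][0])                           # ~0.96, the cosine of (3,4) vs (4,3)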
 
+ # ──────────────────────────────
+ # 3) GENERATION MODEL
+ # ──────────────────────────────
+ GEN_ID = "google/flan-t5-small"
+ tok = AutoTokenizer.from_pretrained(GEN_ID)
+ mod = AutoModelForSeq2SeqLM.from_pretrained(GEN_ID)
+ gen = pipeline("text2text-generation", model=mod, tokenizer=tok)
+
+ SYS = ("You are a helpful assistant. Answer ONLY with facts present in "
+        "the context. If the answer is not in the context, reply exactly: "
+        "\"I don't know based on the resume.\"")
 
  def guard(ctx: str, ans: str) -> bool:
      ctx_words = set(re.findall(r"\b\w+\b", ctx.lower()))
      ans_words = set(re.findall(r"\b\w+\b", ans.lower()))
      return not ctx_words.isdisjoint(ans_words)
 
+ # ──────────────────────────────
+ # 4) CHAT FUNCTION
+ # ──────────────────────────────
+ def chat(user_msg, _history=None):
+     ctx = " ".join(retrieve(user_msg))
+     prompt = f"{SYS}\n\nContext:\n{ctx}\n\nQuestion: {user_msg}\nAnswer:"
      raw = gen(prompt, max_length=256, do_sample=False)[0]["generated_text"]
      ans = raw.split("Answer:")[-1].strip()
      return ans if guard(ctx, ans) else "I don't know based on the resume."
 
+ # ──────────────────────────────
+ # 5) GRADIO UI
+ # ──────────────────────────────
  demo = gr.ChatInterface(
+     fn=chat,
+     title="Sri Kolagani • Résumé Q&A",
+     description="Ask any question—answers are grounded in the résumé only.",
  )

  if __name__ == "__main__":