srikol committed on
Commit ee10b9c · verified · 1 Parent(s): 9cfd3ac

Update app.py

Files changed (1): app.py +79 -49
app.py CHANGED
@@ -1,78 +1,108 @@
- """
- CPU-fast RAG chatbot for my_resume.pdf
- – avoids the NLTK 3.8.2 punkt_tab issue by pinning nltk==3.8.1
- """
  import os, re, json, faiss, gradio as gr
- from typing import List
- from PyPDF2 import PdfReader
  from sentence_transformers import SentenceTransformer
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- import nltk
-
- # ── sentence splitter that works on nltk 3.8.1 (punkt) ─────────────
- nltk.download("punkt", quiet=True)
-
- def sent_split(text: str) -> List[str]:
-     return nltk.sent_tokenize(text)
-
- # ── load and chunk résumé ──────────────────────────────────────────
- def pdf_text(path="my_resume.pdf") -> str:
-     return " ".join(page.extract_text() or "" for page in PdfReader(path).pages)
-
- raw = pdf_text()
- text = re.sub(r"\s+", " ", raw).strip()
-
- def chunker(txt: str, max_tokens=200) -> List[str]:
-     out, buf, tok = [], [], 0
-     for s in sent_split(txt):
-         n = len(s.split())
-         if tok + n > max_tokens:
-             out.append(" ".join(buf)); buf, tok = [], 0
-         buf.append(s); tok += n
-     if buf: out.append(" ".join(buf))
-     return out
-
- CHUNKS = chunker(text)
-
- # ── build FAISS index ──────────────────────────────────────────────
  embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
- vecs = embedder.encode(CHUNKS, convert_to_numpy=True, show_progress_bar=False)
- faiss.normalize_L2(vecs)
- index = faiss.IndexFlatIP(vecs.shape[1])
- index.add(vecs)
-
- def retrieve(q: str, k=3) -> str:
-     qv = embedder.encode([q], convert_to_numpy=True)
      faiss.normalize_L2(qv)
      _, idx = index.search(qv, k)
-     return " ".join(CHUNKS[i] for i in idx[0])
-
- # ── small generation model ─────────────────────────────────────────
- gen_tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
- gen_mod = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
- gen = pipeline("text2text-generation", model=gen_mod, tokenizer=gen_tok)
-
- SYSTEM = ("You are a helpful assistant. Answer ONLY with facts in the context. "
-           "If the answer is not in the context, reply exactly: "
-           "\"I don't know based on the resume.\"")
-
  def guard(ctx: str, ans: str) -> bool:
      ctx_words = set(re.findall(r"\b\w+\b", ctx.lower()))
      ans_words = set(re.findall(r"\b\w+\b", ans.lower()))
      return not ctx_words.isdisjoint(ans_words)

- def respond(msg: str, hist=None):
-     ctx = retrieve(msg)
-     prompt = f"{SYSTEM}\n\nContext:\n{ctx}\n\nQuestion: {msg}\nAnswer:"
      raw = gen(prompt, max_length=256, do_sample=False)[0]["generated_text"]
      ans = raw.split("Answer:")[-1].strip()
      return ans if guard(ctx, ans) else "I don't know based on the resume."

- # ── Gradio chat UI ─────────────────────────────────────────────────
  demo = gr.ChatInterface(
-     fn=respond,
-     title="Resume Q&A (fast CPU)",
-     description="Ask any question – answers come strictly from my_resume.pdf.",
  )

  if __name__ == "__main__":

+ # RAG chatbot for Sri Kolagani résumé
+ # (c) 2025 – drop into a Hugging Face Space and run.
+
  import os, re, json, faiss, gradio as gr
+ from typing import List, Dict
+ from docx import Document                    # python-docx
  from sentence_transformers import SentenceTransformer
+ from transformers import (AutoTokenizer,
+                           AutoModelForSeq2SeqLM,
+                           pipeline)
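The commit itself pins no dependencies, so the Space presumably carries a requirements.txt along these lines (package list inferred from the imports above; the torch entry and the unpinned versions are assumptions, not shown in this commit):

gradio
faiss-cpu
python-docx
sentence-transformers
transformers
torch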
 
+ # ──────────────────────────────
+ # 1) LOAD + STRUCTURE THE RESUME
+ # ──────────────────────────────
+ DOCX = "Sri-Kolagani-Resume-3.docx"          # uploaded file
+
+ def read_docx(path: str) -> List[str]:
+     doc = Document(path)
+     return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
+
+ paras = read_docx(DOCX)
+
+ # simple heading-based splitter
+ sections: Dict[str, List[str]] = {}
+ current = "misc"
+ for p in paras:
+     if re.match(r"(?i)professional summary", p):
+         current = "summary"
+     elif re.match(r"(?i)professional experience", p):
+         current = "experience"
+     elif re.match(r"(?i)recent project", p):
+         current = "projects"
+     elif re.match(r"(?i)core technical skills", p):
+         current = "skills"
+     elif re.match(r"(?i)certifications", p):
+         current = "certifications"
+     elif re.match(r"(?i)education", p):
+         current = "education"
+     elif re.match(r"(?i)speaking engagements", p):
+         current = "speaking"
+     elif re.match(r"(?i)publications", p):
+         current = "publications"
+     sections.setdefault(current, []).append(p)
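Long if/elif ladders are hard to scan in a diff gutter; for reference, the same routing logic reads more compactly as an ordered lookup table. A hypothetical refactor, not part of this commit (HEADINGS and label_for do not exist in app.py):

import re

# First matching heading pattern wins; otherwise the paragraph
# stays in whatever section we are currently inside.
HEADINGS = [(r"(?i)professional summary",    "summary"),
            (r"(?i)professional experience", "experience"),
            (r"(?i)recent project",          "projects"),
            (r"(?i)core technical skills",   "skills"),
            (r"(?i)certifications",          "certifications"),
            (r"(?i)education",               "education"),
            (r"(?i)speaking engagements",    "speaking"),
            (r"(?i)publications",            "publications")]

def label_for(p: str, current: str) -> str:
    return next((lab for pat, lab in HEADINGS if re.match(pat, p)), current)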
 
+ # flatten into “facts” ≤ 200 tokens each
+ def chunkify(text: str, max_tokens: int = 200) -> List[str]:
+     words = text.split()
+     return [" ".join(words[i:i+max_tokens])
+             for i in range(0, len(words), max_tokens)]
+
+ facts, labels = [], []                       # parallel lists
+ for label, lines in sections.items():
+     for blk in lines:
+         for chunk in chunkify(blk):
+             facts.append(chunk)
+             labels.append(label)
 
+ # ──────────────────────────────
+ # 2) EMBEDDINGS + FAISS INDEX
+ # ──────────────────────────────
  embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ emb = embedder.encode(facts, convert_to_numpy=True, show_progress_bar=False)
+ faiss.normalize_L2(emb)
+ index = faiss.IndexFlatIP(emb.shape[1]); index.add(emb)
+
+ def retrieve(question: str, k: int = 3) -> List[str]:
+     qv = embedder.encode([question], convert_to_numpy=True)
      faiss.normalize_L2(qv)
      _, idx = index.search(qv, k)
+     return [facts[i] for i in idx[0]]
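The normalize-then-IndexFlatIP pairing is what makes the inner product behave as cosine similarity: on unit vectors the two coincide. A self-contained toy check (the array values and the demo_ix name are illustrative, not from the commit):

import numpy as np
import faiss

a = np.array([[3.0, 4.0]], dtype="float32")   # toy 2-d "embedding"
b = np.array([[4.0, 3.0]], dtype="float32")
faiss.normalize_L2(a)                         # in-place scaling to unit length
faiss.normalize_L2(b)
demo_ix = faiss.IndexFlatIP(2)
demo_ix.add(a)
scores, _ = demo_ix.search(b, 1)
print(scores[0][0])                           # ~0.96, the cosine of (3,4) vs (4,3)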
 
+ # ──────────────────────────────
+ # 3) GENERATION MODEL
+ # ──────────────────────────────
+ GEN_ID = "google/flan-t5-small"
+ tok = AutoTokenizer.from_pretrained(GEN_ID)
+ mod = AutoModelForSeq2SeqLM.from_pretrained(GEN_ID)
+ gen = pipeline("text2text-generation", model=mod, tokenizer=tok)
+
+ SYS = ("You are a helpful assistant. Answer ONLY with facts present in "
+        "the context. If the answer is not in the context, reply exactly: "
+        "\"I don't know based on the resume.\"")
 
  def guard(ctx: str, ans: str) -> bool:
      ctx_words = set(re.findall(r"\b\w+\b", ctx.lower()))
      ans_words = set(re.findall(r"\b\w+\b", ans.lower()))
      return not ctx_words.isdisjoint(ans_words)
 
+ # ──────────────────────────────
+ # 4) CHAT FUNCTION
+ # ──────────────────────────────
+ def chat(user_msg, _history=None):
+     ctx = " ".join(retrieve(user_msg))
+     prompt = f"{SYS}\n\nContext:\n{ctx}\n\nQuestion: {user_msg}\nAnswer:"
      raw = gen(prompt, max_length=256, do_sample=False)[0]["generated_text"]
      ans = raw.split("Answer:")[-1].strip()
      return ans if guard(ctx, ans) else "I don't know based on the resume."
 
+ # ──────────────────────────────
+ # 5) GRADIO UI
+ # ──────────────────────────────
  demo = gr.ChatInterface(
+     fn=chat,
+     title="Sri Kolagani • Résumé Q&A",
+     description="Ask any question—answers are grounded in the résumé only.",
  )

  if __name__ == "__main__":