ChatBotsTA committed on
Commit
12a2cb7
·
verified ·
1 Parent(s): 1a83899

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -174
app.py CHANGED
@@ -1,216 +1,115 @@
1
- import os
2
- import io
3
- import re
4
- import json
5
- import base64
6
- import requests
7
- import numpy as np
8
  import streamlit as st
9
  from pypdf import PdfReader
10
  import matplotlib.pyplot as plt
11
 
12
  # -----------------------------
13
- # Config / Secrets (safe)
14
  # -----------------------------
15
  st.set_page_config(page_title="PDF Summarizer + Audio + QA", page_icon="πŸ“„", layout="wide")
16
 
17
- # Prefer environment variable (Spaces sets secrets as env vars), *then* try st.secrets safely.
18
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
19
- if not HF_TOKEN:
20
- try:
21
- # Access st.secrets inside try/except so we don't crash when no secrets file exists.
22
- HF_TOKEN = st.secrets.get("HF_TOKEN", "") if hasattr(st, "secrets") else ""
23
- except Exception:
24
- HF_TOKEN = ""
25
-
26
  HEADERS_JSON = {
27
  "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
28
  "Content-Type": "application/json",
29
  "Accept": "application/json",
30
  }
31
 
32
- SUMMARIZER_MODEL = "facebook/bart-large-cnn"
33
- TTS_MODEL = "facebook/mms-tts-eng"
 
 
 
34
  EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
35
  QA_MODEL = "deepset/roberta-base-squad2"
36
 
37
  # -----------------------------
38
- # Helper: Hugging Face inference
39
  # -----------------------------
40
- def hf_infer_json(model_id: str, payload: dict, router=False, accept=None, timeout=120):
41
- """
42
- Send request to Hugging Face Hosted Inference API.
43
- If `router=True` we'll use the router base path (useful for some pipelines).
44
- If backend returns binary (audio), this returns raw bytes.
45
- """
46
  if router:
47
  url = f"https://router.huggingface.co/hf-inference/models/{model_id}"
48
  else:
49
  url = f"https://api-inference.huggingface.co/models/{model_id}"
50
-
51
  headers = HEADERS_JSON.copy()
52
  if accept:
53
  headers["Accept"] = accept
54
-
55
- try:
56
- r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout)
57
- r.raise_for_status()
58
- except requests.exceptions.RequestException as e:
59
- # Bubble up a useful message
60
- raise RuntimeError(f"Hugging Face request failed: {e}")
61
-
62
- # Try to decode JSON; if fails, return bytes/content
63
  try:
64
  return r.json()
65
- except ValueError:
66
  return r.content
67
 
68
- # -----------------------------
69
- # Text / PDF utilities
70
- # -----------------------------
71
- def extract_text_from_pdf(file) -> str:
72
- reader = PdfReader(file)
73
- pages = []
74
- for p in reader.pages:
75
- try:
76
- pages.append(p.extract_text() or "")
77
- except Exception:
78
- pages.append("")
79
- return "\n".join(pages)
80
-
81
- def clean_text(s: str) -> str:
82
- return re.sub(r"\s+", " ", s).strip()
83
-
84
- def split_into_chunks(text: str, max_chars: int = 1800, overlap: int = 200):
85
- text = clean_text(text)
86
  chunks = []
87
  i = 0
88
  while i < len(text):
89
  chunk = text[i:i+max_chars]
90
  last_dot = chunk.rfind(". ")
91
  if last_dot > 400:
92
- chunk = chunk[: last_dot + 1]
93
  i += last_dot + 1 - overlap
94
  else:
95
  i += max_chars - overlap
96
  chunks.append(chunk.strip())
97
  return [c for c in chunks if c]
98
 
99
- # -----------------------------
100
- # Embeddings + similarity
101
- # -----------------------------
102
  def embed_texts(texts):
103
- """
104
- Calls the feature-extraction pipeline on the router endpoint.
105
- Returns numpy array shape (n_texts, dim)
106
- """
107
  url = f"https://router.huggingface.co/hf-inference/models/{EMB_MODEL}/pipeline/feature-extraction"
108
- headers = {
109
- "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
110
- "Content-Type": "application/json",
111
- "Accept": "application/json",
112
- }
113
- try:
114
- r = requests.post(url, headers=headers, data=json.dumps({"inputs": texts}), timeout=120)
115
- r.raise_for_status()
116
- except requests.exceptions.RequestException as e:
117
- raise RuntimeError(f"Embedding request failed: {e}")
118
-
119
  arr = np.array(r.json(), dtype=np.float32)
120
-
121
- # Cases:
122
- # - arr.ndim == 1 -> single vector (dim,) -> reshape to (1,dim)
123
- # - arr.ndim == 2 -> batch of vectors (n, dim) -> return as-is
124
- # - arr.ndim == 3 -> model returned token-level vectors per item: mean-pool per item -> (n, dim)
125
- if arr.ndim == 1:
126
- return arr.reshape(1, -1)
127
  if arr.ndim == 2:
128
- return arr
129
  if arr.ndim == 3:
130
- pooled = np.array([a.mean(axis=0) for a in arr])
131
- return pooled
132
- # Fallback
133
- return arr.reshape(arr.shape[0], -1)
134
 
135
  def cosine_sim(a, b):
136
- """
137
- a: (m, d), b: (n, d) -> returns (m, n)
138
- """
139
- a_n = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-8)
140
- b_n = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-8)
141
- return a_n @ b_n.T
142
 
143
- # -----------------------------
144
- # Summarization
145
- # -----------------------------
146
- def summarize_long_text(text: str, per_chunk_max_len=220, final_max_len=250):
147
- chunks = split_into_chunks(text, max_chars=1800, overlap=200)
148
  mini_summaries = []
149
  for c in chunks:
150
- try:
151
- out = hf_infer_json(
152
- SUMMARIZER_MODEL,
153
- {"inputs": c, "parameters": {"max_length": per_chunk_max_len, "min_length": 60, "do_sample": False}},
154
- router=False,
155
- )
156
- except Exception as e:
157
- # if API fails, include the chunk (truncated) as fallback
158
- mini_summaries.append(c[:1000])
159
- continue
160
-
161
- # Hosted inference often returns a list of dicts with 'summary_text'
162
- if isinstance(out, list) and len(out) and isinstance(out[0], dict) and "summary_text" in out[0]:
163
  mini_summaries.append(out[0]["summary_text"])
164
- elif isinstance(out, dict) and "summary_text" in out:
165
- mini_summaries.append(out["summary_text"])
166
  else:
167
- mini_summaries.append(c[:1000])
168
-
169
- joined = " ".join(mini_summaries)
170
- try:
171
- final = hf_infer_json(
172
- SUMMARIZER_MODEL,
173
- {"inputs": joined, "parameters": {"max_length": final_max_len, "min_length": 80, "do_sample": False}},
174
- router=False,
175
- )
176
- except Exception:
177
- return joined[:1200], chunks
178
-
179
- if isinstance(final, list) and len(final) and isinstance(final[0], dict) and "summary_text" in final[0]:
180
- return final[0]["summary_text"], chunks
181
- if isinstance(final, dict) and "summary_text" in final:
182
- return final["summary_text"], chunks
183
 
184
- return joined[:1200], chunks
185
-
186
- # -----------------------------
187
- # TTS
188
- # -----------------------------
189
  def tts_wav_bytes(text: str) -> bytes:
190
- try:
191
- res = hf_infer_json(TTS_MODEL, {"inputs": text}, router=False, accept="audio/wav", timeout=180)
192
- except Exception as e:
193
- raise RuntimeError(f"TTS request failed: {e}")
194
-
195
- if isinstance(res, (bytes, bytearray)):
196
- return res
197
- if isinstance(res, dict) and "audio" in res:
198
  try:
199
- return base64.b64decode(res["audio"])
 
 
 
 
200
  except Exception:
201
- pass
202
- raise RuntimeError("TTS API did not return audio bytes.")
 
 
 
 
 
 
 
 
 
 
203
 
204
- # -----------------------------
205
- # Visualization helper
206
- # -----------------------------
207
  def make_word_freq_chart(text: str, top_k=20):
208
  text = text.lower()
209
- stop = set(
210
- (
211
- "the a an and of to in is are for with on by as at this that from be was were it its it's into or if not your you we they their our can may such more most other also than which".split()
212
- )
213
- )
214
  tokens = re.findall(r"[a-zA-Z]{3,}", text)
215
  freq = {}
216
  for t in tokens:
@@ -233,14 +132,10 @@ def make_word_freq_chart(text: str, top_k=20):
233
  # UI
234
  # -----------------------------
235
  st.title("πŸ“„ PDF β†’ Summary Β· πŸ”Š Audio Β· πŸ“Š Chart Β· ❓ Q&A")
236
- st.caption("Powered by Hugging Face Hosted Inference API (free models).")
237
-
238
- if not HF_TOKEN:
239
- st.warning("No HF_TOKEN found. Add HF_TOKEN in Space Settings β†’ Secrets (recommended). The app will still run but HF API calls will fail without a token.")
240
 
241
  uploaded = st.file_uploader("Upload a PDF", type=["pdf"])
242
 
243
- # session state
244
  if "doc_text" not in st.session_state:
245
  st.session_state.doc_text = ""
246
  st.session_state.chunks = []
@@ -254,22 +149,17 @@ if uploaded:
254
  st.success(f"Loaded {len(text)} characters.")
255
 
256
  st.write("### Actions")
257
- c1, c2, c3 = st.columns(3)
258
-
259
- with c1:
260
  if st.button("πŸ“ Summarize"):
261
  with st.spinner("Summarizing..."):
262
- try:
263
- summary, chunks = summarize_long_text(st.session_state.doc_text)
264
- st.session_state.summary = summary
265
- st.session_state.chunks = chunks
266
- st.success("Summary ready.")
267
- st.write("#### Summary")
268
- st.write(summary)
269
- except Exception as e:
270
- st.error(f"Summarization failed: {e}")
271
-
272
- with c2:
273
  if st.button("πŸ”Š Generate Audio (summary)"):
274
  target_text = st.session_state.summary or st.session_state.doc_text[:1200]
275
  with st.spinner("Generating audio..."):
@@ -280,7 +170,7 @@ if uploaded:
280
  except Exception as e:
281
  st.error(f"TTS failed: {e}")
282
 
283
- with c3:
284
  if st.button("πŸ“Š Show Word-Frequency Chart"):
285
  with st.spinner("Building chart..."):
286
  make_word_freq_chart(st.session_state.doc_text)
@@ -293,10 +183,11 @@ if uploaded:
293
  st.session_state.chunks = split_into_chunks(st.session_state.doc_text)
294
  with st.spinner("Thinking..."):
295
  try:
296
- # embed once/cache
297
  if st.session_state.chunk_vecs is None:
298
- st.session_state.chunk_vecs = embed_texts(st.session_state.chunks)
299
- vecs = st.session_state.chunk_vecs
 
 
300
 
301
  q_vec = embed_texts([question])
302
  sims = cosine_sim(q_vec, vecs).flatten()
 
1
+ import os, io, re, json, base64, requests, numpy as np
 
 
 
 
 
 
2
  import streamlit as st
3
  from pypdf import PdfReader
4
  import matplotlib.pyplot as plt
5
 
6
  # -----------------------------
7
+ # Config
8
  # -----------------------------
9
  st.set_page_config(page_title="PDF Summarizer + Audio + QA", page_icon="πŸ“„", layout="wide")
10
 
 
11
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
 
 
 
 
 
 
 
12
  HEADERS_JSON = {
13
  "Authorization": f"Bearer {HF_TOKEN}" if HF_TOKEN else "",
14
  "Content-Type": "application/json",
15
  "Accept": "application/json",
16
  }
17
 
18
+ SUMMARIZER_MODEL = "pszemraj/long-t5-tglobal-base-16384-book-summary"
19
+ TTS_MODELS = [
20
+ "espnet/kan-bayashi_ljspeech_vits",
21
+ "facebook/fastspeech2-en-ljspeech"
22
+ ]
23
  EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
24
  QA_MODEL = "deepset/roberta-base-squad2"
25
 
26
  # -----------------------------
27
+ # API helpers
28
  # -----------------------------
29
+ def hf_infer_json(model_id: str, payload: dict, router=False, accept=None):
 
 
 
 
 
30
  if router:
31
  url = f"https://router.huggingface.co/hf-inference/models/{model_id}"
32
  else:
33
  url = f"https://api-inference.huggingface.co/models/{model_id}"
 
34
  headers = HEADERS_JSON.copy()
35
  if accept:
36
  headers["Accept"] = accept
37
+ r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=120)
38
+ r.raise_for_status()
 
 
 
 
 
 
 
39
  try:
40
  return r.json()
41
+ except requests.exceptions.JSONDecodeError:
42
  return r.content
43
 
44
+ def split_into_chunks(text: str, max_chars: int = 1500, overlap: int = 200):
45
+ text = re.sub(r"\s+", " ", text).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  chunks = []
47
  i = 0
48
  while i < len(text):
49
  chunk = text[i:i+max_chars]
50
  last_dot = chunk.rfind(". ")
51
  if last_dot > 400:
52
+ chunk = chunk[:last_dot+1]
53
  i += last_dot + 1 - overlap
54
  else:
55
  i += max_chars - overlap
56
  chunks.append(chunk.strip())
57
  return [c for c in chunks if c]
58
 
 
 
 
59
  def embed_texts(texts):
 
 
 
 
60
  url = f"https://router.huggingface.co/hf-inference/models/{EMB_MODEL}/pipeline/feature-extraction"
61
+ headers = HEADERS_JSON
62
+ r = requests.post(url, headers=headers, data=json.dumps({"inputs": texts}), timeout=120)
63
+ r.raise_for_status()
 
 
 
 
 
 
 
 
64
  arr = np.array(r.json(), dtype=np.float32)
 
 
 
 
 
 
 
65
  if arr.ndim == 2:
66
+ return arr.mean(axis=0, keepdims=True)
67
  if arr.ndim == 3:
68
+ pooled = [a.mean(axis=0) for a in arr]
69
+ return np.vstack(pooled)
70
+ return np.array(arr)
 
71
 
72
  def cosine_sim(a, b):
73
+ a = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-8)
74
+ b = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-8)
75
+ return a @ b.T
 
 
 
76
 
77
+ def summarize_long_text(text: str):
78
+ chunks = split_into_chunks(text)
 
 
 
79
  mini_summaries = []
80
  for c in chunks:
81
+ out = hf_infer_json(SUMMARIZER_MODEL, {"inputs": c}, router=False)
82
+ if isinstance(out, list) and len(out) and "summary_text" in out[0]:
 
 
 
 
 
 
 
 
 
 
 
83
  mini_summaries.append(out[0]["summary_text"])
 
 
84
  else:
85
+ mini_summaries.append(c[:800])
86
+ return " ".join(mini_summaries), chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
 
 
 
 
 
88
  def tts_wav_bytes(text: str) -> bytes:
89
+ for model in TTS_MODELS:
 
 
 
 
 
 
 
90
  try:
91
+ res = hf_infer_json(model, {"inputs": text}, router=False, accept="audio/wav")
92
+ if isinstance(res, (bytes, bytearray)):
93
+ return res
94
+ if isinstance(res, dict) and "audio" in res:
95
+ return base64.b64decode(res["audio"])
96
  except Exception:
97
+ continue
98
+ raise RuntimeError("All TTS models failed.")
99
+
100
+ def extract_text_from_pdf(file) -> str:
101
+ reader = PdfReader(file)
102
+ pages = []
103
+ for p in reader.pages:
104
+ try:
105
+ pages.append(p.extract_text() or "")
106
+ except:
107
+ pages.append("")
108
+ return "\n".join(pages)
109
 
 
 
 
110
  def make_word_freq_chart(text: str, top_k=20):
111
  text = text.lower()
112
+ stop = set(("the a an and of to in is are for with on by as at this that from be was were it its it’s into or if not your you we they their our can may such more most other also than which".split()))
 
 
 
 
113
  tokens = re.findall(r"[a-zA-Z]{3,}", text)
114
  freq = {}
115
  for t in tokens:
 
132
  # UI
133
  # -----------------------------
134
  st.title("πŸ“„ PDF β†’ Summary Β· πŸ”Š Audio Β· πŸ“Š Chart Β· ❓ Q&A")
135
+ st.caption("Free models via Hugging Face Hosted Inference API.")
 
 
 
136
 
137
  uploaded = st.file_uploader("Upload a PDF", type=["pdf"])
138
 
 
139
  if "doc_text" not in st.session_state:
140
  st.session_state.doc_text = ""
141
  st.session_state.chunks = []
 
149
  st.success(f"Loaded {len(text)} characters.")
150
 
151
  st.write("### Actions")
152
+ with st.container():
 
 
153
  if st.button("πŸ“ Summarize"):
154
  with st.spinner("Summarizing..."):
155
+ summary, chunks = summarize_long_text(st.session_state.doc_text)
156
+ st.session_state.summary = summary
157
+ st.session_state.chunks = chunks
158
+ st.success("Summary ready.")
159
+ st.write("#### Summary")
160
+ st.write(st.session_state.summary)
161
+
162
+ with st.container():
 
 
 
163
  if st.button("πŸ”Š Generate Audio (summary)"):
164
  target_text = st.session_state.summary or st.session_state.doc_text[:1200]
165
  with st.spinner("Generating audio..."):
 
170
  except Exception as e:
171
  st.error(f"TTS failed: {e}")
172
 
173
+ with st.container():
174
  if st.button("πŸ“Š Show Word-Frequency Chart"):
175
  with st.spinner("Building chart..."):
176
  make_word_freq_chart(st.session_state.doc_text)
 
183
  st.session_state.chunks = split_into_chunks(st.session_state.doc_text)
184
  with st.spinner("Thinking..."):
185
  try:
 
186
  if st.session_state.chunk_vecs is None:
187
+ vecs = embed_texts(st.session_state.chunks)
188
+ st.session_state.chunk_vecs = vecs
189
+ else:
190
+ vecs = st.session_state.chunk_vecs
191
 
192
  q_vec = embed_texts([question])
193
  sims = cosine_sim(q_vec, vecs).flatten()