NIKKI77 committed on
Commit
5181b3c
·
1 Parent(s): d796f80

Subtitle KIS v1.1 – initial

Browse files
Files changed (3) hide show
  1. Dockerfile +13 -4
  2. README.md +9 -1
  3. backend/app.py +14 -5
Dockerfile CHANGED
@@ -35,8 +35,7 @@ USER appuser
35
  # Python deps (user site)
36
  RUN pip install --no-cache-dir --user -r requirements.txt
37
 
38
- # Preload spaCy + NLTK data
39
- RUN python -m spacy download en_core_web_sm
40
  RUN python - <<'PY'
41
  import nltk
42
  for pkg in ["punkt","punkt_tab","wordnet","omw-1.4"]:
@@ -57,7 +56,17 @@ except Exception as e:
57
  print("Prefetch skipped:", e)
58
  PY
59
 
 
 
 
 
 
 
 
 
 
 
 
60
  EXPOSE 7860
 
61
 
62
- # Give cold start more time
63
- CMD ["gunicorn","-w","1","-k","gthread","--threads","4","--timeout","300","-b","0.0.0.0:7860","backend.app:app"]
 
35
  # Python deps (user site)
36
  RUN pip install --no-cache-dir --user -r requirements.txt
37
 
38
+ # NLTK data (the app uses WordNet and the tokenizers). The spaCy model download was removed because it is unused.
 
39
  RUN python - <<'PY'
40
  import nltk
41
  for pkg in ["punkt","punkt_tab","wordnet","omw-1.4"]:
 
56
  print("Prefetch skipped:", e)
57
  PY
58
 
59
+ # (Optional) smoke test to catch FAISS/torch issues early
60
+ RUN python - <<'PY'
61
+ import sys
62
+ print("PY:", sys.version)
63
+ import faiss, torch
64
+ print("FAISS:", faiss.__version__)
65
+ print("Torch:", torch.__version__, "CUDA:", torch.cuda.is_available())
66
+ PY
67
+
68
+ # Spaces port + gunicorn binding
69
+ ENV PORT=7860
70
  EXPOSE 7860
71
+ CMD ["bash","-lc","gunicorn -w 1 -k gthread --threads 4 --timeout 300 -b 0.0.0.0:${PORT:-7860} backend.app:app"]
72
 
 
 
README.md CHANGED
@@ -5,8 +5,16 @@ colorFrom: indigo
5
  colorTo: blue
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- # Subtitle KIS (KSI Version 1.1)
11
 
12
  GPU-accelerated subtitle search & summarize (SBERT + FAISS + DistilBART + punctuation).
 
 
 
 
 
 
 
 
5
  colorTo: blue
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # Subtitle KIS (KSI v1.1)
12
 
13
  GPU-accelerated subtitle search & summarize (SBERT + FAISS + DistilBART + punctuation).
14
+
15
+ ## How to run locally
16
+ ```bash
17
+ pip install -r requirements.txt
18
+ export PORT=7860
19
+ python backend/app.py
20
+ # open http://localhost:7860
backend/app.py CHANGED
@@ -6,8 +6,9 @@ from flask import Flask, render_template, request, jsonify
6
  from markupsafe import escape, Markup
7
  from nltk.corpus import wordnet
8
  from nltk.stem import WordNetLemmatizer
9
- from semantic_search import search_query
10
- from nlp_summary import summarize_text
 
11
  from autocomplete import get_suggestions
12
  from config import ABBREVIATION_MAP, VIDEO_METADATA, SEARCH_CONFIG
13
 
@@ -31,6 +32,11 @@ def apply_csp(response):
31
  def index():
32
  return render_template("index.html")
33
 
 
 
 
 
 
34
  # Template filter: convert HH:MM:SS to seconds
35
  @app.template_filter("jump_time")
36
  def jump_time(timestamp):
@@ -91,7 +97,10 @@ def perform_search(query, start=0, shown=0, previous_results=None, semantic_mode
91
  if previous_results is None:
92
  previous_results = []
93
 
94
-
 
 
 
95
  raw_results, _ = search_query(query, offset=0, top_k=1000, semantic_mode=semantic_mode)
96
 
97
  # Keyword mode
@@ -107,7 +116,6 @@ def perform_search(query, start=0, shown=0, previous_results=None, semantic_mode
107
  friendly_key = next((k for k, v in VIDEO_METADATA.items() if v["id"] == vid_id), None)
108
  r["video_title"] = VIDEO_METADATA.get(friendly_key, {}).get("title", "Unknown Title")
109
 
110
-
111
  context_chunks = []
112
  if idx > 0:
113
  context_chunks.append(paged_results[idx - 1]["summary_input"])
@@ -211,4 +219,5 @@ def autocomplete():
211
  return flask_json.dumps(get_suggestions(term))
212
 
213
  if __name__ == "__main__":
214
- app.run(debug=True)
 
 
6
  from markupsafe import escape, Markup
7
  from nltk.corpus import wordnet
8
  from nltk.stem import WordNetLemmatizer
9
+ # NOTE: heavy imports moved to lazy inside perform_search()
10
+ # from semantic_search import search_query
11
+ # from nlp_summary import summarize_text
12
  from autocomplete import get_suggestions
13
  from config import ABBREVIATION_MAP, VIDEO_METADATA, SEARCH_CONFIG
14
 
 
32
  def index():
33
  return render_template("index.html")
34
 
35
+ # Health check (fast) — for HF Spaces readiness
36
+ @app.get("/health")
37
+ def health():
38
+ return {"ok": True}, 200
39
+
40
  # Template filter: convert HH:MM:SS to seconds
41
  @app.template_filter("jump_time")
42
  def jump_time(timestamp):
 
97
  if previous_results is None:
98
  previous_results = []
99
 
100
+ # 🔸 Lazy imports so heavy modules load on first search, not at boot
101
+ from semantic_search import search_query
102
+ from nlp_summary import summarize_text
103
+
104
  raw_results, _ = search_query(query, offset=0, top_k=1000, semantic_mode=semantic_mode)
105
 
106
  # Keyword mode
 
116
  friendly_key = next((k for k, v in VIDEO_METADATA.items() if v["id"] == vid_id), None)
117
  r["video_title"] = VIDEO_METADATA.get(friendly_key, {}).get("title", "Unknown Title")
118
 
 
119
  context_chunks = []
120
  if idx > 0:
121
  context_chunks.append(paged_results[idx - 1]["summary_input"])
 
219
  return flask_json.dumps(get_suggestions(term))
220
 
221
  if __name__ == "__main__":
222
+ port = int(os.environ.get("PORT", 7860)) # HF Spaces default
223
+ app.run(host="0.0.0.0", port=port, debug=False)