Dominique Loyer committed on
Commit
89414df
·
1 Parent(s): d44b2e1

Sync from Sandbox: fix NER, E-E-A-T, backend v2.4.1

Browse files
Files changed (5) hide show
  1. Dockerfile +13 -34
  2. Icon/r +0 -0
  3. requirements.txt +38 -0
  4. syscred/backend_app.py +72 -2
  5. syscred/static/index.html +53 -9
Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
  # SysCRED Docker Configuration for Hugging Face Spaces
2
- # OPTIMIZED version with Distilled Models for faster startup
3
  FROM python:3.10-slim
4
 
5
  WORKDIR /app
@@ -7,48 +7,27 @@ WORKDIR /app
7
  ENV PYTHONDONTWRITEBYTECODE=1
8
  ENV PYTHONUNBUFFERED=1
9
  ENV PYTHONPATH=/app
10
-
11
- # ============================================
12
- # KEY OPTIMIZATION: Use distilled models
13
- # ============================================
14
  ENV SYSCRED_LOAD_ML_MODELS=true
15
- ENV SYSCRED_USE_DISTILLED=true
16
- ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
17
- ENV HF_HOME=/app/.cache/huggingface
18
 
19
  # Install system dependencies
20
- RUN apt-get update && apt-get install -y \
21
  build-essential \
22
  && rm -rf /var/lib/apt/lists/*
23
 
24
- # Copy optimized requirements (distilled models, CPU-only torch)
25
- COPY syscred/requirements-distilled.txt /app/requirements.txt
26
 
27
- # Install dependencies
28
  RUN pip install --no-cache-dir -r requirements.txt
29
 
30
- # ============================================
31
- # PRE-DOWNLOAD DISTILLED MODELS (Build Time)
32
- # This avoids timeout during first request
33
- # ============================================
34
- RUN python -c "from transformers import pipeline; \
35
- pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english'); \
36
- pipeline('ner', model='dslim/bert-base-NER'); \
37
- print('✓ Distilled models pre-downloaded')"
38
-
39
- # Download small spaCy models
40
- RUN pip install spacy && \
41
- python -m spacy download en_core_web_sm && \
42
- python -m spacy download fr_core_news_sm && \
43
- echo '✓ spaCy models downloaded'
44
-
45
- # Pre-download sentence transformer (small version)
46
- RUN python -c "from sentence_transformers import SentenceTransformer; \
47
- SentenceTransformer('all-MiniLM-L6-v2'); \
48
- print('✓ Sentence transformer pre-downloaded')"
49
 
50
  # Copy application code
51
  COPY syscred/ /app/syscred/
 
52
 
53
  # Create user for HF Spaces (required)
54
  RUN useradd -m -u 1000 user
@@ -58,8 +37,8 @@ ENV PATH=/home/user/.local/bin:$PATH
58
 
59
  WORKDIR /app
60
 
 
61
  EXPOSE 7860
62
 
63
- # Run with HF Spaces port (7860)
64
- # Single worker with 4 threads for better concurrency, timeout 600s
65
- CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--threads", "4", "--timeout", "600", "syscred.backend_app:app"]
 
1
  # SysCRED Docker Configuration for Hugging Face Spaces
2
+ # Full version with PyTorch and Transformers
3
  FROM python:3.10-slim
4
 
5
  WORKDIR /app
 
7
  ENV PYTHONDONTWRITEBYTECODE=1
8
  ENV PYTHONUNBUFFERED=1
9
  ENV PYTHONPATH=/app
 
 
 
 
10
  ENV SYSCRED_LOAD_ML_MODELS=true
11
+ ENV SYSCRED_ENV=production
 
 
12
 
13
  # Install system dependencies
14
+ RUN apt-get update && apt-get install -y --no-install-recommends \
15
  build-essential \
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
+ # Copy full requirements
19
+ COPY requirements.txt /app/requirements.txt
20
 
21
+ # Install dependencies (includes PyTorch, Transformers, spaCy)
22
  RUN pip install --no-cache-dir -r requirements.txt
23
 
24
+ # Download spaCy models for NER
25
+ RUN python -m spacy download en_core_web_md || true
26
+ RUN python -m spacy download fr_core_news_md || true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # Copy application code
29
  COPY syscred/ /app/syscred/
30
+ COPY ontology/ /app/ontology/
31
 
32
  # Create user for HF Spaces (required)
33
  RUN useradd -m -u 1000 user
 
37
 
38
  WORKDIR /app
39
 
40
+ # HF Spaces uses port 7860
41
  EXPOSE 7860
42
 
43
+ # Run with HF Spaces port
44
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "2", "--timeout", "300", "syscred.backend_app:app"]
 
Icon/r ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SysCRED - Requirements (Full version with ML models)
2
+ # Système Hybride de Vérification de Crédibilité
3
+ # (c) Dominique S. Loyer
4
+ # Version complète pour HuggingFace Spaces et développement local
5
+
6
+ # === Core Dependencies ===
7
+ requests>=2.28.0
8
+ beautifulsoup4>=4.11.0
9
+ python-whois>=0.8.0
10
+ lxml>=4.9.0
11
+
12
+ # === RDF/Ontology ===
13
+ rdflib>=6.0.0
14
+
15
+ # === Machine Learning ===
16
+ transformers>=4.30.0
17
+ torch>=2.0.0
18
+ numpy>=1.24.0
19
+ sentence-transformers>=2.2.0
20
+ accelerate>=0.20.0
21
+ spacy>=3.6.0
22
+
23
+ # === Explainability ===
24
+ lime>=0.2.0
25
+
26
+ # === Web Backend ===
27
+ flask>=2.3.0
28
+ flask-cors>=4.0.0
29
+ python-dotenv>=1.0.0
30
+ pandas>=2.0.0
31
+
32
+ # === Production/Database ===
33
+ gunicorn>=20.1.0
34
+ psycopg2-binary>=2.9.0
35
+ flask-sqlalchemy>=3.0.0
36
+
37
+ # === Development/Testing ===
38
+ pytest>=7.0.0
syscred/backend_app.py CHANGED
@@ -22,12 +22,16 @@ import traceback
22
  from pathlib import Path
23
  try:
24
  from dotenv import load_dotenv
25
- env_path = Path(__file__).parent / '.env'
 
 
 
 
26
  if env_path.exists():
27
  load_dotenv(env_path)
28
  print(f"[SysCRED Backend] Loaded .env from {env_path}")
29
  else:
30
- print(f"[SysCRED Backend] No .env file found at {env_path}")
31
  except ImportError:
32
  print("[SysCRED Backend] python-dotenv not installed, using system env vars")
33
 
@@ -85,6 +89,16 @@ except ImportError as e:
85
  app = Flask(__name__)
86
  CORS(app) # Enable CORS for frontend
87
 
 
 
 
 
 
 
 
 
 
 
88
  # Initialize Database
89
  try:
90
  init_db(app) # [NEW] Setup DB connection
@@ -267,6 +281,62 @@ def verify_endpoint():
267
 
268
  print(f"[SysCRED Backend] Score: {result.get('scoreCredibilite', 'N/A')}")
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  # [NEW] Persist to Database
271
  try:
272
  new_analysis = AnalysisResult(
 
22
  from pathlib import Path
23
  try:
24
  from dotenv import load_dotenv
25
+ # .env is at project root (parent of syscred/)
26
+ env_path = Path(__file__).resolve().parent.parent / '.env'
27
+ if not env_path.exists():
28
+ # Fallback: check syscred/ directory
29
+ env_path = Path(__file__).parent / '.env'
30
  if env_path.exists():
31
  load_dotenv(env_path)
32
  print(f"[SysCRED Backend] Loaded .env from {env_path}")
33
  else:
34
+ print(f"[SysCRED Backend] No .env file found, using system env vars")
35
  except ImportError:
36
  print("[SysCRED Backend] python-dotenv not installed, using system env vars")
37
 
 
89
  app = Flask(__name__)
90
  CORS(app) # Enable CORS for frontend
91
 
92
+ # Allow iframe embedding on UQAM domains (for syscred.uqam.ca mirror)
93
+ @app.after_request
94
+ def add_security_headers(response):
95
+ """Add security headers allowing UQAM iframe embedding."""
96
+ response.headers['X-Frame-Options'] = 'ALLOW-FROM https://syscred.uqam.ca'
97
+ response.headers['Content-Security-Policy'] = (
98
+ "frame-ancestors 'self' https://syscred.uqam.ca https://*.uqam.ca"
99
+ )
100
+ return response
101
+
102
  # Initialize Database
103
  try:
104
  init_db(app) # [NEW] Setup DB connection
 
281
 
282
  print(f"[SysCRED Backend] Score: {result.get('scoreCredibilite', 'N/A')}")
283
 
284
+ # [NEW] TREC Evidence Search + IR Metrics
285
+ try:
286
+ global trec_retriever, eval_metrics
287
+
288
+ # Initialize TREC if needed
289
+ if trec_retriever is None and TREC_AVAILABLE:
290
+ trec_retriever = TRECRetriever(use_stemming=True, enable_prf=False)
291
+ trec_retriever.corpus = TREC_DEMO_CORPUS
292
+ eval_metrics = EvaluationMetrics()
293
+ print("[SysCRED Backend] TREC Retriever initialized with demo corpus")
294
+
295
+ if trec_retriever and eval_metrics:
296
+ import time
297
+ start_time = time.time()
298
+
299
+ # Use the input text as query
300
+ query_text = input_data[:200] if not credibility_system.is_url(input_data) else result.get('informationEntree', input_data)[:200]
301
+
302
+ trec_result = trec_retriever.retrieve_evidence(query_text, k=5, model='bm25')
303
+ search_time = (time.time() - start_time) * 1000
304
+
305
+ retrieved_ids = [e.doc_id for e in trec_result.evidences]
306
+
307
+ # Treat every document in the demo corpus as "relevant" for demo evaluation
308
+ # In production, this would come from qrels files
309
+ relevant_ids = set(TREC_DEMO_CORPUS.keys()) # All docs as relevant pool
310
+
311
+ # Compute IR metrics
312
+ k = len(retrieved_ids) if retrieved_ids else 1
313
+ precision = eval_metrics.precision_at_k(retrieved_ids, relevant_ids, k) if retrieved_ids else 0
314
+ recall = eval_metrics.recall_at_k(retrieved_ids, relevant_ids, k) if retrieved_ids else 0
315
+ ap = eval_metrics.average_precision(retrieved_ids, relevant_ids) if retrieved_ids else 0
316
+ mrr = eval_metrics.mrr(retrieved_ids, relevant_ids) if retrieved_ids else 0
317
+
318
+ relevance_dict = {doc: 1 for doc in relevant_ids}
319
+ ndcg = eval_metrics.ndcg_at_k(retrieved_ids, relevance_dict, k) if retrieved_ids else 0
320
+
321
+ # Score of the top-ranked result (retrieved with BM25; reported under the 'tfidf_score' key)
322
+ tfidf_score = trec_result.evidences[0].score if trec_result.evidences else 0
323
+
324
+ result['trec_metrics'] = {
325
+ 'precision': round(precision, 4),
326
+ 'recall': round(recall, 4),
327
+ 'map': round(ap, 4),
328
+ 'ndcg': round(ndcg, 4),
329
+ 'tfidf_score': round(tfidf_score, 4),
330
+ 'mrr': round(mrr, 4),
331
+ 'retrieved_count': len(retrieved_ids),
332
+ 'corpus_size': len(TREC_DEMO_CORPUS),
333
+ 'search_time_ms': round(search_time, 2)
334
+ }
335
+ print(f"[SysCRED Backend] TREC: P={precision:.3f} R={recall:.3f} MAP={ap:.3f} NDCG={ndcg:.3f} MRR={mrr:.3f}")
336
+ except Exception as e:
337
+ print(f"[SysCRED Backend] TREC metrics error: {e}")
338
+ result['trec_metrics'] = {'error': str(e)}
339
+
340
  # [NEW] Persist to Database
341
  try:
342
  new_analysis = AnalysisResult(
syscred/static/index.html CHANGED
@@ -927,9 +927,35 @@
927
 
928
  <script>
929
  // Backend URLs
 
 
930
  const LOCAL_API_URL = 'http://localhost:5001';
931
- const REMOTE_API_URL = 'https://domloyer-syscred.hf.space';
932
- let API_URL = '';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933
 
934
  function toggleBackend() {
935
  const toggle = document.getElementById('backendToggle');
@@ -939,7 +965,7 @@
939
 
940
  if (toggle.checked) {
941
  API_URL = REMOTE_API_URL;
942
- status.textContent = 'Backend: HF Space (ML complet, plus lent)';
943
  status.className = 'backend-status remote';
944
  labelLocal.classList.remove('active');
945
  labelRemote.classList.add('active');
@@ -950,7 +976,7 @@
950
  labelLocal.classList.add('active');
951
  labelRemote.classList.remove('active');
952
  }
953
- console.log('[SysCRED] Backend switched to:', API_URL);
954
  }
955
 
956
  async function analyzeUrl() {
@@ -1158,9 +1184,23 @@
1158
  // ========================================
1159
  const nerSection = document.getElementById('nerSection');
1160
  const nerEntities = document.getElementById('nerEntities');
1161
- const entities = nlpAnalysis.entities || data.ner_entities || [];
1162
 
1163
- if (entities && entities.length > 0) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1164
  nerSection.style.display = 'block';
1165
  let nerHTML = '';
1166
 
@@ -1204,7 +1244,7 @@
1204
  // DISPLAY E-E-A-T METRICS
1205
  // ========================================
1206
  const eeatSection = document.getElementById('eeatSection');
1207
- const eeatData = data.eeat_score || data.eeatMetrics || null;
1208
 
1209
  if (eeatData) {
1210
  eeatSection.style.display = 'block';
@@ -1622,18 +1662,22 @@
1622
 
1623
  if(!overlay) return;
1624
 
1625
- title.textContent = d.name;
1626
 
1627
  let typeColor = "#94a3b8";
1628
  if(d.group === 1) typeColor = "#8b5cf6"; // Report
1629
  if(d.group === 3) typeColor = "#22c55e"; // Good
1630
  if(d.group === 4) typeColor = "#ef4444"; // Bad
1631
 
 
 
 
1632
  body.innerHTML = `
1633
  <div style="margin-bottom:0.5rem">
1634
  <span style="background:${typeColor}; color:white; padding:2px 6px; border-radius:4px; font-size:0.75rem;">${d.type || 'Unknown Type'}</span>
1635
  </div>
1636
- <div><strong>URI:</strong> <br><span style="font-family:monospace; color:#a855f7; word-break:break-all;">${d.id}</span></div>
 
1637
  `;
1638
 
1639
  overlay.classList.add('visible');
 
927
 
928
  <script>
929
  // Backend URLs
930
+ // Empty string means relative path (same domain), which works for HF Space
931
+ const REMOTE_API_URL = '';
932
  const LOCAL_API_URL = 'http://localhost:5001';
933
+
934
+ // Detect if we are already on localhost
935
+ const isLocalhost = window.location.hostname === 'localhost' || window.location.hostname === '127.0.0.1';
936
+ let API_URL = isLocalhost ? LOCAL_API_URL : REMOTE_API_URL;
937
+
938
+ // Set initial toggle state based on environment
939
+ document.addEventListener('DOMContentLoaded', () => {
940
+ const toggle = document.getElementById('backendToggle');
941
+ const status = document.getElementById('backendStatus');
942
+ const labelLocal = document.getElementById('labelLocal');
943
+ const labelRemote = document.getElementById('labelRemote');
944
+
945
+ if (isLocalhost) {
946
+ toggle.checked = false;
947
+ status.textContent = 'Backend: localhost:5001 (léger, sans ML)';
948
+ status.className = 'backend-status local';
949
+ labelLocal.classList.add('active');
950
+ labelRemote.classList.remove('active');
951
+ } else {
952
+ toggle.checked = true;
953
+ status.textContent = 'Backend: HF Space (ML complet)';
954
+ status.className = 'backend-status remote';
955
+ labelLocal.classList.remove('active');
956
+ labelRemote.classList.add('active');
957
+ }
958
+ });
959
 
960
  function toggleBackend() {
961
  const toggle = document.getElementById('backendToggle');
 
965
 
966
  if (toggle.checked) {
967
  API_URL = REMOTE_API_URL;
968
+ status.textContent = 'Backend: HF Space (ML complet)';
969
  status.className = 'backend-status remote';
970
  labelLocal.classList.remove('active');
971
  labelRemote.classList.add('active');
 
976
  labelLocal.classList.add('active');
977
  labelRemote.classList.remove('active');
978
  }
979
+ console.log('[SysCRED] Backend switched to:', API_URL || 'Relative Path (HF Space)');
980
  }
981
 
982
  async function analyzeUrl() {
 
1184
  // ========================================
1185
  const nerSection = document.getElementById('nerSection');
1186
  const nerEntities = document.getElementById('nerEntities');
1187
+ let nerData = data.ner_entities || nlpAnalysis.entities || {};
1188
 
1189
+ // Convert dict format to array if needed
1190
+ let entities = [];
1191
+ if (Array.isArray(nerData)) {
1192
+ entities = nerData;
1193
+ } else if (typeof nerData === 'object' && nerData !== null) {
1194
+ for (const [label, items] of Object.entries(nerData)) {
1195
+ if (Array.isArray(items)) {
1196
+ items.forEach(item => {
1197
+ entities.push({...item, label: label});
1198
+ });
1199
+ }
1200
+ }
1201
+ }
1202
+
1203
+ if (entities.length > 0) {
1204
  nerSection.style.display = 'block';
1205
  let nerHTML = '';
1206
 
 
1244
  // DISPLAY E-E-A-T METRICS
1245
  // ========================================
1246
  const eeatSection = document.getElementById('eeatSection');
1247
+ const eeatData = data.eeat_scores || data.eeat_scores || null;
1248
 
1249
  if (eeatData) {
1250
  eeatSection.style.display = 'block';
 
1662
 
1663
  if(!overlay) return;
1664
 
1665
+ title.textContent = d.name || d.label || 'Unknown';
1666
 
1667
  let typeColor = "#94a3b8";
1668
  if(d.group === 1) typeColor = "#8b5cf6"; // Report
1669
  if(d.group === 3) typeColor = "#22c55e"; // Good
1670
  if(d.group === 4) typeColor = "#ef4444"; // Bad
1671
 
1672
+ // Use uri field if available, fallback to id
1673
+ const displayUri = d.uri || d.id || 'N/A';
1674
+
1675
  body.innerHTML = `
1676
  <div style="margin-bottom:0.5rem">
1677
  <span style="background:${typeColor}; color:white; padding:2px 6px; border-radius:4px; font-size:0.75rem;">${d.type || 'Unknown Type'}</span>
1678
  </div>
1679
+ <div><strong>URI:</strong> <br><span style="font-family:monospace; color:#a855f7; word-break:break-all;">${displayUri}</span></div>
1680
+ ${d.score ? `<div style="margin-top:0.5rem"><strong>Score:</strong> ${(d.score * 100).toFixed(0)}%</div>` : ''}
1681
  `;
1682
 
1683
  overlay.classList.add('visible');