HebaElshimy committed on
Commit
9acf49e
·
verified ·
1 Parent(s): d3991d3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -62
app.py CHANGED
@@ -235,32 +235,53 @@ def classify_with_semantic_similarity(title: str, abstract: str, criteria: Dict)
235
  }
236
 
237
  def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
238
- """Use zero-shot classification as a secondary method"""
239
 
240
- study_text = f"{title} {abstract}".strip()
241
 
242
- if not study_text or len(study_text) < 10:
243
- return None
 
 
 
 
244
 
245
  try:
246
- # Create labels from criteria
247
- candidate_labels = ["should be included in systematic review", "should be excluded from systematic review"]
248
-
249
- # Use the criteria as hypothesis
250
- hypothesis_template = f"This study {{}}, based on the criteria: {criteria_text}"
251
-
252
- result = classifier(study_text, candidate_labels, hypothesis_template=hypothesis_template)
 
 
 
 
253
 
254
  top_label = result['labels'][0]
255
  top_score = result['scores'][0]
256
 
257
- if 'included' in top_label:
 
258
  decision = 'INCLUDE'
 
 
 
 
 
 
 
 
 
 
 
 
259
  else:
260
  decision = 'EXCLUDE'
261
-
262
- confidence = int(top_score * 100)
263
- reasoning = f"Zero-shot classification: {top_label} (confidence: {confidence}%)"
264
 
265
  return {
266
  'decision': decision,
@@ -269,50 +290,109 @@ def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Di
269
  }
270
 
271
  except Exception as e:
272
- return None
273
-
274
- def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
275
- """Enhanced classification using multiple approaches"""
276
-
277
- # Parse criteria
278
- parsed_criteria = parse_criteria(criteria_text)
279
-
280
- if not parsed_criteria['include'] and not parsed_criteria['exclude']:
281
  return {
282
  'decision': 'UNCLEAR',
283
- 'confidence': 20,
284
- 'reasoning': 'No clear inclusion/exclusion criteria provided'
285
  }
 
 
 
286
 
287
- # Method 1: Semantic similarity
288
- semantic_result = classify_with_semantic_similarity(title, abstract, parsed_criteria)
289
 
290
- # Method 2: Zero-shot classification (as backup)
291
- zero_shot_result = classify_with_zero_shot(title, abstract, criteria_text)
 
 
 
292
 
293
- # Combine results (prioritize semantic similarity)
294
- if semantic_result['confidence'] > 60:
295
- return semantic_result
296
- elif zero_shot_result and zero_shot_result['confidence'] > 70:
297
- return zero_shot_result
298
- elif semantic_result['confidence'] > 40:
299
- # Add zero-shot info if available
300
- combined_reasoning = semantic_result['reasoning']
301
- if zero_shot_result:
302
- combined_reasoning += f" | {zero_shot_result['reasoning']}"
303
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  return {
305
- 'decision': semantic_result['decision'],
306
- 'confidence': semantic_result['confidence'],
307
- 'reasoning': combined_reasoning
308
  }
309
  else:
310
  return {
311
  'decision': 'UNCLEAR',
312
- 'confidence': 35,
313
- 'reasoning': 'Low confidence from all classification methods'
314
  }
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  def process_studies(file, title_col, abstract_col, criteria, sample_size):
317
  """Main processing function"""
318
 
@@ -388,7 +468,7 @@ def process_studies(file, title_col, abstract_col, criteria, sample_size):
388
  def update_column_choices(file):
389
  """Update column dropdown choices when file is uploaded"""
390
  if file is None:
391
- return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), "Please upload a file first."
392
 
393
  try:
394
  df = pd.read_csv(file.name)
@@ -397,22 +477,33 @@ def update_column_choices(file):
397
  # Auto-detect columns
398
  detection = detect_columns(df)
399
 
 
 
 
 
 
 
 
 
400
  preview_text = f"""
401
  **File loaded successfully!** 📁
402
  - **{len(df)} studies** found
403
  - **{len(columns)} columns** detected
404
 
 
 
405
  **Preview of first few rows:**
406
- {df.head(3).to_string()}
407
  """
408
 
409
  return (
410
  gr.Dropdown(choices=columns, value=detection['suggested_title']),
411
  gr.Dropdown(choices=columns, value=detection['suggested_abstract']),
 
412
  preview_text
413
  )
414
  except Exception as e:
415
- return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), f"Error reading file: {str(e)}"
416
 
417
  # Create the Gradio interface
418
  def create_interface():
@@ -452,24 +543,35 @@ def create_interface():
452
  interactive=True
453
  )
454
 
 
 
 
 
 
 
 
455
  with gr.Column(scale=1):
456
  gr.Markdown("### 🎯 3. Define Inclusion Criteria")
457
 
458
  criteria_input = gr.Textbox(
459
  label="Inclusion/Exclusion Criteria",
460
- placeholder="""Example:
461
-
462
- INCLUDE:
463
- - randomized controlled trial, clinical trial
464
- - adult participants, human subjects
465
- - diabetes, glucose, glycemic control
466
- - published after 2015
 
467
 
468
- EXCLUDE:
469
- - animal studies, in vitro
470
- - pediatric, children
471
- - review articles, meta-analysis
472
- - case reports, case series""",
 
 
 
473
  lines=15,
474
  info="Be specific about what should be included or excluded"
475
  )
@@ -502,7 +604,7 @@ EXCLUDE:
502
  file_input.change(
503
  fn=update_column_choices,
504
  inputs=[file_input],
505
- outputs=[title_column, abstract_column, file_preview]
506
  )
507
 
508
  process_btn.click(
 
235
  }
236
 
237
  def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
238
+ """Use zero-shot classification as primary method"""
239
 
240
+ study_text = f"Title: {title}. Abstract: {abstract}".strip()
241
 
242
+ if not study_text or len(study_text) < 20:
243
+ return {
244
+ 'decision': 'UNCLEAR',
245
+ 'confidence': 30,
246
+ 'reasoning': 'Insufficient text for analysis'
247
+ }
248
 
249
  try:
250
+ # Create more specific labels based on criteria
251
+ candidate_labels = [
252
+ "relevant study that should be included",
253
+ "irrelevant study that should be excluded",
254
+ "systematic review or meta-analysis",
255
+ "animal or laboratory study",
256
+ "case report or editorial"
257
+ ]
258
+
259
+ # Use zero-shot classification
260
+ result = classifier(study_text, candidate_labels)
261
 
262
  top_label = result['labels'][0]
263
  top_score = result['scores'][0]
264
 
265
+ # Decision logic based on classification
266
+ if 'relevant' in top_label or 'included' in top_label:
267
  decision = 'INCLUDE'
268
+ confidence = min(int(top_score * 100), 95)
269
+ elif 'systematic review' in top_label:
270
+ # Check if systematic reviews should be excluded
271
+ if 'systematic review' in criteria_text.lower() and 'exclude' in criteria_text.lower():
272
+ decision = 'EXCLUDE'
273
+ confidence = min(int(top_score * 100), 90)
274
+ else:
275
+ decision = 'INCLUDE' # Include if not specifically excluded
276
+ confidence = min(int(top_score * 80), 85)
277
+ elif 'animal' in top_label or 'case report' in top_label:
278
+ decision = 'EXCLUDE'
279
+ confidence = min(int(top_score * 100), 95)
280
  else:
281
  decision = 'EXCLUDE'
282
+ confidence = min(int(top_score * 100), 90)
283
+
284
+ reasoning = f"Zero-shot classification: '{top_label}' (score: {top_score:.2f})"
285
 
286
  return {
287
  'decision': decision,
 
290
  }
291
 
292
  except Exception as e:
 
 
 
 
 
 
 
 
 
293
  return {
294
  'decision': 'UNCLEAR',
295
+ 'confidence': 30,
296
+ 'reasoning': f'Classification error: {str(e)}'
297
  }
298
+
299
def enhanced_keyword_classification(title: str, abstract: str, criteria: Dict) -> Dict:
    """Classify a study by case-insensitive substring matching against the criteria.

    Args:
        title: Study title; ``None`` or NaN is treated as empty text.
        abstract: Study abstract; ``None`` or NaN is treated as empty text.
        criteria: Mapping with optional ``'include'`` and ``'exclude'`` lists
            of criterion strings. Blank entries are ignored.

    Returns:
        Dict with ``'decision'`` ('INCLUDE', 'EXCLUDE' or 'UNCLEAR'),
        ``'confidence'`` (int) and ``'reasoning'`` (str).
    """

    def _as_text(value):
        # None and float NaN (the only value unequal to itself) both mean "no text".
        if value is None or value != value:
            return ""
        return str(value)

    study_text = f"{_as_text(title)} {_as_text(abstract)}".lower()

    def _matches(entries):
        # A blank criterion must be dropped: "" is a substring of every text
        # and would otherwise count as a match for every study.
        cleaned = (str(entry).strip() for entry in entries or [])
        return [entry for entry in cleaned if entry and entry.lower() in study_text]

    criteria = criteria or {}
    include_signals = _matches(criteria.get('include'))
    exclude_signals = _matches(criteria.get('exclude'))

    # Study types rejected by default, independent of the user's criteria.
    exclusion_patterns = (
        'systematic review', 'meta-analysis', 'animal study', 'animal model',
        'in vitro', 'case report', 'case series', 'editorial', 'commentary',
        'letter to editor', 'conference abstract',
    )
    auto_exclude = [pattern for pattern in exclusion_patterns if pattern in study_text]

    # A built-in study-type hit only wins when nothing matched the user's
    # inclusion criteria; explicit exclusion criteria are checked next.
    if auto_exclude and not include_signals:
        return {
            'decision': 'EXCLUDE',
            'confidence': 80,
            'reasoning': f"Study type exclusion: {', '.join(auto_exclude)}"
        }
    if exclude_signals:
        return {
            'decision': 'EXCLUDE',
            'confidence': 75,
            'reasoning': f"Matches exclusion criteria: {', '.join(exclude_signals)}"
        }
    if include_signals:
        return {
            'decision': 'INCLUDE',
            'confidence': 70,
            'reasoning': f"Matches inclusion criteria: {', '.join(include_signals)}"
        }
    return {
        'decision': 'UNCLEAR',
        'confidence': 40,
        'reasoning': 'No clear matches with inclusion/exclusion criteria'
    }
353
 
354
def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
    """Run the keyword, zero-shot and semantic classifiers and pick one verdict.

    Selection order:
      1. If any method reports confidence above 75, return the most confident
         of those results unchanged.
      2. Otherwise, if the keyword method says EXCLUDE with confidence above
         60, return its result unchanged.
      3. Otherwise take the most confident result overall and, unless it
         equals the zero-shot result, append the zero-shot reasoning to it.

    Returns:
        Dict with 'decision', 'confidence' and 'reasoning' keys.
    """
    criteria = parse_criteria(criteria_text)

    # Call order is kept: keyword, then zero-shot, then semantic similarity.
    by_keyword = enhanced_keyword_classification(title, abstract, criteria)
    by_zero_shot = classify_with_zero_shot(title, abstract, criteria_text)
    by_similarity = classify_with_semantic_similarity(title, abstract, criteria)
    candidates = [by_keyword, by_zero_shot, by_similarity]

    def confidence_of(outcome):
        return outcome['confidence']

    # Step 1: a single confident method is trusted outright.
    confident = [outcome for outcome in candidates if confidence_of(outcome) > 75]
    if confident:
        return max(confident, key=confidence_of)

    # Step 2: a reasonably sure keyword exclusion beats the remaining methods.
    if by_keyword['decision'] == 'EXCLUDE' and by_keyword['confidence'] > 60:
        return by_keyword

    # Step 3: fall back to the strongest result (ties go to the earliest method).
    winner = max(candidates, key=confidence_of)
    if winner != by_zero_shot:
        reasoning = f"{winner['reasoning']} | {by_zero_shot['reasoning']}"
    else:
        reasoning = f"{winner['reasoning']}"

    return {
        'decision': winner['decision'],
        'confidence': winner['confidence'],
        'reasoning': reasoning
    }
395
+
396
  def process_studies(file, title_col, abstract_col, criteria, sample_size):
397
  """Main processing function"""
398
 
 
468
  def update_column_choices(file):
469
  """Update column dropdown choices when file is uploaded"""
470
  if file is None:
471
+ return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), "Please upload a file first."
472
 
473
  try:
474
  df = pd.read_csv(file.name)
 
477
  # Auto-detect columns
478
  detection = detect_columns(df)
479
 
480
+ # Look for full text availability column
481
+ fulltext_col = None
482
+ fulltext_patterns = ['full_text', 'fulltext', 'full_text_available', 'pdf_available', 'available']
483
+ for col in columns:
484
+ if any(pattern in col.lower() for pattern in fulltext_patterns):
485
+ fulltext_col = col
486
+ break
487
+
488
  preview_text = f"""
489
  **File loaded successfully!** 📁
490
  - **{len(df)} studies** found
491
  - **{len(columns)} columns** detected
492
 
493
+ **Available columns:** {', '.join(columns)}
494
+
495
  **Preview of first few rows:**
496
+ {df.head(3)[['Title', 'Abstract']].to_string() if 'Title' in df.columns and 'Abstract' in df.columns else df.head(3).to_string()}
497
  """
498
 
499
  return (
500
  gr.Dropdown(choices=columns, value=detection['suggested_title']),
501
  gr.Dropdown(choices=columns, value=detection['suggested_abstract']),
502
+ gr.Dropdown(choices=columns, value=fulltext_col, visible=True),
503
  preview_text
504
  )
505
  except Exception as e:
506
+ return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), f"Error reading file: {str(e)}"
507
 
508
  # Create the Gradio interface
509
  def create_interface():
 
543
  interactive=True
544
  )
545
 
546
+ fulltext_column = gr.Dropdown(
547
+ label="Full Text Available Column (Optional)",
548
+ choices=[],
549
+ interactive=True,
550
+ info="Select column indicating if full text is available (for phase 2 screening)"
551
+ )
552
+
553
  with gr.Column(scale=1):
554
  gr.Markdown("### 🎯 3. Define Inclusion Criteria")
555
 
556
  criteria_input = gr.Textbox(
557
  label="Inclusion/Exclusion Criteria",
558
+ value="""INCLUDE:
559
+ - prospective cohort studies
560
+ - case-control studies
561
+ - environmental exposures
562
+ - adult participants
563
+ - cardiovascular disease
564
+ - cancer outcomes
565
+ - mortality outcomes
566
 
567
+ EXCLUDE:
568
+ - systematic reviews
569
+ - meta-analyses
570
+ - animal studies
571
+ - case reports
572
+ - editorials
573
+ - pediatric populations
574
+ - occupational exposures only""",
575
  lines=15,
576
  info="Be specific about what should be included or excluded"
577
  )
 
604
  file_input.change(
605
  fn=update_column_choices,
606
  inputs=[file_input],
607
+ outputs=[title_column, abstract_column, fulltext_column, file_preview]
608
  )
609
 
610
  process_btn.click(