Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -235,32 +235,53 @@ def classify_with_semantic_similarity(title: str, abstract: str, criteria: Dict)
|
|
| 235 |
}
|
| 236 |
|
| 237 |
def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
|
| 238 |
-
"""Use zero-shot classification as
|
| 239 |
|
| 240 |
-
study_text = f"{title} {abstract}".strip()
|
| 241 |
|
| 242 |
-
if not study_text or len(study_text) <
|
| 243 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
try:
|
| 246 |
-
# Create labels
|
| 247 |
-
candidate_labels = [
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
top_label = result['labels'][0]
|
| 255 |
top_score = result['scores'][0]
|
| 256 |
|
| 257 |
-
|
|
|
|
| 258 |
decision = 'INCLUDE'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
else:
|
| 260 |
decision = 'EXCLUDE'
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
reasoning = f"Zero-shot classification: {top_label} (
|
| 264 |
|
| 265 |
return {
|
| 266 |
'decision': decision,
|
|
@@ -269,50 +290,109 @@ def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Di
|
|
| 269 |
}
|
| 270 |
|
| 271 |
except Exception as e:
|
| 272 |
-
return None
|
| 273 |
-
|
| 274 |
-
def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
|
| 275 |
-
"""Enhanced classification using multiple approaches"""
|
| 276 |
-
|
| 277 |
-
# Parse criteria
|
| 278 |
-
parsed_criteria = parse_criteria(criteria_text)
|
| 279 |
-
|
| 280 |
-
if not parsed_criteria['include'] and not parsed_criteria['exclude']:
|
| 281 |
return {
|
| 282 |
'decision': 'UNCLEAR',
|
| 283 |
-
'confidence':
|
| 284 |
-
'reasoning': '
|
| 285 |
}
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
-
|
| 288 |
-
semantic_result = classify_with_semantic_similarity(title, abstract, parsed_criteria)
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
-
#
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
return {
|
| 305 |
-
'decision':
|
| 306 |
-
'confidence':
|
| 307 |
-
'reasoning':
|
| 308 |
}
|
| 309 |
else:
|
| 310 |
return {
|
| 311 |
'decision': 'UNCLEAR',
|
| 312 |
-
'confidence':
|
| 313 |
-
'reasoning': '
|
| 314 |
}
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
def process_studies(file, title_col, abstract_col, criteria, sample_size):
|
| 317 |
"""Main processing function"""
|
| 318 |
|
|
@@ -388,7 +468,7 @@ def process_studies(file, title_col, abstract_col, criteria, sample_size):
|
|
| 388 |
def update_column_choices(file):
|
| 389 |
"""Update column dropdown choices when file is uploaded"""
|
| 390 |
if file is None:
|
| 391 |
-
return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), "Please upload a file first."
|
| 392 |
|
| 393 |
try:
|
| 394 |
df = pd.read_csv(file.name)
|
|
@@ -397,22 +477,33 @@ def update_column_choices(file):
|
|
| 397 |
# Auto-detect columns
|
| 398 |
detection = detect_columns(df)
|
| 399 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
preview_text = f"""
|
| 401 |
**File loaded successfully!** 📁
|
| 402 |
- **{len(df)} studies** found
|
| 403 |
- **{len(columns)} columns** detected
|
| 404 |
|
|
|
|
|
|
|
| 405 |
**Preview of first few rows:**
|
| 406 |
-
{df.head(3).to_string()}
|
| 407 |
"""
|
| 408 |
|
| 409 |
return (
|
| 410 |
gr.Dropdown(choices=columns, value=detection['suggested_title']),
|
| 411 |
gr.Dropdown(choices=columns, value=detection['suggested_abstract']),
|
|
|
|
| 412 |
preview_text
|
| 413 |
)
|
| 414 |
except Exception as e:
|
| 415 |
-
return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), f"Error reading file: {str(e)}"
|
| 416 |
|
| 417 |
# Create the Gradio interface
|
| 418 |
def create_interface():
|
|
@@ -452,24 +543,35 @@ def create_interface():
|
|
| 452 |
interactive=True
|
| 453 |
)
|
| 454 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 455 |
with gr.Column(scale=1):
|
| 456 |
gr.Markdown("### 🎯 3. Define Inclusion Criteria")
|
| 457 |
|
| 458 |
criteria_input = gr.Textbox(
|
| 459 |
label="Inclusion/Exclusion Criteria",
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
-
|
| 464 |
-
- adult participants
|
| 465 |
-
-
|
| 466 |
-
-
|
|
|
|
| 467 |
|
| 468 |
-
EXCLUDE:
|
| 469 |
-
-
|
| 470 |
-
-
|
| 471 |
-
-
|
| 472 |
-
- case reports
|
|
|
|
|
|
|
|
|
|
| 473 |
lines=15,
|
| 474 |
info="Be specific about what should be included or excluded"
|
| 475 |
)
|
|
@@ -502,7 +604,7 @@ EXCLUDE:
|
|
| 502 |
file_input.change(
|
| 503 |
fn=update_column_choices,
|
| 504 |
inputs=[file_input],
|
| 505 |
-
outputs=[title_column, abstract_column, file_preview]
|
| 506 |
)
|
| 507 |
|
| 508 |
process_btn.click(
|
|
|
|
| 235 |
}
|
| 236 |
|
| 237 |
def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
    """Classify one study with the zero-shot model and map its top label to a decision.

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria_text: Raw inclusion/exclusion criteria text. It is only
            consulted to decide how a "systematic review" label is treated.

    Returns:
        A dict with keys 'decision' ('INCLUDE', 'EXCLUDE' or 'UNCLEAR'),
        'confidence' (int percentage) and 'reasoning' (str). Any exception
        raised while classifying is reported as an 'UNCLEAR' result instead
        of being propagated.
    """
    include_label = "relevant study that should be included"
    exclude_label = "irrelevant study that should be excluded"
    review_label = "systematic review or meta-analysis"
    animal_label = "animal or laboratory study"
    case_report_label = "case report or editorial"

    # Measure the actual content, not the prompt: the "Title: ... Abstract:"
    # boilerplate alone is 18 characters, which made a length check on the
    # prompt pass for almost any input.
    raw_text = f"{title} {abstract}".strip()
    if len(raw_text) < 20:
        return {
            'decision': 'UNCLEAR',
            'confidence': 30,
            'reasoning': 'Insufficient text for analysis'
        }

    study_text = f"Title: {title}. Abstract: {abstract}".strip()

    try:
        candidate_labels = [
            include_label,
            exclude_label,
            review_label,
            animal_label,
            case_report_label,
        ]

        # `classifier` is defined elsewhere in the file; presumably a Hugging
        # Face zero-shot pipeline whose 'labels'/'scores' are sorted by
        # descending score -- TODO confirm.
        result = classifier(study_text, candidate_labels)

        top_label = result['labels'][0]
        top_score = result['scores'][0]

        # Compare whole labels. A substring test for 'relevant' also matches
        # 'irrelevant ...' and would turn exclusions into inclusions.
        if top_label == include_label:
            decision = 'INCLUDE'
            confidence = min(int(top_score * 100), 95)
        elif top_label == review_label:
            criteria_lower = (criteria_text or '').lower()
            # NOTE(review): this only checks that both words occur somewhere
            # in the criteria; it does not verify that reviews are listed
            # under the EXCLUDE section.
            if 'systematic review' in criteria_lower and 'exclude' in criteria_lower:
                decision = 'EXCLUDE'
                confidence = min(int(top_score * 100), 90)
            else:
                # Include if not specifically excluded, with reduced confidence.
                decision = 'INCLUDE'
                confidence = min(int(top_score * 80), 85)
        elif top_label in (animal_label, case_report_label):
            decision = 'EXCLUDE'
            confidence = min(int(top_score * 100), 95)
        else:
            # Covers the explicit "irrelevant" label and any unexpected label.
            decision = 'EXCLUDE'
            confidence = min(int(top_score * 100), 90)

        reasoning = f"Zero-shot classification: '{top_label}' (score: {top_score:.2f})"

        return {
            'decision': decision,
            'confidence': confidence,
            'reasoning': reasoning
        }

    except Exception as e:
        # Best-effort boundary: one failing study must not abort a whole batch.
        return {
            'decision': 'UNCLEAR',
            'confidence': 30,
            'reasoning': f'Classification error: {str(e)}'
        }
|
| 298 |
+
|
| 299 |
+
def enhanced_keyword_classification(title: str, abstract: str, criteria: Dict) -> Dict:
    """Classify a study by case-insensitive substring matching against the criteria.

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria: Mapping read through the optional keys 'include' and
            'exclude', each a list of criterion strings.

    Returns:
        A dict with keys 'decision' ('INCLUDE', 'EXCLUDE' or 'UNCLEAR'),
        'confidence' (int percentage) and 'reasoning' (str).
    """
    study_text = f"{title} {abstract}".lower()

    def matched(criterion_list):
        # Blank criteria are skipped: '' is a substring of every text and
        # would otherwise count as a match for every study.
        hits = []
        for criterion in criterion_list or []:
            needle = criterion.strip().lower()
            if needle and needle in study_text:
                hits.append(criterion)
        return hits

    include_signals = matched(criteria.get('include', []))
    exclude_signals = matched(criteria.get('exclude', []))

    # Study-type phrases that trigger exclusion regardless of the user's
    # criteria, unless an inclusion criterion also matched.
    exclusion_patterns = (
        'systematic review', 'meta-analysis', 'animal study', 'animal model',
        'in vitro', 'case report', 'case series', 'editorial', 'commentary',
        'letter to editor', 'conference abstract',
    )
    auto_exclude = [pattern for pattern in exclusion_patterns if pattern in study_text]

    if auto_exclude and not include_signals:
        return {
            'decision': 'EXCLUDE',
            'confidence': 80,
            'reasoning': f"Study type exclusion: {', '.join(auto_exclude)}"
        }
    if exclude_signals:
        # Explicit exclusion criteria take precedence over inclusion matches.
        return {
            'decision': 'EXCLUDE',
            'confidence': 75,
            'reasoning': f"Matches exclusion criteria: {', '.join(exclude_signals)}"
        }
    if include_signals:
        return {
            'decision': 'INCLUDE',
            'confidence': 70,
            'reasoning': f"Matches inclusion criteria: {', '.join(include_signals)}"
        }
    return {
        'decision': 'UNCLEAR',
        'confidence': 40,
        'reasoning': 'No clear matches with inclusion/exclusion criteria'
    }
|
| 353 |
|
| 354 |
+
def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
    """Classify one study by combining keyword, zero-shot and semantic results.

    Selection order:
      1. If any method reports confidence above 75, return the most confident
         of those results unchanged.
      2. Otherwise, if the keyword method says EXCLUDE with confidence above
         60, return the keyword result unchanged.
      3. Otherwise return the most confident result, with the zero-shot
         reasoning appended when a different method won.

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria_text: Raw inclusion/exclusion criteria text.

    Returns:
        A dict with keys 'decision', 'confidence' and 'reasoning'.
    """
    parsed_criteria = parse_criteria(criteria_text)

    # Method 1: keyword matching.
    keyword_result = enhanced_keyword_classification(title, abstract, parsed_criteria)

    # Method 2: zero-shot classification.
    zero_shot_result = classify_with_zero_shot(title, abstract, criteria_text)

    # Method 3: semantic similarity.
    # NOTE(review): classify_with_semantic_similarity is not visible here; its
    # call site was commented "if available", so it is treated as possibly
    # returning None -- confirm against its definition.
    semantic_result = classify_with_semantic_similarity(title, abstract, parsed_criteria)

    # Drop methods that produced no result so the confidence lookups below
    # cannot fail on None.
    results = [
        r for r in (keyword_result, zero_shot_result, semantic_result)
        if r is not None
    ]

    # If any method has high confidence (>75%), use it.
    high_confidence_results = [r for r in results if r['confidence'] > 75]
    if high_confidence_results:
        return max(high_confidence_results, key=lambda r: r['confidence'])

    # If the keyword method finds a clear exclusion, prioritize it.
    if keyword_result['decision'] == 'EXCLUDE' and keyword_result['confidence'] > 60:
        return keyword_result

    # Otherwise, use the highest-confidence result.
    best_result = max(results, key=lambda r: r['confidence'])

    # Append the zero-shot reasoning when another method won.
    combined_reasoning = f"{best_result['reasoning']}"
    if zero_shot_result is not None and best_result != zero_shot_result:
        combined_reasoning += f" | {zero_shot_result['reasoning']}"

    return {
        'decision': best_result['decision'],
        'confidence': best_result['confidence'],
        'reasoning': combined_reasoning
    }
|
| 395 |
+
|
| 396 |
def process_studies(file, title_col, abstract_col, criteria, sample_size):
|
| 397 |
"""Main processing function"""
|
| 398 |
|
|
|
|
| 468 |
def update_column_choices(file):
|
| 469 |
"""Update column dropdown choices when file is uploaded"""
|
| 470 |
if file is None:
|
| 471 |
+
return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), "Please upload a file first."
|
| 472 |
|
| 473 |
try:
|
| 474 |
df = pd.read_csv(file.name)
|
|
|
|
| 477 |
# Auto-detect columns
|
| 478 |
detection = detect_columns(df)
|
| 479 |
|
| 480 |
+
# Look for full text availability column
|
| 481 |
+
fulltext_col = None
|
| 482 |
+
fulltext_patterns = ['full_text', 'fulltext', 'full_text_available', 'pdf_available', 'available']
|
| 483 |
+
for col in columns:
|
| 484 |
+
if any(pattern in col.lower() for pattern in fulltext_patterns):
|
| 485 |
+
fulltext_col = col
|
| 486 |
+
break
|
| 487 |
+
|
| 488 |
preview_text = f"""
|
| 489 |
**File loaded successfully!** 📁
|
| 490 |
- **{len(df)} studies** found
|
| 491 |
- **{len(columns)} columns** detected
|
| 492 |
|
| 493 |
+
**Available columns:** {', '.join(columns)}
|
| 494 |
+
|
| 495 |
**Preview of first few rows:**
|
| 496 |
+
{df.head(3)[['Title', 'Abstract']].to_string() if 'Title' in df.columns and 'Abstract' in df.columns else df.head(3).to_string()}
|
| 497 |
"""
|
| 498 |
|
| 499 |
return (
|
| 500 |
gr.Dropdown(choices=columns, value=detection['suggested_title']),
|
| 501 |
gr.Dropdown(choices=columns, value=detection['suggested_abstract']),
|
| 502 |
+
gr.Dropdown(choices=columns, value=fulltext_col, visible=True),
|
| 503 |
preview_text
|
| 504 |
)
|
| 505 |
except Exception as e:
|
| 506 |
+
return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), f"Error reading file: {str(e)}"
|
| 507 |
|
| 508 |
# Create the Gradio interface
|
| 509 |
def create_interface():
|
|
|
|
| 543 |
interactive=True
|
| 544 |
)
|
| 545 |
|
| 546 |
+
fulltext_column = gr.Dropdown(
|
| 547 |
+
label="Full Text Available Column (Optional)",
|
| 548 |
+
choices=[],
|
| 549 |
+
interactive=True,
|
| 550 |
+
info="Select column indicating if full text is available (for phase 2 screening)"
|
| 551 |
+
)
|
| 552 |
+
|
| 553 |
with gr.Column(scale=1):
|
| 554 |
gr.Markdown("### 🎯 3. Define Inclusion Criteria")
|
| 555 |
|
| 556 |
criteria_input = gr.Textbox(
|
| 557 |
label="Inclusion/Exclusion Criteria",
|
| 558 |
+
value="""INCLUDE:
|
| 559 |
+
- prospective cohort studies
|
| 560 |
+
- case-control studies
|
| 561 |
+
- environmental exposures
|
| 562 |
+
- adult participants
|
| 563 |
+
- cardiovascular disease
|
| 564 |
+
- cancer outcomes
|
| 565 |
+
- mortality outcomes
|
| 566 |
|
| 567 |
+
EXCLUDE:
|
| 568 |
+
- systematic reviews
|
| 569 |
+
- meta-analyses
|
| 570 |
+
- animal studies
|
| 571 |
+
- case reports
|
| 572 |
+
- editorials
|
| 573 |
+
- pediatric populations
|
| 574 |
+
- occupational exposures only""",
|
| 575 |
lines=15,
|
| 576 |
info="Be specific about what should be included or excluded"
|
| 577 |
)
|
|
|
|
| 604 |
file_input.change(
|
| 605 |
fn=update_column_choices,
|
| 606 |
inputs=[file_input],
|
| 607 |
+
outputs=[title_column, abstract_column, fulltext_column, file_preview]
|
| 608 |
)
|
| 609 |
|
| 610 |
process_btn.click(
|