Spaces:

HebaElshimy
/

systematic-reviews

Running

App Files Files Community

HebaElshimy committed on May 25

Commit

9acf49e

verified ·

1 Parent(s): d3991d3

Upload app.py

Browse files

Files changed (1) hide show

app.py +164 -62

app.py CHANGED Viewed

@@ -235,32 +235,53 @@ def classify_with_semantic_similarity(title: str, abstract: str, criteria: Dict)
         }
 def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
-    """Use zero-shot classification as a secondary method"""
-    study_text = f"{title} {abstract}".strip()
-    if not study_text or len(study_text) < 10:
-        return None
     try:
-        # Create labels from criteria
-        candidate_labels = ["should be included in systematic review", "should be excluded from systematic review"]
-        # Use the criteria as hypothesis
-        hypothesis_template = f"This study {{}}, based on the criteria: {criteria_text}"
-        result = classifier(study_text, candidate_labels, hypothesis_template=hypothesis_template)
         top_label = result['labels'][0]
         top_score = result['scores'][0]
-        if 'included' in top_label:
             decision = 'INCLUDE'
         else:
             decision = 'EXCLUDE'
-        confidence = int(top_score * 100)
-        reasoning = f"Zero-shot classification: {top_label} (confidence: {confidence}%)"
         return {
             'decision': decision,
@@ -269,50 +290,109 @@ def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Di
         }
     except Exception as e:
-        return None
-def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
-    """Enhanced classification using multiple approaches"""
-    # Parse criteria
-    parsed_criteria = parse_criteria(criteria_text)
-    if not parsed_criteria['include'] and not parsed_criteria['exclude']:
         return {
             'decision': 'UNCLEAR',
-            'confidence': 20,
-            'reasoning': 'No clear inclusion/exclusion criteria provided'
         }
-    # Method 1: Semantic similarity
-    semantic_result = classify_with_semantic_similarity(title, abstract, parsed_criteria)
-    # Method 2: Zero-shot classification (as backup)
-    zero_shot_result = classify_with_zero_shot(title, abstract, criteria_text)
-    # Combine results (prioritize semantic similarity)
-    if semantic_result['confidence'] > 60:
-        return semantic_result
-    elif zero_shot_result and zero_shot_result['confidence'] > 70:
-        return zero_shot_result
-    elif semantic_result['confidence'] > 40:
-        # Add zero-shot info if available
-        combined_reasoning = semantic_result['reasoning']
-        if zero_shot_result:
-            combined_reasoning += f" | {zero_shot_result['reasoning']}"
         return {
-            'decision': semantic_result['decision'],
-            'confidence': semantic_result['confidence'],
-            'reasoning': combined_reasoning
         }
     else:
         return {
             'decision': 'UNCLEAR',
-            'confidence': 35,
-            'reasoning': 'Low confidence from all classification methods'
         }
 def process_studies(file, title_col, abstract_col, criteria, sample_size):
     """Main processing function"""
@@ -388,7 +468,7 @@ def process_studies(file, title_col, abstract_col, criteria, sample_size):
 def update_column_choices(file):
     """Update column dropdown choices when file is uploaded"""
     if file is None:
-        return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), "Please upload a file first."
     try:
         df = pd.read_csv(file.name)
@@ -397,22 +477,33 @@ def update_column_choices(file):
         # Auto-detect columns
         detection = detect_columns(df)
         preview_text = f"""
 **File loaded successfully!** 📁
 - **{len(df)} studies** found
 - **{len(columns)} columns** detected
 **Preview of first few rows:**
-{df.head(3).to_string()}
         """
         return (
             gr.Dropdown(choices=columns, value=detection['suggested_title']),
             gr.Dropdown(choices=columns, value=detection['suggested_abstract']),
             preview_text
         )
     except Exception as e:
-        return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), f"Error reading file: {str(e)}"
 # Create the Gradio interface
 def create_interface():
@@ -452,24 +543,35 @@ def create_interface():
                     interactive=True
                 )
             with gr.Column(scale=1):
                 gr.Markdown("### 🎯 3. Define Inclusion Criteria")
                 criteria_input = gr.Textbox(
                     label="Inclusion/Exclusion Criteria",
-                    placeholder="""Example:
-INCLUDE:
-- randomized controlled trial, clinical trial
-- adult participants, human subjects
-- diabetes, glucose, glycemic control
-- published after 2015
-EXCLUDE:
-- animal studies, in vitro
-- pediatric, children
-- review articles, meta-analysis
-- case reports, case series""",
                     lines=15,
                     info="Be specific about what should be included or excluded"
                 )
@@ -502,7 +604,7 @@ EXCLUDE:
         file_input.change(
             fn=update_column_choices,
             inputs=[file_input],
-            outputs=[title_column, abstract_column, file_preview]
         )
         process_btn.click(

         }
 def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
+    """Use zero-shot classification as primary method"""
+    study_text = f"Title: {title}. Abstract: {abstract}".strip()
+    if not study_text or len(study_text) < 20:
+        return {
+            'decision': 'UNCLEAR',
+            'confidence': 30,
+            'reasoning': 'Insufficient text for analysis'
+        }
     try:
+        # Create more specific labels based on criteria
+        candidate_labels = [
+            "relevant study that should be included",
+            "irrelevant study that should be excluded",
+            "systematic review or meta-analysis",
+            "animal or laboratory study",
+            "case report or editorial"
+        ]
+        # Use zero-shot classification
+        result = classifier(study_text, candidate_labels)
         top_label = result['labels'][0]
         top_score = result['scores'][0]
+        # Decision logic based on classification
+        if 'relevant' in top_label or 'included' in top_label:
             decision = 'INCLUDE'
+            confidence = min(int(top_score * 100), 95)
+        elif 'systematic review' in top_label:
+            # Check if systematic reviews should be excluded
+            if 'systematic review' in criteria_text.lower() and 'exclude' in criteria_text.lower():
+                decision = 'EXCLUDE'
+                confidence = min(int(top_score * 100), 90)
+            else:
+                decision = 'INCLUDE'  # Include if not specifically excluded
+                confidence = min(int(top_score * 80), 85)
+        elif 'animal' in top_label or 'case report' in top_label:
+            decision = 'EXCLUDE'
+            confidence = min(int(top_score * 100), 95)
         else:
             decision = 'EXCLUDE'
+            confidence = min(int(top_score * 100), 90)
+        reasoning = f"Zero-shot classification: '{top_label}' (score: {top_score:.2f})"
         return {
             'decision': decision,
         }
     except Exception as e:
         return {
             'decision': 'UNCLEAR',
+            'confidence': 30,
+            'reasoning': f'Classification error: {str(e)}'
         }
def enhanced_keyword_classification(title: str, abstract: str, criteria: Dict) -> Dict:
    """Classify a study by case-insensitive keyword matching.

    The title and abstract are joined, lower-cased, and searched for each
    criterion as a plain substring. A fixed list of study-type phrases
    (reviews, animal work, case reports, editorials, ...) is searched too,
    independently of what the caller's criteria say.

    Decision priority:
      1. A built-in study-type phrase is present and no inclusion criterion
         matched -> EXCLUDE (confidence 80).
      2. Any exclusion criterion matched -> EXCLUDE (confidence 75).
      3. Any inclusion criterion matched -> INCLUDE (confidence 70).
      4. Otherwise -> UNCLEAR (confidence 40).

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria: Mapping with optional ``'include'`` and ``'exclude'`` keys,
            each holding a list of criterion strings. A missing key or a
            ``None`` value is treated as an empty list.

    Returns:
        A dict with the keys ``'decision'``, ``'confidence'`` and
        ``'reasoning'``.
    """
    study_text = f"{title} {abstract}".lower()

    def matched(key: str) -> list:
        # Blank criteria are skipped on purpose: "" (and " ") is a substring
        # of every study text, so a stray empty bullet in the criteria would
        # otherwise make every single study match.
        return [
            criterion
            for criterion in (criteria.get(key) or [])
            if criterion.strip() and criterion.lower() in study_text
        ]

    include_signals = matched('include')
    exclude_signals = matched('exclude')

    # Study-type phrases that trigger exclusion regardless of the criteria.
    exclusion_patterns = [
        'systematic review', 'meta-analysis', 'animal study', 'animal model',
        'in vitro', 'case report', 'case series', 'editorial', 'commentary',
        'letter to editor', 'conference abstract'
    ]
    auto_exclude = [pattern for pattern in exclusion_patterns if pattern in study_text]

    # A built-in study-type hit only wins when nothing speaks for inclusion.
    if auto_exclude and not include_signals:
        return {
            'decision': 'EXCLUDE',
            'confidence': 80,
            'reasoning': f"Study type exclusion: {', '.join(auto_exclude)}"
        }
    # Explicit exclusion criteria outrank explicit inclusion criteria.
    if exclude_signals:
        return {
            'decision': 'EXCLUDE',
            'confidence': 75,
            'reasoning': f"Matches exclusion criteria: {', '.join(exclude_signals)}"
        }
    if include_signals:
        return {
            'decision': 'INCLUDE',
            'confidence': 70,
            'reasoning': f"Matches inclusion criteria: {', '.join(include_signals)}"
        }
    return {
        'decision': 'UNCLEAR',
        'confidence': 40,
        'reasoning': 'No clear matches with inclusion/exclusion criteria'
    }
def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
    """Classify one study by combining three classification methods.

    Runs keyword matching, zero-shot classification and semantic similarity,
    then picks a result as follows:
      1. If any method reports confidence above 75, return the most confident
         of those results unchanged.
      2. Otherwise, if keyword matching says EXCLUDE with confidence above 60,
         return the keyword result unchanged.
      3. Otherwise return the decision and confidence of the most confident
         result, appending the zero-shot reasoning when a different result
         won.

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria_text: Raw inclusion/exclusion criteria text; it is parsed
            with ``parse_criteria`` for the keyword and semantic methods and
            passed as-is to the zero-shot method.

    Returns:
        A dict with the keys ``'decision'``, ``'confidence'`` and
        ``'reasoning'``.
    """
    parsed_criteria = parse_criteria(criteria_text)

    # Method 1: keyword matching (always returns a result dict).
    keyword_result = enhanced_keyword_classification(title, abstract, parsed_criteria)
    # Method 2: zero-shot classification.
    zero_shot_result = classify_with_zero_shot(title, abstract, criteria_text)
    # Method 3: semantic similarity (if available).
    semantic_result = classify_with_semantic_similarity(title, abstract, parsed_criteria)

    # A method that is unavailable may hand back None; drop those so the
    # confidence lookups below cannot fail. The keyword result is always
    # present, so the list is never empty.
    results = [
        result
        for result in (keyword_result, zero_shot_result, semantic_result)
        if result is not None
    ]

    # Any single high-confidence (>75) answer wins outright.
    high_confidence_results = [r for r in results if r['confidence'] > 75]
    if high_confidence_results:
        return max(high_confidence_results, key=lambda r: r['confidence'])

    # A clear keyword-based exclusion is trusted over weaker model output.
    if keyword_result['decision'] == 'EXCLUDE' and keyword_result['confidence'] > 60:
        return keyword_result

    # Fall back to whichever method is most confident.
    best_result = max(results, key=lambda r: r['confidence'])

    # Add the zero-shot reasoning for context when another method won.
    combined_reasoning = f"{best_result['reasoning']}"
    if zero_shot_result is not None and best_result != zero_shot_result:
        combined_reasoning += f" | {zero_shot_result['reasoning']}"

    return {
        'decision': best_result['decision'],
        'confidence': best_result['confidence'],
        'reasoning': combined_reasoning
    }
 def process_studies(file, title_col, abstract_col, criteria, sample_size):
     """Main processing function"""
 def update_column_choices(file):
     """Update column dropdown choices when file is uploaded"""
     if file is None:
+        return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), "Please upload a file first."
     try:
         df = pd.read_csv(file.name)
         # Auto-detect columns
         detection = detect_columns(df)
+        # Look for full text availability column
+        fulltext_col = None
+        fulltext_patterns = ['full_text', 'fulltext', 'full_text_available', 'pdf_available', 'available']
+        for col in columns:
+            if any(pattern in col.lower() for pattern in fulltext_patterns):
+                fulltext_col = col
+                break
         preview_text = f"""
 **File loaded successfully!** 📁
 - **{len(df)} studies** found
 - **{len(columns)} columns** detected
+**Available columns:** {', '.join(columns)}
 **Preview of first few rows:**
+{df.head(3)[['Title', 'Abstract']].to_string() if 'Title' in df.columns and 'Abstract' in df.columns else df.head(3).to_string()}
         """
         return (
             gr.Dropdown(choices=columns, value=detection['suggested_title']),
             gr.Dropdown(choices=columns, value=detection['suggested_abstract']),
+            gr.Dropdown(choices=columns, value=fulltext_col, visible=True),
             preview_text
         )
     except Exception as e:
+        return gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), f"Error reading file: {str(e)}"
 # Create the Gradio interface
 def create_interface():
                     interactive=True
                 )
+                fulltext_column = gr.Dropdown(
+                    label="Full Text Available Column (Optional)",
+                    choices=[],
+                    interactive=True,
+                    info="Select column indicating if full text is available (for phase 2 screening)"
+                )
             with gr.Column(scale=1):
                 gr.Markdown("### 🎯 3. Define Inclusion Criteria")
                 criteria_input = gr.Textbox(
                     label="Inclusion/Exclusion Criteria",
+                    value="""INCLUDE:
+- prospective cohort studies
+- case-control studies
+- environmental exposures
+- adult participants
+- cardiovascular disease
+- cancer outcomes
+- mortality outcomes
+EXCLUDE:
+- systematic reviews
+- meta-analyses
+- animal studies
+- case reports
+- editorials
+- pediatric populations
+- occupational exposures only""",
                     lines=15,
                     info="Be specific about what should be included or excluded"
                 )
         file_input.change(
             fn=update_column_choices,
             inputs=[file_input],
+            outputs=[title_column, abstract_column, fulltext_column, file_preview]
         )
         process_btn.click(