File size: 4,274 Bytes
61cf524
 
 
 
 
 
 
 
 
 
 
 
5bf61fc
61cf524
a352d2f
edaa6c5
5bf61fc
fee8826
 
 
 
9f2a957
fee8826
edaa6c5
 
 
 
5bf61fc
fee8826
5bf61fc
fee8826
 
 
 
 
b369092
5bf61fc
fee8826
 
 
4b8c206
b369092
 
 
 
 
 
 
fee8826
b369092
 
 
 
61cf524
 
b369092
61cf524
 
5bf61fc
a352d2f
5bf61fc
 
a352d2f
61cf524
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a352d2f
61cf524
 
 
 
4b8c206
61cf524
 
 
 
 
4b8c206
61cf524
 
 
 
 
 
 
 
 
a180cf5
61cf524
4b8c206
b369092
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11FAEDRYHuCI7iX5w3JaeKoD76-9pwrLi
"""

import json
import os
import re

import gradio as gr
import openai
import pandas as pd
from datasets import load_dataset
from rank_bm25 import BM25Okapi

# Ensure Hugging Face CLI is authenticated before touching hf:// dataset URLs.
if "HF_TOKEN" not in os.environ:
    print("Please authenticate with Hugging Face CLI or set HF_TOKEN as an environment variable.")
    # raise SystemExit instead of the site-module `exit()` helper, which is
    # not guaranteed to exist (python -S, frozen apps); exit code unchanged.
    raise SystemExit(1)

# Explicit CSV locations for each split on the Hugging Face Hub.
_DATASET_ROOT = "hf://datasets/farah1/mental-health-posts-classification"
data_files = {
    split: f"{_DATASET_ROOT}/{split}.csv"
    for split in ("train", "validation")
}

# Load dataset
# Pull both CSV splits from the Hub; on any failure fall back to empty
# frames so the schema checks below surface a clear error message.
print("Loading dataset...")
try:
    dataset = load_dataset("csv", data_files=data_files)
    train_data = dataset["train"].to_pandas()
    validation_data = dataset["validation"].to_pandas()
except Exception as e:
    print(f"Failed to load dataset: {e}")
    train_data, validation_data = pd.DataFrame(), pd.DataFrame()
else:
    print("Dataset loaded successfully.")
    print("Train dataset columns:", train_data.columns)

# --- Schema validation --------------------------------------------------
# Derive the 'text' column from title + content when it is absent.
if "text" not in train_data.columns:
    if "title" in train_data.columns and "content" in train_data.columns:
        # fillna("") so a missing title or content still yields a string;
        # a NaN would propagate through the concatenation and later break
        # tokenization (float NaN has no .split()).
        train_data["text"] = (
            train_data["title"].fillna("") + " " + train_data["content"].fillna("")
        )
    else:
        raise ValueError("The 'text' column is missing, and the required 'title' and 'content' columns are not available to create it.")

# Fail fast if any label column the classifier depends on is missing.
required_columns = ["text", "Ground_Truth_Stress", "Ground_Truth_Anxiety", "Ground_Truth_Depression", "Ground_Truth_Other_binary"]
for column in required_columns:
    if column not in train_data.columns:
        raise ValueError(f"Missing required column '{column}' in the training dataset.")

# Build the BM25 index over whitespace-tokenized training texts.
corpus_tokens = [document.split() for document in train_data["text"]]
bm25 = BM25Okapi(corpus_tokens)

# Credentials come from the environment only — never hard-coded.
openai.api_key = os.getenv("OPENAI_API_KEY")
if openai.api_key is None or openai.api_key == "":
    raise ValueError("OpenAI API key is not set. Please set it as an environment variable.")

# Few-shot classification function
def classify_text(input_text, k=20):
    # Tokenize input text
    tokenized_text = input_text.split()
    # Get top-k similar examples using BM25
    scores = bm25.get_scores(tokenized_text)
    top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]

    # Build examples for prompt
    examples = "\n".join(
        f"Example {i+1}:\nText: {train_data.iloc[idx]['text']}\nClassification: "
        f"Stress={train_data.iloc[idx]['Ground_Truth_Stress']}, "
        f"Anxiety={train_data.iloc[idx]['Ground_Truth_Anxiety']}, "
        f"Depression={train_data.iloc[idx]['Ground_Truth_Depression']}, "
        f"Other={train_data.iloc[idx]['Ground_Truth_Other_binary']}\n"
        for i, idx in enumerate(top_k_indices)
    )

    # Construct OpenAI prompt
    prompt = f"""
    You are a mental health specialist. Classify the text into Stress, Anxiety, Depression, or Other:
    ### Examples:
    {examples}
    ### Text to Classify:
    "{input_text}"
    ### Output Format:
    - **Ground_Truth_Stress**: 1 or 0
    - **Ground_Truth_Anxiety**: 1 or 0
    - **Ground_Truth_Depression**: 1 or 0
    - **Ground_Truth_Other_binary**: 1 or 0
    """

    try:
        response = openai.ChatCompletion.create(
            messages=[
                {"role": "system", "content": "You are a mental health specialist."},
                {"role": "user", "content": prompt},
            ],
            model="gpt-4",
            temperature=0,
        )
        results = response.choices[0].message.content
        return json.loads(results)
    except Exception as e:
        return {"error": str(e)}

# Gradio Interface: one free-text input, JSON labels out.
_input_box = gr.Textbox(lines=5, placeholder="Enter text for classification...")
interface = gr.Interface(
    fn=classify_text,
    inputs=_input_box,
    outputs="json",
    title="Mental Health Text Classifier",
    description="Classify text into Stress, Anxiety, Depression, or Other using BM25 and GPT-4.",
)

# Launch the web UI only when executed as a script (not when imported).
if __name__ == "__main__":
    interface.launch()