Spaces:
Sleeping
Sleeping
File size: 4,274 Bytes
61cf524 5bf61fc 61cf524 a352d2f edaa6c5 5bf61fc fee8826 9f2a957 fee8826 edaa6c5 5bf61fc fee8826 5bf61fc fee8826 b369092 5bf61fc fee8826 4b8c206 b369092 fee8826 b369092 61cf524 b369092 61cf524 5bf61fc a352d2f 5bf61fc a352d2f 61cf524 a352d2f 61cf524 4b8c206 61cf524 4b8c206 61cf524 a180cf5 61cf524 4b8c206 b369092 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/11FAEDRYHuCI7iX5w3JaeKoD76-9pwrLi
"""
import os
import json
import pandas as pd
from rank_bm25 import BM25Okapi
import gradio as gr
import openai
from datasets import load_dataset
# Ensure Hugging Face CLI is authenticated (the hf:// dataset paths below need it).
if "HF_TOKEN" not in os.environ:
    print("Please authenticate with Hugging Face CLI or set HF_TOKEN as an environment variable.")
    # exit() is injected by the site module and is not guaranteed to exist in
    # every runtime; raising SystemExit(1) is the portable equivalent.
    raise SystemExit(1)
# Explicitly define dataset file paths.
# CSVs are streamed straight from the Hugging Face Hub via hf:// URIs,
# which is why the HF_TOKEN check runs first.
data_files = {
    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
}
# Load dataset
try:
    print("Loading dataset...")
    # load_dataset("csv", ...) parses the remote CSVs into a DatasetDict
    # keyed by the split names given in data_files.
    dataset = load_dataset("csv", data_files=data_files)
    # Convert to pandas for the column manipulation and BM25 indexing below.
    train_data = dataset["train"].to_pandas()
    validation_data = dataset["validation"].to_pandas()
    print("Dataset loaded successfully.")
    print("Train dataset columns:", train_data.columns)
except Exception as e:
    print(f"Failed to load dataset: {e}")
    # Fall back to empty frames: the column validation below will then raise,
    # so startup still fails, but only after the load error has been printed.
    train_data = pd.DataFrame()  # Fallback to empty DataFrame
    validation_data = pd.DataFrame()
# Build the 'text' column (title + content) when the CSVs do not ship one.
if "text" not in train_data.columns:
    if "title" in train_data.columns and "content" in train_data.columns:
        # fillna("") guards against NaN cells: NaN + str yields NaN, and a
        # float NaN row would later crash str.split() during BM25 tokenization.
        train_data["text"] = (
            train_data["title"].fillna("") + " " + train_data["content"].fillna("")
        )
    else:
        raise ValueError("The 'text' column is missing, and the required 'title' and 'content' columns are not available to create it.")
# Validate that every column the classifier relies on is present up front,
# so a malformed dataset fails at startup rather than mid-request.
required_columns = ["text", "Ground_Truth_Stress", "Ground_Truth_Anxiety", "Ground_Truth_Depression", "Ground_Truth_Other_binary"]
for column in required_columns:
    if column not in train_data.columns:
        raise ValueError(f"Missing required column '{column}' in the training dataset.")
# Initialize BM25 over the training corpus.
# Tokenization is a plain whitespace split; classify_text must tokenize
# queries the same way for the scores to be meaningful.
tokenized_train = [doc.split() for doc in train_data["text"]]
bm25 = BM25Okapi(tokenized_train)
# Set OpenAI API key.
# Fail fast at startup: an absent or empty key would otherwise only surface
# on the first classification request.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise ValueError("OpenAI API key is not set. Please set it as an environment variable.")
# Few-shot classification function
def classify_text(input_text, k=20):
    """Classify a post into mental-health categories using few-shot GPT-4.

    The k most BM25-similar training posts are embedded in the prompt as
    labelled examples, and the model is asked to return a JSON object with
    four binary labels.

    Args:
        input_text: Free-form text of the post to classify.
        k: Number of BM25-retrieved examples to include in the prompt.

    Returns:
        dict: The parsed label object on success, or {"error": "<message>"}
        when the API call or JSON parsing fails.
    """
    # Tokenize the query exactly like the BM25 corpus (whitespace split).
    tokenized_text = input_text.split()
    # Score every training document and keep the k best matches.
    scores = bm25.get_scores(tokenized_text)
    top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    # Render the retrieved examples with their ground-truth labels.
    examples = "\n".join(
        f"Example {i+1}:\nText: {train_data.iloc[idx]['text']}\nClassification: "
        f"Stress={train_data.iloc[idx]['Ground_Truth_Stress']}, "
        f"Anxiety={train_data.iloc[idx]['Ground_Truth_Anxiety']}, "
        f"Depression={train_data.iloc[idx]['Ground_Truth_Depression']}, "
        f"Other={train_data.iloc[idx]['Ground_Truth_Other_binary']}\n"
        for i, idx in enumerate(top_k_indices)
    )
    # BUG FIX: the previous prompt requested a Markdown bullet list while the
    # code parsed the reply with json.loads(), so parsing always failed and
    # the UI only ever saw the error dict. Request a bare JSON object instead.
    prompt = f"""
You are a mental health specialist. Classify the text into Stress, Anxiety, Depression, or Other:
### Examples:
{examples}
### Text to Classify:
"{input_text}"
### Output Format:
Respond with ONLY a JSON object (no prose, no code fences), exactly of the form:
{{"Ground_Truth_Stress": 0 or 1, "Ground_Truth_Anxiety": 0 or 1, "Ground_Truth_Depression": 0 or 1, "Ground_Truth_Other_binary": 0 or 1}}
"""
    try:
        # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface and
        # raises on openai>=1.0 — confirm the pinned openai version.
        response = openai.ChatCompletion.create(
            messages=[
                {"role": "system", "content": "You are a mental health specialist."},
                {"role": "user", "content": prompt},
            ],
            model="gpt-4",
            temperature=0,
        )
        results = response.choices[0].message.content.strip()
        # Defensively strip Markdown code fences the model may still add.
        if results.startswith("```"):
            results = results.strip("`").strip()
            if results.lower().startswith("json"):
                results = results[4:].strip()
        return json.loads(results)
    except Exception as e:
        # Surface any API/parsing failure to the UI instead of crashing Gradio.
        return {"error": str(e)}
# Gradio UI: a free-text box in, the raw JSON label dict out.
interface = gr.Interface(
    classify_text,
    gr.Textbox(lines=5, placeholder="Enter text for classification..."),
    "json",
    title="Mental Health Text Classifier",
    description="Classify text into Stress, Anxiety, Depression, or Other using BM25 and GPT-4.",
)

# Launch only when run directly; importers (e.g. Spaces) just need `interface`.
if __name__ == "__main__":
    interface.launch()
|