kluvin committed on
Commit
73bcf15
·
verified ·
1 Parent(s): 7707108

Upload folder using huggingface_hub

Browse files
.dockerignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.so
7
+ *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ .venv/
12
+ venv/
13
+ .git/
14
+ .gitignore
15
+ *.md
16
+ .DS_Store
17
+ *.log
.model_cache/ml_models.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5933762e9f06076ee9bc264398641cc4adaf61008402d294826990beb42140b
3
+ size 148608363
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use Python 3.13 slim image
FROM python:3.13-slim

# Set working directory
WORKDIR /app

# Install uv for faster dependency management
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv

# NOTE: the former `COPY ../pyproject.toml /tmp/pyproject.toml` step was
# removed. COPY sources must live inside the build context, and the copied
# file was never used: dependencies are installed explicitly below.

# Install dependencies.
# app.py imports scikit-learn, polars and joblib in addition to the
# Flask/transformers stack, so they have to be installed here as well.
RUN uv pip install --system --no-cache-dir \
    flask gunicorn transformers torch accelerate huggingface-hub \
    scikit-learn polars joblib

# Copy web app files
COPY . .

# Download model at build time (optional - caches model in image)
# Comment out to download on first run instead
RUN python -c "from transformers import pipeline; pipeline('sentiment-analysis', model='kluvin/bertweet-tweet-sentiment')"

# Expose port (7860 for HF Spaces, 5000 for local)
EXPOSE 7860

# Set environment variables
ENV FLASK_APP=app.py
ENV PYTHONUNBUFFERED=1
ENV PORT=7860

# Run the app with gunicorn for production
# --preload loads models once before forking workers (saves memory)
# Shell form is kept so $PORT is expanded; `exec` makes gunicorn replace the
# shell so it receives SIGTERM directly when the container is stopped.
CMD exec gunicorn --preload --bind 0.0.0.0:$PORT --workers 2 --timeout 120 app:app
README.md CHANGED
@@ -1,11 +1,36 @@
1
  ---
2
- title: 158 Mls Tweet Sentiment
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: docker
 
7
  pinned: false
8
- short_description: Sentiment Analysis over bertweet + ML
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Tweet Sentiment Classifier
3
+ emoji: 🐦
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
 
9
  ---
10
 
11
+ # Tweet Sentiment Classifier
12
+
13
+ Compare 5 different sentiment analysis models side-by-side on tweet text:
14
+
15
+ - **BERTweet** (Transformer) - Fine-tuned from `vinai/bertweet-base`
16
+ - **Decision Tree** (TF-IDF)
17
+ - **Random Forest** (TF-IDF)
18
+ - **Logistic Regression** (TF-IDF)
19
+ - **Linear SVM** (TF-IDF)
20
+
21
+ ## How it works
22
+
23
+ Type or paste a tweet and click "Analyze Sentiment" to see predictions from all 5 models simultaneously.
24
+
25
+ ## First run
26
+
27
+ Models are loaded when the app starts: a cached copy is used when available; otherwise they are downloaded and trained first (~2-3 minutes). Once loaded, every request is served from the in-memory models.
28
+
29
+ ## Tech Stack
30
+
31
+ - **Frontend**: HTMX for reactive updates
32
+ - **Backend**: Flask
33
+ - **Models**: HuggingFace Transformers (BERTweet) + scikit-learn
34
+ - **Data**: SetFit/tweet_sentiment_extraction dataset
35
+
36
+ Built as a class project for DAT158 Machine Learning.
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, render_template
2
+ from transformers import pipeline
3
+ from sklearn.pipeline import Pipeline
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.svm import LinearSVC
6
+ from sklearn.ensemble import RandomForestClassifier
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.tree import DecisionTreeClassifier
9
+ import polars as pl
10
+ import joblib
11
+ from pathlib import Path
12
+ import logging
13
+ import os
14
+
15
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Maps the class id (as a string) reported by a model to its sentiment name.
CLASS_ID_TO_SENTIMENT = {
    "0": "negative",
    "1": "neutral",
    "2": "positive",
}

# Cache under $HF_HOME when that variable is set, otherwise under the
# current working directory.
CACHE_DIR = Path(os.getenv("HF_HOME", ".")) / ".model_cache"
# parents=True: HF_HOME may name a directory that does not exist yet, in which
# case a plain mkdir() would raise FileNotFoundError at import time.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Loaded once at import time so every request reuses the same pipeline.
logger.info("Loading BERTweet from HuggingFace Hub...")
bertweet_pipeline = pipeline("sentiment-analysis", model="kluvin/bertweet-tweet-sentiment")
logger.info("BERTweet loaded successfully")
37
+
38
def _tfidf_pipeline(classifier, max_features=2000):
    """Build an unfitted TF-IDF -> classifier pipeline with English stop words removed."""
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words="english")
    return Pipeline([
        ("tfidf", vectorizer),
        ("clf", classifier),
    ])


# Unfitted scikit-learn pipelines, keyed by the name shown in the UI.
model_configs = {
    "Decision Tree": _tfidf_pipeline(
        DecisionTreeClassifier(max_depth=10, random_state=42)
    ),
    # The random forest uses a smaller vocabulary than the other models.
    "Random Forest": _tfidf_pipeline(
        RandomForestClassifier(n_estimators=100, random_state=42),
        max_features=500,
    ),
    "Logistic Regression": _tfidf_pipeline(
        LogisticRegression(max_iter=1000, random_state=42)
    ),
    "Linear SVM": _tfidf_pipeline(
        LinearSVC(random_state=42)
    ),
}
57
+
58
# Fitted scikit-learn pipelines keyed by display name. Filled from the on-disk
# cache when it can be read, otherwise trained from scratch below.
sklearn_pipelines = {}
cache_file = CACHE_DIR / "ml_models.joblib"

if cache_file.exists():
    logger.info("Loading cached ML models...")
    try:
        # NOTE(security): joblib.load unpickles the file and can therefore run
        # arbitrary code. Only load cache files that come from a trusted source.
        sklearn_pipelines = joblib.load(cache_file)
        logger.info("✓ Cached models loaded successfully!")
    except Exception as e:
        # Startup boundary: an unreadable or incompatible cache must not stop
        # the app; fall through to retraining instead.
        logger.error("Failed to load cache: %s", e)
        logger.info("Will retrain models...")

if not sklearn_pipelines:
    logger.info("Loading training data and training ML models...")
    df = pl.read_ndjson('hf://datasets/SetFit/tweet_sentiment_extraction/train.jsonl')
    X_train = df['text'].to_list()
    y_train = df['label'].to_list()

    logger.info("Training models...")
    for model_name, sklearn_pipeline in model_configs.items():
        logger.info(" Training %s...", model_name)
        sklearn_pipeline.fit(X_train, y_train)
        sklearn_pipelines[model_name] = sklearn_pipeline

    logger.info("Saving models to cache...")
    try:
        joblib.dump(sklearn_pipelines, cache_file)
    except OSError as e:
        # Caching is best-effort: the models are already trained and usable in
        # memory, so an unwritable cache directory should not crash startup.
        logger.warning("Could not write model cache to %s: %s", cache_file, e)
    else:
        logger.info("✓ Models cached at %s", cache_file)

logger.info("All models loaded and ready!")
88
+
89
+ def render_model_result(model_name: str, sentiment_name: str, probability: float | None) -> str:
90
+ probability_text = f"Probability: {probability:.2%}" if probability else "N/A"
91
+ return f'''
92
+ <div class="model-result {sentiment_name}">
93
+ <h3>{model_name}</h3>
94
+ <p class="sentiment">{sentiment_name.capitalize()}</p>
95
+ <p class="confidence">{probability_text}</p>
96
+ </div>
97
+ '''
98
+
99
@app.route('/')
def home():
    """Serve the landing page containing the tweet input form."""
    page = render_template('index.html')
    return page
102
+
103
def _prediction_confidence(sklearn_pipeline, inputs):
    """Return a confidence score for the pipeline's prediction on ``inputs[0]``.

    Uses ``predict_proba`` when the final classifier provides it, falls back to
    a score derived from ``decision_function``, and returns ``None`` when the
    classifier offers neither.
    """
    classifier = sklearn_pipeline.named_steps['clf']
    if hasattr(classifier, 'predict_proba'):
        class_probabilities = sklearn_pipeline.predict_proba(inputs)[0]
        return class_probabilities.max()
    if hasattr(classifier, 'decision_function'):
        decision_scores = sklearn_pipeline.decision_function(inputs)[0]
        # NOTE(review): this is a heuristic, not a calibrated probability, and
        # it is derived from the *lowest* class score rather than the score of
        # the predicted class. Kept as-is to preserve the displayed values.
        return 1.0 / (1.0 + abs(decision_scores.min()))
    return None


@app.route('/classify', methods=['POST'])
def clasify():
    """Classify the submitted text with every model and return HTML cards.

    Reads the ``text`` form field, runs it through the BERTweet pipeline and
    each fitted scikit-learn pipeline, and returns an HTML fragment with one
    result card per model. Blank input or any failure yields an error fragment
    instead of an exception, so the fragment can always be swapped into the page.

    The function name (and therefore the Flask endpoint name) keeps its
    original spelling so existing references to the endpoint stay valid.
    """
    from html import escape  # local import keeps this block self-contained

    try:
        # .get() so a request without a "text" field is treated as blank input
        # rather than raising and being reported as an internal error.
        text_input = request.form.get('text', '')

        if not text_input.strip():
            return '''
            <div class="result error">
                <h2>Error: Please enter some text</h2>
            </div>
            '''

        logger.info("Classifying: %s...", text_input[:50])

        result_cards = []

        pipeline_output = bertweet_pipeline(text_input)[0]
        predicted_class_id = pipeline_output['label']
        probability = pipeline_output['score']
        sentiment_name = CLASS_ID_TO_SENTIMENT[predicted_class_id]
        result_cards.append(
            render_model_result("BERTweet (Transformer)", sentiment_name, probability)
        )

        inputs = [text_input]
        for model_name, sklearn_pipeline in sklearn_pipelines.items():
            predicted_class = sklearn_pipeline.predict(inputs)[0]
            probability = _prediction_confidence(sklearn_pipeline, inputs)
            sentiment_name = CLASS_ID_TO_SENTIMENT[str(predicted_class)]
            result_cards.append(
                render_model_result(model_name, sentiment_name, probability)
            )

        results_html = "".join(result_cards)
        return f'<div class="results-grid">{results_html}</div>'
    except Exception as e:
        # Request boundary: log the full traceback, show a short message.
        logger.error("Classification error: %s", e, exc_info=True)
        # Escape the message: exception text may echo parts of the request and
        # must not be injected into the page as raw HTML.
        return f'''
        <div class="result error">
            <h2>Error: {escape(str(e))}</h2>
        </div>
        '''
152
+
153
if __name__ == "__main__":
    # Local development entry point only; the Docker image starts the app
    # through gunicorn instead.
    # The server below always runs with debug=True, so enable DEBUG logging
    # unconditionally. Checking app.debug here would read the flag before
    # app.run() sets it, and the DEBUG level would normally never be applied.
    logger.setLevel(logging.DEBUG)
    app.run(debug=True)
docker-compose.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Local development stack: builds the image from this directory and publishes
# the app on host port 5000.
version: '3.8'

services:
  web:
    build: .
    ports:
      # host 5000 -> container 7860 (the value of PORT below)
      - "5000:7860"
    environment:
      PYTHONUNBUFFERED: "1"
      PORT: "7860"
    restart: unless-stopped
static/images/bird.png ADDED
static/index.css ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Page shell: gradient background with the main card centred in the viewport. */
body {
    font-family: 'Segoe UI', sans-serif;
    background: linear-gradient(135deg, #74ebd5, #9face6);
    display: flex;
    justify-content: center;
    align-items: center;
    /* min-height instead of height: a tall results grid can grow the page and
       scroll, rather than being clipped above the top of a fixed-height box. */
    min-height: 100vh;
    margin: 0;
}

/* Main white card holding the form and the results. */
.container {
    background-color: white;
    padding: 40px;
    border-radius: 15px;
    box-shadow: 0 8px 20px rgba(0, 0, 0, 0.2);
    width: 90%;
    max-width: 1200px;
    text-align: center;
}

textarea {
    width: 100%;
    /* Include padding and border in the 100% width so the textarea does not
       overflow the container. */
    box-sizing: border-box;
    padding: 10px;
    font-size: 1rem;
    border-radius: 8px;
    border: 1px solid #ccc;
    resize: none;
}

button {
    background-color: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    margin-top: 10px;
    border-radius: 8px;
    font-size: 1rem;
    cursor: pointer;
}

button:hover {
    background-color: #0056b3;
}

.result {
    margin-top: 20px;
    padding: 10px;
    border-radius: 8px;
}

/* Sentiment text colours */
.positive {
    color: #00953e;
}

.negative {
    color: #b00400;
}

.neutral {
    color: #c99e00;
}

/* Small bird icon in the page heading */
.bird {
    width: 25px;
}

/* Results grid layout */
.results-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
    gap: 15px;
    margin-top: 20px;
}

/* One card per model */
.model-result {
    padding: 15px;
    border-radius: 8px;
    border: 2px solid;
    text-align: center;
}

.model-result h3 {
    margin: 0 0 10px 0;
    font-size: 1rem;
    color: #333;
}

.model-result .sentiment {
    font-size: 1.3rem;
    font-weight: bold;
    margin: 5px 0;
}

.model-result .confidence {
    font-size: 0.9rem;
    color: #666;
    margin: 5px 0;
}

/* Sentiment-specific background colors */
.model-result.positive {
    background-color: #d4edda;
    border-color: #28a745;
}

.model-result.positive .sentiment {
    color: #00953e;
}

.model-result.negative {
    background-color: #f8d7da;
    border-color: #dc3545;
}

.model-result.negative .sentiment {
    color: #b00400;
}

.model-result.neutral {
    background-color: #fff3cd;
    border-color: #ffc107;
}

.model-result.neutral .sentiment {
    color: #c99e00;
}

/* Error box returned by the /classify endpoint */
.result.error {
    background-color: #f8d7da;
    border: 2px solid #dc3545;
    padding: 15px;
    border-radius: 8px;
    text-align: center;
}
templates/index.html ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Tweet Sentiment Classifier</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='index.css') }}">
    <script src="https://unpkg.com/htmx.org@1.9.10"></script>
</head>
<body>
    <div class="container">
        {# url_for already returns the complete static path, so it is used
           directly; prefixing it with "../" produced a malformed URL. The icon
           is decorative, hence the empty alt text. #}
        <h1><img class="bird" src="{{ url_for('static', filename='images/bird.png') }}" alt=""> Tweet Sentiment Classifier</h1>
        <p style="text-align: center; color: #666; margin-bottom: 20px;">
        </p>
        {# htmx posts the form to /classify and swaps the returned HTML
           fragment into #result. #}
        <form hx-post="/classify" hx-target="#result">
            <textarea name="text" rows="4" placeholder="Type or paste a tweet..." required></textarea>
            <br>
            <button type="submit">Analyze Sentiment</button>
        </form>

        <div id="result"></div>
    </div>
</body>
</html>