Spaces:

Aryan047
/

Dynamic-event-detector

Sleeping

App Files Files Community

Aryan047 committed on 21 days ago

Commit

f9e8817

verified ·

1 Parent(s): 61cbfa4

Deploy meme-vs-event Streamlit app

Browse files

Files changed (4) hide show

Dockerfile +18 -14
README.md +30 -13
app.py +250 -0
requirements.txt +5 -3

Dockerfile CHANGED Viewed

@@ -1,20 +1,24 @@
-FROM python:3.13.5-slim
-WORKDIR /app
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
-COPY src/ ./src/
-RUN pip3 install -r requirements.txt
-EXPOSE 8501
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+# Base image: slim Python 3.11.
+FROM python:3.11-slim
+# Unbuffered stdout/stderr, no pip cache or pip version check, and the
+# Hugging Face cache placed inside the non-root user's home directory.
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    HF_HOME=/home/user/.cache/huggingface
+# Create a non-root account (uid 1000) with a home directory and switch to it.
+RUN useradd -m -u 1000 user
+USER user
+# Packages are installed with `pip install --user` below, so the per-user
+# bin directory is put on PATH to make the `streamlit` command resolvable.
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /home/user/app
+# Copy and install the requirements before the application code so the
+# dependency layer is reused when only app.py changes.
+COPY --chown=user:user requirements.txt .
+RUN pip install --user --no-cache-dir -r requirements.txt
+COPY --chown=user:user app.py .
+# Port the Streamlit server listens on (same value as --server.port below).
+EXPOSE 7860
+# Start Streamlit headless on all interfaces with usage-stats collection off.
+CMD ["streamlit", "run", "app.py", \
+     "--server.port=7860", \
+     "--server.address=0.0.0.0", \
+     "--server.headless=true", \
+     "--browser.gatherUsageStats=false"]

README.md CHANGED Viewed

@@ -1,20 +1,37 @@
 ---
-title: Dynamic Event Detector
-emoji: 🚀
-colorFrom: red
-colorTo: red
 sdk: docker
-app_port: 8501
-tags:
-- streamlit
 pinned: false
-short_description: This model distinguishes between a "real event" and "meme"
-license: mit
 ---
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

 ---
+title: Meme vs Real Event Classifier
+colorFrom: blue
+colorTo: indigo
 sdk: docker
+app_port: 7860
 pinned: false
+license: apache-2.0
 ---
+# Meme vs Real Event Tweet Classifier
+Streamlit demo for a fine-tuned `bert-base-uncased` model that classifies a
+tweet as a **meme / low-signal post** or a **real-world event**.
+The model weights live in a separate Hugging Face model repo and are loaded
+at startup via `transformers.AutoModelForSequenceClassification.from_pretrained`.
+## Configure the model repo
+The app reads the model id from the `MODEL_ID` environment variable, defaulting
+to `Aryan047/Dynamic-event-detector`. To override it in the Space UI, go to
+**Settings -> Variables and secrets** and set `MODEL_ID` to any other model repo.
+## Local development
+```bash
+pip install -r requirements.txt
+streamlit run app.py
+```
+## Files
+- `app.py` - Streamlit application (single-tweet tab, batch-CSV tab)
+- `requirements.txt` - runtime dependencies
+- `upload_model.py` - one-shot helper to push `artifacts_meme_vs_event/bert_classifier/`
+  to a new Hugging Face model repo. Not used by the Space itself.

app.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""Streamlit Space: Meme vs Real Event tweet classifier.
+Loads a fine-tuned bert-base-uncased from the Hugging Face Hub and exposes:
+  - Single-tweet tab: live prediction + probability bar chart
+  - Batch CSV tab:    upload a CSV with a `text` column, download predictions
+Matching preprocessing (same regex as the training notebook) is reapplied
+so results mirror what the notebook produces locally.
+"""
+from __future__ import annotations
+import io
+import os
+import re
+import numpy as np
+import pandas as pd
+import streamlit as st
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+MODEL_ID = os.environ.get("MODEL_ID", "Aryan047/Dynamic-event-detector")  # Hub repo id; the MODEL_ID env var overrides the default
+MAX_LENGTH = 128  # passed as max_length to the tokenizer (with truncation=True)
+LABELS = {0: "meme", 1: "real_event"}  # predicted class index -> label name
+_URL_RE = re.compile(r"https?://\S+|www\.\S+")  # http(s):// or www. followed by non-space characters
+_MENTION_RE = re.compile(r"@\w+")  # "@" followed by word characters
+_HASHTAG_RE = re.compile(r"#")  # only the "#" character itself; the tag word is kept
+_NON_WORD_RE = re.compile(r"[^a-z0-9\s]")  # anything other than a-z, 0-9 or whitespace (applied after lowercasing)
+_WS_RE = re.compile(r"\s+")  # runs of whitespace, collapsed to one space by clean_tweet
+def clean_tweet(text: str) -> str:
+    if not isinstance(text, str):
+        return ""
+    t = text.lower()
+    t = _URL_RE.sub(" ", t)
+    t = _MENTION_RE.sub(" ", t)
+    t = _HASHTAG_RE.sub(" ", t)
+    t = _NON_WORD_RE.sub(" ", t)
+    t = _WS_RE.sub(" ", t).strip()
+    return t
+@st.cache_resource(show_spinner="Loading model from Hugging Face Hub...")
+def load_model(model_id: str):
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForSequenceClassification.from_pretrained(model_id)
+    model.eval()
+    return tokenizer, model
+@torch.no_grad()
+def predict_one(tokenizer, model, text: str) -> dict:
+    cleaned = clean_tweet(text)
+    if not cleaned:
+        return {
+            "label": "meme",
+            "confidence": 0.0,
+            "prob_meme": 1.0,
+            "prob_real_event": 0.0,
+            "clean_text": "",
+        }
+    enc = tokenizer(cleaned, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
+    probs = F.softmax(model(**enc).logits[0], dim=-1).numpy()
+    pred = int(np.argmax(probs))
+    return {
+        "label": LABELS[pred],
+        "confidence": float(probs[pred]),
+        "prob_meme": float(probs[0]),
+        "prob_real_event": float(probs[1]),
+        "clean_text": cleaned,
+    }
+@torch.no_grad()
+def predict_many(tokenizer, model, texts: list[str], batch_size: int = 32) -> pd.DataFrame:
+    cleaned = [clean_tweet(t) for t in texts]
+    labels, confs, p0s, p1s = [], [], [], []
+    progress = st.progress(0.0, text="Running predictions...")
+    total = max(len(cleaned), 1)
+    for i in range(0, len(cleaned), batch_size):
+        chunk = cleaned[i : i + batch_size]
+        empty_mask = [len(c) == 0 for c in chunk]
+        model_inputs = [c if c else "empty" for c in chunk]
+        enc = tokenizer(
+            model_inputs,
+            truncation=True,
+            padding=True,
+            max_length=MAX_LENGTH,
+            return_tensors="pt",
+        )
+        probs = F.softmax(model(**enc).logits, dim=-1).numpy()
+        for j, p in enumerate(probs):
+            if empty_mask[j]:
+                labels.append("meme")
+                confs.append(0.0)
+                p0s.append(1.0)
+                p1s.append(0.0)
+            else:
+                pred = int(np.argmax(p))
+                labels.append(LABELS[pred])
+                confs.append(float(p[pred]))
+                p0s.append(float(p[0]))
+                p1s.append(float(p[1]))
+        progress.progress(min((i + batch_size) / total, 1.0))
+    progress.empty()
+    return pd.DataFrame(
+        {
+            "text": texts,
+            "clean_text": cleaned,
+            "label": labels,
+            "confidence": confs,
+            "prob_meme": p0s,
+            "prob_real_event": p1s,
+        }
+    )
+def render_single_tab(tokenizer, model) -> None:
+    st.subheader("Classify a single tweet")
+    st.caption("Paste any tweet-style text. Labels: `meme` or `real_event`.")
+    default_example = "Massive 6.5 earthquake just rocked Istanbul, buildings swaying"
+    text = st.text_area("Tweet text", value=default_example, height=120)
+    if st.button("Predict", type="primary"):
+        if not text.strip():
+            st.warning("Please enter some text.")
+            return
+        result = predict_one(tokenizer, model, text)
+        col1, col2 = st.columns(2)
+        col1.metric("Predicted label", result["label"])
+        col2.metric("Confidence", f"{result['confidence']:.2%}")
+        st.markdown("**Class probabilities**")
+        st.bar_chart(
+            pd.DataFrame(
+                {"probability": [result["prob_meme"], result["prob_real_event"]]},
+                index=["meme", "real_event"],
+            )
+        )
+        with st.expander("Details"):
+            st.write({"cleaned_text": result["clean_text"]})
+def render_batch_tab(tokenizer, model) -> None:
+    st.subheader("Classify a CSV of tweets")
+    st.caption("Upload a CSV with a `text` column. Predictions are added as new columns.")
+    uploaded = st.file_uploader("CSV file", type=["csv"])
+    if uploaded is None:
+        st.info("Waiting for a CSV upload...")
+        return
+    try:
+        df = pd.read_csv(uploaded)
+    except Exception as exc:
+        st.error(f"Could not read CSV: {exc}")
+        return
+    if "text" not in df.columns:
+        st.error(f"CSV must contain a `text` column. Found: {list(df.columns)}")
+        return
+    max_rows = 5000
+    if len(df) > max_rows:
+        st.warning(f"CSV has {len(df)} rows. Truncating to first {max_rows} for the demo.")
+        df = df.head(max_rows).copy()
+    st.write(f"Loaded {len(df)} rows. Preview:")
+    st.dataframe(df.head(5))
+    if st.button("Run batch prediction", type="primary"):
+        out = predict_many(tokenizer, model, df["text"].tolist())
+        merged = pd.concat(
+            [df.reset_index(drop=True).drop(columns=["text"]), out.reset_index(drop=True)],
+            axis=1,
+        )
+        st.success(f"Classified {len(merged)} tweets.")
+        st.dataframe(merged.head(50))
+        counts = merged["label"].value_counts().reindex(["meme", "real_event"], fill_value=0)
+        st.markdown("**Label distribution**")
+        st.bar_chart(counts)
+        buf = io.StringIO()
+        merged.to_csv(buf, index=False)
+        st.download_button(
+            label="Download predictions CSV",
+            data=buf.getvalue(),
+            file_name="meme_vs_event_predictions.csv",
+            mime="text/csv",
+        )
+def main() -> None:
+    st.set_page_config(
+        page_title="Meme vs Real Event Classifier",
+        page_icon="",
+        layout="centered",
+    )
+    st.title("Meme vs Real Event Tweet Classifier")
+    st.caption(
+        f"Fine-tuned `bert-base-uncased` loaded from "
+        f"[`{MODEL_ID}`](https://huggingface.co/{MODEL_ID})."
+    )
+    tokenizer, model = load_model(MODEL_ID)
+    single_tab, batch_tab, about_tab = st.tabs(["Single tweet", "Batch CSV", "About"])
+    with single_tab:
+        render_single_tab(tokenizer, model)
+    with batch_tab:
+        render_batch_tab(tokenizer, model)
+    with about_tab:
+        st.markdown(
+            """
+            **Pipeline**: tweets were embedded with `all-mpnet-base-v2`, clustered with
+            BERTopic, cross-checked against the GDELT DOC 2.0 API with a lifespan-aware
+            rule, and the resulting `(tweet, label)` pairs were used to fine-tune
+            `bert-base-uncased`.
+            - **Input**: raw tweet text
+            - **Preprocessing**: lowercase, strip URLs / mentions / hashtag chars / non-word
+            - **Max length**: 128 tokens
+            - **Labels**: `0 = meme`, `1 = real_event`
+            """
+        )
+if __name__ == "__main__":
+    main()

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
-altair
-pandas
-streamlit

+streamlit>=1.36.0
+torch>=2.1.0
+transformers>=4.40.0
+pandas>=2.0.0
+numpy>=1.24.0