| |
| """Swedish Causality Detection - HuggingFace Space""" |
|
|
| import gradio as gr |
| import numpy as np |
| from sentence_transformers import SentenceTransformer |
| from sklearn.linear_model import LogisticRegression |
| from datasets import load_dataset |
| import pickle |
| import os |
|
|
| |
# Module-level model handles. Both start out empty and are assigned by
# load_models(), which declares them ``global``.
embedder = None    # sentence-embedding model
classifier = None  # causality classifier operating on the embeddings
|
|
def _load_cached_classifier(model_path):
    """Return the classifier pickled at *model_path*, or None if unusable.

    A missing file and an unreadable (truncated / incompatible) file are
    both reported as None so the caller can fall back to retraining.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # acceptable because the file is a cache written by this application;
        # never point model_path at data from an untrusted source.
        with open(model_path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return None
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError) as exc:
        # These are the errors the pickle module documents for damaged or
        # incompatible data; treat the cache as absent instead of crashing.
        print(f"Ignoring unreadable classifier cache {model_path!r}: {exc}")
        return None


def _save_classifier(model, model_path):
    """Pickle *model* to *model_path* without leaving a partial file behind."""
    tmp_path = f"{model_path}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(model, f)
    # os.replace swaps the finished file into place in a single step, so an
    # interrupted run cannot leave a truncated cache at model_path.
    os.replace(tmp_path, model_path)


def load_models() -> None:
    """Load the sentence embedder and load or train the causality classifier.

    Assigns the module-level ``embedder`` and ``classifier`` globals.

    The classifier is read from ``causality_classifier.pkl`` in the current
    working directory when a usable cache exists. Otherwise it is trained as
    a logistic regression on embeddings of the ``target_sentence`` column of
    the ``train`` split of ``UppsalaNLP/swedish-causality-binary`` and then
    written to that cache file.
    """
    global classifier, embedder

    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    model_path = "causality_classifier.pkl"

    cached = _load_cached_classifier(model_path)
    if cached is not None:
        classifier = cached
        return

    print("Training classifier...")
    dataset = load_dataset("UppsalaNLP/swedish-causality-binary")
    train_split = dataset['train']

    # Materialise the columns as plain lists so the encoder and scikit-learn
    # receive ordinary sequences however the datasets library exposes them.
    train_texts = list(train_split['target_sentence'])
    train_labels = list(train_split['label'])

    train_embeddings = embedder.encode(train_texts, show_progress_bar=True)

    # Fixed random_state keeps retraining reproducible.
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(train_embeddings, train_labels)

    _save_classifier(classifier, model_path)

    print("Classifier trained and saved!")
|
|
def detect_causality(text: str) -> dict:
    """Score a sentence as causal vs. non-causal.

    Args:
        text: The sentence to classify. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        A dict with the keys ``"Causal"`` and ``"Non-causal"`` mapped to
        floats. Both values are ``0.0`` when there is no input; otherwise
        they are the classifier's predicted probabilities.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        return {"Causal": 0.0, "Non-causal": 0.0}

    # The embedder takes a batch, so wrap the single sentence in a list.
    embedding = embedder.encode([text])

    # First (only) row of the batch of class probabilities.
    probs = classifier.predict_proba(embedding)[0]

    # NOTE(review): column 0 is read as "non-causal" and column 1 as "causal";
    # this assumes the training labels are 0/1 with 1 = causal - confirm
    # against the dataset's label definition.
    return {
        "Non-causal": float(probs[0]),
        "Causal": float(probs[1]),
    }
|
|
def analyze_text(text: str) -> tuple:
    """Classify a sentence and build the values shown in the UI.

    Args:
        text: The sentence to analyze. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        A ``(scores, message)`` tuple. ``scores`` is the dict returned by
        ``detect_causality`` (or an empty dict when there is no input) and
        ``message`` is a Markdown string describing the verdict.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        return {}, "Please enter some text to analyze."

    scores = detect_causality(text)

    # A tie counts as non-causal because the comparison is strict.
    is_causal = scores["Causal"] > scores["Non-causal"]
    confidence = max(scores.values())

    if is_causal:
        headline = "Causal relation detected"
        explanation = "This sentence appears to express a cause-effect relationship."
    else:
        headline = "No causal relation detected"
        explanation = "This sentence does not appear to express a cause-effect relationship."

    result = f"**{headline}** (confidence: {confidence:.1%})\n\n{explanation}"

    return scores, result
|
|
| |
# Sample sentences offered in the UI. Each entry is wrapped in its own
# one-element list, i.e. one row holding a single input value.
EXAMPLES = [
    [sentence]
    for sentence in (
        "Den lägre produktiviteten kan bero på att kvinnor har kortare arbetslivserfarenhet än män.",
        "Klimatförändringarna leder till ökade havsnivåer och extrema väderhändelser.",
        "Sverige är ett land i Nordeuropa med ungefär 10 miljoner invånare.",
        "Regeringen presenterade sin budget för nästa år.",
        "Bristen på utbildning orsakar hög arbetslöshet bland unga.",
        "Stockholm är Sveriges huvudstad och största stad.",
    )
]
|
|
| |
# Load the models eagerly at import time, before the UI below is built.
print("Loading models...")
load_models()
print("Models loaded!")
|
|
| |
# Build the Gradio interface: header, input/score row, verdict text,
# clickable examples, event wiring, and an "About" footer.
with gr.Blocks(title="Swedish Causality Detection", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("""
    # Swedish Causality Detection

    Detect causal relations in Swedish text using machine learning.

    **Author:** Birger Moëll, Uppsala NLP

    This tool classifies whether a Swedish sentence expresses a cause-effect relationship.
    The model is trained on the [Swedish Causality Binary Dataset](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary).
    """)

    with gr.Row():
        # Wider left column: text entry plus the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Swedish Text",
                placeholder="Enter a Swedish sentence to analyze...",
                lines=3,
            )
            analyze_btn = gr.Button("Analyze", variant="primary")

        # Narrower right column: per-class scores.
        with gr.Column(scale=1):
            label_output = gr.Label(label="Causality Score")

    # Markdown verdict produced by analyze_text.
    result_output = gr.Markdown(label="Analysis Result")

    gr.Examples(
        examples=EXAMPLES,
        inputs=text_input,
        label="Example Sentences (click to try)",
    )

    # The button's click event and the textbox's submit event run the same
    # analysis with the same input and outputs, so register both in one loop.
    for register in (analyze_btn.click, text_input.submit):
        register(
            fn=analyze_text,
            inputs=text_input,
            outputs=[label_output, result_output],
        )

    # Footer with background, dataset, citation and links.
    gr.Markdown("""
    ---

    ## About

    This classifier uses sentence embeddings from `paraphrase-multilingual-MiniLM-L12-v2`
    and logistic regression trained on Swedish government reports annotated for causality.

    **Dataset:** [UppsalaNLP/swedish-causality-binary](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary)

    **Citation:**
    ```
    Dürlich et al. (2022). Cause and Effect in Governmental Reports:
    Two Data Sets for Causality Detection in Swedish.
    ```

    **Links:**
    - [Uppsala NLP](https://huggingface.co/UppsalaNLP)
    - [GitHub Repository](https://github.com/UppsalaNLP/Swedish-Causality-Datasets)
    """)
|
|
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|