Spaces:

UppsalaNLP
/

swedish-causality-detection

Sleeping

App Files Files Community

swedish-causality-detection / app.py

birgermoell

Upload app.py with huggingface_hub

0bcc156 verified 4 months ago

raw

history blame contribute delete

5.23 kB

	#!/usr/bin/env python3
	"""Swedish Causality Detection - HuggingFace Space"""

	import gradio as gr
	import numpy as np
	from sentence_transformers import SentenceTransformer
	from sklearn.linear_model import LogisticRegression
	from datasets import load_dataset
	import pickle
	import os

	# Global model variables
	classifier = None
	embedder = None

	def load_models():
	    """Load the sentence embedder and load or train the causality classifier.

	    Populates the module-level ``embedder`` and ``classifier`` globals.
	    The classifier is read from ``causality_classifier.pkl`` when that file
	    exists and can be unpickled; otherwise a logistic regression is trained
	    on the ``train`` split of ``UppsalaNLP/swedish-causality-binary`` and
	    cached to that path.
	    """
	    global classifier, embedder

	    # Load embedding model
	    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

	    model_path = "causality_classifier.pkl"

	    if os.path.exists(model_path):
	        try:
	            # SECURITY: unpickling can execute arbitrary code. This file must
	            # only ever be the cache written by this function, never
	            # user-supplied data.
	            with open(model_path, 'rb') as f:
	                cached = pickle.load(f)
	        except (pickle.UnpicklingError, EOFError) as err:
	            # A truncated or corrupt cache must not block startup; fall
	            # through and rebuild it from the dataset.
	            print(f"Could not load cached classifier ({err!r}); retraining...")
	        else:
	            classifier = cached
	            return

	    # Train classifier on the dataset
	    print("Training classifier...")
	    dataset = load_dataset("UppsalaNLP/swedish-causality-binary")

	    # Materialise the columns as plain lists before handing them to
	    # encode() / fit(), rather than relying on the dataset's column type.
	    train_texts = list(dataset['train']['target_sentence'])
	    train_labels = list(dataset['train']['label'])

	    # Generate embeddings
	    train_embeddings = embedder.encode(train_texts, show_progress_bar=True)

	    # Train logistic regression
	    model = LogisticRegression(max_iter=1000, random_state=42)
	    model.fit(train_embeddings, train_labels)
	    classifier = model

	    # Save model: write to a temporary file and rename it into place so an
	    # interrupted write never leaves a half-written cache at model_path.
	    tmp_path = model_path + ".tmp"
	    with open(tmp_path, 'wb') as f:
	        pickle.dump(classifier, f)
	    os.replace(tmp_path, model_path)

	    print("Classifier trained and saved!")

	def detect_causality(text: str) -> dict:
	    """Return causal / non-causal probabilities for a piece of text.

	    Relies on the module-level ``embedder`` and ``classifier`` globals.

	    Args:
	        text: Sentence to classify. ``None``, empty and whitespace-only
	            input is treated as "nothing to classify".

	    Returns:
	        A dict with the keys ``"Causal"`` and ``"Non-causal"`` mapped to
	        floats. Both values are ``0.0`` when there is no text to classify.
	    """
	    # Guard against None as well as blank strings so a missing value does
	    # not raise AttributeError on .strip().
	    if not text or not text.strip():
	        return {"Causal": 0.0, "Non-causal": 0.0}

	    # Generate embedding (encode() receives a one-element batch)
	    embedding = embedder.encode([text])

	    # Get prediction probabilities for the single input row
	    probs = classifier.predict_proba(embedding)[0]

	    # NOTE(review): assumes column 0 is the non-causal class and column 1
	    # the causal class -- confirm against classifier.classes_.
	    return {
	        "Non-causal": float(probs[0]),
	        "Causal": float(probs[1]),
	    }

	def analyze_text(text: str) -> tuple:
	    """Classify text and build a human-readable verdict.

	    Args:
	        text: Sentence to analyze. ``None``, empty and whitespace-only
	            input yields a prompt message instead of a classification.

	    Returns:
	        A ``(scores, message)`` tuple. ``scores`` is the dict returned by
	        ``detect_causality`` (or ``{}`` when there is no text), and
	        ``message`` is the verdict string including the confidence.
	    """
	    # Guard against None as well as blank strings so a missing value does
	    # not raise AttributeError on .strip().
	    if not text or not text.strip():
	        return {}, "Please enter some text to analyze."

	    # Get causality scores
	    scores = detect_causality(text)

	    # Determine result; a tie is reported as "no causal relation"
	    is_causal = scores["Causal"] > scores["Non-causal"]
	    confidence = max(scores.values())

	    if is_causal:
	        result = (
	            f"Causal relation detected (confidence: {confidence:.1%})\n\n"
	            "This sentence appears to express a cause-effect relationship."
	        )
	    else:
	        result = (
	            f"No causal relation detected (confidence: {confidence:.1%})\n\n"
	            "This sentence does not appear to express a cause-effect relationship."
	        )

	    return scores, result

	# Sample Swedish sentences; each row is a one-element list holding one sentence
	EXAMPLES = [
	    [sentence]
	    for sentence in (
	        "Den lägre produktiviteten kan bero på att kvinnor har kortare arbetslivserfarenhet än män.",
	        "Klimatförändringarna leder till ökade havsnivåer och extrema väderhändelser.",
	        "Sverige är ett land i Nordeuropa med ungefär 10 miljoner invånare.",
	        "Regeringen presenterade sin budget för nästa år.",
	        "Bristen på utbildning orsakar hög arbetslöshet bland unga.",
	        "Stockholm är Sveriges huvudstad och största stad.",
	    )
	]

	# Load models at startup.
	# This runs at import time, before the interface below is built, so the
	# module-level embedder and classifier are populated by load_models() first.
	print("Loading models...")
	load_models()
	print("Models loaded!")

	# Create Gradio interface
	# Builds the Blocks app bound to `demo`: an intro Markdown, a text input with
	# an "Analyze" button, a Label for the scores, a Markdown for the textual
	# result, a set of example sentences, and an "About" footer.
	with gr.Blocks(title="Swedish Causality Detection", theme=gr.themes.Soft()) as demo:
	gr.Markdown("""
	# Swedish Causality Detection

	Detect causal relations in Swedish text using machine learning.

	Author: Birger Moëll, Uppsala NLP

	This tool classifies whether a Swedish sentence expresses a cause-effect relationship.
	The model is trained on the [Swedish Causality Binary Dataset](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary).
	""")

	# Input widgets (wider column, scale=2) and score display (scale=1)
	with gr.Row():
	with gr.Column(scale=2):
	text_input = gr.Textbox(
	label="Swedish Text",
	placeholder="Enter a Swedish sentence to analyze...",
	lines=3
	)
	analyze_btn = gr.Button("Analyze", variant="primary")

	with gr.Column(scale=1):
	label_output = gr.Label(label="Causality Score")

	# Receives the verdict string returned by analyze_text
	result_output = gr.Markdown(label="Analysis Result")

	# The example rows from EXAMPLES are wired to text_input
	gr.Examples(
	examples=EXAMPLES,
	inputs=text_input,
	label="Example Sentences (click to try)"
	)

	# The button's click event and the textbox's submit event both run
	# analyze_text on text_input and route its two return values to
	# label_output and result_output, in that order.
	analyze_btn.click(
	fn=analyze_text,
	inputs=text_input,
	outputs=[label_output, result_output]
	)

	text_input.submit(
	fn=analyze_text,
	inputs=text_input,
	outputs=[label_output, result_output]
	)

	# Static footer: model description, dataset link, citation and links
	gr.Markdown("""
	---

	## About

	This classifier uses sentence embeddings from `paraphrase-multilingual-MiniLM-L12-v2`
	and logistic regression trained on Swedish government reports annotated for causality.

	Dataset: [UppsalaNLP/swedish-causality-binary](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary)

	Citation:
	```
	Dürlich et al. (2022). Cause and Effect in Governmental Reports:
	Two Data Sets for Causality Detection in Swedish.
	```

	Links:
	- [Uppsala NLP](https://huggingface.co/UppsalaNLP)
	- [GitHub Repository](https://github.com/UppsalaNLP/Swedish-Causality-Datasets)
	""")

	# Launch the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()