Spaces:

poltextlab
/

babelmachine

Running

App Files Files Community

poltextlab committed on Mar 27

Commit

b1c2932

•

1 Parent(s): b62c233

first upload

Browse files

base application

Files changed (8) hide show

app.py +26 -0
interfaces/cap.py +128 -0
interfaces/emotion.py +51 -0
interfaces/language_domain_models.csv +15 -0
interfaces/manifesto.py +55 -0
interfaces/sentiment.py +51 -0
label_dicts.py +110 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import gradio as gr
+from interfaces.cap import demo as cap_demo
+from interfaces.manifesto import demo as manifesto_demo
+from interfaces.sentiment import demo as sentiment_demo
+from interfaces.emotion import demo as emotion_demo
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        <div style="display: block; text-align: left; padding:0; margin:0;">
+            <h1 style="text-align: center">Babel Machine Demo</h1>
+            <p>This is a demo for text classification using language models finetuned on data labeled by <a href="https://www.comparativeagendas.net/">CAP</a>, <a href="https://manifesto-project.wzb.eu/">Manifesto Project</a>, sentiment, and emotion coding systems.<br>
+            For the coding of complete datasets, please visit the official <a href="https://babel.poltextlab.com/">Babel Machine</a> site.</p>
+        </div>
+        """)
+    gr.TabbedInterface(
+        interface_list=[cap_demo, manifesto_demo, sentiment_demo, emotion_demo],
+        tab_names=["CAP", "Manifesto", "sentiment", "emotion"],
+    )
+if __name__ == "__main__":
+    demo.launch()
+# TODO: add all languages & domains

interfaces/cap.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+import pandas as pd
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+from huggingface_hub.utils._errors import RepositoryNotFoundError
+from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
+# Hugging Face access token, read from the "hf_read" environment variable.
+# Indexing os.environ directly raises KeyError at import time if it is unset.
+HF_TOKEN = os.environ["hf_read"]
+# Language choices offered in the CAP tab's dropdown; the selected value is
+# interpolated into the model repository name by build_huggingface_path.
+languages = [
+    "danish",
+    "dutch",
+    "english",
+    "french",
+    "german",
+    "hungarian",
+    "italian",
+    "polish",
+    "portuguese",
+    "spanish",
+    "czech",
+    "slovak",
+    "norwegian"
+]
+# Maps the domain name shown in the dropdown (key) to the short tag (value)
+# used in model repository names and as the column looked up in the CSV table.
+domains = {
+    "media": "media",
+    "social media": "social",
+    "parliamentary speech": "parlspeech",
+    "legislative documents": "legislative",
+    "executive speech": "execspeech",
+    "executive order": "execorder",
+    "party programs": "party",
+    "judiciary": "judiciary",
+    "budget": "budget",
+    "public opinion": "publicopinion",
+    "local government agenda": "localgovernment"
+}
+def check_huggingface_path(checkpoint_path: str):
+    try:
+        hf_api = HfApi(token=HF_TOKEN)
+        hf_api.model_info(checkpoint_path, token=HF_TOKEN)
+        return True
+    except RepositoryNotFoundError:
+        return False
+def build_huggingface_path(language: str, domain: str):
+    base_path = "xlm-roberta-large"
+    lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
+    lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
+    path_map = {
+        "L": lang_path,
+        "L-D": lang_domain_path,
+        "X": lang_domain_path,
+    }
+    value = None
+    try:
+        lang_domain_table = pd.read_csv("language_domain_models.csv")
+        lang_domain_table["language"] = lang_domain_table["language"].str.lower()
+        lang_domain_table.columns = lang_domain_table.columns.str.lower()
+        # get the row for the language and them get the value from the domain column
+        row = lang_domain_table[(lang_domain_table["language"] == language)]
+        tmp = row.get(domain)
+        if not tmp.empty:
+            value = tmp.iloc[0]
+    except (AttributeError, FileNotFoundError):
+        value = None
+    if value and value in path_map:
+        model_path = path_map[value]
+        if check_huggingface_path(model_path):
+            # if the model is available on Huggingface, return the path
+            return model_path
+        else:
+            # if the model is not available on Huggingface, look for other models
+            filtered_path_map = {k: v for k, v in path_map.items() if k != value}
+            for k, v in filtered_path_map.items():
+                if check_huggingface_path(v):
+                    return v
+    elif check_huggingface_path(lang_domain_path):
+        return lang_domain_path
+    elif check_huggingface_path(lang_path):
+        return lang_path
+    else:
+        return "poltextlab/xlm-roberta-large-pooled-cap"
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    model.to(device)
+    inputs = tokenizer(text,
+                       max_length=512,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {f"[{CAP_NUM_DICT[i]}] {CAP_LABEL_NAMES[CAP_NUM_DICT[i]]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+def predict_cap(text, language, domain):
+    domain = domains[domain]
+    model_id = build_huggingface_path(language, domain)
+    tokenizer_id = "xlm-roberta-large"
+    return predict(text, model_id, tokenizer_id)
+demo = gr.Interface(
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language"),
+            gr.Dropdown(domains.keys(), label="Domain")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/emotion.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+from label_dicts import MANIFESTO_LABEL_NAMES
+# Hugging Face access token, read from the "hf_read" environment variable.
+# Indexing os.environ directly raises KeyError at import time if it is unset.
+HF_TOKEN = os.environ["hf_read"]
+# Language choices offered in the emotion tab's dropdown. The selection is
+# passed to build_huggingface_path, which returns the same model for all.
+languages = [
+    "czech", "english", "french", "german", "hungarian", "italian"
+]
+def build_huggingface_path(language: str):
+    return "poltextlab/xlm-roberta-large-pooled-emotions"
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    model.to(device)
+    inputs = tokenizer(text,
+                       max_length=512,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {model.config.id2label[i]: probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+def predict_cap(text, language):
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+    return predict(text, model_id, tokenizer_id)
+demo = gr.Interface(
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/language_domain_models.csv ADDED Viewed

	@@ -0,0 +1,15 @@

+language,media,social,parlspeech,legislative,execspeech,execorder,party,judiciary,budget,publicopinion,localgovernment
+danish,,,L-D,L,L,,,,,,
+dutch,L-D,L,L,L-D,L,L,,,,,
+english,L,,L,L-D,L,L,L,L,,,
+french,,,,L-D,L-D,L-D,L,,,,
+german,L,,L-D,,,,L-D,,,,
+hungarian,L,,L-D,L-D,L,,,,L-D,L-D,
+italian,,L,L,L-D,,,,,,,
+polish,,,,X,,,,,,,
+portuguese,,,,L-D,L,L,,,,,
+spanish,L,,L,L-D,L,L-D,L,,,,
+czech,,,,,,,,,,,
+polish,,,,,,,,,,,
+slovak,,,,,,,,,,,
+norwegian,,,,,,,,,,,

interfaces/manifesto.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+from label_dicts import MANIFESTO_LABEL_NAMES
+# Hugging Face access token, read from the "hf_read" environment variable.
+# Indexing os.environ directly raises KeyError at import time if it is unset.
+HF_TOKEN = os.environ["hf_read"]
+# Language choices offered in the Manifesto tab's dropdown. The selection is
+# passed to build_huggingface_path, which returns the same model for all.
+languages = [
+    "armenian", "bulgarian", "croatian", "czech", "danish", "dutch", "english",
+    "estonian", "finnish", "french", "georgian", "german", "greek", "hebrew",
+    "hungarian", "icelandic", "italian", "japanese", "korean", "latvian",
+    "lithuanian", "norwegian", "polish", "portuguese", "romanian", "russian",
+    "serbian", "slovak", "slovenian", "spanish", "swedish", "turkish"
+]
+def build_huggingface_path(language: str):
+    return "poltextlab/xlm-roberta-large-manifesto"
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    model.to(device)
+    inputs = tokenizer(text,
+                       max_length=512,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {f"[{model.config.id2label[i]}] {MANIFESTO_LABEL_NAMES[int(model.config.id2label[i])]}": probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+def predict_cap(text, language):
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+    return predict(text, model_id, tokenizer_id)
+demo = gr.Interface(
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language")],
+    outputs=[gr.Label(num_top_classes=5, label="Output"), gr.Markdown()])

interfaces/sentiment.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import gradio as gr
+import os
+import torch
+import numpy as np
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from huggingface_hub import HfApi
+from label_dicts import MANIFESTO_LABEL_NAMES
+# Hugging Face access token, read from the "hf_read" environment variable.
+# Indexing os.environ directly raises KeyError at import time if it is unset.
+HF_TOKEN = os.environ["hf_read"]
+# Language choices offered in the sentiment tab's dropdown. The selection is
+# passed to build_huggingface_path, which returns the same model for all.
+languages = [
+    "czech", "english", "french", "german", "hungarian", "italian"
+]
+def build_huggingface_path(language: str):
+    return "poltextlab/xlm-roberta-large-pooled-sentiment"
+def predict(text, model_id, tokenizer_id):
+    device = torch.device("cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+    model.to(device)
+    inputs = tokenizer(text,
+                       max_length=512,
+                       truncation=True,
+                       padding="do_not_pad",
+                       return_tensors="pt").to(device)
+    model.eval()
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    output_pred = {model.config.id2label[i]: probs[i] for i in np.argsort(probs)[::-1]}
+    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
+    return output_pred, output_info
+def predict_cap(text, language):
+    model_id = build_huggingface_path(language)
+    tokenizer_id = "xlm-roberta-large"
+    return predict(text, model_id, tokenizer_id)
+demo = gr.Interface(
+    fn=predict_cap,
+    inputs=[gr.Textbox(lines=6, label="Input"),
+            gr.Dropdown(languages, label="Language")],
+    outputs=[gr.Label(num_top_classes=3, label="Output"), gr.Markdown()])

label_dicts.py ADDED Viewed

	@@ -0,0 +1,110 @@

+CAP_NUM_DICT = {
+    0: 1,
+    1: 2,
+    2: 3,
+    3: 4,
+    4: 5,
+    5: 6,
+    6: 7,
+    7: 8,
+    8: 9,
+    9: 10,
+    10: 12,
+    11: 13,
+    12: 14,
+    13: 15,
+    14: 16,
+    15: 17,
+    16: 18,
+    17: 19,
+    18: 20,
+    19: 21,
+    20: 23,
+    21: 999,
+}
+# CAP topic code -> human-readable topic name. The keys are the same codes
+# that appear as values of CAP_NUM_DICT.
+CAP_LABEL_NAMES = {
+    1: "Macroeconomics",
+    2: "Civil Rights",
+    3: "Health",
+    4: "Agriculture",
+    5: "Labor",
+    6: "Education",
+    7: "Environment",
+    8: "Energy",
+    9: "Immigration",
+    10: "Transportation",
+    12: "Law and Crime",
+    13: "Social Welfare",
+    14: "Housing",
+    15: "Domestic Commerce",
+    16: "Defense",
+    17: "Technology",
+    18: "Foreign Trade",
+    19: "International Affairs",
+    20: "Government Operations",
+    21: "Public Lands",
+    23: "Culture",
+    999: "No Policy Content"
+}
+# Manifesto code (as int) -> human-readable category name.
+# Both 0 and 999 map to "No Policy Goal".
+MANIFESTO_LABEL_NAMES = {
+    0: "No Policy Goal",
+    999: "No Policy Goal",
+    101: "Foreign Special Relationships: Positive",
+    102: "Foreign Special Relationships: Negative",
+    103: "Anti-Imperialism",
+    104: "Military: Positive",
+    105: "Military: Negative",
+    106: "Peace",
+    107: "Internationalism: Positive",
+    108: "European Community/Union: Positive",
+    109: "Internationalism: Negative",
+    110: "European Community/Union: Negative",
+    201: "Freedom and Human Rights",
+    202: "Democracy",
+    203: "Constitutionalism: Positive",
+    204: "Constitutionalism: Negative",
+    301: "Federalism",
+    302: "Centralisation",
+    303: "Governmental and Administrative Efficiency",
+    304: "Political Corruption",
+    305: "Political Authority",
+    401: "Free Market Economy",
+    402: "Incentives",
+    403: "Market Regulation",
+    404: "Economic Planning",
+    405: "Corporatism/Mixed Economy",
+    406: "Protectionism: Positive",
+    407: "Protectionism: Negative",
+    408: "Economic Goals",
+    409: "Keynesian Demand Management",
+    410: "Economic Growth: Positive",
+    411: "Technology and Infrastructure",
+    412: "Controlled Economy",
+    413: "Nationalisation",
+    414: "Economic Orthodoxy",
+    415: "Marxist Analysis: Positive",
+    416: "Anti-Growth Economy: Positive",
+    501: "Environmental Protection: Positive",
+    502: "Culture: Positive",
+    503: "Equality: Positive",
+    504: "Welfare State Expansion",
+    505: "Welfare State Limitation",
+    506: "Education Expansion",
+    507: "Education Limitation",
+    601: "National Way of Life: Positive",
+    602: "National Way of Life: Negative",
+    603: "Traditional Morality: Positive",
+    604: "Traditional Morality: Negative",
+    605: "Law and Order: Positive",
+    606: "Civic Mindedness: Positive",
+    607: "Multiculturalism: Positive",
+    608: "Multiculturalism: Negative",
+    701: "Labour Groups: Positive",
+    702: "Labour Groups: Negative",
+    703: "Agriculture and Farmers: Positive",
+    704: "Middle Class and Professional Groups",
+    705: "Underprivileged Minority Groups",
+    706: "Non-economic Demographic Groups"
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+pandas
+torch==2.2.1
+transformers==4.39.1
+sentencepiece==0.2.0