poltextlab committed on
Commit
b1c2932
1 Parent(s): b62c233

first upload

Browse files

base application

app.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from interfaces.cap import demo as cap_demo
4
+ from interfaces.manifesto import demo as manifesto_demo
5
+ from interfaces.sentiment import demo as sentiment_demo
6
+ from interfaces.emotion import demo as emotion_demo
7
+
8
# Tab title -> sub-interface, in display order.
_TABS = (
    ("CAP", cap_demo),
    ("Manifesto", manifesto_demo),
    ("sentiment", sentiment_demo),
    ("emotion", emotion_demo),
)

# Top-level page: an HTML intro followed by one tab per classifier interface.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <div style="display: block; text-align: left; padding:0; margin:0;">
        <h1 style="text-align: center">Babel Machine Demo</h1>
        <p>This is a demo for text classification using language models finetuned on data labeled by <a href="https://www.comparativeagendas.net/">CAP</a>, <a href="https://manifesto-project.wzb.eu/">Manifesto Project</a>, sentiment, and emotion coding systems.<br>
        For the coding of complete datasets, please visit the official <a href="https://babel.poltextlab.com/">Babel Machine</a> site.</p>
        </div>
        """)

    gr.TabbedInterface(
        interface_list=[interface for _, interface in _TABS],
        tab_names=[title for title, _ in _TABS],
    )

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
25
+
26
+ # TODO: add all languages & domains
interfaces/cap.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ import pandas as pd
7
+ from transformers import AutoModelForSequenceClassification
8
+ from transformers import AutoTokenizer
9
+ from huggingface_hub import HfApi
10
+ from huggingface_hub.utils._errors import RepositoryNotFoundError
11
+
12
+ from label_dicts import CAP_NUM_DICT, CAP_LABEL_NAMES
13
+
14
+ HF_TOKEN = os.environ["hf_read"]
15
+
16
# Languages offered in the CAP "Language" dropdown; the names are also
# interpolated into checkpoint ids by build_huggingface_path.
languages = [
    "danish", "dutch", "english", "french", "german", "hungarian", "italian",
    "polish", "portuguese", "spanish", "czech", "slovak", "norwegian",
]

# Human-readable domain label (shown in the dropdown) -> short slug used in
# checkpoint names and as a column name in language_domain_models.csv.
domains = dict((
    ("media", "media"),
    ("social media", "social"),
    ("parliamentary speech", "parlspeech"),
    ("legislative documents", "legislative"),
    ("executive speech", "execspeech"),
    ("executive order", "execorder"),
    ("party programs", "party"),
    ("judiciary", "judiciary"),
    ("budget", "budget"),
    ("public opinion", "publicopinion"),
    ("local government agenda", "localgovernment"),
))
45
+
46
def check_huggingface_path(checkpoint_path: str):
    """Report whether ``checkpoint_path`` names a model repo on the Hugging Face Hub.

    Returns ``True`` when ``HfApi.model_info`` succeeds and ``False`` when it
    raises ``RepositoryNotFoundError``.  Any other exception propagates to
    the caller.
    """
    client = HfApi(token=HF_TOKEN)
    try:
        client.model_info(checkpoint_path, token=HF_TOKEN)
    except RepositoryNotFoundError:
        return False
    return True
53
+
54
def build_huggingface_path(language: str, domain: str):
    """Choose the CAP checkpoint to load for a language/domain pair.

    ``language_domain_models.csv`` may name a preferred model flavour for the
    pair: ``"L"`` selects the language-only checkpoint, ``"L-D"`` and ``"X"``
    select the language+domain checkpoint.  The preferred checkpoint is tried
    first, then the language+domain one, then the language-only one.  If none
    of them exists on the Hugging Face Hub, the pooled CAP model is returned.

    Args:
        language: Language name as used in the checkpoint ids and in the
            ``language`` column of the table (compared case-insensitively
            on the table side only).
        domain: Domain slug (a value of ``domains``); also the name of the
            table column that is consulted.

    Returns:
        The Hub repository id of the checkpoint to load.  Always a string.
    """
    base_path = "xlm-roberta-large"
    lang_domain_path = f"poltextlab/{base_path}-{language}-{domain}-cap-v3"
    lang_path = f"poltextlab/{base_path}-{language}-cap-v3"
    pooled_path = "poltextlab/xlm-roberta-large-pooled-cap"

    path_map = {
        "L": lang_path,
        "L-D": lang_domain_path,
        "X": lang_domain_path,
    }

    # The table ships next to this module, so look there first.  The bare
    # file name (resolved against the working directory) is kept as a
    # fallback for setups that relied on the previous lookup.
    table_name = "language_domain_models.csv"
    module_dir = os.path.dirname(os.path.abspath(__file__))
    value = None
    for table_path in (os.path.join(module_dir, table_name), table_name):
        try:
            lang_domain_table = pd.read_csv(table_path)
        except FileNotFoundError:
            continue
        # Normalise the header before touching any column by name.
        lang_domain_table.columns = lang_domain_table.columns.str.lower()
        if "language" in lang_domain_table.columns and domain in lang_domain_table.columns:
            table_languages = lang_domain_table["language"].astype(str).str.lower()
            # Get the rows for the language, then take the value from the
            # domain column (the first row wins if the language is repeated).
            cells = lang_domain_table.loc[table_languages == language, domain]
            if not cells.empty:
                value = cells.iloc[0]
        break

    # Empty cells are read as NaN, so only string cells can name a flavour.
    preferred = path_map.get(value.strip()) if isinstance(value, str) else None

    # Ordered, de-duplicated list of checkpoints to probe; each probe is a
    # network round trip, so never ask about the same repository twice.
    candidates = []
    for candidate in (preferred, lang_domain_path, lang_path):
        if candidate is not None and candidate not in candidates:
            candidates.append(candidate)

    for candidate in candidates:
        if check_huggingface_path(candidate):
            return candidate

    # Nothing specific is published for this pair: use the pooled model
    # instead of returning None.
    return pooled_path
95
+
96
def predict(text, model_id, tokenizer_id):
    """Classify ``text`` with a CAP checkpoint and rank the topic codes.

    Args:
        text: Input handed to the tokenizer (truncated to 512 tokens).
        model_id: Hub id of the sequence-classification checkpoint.
        tokenizer_id: Hub id of the tokenizer.

    Returns:
        A ``(scores, info_html)`` pair.  ``scores`` maps ``"[code] name"``
        labels to softmax probabilities, inserted from most to least
        probable; ``info_html`` is an HTML snippet linking to the model used.
    """
    cpu = torch.device("cpu")

    # Model and tokenizer are instantiated anew on every call.
    classifier = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
    tok = AutoTokenizer.from_pretrained(tokenizer_id)
    classifier.to(cpu)

    encoded = tok(
        text,
        max_length=512,
        truncation=True,
        padding="do_not_pad",
        return_tensors="pt",
    ).to(cpu)
    classifier.eval()

    with torch.no_grad():
        raw_scores = classifier(**encoded).logits

    probs = torch.nn.functional.softmax(raw_scores, dim=1).cpu().numpy().flatten()

    # Walk the class indices from highest to lowest probability.
    output_pred = {}
    for idx in np.argsort(probs)[::-1]:
        cap_code = CAP_NUM_DICT[idx]
        output_pred[f"[{cap_code}] {CAP_LABEL_NAMES[cap_code]}"] = probs[idx]

    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    return output_pred, output_info
116
+
117
def predict_cap(text, language, domain):
    """Gradio callback: resolve the CAP checkpoint for the pair and classify ``text``.

    ``domain`` is the human-readable dropdown label; it is translated to its
    slug through ``domains`` before the checkpoint is looked up.
    """
    domain_slug = domains[domain]
    checkpoint = build_huggingface_path(language, domain_slug)
    return predict(text, checkpoint, "xlm-roberta-large")
122
+
123
# CAP interface: free text plus language and domain pickers in; a label
# widget limited to 5 classes and a Markdown note out.
_cap_inputs = [
    gr.Textbox(lines=6, label="Input"),
    gr.Dropdown(languages, label="Language"),
    gr.Dropdown(domains.keys(), label="Domain"),
]
_cap_outputs = [
    gr.Label(num_top_classes=5, label="Output"),
    gr.Markdown(),
]
demo = gr.Interface(fn=predict_cap, inputs=_cap_inputs, outputs=_cap_outputs)
interfaces/emotion.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ from transformers import AutoModelForSequenceClassification
7
+ from transformers import AutoTokenizer
8
+ from huggingface_hub import HfApi
9
+
10
+ from label_dicts import MANIFESTO_LABEL_NAMES
11
+
12
+ HF_TOKEN = os.environ["hf_read"]
13
+
14
# Languages offered in the emotion interface's "Language" dropdown.
languages = "czech english french german hungarian italian".split()
17
+
18
def build_huggingface_path(language: str):
    """Return the Hub id of the pooled emotion checkpoint.

    ``language`` is currently ignored: the same pooled model is returned for
    every input.
    """
    checkpoint = "poltextlab/xlm-roberta-large-pooled-emotions"
    return checkpoint
20
+
21
def predict(text, model_id, tokenizer_id):
    """Classify ``text`` with the emotion checkpoint and rank its labels.

    Args:
        text: Input handed to the tokenizer (truncated to 512 tokens).
        model_id: Hub id of the sequence-classification checkpoint.
        tokenizer_id: Hub id of the tokenizer.

    Returns:
        A ``(scores, info_html)`` pair.  ``scores`` maps the model's own
        ``id2label`` names to softmax probabilities, inserted from most to
        least probable; ``info_html`` is an HTML snippet linking to the
        model used.
    """
    cpu = torch.device("cpu")

    # Model and tokenizer are instantiated anew on every call.
    classifier = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
    tok = AutoTokenizer.from_pretrained(tokenizer_id)
    classifier.to(cpu)

    encoded = tok(
        text,
        max_length=512,
        truncation=True,
        padding="do_not_pad",
        return_tensors="pt",
    ).to(cpu)
    classifier.eval()

    with torch.no_grad():
        raw_scores = classifier(**encoded).logits

    probs = torch.nn.functional.softmax(raw_scores, dim=1).cpu().numpy().flatten()

    # Walk the class indices from highest to lowest probability.
    output_pred = {}
    for idx in np.argsort(probs)[::-1]:
        output_pred[classifier.config.id2label[idx]] = probs[idx]

    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    return output_pred, output_info
41
+
42
def predict_cap(text, language):
    """Gradio callback for the emotion tab: pick the checkpoint and classify ``text``."""
    checkpoint = build_huggingface_path(language)
    return predict(text, checkpoint, "xlm-roberta-large")
46
+
47
# Emotion interface: free text and a language picker in; a label widget
# limited to 5 classes and a Markdown note out.
_emotion_inputs = [
    gr.Textbox(lines=6, label="Input"),
    gr.Dropdown(languages, label="Language"),
]
_emotion_outputs = [
    gr.Label(num_top_classes=5, label="Output"),
    gr.Markdown(),
]
demo = gr.Interface(fn=predict_cap, inputs=_emotion_inputs, outputs=_emotion_outputs)
interfaces/language_domain_models.csv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ language,media,social,parlspeech,legislative,execspeech,execorder,party,judiciary,budget,publicopinion,localgovernment
2
+ danish,,,L-D,L,L,,,,,,
3
+ dutch,L-D,L,L,L-D,L,L,,,,,
4
+ english,L,,L,L-D,L,L,L,L,,,
5
+ french,,,,L-D,L-D,L-D,L,,,,
6
+ german,L,,L-D,,,,L-D,,,,
7
+ hungarian,L,,L-D,L-D,L,,,,L-D,L-D,
8
+ italian,,L,L,L-D,,,,,,,
9
+ polish,,,,X,,,,,,,
10
+ portuguese,,,,L-D,L,L,,,,,
11
+ spanish,L,,L,L-D,L,L-D,L,,,,
12
+ czech,,,,,,,,,,,
13
+ polish,,,,,,,,,,,
14
+ slovak,,,,,,,,,,,
15
+ norwegian,,,,,,,,,,,
interfaces/manifesto.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ from transformers import AutoModelForSequenceClassification
7
+ from transformers import AutoTokenizer
8
+ from huggingface_hub import HfApi
9
+
10
+ from label_dicts import MANIFESTO_LABEL_NAMES
11
+
12
+ HF_TOKEN = os.environ["hf_read"]
13
+
14
# Languages offered in the Manifesto interface's "Language" dropdown.
languages = """
    armenian bulgarian croatian czech danish dutch english
    estonian finnish french georgian german greek hebrew
    hungarian icelandic italian japanese korean latvian
    lithuanian norwegian polish portuguese romanian russian
    serbian slovak slovenian spanish swedish turkish
""".split()
21
+
22
def build_huggingface_path(language: str):
    """Return the Hub id of the Manifesto checkpoint.

    ``language`` is currently ignored: the same model is returned for every
    input.
    """
    checkpoint = "poltextlab/xlm-roberta-large-manifesto"
    return checkpoint
24
+
25
def predict(text, model_id, tokenizer_id):
    """Classify ``text`` with the Manifesto checkpoint and rank its codes.

    Args:
        text: Input handed to the tokenizer (truncated to 512 tokens).
        model_id: Hub id of the sequence-classification checkpoint.
        tokenizer_id: Hub id of the tokenizer.

    Returns:
        A ``(scores, info_html)`` pair.  ``scores`` maps ``"[code] name"``
        labels to softmax probabilities, inserted from most to least
        probable; ``info_html`` is an HTML snippet linking to the model used.
    """
    cpu = torch.device("cpu")

    # Model and tokenizer are instantiated anew on every call.
    classifier = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
    tok = AutoTokenizer.from_pretrained(tokenizer_id)
    classifier.to(cpu)

    encoded = tok(
        text,
        max_length=512,
        truncation=True,
        padding="do_not_pad",
        return_tensors="pt",
    ).to(cpu)
    classifier.eval()

    with torch.no_grad():
        raw_scores = classifier(**encoded).logits

    probs = torch.nn.functional.softmax(raw_scores, dim=1).cpu().numpy().flatten()

    # Walk the class indices from highest to lowest probability.  The
    # model's id2label value is converted with int() to look up its name.
    output_pred = {}
    for idx in np.argsort(probs)[::-1]:
        code = classifier.config.id2label[idx]
        output_pred[f"[{code}] {MANIFESTO_LABEL_NAMES[int(code)]}"] = probs[idx]

    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    return output_pred, output_info
45
+
46
def predict_cap(text, language):
    """Gradio callback for the Manifesto tab: pick the checkpoint and classify ``text``."""
    checkpoint = build_huggingface_path(language)
    return predict(text, checkpoint, "xlm-roberta-large")
50
+
51
# Manifesto interface: free text and a language picker in; a label widget
# limited to 5 classes and a Markdown note out.
_manifesto_inputs = [
    gr.Textbox(lines=6, label="Input"),
    gr.Dropdown(languages, label="Language"),
]
_manifesto_outputs = [
    gr.Label(num_top_classes=5, label="Output"),
    gr.Markdown(),
]
demo = gr.Interface(fn=predict_cap, inputs=_manifesto_inputs, outputs=_manifesto_outputs)
interfaces/sentiment.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import os
4
+ import torch
5
+ import numpy as np
6
+ from transformers import AutoModelForSequenceClassification
7
+ from transformers import AutoTokenizer
8
+ from huggingface_hub import HfApi
9
+
10
+ from label_dicts import MANIFESTO_LABEL_NAMES
11
+
12
+ HF_TOKEN = os.environ["hf_read"]
13
+
14
# Languages offered in the sentiment interface's "Language" dropdown.
languages = "czech english french german hungarian italian".split()
17
+
18
def build_huggingface_path(language: str):
    """Return the Hub id of the pooled sentiment checkpoint.

    ``language`` is currently ignored: the same pooled model is returned for
    every input.
    """
    checkpoint = "poltextlab/xlm-roberta-large-pooled-sentiment"
    return checkpoint
20
+
21
def predict(text, model_id, tokenizer_id):
    """Classify ``text`` with the sentiment checkpoint and rank its labels.

    Args:
        text: Input handed to the tokenizer (truncated to 512 tokens).
        model_id: Hub id of the sequence-classification checkpoint.
        tokenizer_id: Hub id of the tokenizer.

    Returns:
        A ``(scores, info_html)`` pair.  ``scores`` maps the model's own
        ``id2label`` names to softmax probabilities, inserted from most to
        least probable; ``info_html`` is an HTML snippet linking to the
        model used.
    """
    cpu = torch.device("cpu")

    # Model and tokenizer are instantiated anew on every call.
    classifier = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN)
    tok = AutoTokenizer.from_pretrained(tokenizer_id)
    classifier.to(cpu)

    encoded = tok(
        text,
        max_length=512,
        truncation=True,
        padding="do_not_pad",
        return_tensors="pt",
    ).to(cpu)
    classifier.eval()

    with torch.no_grad():
        raw_scores = classifier(**encoded).logits

    probs = torch.nn.functional.softmax(raw_scores, dim=1).cpu().numpy().flatten()

    # Walk the class indices from highest to lowest probability.
    output_pred = {}
    for idx in np.argsort(probs)[::-1]:
        output_pred[classifier.config.id2label[idx]] = probs[idx]

    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    return output_pred, output_info
41
+
42
def predict_cap(text, language):
    """Gradio callback for the sentiment tab: pick the checkpoint and classify ``text``."""
    checkpoint = build_huggingface_path(language)
    return predict(text, checkpoint, "xlm-roberta-large")
46
+
47
# Sentiment interface: free text and a language picker in; a label widget
# limited to 3 classes and a Markdown note out.
_sentiment_inputs = [
    gr.Textbox(lines=6, label="Input"),
    gr.Dropdown(languages, label="Language"),
]
_sentiment_outputs = [
    gr.Label(num_top_classes=3, label="Output"),
    gr.Markdown(),
]
demo = gr.Interface(fn=predict_cap, inputs=_sentiment_inputs, outputs=_sentiment_outputs)
label_dicts.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model output index -> CAP code.  The codes are listed in index order, so
# position i in the tuple is the code for class index i.
CAP_NUM_DICT = dict(enumerate((
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
    23, 999,
)))
25
+
26
# CAP code -> display name of the topic.
CAP_LABEL_NAMES = dict((
    (1, "Macroeconomics"),
    (2, "Civil Rights"),
    (3, "Health"),
    (4, "Agriculture"),
    (5, "Labor"),
    (6, "Education"),
    (7, "Environment"),
    (8, "Energy"),
    (9, "Immigration"),
    (10, "Transportation"),
    (12, "Law and Crime"),
    (13, "Social Welfare"),
    (14, "Housing"),
    (15, "Domestic Commerce"),
    (16, "Defense"),
    (17, "Technology"),
    (18, "Foreign Trade"),
    (19, "International Affairs"),
    (20, "Government Operations"),
    (21, "Public Lands"),
    (23, "Culture"),
    (999, "No Policy Content"),
))
50
+
51
# Manifesto code -> display name of the category.  Codes 0 and 999 share the
# same "No Policy Goal" name.
MANIFESTO_LABEL_NAMES = dict((
    (0, "No Policy Goal"),
    (999, "No Policy Goal"),
    # 1xx
    (101, "Foreign Special Relationships: Positive"),
    (102, "Foreign Special Relationships: Negative"),
    (103, "Anti-Imperialism"),
    (104, "Military: Positive"),
    (105, "Military: Negative"),
    (106, "Peace"),
    (107, "Internationalism: Positive"),
    (108, "European Community/Union: Positive"),
    (109, "Internationalism: Negative"),
    (110, "European Community/Union: Negative"),
    # 2xx
    (201, "Freedom and Human Rights"),
    (202, "Democracy"),
    (203, "Constitutionalism: Positive"),
    (204, "Constitutionalism: Negative"),
    # 3xx
    (301, "Federalism"),
    (302, "Centralisation"),
    (303, "Governmental and Administrative Efficiency"),
    (304, "Political Corruption"),
    (305, "Political Authority"),
    # 4xx
    (401, "Free Market Economy"),
    (402, "Incentives"),
    (403, "Market Regulation"),
    (404, "Economic Planning"),
    (405, "Corporatism/Mixed Economy"),
    (406, "Protectionism: Positive"),
    (407, "Protectionism: Negative"),
    (408, "Economic Goals"),
    (409, "Keynesian Demand Management"),
    (410, "Economic Growth: Positive"),
    (411, "Technology and Infrastructure"),
    (412, "Controlled Economy"),
    (413, "Nationalisation"),
    (414, "Economic Orthodoxy"),
    (415, "Marxist Analysis: Positive"),
    (416, "Anti-Growth Economy: Positive"),
    # 5xx
    (501, "Environmental Protection: Positive"),
    (502, "Culture: Positive"),
    (503, "Equality: Positive"),
    (504, "Welfare State Expansion"),
    (505, "Welfare State Limitation"),
    (506, "Education Expansion"),
    (507, "Education Limitation"),
    # 6xx
    (601, "National Way of Life: Positive"),
    (602, "National Way of Life: Negative"),
    (603, "Traditional Morality: Positive"),
    (604, "Traditional Morality: Negative"),
    (605, "Law and Order: Positive"),
    (606, "Civic Mindedness: Positive"),
    (607, "Multiculturalism: Positive"),
    (608, "Multiculturalism: Negative"),
    # 7xx
    (701, "Labour Groups: Positive"),
    (702, "Labour Groups: Negative"),
    (703, "Agriculture and Farmers: Positive"),
    (704, "Middle Class and Professional Groups"),
    (705, "Underprivileged Minority Groups"),
    (706, "Non-economic Demographic Groups"),
))
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ pandas
2
+ torch==2.2.1
3
+ transformers==4.39.1
4
+ sentencepiece==0.2.0