CCCCC5 committed on
Commit
2fb89f5
1 Parent(s): 18c1ca9

Upload 3 files

Browse files
LoughranMcDonald_MasterDictionary_2020.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import torch
4
+ import pandas as pd
5
+ from openprompt.plms import load_plm
6
+ from openprompt import PromptDataLoader
7
+ from openprompt.prompts import ManualVerbalizer
8
+ from openprompt.prompts import ManualTemplate
9
+ from openprompt.data_utils import InputExample
10
+ from openprompt import PromptForClassification
11
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
12
+
13
def readLMwords():
    """Load the Loughran-McDonald sentiment word lists.

    Reads ``LoughranMcDonald_MasterDictionary_2020.csv`` (path relative to
    the current working directory) and collects every entry of the ``Word``
    column whose ``Positive``, ``Negative`` or ``Uncertainty`` column is
    non-zero.

    Returns:
        A ``(positive, negative, uncertainty)`` tuple; each element is a list
        of lower-cased words.
    """
    # pandas' default NA markers include ordinary-looking strings such as
    # "NULL", "NA" and "NaN". With the defaults, a dictionary word spelled
    # like one of them is parsed as float NaN and would end up in the word
    # lists as a non-string. Only treat genuinely empty cells as missing.
    alldata = pd.read_csv(
        "LoughranMcDonald_MasterDictionary_2020.csv",
        keep_default_na=False,
        na_values=[""],
    )

    def flagged_words(column):
        # Rows with an empty Word cell carry no usable label word; skip them.
        selected = alldata.loc[alldata[column] != 0, "Word"].dropna()
        return selected.str.lower().tolist()

    positive = flagged_words("Positive")
    negative = flagged_words("Negative")
    uncertainty = flagged_words("Uncertainty")
    return positive, negative, uncertainty
19
+
20
+
21
def _translate_to_english(sentences):
    """Translate *sentences* with the Helsinki-NLP/opus-mt-zh-en model.

    Returns a list with one decoded string per input sentence, in order.
    """
    translator_name = "Helsinki-NLP/opus-mt-zh-en"
    tokenizer = AutoTokenizer.from_pretrained(translator_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(translator_name)
    # truncation=True cuts an over-long line down to the tokenizer's maximum
    # length instead of handing the model an input of arbitrary size.
    batch = tokenizer(
        sentences, return_tensors="pt", padding=True, truncation=True
    )
    generated = model.generate(**batch)
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]


def sentiment_analysis(sentence, model_name):
    """Label every non-blank line of *sentence* as positive/neutral/negative.

    Args:
        sentence: Input text; each line is classified separately. Blank
            lines are ignored.
        model_name: Name of one of the supported checkpoints, without the
            ``HYCCC/`` prefix (e.g. ``"RoBERTa_English_AnnualReport_tuned"``).
            Names containing ``"Chinese"`` cause the input to be translated
            to English before classification.

    Returns:
        One ``"<label>, <original line>"`` entry per classified line, joined
        with newlines. An empty string when there is nothing to classify.

    Raises:
        gr.Error: If no model is selected or the model name is not supported.
    """
    # Blank lines carry nothing to classify; dropping them here also keeps
    # the echoed lines aligned with the predictions below.
    raw_sentences = [
        line for line in (sentence or "").strip().split('\n') if line.strip()
    ]
    if not raw_sentences:
        return ""

    if not model_name:
        raise gr.Error("Please select a model.")
    model_name = "HYCCC/" + model_name

    type_dic = {
        "HYCCC/RoBERTa_Chinese_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_Chinese_FinancialNews_tuned": "roberta",
        "HYCCC/RoBERTa_English_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_English_FinancialNews_tuned": "roberta",
    }
    if model_name not in type_dic:
        raise gr.Error(f"Unsupported model: {model_name}")

    template = '{"placeholder":"text_a"} Shares are {"mask"}.'
    classes = ['positive', 'neutral', 'negative']
    # The third list returned by readLMwords() (its "Uncertainty" words) is
    # used as the label words for the "neutral" class.
    positive, negative, neutral = readLMwords()
    label_words = {
        "positive": positive,
        "neutral": neutral,
        "negative": negative,
    }

    # The prompt template is in English, so Chinese input is translated first;
    # the untranslated lines are still the ones echoed in the output.
    if 'Chinese' in model_name:
        sentences = _translate_to_english(raw_sentences)
    else:
        sentences = raw_sentences

    # label=0 is a placeholder; labels are not used for prediction here.
    testdata = [
        InputExample(guid=i, text_a=text, label=0)
        for i, text in enumerate(sentences)
    ]

    plm, tokenizer, model_config, WrapperClass = load_plm(
        type_dic[model_name], model_name
    )

    promptTemplate = ManualTemplate(
        text=template,
        tokenizer=tokenizer,
    )
    promptVerbalizer = ManualVerbalizer(
        classes=classes,
        label_words=label_words,
        tokenizer=tokenizer,
    )
    test_dataloader = PromptDataLoader(
        dataset=testdata,
        tokenizer=tokenizer,
        template=promptTemplate,
        tokenizer_wrapper_class=WrapperClass,
        batch_size=4,
        max_seq_length=512,
    )
    prompt_model = PromptForClassification(
        plm=plm,
        template=promptTemplate,
        verbalizer=promptVerbalizer,
        freeze_plm=True,
    )

    # Inference only: switch to eval mode and skip gradient bookkeeping.
    prompt_model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in test_dataloader:
            logits = prompt_model(inputs)
            predictions.extend(torch.argmax(logits, dim=-1).tolist())

    return '\n'.join(
        f"{classes[label]}, {text}"
        for label, text in zip(predictions, raw_sentences)
    )
88
+
89
+
90
+
91
# --- Gradio UI -------------------------------------------------------------
# Multi-line text box for the sentences to classify.
_sentence_box = gr.TextArea(
    placeholder="Enter sentence here. If you have multiple sentences, separate them with '\\n'.",
    label="Sentence",
    lines=5,
    max_lines=10,
)

# Radio selector; the chosen value is passed to sentiment_analysis as
# ``model_name``.
_model_choice = gr.Radio(
    choices=[
        "RoBERTa_Chinese_AnnualReport_tuned",
        "RoBERTa_Chinese_FinancialNews_tuned",
        "RoBERTa_English_AnnualReport_tuned",
        "RoBERTa_English_FinancialNews_tuned",
    ],
    label="Model Selection",
)

# Text box showing the string returned by sentiment_analysis.
_sentiment_box = gr.TextArea(
    label="Sentiment",
    lines=5,
    show_copy_button=True,
    max_lines=10,
)

demo = gr.Interface(
    fn=sentiment_analysis,
    inputs=[_sentence_box, _model_choice],
    outputs=_sentiment_box,
    title="Prompt Learning-Based Disclosure Sentiment Detection",
)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ scikit-learn
5
+ openprompt
6
+ sacremoses