from transformers import T5Tokenizer, T5ForConditionalGeneration, MarianMTModel, MarianTokenizer, pipeline
import nltk.data
import pandas as pd
import matplotlib.pyplot as plt

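# Punkt data for the sentence tokenizer loaded below (newer NLTK releases may also need 'punkt_tab').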
nltk.download('punkt')

import gradio as gr

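# Indonesian T5 model fine-tuned for abstractive summarization (cased).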
tokenizer_t5 = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
model_t5 = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")

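# Checkpoint names for the sentiment and NER pipelines below. FinBERT is an
# English financial-sentiment model, which is why sentences are translated
# to English before sentiment scoring.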
pretrained_sentiment = "ProsusAI/finbert"
pretrained_ner = "51la5/roberta-large-NER"

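# Punkt sentence splitter, used in sentiment_df to break the input into individual sentences.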
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

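# MarianMT model for Indonesian -> English translation.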
tokenizer_translate = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")
model_translate = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-id-en")
#finetuned_model = MarianMTModel.from_pretrained(
#    "wolfrage89/annual_report_translation_id_en")

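# FinBERT sentiment pipeline; return_all_scores=True returns a score for each label.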
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=pretrained_sentiment,
    tokenizer=pretrained_sentiment,
    return_all_scores=True
)

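# NER pipeline; grouped_entities=True merges sub-word tokens into full entity spans.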
ner_pipeline = pipeline(
    "ner",
    model=pretrained_ner,
    tokenizer=pretrained_ner,
    grouped_entities=True
)

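# Example inputs: short Indonesian financial news snippets.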
examples = [
    "Perusahaan industri e-commerce Indonesia, Bukalapak telah memberhentikan puluhan karyawan dari beberapa function; Berlawanan dengan PHK sebelumnya, perusahaan mengontrak jajaran pekerja kantornya, harian Kompas melaporkan.",
    "Dengan pabrik produksi baru, perusahaan akan meningkatkan kapasitasnya untuk memenuhi peningkatan permintaan yang diharapkan dan akan meningkatkan penggunaan bahan baku dan oleh karena itu meningkatkan profitabilitas produksi.",
    "Lifetree didirikan pada tahun 2000, dan pendapatannya meningkat rata-rata 40% dengan margin di akhir 30-an."
]

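# Translate Indonesian text to English with the MarianMT model (input truncated to 104 tokens).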
def get_translation(text):
    translated_tokens = model_translate.generate(
        **tokenizer_translate([text], return_tensors='pt', max_length=104, truncation=True))[0]
    translated_sentence = tokenizer_translate.decode(
        translated_tokens, skip_special_tokens=True)
    return translated_sentence

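# Summarize Indonesian text with the T5 model (beam search, up to 100 tokens).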
def summ_t5(text):
    input_ids = tokenizer_t5.encode(text, return_tensors='pt')
    summary_ids = model_t5.generate(input_ids,
                                    max_length=100,
                                    num_beams=2,
                                    repetition_penalty=2.5,
                                    length_penalty=1.0,
                                    early_stopping=True,
                                    no_repeat_ngram_size=2,
                                    use_cache=True)
    summary_text = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
    return summary_text
    
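# Return {label: score} for all sentiment labels of an (English) text.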
def sentiment_analysis(text):
    output = sentiment_pipeline(text)
    return {elm["label"]: elm["score"] for elm in output[0]}

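# Run NER and rename 'entity_group' to 'entity' so gr.HighlightedText can render the spans.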
def ner(text):
    output = ner_pipeline(text)
    for elm in output:
        elm['entity'] = elm['entity_group']
    return {"text": text, "entities": output}
    
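# Sentence-level sentiment: split the input, translate each sentence to English,
# score it with FinBERT, and collect the results in a DataFrame.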
def sentiment_df(text):
    df = pd.DataFrame(columns=['Text', 'Eng', 'Label', 'Score'])
    text_list = sentence_tokenizer.tokenize(text)
    eng_text = [get_translation(sentence) for sentence in text_list]
    result = [sentiment_analysis(sentence) for sentence in eng_text]
    labels = []
    scores = []
    for pred in result:
        # Keep the highest-scoring label and its score for each sentence.
        label, score = max(pred.items(), key=lambda kv: kv[1])
        labels.append(label)
        scores.append(round(score, 3))
    df['Text'] = text_list
    df['Eng'] = eng_text 
    df['Label'] = labels
    df['Score'] = scores
    return df
    
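# End-to-end analysis: summarize, translate the summary for sentiment scoring,
# run NER on the summary, and build the per-sentence sentiment table.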
def run(text):
    summ_ = summ_t5(text)
    summ_translated = get_translation(summ_)
    sent_ = sentiment_analysis(summ_translated)
    ner_ = ner(summ_)
    df_sentiment = sentiment_df(text)
    return summ_, sent_, ner_, df_sentiment

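# Gradio Blocks UI: text input plus examples on the left, summary/NER/sentiment
# on the right, and a sentence-level sentiment table underneath.
# For a quick check without the UI, something like `print(run(examples[0]))`
# should work once the models above have been downloaded.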
if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown("""<h1 style="text-align:center">Financial Statement Analysis - Indonesia</h1>""")

        gr.Markdown(
            """
            Creator: Wira Indra Kusuma
            """
            )
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input Text")
                analyze_button = gr.Button("Analyze")
                examples_bar = gr.Examples(examples=examples, inputs=input_text)
                
            with gr.Column():
                summ_output = gr.Textbox(label="Article Summary")
                ner_output = gr.HighlightedText(label="NER of Summary")
                sent_output = gr.Label(label="Sentiment of Summary")
                
        dataframe_component = gr.DataFrame(type="pandas",
                                           label="Dataframe",
                                           # max_rows expects an int in Gradio 3.x
                                           max_rows=20,
                                           overflow_row_behaviour='paginate',
                                           wrap=True)
                
                
        analyze_button.click(run, inputs=input_text, outputs=[summ_output, sent_output, ner_output, dataframe_component])
    demo.launch()