w11wo committed
Commit 76d290c
1 Parent(s): fa79515

Initial Prototype

Files changed (7)
  1. README.md +3 -3
  2. app.py +1 -1
  3. get_documents.ipynb +0 -92
  4. model.py +18 -50
  5. requirements.txt +0 -1
  6. sample.json +0 -0
  7. utils.py +3 -132
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: LazarusNLP
- emoji: 👀
+ title: NusaBERT
+ emoji: 🇮🇩
  colorFrom: red
- colorTo: purple
+ colorTo: white
  sdk: gradio
  sdk_version: 4.18.0
  app_file: app.py
app.py CHANGED
@@ -11,6 +11,6 @@ if __name__ == "__main__":
 
  # interface = gr.TabbedInterface(interfaces, titles, theme="soft")
  with gr.Blocks(theme="soft") as demo:
- gr.Markdown("# LazarusNLP Indonesian NLP Demo")
+ gr.Markdown("# NusaBERT Demo")
  gr.TabbedInterface(interfaces, titles, theme="soft")
  demo.launch()
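The hunk above only shows the `__main__` block, so here is a minimal sketch of how the rest of app.py presumably assembles `interfaces` and `titles` from the `models` dict defined in model.py (see the model.py diff below). The `from model import models` import and the keyword names used in the loop are assumptions based on that dict's keys, not part of this commit.

```python
# Hypothetical reconstruction of app.py around the committed hunk.
import gradio as gr
from model import models  # the dict defined in model.py (assumed import)

if __name__ == "__main__":
    titles = list(models.keys())
    # Each entry's "interface" factory builds one Gradio tab from its config.
    interfaces = [
        cfg["interface"](
            pipe=cfg["pipe"],
            examples=cfg["examples"],
            output_label=cfg["output_label"],
            title=cfg["title"],
            desc=cfg["desc"],
        )
        for cfg in models.values()
    ]

    # interface = gr.TabbedInterface(interfaces, titles, theme="soft")
    with gr.Blocks(theme="soft") as demo:
        gr.Markdown("# NusaBERT Demo")
        gr.TabbedInterface(interfaces, titles, theme="soft")
    demo.launch()
```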
get_documents.ipynb DELETED
@@ -1,92 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/opt/miniconda3/envs/hf/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
- "source": [
- "from datasets import load_dataset\n",
- "dataset = load_dataset('LazarusNLP/wikipedia_id_20230520')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "train = pd.DataFrame(dataset['train'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "first_element = train.groupby('id').first()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample = first_element.sample(n=10000).reset_index()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample_dict = sample.to_dict(orient=\"list\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "with open('sample.json', 'w') as f:\n",
- " json.dump(sample_dict, f)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "hf",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
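For reference, the deleted notebook's cells boil down to the short script below (a restatement of the code above, not a file added by this commit): it samples 10,000 unique articles from LazarusNLP/wikipedia_id_20230520 and writes them to the sample.json corpus that is also removed in this commit.

```python
# Condensed version of the deleted get_documents.ipynb cells.
import json

import pandas as pd
from datasets import load_dataset

# Indonesian Wikipedia dump previously used by the "Document Search" tab.
dataset = load_dataset("LazarusNLP/wikipedia_id_20230520")
train = pd.DataFrame(dataset["train"])

# Keep one row per article id, then draw a random sample of 10,000 articles.
first_element = train.groupby("id").first()
sample = first_element.sample(n=10000).reset_index()

# Column-oriented dict, dumped to the (now deleted) sample.json corpus file.
sample_dict = sample.to_dict(orient="list")
with open("sample.json", "w") as f:
    json.dump(sample_dict, f)
```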
model.py CHANGED
@@ -1,15 +1,12 @@
- from utils import (
- text_analysis_interface,
- token_classification_interface,
- search_interface,
- text_interface,
- SentenceSimilarity,
- )
+ from utils import text_analysis_interface, token_classification_interface, text_interface
  from transformers import pipeline
+ import os
+
+ auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
 
  models = {
  "Text Analysis": {
- "title": "# Text Analysis",
+ "title": "Text Analysis",
  "examples": [
  "Allianz adalah persuhaan asuransi yang di dirikan pada tanggal February 5, 1890 di Berlin, Jerman.",
  "Restaurant ini sangat tidak enak. Enakan Pizza Hut.",
@@ -21,32 +18,15 @@ models = {
  "POS Tagging",
  "NER Tagging",
  ],
- "desc": "A tool to showcase the full capabilities of text analysis LazarusNLP has to offer.",
+ "desc": "A tool to showcase the full capabilities of text analysis NusaBERT fine-tuning has to offer.",
  "interface": text_analysis_interface,
  "pipe": [
- pipeline(
- "text-classification",
- model="w11wo/indonesian-roberta-base-sentiment-classifier",
- tokenizer="w11wo/indonesian-roberta-base-sentiment-classifier",
- ),
- pipeline(
- "text-classification",
- model="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
- tokenizer="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
- ),
- pipeline(model="w11wo/indonesian-roberta-base-posp-tagger"),
- pipeline(model="w11wo/indonesian-roberta-base-nerp-tagger"),
+ pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
+ pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
+ pipeline(model="LazarusNLP/NusaBERT-base-POSP", auth_token=auth_token),
+ pipeline(model="LazarusNLP/NusaBERT-base-NERP", auth_token=auth_token),
  ],
  },
- "Document Search": {
- "title": "# Document Search 🔍",
- "examples": ["Stadion bola Indonesia.", "Rusia dan Serbia", "Politik."],
- "output_label": "Top 5 related documents",
- "desc": "A semantic search tool to get the most related documents 📖 based on user's query.",
- "interface": search_interface,
- "pipe": SentenceSimilarity("LazarusNLP/all-indobert-base-v2", "sample.json"),
- "top_k": 5,
- },
  "Sentiment Analysis": {
  "title": "Sentiment Analysis",
  "examples": [
@@ -55,13 +35,9 @@
  "keren sekali aplikasi ini bisa menunjukan data diri secara detail, sangat di rekomendasikan untuk di pakai.",
  ],
  "output_label": "Sentiment Analysis",
- "desc": "A sentiment-text-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's SmSA dataset consisting of Indonesian comments and reviews.",
+ "desc": "A sentiment-text-classification model based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's SmSA dataset consisting of Indonesian comments and reviews.",
  "interface": text_interface,
- "pipe": pipeline(
- "text-classification",
- model="w11wo/indonesian-roberta-base-sentiment-classifier",
- tokenizer="w11wo/indonesian-roberta-base-sentiment-classifier",
- ),
+ "pipe": pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
  },
  "Emotion Detection": {
  "title": "Emotion Classifier",
@@ -71,18 +47,10 @@
  "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
  ],
  "output_label": "Emotion Classifier",
- "desc": "An emotion classifier based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's EmoT dataset",
+ "desc": "An emotion classifier based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's EmoT dataset",
  "interface": text_interface,
- "pipe": pipeline(
- "text-classification",
- model="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
- tokenizer="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
- ),
+ "pipe": pipeline(model="LazarusNLP/NusaBERT-base-EmoT", auth_token=auth_token),
  },
- # "summarization": {
- # "examples": [],
- # "desc": "This model is a fine-tuned version of LazarusNLP/IndoNanoT5-base on the indonlg dataset.",
- # },
  "POS Tagging": {
  "title": "POS Tagging",
  "examples": [
@@ -91,9 +59,9 @@
  "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
  ],
  "output_label": "POS Tagging",
- "desc": "A part-of-speech token-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's POSP dataset consisting of tag-labelled news.",
+ "desc": "A part-of-speech token-classification model based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's POSP dataset consisting of tag-labelled news.",
  "interface": token_classification_interface,
- "pipe": pipeline(model="w11wo/indonesian-roberta-base-posp-tagger"),
+ "pipe": pipeline(model="LazarusNLP/NusaBERT-base-POSP", auth_token=auth_token),
  },
  "NER Tagging": {
  "title": "NER Tagging",
@@ -103,8 +71,8 @@
  "Jakarta, Indonesia akan menjadi bagian salah satu tempat yang akan didatangi.",
  ],
  "output_label": "NER Tagging",
- "desc": "A NER Tagging token-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's NERP dataset consisting of tag-labelled news.",
+ "desc": "A NER Tagging token-classification model based on the BERT model. The model was originally the pre-trained NusaBERT Base model, which is then fine-tuned on indonlu's NERP dataset consisting of tag-labelled news.",
  "interface": token_classification_interface,
- "pipe": pipeline(model="w11wo/indonesian-roberta-base-nerp-tagger"),
+ "pipe": pipeline(model="LazarusNLP/NusaBERT-base-NERP", auth_token=auth_token),
  },
  }
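The new entries point at LazarusNLP/NusaBERT-base-* checkpoints and pass an `auth_token` read from the Space secret TOKEN_FROM_SECRET. Below is a rough standalone sketch (not part of the commit) of loading one of these pipelines outside the Space; the repo ID and secret name come from the diff, while `token=` is the keyword current transformers releases use for authentication, whereas the committed code passes `auth_token=`.

```python
# Minimal sketch: load one of the NusaBERT checkpoints listed above and tag a sentence.
# Assumes a Hugging Face access token is exported as TOKEN_FROM_SECRET.
import os

from transformers import pipeline

hf_token = os.environ.get("TOKEN_FROM_SECRET")

ner = pipeline(
    "token-classification",
    model="LazarusNLP/NusaBERT-base-NERP",
    token=hf_token,                  # authentication for a gated/private repo
    aggregation_strategy="simple",   # merge sub-word pieces into whole entities
)

print(ner("Jakarta, Indonesia akan menjadi bagian salah satu tempat yang akan didatangi."))
```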
requirements.txt CHANGED
@@ -1,4 +1,3 @@
  gradio==4.19.1
  scipy==1.12.0
- sentence_transformers==2.3.1
  transformers==4.37.2
sample.json DELETED
The diff for this file is too large to render. See raw diff
 
utils.py CHANGED
@@ -1,56 +1,11 @@
  import gradio as gr
  from functools import partial
  from transformers import pipeline, pipelines
- from sentence_transformers import SentenceTransformer, util
- import json
 
 
  ######################
  ##### INFERENCE ######
  ######################
- class SentenceSimilarity:
-
- def __init__(self, model: str, corpus_path: str):
- f = open(corpus_path)
- data = json.load(f)
- self.id, self.url, self.title, self.text = (
- data["id"],
- data["url"],
- data["title"],
- data["text"],
- )
- self.model = SentenceTransformer(model)
- self.corpus_embeddings = self.model.encode(self.text)
-
- def __call__(self, query: str, corpus: list[str], top_k: int = 5):
- query_embedding = self.model.encode(query)
- output = util.semantic_search(
- query_embedding, self.corpus_embeddings, top_k=top_k
- )
- return output[0]
-
-
- # Sentence Similarity
- def sentence_similarity(
- query: str,
- texts: list[str],
- titles: list[str],
- urls: list[str],
- pipe: SentenceSimilarity,
- top_k: int,
- ) -> list[str]:
- answer = pipe(query=query, corpus=texts, top_k=top_k)
- output = [
- f"""
- Cosine Similarity Score: {round(ans['score'], 3)}
- ## [{titles[ans['corpus_id']]} 🔗]({urls[ans['corpus_id']]})
- {texts[ans['corpus_id']]}
- """
- for ans in answer
- ]
- return output
-
-
  # Text Analysis
  def cls_inference(input: list[str], pipe: pipeline) -> dict:
  results = pipe(input, top_k=None)
@@ -77,77 +32,7 @@ def text_analysis(text, pipes: list[pipeline]):
  ######################
  ##### INTERFACE ######
  ######################
- def text_interface(
- pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str
- ):
- return gr.Interface(
- fn=partial(cls_inference, pipe=pipe),
- inputs=[
- gr.Textbox(lines=5, label="Input Text"),
- ],
- title=title,
- description=desc,
- outputs=[gr.Label(label=output_label)],
- examples=examples,
- allow_flagging="never",
- )
-
-
- def search_interface(
- pipe: SentenceSimilarity,
- examples: list[str],
- output_label: str,
- title: str,
- desc: str,
- top_k: int,
- ):
- with gr.Blocks() as sentence_similarity_interface:
- gr.Markdown(title)
- gr.Markdown(desc)
- with gr.Row():
- # input on the left
- with gr.Column():
- input_text = gr.Textbox(lines=5, label="Query")
- # display documents
- df = gr.DataFrame(
- [
- [id, f"<a href='{url}' target='_blank'>{title} 🔗</a>"]
- for id, title, url in zip(pipe.id, pipe.title, pipe.url)
- ],
- headers=["ID", "Title"],
- wrap=True,
- datatype=["markdown", "html"],
- interactive=False,
- height=300,
- )
- button = gr.Button("Search...")
- with gr.Column():
- # outputs top_k results in accordion format
- outputs = []
- for i in range(top_k):
- # open the first accordion
- with gr.Accordion(label=f"Document {i + 1}", open=i == 0) as a:
- output = gr.Markdown()
- outputs.append(output)
- gr.Examples(examples, inputs=[input_text], outputs=outputs)
- button.click(
- fn=partial(
- sentence_similarity,
- pipe=pipe,
- texts=pipe.text,
- titles=pipe.title,
- urls=pipe.url,
- top_k=top_k,
- ),
- inputs=[input_text],
- outputs=outputs,
- )
- return sentence_similarity_interface
-
-
- def token_classification_interface(
- pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str
- ):
+ def token_classification_interface(pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str):
  return gr.Interface(
  fn=partial(tagging, pipe=pipe),
  inputs=[
@@ -161,9 +46,7 @@ def token_classification_interface(
  )
 
 
- def text_analysis_interface(
- pipe: list, examples: list[str], output_label: str, title: str, desc: str
- ):
+ def text_analysis_interface(pipe: list, examples: list[str], output_label: str, title: str, desc: str):
  with gr.Blocks() as text_analysis_interface:
  gr.Markdown(title)
  gr.Markdown(desc)
@@ -172,9 +55,7 @@ def text_analysis_interface(
  outputs = [
  (
  gr.HighlightedText(label=label)
- if isinstance(
- p, pipelines.token_classification.TokenClassificationPipeline
- )
+ if isinstance(p, pipelines.token_classification.TokenClassificationPipeline)
  else gr.Label(label=label)
  )
  for label, p in zip(output_label, pipe)
@@ -191,13 +72,3 @@
  outputs=outputs,
  )
  return text_analysis_interface
-
-
- # Summary
- # summary_interface = gr.Interface.from_pipeline(
- # pipes["summarization"],
- # title="Summarization",
- # examples=details["summarization"]["examples"],
- # description=details["summarization"]["description"],
- # allow_flagging="never",
- # )
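The diff shows only the first line of `cls_inference`. A plausible completion is sketched below, assuming the helper converts the pipeline's per-label scores into the `{label: score}` mapping that `gr.Label` expects; the return statement is an assumption, not part of the committed file.

```python
# Hypothetical completion of cls_inference from utils.py (only the first line
# of the body appears in the diff). With top_k=None the classification pipeline
# returns every label with its score for the input text.
from transformers import pipeline


def cls_inference(input: list[str], pipe: pipeline) -> dict:
    results = pipe(input, top_k=None)
    # Assumes a single string arrives from gr.Textbox despite the list[str] hint,
    # so `results` is a list of {"label": ..., "score": ...} dicts.
    return {result["label"]: result["score"] for result in results}
```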