mdj1412 committed on
Commit ec39d42
1 Parent(s): 4f2c346

Upload app.py

Files changed (1)
  1. app.py +25 -14
app.py CHANGED
@@ -4,7 +4,6 @@ import fasttext
 from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 
-import random
 import numpy as np
 import pandas as pd
 import torch
@@ -27,7 +26,7 @@ class LanguageIdentification:
         self.model = fasttext.load_model(pretrained_lang_model)
 
     def predict_lang(self, text):
-        predictions = self.model.predict(text, k=2) # returns top 2 matching languages
+        predictions = self.model.predict(text, k=200) # returns top 200 matching languages
         return predictions
 
 LANGUAGE = LanguageIdentification()
@@ -46,10 +45,13 @@ def tokenized_data(tokenizer, inputs):
 
 examples = []
 df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
-random.seed(100)
-for i in range(15):
-    idx = random.randint(0, 50)
-    examples.extend([ ['Eng', df.iloc[idx, 0]], ['Kor', df.iloc[idx, 1]] ])
+np.random.seed(100)
+
+idx = np.random.choice(50, size=5, replace=False)
+eng_examples = [ ['Eng', df.iloc[i, 0]] for i in idx ]
+kor_examples = [ ['Kor', df.iloc[i, 1]] for i in idx ]
+examples = eng_examples + kor_examples
+
 
 
 eng_model_name = "roberta-base"
@@ -75,23 +77,31 @@ kor_model = AutoModelForSequenceClassification.from_pretrained(
 
 
 def builder(lang, text):
+    percent_kor, percent_eng = 0, 0
+
     if lang == 'Any':
         pred = LANGUAGE.predict_lang(text)
-        if pred[0][0] == '__label__ko':
-            lang = 'Kor'
-        else: # '__label__en'
+        if '__label__en' in pred[0]:
             lang = 'Eng'
-        # else:
-        #     raise NotImplementedError("It's neither Korean nor English.")
+            idx = pred[0].index('__label__en')
+            percent_eng = pred[1][idx]
+        if '__label__ko' in pred[0]:
+            lang = 'Kor'
+            idx = pred[0].index('__label__ko')
+            percent_kor = pred[1][idx]
+
     if lang == 'Eng':
         model = eng_model
         tokenizer = eng_tokenizer
+        if percent_eng==0: percent_eng=1
+
     if lang == 'Kor':
         model = kor_model
         tokenizer = kor_tokenizer
+        if percent_kor==0: percent_kor=1
 
+
     inputs = tokenized_data(tokenizer, text)
-
     model.eval()
     with torch.no_grad():
         logits = model(input_ids=inputs['input_ids'],
@@ -103,13 +113,13 @@ def builder(lang, text):
 
     prediction = torch.argmax(logits, axis=1)
 
-    return {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()}
+    return [ {'Kor': percent_kor, 'Eng': percent_eng, 'Other': 1-(percent_kor+percent_eng)}, {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()} ]
     return id2label[prediction.item()]
 
 
 
 demo = gr.Interface(builder, inputs=[gr.inputs.Dropdown(['Any', 'Eng', 'Kor']), "text"],
-                    outputs=gr.Label(num_top_classes=2, label='Result', color='CadetBlue'),
+                    outputs=[ gr.Label(num_top_classes=3, label='Lang'), gr.Label(num_top_classes=2, label='Result') ],
                     # outputs='label',
                     title=title, description=description, examples=examples)
 
@@ -119,6 +129,7 @@ demo = gr.Interface(builder, inputs=[gr.inputs.Dropdown(['Any', 'Eng', 'Kor']),
                     # allow_flagging="auto",
                     # description=description, examples=examples)
 
+
 if __name__ == "__main__":
     # print(examples)
     demo.launch()
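
For reference, a minimal sketch (not part of the commit) of how the new probability lookup in builder() reads fastText's predict() output; the labels and scores below are invented sample values:

# predict(text, k=...) returns a tuple of label strings plus matching probabilities.
pred = (('__label__en', '__label__ko', '__label__ja'),
        [0.87, 0.09, 0.01])

percent_kor, percent_eng = 0, 0
if '__label__en' in pred[0]:
    percent_eng = pred[1][pred[0].index('__label__en')]   # -> 0.87
if '__label__ko' in pred[0]:
    percent_kor = pred[1][pred[0].index('__label__ko')]   # -> 0.09

# builder() then returns two dicts, one per gr.Label output:
# {'Kor': 0.09, 'Eng': 0.87, 'Other': 0.04} for 'Lang', and the id2label scores for 'Result'.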