from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import gradio as gr

model_path = "jb2k/bert-base-multilingual-cased-language-detection"

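# Download (or load from the local cache) the classifier and its tokenizer.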
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

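# Map from the model's output class index to a human-readable language name.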
language_dict = {0: 'Arabic',
 1: 'Basque',
 2: 'Breton',
 3: 'Catalan',
 4: 'Chinese_China',
 5: 'Chinese_Hongkong',
 6: 'Chinese_Taiwan',
 7: 'Chuvash',
 8: 'Czech',
 9: 'Dhivehi',
 10: 'Dutch',
 11: 'English',
 12: 'Esperanto',
 13: 'Estonian',
 14: 'French',
 15: 'Frisian',
 16: 'Georgian',
 17: 'German',
 18: 'Greek',
 19: 'Hakha_Chin',
 20: 'Indonesian',
 21: 'Interlingua',
 22: 'Italian',
 23: 'Japanese',
 24: 'Kabyle',
 25: 'Kinyarwanda',
 26: 'Kyrgyz',
 27: 'Latvian',
 28: 'Maltese',
 29: 'Mongolian',
 30: 'Persian',
 31: 'Polish',
 32: 'Portuguese',
 33: 'Romanian',
 34: 'Romansh_Sursilvan',
 35: 'Russian',
 36: 'Sakha',
 37: 'Slovenian',
 38: 'Spanish',
 39: 'Swedish',
 40: 'Tamil',
 41: 'Tatar',
 42: 'Turkish',
 43: 'Ukrainian',
 44: 'Welsh'}

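# Example inputs shown in the demo UI (English, Russian, Spanish).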
examples = ['Transformers are really cool!', 'Трансформеры действительно классные!', '¡Los transformadores son realmente geniales!']

def inference(sentence):
  # Tokenize the input text and run it through the classifier.
  tokenized_sentence = tokenizer(sentence, return_tensors='pt')
  with torch.no_grad():
    output = model(**tokenized_sentence)
  # Convert the logits to probabilities over the 45 supported languages.
  predictions = torch.nn.functional.softmax(output.logits, dim=-1)
  # Pick the most likely class and map it to its language name.
  certainty, highest_value = torch.max(predictions, dim=-1)
  language = language_dict[highest_value.item()]
  return language
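
# Quick sanity check (illustrative; assumes the model weights load successfully):
#   inference('Transformers are really cool!')  # expected output: 'English'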

if __name__ == '__main__':
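  # Build the Gradio demo: a multi-line textbox in, a Label with the predicted language out.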
  interFace = gr.Interface(fn=inference,
                           inputs=gr.inputs.Textbox(placeholder="Enter text here", label="Text content", lines=5),
                           outputs=gr.outputs.Label(num_top_classes=6, label="Language of this text is"),
                           verbose=True,
                           examples=examples,
                           title="Language Detector",
                           description="Language detector with support for 45 languages. Created as part of the Hugging Face course community event.",
                           theme="grass")
  interFace.launch()