File size: 7,924 Bytes
2678b8c
 
3c7a691
2678b8c
f59c7b5
c978338
2678b8c
 
 
 
 
 
 
 
 
 
 
 
 
ff66157
2678b8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f59c7b5
2678b8c
 
 
f59c7b5
88abc31
 
 
8507606
 
88abc31
8507606
88abc31
 
2678b8c
 
c8cbb2b
2678b8c
3c7a691
92a0f65
 
 
 
2678b8c
 
c978338
92a0f65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635f231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2678b8c
635f231
 
 
 
c978338
635f231
 
7c1e17d
635f231
 
 
2678b8c
635f231
 
3c7a691
 
635f231
 
c978338
88abc31
635f231
 
 
 
 
 
88abc31
92a0f65
 
 
 
 
 
 
8507606
92a0f65
 
 
74aeece
92a0f65
 
 
74aeece
02f5728
92a0f65
 
2678b8c
c978338
635f231
 
92a0f65
 
 
 
 
635f231
 
 
 
 
88abc31
2678b8c
c978338
f59c7b5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import os
from collections import OrderedDict
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import gradio as gr
from shitsu import ShitsuScorer
from huggingface_hub import hf_hub_download

class OptimizedShitsuScorer:
    """Least-recently-used cache of per-language ``ShitsuScorer`` instances.

    Scorers are created on first use and stored in an ``OrderedDict`` ordered
    from least to most recently used, so the oldest entries are the first to
    be evicted once the cache reaches ``max_models``.
    """

    def __init__(self, max_models=2):
        """Create an empty cache.

        Args:
            max_models: Upper bound on the number of scorers kept loaded at
                the same time.
        """
        # language code -> loaded scorer, least recently used first.
        self.scorers = OrderedDict()
        self.max_models = max_models
        # Language passed to the most recent successful get_scorer() call.
        self.current_language = None

    def _make_room(self):
        """Evict least-recently-used scorers until one more scorer fits.

        Looping (rather than evicting exactly one entry) keeps the cache
        within ``max_models`` even if the limit was lowered after scorers were
        loaded, and the emptiness guard avoids ``KeyError`` from ``popitem``
        when ``max_models`` is zero or negative.
        """
        while self.scorers and len(self.scorers) >= self.max_models:
            self.scorers.popitem(last=False)

    def get_scorer(self, language):
        """Return the scorer for ``language``, loading it if necessary.

        A cache hit marks the language as most recently used. A cache miss
        shows a Gradio warning, evicts old scorers to make room, and then
        constructs ``ShitsuScorer(language)``.

        Args:
            language: Language code identifying the scorer to use.

        Returns:
            The cached or newly created scorer for ``language``.
        """
        if language in self.scorers:
            # Move the accessed language to the end (most recently used).
            self.scorers.move_to_end(language)
        else:
            gr.Warning("A new language is being loaded in memory, this could take a while...")
            # Evict before loading so the old and new models never coexist
            # beyond the configured limit.
            self._make_room()
            self.scorers[language] = ShitsuScorer(language)

        self.current_language = language
        return self.scorers[language]

    def score(self, text, language):
        """Score a single text with the scorer for ``language``.

        The text is submitted as a one-element batch and the first (only)
        result of that batch is returned.
        """
        scorer = self.get_scorer(language)
        return scorer.score([text])[0]

    def get_loaded_languages(self):
        """Return the cached language codes, least recently used first."""
        return list(self.scorers.keys())

# Module-wide scorer cache used by get_score; keeps at most two language models loaded.
optimized_scorer = OptimizedShitsuScorer(max_models=2)
# Preload the English model at import time: 'en' is the dropdown's default
# choice, so the first request with the default language finds it cached.
optimized_scorer.get_scorer('en')

# Sample passages passed to gr.Examples so users can fill the input box with one click.
example_inputs = [
    "The Beatles were a popular band in the 1960s. They released many hit songs.",
    "Chocolate is a type of sweet food that people often eat for dessert.",
    (
        "I'm thinking of going to the beach this weekend. "
        "The weather is supposed to be great!"
    ),
    "Can you believe it's already September? This year is flying by!",
    (
        "Quantum mechanics is a fundamental theory in physics that provides a "
        "description of the physical properties of nature at the scale of atoms "
        "and subatomic particles."
    ),
]

def get_score(user_text, language):
    """Score ``user_text`` and describe which language models are loaded.

    Args:
        user_text: Text entered by the user.
        language: Language code selected (or typed) in the dropdown.

    Returns:
        A 2-tuple of:
        * an HTML snippet showing the score formatted to four significant
          digits inside a ``nice-box`` div, and
        * a list of ``(text, label)`` pairs for ``gr.HighlightedText`` naming
          every currently loaded language, separated by unlabeled spaces.
    """
    score = optimized_scorer.score(user_text, language)
    formatted_score = f"{score:.4g}"

    display_loaded_languages = [('Currently loaded languages: \n', None)]
    # Use a distinct loop name so the `language` argument is not shadowed.
    for code in optimized_scorer.get_loaded_languages():
        # The dropdown allows custom values, so a loaded code may be missing
        # from language_map; fall back to the raw code instead of raising
        # KeyError after the score has already been computed.
        display_loaded_languages.append((language_map.get(code, code), code))
        display_loaded_languages.append((' ', None))
    return f'<div class="nice-box"> Score: {formatted_score}</div>', display_loaded_languages

# Display name for each supported language code. Insertion order is the order
# in which the codes are offered in the dropdown.
language_map = {
    "am": "Amharic",
    "ar": "Arabic",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "cs": "Czech",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "gu": "Gujarati",
    "ha": "Hausa",
    "hi": "Hindi",
    "hu": "Hungarian",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "jv": "Javanese",
    "kn": "Kannada",
    "ko": "Korean",
    "lt": "Lithuanian",
    "mr": "Marathi",
    "nl": "Dutch",
    "no": "Norwegian",
    "yo": "Yoruba",
    "zh": "Chinese",
}

# Dropdown choices: the language codes, in the same order as language_map.
language_options = list(language_map)

# Highlight colour for each language label shown in the "currently loaded
# languages" HighlightedText component. Values must be colours a browser can
# resolve: CSS colour names or hex codes. "saffron" is not a CSS named colour,
# so it is spelled as a hex code.
color_map = {
    "am": "green",     # Ethiopia's flag has green
    "ar": "black",     # Many Arab flags feature black
    "bg": "white",     # Bulgaria's flag has white
    "bn": "green",     # Bangladesh's flag is green and red
    "cs": "blue",      # Czech Republic's flag has blue
    "da": "red",       # Denmark's flag is red and white
    "de": "black",     # Germany's flag has black
    "el": "blue",      # Greece's flag has blue
    "en": "red",       # UK/US flags have red
    "es": "yellow",    # Spain's flag has yellow
    "fa": "green",     # Iran's flag has green
    "fi": "blue",      # Finland's flag is blue and white
    "fr": "blue",      # France's flag has blue
    "gu": "#FF9933",   # Saffron, as on India's flag (Gujarat)
    "ha": "green",     # Nigeria's flag has green
    "hi": "orange",    # India's flag has orange
    "hu": "red",       # Hungary's flag has red
    "id": "red",       # Indonesia's flag is red and white
    "it": "green",     # Italy's flag has green
    "ja": "red",       # Japan's flag has a red sun
    "jv": "brown",     # Associated with traditional Javanese culture
    "kn": "yellow",    # Karnataka (Indian state) flag has yellow
    "ko": "blue",      # South Korea's flag has blue
    "lt": "yellow",    # Lithuania's flag has yellow
    "mr": "#FF9933",   # Saffron; Marathi culture often uses saffron
    "nl": "orange",    # The Netherlands is often associated with orange
    "no": "red",       # Norway's flag is red, white, and blue
    "yo": "green",     # Nigeria's flag for Yoruba-speaking people
    "zh": "red",       # China's flag is red
}


# Custom stylesheet handed to gr.Blocks. In the layout below, `#title` is
# attached through elem_id="title" and `.nice-box` through the score panel's
# HTML; the remaining selectors (#gen_btn, #gallery, .card_internal, .styler)
# are not referenced by any component visible in this file.
css = '''
#gen_btn{height: 100%}
#title{text-align: center}
#title h1{font-size: 3em; display:inline-flex; align-items:center}
#title img{width: 100px; margin-right: 0.5em}
#gallery .grid-wrap{height: 10vh}
.card_internal{display: flex;height: 100px;margin-top: .5em}
.card_internal img{margin-right: 1em}
.styler{--form-gap-width: 0px !important}
.nice-box {
    border: 2px solid #007bff;
    border-radius: 10px;
    padding: 15px;
    background-color: #f8f9fa;
    font-size: 18px;
    text-align: center;
    min-height: 60px;
    display: flex;
    align-items: center;
    justify-content: center;
}
'''

# Gradio "Soft" theme with a blue primary hue and a sky secondary hue.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="sky",
)

# Build the demo UI. Components are declared inside the Blocks context in the
# order they appear in this block.
with gr.Blocks(theme=theme, css=css) as demo:
    # Page header; elem_id="title" ties it to the #title rules in `css`.
    title = gr.HTML(
        """<h1><img src="https://huggingface.co/spaces/Dusduo/shitsu-text-scorer-demo/resolve/main/shitsu-logo.jpeg" alt="LightBlue"> Shitsu Text Scorer</h1>""",
        elem_id="title",
    )
    # Introductory description shown under the header.
    gr.Markdown(
    """This is a demo of [Shitsu text scorer](https://huggingface.co/lightblue/shitsu_text_scorer) for multiple languages, which scores text based on the amount of useful, textbook-like information in it.
    
    It outputs a score generally between 0 and 1 but can exceed both of these bounds as it is a regressor.
    
    ⚠️ By default, the English version of the scorer is preloaded in memory. When using another language for the first time, beware extensive loading time.
    """
    )
    # Input row: the text box, plus a column holding the submit button and the score panel.
    with gr.Row():
        user_text = gr.Textbox(label='Input text', placeholder='Type something here...')
        with gr.Column(scale=0):
            submit_btn = gr.Button("Submit")
            # Placeholder panel; get_score's first return value replaces it on submit.
            score = gr.HTML(
                value='<div class="nice-box"> Score...  </div>',
                label="Output"
            )
    
    # Language row: the language selector, plus a read-out of the models currently cached.
    with gr.Row():
        # allow_custom_value=True lets users type a code that is not in language_options.
        language_choice = gr.Dropdown(
                choices=language_options,
                label="Choose a language",
                info="Type to search",
                value="en",
                allow_custom_value=True,
                scale=3
            )
        
        # Initial value lists English, matching the model preloaded at import time;
        # get_score's second return value refreshes it on every submit.
        loaded_languages = gr.HighlightedText(
            value = [('Currently loaded languages: \n', None), ('English', 'en')],
            label="",
            combine_adjacent=True,
            show_legend=False, #True,
            color_map=color_map,
            scale=1)
    
    #loaded_languages = gr.Markdown("Currently loaded languages: en")
    # Example passages that fill the input text box.
    gr.Examples(examples=example_inputs, inputs=user_text)
    
    # Footer with background information about the model.
    gr.Markdown(
    """
    ---
    
    ## 🛈 **Additional Information**
    This model can also be found on [Github](https://github.com/lightblue-tech/shitsu) and has its own pip installable package.
    
    This model is based on fasttext embeddings, meaning that it can be used on large amounts of data with limited compute quickly.

    This scorer can be used to filter useful information from large text corpora in many languages.  
    """
    )
    
    # Wire the button: get_score(text, language) -> (score HTML, loaded-language highlights).
    submit_btn.click(get_score, inputs=[user_text, language_choice], outputs=[score, loaded_languages])
    
# Start the Gradio app. This runs at module level; there is no __main__ guard.
demo.launch()