malte.ostendorff@telekom.de committed on
Commit
3c258f1
0 Parent(s):
Files changed (6) hide show
  1. .gitignore +176 -0
  2. README.md +22 -0
  3. app.py +235 -0
  4. languages.py +188 -0
  5. packages.txt +1 -0
  6. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ # End of https://www.toptal.com/developers/gitignore/api/python
README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Seed Crawl Annotator
3
+ emoji: 🐨
4
+ colorFrom: red
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.6.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Annotate Web Languages
14
+
15
+ ## Usage
16
+
17
+ ```bash
18
+ # Run the Gradio app
19
+ gradio app.py # auto reload
20
+ python app.py # static
21
+
22
+ ```
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ import time
6
+ import gradio as gr
7
+ from selenium import webdriver
8
+ from selenium.common.exceptions import WebDriverException
9
+ from PIL import Image
10
+ from io import BytesIO
11
+ import base64
12
+
13
+ import trafilatura
14
+
15
+ from huggingface_hub import whoami
16
+
17
+ from languages import ISO_CODE_TO_LANGUAGE_NAME
18
+
19
# Offline mode: skip the real browser and serve placeholder data instead.
# Environment values are always strings, so a plain truthiness check would
# treat OFFLINE=0 or OFFLINE=false as "enabled". Any value other than an
# explicit "off" spelling (or an unset/empty variable) turns offline mode on.
OFFLINE = os.environ.get("OFFLINE", "").strip().lower() not in {"", "0", "false", "no", "off"}
20
+
21
def pil_image_to_base64(image):
    """Serialize an image as a PNG ``data:`` URI.

    The image is written to an in-memory buffer through ``image.save`` and
    the resulting bytes are base64-encoded, so the returned string can be
    used directly as the ``src`` attribute of an HTML ``<img>`` tag.
    """
    with BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    return "data:image/png;base64," + encoded
33
+
34
def fetch_screenshot_and_text_from_url(url):
    """Render *url* and return its screenshot markup and extracted text.

    When the module-level ``OFFLINE`` flag is truthy no browser is started;
    a blank placeholder image and a dummy text are produced instead.
    Otherwise a headless Chrome session loads the page, the window is resized
    to the page's scroll height so the screenshot covers the whole page, and
    the text is extracted from the page source with trafilatura.

    Args:
        url: Address of the web page to load.

    Returns:
        A ``(screenshot_html, text)`` tuple. ``screenshot_html`` is a
        scrollable ``<div>`` embedding the screenshot as a base64 PNG image.
        ``text`` is the extracted text, or ``""`` when the browser failed or
        no text could be extracted.
    """
    screen_width = 1080
    height = 350
    text = ""

    if OFFLINE:
        screenshot = Image.new('RGB', (350, height))
        text = f"Some dummy text for {url} (offline mode enabled)"
    else:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        # Bind the name before the ``try`` so the ``finally`` clause can test
        # it even when starting Chrome itself raises.
        driver = None
        try:
            driver = webdriver.Chrome(options=options)
            driver.get(url)
            driver.implicitly_wait(10)

            # Grab the HTML before resizing the window for the screenshot.
            html_str = driver.page_source

            # Ask the page for its full rendered height and grow the window
            # to it (with a floor of 900px) so the screenshot is not cropped.
            scroll_height = driver.execute_script("return document.body.scrollHeight")
            driver.set_window_size(screen_width, max(scroll_height + 200, 900))

            raw_screenshot = driver.get_screenshot_as_png()
            screenshot = Image.open(BytesIO(raw_screenshot))

            # The extractor may yield None when it finds no usable text;
            # normalise that to an empty string so callers always get a str.
            text = trafilatura.extract(html_str) or ""
        except WebDriverException:
            # Fall back to a 1x1 placeholder; ``text`` stays empty so the
            # caller can detect the failure.
            screenshot = Image.new('RGB', (1, 1))
        finally:
            if driver:
                driver.quit()

    # Embed the base64-encoded image as an <img> tag inside a scrollable box.
    screenshot_html_str = f"""<div style="width: 100%; height: {height}px; overflow-y: scroll;"><img src="{pil_image_to_base64(screenshot)}" /></div>"""

    return screenshot_html_str, text
87
+
88
+
89
+ with gr.Blocks(fill_height=True) as demo:
90
+
91
+ gr.Markdown(
92
+ """
93
+ # Seed Crawl Annotator
94
+ """)
95
+
96
+ profile_state = gr.State([])
97
+ gr.LoginButton()
98
+
99
+
100
+ with gr.Column(visible=False) as wrapper_col:
101
def handle_login(profile: gr.OAuthProfile | None) -> dict:
    """Show or hide the annotation UI depending on the login state.

    Args:
        profile: OAuth profile of the visitor, or ``None`` when nobody is
            logged in.

    Returns:
        Updates for ``profile_state`` (the username, or ``None``) and for
        ``wrapper_col`` (visible only when a profile is present).
    """
    # Guard clause: without a profile, warn and keep the UI hidden.
    if not profile:
        gr.Warning("You need to login to use this app.")
        return {
            profile_state: None,
            wrapper_col: gr.update(visible=False),
        }

    username = f"{profile.username}"
    gr.Info(f"Logged in as {profile.username}")
    return {
        profile_state: username,
        wrapper_col: gr.update(visible=True),
    }
114
+
115
+ demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col])
116
+
117
+ url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
118
+
119
+ with gr.Row():
120
+ set_random_btn = gr.Button("Set Random URL", variant="secondary", interactive=True)
121
+
122
+ load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)
123
+
124
+ with gr.Row():
125
+ extracted_text = gr.Textbox(label="Extracted text", max_lines=15, lines=15, visible=False, placeholder="Click on `Load URL` to fetch Web page's text content.")
126
+
127
+ screenshot_scrollable = gr.HTML(visible=False)
128
+
129
+ with gr.Column(visible=False) as output_col:
130
+ with gr.Row():
131
+ language_codes = gr.Dropdown(
132
+ [("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
133
+ label="Language codes",
134
+ multiselect=True,
135
+ # allow_custom_value=True,
136
+ )
137
+ categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")
138
+
139
+ with gr.Row():
140
+ do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
141
+ dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")
142
+ # random_subpage_btn = gr.Button("🔁 Load Another Subpage", variant="secondary")
143
+
144
+
145
def set_random_url():
    """Return one of the built-in example URLs, chosen at random."""
    return random.choice((
        "http://example.com",
        "https://wikipedia.org/",
        "https://occiglot.eu",
        "https://ostendorff.org",
        "https://fr.wikipedia.org/",
        "https://amazon.com/",
    ))
156
+
157
+ set_random_btn.click(fn=set_random_url, outputs=url_field)
158
+
159
+
160
def load_url(url):
    """Fetch *url* and reveal its screenshot, text and the annotation form.

    Args:
        url: Address taken from the URL field.

    Returns:
        A mapping from output components to their updates: the screenshot
        and extracted text are filled in and made visible, the annotation
        column is shown, and the language and category selections are reset.

    Raises:
        gr.Error: If no screenshot markup or no text could be obtained.
    """
    screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)

    if not screenshot_html_str or not text:
        # gr.Error is an exception class: it only reaches the user when it is
        # raised, and raising also avoids returning None for five outputs.
        raise gr.Error("Could not fetch data for url")

    return {
        screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
        extracted_text: gr.update(value=text, visible=True),
        output_col: gr.update(visible=True),
        # Replace any previous selection with the "unknown" entry.
        language_codes: "unknown",
        categories: gr.update(value=None),
    }
174
+
175
+ load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")
176
+
177
def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
    """Acknowledge an annotation and reset the form for the next URL.

    NOTE(review): nothing in this function stores ``url``, ``language_codes``,
    ``categories`` or ``do_crawl`` yet; they are accepted but unused.
    TODO: confirm where the feedback is supposed to be persisted.

    Args:
        profile_state: Username of the logged-in annotator; falsy when no
            user is authenticated.
        url: The annotated URL.
        language_codes: Language codes selected for the URL.
        categories: Categories selected for the URL.
        do_crawl: Whether the annotator wants the URL to be crawled.

    Returns:
        A mapping from output components to their updates: the URL field is
        cleared and the annotation column, extracted text and screenshot are
        emptied and hidden.

    Raises:
        gr.Error: If ``profile_state`` is falsy, i.e. nobody is logged in.
    """
    if not profile_state:
        # gr.Error must be raised to be shown; raising also keeps the form
        # intact so the annotator's input is not discarded.
        raise gr.Error("Feedback could not be saved")

    gr.Info("Thanks for your feedback")

    return {
        url_field: "",
        output_col: gr.update(visible=False),
        extracted_text: gr.update(value=None, visible=False),
        screenshot_scrollable: gr.update(value="", visible=False),
    }
192
+
193
+ # def do_crawl(profile_state, url, language_codes, categories):
194
+ # return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=True)
195
+
196
+ # def dont_crawl(profile_state, url, language_codes, categories):
197
+ # return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=False)
198
+
199
+
200
+ do_crawl_btn.click(
201
+ fn=do_crawl,
202
+ inputs=[profile_state, url_field, language_codes, categories],
203
+ outputs=[
204
+ url_field,
205
+ output_col,
206
+ extracted_text,
207
+ screenshot_scrollable
208
+ ],
209
+ api_name="do_crawl",
210
+ )
211
# "Don't Crawl" shares the do_crawl handler but passes do_crawl=False and is
# exposed under its own API name, so the two buttons are distinguishable.
dont_crawl_btn.click(
    fn=lambda profile, url, codes, cats: do_crawl(profile, url, codes, cats, do_crawl=False),
    inputs=[profile_state, url_field, language_codes, categories],
    outputs=[
        url_field,
        output_col,
        extracted_text,
        screenshot_scrollable,
    ],
    api_name="dont_crawl",
)
222
+
223
+ # dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
224
+
225
+ # def random_subpage(url):
226
+ # new_url = "http://example.com"
227
+
228
+ # return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
229
+
230
+ # random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")
231
+
232
+
233
+
234
+ if __name__ == "__main__":
235
+ demo.launch()
languages.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Taken from:
2
+ # https://gist.github.com/jrnk/8eb57b065ea0b098d571
3
# Maps two-letter language codes to their English display names. The values
# are shown to annotators in the language dropdown as "<code>: <name>".
ISO_CODE_TO_LANGUAGE_NAME = {
    "aa": "Afar",
    "ab": "Abkhazian",
    "ae": "Avestan",
    "af": "Afrikaans",
    "ak": "Akan",
    "am": "Amharic",
    "an": "Aragonese",
    "ar": "Arabic",
    "as": "Assamese",
    "av": "Avaric",
    "ay": "Aymara",
    "az": "Azerbaijani",
    "ba": "Bashkir",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bh": "Bihari languages",
    "bi": "Bislama",
    "bm": "Bambara",
    "bn": "Bengali",
    "bo": "Tibetan",
    "br": "Breton",
    "bs": "Bosnian",
    "ca": "Catalan; Valencian",
    "ce": "Chechen",
    "ch": "Chamorro",
    "co": "Corsican",
    "cr": "Cree",
    "cs": "Czech",
    "cu": "Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic",
    "cv": "Chuvash",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "dv": "Divehi; Dhivehi; Maldivian",
    "dz": "Dzongkha",
    "ee": "Ewe",
    "el": "Greek, Modern (1453-)",
    "en": "English",
    "eo": "Esperanto",
    "es": "Spanish; Castilian",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "ff": "Fulah",
    "fi": "Finnish",
    "fj": "Fijian",
    "fo": "Faroese",
    "fr": "French",
    "fy": "Western Frisian",
    "ga": "Irish",
    # Fixed typo in the upstream list ("Scomttish").
    "gd": "Gaelic; Scottish Gaelic",
    "gl": "Galician",
    "gn": "Guarani",
    "gu": "Gujarati",
    "gv": "Manx",
    "ha": "Hausa",
    "he": "Hebrew",
    "hi": "Hindi",
    "ho": "Hiri Motu",
    "hr": "Croatian",
    "ht": "Haitian; Haitian Creole",
    "hu": "Hungarian",
    "hy": "Armenian",
    "hz": "Herero",
    "ia": "Interlingua (International Auxiliary Language Association)",
    "id": "Indonesian",
    "ie": "Interlingue; Occidental",
    "ig": "Igbo",
    "ii": "Sichuan Yi; Nuosu",
    "ik": "Inupiaq",
    "io": "Ido",
    "is": "Icelandic",
    "it": "Italian",
    "iu": "Inuktitut",
    "ja": "Japanese",
    "jv": "Javanese",
    "ka": "Georgian",
    "kg": "Kongo",
    "ki": "Kikuyu; Gikuyu",
    "kj": "Kuanyama; Kwanyama",
    "kk": "Kazakh",
    "kl": "Kalaallisut; Greenlandic",
    "km": "Central Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "kr": "Kanuri",
    "ks": "Kashmiri",
    "ku": "Kurdish",
    "kv": "Komi",
    "kw": "Cornish",
    "ky": "Kirghiz; Kyrgyz",
    "la": "Latin",
    "lb": "Luxembourgish; Letzeburgesch",
    "lg": "Ganda",
    "li": "Limburgan; Limburger; Limburgish",
    "ln": "Lingala",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lu": "Luba-Katanga",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mh": "Marshallese",
    "mi": "Maori",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "ms": "Malay",
    "mt": "Maltese",
    "my": "Burmese",
    "na": "Nauru",
    "nb": "Bokmål, Norwegian; Norwegian Bokmål",
    "nd": "Ndebele, North; North Ndebele",
    "ne": "Nepali",
    "ng": "Ndonga",
    "nl": "Dutch; Flemish",
    "nn": "Norwegian Nynorsk; Nynorsk, Norwegian",
    "no": "Norwegian",
    "nr": "Ndebele, South; South Ndebele",
    "nv": "Navajo; Navaho",
    "ny": "Chichewa; Chewa; Nyanja",
    "oc": "Occitan (post 1500)",
    "oj": "Ojibwa",
    "om": "Oromo",
    "or": "Oriya",
    "os": "Ossetian; Ossetic",
    "pa": "Panjabi; Punjabi",
    "pi": "Pali",
    "pl": "Polish",
    "ps": "Pushto; Pashto",
    "pt": "Portuguese",
    "qu": "Quechua",
    "rm": "Romansh",
    "rn": "Rundi",
    "ro": "Romanian; Moldavian; Moldovan",
    "ru": "Russian",
    "rw": "Kinyarwanda",
    "sa": "Sanskrit",
    "sc": "Sardinian",
    "sd": "Sindhi",
    "se": "Northern Sami",
    "sg": "Sango",
    "si": "Sinhala; Sinhalese",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sm": "Samoan",
    "sn": "Shona",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "ss": "Swati",
    "st": "Sotho, Southern",
    "su": "Sundanese",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "tg": "Tajik",
    "th": "Thai",
    "ti": "Tigrinya",
    "tk": "Turkmen",
    "tl": "Tagalog",
    "tn": "Tswana",
    "to": "Tonga (Tonga Islands)",
    "tr": "Turkish",
    "ts": "Tsonga",
    "tt": "Tatar",
    "tw": "Twi",
    "ty": "Tahitian",
    "ug": "Uighur; Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "ve": "Venda",
    "vi": "Vietnamese",
    "vo": "Volapük",
    "wa": "Walloon",
    "wo": "Wolof",
    "xh": "Xhosa",
    "yi": "Yiddish",
    "yo": "Yoruba",
    "za": "Zhuang; Chuang",
    "zh": "Chinese",
    "zu": "Zulu",
}
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ chromium-driver
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ selenium >=4.0.0, < 5.0.0
2
+ gradio>=3.40.1
3
+ Pillow>=8.3.1,<9.0
4
+ trafilatura
5
+ gradio[oauth]