Spaces:
Running
Running
malte.ostendorff@telekom.de
committed on
Commit
•
3c258f1
0
Parent(s):
init
Browse files- .gitignore +176 -0
- README.md +22 -0
- app.py +235 -0
- languages.py +188 -0
- packages.txt +1 -0
- requirements.txt +5 -0
.gitignore
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Created by https://www.toptal.com/developers/gitignore/api/python
|
2 |
+
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
3 |
+
|
4 |
+
### Python ###
|
5 |
+
# Byte-compiled / optimized / DLL files
|
6 |
+
__pycache__/
|
7 |
+
*.py[cod]
|
8 |
+
*$py.class
|
9 |
+
|
10 |
+
# C extensions
|
11 |
+
*.so
|
12 |
+
|
13 |
+
# Distribution / packaging
|
14 |
+
.Python
|
15 |
+
build/
|
16 |
+
develop-eggs/
|
17 |
+
dist/
|
18 |
+
downloads/
|
19 |
+
eggs/
|
20 |
+
.eggs/
|
21 |
+
lib/
|
22 |
+
lib64/
|
23 |
+
parts/
|
24 |
+
sdist/
|
25 |
+
var/
|
26 |
+
wheels/
|
27 |
+
share/python-wheels/
|
28 |
+
*.egg-info/
|
29 |
+
.installed.cfg
|
30 |
+
*.egg
|
31 |
+
MANIFEST
|
32 |
+
|
33 |
+
# PyInstaller
|
34 |
+
# Usually these files are written by a python script from a template
|
35 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
36 |
+
*.manifest
|
37 |
+
*.spec
|
38 |
+
|
39 |
+
# Installer logs
|
40 |
+
pip-log.txt
|
41 |
+
pip-delete-this-directory.txt
|
42 |
+
|
43 |
+
# Unit test / coverage reports
|
44 |
+
htmlcov/
|
45 |
+
.tox/
|
46 |
+
.nox/
|
47 |
+
.coverage
|
48 |
+
.coverage.*
|
49 |
+
.cache
|
50 |
+
nosetests.xml
|
51 |
+
coverage.xml
|
52 |
+
*.cover
|
53 |
+
*.py,cover
|
54 |
+
.hypothesis/
|
55 |
+
.pytest_cache/
|
56 |
+
cover/
|
57 |
+
|
58 |
+
# Translations
|
59 |
+
*.mo
|
60 |
+
*.pot
|
61 |
+
|
62 |
+
# Django stuff:
|
63 |
+
*.log
|
64 |
+
local_settings.py
|
65 |
+
db.sqlite3
|
66 |
+
db.sqlite3-journal
|
67 |
+
|
68 |
+
# Flask stuff:
|
69 |
+
instance/
|
70 |
+
.webassets-cache
|
71 |
+
|
72 |
+
# Scrapy stuff:
|
73 |
+
.scrapy
|
74 |
+
|
75 |
+
# Sphinx documentation
|
76 |
+
docs/_build/
|
77 |
+
|
78 |
+
# PyBuilder
|
79 |
+
.pybuilder/
|
80 |
+
target/
|
81 |
+
|
82 |
+
# Jupyter Notebook
|
83 |
+
.ipynb_checkpoints
|
84 |
+
|
85 |
+
# IPython
|
86 |
+
profile_default/
|
87 |
+
ipython_config.py
|
88 |
+
|
89 |
+
# pyenv
|
90 |
+
# For a library or package, you might want to ignore these files since the code is
|
91 |
+
# intended to run in multiple environments; otherwise, check them in:
|
92 |
+
# .python-version
|
93 |
+
|
94 |
+
# pipenv
|
95 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
96 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
97 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
98 |
+
# install all needed dependencies.
|
99 |
+
#Pipfile.lock
|
100 |
+
|
101 |
+
# poetry
|
102 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
103 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
104 |
+
# commonly ignored for libraries.
|
105 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
106 |
+
#poetry.lock
|
107 |
+
|
108 |
+
# pdm
|
109 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
110 |
+
#pdm.lock
|
111 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
112 |
+
# in version control.
|
113 |
+
# https://pdm.fming.dev/#use-with-ide
|
114 |
+
.pdm.toml
|
115 |
+
|
116 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
117 |
+
__pypackages__/
|
118 |
+
|
119 |
+
# Celery stuff
|
120 |
+
celerybeat-schedule
|
121 |
+
celerybeat.pid
|
122 |
+
|
123 |
+
# SageMath parsed files
|
124 |
+
*.sage.py
|
125 |
+
|
126 |
+
# Environments
|
127 |
+
.env
|
128 |
+
.venv
|
129 |
+
env/
|
130 |
+
venv/
|
131 |
+
ENV/
|
132 |
+
env.bak/
|
133 |
+
venv.bak/
|
134 |
+
|
135 |
+
# Spyder project settings
|
136 |
+
.spyderproject
|
137 |
+
.spyproject
|
138 |
+
|
139 |
+
# Rope project settings
|
140 |
+
.ropeproject
|
141 |
+
|
142 |
+
# mkdocs documentation
|
143 |
+
/site
|
144 |
+
|
145 |
+
# mypy
|
146 |
+
.mypy_cache/
|
147 |
+
.dmypy.json
|
148 |
+
dmypy.json
|
149 |
+
|
150 |
+
# Pyre type checker
|
151 |
+
.pyre/
|
152 |
+
|
153 |
+
# pytype static type analyzer
|
154 |
+
.pytype/
|
155 |
+
|
156 |
+
# Cython debug symbols
|
157 |
+
cython_debug/
|
158 |
+
|
159 |
+
# PyCharm
|
160 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
161 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
162 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
163 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
164 |
+
#.idea/
|
165 |
+
|
166 |
+
### Python Patch ###
|
167 |
+
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
168 |
+
poetry.toml
|
169 |
+
|
170 |
+
# ruff
|
171 |
+
.ruff_cache/
|
172 |
+
|
173 |
+
# LSP config files
|
174 |
+
pyrightconfig.json
|
175 |
+
|
176 |
+
# End of https://www.toptal.com/developers/gitignore/api/python
|
README.md
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Seed Crawl Annotator
|
3 |
+
emoji: 🐨
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.6.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
# Annotate Web Languages
|
14 |
+
|
15 |
+
## Usage
|
16 |
+
|
17 |
+
```bash
|
18 |
+
# Run the Gradio app
|
19 |
+
gradio app.py # auto reload
|
20 |
+
python app.py # static
|
21 |
+
|
22 |
+
```
|
app.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import os
|
4 |
+
import random
|
5 |
+
import time
|
6 |
+
import gradio as gr
|
7 |
+
from selenium import webdriver
|
8 |
+
from selenium.common.exceptions import WebDriverException
|
9 |
+
from PIL import Image
|
10 |
+
from io import BytesIO
|
11 |
+
import base64
|
12 |
+
|
13 |
+
import trafilatura
|
14 |
+
|
15 |
+
from huggingface_hub import whoami
|
16 |
+
|
17 |
+
from languages import ISO_CODE_TO_LANGUAGE_NAME
|
18 |
+
|
19 |
+
# Offline mode replaces the real browser fetch with dummy data.
# Environment variables are always strings, so a plain truthiness check would
# treat OFFLINE=0 or OFFLINE=false as "enabled". Explicitly falsy spellings
# (and an unset/empty variable) disable offline mode; any other value enables it.
OFFLINE = os.environ.get("OFFLINE", "").strip().lower() not in {"", "0", "false", "no", "off"}
|
20 |
+
|
21 |
+
def pil_image_to_base64(image, image_format="PNG"):
    """Encode an image as a ``data:`` URI suitable for an HTML ``<img src>``.

    Args:
        image: Object exposing ``save(fp, format=...)``; callers in this file
            pass a PIL image.
        image_format: Format name forwarded to ``image.save``. Its lowercase
            form is used as the MIME subtype. Defaults to ``"PNG"``, which
            reproduces the previous hard-coded behaviour.

    Returns:
        A string of the form ``data:image/<format>;base64,<payload>``.
    """
    buffer = BytesIO()
    image.save(buffer, format=image_format)

    # getvalue() returns the full buffer contents regardless of the current
    # stream position, so no seek(0) is required before reading.
    img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

    return f"data:image/{image_format.lower()};base64,{img_base64}"
|
33 |
+
|
34 |
+
def fetch_screenshot_and_text_from_url(url):
    """Render ``url`` in headless Chrome and return a screenshot plus its text.

    In offline mode (``OFFLINE`` truthy) no browser is started; a blank
    placeholder image and a dummy text are returned instead.

    Args:
        url: Address of the web page to load.

    Returns:
        A ``(screenshot_html_str, text)`` tuple. ``screenshot_html_str`` is a
        scrollable ``<div>`` embedding the screenshot as a base64 ``<img>``.
        ``text`` is the content extracted by trafilatura, or ``""`` when the
        page could not be loaded or no text could be extracted.
    """
    screen_width = 1080
    height = 350
    text = ""

    if OFFLINE:
        screenshot = Image.new('RGB', (350, height))
        text = f"Some dummy text for {url} (offline mode enabled)"

    else:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        # Must be bound before the try block: if webdriver.Chrome() itself
        # raises, the finally clause would otherwise hit an unbound name and
        # mask the WebDriverException handling below.
        driver = None
        try:
            driver = webdriver.Chrome(options=options)
            driver.get(url)

            # NOTE(review): implicitly_wait configures element-lookup timeouts;
            # it presumably does not block until the page has finished
            # rendering -- confirm whether an explicit wait condition is needed.
            driver.implicitly_wait(10)

            # fetch html from web page
            html_str = driver.page_source

            # Execute JS to find the full height of the rendered page
            scroll_height = driver.execute_script("return document.body.scrollHeight")

            # Resize the window to full page height
            driver.set_window_size(screen_width, max(scroll_height + 200, 900))

            raw_screenshot = driver.get_screenshot_as_png()

            screenshot = Image.open(BytesIO(raw_screenshot))

            # extract text; trafilatura returns None when nothing is found,
            # so normalise to an empty string for callers.
            text = trafilatura.extract(html_str) or ""

        except WebDriverException:
            # Browser start-up or page load failed: fall back to a 1x1
            # placeholder image and leave the text empty.
            screenshot = Image.new('RGB', (1, 1))
        finally:
            if driver is not None:
                driver.quit()

    # embed base64 encoded image as <img> tag into html string
    screenshot_html_str = f"""<div style="width: 100%; height: {height}px; overflow-y: scroll;"><img src="{pil_image_to_base64(screenshot)}" /></div>"""

    return screenshot_html_str, text
|
87 |
+
|
88 |
+
|
89 |
+
# Gradio UI definition.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a flattened copy of this file -- confirm it matches the intended layout.
with gr.Blocks(fill_height=True) as demo:

    gr.Markdown(
        """
        # Seed Crawl Annotator
        """)

    # Per-session state; handle_login stores the username (or None) here.
    profile_state = gr.State([])
    gr.LoginButton()

    # The whole annotation UI stays hidden until handle_login reveals it.
    with gr.Column(visible=False) as wrapper_col:
        def handle_login(profile: gr.OAuthProfile | None) -> dict:
            """Show the annotation UI for logged-in users, hide it otherwise."""
            if profile:
                gr.Info(f"Logged in as {profile.username}")
                return {
                    profile_state: f"{profile.username}",
                    wrapper_col: gr.update(visible=True),
                }
            else:
                gr.Warning("You need to login to use this app.")
                return {
                    profile_state: None,
                    wrapper_col: gr.update(visible=False),
                }

        demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col])

        url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)

        with gr.Row():
            set_random_btn = gr.Button("Set Random URL", variant="secondary", interactive=True)

            load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)

        with gr.Row():
            # Placeholder names the button that actually exists ("Annotate URL").
            extracted_text = gr.Textbox(label="Extracted text", max_lines=15, lines=15, visible=False, placeholder="Click on `Annotate URL` to fetch Web page's text content.")

            screenshot_scrollable = gr.HTML(visible=False)

        # Annotation controls; revealed by load_url once a page was fetched.
        with gr.Column(visible=False) as output_col:
            with gr.Row():
                language_codes = gr.Dropdown(
                    [("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
                    label="Language codes",
                    multiselect=True,
                )
                categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")

            with gr.Row():
                do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
                dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")

        def set_random_url():
            """Return one of a fixed list of example URLs, chosen at random."""
            candidate_urls = [
                "http://example.com",
                "https://wikipedia.org/",
                "https://occiglot.eu",
                "https://ostendorff.org",
                "https://fr.wikipedia.org/",
                "https://amazon.com/",
            ]
            return random.choice(candidate_urls)

        set_random_btn.click(fn=set_random_url, outputs=url_field)

        def load_url(url):
            """Fetch screenshot and text for ``url`` and reveal the annotation form.

            Raises:
                gr.Error: If no screenshot markup or no text could be obtained.
            """
            screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)

            if not screenshot_html_str or not text:
                # gr.Error is an exception: it only reaches the user when raised.
                raise gr.Error("Could not fetch data for url")

            return {
                screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
                extracted_text: gr.update(value=text, visible=True),
                output_col: gr.update(visible=True),
                language_codes: "unknown",  # Reset by set to invalid value
                categories: gr.update(value=None),
            }

        load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")

        def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
            """Acknowledge an annotation and reset the form.

            ``do_crawl`` carries the annotator's decision (True = crawl,
            False = do not crawl). Nothing is persisted in this function yet,
            so the flag, URL, languages and categories are currently unused.

            Raises:
                gr.Error: If no authenticated user is stored in ``profile_state``.
            """
            if not profile_state:
                raise gr.Error("Feedback could not be saved. You are not authenticated.")

            gr.Info("Thanks for your feedback")

            return {
                url_field: "",
                output_col: gr.update(visible=False),
                extracted_text: gr.update(value=None, visible=False),
                screenshot_scrollable: gr.update(value="", visible=False),
            }

        def dont_crawl(profile_state, url, language_codes, categories):
            """Same as ``do_crawl`` but records a "do not crawl" decision."""
            return do_crawl(profile_state, url, language_codes, categories, do_crawl=False)

        do_crawl_btn.click(
            fn=do_crawl,
            inputs=[profile_state, url_field, language_codes, categories],
            outputs=[
                url_field,
                output_col,
                extracted_text,
                screenshot_scrollable,
            ],
            api_name="do_crawl",
        )
        # Previously this button reused do_crawl (decision defaulting to True)
        # and the same api_name, so both buttons were indistinguishable.
        dont_crawl_btn.click(
            fn=dont_crawl,
            inputs=[profile_state, url_field, language_codes, categories],
            outputs=[
                url_field,
                output_col,
                extracted_text,
                screenshot_scrollable,
            ],
            api_name="dont_crawl",
        )
|
222 |
+
|
223 |
+
# dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
|
224 |
+
|
225 |
+
# def random_subpage(url):
|
226 |
+
# new_url = "http://example.com"
|
227 |
+
|
228 |
+
# return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
|
229 |
+
|
230 |
+
# random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")
|
231 |
+
|
232 |
+
|
233 |
+
|
234 |
+
# Start the Gradio server only when this file is executed directly
# (e.g. `python app.py`), not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|
languages.py
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Taken from:
|
2 |
+
# https://gist.github.com/jrnk/8eb57b065ea0b098d571
|
3 |
+
# Maps two-letter language codes to English language names.
# The names are shown to annotators as dropdown labels, so spelling matters.
ISO_CODE_TO_LANGUAGE_NAME = {
    "aa": "Afar",
    "ab": "Abkhazian",
    "ae": "Avestan",
    "af": "Afrikaans",
    "ak": "Akan",
    "am": "Amharic",
    "an": "Aragonese",
    "ar": "Arabic",
    "as": "Assamese",
    "av": "Avaric",
    "ay": "Aymara",
    "az": "Azerbaijani",
    "ba": "Bashkir",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bh": "Bihari languages",
    "bi": "Bislama",
    "bm": "Bambara",
    "bn": "Bengali",
    "bo": "Tibetan",
    "br": "Breton",
    "bs": "Bosnian",
    "ca": "Catalan; Valencian",
    "ce": "Chechen",
    "ch": "Chamorro",
    "co": "Corsican",
    "cr": "Cree",
    "cs": "Czech",
    "cu": "Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic",
    "cv": "Chuvash",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "dv": "Divehi; Dhivehi; Maldivian",
    "dz": "Dzongkha",
    "ee": "Ewe",
    "el": "Greek, Modern (1453-)",
    "en": "English",
    "eo": "Esperanto",
    "es": "Spanish; Castilian",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "ff": "Fulah",
    "fi": "Finnish",
    "fj": "Fijian",
    "fo": "Faroese",
    "fr": "French",
    "fy": "Western Frisian",
    "ga": "Irish",
    "gd": "Gaelic; Scottish Gaelic",
    "gl": "Galician",
    "gn": "Guarani",
    "gu": "Gujarati",
    "gv": "Manx",
    "ha": "Hausa",
    "he": "Hebrew",
    "hi": "Hindi",
    "ho": "Hiri Motu",
    "hr": "Croatian",
    "ht": "Haitian; Haitian Creole",
    "hu": "Hungarian",
    "hy": "Armenian",
    "hz": "Herero",
    "ia": "Interlingua (International Auxiliary Language Association)",
    "id": "Indonesian",
    "ie": "Interlingue; Occidental",
    "ig": "Igbo",
    "ii": "Sichuan Yi; Nuosu",
    "ik": "Inupiaq",
    "io": "Ido",
    "is": "Icelandic",
    "it": "Italian",
    "iu": "Inuktitut",
    "ja": "Japanese",
    "jv": "Javanese",
    "ka": "Georgian",
    "kg": "Kongo",
    "ki": "Kikuyu; Gikuyu",
    "kj": "Kuanyama; Kwanyama",
    "kk": "Kazakh",
    "kl": "Kalaallisut; Greenlandic",
    "km": "Central Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "kr": "Kanuri",
    "ks": "Kashmiri",
    "ku": "Kurdish",
    "kv": "Komi",
    "kw": "Cornish",
    "ky": "Kirghiz; Kyrgyz",
    "la": "Latin",
    "lb": "Luxembourgish; Letzeburgesch",
    "lg": "Ganda",
    "li": "Limburgan; Limburger; Limburgish",
    "ln": "Lingala",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lu": "Luba-Katanga",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mh": "Marshallese",
    "mi": "Maori",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "ms": "Malay",
    "mt": "Maltese",
    "my": "Burmese",
    "na": "Nauru",
    "nb": "Bokmål, Norwegian; Norwegian Bokmål",
    "nd": "Ndebele, North; North Ndebele",
    "ne": "Nepali",
    "ng": "Ndonga",
    "nl": "Dutch; Flemish",
    "nn": "Norwegian Nynorsk; Nynorsk, Norwegian",
    "no": "Norwegian",
    "nr": "Ndebele, South; South Ndebele",
    "nv": "Navajo; Navaho",
    "ny": "Chichewa; Chewa; Nyanja",
    "oc": "Occitan (post 1500)",
    "oj": "Ojibwa",
    "om": "Oromo",
    "or": "Oriya",
    "os": "Ossetian; Ossetic",
    "pa": "Panjabi; Punjabi",
    "pi": "Pali",
    "pl": "Polish",
    "ps": "Pushto; Pashto",
    "pt": "Portuguese",
    "qu": "Quechua",
    "rm": "Romansh",
    "rn": "Rundi",
    "ro": "Romanian; Moldavian; Moldovan",
    "ru": "Russian",
    "rw": "Kinyarwanda",
    "sa": "Sanskrit",
    "sc": "Sardinian",
    "sd": "Sindhi",
    "se": "Northern Sami",
    "sg": "Sango",
    "si": "Sinhala; Sinhalese",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sm": "Samoan",
    "sn": "Shona",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "ss": "Swati",
    "st": "Sotho, Southern",
    "su": "Sundanese",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "tg": "Tajik",
    "th": "Thai",
    "ti": "Tigrinya",
    "tk": "Turkmen",
    "tl": "Tagalog",
    "tn": "Tswana",
    "to": "Tonga (Tonga Islands)",
    "tr": "Turkish",
    "ts": "Tsonga",
    "tt": "Tatar",
    "tw": "Twi",
    "ty": "Tahitian",
    "ug": "Uighur; Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "ve": "Venda",
    "vi": "Vietnamese",
    "vo": "Volapük",
    "wa": "Walloon",
    "wo": "Wolof",
    "xh": "Xhosa",
    "yi": "Yiddish",
    "yo": "Yoruba",
    "za": "Zhuang; Chuang",
    "zh": "Chinese",
    "zu": "Zulu",
}
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
chromium-driver
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
selenium >=4.0.0, < 5.0.0
|
2 |
+
gradio>=3.40.1
|
3 |
+
Pillow>=8.3.1,<9.0
|
4 |
+
trafilatura
|
5 |
+
gradio[oauth]
|