Spaces:

malteos
/

seed-crawl-annotator

Running

App Files Files Community

malte.ostendorff@telekom.de committed on 3 days ago

Commit

3c258f1

•

0 Parent(s):

init

Browse files

Files changed (6) hide show

.gitignore +176 -0
README.md +22 -0
app.py +235 -0
languages.py +188 -0
packages.txt +1 -0
requirements.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,176 @@

+# Created by https://www.toptal.com/developers/gitignore/api/python
+# Edit at https://www.toptal.com/developers/gitignore?templates=python
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+# ruff
+.ruff_cache/
+# LSP config files
+pyrightconfig.json
+# End of https://www.toptal.com/developers/gitignore/api/python

README.md ADDED Viewed

	@@ -0,0 +1,22 @@

+---
+title: Seed Crawl Annotator
+emoji: 🐨
+colorFrom: red
+colorTo: yellow
+sdk: gradio
+sdk_version: 5.6.0
+app_file: app.py
+pinned: false
+license: mit
+---
+# Annotate Web Languages
+## Usage
+```bash
+# Run the Gradio app
+gradio app.py  # auto reload
+python app.py  # static
+```

app.py ADDED Viewed

	@@ -0,0 +1,235 @@

+from __future__ import annotations
+import os
+import random
+import time
+import gradio as gr
+from selenium import webdriver
+from selenium.common.exceptions import WebDriverException
+from PIL import Image
+from io import BytesIO
+import base64
+import trafilatura
+from huggingface_hub import whoami
+from languages import ISO_CODE_TO_LANGUAGE_NAME
+OFFLINE = os.environ.get("OFFLINE", False)  # NOTE: the raw env string is used as a boolean below, so ANY non-empty value (even "0" or "false") enables offline mode
+def pil_image_to_base64(image):
+    # Save the image to a BytesIO buffer
+    buffer = BytesIO()
+    image.save(buffer, format="PNG")  # You can change the format if needed
+    buffer.seek(0)
+    # Encode the bytes into a base64 string
+    img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+    # Format the base64 string for use in an HTML image tag
+    html_img_tag_src = f"data:image/png;base64,{img_base64}"
+    return html_img_tag_src
+def fetch_screenshot_and_text_from_url(url):
+    screen_width = 1080
+    height = 350
+    text = ""
+    if OFFLINE:
+        screenshot = Image.new('RGB', (350, height))
+        text = f"Some dummy text for {url} (offline mode enabled)"
+    else:
+        options = webdriver.ChromeOptions()
+        options.add_argument('--headless')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        try:
+            driver = webdriver.Chrome(options=options)
+            #driver.set_window_size(1080, 720)  # Adjust the window size here
+            driver.get(url)
+            driver.implicitly_wait(10)
+            # Wait for the page to fully load; you may adjust the sleep time or implement a wait condition
+            # time.sleep(2)
+            # fetch html from web page
+            html_str = driver.page_source
+            # Execute JS to find the full height of the rendered page
+            scroll_height = driver.execute_script("return document.body.scrollHeight")
+            # Resize the window to full page height
+            driver.set_window_size(screen_width, max(scroll_height + 200, 900))
+            raw_screenshot = driver.get_screenshot_as_png()
+            screenshot = Image.open(BytesIO(raw_screenshot))
+            # extract text
+            text = trafilatura.extract(html_str)
+        except WebDriverException as e:
+            screenshot = Image.new('RGB', (1, 1))
+        finally:
+            if driver:
+                driver.quit()
+    # embed base65 encoded image as <img> tag into html string
+    screenshot_html_str = f"""<div style="width: 100%; height: {height}px; overflow-y: scroll;"><img src="{pil_image_to_base64(screenshot)}" /></div>"""
+    # return gr.update(value=html_str, visible=True), text, gr.update(visible=True)
+    return screenshot_html_str, text
+with gr.Blocks(fill_height=True) as demo:
+    gr.Markdown(
+    """
+    # Seed Crawl Annotator
+    """)
+    profile_state = gr.State([])
+    gr.LoginButton()
+    with gr.Column(visible=False) as wrapper_col:
+        def handle_login(profile: gr.OAuthProfile | None) -> dict:
+            if profile:
+                gr.Info(f"Logged in as {profile.username}")
+                return {
+                    profile_state: f"{profile.username}",
+                    wrapper_col: gr.update(visible=True),
+                }
+            else:
+                gr.Warning(f"You need to login to use this app.")
+                return {
+                    profile_state: None,
+                    wrapper_col: gr.update(visible=False),
+                }
+        demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col])
+        url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
+        with gr.Row():
+            set_random_btn = gr.Button("Set Random URL", variant="secondary", interactive=True)
+            load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)
+        with gr.Row():
+            extracted_text = gr.Textbox(label="Extracted text", max_lines=15, lines=15, visible=False, placeholder="Click on `Load URL` to fetch Web page's text content.")
+            screenshot_scrollable = gr.HTML(visible=False)
+        with gr.Column(visible=False) as output_col:
+            with gr.Row():
+                language_codes = gr.Dropdown(
+                        [("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
+                        label="Language codes",
+                        multiselect=True,
+                        # allow_custom_value=True,
+                )
+                categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")
+            with gr.Row():
+                do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
+                dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")
+                # random_subpage_btn = gr.Button("🔁 Load Another Subpage", variant="secondary")
+        def set_random_url():
+            candidate_urls = [
+                "http://example.com",
+                "https://wikipedia.org/",
+                "https://occiglot.eu",
+                "https://ostendorff.org",
+                "https://fr.wikipedia.org/",
+                "https://amazon.com/"
+            ]
+            selected_url = random.choice(candidate_urls)
+            return selected_url
+        set_random_btn.click(fn=set_random_url, outputs=url_field)
+        def load_url(url):
+            screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)
+            if not screenshot_html_str or not text:
+                gr.Error("Could not fetch data for url")
+            else:
+                return {
+                    screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
+                    extracted_text:  gr.update(value=text, visible=True),
+                    output_col: gr.update(visible=True),
+                    language_codes: "unknown", # Reset by set to invalid value # gr.update(None, label=url),
+                    categories:  gr.update(value=None),
+                }
+        load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")
+        def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
+            if profile_state:
+                html_str = f"<b>Thanks {profile_state}, we have saved your feedback!</b>"
+                gr.Info("Thanks for your feedback")
+            else:
+                gr.Error("Feedback could not be saved")
+                html_str = f"<b>Feedback could not be saved.</b> You are not authenticated."
+            return {
+                url_field: "",
+                output_col: gr.update(visible=False),
+                extracted_text: gr.update(value=None, visible=False),
+                screenshot_scrollable: gr.update(value="", visible=False),
+            }
+        # def do_crawl(profile_state, url, language_codes, categories):
+        #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=True)
+        # def dont_crawl(profile_state, url, language_codes, categories):
+        #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=False)
+        do_crawl_btn.click(
+            fn=do_crawl,
+            inputs=[profile_state, url_field, language_codes, categories],
+            outputs=[
+                url_field,
+                output_col,
+                extracted_text,
+                screenshot_scrollable
+            ],
+            api_name="do_crawl",
+        )
+        dont_crawl_btn.click(
+            fn=do_crawl,
+            inputs=[profile_state, url_field, language_codes, categories],
+            outputs=[
+                url_field,
+                output_col,
+                extracted_text,
+                screenshot_scrollable
+            ],
+            api_name="do_crawl",
+        )
+        # dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
+        # def random_subpage(url):
+        #     new_url = "http://example.com"
+        #     return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
+        # random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")
+if __name__ == "__main__":  # start the Gradio server only when executed as a script
+    demo.launch()

languages.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# Taken from:
+# https://gist.github.com/jrnk/8eb57b065ea0b098d571
+ISO_CODE_TO_LANGUAGE_NAME = {
+  "aa": "Afar",
+  "ab": "Abkhazian",
+  "ae": "Avestan",
+  "af": "Afrikaans",
+  "ak": "Akan",
+  "am": "Amharic",
+  "an": "Aragonese",
+  "ar": "Arabic",
+  "as": "Assamese",
+  "av": "Avaric",
+  "ay": "Aymara",
+  "az": "Azerbaijani",
+  "ba": "Bashkir",
+  "be": "Belarusian",
+  "bg": "Bulgarian",
+  "bh": "Bihari languages",
+  "bi": "Bislama",
+  "bm": "Bambara",
+  "bn": "Bengali",
+  "bo": "Tibetan",
+  "br": "Breton",
+  "bs": "Bosnian",
+  "ca": "Catalan; Valencian",
+  "ce": "Chechen",
+  "ch": "Chamorro",
+  "co": "Corsican",
+  "cr": "Cree",
+  "cs": "Czech",
+  "cu": "Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic",
+  "cv": "Chuvash",
+  "cy": "Welsh",
+  "da": "Danish",
+  "de": "German",
+  "dv": "Divehi; Dhivehi; Maldivian",
+  "dz": "Dzongkha",
+  "ee": "Ewe",
+  "el": "Greek, Modern (1453-)",
+  "en": "English",
+  "eo": "Esperanto",
+  "es": "Spanish; Castilian",
+  "et": "Estonian",
+  "eu": "Basque",
+  "fa": "Persian",
+  "ff": "Fulah",
+  "fi": "Finnish",
+  "fj": "Fijian",
+  "fo": "Faroese",
+  "fr": "French",
+  "fy": "Western Frisian",
+  "ga": "Irish",
+  "gd": "Gaelic; Scottish Gaelic",
+  "gl": "Galician",
+  "gn": "Guarani",
+  "gu": "Gujarati",
+  "gv": "Manx",
+  "ha": "Hausa",
+  "he": "Hebrew",
+  "hi": "Hindi",
+  "ho": "Hiri Motu",
+  "hr": "Croatian",
+  "ht": "Haitian; Haitian Creole",
+  "hu": "Hungarian",
+  "hy": "Armenian",
+  "hz": "Herero",
+  "ia": "Interlingua (International Auxiliary Language Association)",
+  "id": "Indonesian",
+  "ie": "Interlingue; Occidental",
+  "ig": "Igbo",
+  "ii": "Sichuan Yi; Nuosu",
+  "ik": "Inupiaq",
+  "io": "Ido",
+  "is": "Icelandic",
+  "it": "Italian",
+  "iu": "Inuktitut",
+  "ja": "Japanese",
+  "jv": "Javanese",
+  "ka": "Georgian",
+  "kg": "Kongo",
+  "ki": "Kikuyu; Gikuyu",
+  "kj": "Kuanyama; Kwanyama",
+  "kk": "Kazakh",
+  "kl": "Kalaallisut; Greenlandic",
+  "km": "Central Khmer",
+  "kn": "Kannada",
+  "ko": "Korean",
+  "kr": "Kanuri",
+  "ks": "Kashmiri",
+  "ku": "Kurdish",
+  "kv": "Komi",
+  "kw": "Cornish",
+  "ky": "Kirghiz; Kyrgyz",
+  "la": "Latin",
+  "lb": "Luxembourgish; Letzeburgesch",
+  "lg": "Ganda",
+  "li": "Limburgan; Limburger; Limburgish",
+  "ln": "Lingala",
+  "lo": "Lao",
+  "lt": "Lithuanian",
+  "lu": "Luba-Katanga",
+  "lv": "Latvian",
+  "mg": "Malagasy",
+  "mh": "Marshallese",
+  "mi": "Maori",
+  "mk": "Macedonian",
+  "ml": "Malayalam",
+  "mn": "Mongolian",
+  "mr": "Marathi",
+  "ms": "Malay",
+  "mt": "Maltese",
+  "my": "Burmese",
+  "na": "Nauru",
+  "nb": "Bokmål, Norwegian; Norwegian Bokmål",
+  "nd": "Ndebele, North; North Ndebele",
+  "ne": "Nepali",
+  "ng": "Ndonga",
+  "nl": "Dutch; Flemish",
+  "nn": "Norwegian Nynorsk; Nynorsk, Norwegian",
+  "no": "Norwegian",
+  "nr": "Ndebele, South; South Ndebele",
+  "nv": "Navajo; Navaho",
+  "ny": "Chichewa; Chewa; Nyanja",
+  "oc": "Occitan (post 1500)",
+  "oj": "Ojibwa",
+  "om": "Oromo",
+  "or": "Oriya",
+  "os": "Ossetian; Ossetic",
+  "pa": "Panjabi; Punjabi",
+  "pi": "Pali",
+  "pl": "Polish",
+  "ps": "Pushto; Pashto",
+  "pt": "Portuguese",
+  "qu": "Quechua",
+  "rm": "Romansh",
+  "rn": "Rundi",
+  "ro": "Romanian; Moldavian; Moldovan",
+  "ru": "Russian",
+  "rw": "Kinyarwanda",
+  "sa": "Sanskrit",
+  "sc": "Sardinian",
+  "sd": "Sindhi",
+  "se": "Northern Sami",
+  "sg": "Sango",
+  "si": "Sinhala; Sinhalese",
+  "sk": "Slovak",
+  "sl": "Slovenian",
+  "sm": "Samoan",
+  "sn": "Shona",
+  "so": "Somali",
+  "sq": "Albanian",
+  "sr": "Serbian",
+  "ss": "Swati",
+  "st": "Sotho, Southern",
+  "su": "Sundanese",
+  "sv": "Swedish",
+  "sw": "Swahili",
+  "ta": "Tamil",
+  "te": "Telugu",
+  "tg": "Tajik",
+  "th": "Thai",
+  "ti": "Tigrinya",
+  "tk": "Turkmen",
+  "tl": "Tagalog",
+  "tn": "Tswana",
+  "to": "Tonga (Tonga Islands)",
+  "tr": "Turkish",
+  "ts": "Tsonga",
+  "tt": "Tatar",
+  "tw": "Twi",
+  "ty": "Tahitian",
+  "ug": "Uighur; Uyghur",
+  "uk": "Ukrainian",
+  "ur": "Urdu",
+  "uz": "Uzbek",
+  "ve": "Venda",
+  "vi": "Vietnamese",
+  "vo": "Volapük",
+  "wa": "Walloon",
+  "wo": "Wolof",
+  "xh": "Xhosa",
+  "yi": "Yiddish",
+  "yo": "Yoruba",
+  "za": "Zhuang; Chuang",
+  "zh": "Chinese",
+  "zu": "Zulu"
+}

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ chromium-driver

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+selenium >=4.0.0, < 5.0.0
+gradio>=3.40.1
+Pillow>=8.3.1,<9.0
+trafilatura
+gradio[oauth]