Spaces:

librarian-bots
/

metadata_request_service

Sleeping

App Files Files Community

davanstrien HF staff committed on Sep 12, 2023

Commit

92e2ee4

•

1 Parent(s): f016495

draft app

Browse files

Files changed (3) hide show

app.py +262 -0
requirements.in +9 -0
requirements.txt +209 -0

app.py ADDED Viewed

	@@ -0,0 +1,262 @@

+import asyncio
+import os
+import re
+from typing import Dict
+import gradio as gr
+import httpx
+from cachetools import TTLCache, cached
+from cashews import NOT_NONE, cache
+from dotenv import load_dotenv
+from httpx import AsyncClient, Limits
+from huggingface_hub import (
+    ModelCard,
+    ModelFilter,
+    get_repo_discussions,
+    hf_hub_url,
+    list_models,
+    logging,
+)
+from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
+from tqdm.asyncio import tqdm as atqdm
+from tqdm.auto import tqdm
+import random
+# Configure the cashews cache used by the @cache-decorated coroutines ("mem://" backend).
+cache.setup("mem://")
+# Pull variables from a local .env file into the environment, if one exists.
+load_dotenv()
+# Indexing os.environ raises KeyError when a variable is missing entirely.
+token = os.environ["HUGGINGFACE_TOKEN"]
+user_agent = os.environ["USER_AGENT"]
+# Reject empty values with explicit raises: `assert` statements are stripped
+# when Python runs with -O, which would silently disable this validation.
+if not token:
+    raise RuntimeError("HUGGINGFACE_TOKEN is set but empty")
+if not user_agent:
+    raise RuntimeError("USER_AGENT is set but empty")
+# Headers sent with every request made through create_client().
+headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
+# Connection-pool limits shared by every AsyncClient this module creates.
+limits = Limits(max_keepalive_connections=10, max_connections=50)
+def create_client():
+    """Return a new HTTP/2 ``AsyncClient`` built from the module-level headers and limits."""
+    client_options = {"headers": headers, "limits": limits, "http2": True}
+    return AsyncClient(**client_options)
+@cached(cache=TTLCache(maxsize=100, ttl=60 * 10))
+def get_models(user_or_org):
+    """List the ``transformers`` models authored by ``user_or_org``.
+
+    The Hub listing is requested with ``sort="downloads"``, ``direction=-1``,
+    ``cardData=True`` and ``full=True``, then materialised into a list.
+    Results are memoised per argument in a ``TTLCache(maxsize=100, ttl=600)``.
+    """
+    author_filter = ModelFilter(library="transformers", author=user_or_org)
+    listing = list_models(
+        filter=author_filter,
+        sort="downloads",
+        direction=-1,
+        cardData=True,
+        full=True,
+    )
+    # tqdm is only here to show progress while the listing is consumed.
+    return [model for model in tqdm(iter(listing))]
+def filter_models(models):
+    """Keep the models that have card data but no truthy ``base_model`` entry in it.
+
+    Models whose ``cardData`` is missing or empty are dropped, as are models
+    for which reading the card data raises ``AttributeError``.
+    """
+    kept = []
+    for candidate in tqdm(models):
+        try:
+            metadata = candidate.cardData
+            if metadata and not metadata.get("base_model", None):
+                kept.append(candidate)
+        except AttributeError:
+            # No usable card data on this entry: skip it.
+            pass
+    return kept
+# Matches the sentence "This model is a fine-tuned version of [name](url)" and
+# captures the link text (the model id) as group 1.
+# Raw string: "\[" and "\(" are not valid escapes in a normal string literal.
+MODEL_ID_RE_PATTERN = re.compile(
+    r"This model is a fine-tuned version of \[(.*?)\]\(.*?\)"
+)
+# Matches a "base_model:" key followed by whitespace and captures the rest of
+# the line as group 1. Raw string for the same reason ("\s").
+BASE_MODEL_PATTERN = re.compile(r"base_model:\s+(.+)")
+@cached(cache=TTLCache(maxsize=100, ttl=60 * 3))
+def has_model_card(model):
+    """Return True when one of ``model.siblings`` has ``rfilename == "README.md"``.
+
+    An empty or missing-valued ``siblings`` yields False.
+    """
+    repo_files = model.siblings
+    if not repo_files:
+        return False
+    return any(entry.rfilename == "README.md" for entry in repo_files)
+@cached(cache=TTLCache(maxsize=100, ttl=60))
+def check_already_has_base_model(text):
+    """Return True when ``text`` contains a match for ``BASE_MODEL_PATTERN``."""
+    found = BASE_MODEL_PATTERN.search(text)
+    return found is not None
+@cached(cache=TTLCache(maxsize=100, ttl=60))
+def extract_model_name(text):
+    """Return group 1 of the first ``MODEL_ID_RE_PATTERN`` match in ``text``, or None."""
+    found = MODEL_ID_RE_PATTERN.search(text)
+    if found is None:
+        return None
+    return found.group(1)
+# semaphore = asyncio.Semaphore(10)  # Maximum number of concurrent tasks
+@cache(ttl=120, condition=NOT_NONE)
+async def check_readme_for_match(model):
+    """Fetch the model's README.md and extract the base model it names.
+
+    Returns the model id captured by ``extract_model_name``, or None when the
+    model has no README.md, the request does not return HTTP 200, the README
+    already contains a ``base_model:`` entry, no match is found, or any
+    exception is raised while fetching or parsing.
+    """
+    if not has_model_card(model):
+        return None
+    model_card_url = hf_hub_url(model.modelId, "README.md")
+    try:
+        # `async with` closes the client (and its connection pool) on every
+        # exit path; a bare create_client() call would leave it open.
+        async with create_client() as client:
+            resp = await client.get(model_card_url)
+            if resp.status_code != 200:
+                return None
+            if check_already_has_base_model(resp.text):
+                return None
+            return extract_model_name(resp.text)
+    except (httpx.ConnectError, httpx.ReadTimeout, httpx.ConnectTimeout):
+        # Expected transient network failures: treat as "no match", silently.
+        return None
+    except Exception as e:
+        # Best-effort lookup: report anything unexpected and carry on.
+        print(e)
+        return None
+@cache(ttl=120, condition=NOT_NONE)
+async def check_model_exists(model, match):
+    """Check that ``match`` names a model reachable through the Hub API.
+
+    Returns ``{"modelid": model.modelId, "match": match}`` on HTTP 200,
+    ``False`` on HTTP 401, and None for any other status code or when the
+    request raises.
+    """
+    url = f"https://huggingface.co/api/models/{match}"
+    try:
+        # `async with` closes the client (and its connection pool) on every
+        # exit path; a bare create_client() call would leave it open.
+        async with create_client() as client:
+            resp = await client.get(url)
+        if resp.status_code == 200:
+            return {"modelid": model.modelId, "match": match}
+        if resp.status_code == 401:
+            return False
+        # Any other status: no usable answer.
+        return None
+    except (httpx.ConnectError, httpx.ReadTimeout, httpx.ConnectTimeout):
+        # Expected transient network failures: treat as "unknown", silently.
+        return None
+    except Exception as e:
+        # Best-effort lookup: report anything unexpected and carry on.
+        print(e)
+        return None
+@cache(ttl=120, condition=NOT_NONE)
+async def check_model(model):
+    """Run the README match for ``model`` and, if one is found, verify it on the Hub.
+
+    Returns whatever ``check_model_exists`` returns, or None when
+    ``check_readme_for_match`` yields nothing truthy.
+    """
+    base_model_id = await check_readme_for_match(model)
+    if not base_model_id:
+        return None
+    return await check_model_exists(model, base_model_id)
+async def prep_tasks(models):
+    """Schedule ``check_model`` for every model and gather the results.
+
+    Results are collected as tasks complete (via ``atqdm.as_completed``), so
+    the returned list is not guaranteed to follow the order of ``models``.
+    """
+    pending = [asyncio.create_task(check_model(entry)) for entry in models]
+    outcomes = []
+    for finished in atqdm.as_completed(pending):
+        outcomes.append(await finished)
+    return outcomes
+def get_data_for_user(user_or_org):
+    """Return the PR payloads found for the models of ``user_or_org``.
+
+    Lists the account's models, keeps those without a ``base_model`` in their
+    card data, runs ``check_model`` on each, and returns only the truthy
+    results, i.e. the ``{"modelid": ..., "match": ...}`` dicts.
+    """
+    candidates = filter_models(get_models(user_or_org))
+    results = asyncio.run(prep_tasks(candidates))
+    # check_model_exists answers False on HTTP 401 and None on a miss. Filter
+    # on truthiness so neither reaches callers that index into each payload.
+    return [result for result in results if result]
+# Logger obtained from the `logging` helper imported from huggingface_hub.
+logger = logging.get_logger()
+# Rebinds the module-level `token` from the same HUGGINGFACE_TOKEN variable;
+# os.getenv returns None rather than raising when the variable is unset.
+token = os.getenv("HUGGINGFACE_TOKEN")
+def generate_issue_text(based_model_regex_match, opened_by=None):
+    """Build the Markdown description for a pull request that adds ``base_model``.
+
+    ``based_model_regex_match`` is interpolated both as the link text and as
+    the path of a https://huggingface.co/ link. ``opened_by`` is interpolated
+    as-is at the end of the text, so the default renders as "None".
+    """
+    return f"""This pull request aims to enrich the metadata of your model by adding [`{based_model_regex_match}`](https://huggingface.co/{based_model_regex_match}) as a `base_model` field, situated in the `YAML` block of your model's `README.md`.
+How did we find this information? We performed a regular expression match on your `README.md` file to determine the connection.
+**Why add this?** Enhancing your model's metadata in this way:
+- **Boosts Discoverability** - It becomes straightforward to trace the relationships between various models on the Hugging Face Hub.
+- **Highlights Impact** - It showcases the contributions and influences different models have within the community.
+For a hands-on example of how such metadata can play a pivotal role in mapping model connections, take a look at [librarian-bots/base_model_explorer](https://huggingface.co/spaces/librarian-bots/base_model_explorer).
+This PR comes courtesy of [Librarian Bot](https://huggingface.co/librarian-bot) by request of {opened_by}"""
+def update_metadata(metadata_payload: Dict[str, str], user_making_request=None):
+    """Open a pull request adding ``base_model`` to a model card, unless one exists.
+
+    ``metadata_payload`` must carry ``"match"`` (the base model id) and
+    ``"modelid"`` (the repo to update). The dict is mutated in place and
+    returned: ``"opened_pr"`` is True when a PR was pushed or a matching
+    librarian-bot PR was already present, and False when the repo is not
+    found or the Hub raises ``HfHubHTTPError``.
+    """
+    pr_title = "Librarian Bot: Add base_model information to model"
+    metadata_payload["opened_pr"] = False
+    regex_match = metadata_payload["match"]
+    repo_id = metadata_payload["modelid"]
+    try:
+        model_card = ModelCard.load(repo_id)
+    except RepositoryNotFoundError:
+        return metadata_payload
+    model_card.data["base_model"] = regex_match
+    pr_description = generate_issue_text(regex_match, opened_by=user_making_request)
+    try:
+        discussions = list(get_repo_discussions(repo_id))
+        if discussions:
+            logger.info("found previous discussions")
+        pull_requests = [entry for entry in discussions if entry.is_pull_request]
+        if pull_requests:
+            logger.info("found previous pull requests")
+        for pr in pull_requests:
+            if pr.author != "librarian-bot":
+                continue
+            logger.info("previously opened PR")
+            if pr.title != pr_title:
+                continue
+            # The bot already proposed this change: report it as opened.
+            logger.info("previously opened PR to add base_model tag")
+            metadata_payload["opened_pr"] = True
+            return metadata_payload
+        model_card.push_to_hub(
+            repo_id,
+            token=token,
+            repo_type="model",
+            create_pr=True,
+            commit_message=pr_title,
+            commit_description=pr_description,
+        )
+        metadata_payload["opened_pr"] = True
+    except HfHubHTTPError:
+        # Leave "opened_pr" as it stands and hand the payload back.
+        pass
+    return metadata_payload
+def open_prs(profile: gr.OAuthProfile | None, user_or_org: str = None):
+    """Open ``base_model`` pull requests for a user or organisation.
+
+    ``profile`` is the logged-in user's OAuth profile; when it is falsy a
+    login prompt string is returned. PRs target ``user_or_org`` when given,
+    otherwise the logged-in user's own ``preferred_username``. When
+    ``user_or_org`` is given, at most 10 randomly chosen candidates are
+    processed. Returns a status message string.
+    """
+    if not profile:
+        return "Please login to open PR requests"
+    username = profile.preferred_username
+    user_to_receive_prs = user_or_org or username
+    data = get_data_for_user(user_to_receive_prs)
+    if user_or_org:
+        # random.sample returns a new list rather than trimming in place, so
+        # the result has to be assigned for the cap of 10 to take effect.
+        data = random.sample(data, min(10, len(data)))
+    if not data:
+        return "No PRs to open"
+    results = []
+    for metadata_payload in data:
+        try:
+            results.append(
+                update_metadata(metadata_payload, user_making_request=username)
+            )
+        except Exception as e:
+            # One failing repo must not abort the rest of the batch.
+            logger.error(e)
+    opened_count = sum(1 for result in results if result["opened_pr"])
+    return f"Opened {opened_count} PRs"
+# Gradio UI: login/logout buttons, a target user/org field, and a button that
+# runs open_prs and renders its status message as Markdown.
+with gr.Blocks() as demo:
+    gr.Markdown("# Librarian Bot")
+    gr.LoginButton()
+    gr.LogoutButton()
+    user = gr.Textbox(label="user or org to Open PRs for")
+    button = gr.Button()
+    results = gr.Markdown()
+    button.click(fn=open_prs, inputs=[user], outputs=results)
+# Enable the queue with concurrency_count=1, then start the app.
+demo.queue(concurrency_count=1).launch()

requirements.in ADDED Viewed

	@@ -0,0 +1,9 @@

+cachetools
+cashews
+diskcache
+gradio[oauth]
+httpx[http2]
+huggingface_hub
+python-dotenv
+toolz
+tqdm

requirements.txt ADDED Viewed

	@@ -0,0 +1,209 @@

+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile requirements.in
+#
+aiofiles==23.2.1
+    # via gradio
+altair==5.1.1
+    # via gradio
+annotated-types==0.5.0
+    # via pydantic
+anyio==3.7.1
+    # via
+    #   fastapi
+    #   httpcore
+    #   starlette
+attrs==23.1.0
+    # via
+    #   jsonschema
+    #   referencing
+authlib==1.2.1
+    # via gradio
+cachetools==5.3.1
+    # via -r requirements.in
+cashews==6.2.0
+    # via -r requirements.in
+certifi==2023.7.22
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==1.15.1
+    # via cryptography
+charset-normalizer==3.2.0
+    # via requests
+click==8.1.7
+    # via uvicorn
+contourpy==1.1.0
+    # via matplotlib
+cryptography==41.0.3
+    # via authlib
+cycler==0.11.0
+    # via matplotlib
+diskcache==5.6.3
+    # via -r requirements.in
+fastapi==0.103.1
+    # via gradio
+ffmpy==0.3.1
+    # via gradio
+filelock==3.12.3
+    # via huggingface-hub
+fonttools==4.42.1
+    # via matplotlib
+fsspec==2023.9.0
+    # via
+    #   gradio-client
+    #   huggingface-hub
+gradio[oauth]==3.43.2
+    # via -r requirements.in
+gradio-client==0.5.0
+    # via gradio
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+h2==4.1.0
+    # via httpx
+hpack==4.0.0
+    # via h2
+httpcore==0.18.0
+    # via httpx
+httpx[http2]==0.25.0
+    # via
+    #   -r requirements.in
+    #   gradio
+    #   gradio-client
+huggingface-hub==0.17.0
+    # via
+    #   -r requirements.in
+    #   gradio
+    #   gradio-client
+hyperframe==6.0.1
+    # via h2
+idna==3.4
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+importlib-resources==6.0.1
+    # via gradio
+itsdangerous==2.1.2
+    # via gradio
+jinja2==3.1.2
+    # via
+    #   altair
+    #   gradio
+jsonschema==4.19.0
+    # via altair
+jsonschema-specifications==2023.7.1
+    # via jsonschema
+kiwisolver==1.4.5
+    # via matplotlib
+markupsafe==2.1.3
+    # via
+    #   gradio
+    #   jinja2
+matplotlib==3.7.2
+    # via gradio
+numpy==1.25.2
+    # via
+    #   altair
+    #   contourpy
+    #   gradio
+    #   matplotlib
+    #   pandas
+orjson==3.9.7
+    # via gradio
+packaging==23.1
+    # via
+    #   altair
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   matplotlib
+pandas==2.1.0
+    # via
+    #   altair
+    #   gradio
+pillow==10.0.0
+    # via
+    #   gradio
+    #   matplotlib
+pycparser==2.21
+    # via cffi
+pydantic==2.3.0
+    # via
+    #   fastapi
+    #   gradio
+pydantic-core==2.6.3
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pyparsing==3.0.9
+    # via matplotlib
+python-dateutil==2.8.2
+    # via
+    #   matplotlib
+    #   pandas
+python-dotenv==1.0.0
+    # via -r requirements.in
+python-multipart==0.0.6
+    # via gradio
+pytz==2023.3.post1
+    # via pandas
+pyyaml==6.0.1
+    # via
+    #   gradio
+    #   huggingface-hub
+referencing==0.30.2
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+requests==2.31.0
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+rpds-py==0.10.2
+    # via
+    #   jsonschema
+    #   referencing
+semantic-version==2.10.0
+    # via gradio
+six==1.16.0
+    # via python-dateutil
+sniffio==1.3.0
+    # via
+    #   anyio
+    #   httpcore
+    #   httpx
+starlette==0.27.0
+    # via fastapi
+toolz==0.12.0
+    # via
+    #   -r requirements.in
+    #   altair
+tqdm==4.66.1
+    # via
+    #   -r requirements.in
+    #   huggingface-hub
+typing-extensions==4.7.1
+    # via
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+tzdata==2023.3
+    # via pandas
+urllib3==2.0.4
+    # via requests
+uvicorn==0.23.2
+    # via gradio
+websockets==11.0.3
+    # via
+    #   gradio
+    #   gradio-client