dataset-tldr

Paused

App Files Community

davanstrien HF staff committed on Jan 24

Commit

41869c7

•

1 Parent(s): f1bc1ad

Add fastapi.responses and starlette.responses imports

Browse files

Files changed (1) hide show

main.py +28 -20

main.py CHANGED Viewed

@@ -3,7 +3,7 @@ import random
 from pathlib import Path
 from statistics import mean
 from typing import Any, Iterator, Union
 import fasttext
 from dotenv import load_dotenv
 from fastapi import FastAPI
@@ -11,6 +11,7 @@ from httpx import AsyncClient, Client, Timeout
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import logging
 from toolz import concat, groupby, valmap
 app = FastAPI()
 logger = logging.get_logger(__name__)
@@ -19,16 +20,17 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
-DEFAULT_FAST_TEXT_MODEL = "laurievb/OpenLID"
 headers = {
     "authorization": f"Bearer ${HF_TOKEN}",
 }
 timeout = Timeout(60, read=120)
 client = Client(headers=headers, timeout=timeout)
 async_client = AsyncClient(headers=headers, timeout=timeout)
-# non exhaustive list of columns that might contain text which can be used for language detection
-# we prefer to use columns in this order i.e. if there is a column named "text" we will use it first
 TARGET_COLUMN_NAMES = {
     "text",
     "input",
@@ -116,10 +118,20 @@ async def get_random_rows(
 def load_model(repo_id: str) -> fasttext.FastText._FastText:
-    model_path = hf_hub_download(repo_id, filename="model.bin")
     return fasttext.load_model(model_path)
 def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterator[str]:
     for row in rows:
         if isinstance(row, str):
@@ -139,21 +151,6 @@ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterat
                 continue
-FASTTEXT_PREFIX_LENGTH = 9  # fasttext labels are formatted like "__label__eng_Latn"
-# model = load_model(DEFAULT_FAST_TEXT_MODEL)
-Path("code/models").mkdir(parents=True, exist_ok=True)
-model = fasttext.load_model(
-    hf_hub_download(
-        "facebook/fasttext-language-identification",
-        "model.bin",
-        cache_dir="code/models",
-        local_dir="code/models",
-        local_dir_use_symlinks=False,
-    )
-)
 def model_predict(inputs: str, k=1) -> list[dict[str, float]]:
     predictions = model.predict(inputs, k=k)
     return [
@@ -196,6 +193,17 @@ def predict_rows(rows, target_column, language_threshold_percent=0.2):
     }
 @app.get("/predict_dataset_language/{hub_id}")
 async def predict_language(
     hub_id: str,

 from pathlib import Path
 from statistics import mean
 from typing import Any, Iterator, Union
+from fastapi.responses import HTMLResponse
 import fasttext
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import logging
 from toolz import concat, groupby, valmap
+from starlette.responses import RedirectResponse
 app = FastAPI()
 logger = logging.get_logger(__name__)
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+FASTTEXT_PREFIX_LENGTH = 9  # fasttext labels are formatted like "__label__eng_Latn"
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
+DEFAULT_FAST_TEXT_MODEL = "facebook/fasttext-language-identification"
 headers = {
     "authorization": f"Bearer ${HF_TOKEN}",
 }
 timeout = Timeout(60, read=120)
 client = Client(headers=headers, timeout=timeout)
 async_client = AsyncClient(headers=headers, timeout=timeout)
 TARGET_COLUMN_NAMES = {
     "text",
     "input",
 def load_model(repo_id: str) -> fasttext.FastText._FastText:
+    Path("code/models").mkdir(parents=True, exist_ok=True)
+    model_path = hf_hub_download(
+        repo_id,
+        "model.bin",
+        cache_dir="code/models",
+        local_dir="code/models",
+        local_dir_use_symlinks=False,
+    )
     return fasttext.load_model(model_path)
+model = load_model(DEFAULT_FAST_TEXT_MODEL)
 def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterator[str]:
     for row in rows:
         if isinstance(row, str):
                 continue
 def model_predict(inputs: str, k=1) -> list[dict[str, float]]:
     predictions = model.predict(inputs, k=k)
     return [
     }
+# @app.get("/", response_class=HTMLResponse)
+# async def read_index():
+#     html_content = Path("index.html").read_text()
+#     return HTMLResponse(content=html_content)
+@app.get("/", include_in_schema=False)
+def root():
+    return RedirectResponse(url="/docs")
 @app.get("/predict_dataset_language/{hub_id}")
 async def predict_language(
     hub_id: str,